In [1]:
import os

import numpy as np
from pyDOE import *
from scipy.stats.distributions import norm

from knobs.knob_infos import spark, redis, rocksdb

In [2]:
def convert_outlier(x, dict_dbms, key):
    '''
    Clamp the sampled values of one knob into the range described by ``dict_dbms[key]``.

    x : normed_data (array of sampled values for a single knob)
    dict_dbms : dbms.continuous or dbms.numeric_cat or dbms.string_cat
    key : knob name

    The entry ``dict_dbms[key]`` is dispatched on its length:
      * 3 items -> values are limited to ``[entry[0], entry[1]]``
        (presumably (min, max, default) -- confirm against knob_infos).
      * 2 items -> values are limited to ``[0, len(entry[0]) - 1]``, i.e. to valid
        positions in the sequence ``entry[0]``.
      * anything else -> ``x`` is returned unchanged.
    '''
    entry = dict_dbms[key]
    n_fields = len(entry)

    if n_fields == 3:  # continuous data
        lower, upper = entry[0], entry[1]
        x = np.where(x < lower, lower, x)
        x = np.where(x > upper, upper, x)
    elif n_fields == 2:
        last_position = len(entry[0]) - 1
        x = np.where(x <= 0, 0, x)
        x = np.where(x > last_position, last_position, x)
    return x

Reference: pyDOE randomized designs (Latin-hypercube sampling, `lhs`): https://pythonhosted.org/pyDOE/randomized.html

# Single LHS

In [None]:
### Latin-Hypercube Sampling ###
def LH_Sampling(dbms, sample_num):
    '''
    Draw ``sample_num`` Latin-hypercube samples over the knobs of a single DBMS.

    dbms : knob-info object; reads knob_names, mean, std, continuous(_names),
           numeric_cat(_names) and string_cat(_names)
    sample_num : number of samples (rows) to draw

    Returns an array with one row per sample and one column per knob, in
    ``dbms.knob_names`` order.  numeric_cat columns end up holding the values
    stored in ``dbms.numeric_cat[knob][0]``; all other columns hold the rounded,
    clamped numbers.
    '''
    knob_names = dbms.knob_names
    design = lhs(len(knob_names), samples=sample_num)

    for col, knob in enumerate(knob_names):
        # Map the uniform LHS column through the inverse normal CDF of this knob.
        distribution = norm(loc=dbms.mean[col], scale=dbms.std[col])
        column = np.round(distribution.ppf(design[:, col]))

        # Values above the maximum / below the minimum are pulled back to the bound.
        if knob in dbms.continuous_names:
            column = convert_outlier(column, dbms.continuous, knob)
        if dbms.numeric_cat_names is not None and knob in dbms.numeric_cat_names:
            column = convert_outlier(column, dbms.numeric_cat, knob)
        if dbms.string_cat_names is not None and knob in dbms.string_cat_names:
            column = convert_outlier(column, dbms.string_cat, knob)

        design[:, col] = column
    design = np.round(design)

    # numeric_cat columns hold positions so far; swap each position for its numerical value.
    for col, knob in enumerate(knob_names):
        if dbms.numeric_cat_names is None or knob not in dbms.numeric_cat_names:
            continue
        choices = dbms.numeric_cat[knob][0]
        for row in range(design.shape[0]):
            design[row, col] = choices[int(design[row, col])]

    return design

In [None]:
# Draw 20 Latin-hypercube samples for each DBMS independently.
spark_samples = LH_Sampling(dbms=spark, sample_num=20)
redis_samples = LH_Sampling(dbms=redis, sample_num=20)
rocksdb_samples = LH_Sampling(dbms=rocksdb, sample_num=20)

# ADDB LHS - 1
- Generate knobs in one file

In [35]:
# ADDB_LHSampling(sample)
# Shared configuration: the DBMS knob-info objects, their section names and
# the number of knobs of each (same order in all three lists).
sample_num = 10
addb_name = ['spark', 'redis', 'rocksdb']
addb = [spark, redis, rocksdb]
addb_len = [len(dbms.knob_names) for dbms in addb]

In [39]:
def write_knobs(f, selected_db, name, val):
    '''
    Write the config line(s) for one knob to the open text file ``f``.

    f : writable text file object
    selected_db : knob-info object; reads continuous_names, numeric_cat_names,
                  string_cat_names and string_cat
    name : knob name
    val : sampled value (an int; used as a position for string_cat knobs)

    One ``"<name> <value>"`` line is written for every category the knob is
    listed in; a knob listed in no category writes nothing.
    '''
    if name in selected_db.continuous_names:
        f.write(f'{name} {val}\n')
    # numeric_cat values were already converted from position to numerical value
    # by the sampler, so they are written verbatim.
    if selected_db.numeric_cat_names is not None and name in selected_db.numeric_cat_names:
        f.write(f'{name} {val}\n')
    # Guard both attributes: the membership test needs string_cat_names and the
    # lookup needs string_cat.
    has_string_cat = (selected_db.string_cat is not None
                      and selected_db.string_cat_names is not None)
    if has_string_cat and name in selected_db.string_cat_names:
        f.write(f'{name} {selected_db.string_cat[name][0][val]}\n')

def create_conf_file(CONF_FILE, addb_sample, addb, addb_name, addb_len):
    '''
    Write one sample as a sectioned config file.

    CONF_FILE : path of the file to (over)write
    addb_sample : one row of sampled values, columns ordered DBMS by DBMS
    addb : list of knob-info objects (each provides knob_names)
    addb_name : section name for each entry of ``addb``
    addb_len : number of knobs of each entry of ``addb``; gives the column offset
               of every DBMS inside ``addb_sample``

    For each DBMS a ``[name]`` header is written, then one line per knob via
    write_knobs (values are truncated to int), then a blank line.
    '''
    with open(CONF_FILE, 'w') as f:
        for ld, selected_db in enumerate(addb):
            # First column of this DBMS inside the flat sample row.
            offset = sum(addb_len[:ld])
            f.write(f'[{addb_name[ld]}]\n')
            for i, name in enumerate(selected_db.knob_names):
                val = int(addb_sample[offset + i])
                write_knobs(f, selected_db, name, val)
            f.write('\n')

In [37]:
def generate_addb_samples(sample_num, addb, addb_name, addb_len):
    '''
    Draw joint Latin-hypercube samples over all DBMS knobs and write one config file per sample.

    sample_num : number of samples (rows) to draw
    addb : list of knob-info objects, in column order
    addb_name : section name for each entry of ``addb`` (forwarded to create_conf_file)
    addb_len : number of knobs of each entry of ``addb``; used for column offsets

    Every uniform LHS column is mapped through the inverse normal CDF built from
    the knob's ``mean``/``std``, rounded, and clamped by convert_outlier.
    numeric_cat columns hold positions at that point and are then replaced by
    the values in ``dbms.numeric_cat[knob][0]``.

    Files are written to ``configs/addb_config<N>.conf``; nothing is returned.
    '''
    addb_lhd = lhs(sum(addb_len), samples=sample_num)

    for a, dbms in enumerate(addb):
        start = sum(addb_len[:a])
        stop = start + len(dbms.knob_names)

        for i, k in enumerate(dbms.knob_names):
            idx = start + i
            normed_data = norm(loc=dbms.mean[i], scale=dbms.std[i]).ppf(addb_lhd[:, idx])
            normed_data = np.round(normed_data)

            # If the values are larger than maximum or less than minimum, set the values to be the maximum or minimum values.
            if k in dbms.continuous_names:
                normed_data = convert_outlier(normed_data, dbms.continuous, k)
            if dbms.numeric_cat_names is not None and k in dbms.numeric_cat_names:
                normed_data = convert_outlier(normed_data, dbms.numeric_cat, k)
            if dbms.string_cat_names is not None and k in dbms.string_cat_names:
                normed_data = convert_outlier(normed_data, dbms.string_cat, k)

            addb_lhd[:, idx] = normed_data

        # Round ONLY this DBMS's columns.  Rounding the whole matrix here would
        # turn the still-uniform columns of later DBMSs into 0/1 (so their ppf
        # becomes -inf/+inf and every knob collapses to a bound) and would
        # re-round numeric_cat values already converted for earlier DBMSs.
        addb_lhd[:, start:stop] = np.round(addb_lhd[:, start:stop])

        # numeric_cat columns hold positions; replace them by the numerical values.
        for i, k in enumerate(dbms.knob_names):
            if dbms.numeric_cat_names is not None and k in dbms.numeric_cat_names:
                idx = start + i
                choices = dbms.numeric_cat[k][0]
                addb_lhd[:, idx] = [choices[int(pos)] for pos in addb_lhd[:, idx]]

    addb_samples = addb_lhd

    CONF_PATH = 'configs/'
    os.makedirs(CONF_PATH, exist_ok=True)

    for num, addb_sample in enumerate(addb_samples):
        CONF_NAME = f'addb_config{num}.conf'
        create_conf_file(os.path.join(CONF_PATH, CONF_NAME), addb_sample, addb, addb_name, addb_len)

In [40]:
# Generate 5 joint samples; the literal here is used instead of `sample_num` from the config cell.
generate_addb_samples(
    sample_num=5,
    addb=addb,
    addb_name=addb_name,
    addb_len=addb_len,
)

# ADDB LHS - 2
- Generate knobs split into two files: one for the master and one for the slaves

In [44]:
# ADDB_LHSampling(sample)
# Shared configuration: the DBMS knob-info objects, their section names and
# the number of knobs of each (same order in all three lists).
sample_num = 10
addb_name = ['spark', 'redis', 'rocksdb']
addb = [spark, redis, rocksdb]
addb_len = [len(dbms.knob_names) for dbms in addb]

In [66]:
def write_knobs(selected_db, name, val):
    '''
    Build the config line for one knob.

    selected_db : knob-info object; reads continuous_names, numeric_cat_names,
                  string_cat_names and string_cat
    name : knob name
    val : sampled value (an int; used as a position for string_cat knobs)

    Returns ``"<name> <value>\\n"``.  When a knob is listed in several
    categories the last matching one wins (continuous < numeric_cat < string_cat).
    Raises ValueError when the knob is listed in no category.
    '''
    knob_line = None
    if name in selected_db.continuous_names:
        knob_line = f'{name} {val}\n'
    # numeric_cat values were already converted from position to numerical value
    # by the sampler, so they are emitted verbatim.
    if selected_db.numeric_cat_names is not None and name in selected_db.numeric_cat_names:
        knob_line = f'{name} {val}\n'
    # Guard both attributes: the membership test needs string_cat_names and the
    # lookup needs string_cat.
    has_string_cat = (selected_db.string_cat is not None
                      and selected_db.string_cat_names is not None)
    if has_string_cat and name in selected_db.string_cat_names:
        knob_line = f'{name} {selected_db.string_cat[name][0][val]}\n'
    if knob_line is None:
        raise ValueError(f'knob {name!r} is not listed as continuous, numeric_cat or string_cat')
    return knob_line

def create_conf_file(CONF_FILE, addb_sample, addb, addb_name, addb_len):
    '''
    Write one sample as a sectioned config file.

    CONF_FILE : path of the file to (over)write
    addb_sample : one row of sampled values, columns ordered DBMS by DBMS
    addb : list of knob-info objects (each provides knob_names)
    addb_name : section name for each entry of ``addb``
    addb_len : number of knobs of each entry of ``addb``; gives the column offset
               of every DBMS inside ``addb_sample``

    Sections are ``[name]`` headers followed by one write_knobs line per knob
    (values truncated to int) and separated by one blank line.  The content is
    assembled before the file is opened, so a failure while formatting a knob
    does not leave a truncated file behind.
    '''
    file_inputs = []
    for ld, selected_db in enumerate(addb):
        # First column of this DBMS inside the flat sample row.
        offset = sum(addb_len[:ld])
        file_inputs.append(f'[{addb_name[ld]}]\n')
        for i, name in enumerate(selected_db.knob_names):
            val = int(addb_sample[offset + i])
            file_inputs.append(write_knobs(selected_db, name, val))
        file_inputs.append('\n')

    with open(CONF_FILE, 'w') as f:
        # The last element is the blank separator after the final section; drop it.
        f.writelines(file_inputs[:-1])

In [46]:
def generate_addb_samples(sample_num, addb, addb_name, addb_len):
    '''
    Draw joint Latin-hypercube samples and write them as master/slave config file pairs.

    sample_num : number of samples (rows) to draw
    addb : list of knob-info objects, in column order; the first entry goes to
           the master file, all remaining entries to the slave file
    addb_name : section name for each entry of ``addb``
    addb_len : number of knobs of each entry of ``addb``; used for column offsets

    Every uniform LHS column is mapped through the inverse normal CDF built from
    the knob's ``mean``/``std``, rounded, and clamped by convert_outlier.
    numeric_cat columns hold positions at that point and are then replaced by
    the values in ``dbms.numeric_cat[knob][0]``.

    Files are written to ``configs/master/addb_config<N>.conf`` and
    ``configs/slave/addb_config<N>.conf``; nothing is returned.
    '''
    addb_lhd = lhs(sum(addb_len), samples=sample_num)

    for a, dbms in enumerate(addb):
        start = sum(addb_len[:a])
        stop = start + len(dbms.knob_names)

        for i, k in enumerate(dbms.knob_names):
            idx = start + i
            normed_data = norm(loc=dbms.mean[i], scale=dbms.std[i]).ppf(addb_lhd[:, idx])
            normed_data = np.round(normed_data)

            # If the values are larger than maximum or less than minimum, set the values to be the maximum or minimum values.
            if k in dbms.continuous_names:
                normed_data = convert_outlier(normed_data, dbms.continuous, k)
            if dbms.numeric_cat_names is not None and k in dbms.numeric_cat_names:
                normed_data = convert_outlier(normed_data, dbms.numeric_cat, k)
            if dbms.string_cat_names is not None and k in dbms.string_cat_names:
                normed_data = convert_outlier(normed_data, dbms.string_cat, k)

            addb_lhd[:, idx] = normed_data

        # Round ONLY this DBMS's columns.  Rounding the whole matrix here would
        # turn the still-uniform columns of later DBMSs into 0/1 (so their ppf
        # becomes -inf/+inf and every knob collapses to a bound) and would
        # re-round numeric_cat values already converted for earlier DBMSs.
        addb_lhd[:, start:stop] = np.round(addb_lhd[:, start:stop])

        # numeric_cat columns hold positions; replace them by the numerical values.
        for i, k in enumerate(dbms.knob_names):
            if dbms.numeric_cat_names is not None and k in dbms.numeric_cat_names:
                idx = start + i
                choices = dbms.numeric_cat[k][0]
                addb_lhd[:, idx] = [choices[int(pos)] for pos in addb_lhd[:, idx]]

    addb_samples = addb_lhd

    MASTER_PATH = 'configs/master/'
    SLAVE_PATH = 'configs/slave/'
    # makedirs also creates the missing parent 'configs' directory.
    os.makedirs(MASTER_PATH, exist_ok=True)
    os.makedirs(SLAVE_PATH, exist_ok=True)

    for num, addb_sample in enumerate(addb_samples):
        CONF_NAME = f'addb_config{num}.conf'
        create_conf_file(os.path.join(MASTER_PATH, CONF_NAME), addb_sample[:addb_len[0]], addb[:1], addb_name[:1], addb_len[:1])
        create_conf_file(os.path.join(SLAVE_PATH, CONF_NAME), addb_sample[addb_len[0]:], addb[1:], addb_name[1:], addb_len[1:])

In [67]:
# Generate a single joint sample; the literal here is used instead of `sample_num` from the config cell.
generate_addb_samples(
    sample_num=1,
    addb=addb,
    addb_name=addb_name,
    addb_len=addb_len,
)

# ADDB LHS - 3
- Generate knobs split into three files: master, slave-redis and slave-rocksdb

In [3]:
# ADDB_LHSampling(sample)
# Shared configuration: the DBMS knob-info objects, their section names and
# the number of knobs of each (same order in all three lists).
sample_num = 100
addb_name = ['spark', 'redis', 'rocksdb']
addb = [spark, redis, rocksdb]
addb_len = [len(dbms.knob_names) for dbms in addb]

In [4]:
def write_knobs(selected_db, name, val):
    '''
    Build the config line for one knob.

    selected_db : knob-info object; reads continuous_names, numeric_cat_names,
                  string_cat_names and string_cat
    name : knob name
    val : sampled value (an int; used as a position for string_cat knobs)

    Returns ``"<name> <value>\\n"``.  When a knob is listed in several
    categories the last matching one wins (continuous < numeric_cat < string_cat).
    Raises ValueError when the knob is listed in no category.
    '''
    knob_line = None
    if name in selected_db.continuous_names:
        knob_line = f'{name} {val}\n'
    # numeric_cat values were already converted from position to numerical value
    # by the sampler, so they are emitted verbatim.
    if selected_db.numeric_cat_names is not None and name in selected_db.numeric_cat_names:
        knob_line = f'{name} {val}\n'
    # Guard both attributes: the membership test needs string_cat_names and the
    # lookup needs string_cat.
    has_string_cat = (selected_db.string_cat is not None
                      and selected_db.string_cat_names is not None)
    if has_string_cat and name in selected_db.string_cat_names:
        knob_line = f'{name} {selected_db.string_cat[name][0][val]}\n'
    if knob_line is None:
        raise ValueError(f'knob {name!r} is not listed as continuous, numeric_cat or string_cat')
    return knob_line

def create_conf_file(CONF_FILE, addb_sample, addb, addb_name, addb_len):
    '''
    Write one sample as a sectioned config file.

    CONF_FILE : path of the file to (over)write
    addb_sample : one row of sampled values, columns ordered DBMS by DBMS
    addb : list of knob-info objects (each provides knob_names)
    addb_name : section name for each entry of ``addb``
    addb_len : number of knobs of each entry of ``addb``; gives the column offset
               of every DBMS inside ``addb_sample``

    Sections are ``[name]`` headers followed by one write_knobs line per knob
    (values truncated to int) and separated by one blank line.  The content is
    assembled before the file is opened, so a failure while formatting a knob
    does not leave a truncated file behind.
    '''
    file_inputs = []
    for ld, selected_db in enumerate(addb):
        # First column of this DBMS inside the flat sample row.
        offset = sum(addb_len[:ld])
        file_inputs.append(f'[{addb_name[ld]}]\n')
        for i, name in enumerate(selected_db.knob_names):
            val = int(addb_sample[offset + i])
            file_inputs.append(write_knobs(selected_db, name, val))
        file_inputs.append('\n')

    with open(CONF_FILE, 'w') as f:
        # The last element is the blank separator after the final section; drop it.
        f.writelines(file_inputs[:-1])

In [5]:
def generate_addb_samples(sample_num, addb, addb_name, addb_len):
    '''
    Draw joint Latin-hypercube samples and write three config files per sample.

    sample_num : number of samples (rows) to draw
    addb : list of knob-info objects, in column order; entries 0, 1 and 2 are
           written to the master, slave/redis and slave/rocksdb files
    addb_name : section name for each entry of ``addb``
    addb_len : number of knobs of each entry of ``addb``; used for column offsets

    Every uniform LHS column is mapped through the inverse normal CDF built from
    the knob's ``mean``/``std``, rounded, and clamped by convert_outlier.
    numeric_cat columns hold positions at that point and are then replaced by
    the values in ``dbms.numeric_cat[knob][0]``.

    Files are written to ``configs/master/``, ``configs/slave/redis/`` and
    ``configs/slave/rocksdb/`` as ``addb_config<N>.conf``; nothing is returned.
    '''
    addb_lhd = lhs(sum(addb_len), samples=sample_num)

    for a, dbms in enumerate(addb):
        start = sum(addb_len[:a])
        stop = start + len(dbms.knob_names)

        for i, k in enumerate(dbms.knob_names):
            idx = start + i
            normed_data = norm(loc=dbms.mean[i], scale=dbms.std[i]).ppf(addb_lhd[:, idx])
            normed_data = np.round(normed_data)

            # If the values are larger than maximum or less than minimum, set the values to be the maximum or minimum values.
            if k in dbms.continuous_names:
                normed_data = convert_outlier(normed_data, dbms.continuous, k)
            if dbms.numeric_cat_names is not None and k in dbms.numeric_cat_names:
                normed_data = convert_outlier(normed_data, dbms.numeric_cat, k)
            if dbms.string_cat_names is not None and k in dbms.string_cat_names:
                normed_data = convert_outlier(normed_data, dbms.string_cat, k)

            addb_lhd[:, idx] = normed_data

        # Round ONLY this DBMS's columns.  Rounding the whole matrix here would
        # turn the still-uniform columns of later DBMSs into 0/1 (so their ppf
        # becomes -inf/+inf and every knob collapses to a bound) and would
        # re-round numeric_cat values already converted for earlier DBMSs.
        addb_lhd[:, start:stop] = np.round(addb_lhd[:, start:stop])

        # numeric_cat columns hold positions; replace them by the numerical values.
        for i, k in enumerate(dbms.knob_names):
            if dbms.numeric_cat_names is not None and k in dbms.numeric_cat_names:
                idx = start + i
                choices = dbms.numeric_cat[k][0]
                addb_lhd[:, idx] = [choices[int(pos)] for pos in addb_lhd[:, idx]]

    addb_samples = addb_lhd

    MASTER_PATH = 'configs/master/'
    SLAVE_REDIS_PATH = 'configs/slave/redis'
    SLAVE_ROCKSDB_PATH = 'configs/slave/rocksdb'
    # makedirs also creates the missing parents ('configs', 'configs/slave').
    for target_dir in (MASTER_PATH, SLAVE_REDIS_PATH, SLAVE_ROCKSDB_PATH):
        os.makedirs(target_dir, exist_ok=True)

    # Explicit column boundaries; a negative slice such as [-n:] would return
    # the whole row when n == 0.
    redis_start = addb_len[0]
    rocksdb_start = redis_start + addb_len[1]
    rocksdb_stop = rocksdb_start + addb_len[2]

    for num, addb_sample in enumerate(addb_samples):
        CONF_NAME = f'addb_config{num}.conf'
        create_conf_file(CONF_FILE=os.path.join(MASTER_PATH, CONF_NAME),
                         addb_sample=addb_sample[:redis_start],
                         addb=[addb[0]],
                         addb_name=[addb_name[0]],
                         addb_len=[addb_len[0]])
        create_conf_file(CONF_FILE=os.path.join(SLAVE_REDIS_PATH, CONF_NAME),
                         addb_sample=addb_sample[redis_start:rocksdb_start],
                         addb=[addb[1]],
                         addb_name=[addb_name[1]],
                         addb_len=[addb_len[1]])
        create_conf_file(CONF_FILE=os.path.join(SLAVE_ROCKSDB_PATH, CONF_NAME),
                         addb_sample=addb_sample[rocksdb_start:rocksdb_stop],
                         addb=[addb[2]],
                         addb_name=[addb_name[2]],
                         addb_len=[addb_len[2]])

In [7]:
# Generate `sample_num` joint samples using the configuration cell's values.
generate_addb_samples(
    sample_num=sample_num,
    addb=addb,
    addb_name=addb_name,
    addb_len=addb_len,
)

# Spark LHS
- Generate knobs for spark

In [3]:
# ADDB_LHSampling(sample)
# Spark-only configuration: one knob-info object, its section name and its knob count.
sample_num = 10
addb_name = ['spark']
addb = [spark]
addb_len = [len(dbms.knob_names) for dbms in addb]

In [29]:
def write_knobs(selected_db, name, val):
    '''
    Build the config line for one knob.

    selected_db : knob-info object; reads continuous_names, numeric_cat_names,
                  string_cat_names and string_cat
    name : knob name
    val : sampled value as produced by the sampler (not yet converted to int)

    Returns ``"<name> <value>\\n"``: continuous values are truncated to int,
    numeric_cat values are emitted verbatim, and string_cat values are used as
    an int position into ``string_cat[name][0]``.  When a knob is listed in
    several categories the last matching one wins
    (continuous < numeric_cat < string_cat).
    Raises ValueError when the knob is listed in no category.
    '''
    knob_line = None
    if name in selected_db.continuous_names:
        knob_line = f'{name} {int(val)}\n'
    # numeric_cat values were already converted from position to numerical value
    # by the sampler and may be non-integers, so they are emitted verbatim.
    if selected_db.numeric_cat_names is not None and name in selected_db.numeric_cat_names:
        knob_line = f'{name} {val}\n'
    # Guard both attributes: the membership test needs string_cat_names and the
    # lookup needs string_cat.
    has_string_cat = (selected_db.string_cat is not None
                      and selected_db.string_cat_names is not None)
    if has_string_cat and name in selected_db.string_cat_names:
        knob_line = f'{name} {selected_db.string_cat[name][0][int(val)]}\n'
    if knob_line is None:
        raise ValueError(f'knob {name!r} is not listed as continuous, numeric_cat or string_cat')
    return knob_line

def create_conf_file(CONF_FILE, addb_sample, addb, addb_name, addb_len):
    '''
    Write one sample as a sectioned config file.

    CONF_FILE : path of the file to (over)write
    addb_sample : one row of sampled values, columns ordered DBMS by DBMS
    addb : list of knob-info objects (each provides knob_names)
    addb_name : section name for each entry of ``addb``
    addb_len : number of knobs of each entry of ``addb``; gives the column offset
               of every DBMS inside ``addb_sample``

    Sections are ``[name]`` headers followed by one write_knobs line per knob
    and separated by one blank line.  Values are passed to write_knobs
    unconverted; any int conversion happens there.  The content is assembled
    before the file is opened, so a failure while formatting a knob does not
    leave a truncated file behind.
    '''
    file_inputs = []
    for ld, selected_db in enumerate(addb):
        # First column of this DBMS inside the flat sample row.
        offset = sum(addb_len[:ld])
        file_inputs.append(f'[{addb_name[ld]}]\n')
        for i, name in enumerate(selected_db.knob_names):
            val = addb_sample[offset + i]
            file_inputs.append(write_knobs(selected_db, name, val))
        file_inputs.append('\n')

    with open(CONF_FILE, 'w') as f:
        # The last element is the blank separator after the final section; drop it.
        f.writelines(file_inputs[:-1])

In [32]:
def generate_spark_samples(sample_num, addb, addb_name, addb_len):
    '''
    Draw Latin-hypercube samples over the given DBMS knobs and write one config file per sample.

    sample_num : number of samples (rows) to draw
    addb : list of knob-info objects, in column order
    addb_name : section name for each entry of ``addb`` (forwarded to create_conf_file)
    addb_len : number of knobs of each entry of ``addb``; used for column offsets

    Every uniform LHS column is mapped through the inverse normal CDF built from
    the knob's ``mean``/``std``, rounded, and clamped by convert_outlier.
    numeric_cat columns hold positions at that point and are then replaced by
    the values in ``dbms.numeric_cat[knob][0]``.

    Files are written to ``configs/spark/addb_config<N>.conf``; nothing is returned.
    '''
    addb_lhd = lhs(sum(addb_len), samples=sample_num)

    for a, dbms in enumerate(addb):
        start = sum(addb_len[:a])
        stop = start + len(dbms.knob_names)

        for i, k in enumerate(dbms.knob_names):
            idx = start + i
            normed_data = norm(loc=dbms.mean[i], scale=dbms.std[i]).ppf(addb_lhd[:, idx])
            normed_data = np.round(normed_data)

            # If the values are larger than maximum or less than minimum, set the values to be the maximum or minimum values.
            if k in dbms.continuous_names:
                normed_data = convert_outlier(normed_data, dbms.continuous, k)
            if dbms.numeric_cat_names is not None and k in dbms.numeric_cat_names:
                normed_data = convert_outlier(normed_data, dbms.numeric_cat, k)
            if dbms.string_cat_names is not None and k in dbms.string_cat_names:
                normed_data = convert_outlier(normed_data, dbms.string_cat, k)

            addb_lhd[:, idx] = normed_data

        # Round ONLY this DBMS's columns.  Rounding the whole matrix here would
        # turn the still-uniform columns of later DBMSs into 0/1 (so their ppf
        # becomes -inf/+inf and every knob collapses to a bound) and would
        # re-round numeric_cat values already converted for earlier DBMSs.
        addb_lhd[:, start:stop] = np.round(addb_lhd[:, start:stop])

        # numeric_cat columns hold positions; replace them by the numerical values.
        for i, k in enumerate(dbms.knob_names):
            if dbms.numeric_cat_names is not None and k in dbms.numeric_cat_names:
                idx = start + i
                choices = dbms.numeric_cat[k][0]
                addb_lhd[:, idx] = [choices[int(pos)] for pos in addb_lhd[:, idx]]

    addb_samples = addb_lhd

    CONFIG_PATH = 'configs/spark'
    # makedirs also creates the missing parent 'configs' directory.
    os.makedirs(CONFIG_PATH, exist_ok=True)

    for num, addb_sample in enumerate(addb_samples):
        CONF_NAME = f'addb_config{num}.conf'
        create_conf_file(os.path.join(CONFIG_PATH, CONF_NAME), addb_sample, addb, addb_name, addb_len)

In [34]:
# Generate 1000 Spark samples; the literal here is used instead of `sample_num` from the config cell.
generate_spark_samples(
    sample_num=1000,
    addb=addb,
    addb_name=addb_name,
    addb_len=addb_len,
)