In [None]:
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics.pairwise import euclidean_distances, cosine_similarity
from statistics import mean
from matplotlib.animation import FuncAnimation, PillowWriter 
from matplotlib import rc
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from math import sqrt
from modAL.models import ActiveLearner, CommitteeRegressor
from modAL.disagreement import max_std_sampling
import time
from statistics import mean, median

## LRZIP Dataset generation

In [None]:
def generate_config():
    """Enumerate the lrzip configuration grid.

    Returns:
        tuple: ``(config_option, all_possible_configs)`` where
        ``all_possible_configs`` is a list of
        ``[algorithm, level, window, nice, processor]`` rows and
        ``config_option`` is the matching list of ``sudo lrzip ...`` command
        strings, in the same order.
    """
    # Option order matters: the last option (processor) varies fastest.
    all_possible_configs = [
        [algorithm, level, window, nice, processor]
        for algorithm in ['-z', '-b', '-g', '-n', '-l']
        for level in range(8, 10)
        for window in range(1, 100, 20)
        for nice in range(-20, 20, 8)
        for processor in range(1, 5)
    ]
    cmd_template = 'sudo lrzip {} -L {} -w {} -N {} -p {}'
    config_option = [cmd_template.format(*row) for row in all_possible_configs]
    return config_option, all_possible_configs


def transfer_config(all_possible_configs):
    """Encode configurations as min-max scaled numeric feature rows.

    The first entry of every configuration (the algorithm flag) is one-hot
    encoded over ``['-b', '-g', '-l', '-n', '-z']``; the remaining entries
    are converted to floats. Every resulting column is then rescaled with
    ``MinMaxScaler`` fitted on the full set of rows.

    Args:
        all_possible_configs: sequence of configuration rows whose first
            element is one of the five algorithm flags.

    Returns:
        numpy.ndarray: one scaled feature row per configuration.
    """
    algo_flags = ['-b', '-g', '-l', '-n', '-z']
    one_hot_rows = np.eye(5)
    raw_rows = np.asanyarray(all_possible_configs)
    encoded = np.asarray([
        np.concatenate([
            one_hot_rows[algo_flags.index(row[0])],
            np.asarray(row[1:], dtype=float),
        ])
        for row in raw_rows
    ])
    return MinMaxScaler().fit_transform(encoded)

In [None]:
# Enumerate the lrzip configuration grid (command lines + raw option rows)
# and encode the raw rows as scaled numeric features.
lrzip_config, config_signal = generate_config()
all_input_signal = transfer_config(config_signal)
# Load the measurements; only rows with commit_num == 1 are kept and their
# 'time' column becomes the regression target.
all_data = pd.read_csv("./lrzip.csv", index_col=0)
results = np.asarray(all_data[all_data['commit_num'] == 1]['time'])
# NOTE(review): later cells index `results` and `config_features` with the same
# ids, so the filtered CSV rows are presumably in generate_config() order -- confirm.
# Globals used by crosssover()/mutation() and by every experiment cell below.
all_possible_configs_cur = np.asarray(config_signal)
config_features = np.asarray(all_input_signal)

# CoMSA

In [None]:
# Committee-based active-learning experiment (modAL CommitteeRegressor with
# max_std_sampling). For each of 20 seeds: start from 6 random configurations,
# then for 54 steps (a) fit a fresh-state XGBRegressor on everything sampled so
# far, (b) record RMSE and R^2 on the remaining configurations, (c) let the
# committee pick the next configuration and teach it that sample.
# Outputs: RMSE_list / R2_list (one list of 54 values per seed), time_avg (sum of wall-clock seconds).
R2_list = []
RMSE_list = []
time_avg = 0
# Use the real pool size rather than a hard-coded 1000 so the cell stays valid
# if the configuration grid changes.
n_configs = len(config_features)
all_config_ids = np.arange(n_configs)
for seed_num in range(0,20,1):
    print(seed_num)
    np.random.seed(seed_num)
    # Drawn with replacement, so the initial ids may contain duplicates.
    sampled_config_ids = list(np.random.randint(n_configs, size=6))
    # Each of the two committee members starts from half of the initial sample.
    initial_idx = np.array_split(sampled_config_ids, 2)

    learner_list = [ActiveLearner(
                        estimator=XGBRegressor(),
                        X_training=config_features[idx], y_training=results[idx]
                )
                for idx in initial_idx]

    # initializing the Committee
    committee = CommitteeRegressor(
        learner_list=learner_list,
        query_strategy=max_std_sampling
    )

    model=XGBRegressor()
    n_queries = 54
    res_al = []
    res_al_r2 = []
    start = time.time()
    for idx in range(n_queries):
        X_train = config_features[sampled_config_ids]
        y_train = results[sampled_config_ids]
        # Evaluate on every configuration that has not been sampled yet
        # (mask computed once per step instead of twice).
        test_mask = ~np.isin(all_config_ids, sampled_config_ids)
        X_test = config_features[test_mask]
        y_test = results[test_mask]
        model.fit(X_train, y_train)
        Y_predict = model.predict(X_test)
        # Root-mean-squared error on the held-out configurations.
        res_al.append(sqrt(np.mean((y_test - Y_predict) ** 2)))
        res_al_r2.append(model.score(X_test,y_test))
        # NOTE(review): the committee queries the full pool, so it can presumably
        # return a configuration that was already sampled -- confirm this is intended.
        query_idx, query_instance = committee.query(config_features)
        sampled_config_ids += list(query_idx)
        committee.teach(config_features[query_idx], results[query_idx])
    end = time.time()
    time_avg = time_avg + end - start
    print('time:'+ str(end - start))
    RMSE_list.append(res_al)
    R2_list.append(res_al_r2)

In [None]:
# np.save('CoM-XG.npy', R2_list)

## Crossover and mutation

In [None]:
def get_all_errors(X_train, y_train, regr):
    """Squared training errors of ``regr`` and their min-max normalisation.

    Args:
        X_train: feature rows passed to ``regr.predict``.
        y_train: array of target values aligned with ``X_train``.
        regr: fitted model exposing ``predict``.

    Returns:
        tuple: ``(square_errors, normalized_errors)``. ``normalized_errors``
        is scaled to ``[0, 1]``; when every squared error is identical
        (e.g. a perfect fit) it is all zeros instead of the NaNs a 0/0
        division would produce.
    """
    y_preds = regr.predict(X_train)
    square_errors = (y_train - y_preds)**2
    lowest = square_errors.min()
    error_range = square_errors.max() - lowest
    if error_range == 0:
        # Degenerate case: no sample stands out, so give every sample score 0.
        normalized_errors = np.zeros_like(square_errors, dtype=float)
    else:
        normalized_errors = (square_errors - lowest) / error_range
    return square_errors, normalized_errors

def get_all_distances(X_train):
    """Pairwise Euclidean distances and a per-sample normalised distance score.

    Args:
        X_train: 2-D array of feature rows.

    Returns:
        tuple: ``(dis_metrics, normalized_dis)`` where ``dis_metrics`` is the
        full pairwise distance matrix and ``normalized_dis`` is each row's
        distance sum, min-max scaled to ``[0, 1]``. When all row sums are
        equal (single sample, or all samples identical) the score is all
        zeros instead of the NaNs a 0/0 division would produce.
    """
    dis_metrics = euclidean_distances(X_train, X_train)
    dis_metrics_sum = np.sum(dis_metrics, axis=1)
    lowest = dis_metrics_sum.min()
    sum_range = dis_metrics_sum.max() - lowest
    if sum_range == 0:
        # Degenerate case: no sample is more isolated than another.
        normalized_dis = np.zeros_like(dis_metrics_sum, dtype=float)
    else:
        normalized_dis = (dis_metrics_sum - lowest) / sum_range
    return dis_metrics, normalized_dis

def get_ids_by_score(normalized_errors, normalized_dis, dis_metrics, sampled_config_ids, already_crossovered_config_id_list, ratio=0.5):
    """Pick the pair of sampled configurations to use as crossover parents.

    Samples are ranked by ``ratio * normalized_errors + (1 - ratio) *
    normalized_dis`` (highest first). The first pair, in ranking order, whose
    mutual distance is at least the median of ``dis_metrics`` and whose ids
    are both absent from ``already_crossovered_config_id_list`` is returned.

    Args:
        normalized_errors: per-sample error scores (array, one per sample).
        normalized_dis: per-sample distance scores (array, one per sample).
        dis_metrics: pairwise distance matrix between the samples.
        sampled_config_ids: maps a sample position to its configuration id.
        already_crossovered_config_id_list: configuration ids that must not
            be chosen again.
        ratio: weight of the error score versus the distance score.

    Returns:
        tuple: ``(config_id_1, config_id_2)`` taken from ``sampled_config_ids``.

    Raises:
        ValueError: if no pair satisfies the constraints. Previously the
            function fell through and returned ``None``, which only surfaced
            as an opaque unpacking error in the caller.
    """
    all_scores = ratio*normalized_errors + (1-ratio)*normalized_dis
    sorted_idx_desc = all_scores.argsort()[::-1]
    dis_median = np.median(dis_metrics)
    for config_id_1 in sorted_idx_desc:
        config_id_1_ori = sampled_config_ids[config_id_1]
        if config_id_1_ori in already_crossovered_config_id_list:
            # No pair starting from this parent can qualify.
            continue
        for config_id in sorted_idx_desc:
            if dis_metrics[config_id_1][config_id] < dis_median:
                continue
            config_id_2_ori = sampled_config_ids[config_id]
            if config_id_2_ori not in already_crossovered_config_id_list:
                return config_id_1_ori, config_id_2_ori
    raise ValueError(
        "no crossover pair available: every sufficiently distant pair contains "
        "an already crossed-over configuration"
    )

In [None]:
def crosssover(config_id_1, config_id_2,  weights, sampled_config_ids, already_crossovered_config_id_list):
    """Single-point crossover between two rows of the global configuration grid.

    Reads the notebook globals ``all_possible_configs_cur`` and
    ``config_features`` and increments the global ``count_failed`` when no
    valid children are found.

    Args:
        config_id_1, config_id_2: row indices of the two parents in
            ``all_possible_configs_cur``.
        weights: per-feature weights; only used to build ``cut_index_prob``,
            which is not consulted afterwards in this function.
        sampled_config_ids: ids that are already sampled; a child is only
            accepted if its grid row index is not in this collection.
        already_crossovered_config_id_list: both parent ids are added to it
            with ``+=`` before returning (in place when it is a list).

    Returns:
        list: two configurations. On success these are the two children
        produced by swapping the parents' tails at a random cut; if 100
        attempts fail, the two parent rows are returned unchanged.
    """
    global count_failed
    # TODO: need to be improved
    count_loop = 0
    new_configs = []
    # refine the weight based on the intuitives
    algo_count = 5
    # Collapse the first `algo_count` weights into one value (their max absolute
    # value) and put it in front of the remaining absolute weights.
    algo_weight = np.max(np.absolute(weights)[:algo_count])
    cut_index_prob_raw = np.insert(np.absolute(weights)[algo_count:], 0, algo_weight)
    # Computed but unused below: the cut position is drawn uniformly instead.
    cut_index_prob = cut_index_prob_raw/cut_index_prob_raw.sum()
    already_cut_ids = []

    while True:
        count_loop += 1
        # Uniform draw from 0..4; a draw of 4 is remapped to 3, so cut index 3
        # is twice as likely as each of 0, 1 and 2.
        cut_index = np.random.randint(0, 5)
        if cut_index == 4:
            cut_index -= 1
        # Child 1 = head of parent 1 + tail of parent 2; look it up in the grid.
        new_config_1 = np.concatenate([all_possible_configs_cur[config_id_1][:cut_index+1], all_possible_configs_cur[config_id_2][cut_index+1:]])
        new_config_1_ids = np.where((all_possible_configs_cur==new_config_1).all(axis=1))[0]
        if new_config_1_ids.size > 0 and new_config_1_ids[0] not in sampled_config_ids:
            # Child 2 = head of parent 2 + tail of parent 1; both children must
            # exist in the grid and be unsampled for the crossover to be accepted.
            new_config_2 = np.concatenate([all_possible_configs_cur[config_id_2][:cut_index+1], all_possible_configs_cur[config_id_1][cut_index+1:]])
            new_config_2_ids = np.where((all_possible_configs_cur==new_config_2).all(axis=1))[0]
            if new_config_2_ids.size > 0 and new_config_2_ids[0] not in sampled_config_ids:
                new_configs = [new_config_1, new_config_2]
                break
        if count_loop == 100:
            # Give up after 100 attempts and fall back to the parents themselves.
            count_failed +=1
            # NOTE(review): these two random ids are never used afterwards, but the
            # draw still advances the global NumPy RNG state.
            new_config_1, new_config_2 = np.random.randint(len(config_features), size=2)
            new_configs = [all_possible_configs_cur[config_id_1], all_possible_configs_cur[config_id_2]]
            break
    # Record both parents so they are not selected for crossover again.
    already_crossovered_config_id_list += [config_id_1, config_id_2]
    return new_configs

In [None]:
def mutation(pre_configs, weights):
    """Mutate one option of each configuration, guided by feature weights.

    The first five weights (one-hot algorithm columns) are collapsed into
    their max absolute value; together with the remaining absolute weights
    this gives one selection probability per configuration option. For each
    input configuration one option is drawn with these probabilities and
    replaced by a different value taken from the global grid
    ``all_possible_configs_cur``; draws are repeated until the mutated
    configuration exists in that grid.

    Args:
        pre_configs: list of configurations (arrays supporting ``.copy()``).
        weights: per-feature weights, e.g. ``feature_importances_``.

    Returns:
        list: the mutated configurations followed by the original
        ``pre_configs``.

    Notes:
        If the weights sum to zero (or are not finite) the option is chosen
        uniformly; previously this produced NaN probabilities and
        ``np.random.choice`` raised ``ValueError``.
        Assumes every grid column offers at least two distinct values;
        otherwise the retry loop cannot terminate.
    """
    algo_count = 5
    abs_weights = np.absolute(weights)
    algo_weight = np.max(abs_weights[:algo_count])
    cut_index_prob_raw = np.insert(abs_weights[algo_count:], 0, algo_weight)
    weight_total = cut_index_prob_raw.sum()
    if np.isfinite(weight_total) and weight_total > 0:
        cut_index_prob = cut_index_prob_raw / weight_total
    else:
        # No usable importance signal: fall back to a uniform choice.
        n_options = len(cut_index_prob_raw)
        cut_index_prob = np.full(n_options, 1.0 / n_options)
    new_configs = []
    for pre_config in pre_configs:
        mut_index = np.random.choice(5, 1, p=cut_index_prob)[0]
        # Candidate values for this option; invariant across retries.
        candidate_values = np.unique(all_possible_configs_cur[:, mut_index])
        while True:
            possible_val = np.random.choice(candidate_values)
            new_config = pre_config.copy()
            if new_config[mut_index] != possible_val:
                new_config[mut_index] = possible_val
                new_config_ids = np.where((all_possible_configs_cur[:]==new_config).all(axis=1))[0]
                if new_config_ids.size > 0:
                    break
        new_configs.append(new_config)
    return new_configs + pre_configs

In [None]:
# Crossover + mutation sampling experiment. For each of 20 seeds: start from 8
# random configurations, then for 27 iterations fit an XGBRegressor on the
# sampled configurations, record its RMSE on the unsampled ones, and extend the
# sample with the configurations produced by crosssover() + mutation().
# Outputs: r2_mutate (one list of 27 RMSE values per completed seed) and
# time_avg_mutate (sum of wall-clock seconds over completed seeds).
r2_mutate=[]
time_avg_mutate = 0
for rand_num in range(0,20,1):
    # NOTE: despite its name, r_square_list stores RMSE values.
    r_square_list = []
    mean_squared_list = []
    mean_squared_all_list = []
    already_crossovered_config_id_list = []
    coefs_list = []
    # Incremented by crosssover() (declared `global` there) on each failed crossover.
    count_failed = 0

    print(rand_num)
    start = time.time()
    np.random.seed(rand_num)
    # Pool size taken from the data instead of a hard-coded 1000.
    sampled_config_ids = np.random.randint(len(config_features), size=8)
    try:
        for iteration in range(27):

            X_train = config_features[sampled_config_ids]
            y_train= results[sampled_config_ids]

            # Evaluate on every configuration that has not been sampled yet.
            test_mask = ~np.isin(np.arange(len(config_features)), sampled_config_ids)
            X_test = config_features[test_mask]
            y_test = results[test_mask]

            regr = XGBRegressor()
            regr.fit(X_train, y_train)
            y_pred = regr.predict(X_test)

            # The original per-sample loop appended to `r2`, a name that is only
            # defined in a later cell; on a fresh kernel that raised NameError,
            # which the bare `except` hid, silently dropping every run.
            mse = mean_squared_error(y_test, y_pred)
            r_square_list.append(sqrt(mse))
            mean_squared_list.append(mse)
            mean_squared_all_list.append(mse)

            feature_weights = regr.feature_importances_
            square_errors, normalized_errors = get_all_errors(X_train, y_train, regr)
            dis_metrics, normalized_dis = get_all_distances(X_train)
            # Rank parents mostly by training error; once the model fits the
            # training set almost exactly, rank by distance only.
            ratio = 0.9
            if np.sum(square_errors) < 0.1:
                ratio = 0
            config_id_1, config_id_2 = get_ids_by_score(normalized_errors, normalized_dis, dis_metrics, sampled_config_ids, already_crossovered_config_id_list, ratio)

            new_configs = crosssover(config_id_1, config_id_2, feature_weights, sampled_config_ids, already_crossovered_config_id_list)
            new_configs = mutation(new_configs, feature_weights)

            # Map each produced configuration back to its grid id and add it to the sample.
            for new_config in new_configs:
                new_config_ids = np.where((all_possible_configs_cur==new_config).all(axis=1))[0]
                if new_config_ids.size > 0:
                    new_config_id = new_config_ids[0]
                    sampled_config_ids = np.append(sampled_config_ids, new_config_id)
        r2_mutate.append(r_square_list)
        end = time.time()
        time_avg_mutate = time_avg_mutate + end - start
        print(end - start)
    except Exception as exc:
        # Best effort: a failing seed is skipped, but the cause is now reported
        # instead of being swallowed.
        print("An exception occurred:", repr(exc))
    

In [None]:
# Stack the per-seed lists collected in r2_mutate into a float32 matrix
# (one row per completed seed) and show it.
R2 = np.matrix(r2_mutate, dtype=np.float32)
print(R2)

In [None]:
# Persist the matrix currently bound to R2; run this right after the cell that
# builds R2 from r2_mutate, because later cells rebind the name R2.
np.save('./Mutate.npy', R2)

In [None]:
# For every run in r2_mutate keep only the values recorded at iterations
# 0, 7, 10, 14, 17 and 20.
data = []
for i in range(0, len(r2_mutate)):
    subset = [r2_mutate[i][idx] for idx in [0,7,10,14,17,20]]
    data.append(subset)

# Transpose so each row holds one checkpoint across all runs, then print the
# mean over runs for each checkpoint.
data_t = np.transpose(data)
for element in data_t:
    print(mean(element))

## Random - Baseline

In [None]:
# Random-sampling baseline. For each of 20 seeds and each training-set size in
# [9, 18, 27, 36, 45, 54]: draw that many configuration ids at random, fit a
# RandomForestRegressor on them and record its RMSE on the remaining configurations.
# Outputs: r2_random (one list of 6 RMSE values per seed; the name notwithstanding,
# the stored values are RMSE) and time_random (sum of wall-clock seconds over seeds).
r2_random = []
time_random = 0
for rand_num in range(0,20,1): 
    print(rand_num)
    start = time.time()
    np.random.seed(rand_num)
    # n_queries is assigned but not used in this cell.
    n_queries = 54
    res_al_rand = []
    for idx in [9, 18, 27, 36, 45, 54]:
        # A fresh sample is drawn for every size (ids are drawn independently,
        # so the same id may appear more than once).
        sampled_config_ids_rand = list(np.random.randint(len(config_features), size=idx))
        X_train = config_features[sampled_config_ids_rand]
        y_train = results[sampled_config_ids_rand]
        # Test set = every configuration whose id was not drawn.
        X_test = config_features[~np.isin(np.arange(len(config_features)), sampled_config_ids_rand)]
        y_test = results[~np.isin(np.arange(len(config_features)), sampled_config_ids_rand)]
        # NOTE(review): the other experiment cells in this file fit XGBRegressor;
        # confirm that a different model is intended for this baseline.
        model=RandomForestRegressor()
        model.fit(X_train, y_train)
        Y_predict = model.predict(X_test)
        # `relative_error` actually accumulates squared absolute errors.
        relative_error = []
        for i in range(len(X_test)):
            RE=abs(y_test[i]-Y_predict[i])
            relative_error.append(RE*RE)
        # Root-mean-squared error over the test configurations.
        res_al_rand.append(sqrt(sum(relative_error)/len(relative_error)))
    end = time.time()
    print(end-start)
    time_random += end-start
    r2_random.append(res_al_rand)

In [None]:
# Average wall-clock time per seed. Divide by the number of runs actually
# recorded (the loop above runs 20 seeds; the previous divisor of 21 was off by one).
print(time_random/len(r2_random))
# One row per seed, one column per training-set size.
R2 = np.matrix(r2_random, dtype=np.float32)
print(len(R2))
np.save('./Random.npy', R2)

# FLASH

In [None]:
# Pre-computed sample ids for the FLASH comparison; the next cell reads it as a
# 2-D array with one row per run (index[i, :l] = first l ids of run i).
index = np.load('../Compare/FLASH/LRZIP_AllNumeric.npy')

In [None]:
# FLASH comparison. For each training-set size l and each run i, fit an
# XGBRegressor on the first l configurations listed in index[i] and record the
# mean relative error on all remaining configurations.
# Output: r2_list (one list per size, one value per run).
# NOTE(review): this cell records mean relative error, whereas the other
# experiment cells in this file record RMSE -- confirm which metric is intended
# before comparing the saved results.
r2_list=[]
all_config_ids = np.arange(len(config_features))
for l in [9,18,27,36,45,54]:
    r2 = []
    for i in range(0, len(index)):
        train_ids = index[i,:l]
        X_train = config_features[train_ids]
        y_train = results[train_ids]
        # Test set = every configuration not used for training (mask built once).
        test_mask = ~np.isin(all_config_ids, train_ids)
        X_test = config_features[test_mask]
        y_test = results[test_mask]
        model=XGBRegressor()
        model.fit(X_train, y_train)
        Y_predict = model.predict(X_test)
        # Vectorised so the run index `i` is no longer shadowed by an inner loop.
        relative_error = np.abs(y_test - Y_predict) / y_test
        r2.append(np.mean(relative_error))
    r2_list.append(r2)

In [None]:
# Stack the FLASH results (one row per training-set size, one column per run)
# and persist them. The save call was indented, which is an IndentationError
# at top level.
R2 = np.matrix(r2_list, dtype=np.float32)
np.save('../Compare/FLASH_LRZIP.npy', R2)

# SPL

In [None]:
# Pre-computed sample ids for the SPL Conqueror (divDistBased) comparison; the
# next cell reads it as a 2-D array with one row per run. Rebinds `index`.
index = np.load('../Compare/SPL_Conqueror/lrzip/divDistBased.npy')

In [None]:
# SPL Conqueror comparison. For each sample count l and each run i, train an
# XGBRegressor on 4 random configurations plus the first l configurations
# listed in index[i], and record the RMSE on all remaining configurations.
# Output: r2_list (one list per l, one RMSE value per run).
r2_list=[]
n_configs = len(config_features)
all_config_ids = np.arange(n_configs)
for l in range(4, 50, 9):
    r2 = []
    for i in range(0, len(index)):
        np.random.seed(i)
        # Draw the 4 random ids from the whole configuration pool. The previous
        # bound, len(X_train), depended on whatever X_train was left over from an
        # earlier cell/iteration and limited the draw to the training-set size.
        sampled_config_ids_rand = np.random.randint(n_configs, size=4)
        sampled_config_ids_rand = np.concatenate((sampled_config_ids_rand, index[i,:l]))
        X_train = config_features[sampled_config_ids_rand]
        y_train = results[sampled_config_ids_rand]
        # Test set = every configuration not used for training (mask built once).
        test_mask = ~np.isin(all_config_ids, sampled_config_ids_rand)
        X_test = config_features[test_mask]
        y_test = results[test_mask]
        model=XGBRegressor()
        model.fit(X_train, y_train)
        Y_predict = model.predict(X_test)
        # Root-mean-squared error; vectorised so the run index `i` is not
        # shadowed by an inner loop.
        r2.append(sqrt(np.mean((y_test - Y_predict) ** 2)))
    r2_list.append(r2)

In [None]:
# r2_list holds one list per sample count; print the mean over runs for each.
for element in r2_list:
    print(mean(element))

In [None]:
# Stack the SPL results (one row per sample count, one column per run) and persist them.
# NOTE(review): the file name ends in "_mre", but the SPL evaluation cell stores
# root-mean-squared errors in r2_list -- confirm the name/metric pairing.
R2 = np.matrix(r2_list, dtype=np.float32)
np.save('../Compare/SPL_Conqueror/SPL_divDistBased_LRZIP_mre.npy', R2)

# NsbS

In [None]:
from sklearn.metrics import r2_score

In [None]:
# NsbS comparison. For each sample count l and each run i, train an
# XGBRegressor on 4 random configurations plus the first l configurations
# listed in index[i], and record the RMSE on all remaining configurations.
# Output: r2_list (one list per l, one RMSE value per run).
index = np.load('../Compare/NsbS/lrzip/NsbS.npy')
r2_list=[]
n_configs = len(config_features)
all_config_ids = np.arange(n_configs)
for l in range(4, 50, 9):
    r2 = []
    for i in range(0, len(index)):
        np.random.seed(i)
        # Draw the 4 random ids from the whole configuration pool. The previous
        # bound, len(X_train), depended on whatever X_train was left over from an
        # earlier cell/iteration and limited the draw to the training-set size.
        sampled_config_ids_rand = np.random.randint(n_configs, size=4)
        sampled_config_ids_rand = np.concatenate((sampled_config_ids_rand, index[i,:l]))
        X_train = config_features[sampled_config_ids_rand]
        y_train = results[sampled_config_ids_rand]
        # Test set = every configuration not used for training (mask built once).
        test_mask = ~np.isin(all_config_ids, sampled_config_ids_rand)
        X_test = config_features[test_mask]
        y_test = results[test_mask]
        model=XGBRegressor()
        model.fit(X_train, y_train)
        Y_predict = model.predict(X_test)
        # Root-mean-squared error; vectorised so the run index `i` is not
        # shadowed by an inner loop.
        r2.append(sqrt(np.mean((y_test - Y_predict) ** 2)))
    r2_list.append(r2)

In [None]:
# r2_list holds one list per sample count; print the mean over runs for each.
for element in r2_list:
    print(mean(element))

In [None]:
# Stack the NsbS results (one row per sample count, one column per run) and persist them.
# NOTE(review): the file name ends in "_mre", but the NsbS evaluation cell stores
# root-mean-squared errors in r2_list -- confirm the name/metric pairing.
R2 = np.matrix(r2_list, dtype=np.float32)
np.save('../Compare/NsbS/lrzip/result_mre.npy', R2)