In [32]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras import layers
from matplotlib import pyplot as plt
from mlp_sparse_model import MLPSparseModel
from mlp_plain_model import MLPPlainModel
import time

In [3]:
# Define params
sys_name = 'LLVM'
n_exp = 20

In [4]:
def system_samplesize(sys_name):
    if (sys_name == 'Apache'):
        N_train_all = np.multiply(9, [1, 2, 3, 4, 5])  # This is for Apache
    elif (sys_name == 'BDBJ'):
        N_train_all = np.multiply(26, [1, 2, 3, 4, 5])  # This is for BDBJ
    elif (sys_name == 'BDBC'):
        N_train_all = np.multiply(18, [1, 2, 3, 4, 5])  # This is for BDBC
    elif (sys_name == 'LLVM'):
        N_train_all = np.multiply(11, [1, 2, 3, 4, 5])  # This is for LLVM
    elif (sys_name == 'SQL'):
        N_train_all = np.multiply(39, [1, 2, 3, 4, 5])  # This is for SQL
    elif (sys_name == 'x264'):
        N_train_all = np.multiply(16, [1, 2, 3, 4, 5])  # This is for X264
    elif (sys_name == 'Dune'):
        N_train_all = np.asarray([49, 78, 240, 375])  # This is for Dune
    elif (sys_name == 'hipacc'):
        N_train_all = np.asarray([261, 736, 528, 1281])  # This is for hipacc
    elif (sys_name == 'hsmgp'):
        N_train_all = np.asarray([77, 173, 384, 480])  # This is for hsmgp
    elif (sys_name == 'javagc'):
        N_train_all = np.asarray([423, 534, 855, 2571])  # This is for javagc
    elif (sys_name == 'sac'):
        N_train_all = np.asarray([2060, 2295, 2499, 3261])  # This is for sac
    else:
        raise AssertionError("Unexpected value of 'sys_name'!")

    return N_train_all

In [5]:
def seed_generator(sys_name, sample_size):
    # Generate the initial seed for each sample size (to match the seed
    # of the results in the paper)
    # This is just the initial seed, for each experiment, the seeds will be
    # equal the initial seed + the number of the experiment

    N_train_all = system_samplesize(sys_name)
    if sample_size in N_train_all:
        seed_o = np.where(N_train_all == sample_size)[0][0]
    else:
        seed_o = np.random.randint(1, 101)

    return seed_o

In [44]:
def split_and_scale_data(X_sample, Y_sample, N_train):
    
    # Scale X_train and Y_train
    max_X = np.amax(X_sample, axis=0)
    if 0 in max_X:
        max_X[max_X == 0] = 1
    X_train = np.divide(X_sample, max_X)
    max_Y = np.max(Y_sample)/100
    if max_Y == 0:
        max_Y = 1
    Y_train = np.divide(Y_sample, max_Y)

    # Split train data into 2 parts (67-33)
    N_cross = int(np.ceil(N_train*2/3))
    X_train = X_train[0:N_cross, :]
    Y_train = Y_train[0:N_cross]
    X_val = X_train[N_cross:N_train, :]
    Y_val = Y_train[N_cross:N_train]

    return X_train, Y_train, X_val, Y_val, max_Y

In [None]:
def find_opt_num_layers(X_train, Y_train, X_val, Y_val, config, errors):
        count = 0
        layer_range = range(2, 15)
        lr_range = np.logspace(np.log10(0.0001), np.log10(0.1), 4)
        for n_layer in layer_range:
            config['num_layer'] = n_layer
            for lr_index, lr_initial in enumerate(lr_range):
                # TODO: Use keras for build and train
                model = MLPPlainModel(config, "")
                model.build_train()
                model.train(X_train, Y_train, lr_initial)

                Y_pred_train = model.predict(X_train)
                abs_error_train = np.mean(np.abs(Y_pred_train - Y_train))
                errors['abs_error_all_train'][int(n_layer), lr_index] = abs_error_train

                Y_pred_val = model.predict(X_val)
                abs_error = np.mean(np.abs(Y_pred_val - Y_val))
                errors['abs_error_all'][int(n_layer), lr_index] = abs_error

            # Pick the learning rate that has the smallest train cost
            # Save testing abs_error correspond to the chosen learning_rate
            temp = errors['abs_error_all_train'][int(n_layer), :]/np.max(errors['abs_error_all_train'])
            temp_idx = np.where(abs(temp) < 0.0001)[0]
            if len(temp_idx) > 0:
                lr_best = lr_range[np.max(temp_idx)]
                err_val_best = errors['abs_error_all'][int(n_layer), np.max(temp_idx)]
            else:
                lr_best = lr_range[np.argmin(temp)]
                err_val_best = errors['abs_error_all'][int(n_layer), np.argmin(temp)]

            abs_error_layer_lr[int(n_layer), 0] = err_val_best
            abs_error_layer_lr[int(n_layer), 1] = lr_best

            if abs_err_layer_lr_min >= errors['abs_error_all'][int(n_layer), np.argmin(temp)]:
                abs_err_layer_lr_min = errors['abs_error_all'][int(n_layer), np.argmin(temp)]
                count = 0
            else:
                count += 1

            if count >= 2:
                break
        abs_error_layer_lr = abs_error_layer_lr[abs_error_layer_lr[:, 1] != 0]

        # Get the optimal number of layers
        n_layer_opt = layer_range[np.argmin(abs_error_layer_lr[:, 0])]+5

        return n_layer_opt, n_layer, errors

In [None]:
def find_opt_lr(X_train, Y_train, X_val, Y_val, n_break, config, errors):
    lr_range = np.logspace(np.log10(0.0001), np.log10(0.1), 4)
    for lr_index, lr_initial in enumerate(lr_range):
        # TODO: use keras for build and train
        model = MLPPlainModel(config, "")
        model.build_train()
        model.train(X_train, Y_train, lr_initial)

        Y_pred_train = model.predict(X_train)
        abs_error_train = np.mean(np.abs(Y_pred_train - Y_train))
        errors['abs_error_all_train'][int(n_break), lr_index] = abs_error_train

        Y_pred_val = model.predict(X_val)
        abs_error = np.mean(np.abs(Y_pred_val - Y_val))
        errors['abs_error_all'][int(n_break), lr_index] = abs_error

        temp = errors['abs_error_all_train'][int(n_break), :]/np.max(errors['abs_error_all_train'])
        temp_idx = np.where(abs(temp) < 0.0001)[0]
        if len(temp_idx) > 0:
            lr_best = lr_range[np.max(temp_idx)]
        else:
            lr_best = lr_range[np.argmin(temp)]

        lr_opt = lr_best
            

In [None]:
def nn_l1_val(X_train1, Y_train1, X_train2, Y_train2, n_layer, lambd, lr_initial):
    """
    Args:
        X_train1: train input data (2/3 of the whole training data)
        Y_train1: train output data (2/3 of the whole training data)
        X_train2: validate input data (1/3 of the whole training data)
        Y_train2: validate output data (1/3 of the whole training data)
        n_layer: number of layers of the neural network
        lambd: regularized parameter

    """
    config = dict()
    config['num_input'] = X_train1.shape[1]
    config['num_layer'] = n_layer
    config['num_neuron'] = 128
    config['lambda'] = lambd
    config['verbose'] = 0

    dir_output = 'C:/Users/Downloads/'

    # Build and train model
    model = MLPSparseModel(config, dir_output)
    model.build_train()
    model.train(X_train1, Y_train1, lr_initial)

    # Evaluate trained model on validation data
    Y_pred_val = model.predict(X_train2)
    abs_error = np.mean(np.abs(Y_pred_val - Y_train2))
    rel_error = np.mean(np.abs(np.divide(Y_train2 - Y_pred_val, Y_train2)))

    return abs_error, rel_error

In [None]:
def find_right_lambda():
    lambda_range = np.logspace(-2, np.log10(1000), 30)
    error_min = np.zeros((1, len(lambda_range)))
    rel_error_min = np.zeros((1, len(lambda_range)))
    decay = 'NA'
    for idx, lambd in enumerate(lambda_range):
        val_abserror, val_relerror = nn_l1_val(X_train, Y_train,
                                            X_val, Y_val,
                                            n_layer_opt, lambd, lr_opt)
        error_min[0, idx] = val_abserror
        rel_error_min[0, idx] = val_relerror

    # Find the value of lambda that minimize error_min
    lambda_f = lambda_range[np.argmin(error_min)]

    return lambda_f, error_min, rel_error_min

In [None]:
def solve_and_test_nn(X_train, Y_train, lr_opt, config):
    dir_output = 'C:/Users/Downloads'
    model = MLPSparseModel(config, dir_output)
    model.build_train()
    model.train(X_train, Y_train, lr_opt)

    # End measuring time
    end = time.time()
    time_search_train = end-start
    print('Time cost (seconds): {:.2f}'.format(time_search_train))
    time_all.append(time_search_train)

    # Testing with non-training data (whole data - the training data)
    testing_index = np.setdiff1d(np.array(range(N)), training_index)
    testing_data = data_df[testing_index, :]
    X_test = testing_data[:, 0:n]
    X_test = np.divide(X_test, max_X)
    Y_test = testing_data[:, n][:, np.newaxis]

    Y_pred_test = model.predict(X_test)
    Y_pred_test = max_Y*Y_pred_test
    rel_error = np.mean(np.abs(np.divide(Y_test.ravel() - Y_pred_test.ravel(), Y_test.ravel())))
    rel_error_mean.append(np.mean(rel_error)*100)

In [43]:
def main():
    sample_size_all = list(system_samplesize(sys_name))
    print('Read whole dataset from csv file ...')
    dir_data = 'Data/' + sys_name + '_AllNumeric.csv'
    print('Dataset: ' + dir_data)
    data_df = pd.read_csv(dir_data)
    (N, n) = data_df.shape
    
    # Some variables to store results
    result_sys = []
    len_count = 0

    for idx in range(len(sample_size_all)):
        N_train = sample_size_all[idx]
        print("Sample size: {}".format(N_train))
        seed_init = seed_generator(sys_name, N_train)

        rel_error_mean = []
        lambda_all = []
        error_min_all = []
        rel_error_min_all = []
        training_index_all = []
        n_layer_all = []
        lr_all = []
        abs_error_layer_lr_all = []
        time_all = []

        for m in range(1, n_exp+1):

            print("Experiment: {}".format(m))

            # Start measure time
            start = time.time()

            # Set seed and generate training data
            seed = seed_init*n_exp + m
            np.random.seed(seed)

            training_data = data_df.sample(n=N_train)
            X_sample = training_data.drop("PERF", axis=1)
            Y_sample = training_data["PERF"]

            X_train, Y_train, X_val, Y_val = split_and_scale_data(X_sample, Y_sample, N_train, data_df)

            # Choosing the right number of hidden layers and , start with 2
            # The best layer is when adding more layer and the testing error
            # does not increase anymore
            print('Tuning hyperparameters for the neural network ...')
            print('Step 1: Tuning the number of layers and the learning rate ...')

            config = {'num_input':n,
                      'num_neuron':128, 
                      'lambda':'NA', 
                      'decay':'NA',
                      'verbose':0}
            dir_output = 'C:/Users/Downloads'
            errors = {'abs_error_all': np.zeros((15, 4)), 
                      'abs_error_all_train': np.zeros((15, 4)), 
                      'abs_error_layer_lr': np.zeros((15, 2)),
                      'abs_err_layer_lr_min': 100}

            n_layer_opt, n_break, errors = find_opt_num_layers(X_train, Y_train, X_val, Y_val, config, errors)


            # Find the optimal learning rate of the specific layer
            config['num_layer'] = n_layer_opt
            lr_opt = find_opt_lr(X_train, Y_train, X_val, Y_val, n_break, config, errors)
            
            print('The optimal number of layers: {}'.format(n_layer_opt))
            print('The optimal learning rate: {:.4f}'.format(lr_opt))
        
            # Use grid search to find the right value of lambda
            lambda_f, error_min, rel_error_min = find_right_lambda(X_train, Y_train, X_val, Y_val, n_layer_opt, lr_opt)
        
            print('Step 2: Tuning the l1 regularized hyperparameter ...')
            print('The optimal l1 regularizer: {:.4f}'.format(lambda_f))

            # Store some useful results
            n_layer_all.append(n_layer_opt)
            lr_all.append(lr_opt)
            abs_error_layer_lr_all.append(errors['abs_error_layer_lr'])
            lambda_all.append(lambda_f)
            error_min_all.append(error_min)
            rel_error_min_all.append(rel_error_min)

            # Solve the final NN with the chosen lambda_f on the training data
            config = dict()
            config['num_neuron'] = 128
            config['num_input'] = n
            config['num_layer'] = n_layer_opt
            config['lambda'] = lambda_f
            config['verbose'] = 1

            rel_error = solve_and_test_nn(config)

            print('Prediction relative error (%): {:.2f}'.format(np.mean(rel_error)*100))

        result = dict()
        result["N_train"] = N_train
        result["lambda_all"] = lambda_all
        result["n_layer_all"] = n_layer_all
        result["lr_all"] = lr_all
        result["abs_error_layer_lr_all"] = abs_error_layer_lr_all
        result["rel_error_mean"] = rel_error_mean
        result["dir_data"] = dir_data
        result["error_min_all"] = error_min_all
        result["rel_error_min_all"] = rel_error_min_all
        result["training_index"] = training_index_all
        result["time_search_train"] = time_all
        result_sys.append(result)

        # Compute some statistics: mean, confidence interval
        result = []
        for i in range(len(result_sys)):
            temp = result_sys[i]
            sd_error_temp = np.sqrt(np.var(temp['rel_error_mean'], ddof=1))
            ci_temp = 1.96*sd_error_temp/np.sqrt(len(temp['rel_error_mean']))

            result_exp = [temp['N_train'], np.mean(temp['rel_error_mean']),
                          ci_temp]
            result.append(result_exp)

        result_arr = np.asarray(result)

        print('Finish experimenting for system {} with sample size {}.'.format(sys_name, N_train))

        print('Mean prediction relative error (%) is: {:.2f}, Margin (%) is: {:.2f}'.format(np.mean(rel_error_mean), ci_temp))        

        # Save the result statistics to a csv file after each sample
        # Save the raw results to an .npy file
        print('Save results to the current directory ...')

        filename = 'result_' + sys_name + '.csv'
        np.savetxt(filename, result_arr, fmt="%f", delimiter=",",
                   header="Sample size, Mean, Margin")
        print('Save the statistics to file ' + filename + ' ...')

        filename = 'result_' + sys_name + '_AutoML_veryrandom.npy'
        np.save(filename, result_sys)
        print('Save the raw results to file ' + filename + ' ...')


##Plot the performance predictions
#plt.figure()
#plt.plot(Y_test, 'r')
#plt.plot(Y_pred_test, 'b')
#plt.show()


#plt.figure()
#plt.plot(lambda_range, rel_error_min[0])
#plt.plot(error_min[0])
#
#
#plt.figure()
#plt.plot(rel_error_min_all[0][0])


## Load the raw result and compute the statistics 
## Compute the statistics (mean and confidence interval)
#result_temp = np.load('result_Apache_AutoML_veryrandom.npy').tolist()
#for idx in range(5):
#    rel_error_mean_temp = result_temp[idx]['rel_error_mean']
#    N_train_temp = result_temp[idx]['N_train']
#    print(N_train_temp)
#    print(np.mean(rel_error_mean_temp))
#
#    sd_error_temp = np.sqrt(np.var(rel_error_mean_temp, ddof=1))
#    ci_temp = 1.96*sd_error_temp/np.sqrt(len(rel_error_mean_temp))
#
#    print(ci_temp)




SyntaxError: invalid syntax (<ipython-input-43-2a7df9746039>, line 48)

In [36]:
main()

Read whole dataset from csv file ...
Dataset: Data/LLVM_AllNumeric.csv
Sample size: 11
Experiment: 1


MinMaxScaler(feature_range=     gvn  instcombine  inline  ...  iv_users  licm        PERF
830    0            1       0  ...         1     1  235.780000
795    0            1       0  ...         0     1  235.853333
495    0            0       0  ...         1     1  214.370000
822    0            1       0  ...         0     1  236.736667
859    1            1       0  ...         1     1  247.803333
104    0            1       1  ...         1     0  226.130000
27     1            1       1  ...         0     1  252.446667
986    0            1       1  ...         0     1  249.493333
735    1            0       0  ...         0     1  236.640000
786    0            1       0  ...         0     1  237.103333
491    0            0       0  ...         1     1  216.586667

[11 rows x 11 columns])

Unnamed: 0,gvn,instcombine,inline,jump_threading,simplifycfg,sccp,print_used_types,ipsccp,iv_users,licm
830,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0
795,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
495,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
822,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
859,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
104,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
27,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0
986,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
735,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0
786,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


830     93.397945
795     93.426994
495     84.916946
822     93.776903
859     98.160668
104     89.575356
27     100.000000
986     98.830116
735     93.738611
786     93.922149
491     85.795019
Name: PERF, dtype: float64

TypeError: cannot unpack non-iterable NoneType object