In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from ast import literal_eval
import math
import quantecon as qe
from sklearn.model_selection import cross_val_score
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import WhiteKernel, RBF
from mpl_toolkits.axes_grid1 import make_axes_locatable
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

In [2]:
def create_bayesian_table(directory, name, experiment_per_generation, experiments, elements, activity, factor = 1):
    """Load a tab-separated optimisation log and reshape it into a "datalog" table.

    The file ``<directory><name>.txt`` is read and truncated to its first
    ``experiments`` rows.

    Parameters
    ----------
    directory : str
        Path prefix. It is concatenated with ``name`` as-is, so it must
        already end with a path separator.
    name : str
        File name without the ``.txt`` extension.
    experiment_per_generation : int
        Number of consecutive rows that make up one generation.
    experiments : int
        Number of leading rows of the file to keep.
    elements : list of str
        Columns holding the composition. The same list is also stored in
        every row of the ``Elements`` column.
    activity : str
        Name of the column holding the activity values.
    factor : number, optional
        Multiplier applied to the activity column (default 1).

    Returns
    -------
    pandas.DataFrame
        Columns ``ID`` (1..experiment_per_generation within a generation),
        ``Elements``, ``Generation`` (counted from 0), ``Position`` (the
        composition rounded to 3 decimals, one array per row) and
        ``Activity`` (the activity column times ``factor``).
    """
    raw = pd.read_csv(str(directory) + str(name) + '.txt', sep='\t')[0:experiments]
    n_rows = len(raw)
    positions = np.around(raw[elements].to_numpy(), decimals=3)

    n_generations = math.ceil(n_rows / experiment_per_generation)

    # Within-generation counter 1, 2, ..., experiment_per_generation, 1, 2, ...
    # trimmed to the number of rows actually present.
    repeated_ids = list(range(1, experiment_per_generation + 1)) * n_generations
    ids = np.array(repeated_ids[:n_rows])

    # Generation counter: every generation number repeated once per experiment.
    generations = np.array(
        [[g] * experiment_per_generation for g in range(n_generations)]
    ).reshape(-1)[:n_rows]

    # 'Elements' is a one-item list on purpose: with an explicit index pandas
    # repeats a length-1 column so that every row carries the element list.
    datalog = pd.DataFrame(
        data={'ID': ids, 'Elements': [elements], 'Generation': generations},
        index=np.arange(n_rows),
    )

    # Wrap each position vector in its own list so it lands in a single cell.
    position_column = pd.DataFrame([[row] for row in positions], columns=['Position'])
    datalog = pd.concat([datalog, position_column], axis=1)

    datalog['Activity'] = raw[str(activity)] * factor
    return datalog

def _drop_out_of_bounds(frame, element, upper_boundaries):
    """Drop rows where any ``element`` column exceeds ``upper_boundaries`` and renumber the index from 0."""
    exceeds = (frame[list(element)] > upper_boundaries).any(axis=1)
    return frame.loc[~exceeds].reset_index(drop=True)


def _gpr_grid_parity(x, y, kernel, element, grid_len, save_location,
                     grid_location, upper_boundaries, max_activity):
    """Fit a GPR on ``(x, y)``, predict it on the composition grid and pair it with the reference grid.

    Shared back end of ``data_parity_BO`` and ``data_parity_PSO``.

    Returns
    -------
    result : numpy.ndarray, shape (n_points, 2)
        ``abs(value / max_activity)`` with the reference grid activity in
        column 0 and the GPR prediction in column 1.
    length_scale : float
        Length scale of the RBF term of the fitted kernel.

    Raises
    ------
    ValueError
        If the filtered reference grid and the filtered predicted grid do
        not contain the same number of rows.
    """
    gpr = GaussianProcessRegressor(kernel=kernel, random_state=0, normalize_y=True).fit(x, y)

    # Composition grid: simplex grid points rescaled by grid_len.
    grid = qe.simplex_grid(len(element), grid_len) / grid_len

    # One batched prediction instead of one predict() call per grid point.
    predicted = gpr.predict(grid)

    # Flat column labels (not a nested list) so the frame has ordinary columns.
    data_grid = pd.DataFrame(grid, columns=list(element)).assign(Activity=predicted)
    data_grid.to_csv(save_location, sep='\t', mode='w')

    # Read the saved file back instead of reusing data_grid, so the predicted
    # grid goes through the same text round-trip as the reference grid before
    # the boundary filter is applied.
    predicted_grid = pd.read_csv(save_location, sep='\t').drop(columns=['Unnamed: 0'])
    predicted_grid = _drop_out_of_bounds(predicted_grid, element, upper_boundaries)

    real_grid = pd.read_csv(grid_location, sep='\t').drop(columns=['Unnamed: 0'])
    real_grid = _drop_out_of_bounds(real_grid, element, upper_boundaries)

    # The two grids are compared row by row, so they must line up.
    if len(real_grid) != len(predicted_grid):
        raise ValueError(
            'reference grid (' + str(len(real_grid)) + ' rows) and predicted grid ('
            + str(len(predicted_grid)) + ' rows) do not match; check grid_len and grid_location'
        )

    result = np.column_stack([real_grid['Activity'].to_numpy(),
                              predicted_grid['Activity'].to_numpy()])
    result = np.abs(result / max_activity)

    # The kernel is RBF + WhiteKernel, so k1 is the RBF term. Reading the
    # attribute avoids parsing the kernel's printed representation.
    length_scale = float(gpr.kernel_.k1.length_scale)
    return result, length_scale


def data_parity_BO(label,
                   load_location,
                   experiment_per_generation,
                   total_experiments,
                   element,
                   grid_len,
                   save_location,
                   grid_location,
                   upper_boundaries,
                   max_activity,
                   name,
                   length_scale):
    """Parity data (reference vs. GPR prediction) for a GPR trained on a BO run.

    Parameters
    ----------
    label : str
        File name (without ``.txt``) of the BO log.
    load_location : str
        Sub-directory below ``../raw_data/composition_vs_activity/``. It is
        concatenated with ``label`` as-is, so it must end with ``/``.
    experiment_per_generation : int
        Forwarded to ``create_bayesian_table``.
    total_experiments : int
        Number of leading experiments used for training.
    element : list of str
        Composition columns; their count sets the grid dimension.
    grid_len : int
        Resolution passed to ``qe.simplex_grid``.
    save_location : str
        Path the predicted grid is written to (tab-separated, overwritten).
    grid_location : str
        Path of the reference grid file (tab-separated).
    upper_boundaries : number
        Rows with any element fraction above this value are discarded from
        both grids.
    max_activity : number
        Divisor used to normalise both columns of the result.
    name : str
        Not used by this function; kept so existing calls keep working.
    length_scale : float
        RBF length scale; it is held fixed during fitting.

    Returns
    -------
    tuple
        ``(result, length_scale)``: see ``_gpr_grid_parity``.
    """
    # Activities are loaded with factor=-1, i.e. with their sign flipped.
    datalog = create_bayesian_table(directory = '../raw_data/composition_vs_activity/' + load_location,
                                    name = label,
                                    experiment_per_generation = experiment_per_generation,
                                    experiments = total_experiments,
                                    elements = element,
                                    activity = 'Activity',
                                    factor = -1)

    x = np.array([list(position) for position in datalog['Position']])
    y = datalog['Activity'].to_numpy()

    kernel = RBF(length_scale = length_scale, length_scale_bounds = 'fixed') + WhiteKernel()
    return _gpr_grid_parity(x, y, kernel, element, grid_len, save_location,
                            grid_location, upper_boundaries, max_activity)


def data_parity_PSO(load_location,
                    data_number,
                    element,
                    grid_len,
                    save_location,
                    grid_location,
                    upper_boundaries,
                    max_activity, name,
                    length_scale_bounds,
                    length_scale = 1):
    """Parity data (reference vs. GPR prediction) for a GPR trained on a PSO datalog.

    Parameters
    ----------
    load_location : str
        Path of the datalog below ``../raw_data/composition_vs_activity/``.
        The file must provide a ``Position`` column holding bracketed,
        space-separated numbers and an ``Activity`` column.
    data_number : int
        Number of leading rows used for training.
    element : list of str
        Composition columns; their count sets the grid dimension.
    grid_len : int
        Resolution passed to ``qe.simplex_grid``.
    save_location : str
        Path the predicted grid is written to (tab-separated, overwritten).
    grid_location : str
        Path of the reference grid file (tab-separated).
    upper_boundaries : number
        Rows with any element fraction above this value are discarded from
        both grids.
    max_activity : number
        Divisor used to normalise both columns of the result.
    name : str
        Not used by this function; kept so existing calls keep working.
    length_scale_bounds : tuple or 'fixed'
        Forwarded to the RBF kernel.
    length_scale : float, optional
        Initial (or fixed) RBF length scale (default 1).

    Returns
    -------
    tuple
        ``(result, length_scale)``: see ``_gpr_grid_parity``.
    """
    datalog = pd.read_csv('../raw_data/composition_vs_activity/' + load_location, sep='\t')

    # Positional slice: exactly data_number rows. A label slice
    # (.loc[0:data_number]) includes its end point and would train on one
    # sample more than requested.
    datalog = datalog.iloc[:data_number]

    # 'Position' is stored as text such as "[0.1 0.2 0.7]": strip the brackets
    # and parse the space-separated numbers.
    x = np.array([np.fromstring(text[1:-1], dtype=float, sep=' ')
                  for text in datalog['Position']])
    y = datalog['Activity'].to_numpy()

    kernel = RBF(length_scale = length_scale, length_scale_bounds = length_scale_bounds) + WhiteKernel()
    return _gpr_grid_parity(x, y, kernel, element, grid_len, save_location,
                            grid_location, upper_boundaries, max_activity)


def calculate_error(result):
    """Summarise how well the predictions agree with the reference values.

    Parameters
    ----------
    result : numpy.ndarray, shape (n_points, 2)
        Column 0 holds the reference ("real grid") values and column 1 the
        GPR predictions, in the layout returned by ``data_parity_BO`` and
        ``data_parity_PSO``.

    Returns
    -------
    numpy.ndarray
        ``[MAE, R^2, RMSE]``; MAE and RMSE are rounded to 4 decimals, R^2 to
        3 decimals.
    """
    reference = result[:, 0]
    predicted = result[:, 1]

    # Mean absolute error
    mae = round(np.abs(reference - predicted).mean(), 4)

    # r2_score(y_true, y_pred) is not symmetric: the score is normalised by
    # the spread of y_true, so the reference values must be passed first.
    r_square = round(r2_score(reference, predicted), 3)

    # Root mean squared error
    rmse = round(np.sqrt(mean_squared_error(reference, predicted)), 4)

    return np.array([mae, r_square, rmse])


def learning_curve(method, data_type, model, data_number, data_number_8D):
    """Build the learning curve (error vs. number of training samples) for one model.

    For every sample size the length scale with the lowest MAE is looked up
    in the matching ``../result/LS_optimization/`` file, a GPR with that
    fixed length scale is fitted through ``data_parity_PSO`` or
    ``data_parity_BO``, and the resulting errors are collected.

    Besides its arguments the function reads these notebook-level variables:
    ``models``, ``elements``, ``grid_len``, ``upper_boundaries``,
    ``max_activity``, ``name`` and, for ``data_type == 'PSO'`` only,
    ``data_location``.

    Parameters
    ----------
    method : str
        Prefix of the length-scale optimisation file names.
    data_type : {'PSO', 'BO'}
        Which data set the GPR is trained on.
    model : int
        Zero-based model index. Index 3 uses ``data_number_8D`` and the
        ``grid_data_9.txt`` reference grid; every other index uses
        ``data_number`` and ``grid_data_19.txt``.
    data_number : sequence of int
        Sample sizes for the models other than index 3.
    data_number_8D : sequence of int
        Sample sizes for model index 3.

    Returns
    -------
    pandas.DataFrame
        Columns ``Samples``, ``Length Scale``, ``MAE``, ``R_square`` and
        ``RMSE``, one row per sample size.

    Raises
    ------
    ValueError
        If ``data_type`` is neither ``'PSO'`` nor ``'BO'``, or if a
        length-scale file contains no usable MAE value.
    """
    if data_type not in ('PSO', 'BO'):
        raise ValueError("data_type must be 'PSO' or 'BO', got " + repr(data_type))

    n = model
    uses_8d_settings = (n == 3)
    sample_sizes = data_number_8D if uses_8d_settings else data_number
    grid_file = '/grid_data_9.txt' if uses_8d_settings else '/grid_data_19.txt'
    grid_location = '../result/grid_data/' + str(models[n]) + grid_file

    # Pick the best length scale for every sample size up front, so a missing
    # optimisation file is reported before any model is fitted.
    chosen_length_scale = []
    for samples in sample_sizes:
        scan = pd.read_csv('../result/LS_optimization/' +
                           str(data_type) + '/' + str(method) +
                           '_model_' + str(n+1) +
                           '_data_number_' + str(samples) + '.txt', sep='\t')
        at_minimum = np.flatnonzero((scan['MAE'] == scan['MAE'].min()).to_numpy())
        if at_minimum.size == 0:
            raise ValueError('no usable MAE value for ' + str(samples) + ' samples')
        # When several rows tie for the lowest MAE, the last one is used.
        chosen_length_scale.append(scan.iloc[at_minimum[-1]]['Length Scale'])

    length_scale = []
    mae = []
    r_square = []
    rmse = []
    for samples, chosen in zip(sample_sizes, chosen_length_scale):
        if data_type == 'PSO':
            parity, fitted_length_scale = data_parity_PSO(
                load_location = str(models[n]) + data_location,
                data_number = samples,
                element = elements[n],
                grid_len = grid_len[n],
                save_location = '../result/grid_data/' + str(models[n]) + '/grid_data_gpr.txt',
                grid_location = grid_location,
                upper_boundaries = upper_boundaries[n],
                max_activity = max_activity[n],
                name = name[n],
                length_scale = chosen,
                length_scale_bounds = 'fixed')
        else:
            parity, fitted_length_scale = data_parity_BO(
                label = 'result_0',
                load_location = str(models[n]) + '/BO/',
                experiment_per_generation = 1,
                total_experiments = samples,
                element = elements[n],
                grid_len = grid_len[n],
                save_location = '../result/grid_data/' + str(models[n]) + '/grid_data_gpr_bayes.txt',
                grid_location = grid_location,
                upper_boundaries = upper_boundaries[n],
                max_activity = max_activity[n],
                name = name[n],
                length_scale = chosen)

        # Evaluate the error metrics once per fit.
        sample_mae, sample_r_square, sample_rmse = calculate_error(parity)
        length_scale.append(fitted_length_scale)
        mae.append(sample_mae)
        r_square.append(sample_r_square)
        rmse.append(sample_rmse)

    return pd.DataFrame({'Samples': sample_sizes, 'Length Scale': length_scale,
                         'MAE': mae, 'R_square': r_square, 'RMSE': rmse})


In [3]:
# Per-model settings: position k of every list below belongs to models[k].
models = ['model1', 'model2', 'model3', 'model4']

# Element symbols of each model's composition space (the last model has 8).
elements = [
    ['Ag', 'Ir', 'Pd', 'Pt', 'Ru'],
    ['Ag', 'Ir', 'Pd', 'Pt', 'Ru'],
    ['Ir', 'Pd', 'Pt', 'Rh', 'Ru'],
    ['Pt', 'Pd', 'Au', 'Ru', 'Rh', 'Ir', 'Re', 'Os'],
]

# Divisor used to normalise the parity results of each model.
max_activity = [0.163182, 0.213387, 0.167831, 1.039926]

# Label passed through as the `name` argument of the parity functions.
name = [
    'PSO Neural Network',
    'PSO DFT [Ag,Ir,Pd,Pt,Ru]',
    'PSO DFT [Ir,Pd,Pt,Rh,Ru]',
    'PSO Experimental',
]

# Candidate length scales; not referenced by the cells visible here.
scale = np.arange(0.1, 0.55, 0.02)

# Grid rows with any element fraction above this bound are discarded.
upper_boundaries = [0.8, 1, 1, 1]

# Resolution of the simplex grid for each model.
grid_len = [20, 20, 20, 10]

# Learning Curve

In [4]:
# Learning curves for GPR models trained on the PSO data, one per model.
method, data_type = 'k-fold_random', 'PSO'
data_number_LC = np.linspace(15, 100, 18).astype(int)
data_number_LC_8D = np.linspace(24, 160, 18).astype(int)

# learning_curve() picks data_location up from the notebook namespace.
# "HP 3" is the chosen run, which is stored under file index 2.
data_location = '/PSO/PSO_' + str(2) + '_0.txt'

for n in range(4):
    model = n
    # Results are kept as learn_data_0 .. learn_data_3 and saved per model.
    globals()[f'learn_data_{n}'] = learning_curve(method, data_type, model, data_number_LC, data_number_LC_8D)
    globals()[f'learn_data_{n}'].to_csv(f'../result/learning_curve/model{n + 1}/pso.txt', sep='\t')



In [6]:
# Learning curves for GPR models trained on the BO data, one per model.
method, data_type = 'k-fold_random', 'BO'
data_number_LC = np.linspace(15, 100, 18).astype(int)
data_number_LC_8D = np.linspace(24, 160, 18).astype(int)

for n in range(4):
    model = n
    # Results are kept as learn_data_4 .. learn_data_7 and saved per model.
    globals()[f'learn_data_{4 + n}'] = learning_curve(method, data_type, model, data_number_LC, data_number_LC_8D)
    globals()[f'learn_data_{4 + n}'].to_csv(f'../result/learning_curve/model{n + 1}/bo.txt', sep='\t')

