In [1]:
import pickle, sys
import numpy as np
import pandas as pd

from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import ConstantKernel, Matern

In [2]:
sys.path.append('../src')
import data_pretreatment
import BayesOpt as bo

# 1 Init-selection of CNPs

In [93]:
dft_data_path = '../data/560_DFT_result.xlsx'
exp_data_path = '../data/opt_cnps/exp_data_steps.xlsx'
dft_df = pd.read_excel(dft_data_path, index_col='ID')
# Remove error data of DFT features until fixed
dft_df = dft_df.drop([110, 494, 75, 459], axis=0)
# Descriptors transform
origin_features = dft_df.loc[:, 'IP':'S1-T1'].to_numpy()
# Descriptors rescaling and PCA calculation
pca = PCA(n_components=5)
pca_features = MinMaxScaler().fit_transform(pca.fit_transform(origin_features))

In [97]:
# Calculating the distance matrix and using KS algorithm to pick diversity selection
distance_matrix = data_pretreatment.cal_euclidean_matrix(pca_features)
selected_df = dft_df.loc[data_pretreatment.ks_selection(distance_matrix, n_examples=20), :]
selected_df.iloc[:5, :4]

Unnamed: 0_level_0,name,smiles,IP,EA
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3,Ra01_Rb004,N#Cc1c(-c2ccccc2)nc(-c2ccccc2)c(C#N)c1-c1c2ccc...,0.867341,-1.923257
65,Ra03_Rb010,N#Cc1ccc(-c2nc(-c3ccc(C#N)cc3)c(C#N)c(-c3ccc(N...,0.564757,-1.635845
261,Ra10_Rb010,N#Cc1ccc(-c2ccc(-c3nc(-c4ccc(-c5ccc(C#N)cc5)cc...,0.550825,-1.853265
87,Ra04_Rb004,N#Cc1c(-c2ccc(Br)cc2)nc(-c2ccc(Br)cc2)c(C#N)c1...,0.873042,-1.804753
129,Ra05_Rb018,Cc1ccc(-c2nc(-c3ccc(C)cc3)c(C#N)c(-c3ccc4c(c3)...,1.035164,-1.982793


In [98]:
# Save suggestion
selected_df.iloc[:, :2].to_excel('../data/opt_cnps/suggested_cnps2.xlsx', sheet_name='step_0')

# 2 Bayes optimisation

## 2.1 Data pretreatments

In [100]:
dft_data_path = '../data/560_DFT_result.xlsx'
exp_data_path = '../data/opt_cnps/exp_data_steps.xlsx'
# Load fixed dft data
dft_df = pd.read_excel(dft_data_path, index_col='ID')
# Experiments result of diversity selection from collaborators
exp_df = pd.read_excel(exp_data_path, index_col='ID', sheet_name='step_0').loc[:, ['name', 'smiles', 'yield']]
exp_df

Unnamed: 0_level_0,name,smiles,yield
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3,Ra01_Rb004,N#Cc1c(-c2ccccc2)nc(-c2ccccc2)c(C#N)c1-c1c2ccc...,0.0
65,Ra03_Rb010,N#Cc1ccc(-c2nc(-c3ccc(C#N)cc3)c(C#N)c(-c3ccc(N...,7.0
129,Ra05_Rb018,Cc1ccc(-c2nc(-c3ccc(C)cc3)c(C#N)c(-c3ccc4c(c3)...,39.0
11,Ra01_Rb012,N#Cc1c(-c2ccccc2)nc(-c2ccccc2)c(C#N)c1-c1ccc(C...,2.0
336,Ra13_Rb001,N#Cc1c(-c2ccc3ccccc3c2)nc(-c2ccc3ccccc3c2)c(C#...,0.666667
415,Ra15_Rb024,Cc1ccc(-c2ccc(-c3c(C#N)c(-c4cccc(F)c4)nc(-c4cc...,0.0


In [101]:
# Descriptors transform
origin_features = dft_df.loc[:, 'IP':'S1-T1'].to_numpy()
# Descriptors rescaling and PCA calculation
pca = PCA(n_components=5)
pca_features = MinMaxScaler().fit_transform(pca.fit_transform(origin_features))
y_init = y/100
x_init = pca_features[exp_df.index]
print('Sum of variance of PCA features: {:.3f}'.format(sum(pca.explained_variance_ratio_)))
print('Feature matrix after PCA: {}\nInitial evaluated points:\nx: {}\ny: {}'.format(pca_features.shape, x_init.shape, y_init.shape))

Sum of variance of PCA features: 0.990
Feature matrix after PCA: (560, 5)
Initial evaluated points:
x: (6, 5)
y: (6, 1)


In [None]:
# Save features matrix
np.save('../data/opt_cnps/pca_features.npy', pca_features)

## 2.2 Kernel Preparation and Step 0

In [117]:
# set up kernel
def gpr_matern_kernel(param):
    kernel = ConstantKernel(constant_value=param['constant'], constant_value_bounds=param['constant_bounds']) * \
    Matern(length_scale=param['length_scale'], length_scale_bounds=param['length_scale_bounds'], nu=param['nu'])
    gpr = GaussianProcessRegressor(kernel=kernel, alpha=param['alpha'], normalize_y=False, n_restarts_optimizer=50)
    return gpr
# Hyper parameters
hyper_param = {
    'constant': 1,
    'constant_bounds': (0.1, 5),
    'length_scale': np.array([1., 1., 1., 1., 1.]),
    'length_scale_bounds': (1e-4, 1e6),
    'nu': 2.5,
    'BO_bounds': np.array([0, 1]),
    'alpha': 1e-4,
    'optimizer': 'sampling',
    'acq_func': 'UCB'
}
n_step = 0
# Init the optimiser
opt = bo.BayesOptimizer(
    base_estimator=gpr_matern_kernel(hyper_param),
    sampling=pca_features,
    bounds=hyper_param['BO_bounds'],
    acq_func=hyper_param['acq_func'],
    optimizer=hyper_param['optimizer']
)

In [None]:
# Generating the kappa parameters
parallel_param = bo.kwargs_generator(mean=5, size=12)
print(parallel_param)

In [61]:
# Tell step 0 results and fitting GPs
opt.tell(x_init, y_init)
opt.base_estimator.kernel_

0.316**2 * Matern(length_scale=[0.172, 6.02e+04, 2.66e+03, 3.85e+04, 2.45e+04], nu=2.5)

In [77]:
# Get the suggestion by BO
opt.sampling = pca_features
next_x = opt.parallel_ask(acq_func_args=parallel_param, num_samples=1)
next_df = bo.get_next_df(suggested_x=next_x, parallel_param=parallel_param, 
                      samples=pca_features, df=dft_df)
next_df

Unnamed: 0_level_0,name,SMILE,kappa
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
466,Ra17_Rb019,COc1ccc(N(c2ccc(OC)cc2)c2ccc(-c3c(C#N)c(-c4ccc...,0.602053
439,Ra16_Rb020,N#Cc1c(-c2cccc(Cl)c2)nc(-c2cccc(Cl)c2)c(C#N)c1...,1.238872
494,Ra18_Rb019,COc1ccc(N(c2ccc(OC)cc2)c2ccc(-c3c(C#N)c(-c4ccc...,1.838194
441,Ra16_Rb022,Cc1ccc(-c2c(C#N)c(-c3cccc(Cl)c3)nc(-c3cccc(Cl)...,3.376701
77,Ra03_Rb022,Cc1ccc(-c2c(C#N)c(-c3ccc(C#N)cc3)nc(-c3ccc(C#N...,3.566519
189,Ra07_Rb022,Cc1ccc(-c2c(C#N)c(-c3ccc(C(F)(F)F)cc3)nc(-c3cc...,4.893857
372,Ra14_Rb009,COc1ccc(-c2c(C#N)c(-c3cccc(Br)c3)nc(-c3cccc(Br...,4.905171
176,Ra07_Rb009,COc1ccc(-c2c(C#N)c(-c3ccc(C(F)(F)F)cc3)nc(-c3c...,4.917465
64,Ra03_Rb009,COc1ccc(-c2c(C#N)c(-c3ccc(C#N)cc3)nc(-c3ccc(C#...,5.424689
379,Ra14_Rb016,CCn1c2ccccc2c2cc(-c3c(C#N)c(-c4cccc(Br)c4)nc(-...,5.762144


In [142]:
# Append the next suggestion sheet to the original Excel file
with pd.ExcelWriter('../data/opt_cnps/suggested_cnps.xlsx', mode='a') as excel_writer:
    next_df.to_excel(excel_writer, sheet_name='step_{}'.format(n_step + 1))
with open('../data/opt_cnps/photoredox_BO.pkl', 'wb') as bo_file:
    pickle.dump(opt, file=bo_file)

## 2.2 Step 1

In [6]:
n_step = 1
# Loading feature matrix and experimental results
pca_features = np.load('../data/opt_cnps/pca_features.npy')
dft_df = pd.read_excel('../data/560_DFT_result.xlsx', index_col='ID')
exp_data_path = '../data/opt_cnps/exp_data_steps.xlsx'
exp_x, exp_y, exp_df = bo.load_exp_data(
    exp_df_path=exp_data_path, sheet_name='step_{}'.format(n_step), 
    samples=pca_features, return_df=True
)
exp_df

Unnamed: 0_level_0,name,smiles,yield
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
439,Ra16_Rb020,N#Cc1c(-c2cccc(Cl)c2)nc(-c2cccc(Cl)c2)c(C#N)c1...,10.333333
494,Ra18_Rb019,COc1ccc(N(c2ccc(OC)cc2)c2ccc(-c3c(C#N)c(-c4ccc...,0.0
372,Ra14_Rb009,COc1ccc(-c2c(C#N)c(-c3cccc(Br)c3)nc(-c3cccc(Br...,15.333333
64,Ra03_Rb009,COc1ccc(-c2c(C#N)c(-c3ccc(C#N)cc3)nc(-c3ccc(C#...,3.333333
379,Ra14_Rb016,CCn1c2ccccc2c2cc(-c3c(C#N)c(-c4cccc(Br)c4)nc(-...,25.666667
58,Ra03_Rb003,N#Cc1ccc(-c2nc(-c3ccc(C#N)cc3)c(C#N)c(-c3ccc4c...,2.0


In [105]:
# Next step parallel suggestion parameters
parallel_param = bo.kwargs_generator(mean=2.5, size=12)
parallel_param

{'kappa': [0.26147734257270994,
  0.4837147598117253,
  0.6532669005013081,
  1.4646436405928491,
  1.7666246735112012,
  1.852170393328392,
  2.1464933052188986,
  2.8496579792000793,
  2.95139600825862,
  3.9807346194995956,
  4.119257957244427,
  7.31776580409898]}

In [114]:
# Load previous BO_object
with open('../data/opt_cnps/photoredox_BO.pkl', 'rb') as bo_file:
    opt = pickle.load(bo_file)
# Tell experimental results and fitting the GPs
opt.tell(x=exp_x, y=exp_y)
# Get the suggestion by Bayesian optimization
opt.sampling = pca_features
next_x = opt.parallel_ask(acq_func_args=parallel_param, num_samples=1)
next_df = bo.get_next_df(suggested_x=next_x, parallel_param=parallel_param, 
                      samples=pca_features, df=dft_df)
next_df

Unnamed: 0_level_0,name,SMILE,kappa
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
110,Ra04_Rb027,N#Cc1c(-c2ccc(Br)cc2)nc(-c2ccc(Br)cc2)c(C#N)c1...,0.261477
463,Ra17_Rb016,CCn1c2ccccc2c2cc(-c3c(C#N)c(-c4cccc(OC)c4)nc(-...,0.483715
464,Ra17_Rb017,COc1cccc(-c2nc(-c3cccc(OC)c3)c(C#N)c(-c3ccc4c(...,0.653267
263,Ra10_Rb012,N#Cc1ccc(-c2ccc(-c3nc(-c4ccc(-c5ccc(C#N)cc5)cc...,1.464644
207,Ra08_Rb012,N#Cc1c(-c2ccc(-c3ccccc3)cc2)nc(-c2ccc(-c3ccccc...,1.766625
321,Ra12_Rb014,Cc1cccc(-c2nc(-c3cccc(C)c3)c(C#N)c(-c3ccc(-c4c...,1.85217
293,Ra11_Rb014,N#Cc1c(-c2ccc(Cl)cc2)nc(-c2ccc(Cl)cc2)c(C#N)c1...,2.146493
69,Ra03_Rb014,N#Cc1ccc(-c2nc(-c3ccc(C#N)cc3)c(C#N)c(-c3ccc(-...,2.849658
97,Ra04_Rb014,N#Cc1c(-c2ccc(Br)cc2)nc(-c2ccc(Br)cc2)c(C#N)c1...,2.951396
153,Ra06_Rb014,N#Cc1c(-c2ccc(F)cc2)nc(-c2ccc(F)cc2)c(C#N)c1-c...,3.980735


In [158]:
# Append the next suggestion sheet to the original Excel file
with pd.ExcelWriter('../data/opt_cnps/suggested_cnps.xlsx', mode='a') as excel_writer:
    next_df.to_excel(excel_writer, sheet_name='step_{}'.format(n_step + 1))
with open('../data/opt_cnps/photoredox_BO.pkl', 'wb') as bo_file:
    pickle.dump(opt, file=bo_file)

## 2.3 Step 2

In [7]:
n_step = 2
# Loading feature matrix and experimental results
pca_features = np.load('../data/opt_cnps/pca_features.npy')
dft_df = pd.read_excel('../data/560_DFT_result.xlsx', index_col='ID')
exp_data_path = '../data/opt_cnps/exp_data_steps.xlsx'
exp_x, exp_y, exp_df = bo.load_exp_data(
    exp_df_path=exp_data_path, sheet_name='step_{}'.format(n_step), 
    samples=pca_features, return_df=True
)
exp_df

Unnamed: 0_level_0,name,smiles,yield
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
110,Ra04_Rb027,N#Cc1c(-c2ccc(Br)cc2)nc(-c2ccc(Br)cc2)c(C#N)c1...,0.666667
463,Ra17_Rb016,CCn1c2ccccc2c2cc(-c3c(C#N)c(-c4cccc(OC)c4)nc(-...,48.666667
207,Ra08_Rb012,N#Cc1c(-c2ccc(-c3ccccc3)cc2)nc(-c2ccc(-c3ccccc...,0.0
69,Ra03_Rb014,N#Cc1ccc(-c2nc(-c3ccc(C#N)cc3)c(C#N)c(-c3ccc(-...,2.0
13,Ra01_Rb014,N#Cc1c(-c2ccccc2)nc(-c2ccccc2)c(C#N)c1-c1ccc(-...,0.0
559,Ra21_Rb028,N#Cc1ccc(-c2ccc(-c3c(C#N)c(-c4cccc(C(F)(F)F)c4...,1.0


In [48]:
# Next step parallel suggestion parameters
parallel_param = bo.kwargs_generator(mean=2.5, size=12)
parallel_param

{'kappa': [0.6747718851721768,
  0.7111855852527105,
  1.2475096429607242,
  1.261152794223848,
  2.02483320242886,
  2.5175981221321138,
  2.8769931290266726,
  3.1477716805522373,
  3.8062273853060598,
  4.293332641560919,
  5.522346926734617,
  16.370051739350757]}

In [28]:
# Load previous BO_object
with open('../data/opt_cnps/photoredox_BO.pkl', 'rb') as bo_file:
    opt = pickle.load(bo_file)
# Tell experimental results and fitting the GPs
opt.tell(x=exp_x, y=exp_y)
opt.base_estimator.kernel_

0.316**2 * Matern(length_scale=[0.441, 0.652, 2.61e+03, 0.347, 0.804], nu=2.5)

In [90]:
# Get the suggestion
opt.sampling = pca_features
next_x = opt.parallel_ask(acq_func_args=parallel_param, num_samples=1)
next_df = bo.get_next_df(suggested_x=next_x, parallel_param=parallel_param, 
                      samples=pca_features, df=dft_df)
next_df = next_df.reindex(columns=['name', 'SMILE', 'kappa', 'mean'])
x = np.array(next_x).reshape(-1, 5)
# Get the mean values from GPs
next_df.loc[:, 'mean'] = list(opt.base_estimator.predict(X=x).reshape(-1,))
next_df

Unnamed: 0,name,SMILE,kappa,mean
302,Ra11_Rb023,CN(c1ccccc1)c1ccc(-c2c(C#N)c(-c3ccc(Cl)cc3)nc(...,0.674772,0.503823
554,Ra21_Rb023,CN(c1ccccc1)c1ccc(-c2c(C#N)c(-c3cccc(C(F)(F)F)...,0.711186,0.488488
50,Ra02_Rb023,CN(c1ccccc1)c1ccc(-c2c(C#N)c(-c3cccc(C#N)c3)nc...,1.24751,0.490376
190,Ra07_Rb023,CN(c1ccccc1)c1ccc(-c2c(C#N)c(-c3ccc(C(F)(F)F)c...,1.261153,0.490311
386,Ra14_Rb023,CN(c1ccccc1)c1ccc(-c2c(C#N)c(-c3cccc(Br)c3)nc(...,2.024833,0.465571
333,Ra12_Rb026,Cc1ccc(N(c2ccc(C)cc2)c2ccc(-c3c(C#N)c(-c4cccc(...,2.517598,0.358421
193,Ra07_Rb026,Cc1ccc(N(c2ccc(C)cc2)c2ccc(-c3c(C#N)c(-c4ccc(C...,2.876993,0.350998
137,Ra05_Rb026,Cc1ccc(-c2nc(-c3ccc(C)cc3)c(C#N)c(-c3ccc(N(c4c...,3.147772,0.338061
473,Ra17_Rb026,COc1cccc(-c2nc(-c3cccc(OC)c3)c(C#N)c(-c3ccc(N(...,3.806227,0.338125
417,Ra15_Rb026,Cc1ccc(N(c2ccc(C)cc2)c2ccc(-c3c(C#N)c(-c4cccc(...,4.293333,0.335614


In [92]:
# Append the next suggestion sheet to the original Excel file
with pd.ExcelWriter('../data/opt_cnps/suggested_cnps.xlsx', mode='a') as excel_writer:
    next_df.to_excel(excel_writer, sheet_name='step_{}'.format(n_step + 1))
with open('../data/opt_cnps/photoredox_BO.pkl', 'wb') as bo_file:
    pickle.dump(opt, file=bo_file)

## 2.4 Step 3

In [8]:
n_steps = 3
# Loading feature matrix and experimental results
pca_features = np.load('../data/opt_cnps/pca_features.npy')
dft_df = pd.read_excel('../data/560_DFT_result.xlsx', index_col='ID')
exp_data_path = '../data/opt_cnps/exp_data_steps.xlsx'
exp_x, exp_y, exp_df = bo.load_exp_data(
    exp_df_path=exp_data_path, sheet_name='step_{}'.format(n_step), 
    samples=pca_features, return_df=True
)
exp_df

Unnamed: 0_level_0,name,smiles,yield
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
110,Ra04_Rb027,N#Cc1c(-c2ccc(Br)cc2)nc(-c2ccc(Br)cc2)c(C#N)c1...,0.666667
463,Ra17_Rb016,CCn1c2ccccc2c2cc(-c3c(C#N)c(-c4cccc(OC)c4)nc(-...,48.666667
207,Ra08_Rb012,N#Cc1c(-c2ccc(-c3ccccc3)cc2)nc(-c2ccc(-c3ccccc...,0.0
69,Ra03_Rb014,N#Cc1ccc(-c2nc(-c3ccc(C#N)cc3)c(C#N)c(-c3ccc(-...,2.0
13,Ra01_Rb014,N#Cc1c(-c2ccccc2)nc(-c2ccccc2)c(C#N)c1-c1ccc(-...,0.0
559,Ra21_Rb028,N#Cc1ccc(-c2ccc(-c3c(C#N)c(-c4cccc(C(F)(F)F)c4...,1.0


In [13]:
# Load previous BO_object
with open('../data/opt_cnps/photoredox_BO.pkl', 'rb') as bo_file:
    opt = pickle.load(bo_file)
# Tell experimental results and fitting the GPs
opt.tell(x=exp_x, y=exp_y)
opt.base_estimator.kernel_

0.316**2 * Matern(length_scale=[4.15e+03, 0.278, 1.37, 5.22e+04, 0.174], nu=2.5)

In [46]:
# Next step parallel suggestion parameters
parallel_param = bo.kwargs_generator(mean=2.5, size=12)
parallel_param

{'kappa': [0.20827958019421752,
  0.2628539344457737,
  1.2035524867291811,
  1.4988834920957204,
  1.606561673882382,
  1.7503199107686929,
  2.266072206327105,
  3.1657580240746315,
  3.229847529178021,
  3.2848465998459417,
  4.332779383370244,
  4.915127670014655]}

In [50]:
# Get the suggestion
opt.sampling = pca_features
next_x = opt.parallel_ask(acq_func_args=parallel_param, num_samples=1)
next_df = bo.get_next_df(suggested_x=next_x, parallel_param=parallel_param, 
                      samples=pca_features, df=dft_df)
next_df = next_df.reindex(columns=['name', 'SMILE', 'kappa', 'mean'])
x = np.array(next_x).reshape(-1, 5)
# Get the mean values from GPs
next_df.loc[:, 'mean'] = list(opt.base_estimator.predict(X=x).reshape(-1,))
next_df

Unnamed: 0,name,SMILE,kappa,mean
239,Ra09_Rb016,CCn1c2ccccc2c2cc(-c3c(C#N)c(-c4ccc(OC)cc4)nc(-...,0.20828,0.524754
464,Ra17_Rb017,COc1cccc(-c2nc(-c3cccc(OC)c3)c(C#N)c(-c3ccc4c(...,0.262854,0.43215
243,Ra09_Rb020,COc1ccc(-c2nc(-c3ccc(OC)cc3)c(C#N)c(-c3ccc(-n4...,1.203552,0.391405
553,Ra21_Rb022,Cc1ccc(-c2c(C#N)c(-c3cccc(C(F)(F)F)c3)nc(-c3cc...,1.498883,0.331049
168,Ra07_Rb001,N#Cc1c(-c2ccc(C(F)(F)F)cc2)nc(-c2ccc(C(F)(F)F)...,1.606562,0.327642
300,Ra11_Rb021,N#Cc1c(-c2ccc(Cl)cc2)nc(-c2ccc(Cl)cc2)c(C#N)c1...,1.75032,0.26366
104,Ra04_Rb021,N#Cc1c(-c2ccc(Br)cc2)nc(-c2ccc(Br)cc2)c(C#N)c1...,2.266072,0.2588
263,Ra10_Rb012,N#Cc1ccc(-c2ccc(-c3nc(-c4ccc(-c5ccc(C#N)cc5)cc...,3.165758,0.055712
328,Ra12_Rb021,Cc1cccc(-c2nc(-c3cccc(C)c3)c(C#N)c(-c3ccc4ccc5...,3.229848,0.252654
132,Ra05_Rb021,Cc1ccc(-c2nc(-c3ccc(C)cc3)c(C#N)c(-c3ccc4ccc5c...,3.284847,0.25311


In [51]:
# Append the next suggestion sheet to the original Excel file
with pd.ExcelWriter('../data/opt_cnps/suggested_cnps.xlsx', mode='a') as excel_writer:
    next_df.to_excel(excel_writer, sheet_name='step_{}'.format(n_step + 1))
with open('../data/opt_cnps/photoredox_BO.pkl', 'wb') as bo_file:
    pickle.dump(opt, file=bo_file)

## 2.5 Step 4

In [10]:
n_step = 4
# Loading feature matrix and experimental results
pca_features = np.load('../data/opt_cnps/pca_features.npy')
dft_df = pd.read_excel('../data/560_DFT_result.xlsx', index_col='ID')
exp_data_path = '../data/opt_cnps/exp_data_steps.xlsx'
exp_x, exp_y, exp_df = bo.load_exp_data(
    exp_df_path=exp_data_path, sheet_name='step_{}'.format(n_step), 
    samples=pca_features, return_df=True
)
exp_df

Unnamed: 0_level_0,name,smiles,yield
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
239,Ra09_Rb016,CCn1c2ccccc2c2cc(-c3c(C#N)c(-c4ccc(OC)cc4)nc(-...,52.333333
464,Ra17_Rb017,COc1cccc(-c2nc(-c3cccc(OC)c3)c(C#N)c(-c3ccc4c(...,43.0
243,Ra09_Rb020,COc1ccc(-c2nc(-c3ccc(OC)cc3)c(C#N)c(-c3ccc(-n4...,29.5
168,Ra07_Rb001,N#Cc1c(-c2ccc(C(F)(F)F)cc2)nc(-c2ccc(C(F)(F)F)...,0.0
263,Ra10_Rb012,N#Cc1ccc(-c2ccc(-c3nc(-c4ccc(-c5ccc(C#N)cc5)cc...,0.0
328,Ra12_Rb021,Cc1cccc(-c2nc(-c3cccc(C)c3)c(C#N)c(-c3ccc4ccc5...,22.5
132,Ra05_Rb021,Cc1ccc(-c2nc(-c3ccc(C)cc3)c(C#N)c(-c3ccc4ccc5c...,32.0
153,Ra06_Rb014,N#Cc1c(-c2ccc(F)cc2)nc(-c2ccc(F)cc2)c(C#N)c1-c...,0.0


In [7]:
# Load previous BO_object
with open('../data/opt_cnps/photoredox_BO.pkl', 'rb') as bo_file:
    opt = pickle.load(bo_file)
# Tell experimental results and fitting the GPs
opt.tell(x=exp_x, y=exp_y)
opt.base_estimator.kernel_

0.316**2 * Matern(length_scale=[0.226, 1.35e+03, 1.34, 0.0954, 6.37e+03], nu=2.5)

In [16]:
# Step 3 parallel suggestion parameters
parallel_param = bo.kwargs_generator(mean=2.5, size=12, name='kappa')
parallel_param

{'kappa': [0.14813485305398716,
  0.2348219695022978,
  0.4170771299660344,
  0.43399178110196507,
  0.5611334059528091,
  0.8406228621748734,
  1.7471099842442945,
  1.9113209027504232,
  2.6901450890607985,
  3.3987305464134723,
  4.582918320507437,
  13.259032567389191]}

In [20]:
# Get the suggestion
opt.sampling = pca_features
next_x = opt.parallel_ask(acq_func_args=parallel_param, num_samples=1)
next_df = bo.get_next_df(suggested_x=next_x, parallel_param=parallel_param, 
                      samples=pca_features, df=dft_df)
next_df = next_df.reindex(columns=['name', 'SMILE', 'kappa', 'mean'])
x = np.array(next_x).reshape(-1, 5)
# Get the mean values from GPs
next_df.loc[:, 'mean'] = list(opt.base_estimator.predict(X=x).reshape(-1,))
next_df

Unnamed: 0,name,SMILE,kappa,mean
519,Ra20_Rb016,CCn1c2ccccc2c2cc(-c3c(C#N)c(-c4cccc(I)c4)nc(-c...,0.148135,0.541791
240,Ra09_Rb017,COc1ccc(-c2nc(-c3ccc(OC)cc3)c(C#N)c(-c3ccc4c(c...,0.234822,0.516101
338,Ra13_Rb003,N#Cc1c(-c2ccc3ccccc3c2)nc(-c2ccc3ccccc3c2)c(C#...,0.417077,0.483356
351,Ra13_Rb016,CCn1c2ccccc2c2cc(-c3c(C#N)c(-c4ccc5ccccc5c4)nc...,0.433992,0.470216
128,Ra05_Rb017,Cc1ccc(-c2nc(-c3ccc(C)cc3)c(C#N)c(-c3ccc4c(c3)...,0.561133,0.491938
491,Ra18_Rb016,CCn1c2ccccc2c2cc(-c3c(C#N)c(-c4ccc(I)cc4)nc(-c...,0.840623,0.470588
269,Ra10_Rb018,N#Cc1ccc(-c2ccc(-c3nc(-c4ccc(-c5ccc(C#N)cc5)cc...,1.74711,0.34639
242,Ra09_Rb019,COc1ccc(-c2nc(-c3ccc(OC)cc3)c(C#N)c(-c3ccc(N(c...,1.911321,0.160676
130,Ra05_Rb019,COc1ccc(N(c2ccc(OC)cc2)c2ccc(-c3c(C#N)c(-c4ccc...,2.690145,0.159955
158,Ra06_Rb019,COc1ccc(N(c2ccc(OC)cc2)c2ccc(-c3c(C#N)c(-c4ccc...,3.398731,0.155669


In [22]:
# Append the next suggestion sheet to the original Excel file
with pd.ExcelWriter('../data/opt_cnps/suggested_cnps.xlsx', mode='a') as excel_writer:
    next_df.to_excel(excel_writer, sheet_name='step_{}'.format(n_step + 1))
with open('../data/opt_cnps/photoredox_BO.pkl', 'wb') as bo_file:
    pickle.dump(opt, file=bo_file)

## 2.6 Step 5

In [11]:
n_step = 5
# Loading feature matrix and experimental results
pca_features = np.load('../data/opt_cnps/pca_features.npy')
dft_df = pd.read_excel('../data/560_DFT_result.xlsx', index_col='ID')
exp_data_path = '../data/opt_cnps/exp_data_steps.xlsx'
exp_x, exp_y, exp_df = bo.load_exp_data(
    exp_df_path=exp_data_path, sheet_name='step_{}'.format(n_step), 
    samples=pca_features, return_df=True
)
exp_df

Unnamed: 0_level_0,name,smiles,yield
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
519,Ra20_Rb016,CCn1c2ccccc2c2cc(-c3c(C#N)c(-c4cccc(I)c4)nc(-c...,32.0
240,Ra09_Rb017,COc1ccc(-c2nc(-c3ccc(OC)cc3)c(C#N)c(-c3ccc4c(c...,48.666667
338,Ra13_Rb003,N#Cc1c(-c2ccc3ccccc3c2)nc(-c2ccc3ccccc3c2)c(C#...,1.0
351,Ra13_Rb016,CCn1c2ccccc2c2cc(-c3c(C#N)c(-c4ccc5ccccc5c4)nc...,1.0
128,Ra05_Rb017,Cc1ccc(-c2nc(-c3ccc(C)cc3)c(C#N)c(-c3ccc4c(c3)...,54.666667
491,Ra18_Rb016,CCn1c2ccccc2c2cc(-c3c(C#N)c(-c4ccc(I)cc4)nc(-c...,40.0
269,Ra10_Rb018,N#Cc1ccc(-c2ccc(-c3nc(-c4ccc(-c5ccc(C#N)cc5)cc...,7.0
130,Ra05_Rb019,COc1ccc(N(c2ccc(OC)cc2)c2ccc(-c3c(C#N)c(-c4ccc...,0.0
74,Ra03_Rb019,COc1ccc(N(c2ccc(OC)cc2)c2ccc(-c3c(C#N)c(-c4ccc...,0.0
55,Ra02_Rb028,N#Cc1ccc(-c2ccc(-c3c(C#N)c(-c4cccc(C#N)c4)nc(-...,0.666667


In [222]:
# Load previous BO_object
with open('../data/opt_cnps/photoredox_BO.pkl', 'rb') as bo_file:
    opt = pickle.load(bo_file)
# Tell experimental results and fitting the GPs
opt.tell(x=exp_x, y=exp_y)
opt.base_estimator.kernel_

0.316**2 * Matern(length_scale=[0.236, 3.65e+05, 1.66e+03, 0.108, 0.269], nu=2.5)

In [17]:
# Step 5 parallel suggestion parameters
parallel_param = bo.kwargs_generator(mean=2.5, size=12)
parallel_param

{'kappa': [0.04584220041133728,
  0.06287603654546714,
  0.325120289030415,
  0.5082135255837025,
  0.6954883136767915,
  2.3145651639860247,
  2.770295610157596,
  2.952767743939239,
  3.1806863847782973,
  3.215367865522933,
  3.5161296925662446,
  3.9136864744130917]}

In [224]:
# Get the suggestion
opt.sampling = pca_features
next_x = opt.parallel_ask(acq_func_args=parallel_param, num_samples=1)
next_df = bo.get_next_df(suggested_x=next_x, parallel_param=parallel_param, 
                      samples=pca_features, df=dft_df)
next_df = next_df.reindex(columns=['name', 'SMILE', 'kappa', 'mean'])
x = np.array(next_x).reshape(-1, 5)
# Get the mean values from GPs
next_df.loc[:, 'mean'] = list(opt.base_estimator.predict(X=x).reshape(-1,))
next_df

Unnamed: 0,name,SMILE,kappa,mean
127,Ra05_Rb016,CCn1c2ccccc2c2cc(-c3c(C#N)c(-c4ccc(C)cc4)nc(-c...,0.045842,0.508614
323,Ra12_Rb016,CCn1c2ccccc2c2cc(-c3c(C#N)c(-c4cccc(C)c4)nc(-c...,0.062876,0.500055
244,Ra09_Rb021,COc1ccc(-c2nc(-c3ccc(OC)cc3)c(C#N)c(-c3ccc4ccc...,0.32512,0.478969
156,Ra06_Rb017,Cn1c2ccccc2c2cc(-c3c(C#N)c(-c4ccc(F)cc4)nc(-c4...,0.508214,0.492417
459,Ra17_Rb012,COc1cccc(-c2nc(-c3cccc(OC)c3)c(C#N)c(-c3ccc(C#...,0.695488,0.329406
279,Ra10_Rb028,N#Cc1ccc(-c2ccc(-c3nc(-c4ccc(-c5ccc(C#N)cc5)cc...,2.314565,0.087532
256,Ra10_Rb005,C#Cc1ccc(-c2c(C#N)c(-c3ccc(-c4ccc(C#N)cc4)cc3)...,2.770296,0.076839
56,Ra03_Rb001,N#Cc1ccc(-c2nc(-c3ccc(C#N)cc3)c(C#N)c(-c3cccc4...,2.952768,0.076407
54,Ra02_Rb027,N#Cc1cccc(-c2nc(-c3cccc(C#N)c3)c(C#N)c(-c3ccc(...,3.180686,0.060219
29,Ra02_Rb002,N#Cc1cccc(-c2nc(-c3cccc(C#N)c3)c(C#N)c(-c3ccc(...,3.215368,0.055999


In [194]:
# Append the next suggestion sheet to the original Excel file
with pd.ExcelWriter('../data/opt_cnps/suggested_cnps.xlsx', mode='a') as excel_writer:
    next_df.to_excel(excel_writer, sheet_name='step_{}'.format(n_step + 1))
# Save the BO object
with open('../data/opt_cnps/photoredox_BO.pkl', 'wb') as bo_file:
    pickle.dump(opt, file=bo_file)

## 2.7 Step 6

In [12]:
n_step = 6
# Loading feature matrix and experimental results
pca_features = np.load('../data/opt_cnps/pca_features.npy')
dft_df = pd.read_excel('../data/560_DFT_result.xlsx', index_col='ID')
exp_data_path = '../data/opt_cnps/exp_data_steps.xlsx'
exp_x, exp_y, exp_df = bo.load_exp_data(
    exp_df_path=exp_data_path, sheet_name='step_{}'.format(n_step), 
    samples=pca_features, return_df=True
)
exp_df

Unnamed: 0_level_0,name,smiles,yield
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
127,Ra05_Rb016,CCn1c2ccccc2c2cc(-c3c(C#N)c(-c4ccc(C)cc4)nc(-c...,66.666667
323,Ra12_Rb016,COc1ccc(-c2nc(-c3ccc(OC)cc3)c(C#N)c(-c3ccc4c(c...,53.666667
244,Ra09_Rb021,N#Cc1c(-c2ccc3ccccc3c2)nc(-c2ccc3ccccc3c2)c(C#...,32.5
459,Ra17_Rb012,COc1cccc(-c2nc(-c3cccc(OC)c3)c(C#N)c(-c3ccc(C#...,0.0
56,Ra03_Rb001,N#Cc1ccc(-c2nc(-c3ccc(C#N)cc3)c(C#N)c(-c3cccc4...,1.0
234,Ra09_Rb011,COc1ccc(-c2nc(-c3ccc(OC)cc3)c(C#N)c(-c3ccc4c(c...,38.0


In [6]:
# Load previous BO_object
with open('../data/opt_cnps/photoredox_BO.pkl', 'rb') as bo_file:
    opt = pickle.load(bo_file)
# Tell experimental results and fitting the GPs
opt.tell(x=exp_x, y=exp_y)
opt.base_estimator.kernel_

0.316**2 * Matern(length_scale=[2.17, 1e+06, 9.19e+03, 0.0577, 0.169], nu=2.5)

In [8]:
# Step 5 parallel suggestion parameters
parallel_param = bo.kwargs_generator(mean=2.5, size=12)
parallel_param

{'kappa': [0.2951568355459849,
  0.47647837643559093,
  0.9800181969948213,
  1.038714577619687,
  1.3135529934619443,
  1.3288522313711202,
  1.3814107235306186,
  3.0040949967458435,
  3.3937106141837843,
  3.491008270073936,
  3.820943269131958,
  4.867182465818417]}

In [9]:
# Get the suggestion
opt.sampling = pca_features
next_x = opt.parallel_ask(acq_func_args=parallel_param, num_samples=1)
next_df = bo.get_next_df(suggested_x=next_x, parallel_param=parallel_param, 
                      samples=pca_features, df=dft_df)
next_df = next_df.reindex(columns=['name', 'SMILE', 'kappa', 'mean'])
x = np.array(next_x).reshape(-1, 5)
# Get the mean values from GPs
next_df.loc[:, 'mean'] = list(opt.base_estimator.predict(X=x).reshape(-1,))
next_df

Unnamed: 0,name,SMILE,kappa,mean
122,Ra05_Rb011,Cc1ccc(-c2nc(-c3ccc(C)cc3)c(C#N)c(-c3ccc4c(c3)...,0.295157,0.680056
502,Ra18_Rb027,N#Cc1c(-c2ccc(I)cc2)nc(-c2ccc(I)cc2)c(C#N)c1-c...,0.476478,0.656313
308,Ra12_Rb001,Cc1cccc(-c2nc(-c3cccc(C)c3)c(C#N)c(-c3cccc4ccc...,0.980018,0.630278
530,Ra20_Rb027,N#Cc1c(-c2cccc(I)c2)nc(-c2cccc(I)c2)c(C#N)c1-c...,1.038715,0.610266
303,Ra11_Rb024,Cc1ccc(-c2ccc(-c3c(C#N)c(-c4ccc(Cl)cc4)nc(-c4c...,1.313553,0.650861
138,Ra05_Rb027,Cc1ccc(-c2nc(-c3ccc(C)cc3)c(C#N)c(-c3ccc(-c4cc...,1.328852,0.648977
135,Ra05_Rb024,Cc1ccc(-c2ccc(-c3c(C#N)c(-c4ccc(C)cc4)nc(-c4cc...,1.381411,0.558402
54,Ra02_Rb027,N#Cc1cccc(-c2nc(-c3cccc(C#N)c3)c(C#N)c(-c3ccc(...,3.004095,0.123981
29,Ra02_Rb002,N#Cc1cccc(-c2nc(-c3cccc(C#N)c3)c(C#N)c(-c3ccc(...,3.393711,0.121014
191,Ra07_Rb024,Cc1ccc(-c2ccc(-c3c(C#N)c(-c4ccc(C(F)(F)F)cc4)n...,3.491008,0.115032


In [10]:
# Append the next suggestion sheet to the original Excel file
with pd.ExcelWriter('../data/opt_cnps/suggested_cnps.xlsx', mode='a') as excel_writer:
    next_df.to_excel(excel_writer, sheet_name='step_{}'.format(n_step + 1))
# Save the BO object
with open('../data/opt_cnps/photoredox_BO.pkl', 'wb') as bo_file:
    pickle.dump(opt, file=bo_file)

## 2.8 Step 7

In [13]:
n_step = 7
# Loading feature matrix and experimental results
pca_features = np.load('../data/opt_cnps/pca_features.npy')
dft_df = pd.read_excel('../data/560_DFT_result.xlsx', index_col='ID')
exp_data_path = '../data/opt_cnps/exp_data_steps.xlsx'
exp_x, exp_y, exp_df = bo.load_exp_data(
    exp_df_path=exp_data_path, sheet_name='step_{}'.format(n_step), 
    samples=pca_features, return_df=True
)
exp_df

Unnamed: 0_level_0,name,smiles,yield
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
122,Ra05_Rb011,Cc1ccc(-c2nc(-c3ccc(C)cc3)c(C#N)c(-c3ccc4c(c3)...,53.0
303,Ra11_Rb024,Cc1ccc(-c2ccc(-c3c(C#N)c(-c4ccc(Cl)cc4)nc(-c4c...,0.666667
135,Ra05_Rb024,Cc1ccc(-c2ccc(-c3c(C#N)c(-c4ccc(C)cc4)nc(-c4cc...,0.0
29,Ra02_Rb002,N#Cc1cccc(-c2nc(-c3cccc(C#N)c3)c(C#N)c(-c3ccc(...,0.333333
327,Ra12_Rb020,Cc1cccc(-c2nc(-c3cccc(C)c3)c(C#N)c(-c3ccc(-n4c...,23.0
156,Ra06_Rb017,Cn1c2ccccc2c2cc(-c3c(C#N)c(-c4ccc(F)cc4)nc(-c4...,51.0
502,Ra18_Rb027,N#Cc1c(-c2ccc(I)cc2)nc(-c2ccc(I)cc2)c(C#N)c1-c...,0.0
138,Ra05_Rb027,Cc1ccc(-c2nc(-c3ccc(C)cc3)c(C#N)c(-c3ccc(-c4cc...,0.0


In [4]:
# Load previous BO_object
with open('../data/opt_cnps/photoredox_BO.pkl', 'rb') as bo_file:
    opt = pickle.load(bo_file)
# Tell experimental results and fitting the GPs
opt.tell(x=exp_x, y=exp_y)
opt.base_estimator.kernel_

0.316**2 * Matern(length_scale=[0.0374, 0.0313, 0.228, 8.15e+03, 2.75e+05], nu=2.5)

In [19]:
# Step 5 parallel suggestion parameters
parallel_param = bo.util.kwargs_generator(mean=2.5, size=12, name='kappa')
parallel_param

{'kappa': [0.10889742824207856,
  0.21646694036122738,
  0.45079677871247537,
  0.937074841405609,
  1.3760757504993149,
  1.6289397317912577,
  1.9498860056548615,
  2.5101832788002447,
  2.9570800480263157,
  3.0227228519036404,
  3.3724298023408967,
  4.90927343769168]}

In [6]:
# Get the suggestion
opt.sampling = pca_features
next_x = opt.parallel_ask(acq_func_args=parallel_param, num_samples=1)
next_df = bo.util.get_next_df(suggested_x=next_x, parallel_param=parallel_param, 
                      samples=pca_features, df=dft_df)
next_df = next_df.reindex(columns=['name', 'SMILE', 'kappa', 'mean'])
x = np.array(next_x).reshape(-1, 5)
# Get the mean values from GPs
next_df.loc[:, 'mean'] = list(opt.base_estimator.predict(X=x).reshape(-1,))
next_df

Unnamed: 0,name,SMILE,kappa,mean
155,Ra06_Rb016,CCn1c2ccccc2c2cc(-c3c(C#N)c(-c4ccc(F)cc4)nc(-c...,0.047995,0.664804
318,Ra12_Rb011,Cc1cccc(-c2nc(-c3cccc(C)c3)c(C#N)c(-c3ccc4c(c3...,0.525416,0.647941
63,Ra03_Rb008,COc1ccc2cc(-c3c(C#N)c(-c4ccc(C#N)cc4)nc(-c4ccc...,0.58791,0.588326
371,Ra14_Rb008,COc1ccc2cc(-c3c(C#N)c(-c4cccc(Br)c4)nc(-c4cccc...,1.670862,0.55758
175,Ra07_Rb008,COc1ccc2cc(-c3c(C#N)c(-c4ccc(C(F)(F)F)cc4)nc(-...,1.684856,0.476545
427,Ra16_Rb008,COc1ccc2cc(-c3c(C#N)c(-c4cccc(Cl)c4)nc(-c4cccc...,2.009551,0.449144
539,Ra21_Rb008,COc1ccc2cc(-c3c(C#N)c(-c4cccc(C(F)(F)F)c4)nc(-...,2.107478,0.456593
91,Ra04_Rb008,COc1ccc2cc(-c3c(C#N)c(-c4ccc(Br)cc4)nc(-c4ccc(...,2.207981,0.443307
287,Ra11_Rb008,COc1ccc2cc(-c3c(C#N)c(-c4ccc(Cl)cc4)nc(-c4ccc(...,3.160213,0.414493
399,Ra15_Rb008,COc1ccc2cc(-c3c(C#N)c(-c4cccc(F)c4)nc(-c4cccc(...,3.83216,0.394832


In [7]:
# Append the next suggestion sheet to the original Excel file
with pd.ExcelWriter('../data/opt_cnps/suggested_cnps.xlsx', mode='a') as excel_writer:
    next_df.to_excel(excel_writer, sheet_name='step_{}'.format(n_step + 1))
# Save the BO object
with open('../data/opt_cnps/photoredox_BO.pkl', 'wb') as bo_file:
    pickle.dump(opt, file=bo_file)