## Bayesian Optimisation for Antimicrobial Polymer Discovery

In [8]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from matplotlib import rcParams
import torch
from botorch.models import SingleTaskGP, ModelListGP, fully_bayesian
from botorch.models.gp_regression_mixed import MixedSingleTaskGP
from botorch.fit import fit_gpytorch_mll, fit_fully_bayesian_model_nuts
from botorch.utils import standardize
from gpytorch.mlls import ExactMarginalLogLikelihood
from botorch.acquisition.monte_carlo import qExpectedImprovement, qUpperConfidenceBound
from botorch.acquisition.analytic import UpperConfidenceBound, ProbabilityOfImprovement, ExpectedImprovement
from botorch.optim import optimize_acqf, optimize_acqf_mixed
from botorch.cross_validation import gen_loo_cv_folds
import math
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_curve, auc
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF, Matern, DotProduct, RationalQuadratic, WhiteKernel
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

## Retrieve Training dataset and assign variables
To enable mixed search space, utilise MixedSingleTaskGP which uses a special kernel to combine continuous and categorical data.

In [9]:
data = pd.read_excel('dataset_final.xlsx', sheet_name = 'Dataset_Complete_modified')
data

Unnamed: 0,Polymer Index,type_A,type_B1,type_B2,type_C,composition_A,composition_B1,composition_B2,composition_C,block_sequence_theoretical,...,A2,B2,C2,A3,B3,C3,A4,B4,C4,cLogP_predicted
0,1,Boc-AEAm,PEAm,,HEAm,0.5,0.30,0.00,0.20,ABC,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.67630
1,2,Boc-AEAm,PEAm,,HEAm,0.5,0.30,0.00,0.20,ABC,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.67630
2,3,Boc-AEAm,PEAm,,HEAm,0.5,0.30,0.00,0.20,ABC,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.67630
3,4,Boc-AEAm,PEAm,,,0.7,0.30,0.00,0.00,AB,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,1.05270
4,5,Boc-AEAm,PEAm,,,0.7,0.30,0.00,0.00,AB,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,1.05270
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152,157,AAPTAC,PEAm,NIPAm,HEAm,0.3,0.00,0.47,0.23,ABC,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,-1.33796
153,158,AAPTAC,PEAm,NIPAm,HEAm,0.3,0.70,0.00,0.00,ABC,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,-0.16180
154,159,AAPTAC,PEAm,NIPAm,HEAm,0.3,0.47,0.23,0.00,ABC,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,-0.45137
155,160,AAPTAC,PEAm,NIPAm,HEAm,0.3,0.23,0.47,0.00,ABC,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,-0.75353


In [10]:
data.corr()

  data.corr()


Unnamed: 0,Polymer Index,composition_A,composition_B1,composition_B2,composition_C,Number of blocks,dpn,Target,NMR,GPC,...,A2,B2,C2,A3,B3,C3,A4,B4,C4,cLogP_predicted
Polymer Index,1.0,-0.237395,-0.209524,0.402446,0.045304,-0.515925,-0.249226,-0.410508,-0.556035,-0.257679,...,-0.243827,-0.438501,-0.387557,-0.226511,-0.190109,-0.172808,,-0.151171,-0.151171,-0.679035
composition_A,-0.237395,1.0,-0.282589,-0.310371,-0.41245,-0.010745,-0.004903,0.113958,0.207758,-0.182915,...,0.045402,-0.031455,-0.029569,-0.01607,-0.015225,-0.013064,,-0.01129,-0.01129,-0.086684
composition_B1,-0.209524,-0.282589,1.0,-0.399126,-0.309069,0.271207,0.120026,0.257981,0.2735,0.010997,...,0.129752,0.221584,0.180416,0.11888,0.112632,0.096645,,0.083517,0.083517,0.36422
composition_B2,0.402446,-0.310371,-0.399126,1.0,-0.286314,-0.275958,-0.11763,-0.274649,-0.471571,-0.175124,...,-0.127162,-0.228049,-0.214375,-0.116507,-0.110385,-0.094716,,-0.08185,-0.08185,-0.102905
composition_C,0.045304,-0.41245,-0.309069,-0.286314,1.0,0.016395,0.002952,-0.096356,-0.01039,0.344606,...,-0.047429,0.038591,0.063775,0.014118,0.013376,0.011478,,0.009919,0.009919,-0.170816
Number of blocks,-0.515925,-0.010745,0.271207,-0.275958,0.016395,1.0,0.280251,0.386098,0.408109,0.324969,...,0.45214,0.623867,0.595649,0.632507,0.467679,0.401295,,0.541927,0.541927,0.222334
dpn,-0.249226,-0.004903,0.120026,-0.11763,0.002952,0.280251,1.0,0.884207,0.766664,0.784606,...,0.114781,0.265455,0.22288,0.203834,0.084057,0.118917,,0.062328,0.062328,0.096098
Target,-0.410508,0.113958,0.257981,-0.274649,-0.096356,0.386098,0.884207,1.0,0.889259,0.813675,...,0.128931,0.29796,0.25631,0.312228,0.089902,0.116247,,0.236861,0.236861,0.118706
NMR,-0.556035,0.207758,0.2735,-0.471571,-0.01039,0.408109,0.766664,0.889259,1.0,0.679494,...,0.141521,0.34314,0.223529,0.312518,0.110646,0.178843,,0.162049,0.162049,0.229514
GPC,-0.257679,-0.182915,0.010997,-0.175124,0.344606,0.324969,0.784606,0.813675,0.679494,1.0,...,0.075559,0.213651,0.229229,0.319477,0.074642,0.083392,,0.294095,0.294095,-0.272896


In [11]:
data = data.drop(columns=['Polymer Index','Dispersity','clogP','block_sequence_theoretical', 'block_sequence_experimental','MIC_PAO1', 'MIC_PA','MIC_EC', 'MIC_AB', 'MIC_SA', 'MIC_MSmeg','GPC','Target','NMR'])
data = data.replace({'>128':128,'>256':256, '32-64':64, '64-128':128,'128-256':256})

In [12]:
number_of_good_polymers = [i for i in data['MIC_PAO1_PA'] if i<=64]
len(number_of_good_polymers) # 27 good polymers

27

Convert string types to encode 1, 2 or 3

## Assign Classes

In [13]:
data['Category'] = data['MIC_PAO1_PA'].apply(lambda x: 1 if x <= 64 else 0)
data = data.drop(columns = ['MIC_PAO1_PA'])
data

Unnamed: 0,type_A,type_B1,type_B2,type_C,composition_A,composition_B1,composition_B2,composition_C,Number of blocks,dpn,...,B2,C2,A3,B3,C3,A4,B4,C4,cLogP_predicted,Category
0,Boc-AEAm,PEAm,,HEAm,0.5,0.30,0.00,0.20,1,100,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.67630,1
1,Boc-AEAm,PEAm,,HEAm,0.5,0.30,0.00,0.20,1,40,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.67630,1
2,Boc-AEAm,PEAm,,HEAm,0.5,0.30,0.00,0.20,1,20,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.67630,1
3,Boc-AEAm,PEAm,,,0.7,0.30,0.00,0.00,1,100,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,1.05270,0
4,Boc-AEAm,PEAm,,,0.7,0.30,0.00,0.00,1,40,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,1.05270,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152,AAPTAC,PEAm,NIPAm,HEAm,0.3,0.00,0.47,0.23,1,40,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,-1.33796,0
153,AAPTAC,PEAm,NIPAm,HEAm,0.3,0.70,0.00,0.00,1,40,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,-0.16180,0
154,AAPTAC,PEAm,NIPAm,HEAm,0.3,0.47,0.23,0.00,1,40,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,-0.45137,0
155,AAPTAC,PEAm,NIPAm,HEAm,0.3,0.23,0.47,0.00,1,40,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,-0.75353,0


In [14]:
data['type_A'] = data['type_A'].apply(lambda x:0 if x == 'Boc-AEAm' else 1 if x == 'DMAEA' else 2)
data['type_B1'] = data['type_B1'].apply(lambda x:0 if x == 'PEAm' else 1)
data['type_B2'] = data['type_B2'].apply(lambda x:0 if x == 'None' else 1)
data['type_C'] = data['type_C'].apply(lambda x:0 if x == 'HEAm' else 1 if x == 'None' else 2 if
                                      x =='PEGA' else 3)
data

Unnamed: 0,type_A,type_B1,type_B2,type_C,composition_A,composition_B1,composition_B2,composition_C,Number of blocks,dpn,...,B2,C2,A3,B3,C3,A4,B4,C4,cLogP_predicted,Category
0,0,0,0,0,0.5,0.30,0.00,0.20,1,100,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.67630,1
1,0,0,0,0,0.5,0.30,0.00,0.20,1,40,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.67630,1
2,0,0,0,0,0.5,0.30,0.00,0.20,1,20,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.67630,1
3,0,0,0,1,0.7,0.30,0.00,0.00,1,100,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,1.05270,0
4,0,0,0,1,0.7,0.30,0.00,0.00,1,40,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,1.05270,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152,2,0,1,0,0.3,0.00,0.47,0.23,1,40,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,-1.33796,0
153,2,0,1,0,0.3,0.70,0.00,0.00,1,40,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,-0.16180,0
154,2,0,1,0,0.3,0.47,0.23,0.00,1,40,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,-0.45137,0
155,2,0,1,0,0.3,0.23,0.47,0.00,1,40,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,-0.75353,0


In [15]:
# data_with_dummies = pd.get_dummies(data, drop_first=True)
# data_with_dummies

In [16]:
data.to_csv('modified_data.csv', index=False)

In [17]:
Y_train = data['Category']
X = data.drop(columns = ['Category'])

## Standardise dataset

In [18]:
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()

In [19]:
# X.iloc[:, 0:18] = scaler.fit_transform(X.iloc[:, 0:18]) # Only standardise numerical values
# X_train = pd.DataFrame(X, columns = X.columns, index = X.index.values.tolist())
# X_train

In [71]:
X_train = data[["cLogP_predicted","composition_A", "composition_B1",  "composition_B2", "composition_C", "type_A", "type_B1","type_B2","type_C"]]
# Change X_train_RF here if input space is enlarged in later stages
# Only one type A needed.
X_train

Unnamed: 0,cLogP_predicted,composition_A,composition_B1,composition_B2,composition_C,type_A,type_B1,type_B2,type_C
0,0.67630,0.5,0.30,0.00,0.20,0,0,0,0
1,0.67630,0.5,0.30,0.00,0.20,0,0,0,0
2,0.67630,0.5,0.30,0.00,0.20,0,0,0,0
3,1.05270,0.7,0.30,0.00,0.00,0,0,0,1
4,1.05270,0.7,0.30,0.00,0.00,0,0,0,1
...,...,...,...,...,...,...,...,...,...
152,-1.33796,0.3,0.00,0.47,0.23,2,0,1,0
153,-0.16180,0.3,0.70,0.00,0.00,2,0,1,0
154,-0.45137,0.3,0.47,0.23,0.00,2,0,1,0
155,-0.75353,0.3,0.23,0.47,0.00,2,0,1,0


## Random Forest Classifier
1. Trained on 5 numerical datasets only

In [72]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
rf=RandomForestClassifier()

In [73]:
params = {'n_estimators': [10,100,200,400,500,1000], 'max_depth': [10,20,30,40,50,60,70,80,90,100,None], 'max_features': ['auto', 'sqrt','log2'], 'min_samples_leaf': [1,2,4], 'min_samples_split': [2,5,10], 'bootstrap': [True,False],'class_weight': ['balanced']}

In [74]:
cv = RepeatedStratifiedKFold(n_splits=4, n_repeats=5, random_state=0)
rf_op = RandomizedSearchCV(rf, params, n_iter=20, cv=cv, scoring='f1', random_state=0, verbose=1, n_jobs=-1)

Take compositional parameters
Top 3 numerical features: cLogP, composition_B1,composition_A
Most important categorical feature: type_B2_none

Parameters taken: cLogP, composition_A, composition_B1, composition_B2, composition_C

In [75]:
rf_op.fit(X_train,Y_train.values.ravel())

Fitting 20 folds for each of 20 candidates, totalling 400 fits


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


In [76]:
rf_op.best_params_

{'n_estimators': 500,
 'min_samples_split': 10,
 'min_samples_leaf': 2,
 'max_features': 'log2',
 'max_depth': None,
 'class_weight': 'balanced',
 'bootstrap': False}

In [77]:
rf = RandomForestClassifier(random_state=0, n_estimators=rf_op.best_params_['n_estimators'], min_samples_split=rf_op.best_params_['min_samples_split'], min_samples_leaf=rf_op.best_params_['min_samples_leaf'], max_features=rf_op.best_params_['max_features'], max_depth=rf_op.best_params_['max_depth'], class_weight = rf_op.best_params_['class_weight'],bootstrap=rf_op.best_params_['bootstrap'])

In [78]:
arr = cross_val_score(rf, X_train, Y_train.values.ravel(), cv=cv)
arr

array([0.925     , 0.87179487, 0.82051282, 0.84615385, 0.85      ,
       0.82051282, 0.8974359 , 0.8974359 , 0.9       , 0.87179487,
       0.82051282, 0.79487179, 0.825     , 0.87179487, 0.82051282,
       0.8974359 , 0.925     , 0.82051282, 0.8974359 , 0.84615385])

In [79]:
np.mean(arr)

0.8609935897435899

In [80]:
rf.fit(X_train,Y_train.values.ravel())

In [81]:
Y_predict_RF = rf.predict(X_train)
confusion_matrix(Y_train,Y_predict_RF)

array([[112,  18],
       [  0,  27]])

## SVC

In [31]:
from sklearn.svm import SVC

svc = SVC()

In [32]:
params = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
          'gamma': [1000, 100, 10, 1, 0.1, 0.01, 0.001, 0.0001], 'class_weight': ['balanced']}

In [33]:
cv = RepeatedStratifiedKFold(n_splits=4, n_repeats=5, random_state=0)
svc_op = RandomizedSearchCV(svc, params, n_iter=100, cv=cv, scoring='recall', random_state=0, n_jobs=-1, verbose=1)

In [34]:
svc_op.fit(X_train, Y_train.values.ravel())
svc_op.best_params_

Fitting 20 folds for each of 100 candidates, totalling 2000 fits


{'kernel': 'rbf', 'gamma': 0.01, 'class_weight': 'balanced', 'C': 1}

In [35]:
svc = SVC(kernel='sigmoid', gamma=0.0001, C=0.001, random_state=0, class_weight='balanced')

In [36]:
arr = cross_val_score(svc, X_train, Y_train.values.ravel(), cv=cv)
arr

array([0.175     , 0.17948718, 0.17948718, 0.15384615, 0.175     ,
       0.17948718, 0.17948718, 0.15384615, 0.175     , 0.17948718,
       0.17948718, 0.15384615, 0.175     , 0.17948718, 0.17948718,
       0.15384615, 0.175     , 0.17948718, 0.17948718, 0.15384615])

In [37]:
np.mean(arr)

0.17195512820512818

In [38]:
svc.fit(X_train, Y_train.values.ravel())

In [39]:
Y_svc = svc.predict(X_train)
Y_svc

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1])

In [40]:
confusion_matrix(Y_train, Y_svc)

array([[  0, 130],
       [  0,  27]])

## Gaussian Process Classifier

In [41]:
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF, Matern, DotProduct, RationalQuadratic, WhiteKernel
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

# kernel = Matern()
gpc = GaussianProcessClassifier()

In [42]:
params = {'kernel':[RBF(), DotProduct(), Matern(),  RationalQuadratic(), WhiteKernel()]}

In [43]:
cv = RepeatedStratifiedKFold(n_splits=4, n_repeats=5, random_state=0)
gpc_op = GridSearchCV(gpc, params, scoring='f1', cv=cv, n_jobs=-1)

In [44]:
gpc_op.fit(X_train,Y_train.values.ravel())



In [45]:
print(gpc_op.best_params_)
print(gpc_op.best_score_)

{'kernel': DotProduct(sigma_0=1)}
0.16944444444444445


In [46]:
gpc_op = GaussianProcessClassifier(kernel = DotProduct())
gpc_op.fit(X_train,Y_train.values.ravel())

In [47]:
Y_predict_gpc = gpc_op.predict(X_train)
Y_predict_gpc

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0])

In [48]:
confusion_matrix(Y_train,Y_predict_gpc)

array([[129,   1],
       [ 24,   3]])

## Decision Tree

In [49]:
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier()

In [50]:
params = {'max_depth': [2, 3, 5, 10, 20], 'min_samples_leaf': [5, 10, 20, 50, 100], 'criterion': ["gini", "entropy"],
          'splitter': ["best", "random"], 'max_features': ["auto", "sqrt", "log2"], 'class_weight': ['balanced']}
cv = RepeatedStratifiedKFold(n_splits=4, n_repeats=5, random_state=0)

In [51]:
dtc_op = RandomizedSearchCV(dtc, params, n_iter=20, cv=cv, scoring='f1', verbose=1, random_state=0, n_jobs=-1)
dtc_op.fit(X_train, Y_train.values.ravel())

Fitting 20 folds for each of 20 candidates, totalling 400 fits




In [52]:
dtc_op.best_params_

{'splitter': 'best',
 'min_samples_leaf': 5,
 'max_features': 'auto',
 'max_depth': 10,
 'criterion': 'gini',
 'class_weight': 'balanced'}

In [53]:
dtc = DecisionTreeClassifier(random_state=0, max_depth=dtc_op.best_params_['max_depth'],
                             criterion=dtc_op.best_params_['criterion'],
                             min_samples_leaf=dtc_op.best_params_['min_samples_leaf'],
                             splitter=dtc_op.best_params_['splitter'], max_features=dtc_op.best_params_['max_features'],
                             class_weight=dtc_op.best_params_['class_weight'])

In [54]:
arr = cross_val_score(dtc, X_train, Y_train.values.ravel(), cv=cv)
np.mean(arr)



0.774647435897436

In [55]:
dtc.fit(X_train, Y_train.values.ravel())



In [56]:
Y_predict_dtc = dtc.predict(X_train)
confusion_matrix(Y_train, Y_predict_dtc)

array([[106,  24],
       [  2,  25]])

## ADA Boost

In [57]:
from sklearn.ensemble import AdaBoostClassifier

ada = AdaBoostClassifier()

In [58]:
params = {'n_estimators': [1, 10, 20, 30, 50, 100, 200, 1000, 2000, 3000, 5000], 'algorithm': ['SAMME', 'SAMME.R'],
          'learning_rate': [1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 2.0, 2.5]}
cv = RepeatedStratifiedKFold(n_splits=4, n_repeats=5, random_state=0)

In [59]:
ada_op = RandomizedSearchCV(ada, params, n_iter=20, cv=cv, scoring='f1', verbose=1, random_state=0, n_jobs=-1)

In [60]:
ada_op.fit(X_train, Y_train.values.ravel())

Fitting 20 folds for each of 20 candidates, totalling 400 fits


In [61]:
ada_op.best_params_

{'n_estimators': 100, 'learning_rate': 1.0, 'algorithm': 'SAMME'}

In [62]:
ada = AdaBoostClassifier(random_state=0, n_estimators=ada_op.best_params_['n_estimators'], learning_rate=ada_op.best_params_['learning_rate'], algorithm=ada_op.best_params_['algorithm'])

In [63]:
arr = cross_val_score(ada, X_train, Y_train.values.ravel(), cv=cv)
np.mean(arr)

0.8839423076923077

In [64]:
ada.fit(X_train, Y_train.values.ravel())

In [65]:
Y_predict_ada_prob = ada.predict_proba(X_train)

In [66]:
Y_predict_ada = ada.predict(X_train)
confusion_matrix(Y_train, Y_predict_ada)

array([[118,  12],
       [  2,  25]])

## Bayesian Optimisation

In [82]:
data # non-standardised with dummy variables

Unnamed: 0,type_A,type_B1,type_B2,type_C,composition_A,composition_B1,composition_B2,composition_C,Number of blocks,dpn,...,B2,C2,A3,B3,C3,A4,B4,C4,cLogP_predicted,Category
0,0,0,0,0,0.5,0.30,0.00,0.20,1,100,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.67630,1
1,0,0,0,0,0.5,0.30,0.00,0.20,1,40,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.67630,1
2,0,0,0,0,0.5,0.30,0.00,0.20,1,20,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.67630,1
3,0,0,0,1,0.7,0.30,0.00,0.00,1,100,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,1.05270,0
4,0,0,0,1,0.7,0.30,0.00,0.00,1,40,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,1.05270,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152,2,0,1,0,0.3,0.00,0.47,0.23,1,40,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,-1.33796,0
153,2,0,1,0,0.3,0.70,0.00,0.00,1,40,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,-0.16180,0
154,2,0,1,0,0.3,0.47,0.23,0.00,1,40,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,-0.45137,0
155,2,0,1,0,0.3,0.23,0.47,0.00,1,40,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,-0.75353,0


X_train_BO

In [83]:
data_BO = data[["cLogP_predicted", "composition_A","composition_B1",  "composition_B2","composition_C","type_A", "type_B1", "type_B2", "type_C"]]
data_BO

Unnamed: 0,cLogP_predicted,composition_A,composition_B1,composition_B2,composition_C,type_A,type_B1,type_B2,type_C
0,0.67630,0.5,0.30,0.00,0.20,0,0,0,0
1,0.67630,0.5,0.30,0.00,0.20,0,0,0,0
2,0.67630,0.5,0.30,0.00,0.20,0,0,0,0
3,1.05270,0.7,0.30,0.00,0.00,0,0,0,1
4,1.05270,0.7,0.30,0.00,0.00,0,0,0,1
...,...,...,...,...,...,...,...,...,...
152,-1.33796,0.3,0.00,0.47,0.23,2,0,1,0
153,-0.16180,0.3,0.70,0.00,0.00,2,0,1,0
154,-0.45137,0.3,0.47,0.23,0.00,2,0,1,0
155,-0.75353,0.3,0.23,0.47,0.00,2,0,1,0


In [84]:
X_train_BO = torch.tensor(data_BO.values)
# X_train_BO = X_train_BO.float()
# X_train_BO = data_BO
X_train_BO

tensor([[ 0.6763,  0.5000,  0.3000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.6763,  0.5000,  0.3000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.6763,  0.5000,  0.3000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [-0.4514,  0.3000,  0.4700,  ...,  0.0000,  1.0000,  0.0000],
        [-0.7535,  0.3000,  0.2300,  ...,  0.0000,  1.0000,  0.0000],
        [-1.0431,  0.3000,  0.0000,  ...,  0.0000,  1.0000,  0.0000]],
       dtype=torch.float64)

Y-train needs to be the probability of being good

In [85]:
Y_train_BO = rf.predict_proba(X_train_BO)
# depend on what classifier we are using, currently using decision tree classifier
Y_train_BO = Y_train_BO[:,1]
Y_train_BO = torch.tensor(Y_train_BO)
Y_train_BO = torch.reshape(Y_train_BO,(len(Y_train_BO),1))
Y_train_BO



tensor([[8.4375e-01],
        [8.4375e-01],
        [8.4375e-01],
        [5.0844e-01],
        [5.0844e-01],
        [5.0844e-01],
        [8.4375e-01],
        [8.4375e-01],
        [8.4375e-01],
        [8.4375e-01],
        [8.4375e-01],
        [5.0844e-01],
        [5.0844e-01],
        [8.4375e-01],
        [8.4375e-01],
        [8.4375e-01],
        [8.4375e-01],
        [8.4375e-01],
        [8.4375e-01],
        [8.4375e-01],
        [8.4375e-01],
        [8.4375e-01],
        [8.4375e-01],
        [8.4375e-01],
        [8.4375e-01],
        [8.4375e-01],
        [8.4375e-01],
        [8.4375e-01],
        [8.4375e-01],
        [7.7415e-01],
        [7.7415e-01],
        [8.4375e-01],
        [9.7934e-01],
        [9.9544e-01],
        [9.4466e-01],
        [9.9834e-01],
        [9.9834e-01],
        [1.1177e-02],
        [3.0179e-03],
        [1.7017e-02],
        [8.5287e-03],
        [8.5287e-03],
        [8.5287e-03],
        [1.7028e-01],
        [1.2371e-03],
        [5

In [86]:
best_y = max(Y_train_BO)
best_y

tensor([0.9983], dtype=torch.float64)

In [87]:
np.count_nonzero(Y_train_BO > 0.5)

45

# Fit BO model
Physical constraints must be applied
### SingleTaskGP:
Only considers continuous inputs
### MixedSingleTaskGP:
Include discrete inputs and combines using a categorical kernel

In [88]:
# model=SingleTaskGP(X_train_BO, Y_train_BO)
# mll=ExactMarginalLogLikelihood(model.likelihood, model)
# fit_gpytorch_mll(mll)
# model = fully_bayesian.SaasFullyBayesianSingleTaskGP(X_train_BO, Y_train_BO)
# fit_fully_bayesian_model_nuts(model)

model=MixedSingleTaskGP(X_train_BO,Y_train_BO,cat_dims=[5,6,7,8])
mll=ExactMarginalLogLikelihood(model.likelihood, model)
fit_gpytorch_mll(mll)

ExactMarginalLogLikelihood(
  (likelihood): GaussianLikelihood(
    (noise_covar): HomoskedasticNoise(
      (noise_prior): GammaPrior()
      (raw_noise_constraint): GreaterThan(1.000E-06)
    )
  )
  (model): MixedSingleTaskGP(
    (likelihood): GaussianLikelihood(
      (noise_covar): HomoskedasticNoise(
        (noise_prior): GammaPrior()
        (raw_noise_constraint): GreaterThan(1.000E-06)
      )
    )
    (mean_module): ConstantMean()
    (covar_module): AdditiveKernel(
      (kernels): ModuleList(
        (0): ScaleKernel(
          (base_kernel): AdditiveKernel(
            (kernels): ModuleList(
              (0): MaternKernel(
                (raw_lengthscale_constraint): GreaterThan(1.000E-04)
                (distance_module): Distance()
              )
              (1): ScaleKernel(
                (base_kernel): CategoricalKernel(
                  (raw_lengthscale_constraint): GreaterThan(1.000E-06)
                )
                (raw_outputscale_constraint): Posi

# Acquisition Function
## EI

In [89]:
from botorch.sampling.normal import SobolQMCNormalSampler
# Sobol sequences use a base of two to form successively finer uniform partitions of the unit interval and then reorder the coordinates in each dimension. (Wiki)

EI = qExpectedImprovement(
    model = model,
    best_f = best_y,
    sampler = SobolQMCNormalSampler(1024)
)

## UCB

In [None]:
UCB = qUpperConfidenceBound(
    model = model,
    beta = 0.6,
)

In [90]:
# Constraints and bounds
bounds = torch.tensor([[-5., 0., 0., 0., 0., 0., 0., 0.,0.],[5., 1., 1., 1., 1., 1., 1., 1., 1.]])

optimize_acqf_mixed: Optimize over a list of fixed_features and returns the best solution.
Gradient is not implied to categorical features.

In [180]:
import itertools
from collections import ChainMap
from itertools import combinations
fixed_feature_list = [[{5:0},{5:1}],
                      [{6:0},{6:1}],
                      [{7:0},{7:1}],
                      [{8:0},{8:1},{8:2},{8:3}]]
combination = [p for p in itertools.product(*fixed_feature_list)]
combination

[({5: 0}, {6: 0}, {7: 0}, {8: 0}),
 ({5: 0}, {6: 0}, {7: 0}, {8: 1}),
 ({5: 0}, {6: 0}, {7: 0}, {8: 2}),
 ({5: 0}, {6: 0}, {7: 0}, {8: 3}),
 ({5: 0}, {6: 0}, {7: 1}, {8: 0}),
 ({5: 0}, {6: 0}, {7: 1}, {8: 1}),
 ({5: 0}, {6: 0}, {7: 1}, {8: 2}),
 ({5: 0}, {6: 0}, {7: 1}, {8: 3}),
 ({5: 0}, {6: 1}, {7: 0}, {8: 0}),
 ({5: 0}, {6: 1}, {7: 0}, {8: 1}),
 ({5: 0}, {6: 1}, {7: 0}, {8: 2}),
 ({5: 0}, {6: 1}, {7: 0}, {8: 3}),
 ({5: 0}, {6: 1}, {7: 1}, {8: 0}),
 ({5: 0}, {6: 1}, {7: 1}, {8: 1}),
 ({5: 0}, {6: 1}, {7: 1}, {8: 2}),
 ({5: 0}, {6: 1}, {7: 1}, {8: 3}),
 ({5: 1}, {6: 0}, {7: 0}, {8: 0}),
 ({5: 1}, {6: 0}, {7: 0}, {8: 1}),
 ({5: 1}, {6: 0}, {7: 0}, {8: 2}),
 ({5: 1}, {6: 0}, {7: 0}, {8: 3}),
 ({5: 1}, {6: 0}, {7: 1}, {8: 0}),
 ({5: 1}, {6: 0}, {7: 1}, {8: 1}),
 ({5: 1}, {6: 0}, {7: 1}, {8: 2}),
 ({5: 1}, {6: 0}, {7: 1}, {8: 3}),
 ({5: 1}, {6: 1}, {7: 0}, {8: 0}),
 ({5: 1}, {6: 1}, {7: 0}, {8: 1}),
 ({5: 1}, {6: 1}, {7: 0}, {8: 2}),
 ({5: 1}, {6: 1}, {7: 0}, {8: 3}),
 ({5: 1}, {6: 1}, {7

In [181]:
dict(ChainMap(*combination[4]))

{8: 0, 7: 1, 6: 0, 5: 0}

In [182]:
# combination_list = []
for x in range(0, len(combination)):
    combination[x] = dict(ChainMap(*combination[x]))

combination

[{8: 0, 7: 0, 6: 0, 5: 0},
 {8: 1, 7: 0, 6: 0, 5: 0},
 {8: 2, 7: 0, 6: 0, 5: 0},
 {8: 3, 7: 0, 6: 0, 5: 0},
 {8: 0, 7: 1, 6: 0, 5: 0},
 {8: 1, 7: 1, 6: 0, 5: 0},
 {8: 2, 7: 1, 6: 0, 5: 0},
 {8: 3, 7: 1, 6: 0, 5: 0},
 {8: 0, 7: 0, 6: 1, 5: 0},
 {8: 1, 7: 0, 6: 1, 5: 0},
 {8: 2, 7: 0, 6: 1, 5: 0},
 {8: 3, 7: 0, 6: 1, 5: 0},
 {8: 0, 7: 1, 6: 1, 5: 0},
 {8: 1, 7: 1, 6: 1, 5: 0},
 {8: 2, 7: 1, 6: 1, 5: 0},
 {8: 3, 7: 1, 6: 1, 5: 0},
 {8: 0, 7: 0, 6: 0, 5: 1},
 {8: 1, 7: 0, 6: 0, 5: 1},
 {8: 2, 7: 0, 6: 0, 5: 1},
 {8: 3, 7: 0, 6: 0, 5: 1},
 {8: 0, 7: 1, 6: 0, 5: 1},
 {8: 1, 7: 1, 6: 0, 5: 1},
 {8: 2, 7: 1, 6: 0, 5: 1},
 {8: 3, 7: 1, 6: 0, 5: 1},
 {8: 0, 7: 0, 6: 1, 5: 1},
 {8: 1, 7: 0, 6: 1, 5: 1},
 {8: 2, 7: 0, 6: 1, 5: 1},
 {8: 3, 7: 0, 6: 1, 5: 1},
 {8: 0, 7: 1, 6: 1, 5: 1},
 {8: 1, 7: 1, 6: 1, 5: 1},
 {8: 2, 7: 1, 6: 1, 5: 1},
 {8: 3, 7: 1, 6: 1, 5: 1}]

In [184]:
X_candidates,_ = optimize_acqf_mixed(
    acq_function = EI,
    bounds = bounds,
    q = 5, # Number of suggested candidates
    num_restarts = 20,
    raw_samples = 64,
    equality_constraints = [(torch.tensor([1, 2, 3, 4]), torch.tensor([1.]*4), 1.0)],
    fixed_features_list = combination
)
X_candidates

tensor([[0.8867, 0.4284, 0.1908, 0.3584, 0.0225, 0.0000, 1.0000, 0.0000, 3.0000],
        [1.3693, 0.3742, 0.1774, 0.4067, 0.0418, 0.0000, 1.0000, 0.0000, 1.0000],
        [1.1138, 0.3269, 0.1695, 0.5035, 0.0000, 0.0000, 0.0000, 1.0000, 3.0000],
        [1.6063, 0.3949, 0.1840, 0.3936, 0.0275, 0.0000, 0.0000, 0.0000, 1.0000],
        [1.0397, 0.3335, 0.1704, 0.4961, 0.0000, 0.0000, 1.0000, 1.0000, 0.0000]])

## Substitute generated candidates to random forest classifier

The outcome of Y-rf should be the probability, and keep maximising
rf.probability

In [None]:
Y_dtc = dtc.predict_proba(X_candidates)
Y_dtc

In [None]:
np.count_nonzero(Y_dtc[:,1] > 0.5)

In [None]:
max(Y_dtc[:,1]) # Max. probability achieved

In [None]:
Y_dtc = torch.tensor(Y_dtc[:, 1])
Y_dtc = torch.reshape(Y_dtc,(len(Y_dtc),1))
Y_dtc

## Second Round

In [None]:
X_train_2 = (X_train_BO,X_candidates)
X_train_2 = torch.cat(X_train_2, dim = 0)
X_train_2 = X_train_2.float()
X_train_2

In [None]:
Y_train_2 = (Y_train_BO,Y_dtc)
Y_train_2 = torch.cat(Y_train_2, dim = 0)
Y_train_2 = Y_train_2.float()
Y_train_2
# len(Y_train_2)

In [None]:
model=MixedSingleTaskGP(X_train_2, Y_train_2,cat_dims=[-1])
mll=ExactMarginalLogLikelihood(model.likelihood, model)
fit_gpytorch_mll(mll)

# model = fully_bayesian.SaasFullyBayesianSingleTaskGP(X_train_2, Y_train_2)
# fit_fully_bayesian_model_nuts(model)

In [None]:
EI = qExpectedImprovement(
    model = model,
    best_f = best_y,
    sampler = SobolQMCNormalSampler(1024)
)

UCB = qUpperConfidenceBound(
    model = model,
    beta = 0.6,
)

In [None]:
X_candidates,_ = optimize_acqf_mixed(
    acq_function = EI,
    bounds = bounds,
    q = 10, # Number of suggested candidates
    num_restarts = 20,
    raw_samples = 64,
    equality_constraints = [(torch.tensor([1, 2, 3, 4]), torch.tensor([1.]*4), 1.0)],
    fixed_features_list = [{5:0},{5:1}]
)
X_candidates

In [None]:
Y_dtc_2 = dtc_op.predict_proba(X_candidates)
Y_dtc_2

In [None]:
np.count_nonzero(Y_dtc_2[:,1] > 0.5)

1. Change Y-train (y_rf) done
2. Apply constraints (parameter constraints)
3. think about the dummy variables, whether that's doable in BO

In [None]:
max(Y_dtc_2[:,1])