## Bayesian Optimisation for Antimicrobial Polymer Discovery

In [26]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from matplotlib import rcParams
import torch
from botorch.models import SingleTaskGP, ModelListGP
from botorch.models.gp_regression_mixed import MixedSingleTaskGP
from botorch.fit import fit_gpytorch_model
from botorch.utils import standardize
from gpytorch.mlls import ExactMarginalLogLikelihood
from botorch.acquisition.monte_carlo import qExpectedImprovement
from botorch.acquisition.analytic import UpperConfidenceBound, ProbabilityOfImprovement, ExpectedImprovement
from botorch.optim import optimize_acqf, optimize_acqf_mixed
from botorch.cross_validation import gen_loo_cv_folds
import math
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_curve, auc

## Retrieve Training dataset and assign variables
To enable mixed search space, utilise MixedSingleTaskGP which uses a special kernel to combine continuous and categorical data.

In [27]:
data = pd.read_excel('dataset_final.xlsx', sheet_name = 'Dataset_Complete_modified')
data

Unnamed: 0,Polymer Index,type_A,type_B1,type_B2,type_C,composition_A,composition_B1,composition_B2,composition_C,block_sequence_theoretical,...,A2,B2,C2,A3,B3,C3,A4,B4,C4,cLogP_predicted
0,1,Boc-AEAm,PEAm,,HEAm,0.5,0.30,0.00,0.20,ABC,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.67630
1,2,Boc-AEAm,PEAm,,HEAm,0.5,0.30,0.00,0.20,ABC,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.67630
2,3,Boc-AEAm,PEAm,,HEAm,0.5,0.30,0.00,0.20,ABC,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.67630
3,4,Boc-AEAm,PEAm,,,0.7,0.30,0.00,0.00,AB,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,1.05270
4,5,Boc-AEAm,PEAm,,,0.7,0.30,0.00,0.00,AB,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,1.05270
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152,157,AAPTAC,PEAm,NIPAm,HEAm,0.3,0.00,0.47,0.23,ABC,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,-1.33796
153,158,AAPTAC,PEAm,NIPAm,HEAm,0.3,0.70,0.00,0.00,ABC,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,-0.16180
154,159,AAPTAC,PEAm,NIPAm,HEAm,0.3,0.47,0.23,0.00,ABC,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,-0.45137
155,160,AAPTAC,PEAm,NIPAm,HEAm,0.3,0.23,0.47,0.00,ABC,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,-0.75353


In [28]:
data.corr()

  data.corr()


Unnamed: 0,Polymer Index,composition_A,composition_B1,composition_B2,composition_C,Number of blocks,dpn,Target,NMR,GPC,...,A2,B2,C2,A3,B3,C3,A4,B4,C4,cLogP_predicted
Polymer Index,1.0,-0.237395,-0.209524,0.402446,0.045304,-0.515925,-0.249226,-0.410508,-0.556035,-0.257679,...,-0.243827,-0.438501,-0.387557,-0.226511,-0.190109,-0.172808,,-0.151171,-0.151171,-0.679035
composition_A,-0.237395,1.0,-0.282589,-0.310371,-0.41245,-0.010745,-0.004903,0.113958,0.207758,-0.182915,...,0.045402,-0.031455,-0.029569,-0.01607,-0.015225,-0.013064,,-0.01129,-0.01129,-0.086684
composition_B1,-0.209524,-0.282589,1.0,-0.399126,-0.309069,0.271207,0.120026,0.257981,0.2735,0.010997,...,0.129752,0.221584,0.180416,0.11888,0.112632,0.096645,,0.083517,0.083517,0.36422
composition_B2,0.402446,-0.310371,-0.399126,1.0,-0.286314,-0.275958,-0.11763,-0.274649,-0.471571,-0.175124,...,-0.127162,-0.228049,-0.214375,-0.116507,-0.110385,-0.094716,,-0.08185,-0.08185,-0.102905
composition_C,0.045304,-0.41245,-0.309069,-0.286314,1.0,0.016395,0.002952,-0.096356,-0.01039,0.344606,...,-0.047429,0.038591,0.063775,0.014118,0.013376,0.011478,,0.009919,0.009919,-0.170816
Number of blocks,-0.515925,-0.010745,0.271207,-0.275958,0.016395,1.0,0.280251,0.386098,0.408109,0.324969,...,0.45214,0.623867,0.595649,0.632507,0.467679,0.401295,,0.541927,0.541927,0.222334
dpn,-0.249226,-0.004903,0.120026,-0.11763,0.002952,0.280251,1.0,0.884207,0.766664,0.784606,...,0.114781,0.265455,0.22288,0.203834,0.084057,0.118917,,0.062328,0.062328,0.096098
Target,-0.410508,0.113958,0.257981,-0.274649,-0.096356,0.386098,0.884207,1.0,0.889259,0.813675,...,0.128931,0.29796,0.25631,0.312228,0.089902,0.116247,,0.236861,0.236861,0.118706
NMR,-0.556035,0.207758,0.2735,-0.471571,-0.01039,0.408109,0.766664,0.889259,1.0,0.679494,...,0.141521,0.34314,0.223529,0.312518,0.110646,0.178843,,0.162049,0.162049,0.229514
GPC,-0.257679,-0.182915,0.010997,-0.175124,0.344606,0.324969,0.784606,0.813675,0.679494,1.0,...,0.075559,0.213651,0.229229,0.319477,0.074642,0.083392,,0.294095,0.294095,-0.272896


In [29]:
data = data.drop(columns=['composition_C','Polymer Index','Dispersity','clogP','block_sequence_theoretical', 'block_sequence_experimental','MIC_PAO1', 'MIC_PA','MIC_EC', 'MIC_AB', 'MIC_SA', 'MIC_MSmeg','GPC','Target','NMR'])
data = data.replace({'>128':128,'>256':256, '32-64':64, '64-128':128,'128-256':256})

## Assign Classes

In [30]:
data['Category'] = data['MIC_PAO1_PA'].apply(lambda x: 1 if x <= 64 else 0)
data = data.drop(columns = ['MIC_PAO1_PA'])

In [31]:
data_with_dummies = pd.get_dummies(data, drop_first=True)
data_with_dummies

Unnamed: 0,composition_A,composition_B1,composition_B2,Number of blocks,dpn,A1,B1,C1,A2,B2,...,C4,cLogP_predicted,Category,type_A_Boc-AEAm,type_A_DMAEA,type_B1_PEAm,type_B2_None,type_C_HEAm,type_C_None,type_C_PEGA
0,0.5,0.30,0.00,1,100,0.3330,0.3330,0.334,0.0,0.0,...,0.0,0.67630,1,1,0,1,1,1,0,0
1,0.5,0.30,0.00,1,40,0.3325,0.3325,0.335,0.0,0.0,...,0.0,0.67630,1,1,0,1,1,1,0,0
2,0.5,0.30,0.00,1,20,0.3300,0.3350,0.335,0.0,0.0,...,0.0,0.67630,1,1,0,1,1,1,0,0
3,0.7,0.30,0.00,1,100,0.5000,0.5000,0.000,0.0,0.0,...,0.0,1.05270,0,1,0,1,1,0,1,0
4,0.7,0.30,0.00,1,40,0.5000,0.5000,0.000,0.0,0.0,...,0.0,1.05270,1,1,0,1,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152,0.3,0.00,0.47,1,40,0.3325,0.3325,0.335,0.0,0.0,...,0.0,-1.33796,0,0,0,1,0,1,0,0
153,0.3,0.70,0.00,1,40,0.3325,0.3325,0.335,0.0,0.0,...,0.0,-0.16180,0,0,0,1,0,1,0,0
154,0.3,0.47,0.23,1,40,0.3325,0.3325,0.335,0.0,0.0,...,0.0,-0.45137,0,0,0,1,0,1,0,0
155,0.3,0.23,0.47,1,40,0.3325,0.3325,0.335,0.0,0.0,...,0.0,-0.75353,0,0,0,1,0,1,0,0


In [32]:
data_with_dummies.to_csv('modified_data.csv', index=False)

In [34]:
Y_train = data_with_dummies['Category']
Y_train = Y_train.reset_index()
Y_train = Y_train.drop(columns = ['index'])
X_train = data_with_dummies.drop(columns = ['Category'])

## Standardise dataset

In [35]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X_train = scaler.fit_transform(X)
X_train = pd.DataFrame(X_scaled, columns = X.columns, index = X.index.values.tolist())

X_train

Unnamed: 0,composition_A,composition_B1,composition_B2,Number of blocks,dpn,A1,B1,C1,A2,B2,...,B4,C4,cLogP_predicted,type_A_Boc-AEAm,type_A_DMAEA,type_B1_PEAm,type_B2_None,type_C_HEAm,type_C_None,type_C_PEGA
0,-0.099386,0.735233,-0.720559,-0.382977,4.108432,-0.178481,0.358184,0.428247,-0.176477,-0.316488,...,-0.113592,-0.113592,0.572166,1.019294,-0.584705,0.139573,1.800901,0.246598,-0.181369,-0.113592
1,-0.099386,0.735233,-0.720559,-0.382977,-0.163249,-0.184463,0.353758,0.437195,-0.176477,-0.316488,...,-0.113592,-0.113592,0.572166,1.019294,-0.584705,0.139573,1.800901,0.246598,-0.181369,-0.113592
2,-0.099386,0.735233,-0.720559,-0.382977,-1.587142,-0.214375,0.375887,0.437195,-0.176477,-0.316488,...,-0.113592,-0.113592,0.572166,1.019294,-0.584705,0.139573,1.800901,0.246598,-0.181369,-0.113592
3,1.148908,0.735233,-0.720559,-0.382977,4.108432,1.819636,1.836417,-2.560361,-0.176477,-0.316488,...,-0.113592,-0.113592,0.869052,1.019294,-0.584705,0.139573,1.800901,-4.055175,5.513620,-0.113592
4,1.148908,0.735233,-0.720559,-0.382977,-0.163249,1.819636,1.836417,-2.560361,-0.176477,-0.316488,...,-0.113592,-0.113592,0.869052,1.019294,-0.584705,0.139573,1.800901,-4.055175,5.513620,-0.113592
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152,-1.347681,-1.160195,2.238264,-0.382977,-0.163249,-0.184463,0.353758,0.437195,-0.176477,-0.316488,...,-0.113592,-0.113592,-1.016587,-0.981071,-0.584705,0.139573,-0.555278,0.246598,-0.181369,-0.113592
153,-1.347681,3.262471,-0.720559,-0.382977,-0.163249,-0.184463,0.353758,0.437195,-0.176477,-0.316488,...,-0.113592,-0.113592,-0.088888,-0.981071,-0.584705,0.139573,-0.555278,0.246598,-0.181369,-0.113592
154,-1.347681,1.809309,0.727376,-0.382977,-0.163249,-0.184463,0.353758,0.437195,-0.176477,-0.316488,...,-0.113592,-0.113592,-0.317287,-0.981071,-0.584705,0.139573,-0.555278,0.246598,-0.181369,-0.113592
155,-1.347681,0.292966,2.238264,-0.382977,-0.163249,-0.184463,0.353758,0.437195,-0.176477,-0.316488,...,-0.113592,-0.113592,-0.555616,-0.981071,-0.584705,0.139573,-0.555278,0.246598,-0.181369,-0.113592


## Random Forest Classifier

In [36]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
rf=RandomForestClassifier()

In [37]:
params = {'n_estimators': [10,100,200,400,500,1000], 'max_depth': [10,20,30,40,50,60,70,80,90,100,None], 'max_features': ['auto', 'sqrt','log2'], 'min_samples_leaf': [1,2,4], 'min_samples_split': [2,5,10], 'bootstrap': [True,False],'class_weight': ['balanced']}

In [None]:
cv = RepeatedStratifiedKFold(n_splits=4, n_repeats=5, random_state=0)
rf_op = RandomizedSearchCV(rf, params, n_iter=20, cv=cv, scoring='f1', random_state=0, verbose=1, n_jobs=-1)

Top 3 numerical features: cLogP, composition_B1,composition_A

In [3]:
data = pd.read_csv('modified_data.csv',usecols=['cLogP_predicted', 'composition_B1','composition_A'])
data

Unnamed: 0,composition_A,composition_B1,cLogP_predicted
0,0.5,0.30,0.67630
1,0.5,0.30,0.67630
2,0.5,0.30,0.67630
3,0.7,0.30,1.05270
4,0.7,0.30,1.05270
...,...,...,...
152,0.3,0.00,-1.33796
153,0.3,0.70,-0.16180
154,0.3,0.47,-0.45137
155,0.3,0.23,-0.75353


In [30]:
train_y_raw = torch.tensor(data.MIC_PAO1, dtype = float)
train_y = torch.reshape(train_y_raw,(len(train_y_raw),1))
train_x = torch.tensor(data.iloc[:,0:3].values)
best_y = min(train_y)
train_x, train_y, best_y

AttributeError: 'DataFrame' object has no attribute 'MIC_PAO1'

# Generate the next point
### SingleTaskGP:
Only considers continuous inputs
### MixedSingleTaskGP:
Include discrete inputs and combines using a categorical kernel

In [6]:
#Surrogate = MixedSingleTaskGP(train_X = train_x, train_Y = train_y, cat_dims=[3,4,5,6,7,8])
Surrogate = SingleTaskGP(train_X = train_x, train_Y = train_y)
mll = ExactMarginalLogLikelihood(Surrogate.likelihood, Surrogate)
EI = qExpectedImprovement(model = Surrogate, best_f = best_y)
UCB = UpperConfidenceBound(model = Surrogate, beta = 0.2)
# new_point_analytic, _ = optimize_acqf_mixed(
#     acq_function=EI,
#     bounds=([[0.0] * 9, [1.0] * 9]),
#     q = 1,
#     fixed_features_list = [{3,1}],
#     num_restarts=20,
#     raw_samples=100,
#     options={},
# )

new_point_analytic, _ = optimize_acqf(
    acq_function=EI,
    bounds=torch.tensor([[0.0] * 3, [1.0] * 3]),
    q=1,
    num_restarts=20,
    raw_samples=100,
    options={},
)
new_point_analytic

tensor([[0.5851, 0.1811, 0.0589]])