## Bayesian Optimisation for Antimicrobial Polymer Discovery

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from matplotlib import rcParams
import torch
from botorch.models import SingleTaskGP, ModelListGP, fully_bayesian
from botorch.models.gp_regression_mixed import MixedSingleTaskGP
from botorch.fit import fit_gpytorch_model, fit_fully_bayesian_model_nuts
from botorch.utils import standardize
from gpytorch.mlls import ExactMarginalLogLikelihood
from botorch.acquisition.monte_carlo import qExpectedImprovement, qUpperConfidenceBound
from botorch.acquisition.analytic import UpperConfidenceBound, ProbabilityOfImprovement, ExpectedImprovement
from botorch.optim import optimize_acqf, optimize_acqf_mixed
from botorch.cross_validation import gen_loo_cv_folds
import math
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_curve, auc

## Retrieve Training dataset and assign variables
To enable a mixed search space, use `MixedSingleTaskGP`, which applies a dedicated kernel to combine continuous and categorical inputs.

In [None]:
# Read the polymer dataset from the named sheet of the Excel workbook.
workbook_path = 'dataset_final.xlsx'
sheet = 'Dataset_Complete_modified'
data = pd.read_excel(workbook_path, sheet_name=sheet)
data

In [None]:
# Pairwise correlation of the numeric columns only.
# At this point the frame still appears to hold text-valued columns (e.g. the
# '>128'-style MIC entries that are replaced further down). pandas >= 2.0 no
# longer drops such columns silently inside DataFrame.corr() and raises instead,
# so they are filtered out explicitly. This form works on old and new pandas.
data.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Columns removed before modelling.
dropped_columns = [
    'composition_C', 'Polymer Index', 'Dispersity', 'clogP',
    'block_sequence_theoretical', 'block_sequence_experimental',
    'MIC_PAO1', 'MIC_PA', 'MIC_EC', 'MIC_AB', 'MIC_SA', 'MIC_MSmeg',
    'GPC', 'Target', 'NMR',
]
# Text readings mapped to numbers: '>N' becomes N, 'A-B' becomes the upper value B.
text_to_number = {'>128': 128, '>256': 256, '32-64': 64, '64-128': 128, '128-256': 256}
data = data.drop(columns=dropped_columns).replace(text_to_number)

## Assign Classes

In [None]:
# Binary class label: 1 when MIC_PAO1_PA is at most 64, otherwise 0.
# The raw MIC column is removed once the label has been derived.
is_active = data['MIC_PAO1_PA'].le(64)
data['Category'] = is_active.astype('int64')
data = data.drop(columns=['MIC_PAO1_PA'])

In [None]:
# One-hot encode the remaining non-numeric columns. drop_first=True drops the
# first level of each encoded column so the indicator columns are not redundant.
data_with_dummies = pd.get_dummies(data, drop_first=True)
data_with_dummies

In [None]:
# Persist the encoded table (without the row index) to the working directory.
data_with_dummies.to_csv('modified_data.csv', index=False)

In [None]:
# Separate the class label from the feature columns.
target_column = 'Category'
X = data_with_dummies.drop(columns=[target_column])
Y_train = data_with_dummies[target_column]

## Standardise dataset

In [None]:
# Scaler used below to standardise the numerical feature columns.
# NOTE(review): this import sits outside the notebook's top import cell.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [None]:
# Only standardise numerical values: the first 17 columns are scaled, the
# dummy-encoded columns that follow are left untouched.
# The scaled values go into a copy through whole-column assignment instead of
# `X.iloc[:, 0:17] = ...`. Slice assignment tries to keep each column's existing
# dtype, and recent pandas deprecates the implicit upcast this needs when a
# column is integer-typed.
# NOTE(review): assumes the first 17 column names are unique - TODO confirm.
numeric_columns = list(X.columns[0:17])
X_scaled = X.copy()
X_scaled[numeric_columns] = scaler.fit_transform(X.iloc[:, 0:17])
X = X_scaled  # X itself ends up standardised, exactly as before
X_train = pd.DataFrame(X, columns = X.columns, index = X.index.values.tolist())
X_train

## Random Forest Classifier

In [None]:
# Default-configured forest used as the base estimator for the hyper-parameter
# search below; `rf` is re-assigned later with the best parameters found.
# NOTE(review): these imports sit outside the notebook's top import cell.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
rf=RandomForestClassifier()

In [None]:
# Hyper-parameter search space sampled by RandomizedSearchCV.
# 'auto' is intentionally not listed under max_features. For
# RandomForestClassifier it was an alias of 'sqrt', it was deprecated in
# scikit-learn 1.1 and removed in 1.3, where every fit using it fails.
params = {
    'n_estimators': [10, 100, 200, 400, 500, 1000],
    'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    'max_features': ['sqrt', 'log2'],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'bootstrap': [True, False],
    'class_weight': ['balanced'],
}

In [None]:
# 4-fold stratified CV repeated 5 times (20 splits per candidate), with a fixed seed.
cv = RepeatedStratifiedKFold(n_splits=4, n_repeats=5, random_state=0)
# Randomised search: 20 sampled parameter sets, ranked by F1, run on all cores.
rf_op = RandomizedSearchCV(
    estimator=rf,
    param_distributions=params,
    n_iter=20,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
    verbose=1,
    random_state=0,
)

Only three features are used for initial testing.
Top 3 numerical features: `cLogP_predicted`, `composition_B1`, `composition_A`.
Most important categorical feature: `type_B2_none` (not included in this first pass).

In [None]:
# Change rf_feature_names here if the input space is enlarged in later stages
rf_feature_names = ["cLogP_predicted", "composition_B1", "composition_A"]
X_train_RF = X_train[rf_feature_names]

In [None]:
# Run the randomised hyper-parameter search on the three selected features;
# the labels are passed as a flat 1-D array.
rf_op.fit(X_train_RF,Y_train.values.ravel())

In [None]:
# Parameter combination with the best mean cross-validated F1 score.
rf_op.best_params_

In [None]:
# Rebuild the forest from the winning search configuration with a fixed seed.
# best_params_ holds exactly the keys of `params` (n_estimators, max_depth,
# max_features, min_samples_leaf, min_samples_split, bootstrap, class_weight),
# so unpacking it passes the same arguments as spelling each one out.
rf = RandomForestClassifier(random_state=0, **rf_op.best_params_)

In [None]:
# Cross-validated scores of the tuned forest on the same repeated stratified splits.
# NOTE(review): no `scoring` is passed, so the classifier's default score
# (accuracy) is reported here, whereas the search above ranked by 'f1'.
rf_labels = Y_train.values.ravel()
arr = cross_val_score(rf, X_train_RF, rf_labels, cv=cv)
arr

In [None]:
# Average score across all CV splits.
arr.mean()

In [None]:
# Final fit of the tuned forest on the full training set. This model is later
# used to label the candidates proposed by the Bayesian optimisation step.
rf.fit(X_train_RF,Y_train.values.ravel())

## Bayesian Optimisation

In [None]:
X_train # Standardised, with dummy variables

Top 3 numerical features: `cLogP_predicted`, `composition_B1`, `composition_A`.

In [None]:
# Features exposed to the Bayesian optimisation surrogate (same three as the forest).
bo_feature_names = ["cLogP_predicted", "composition_B1", "composition_A"]
data_BO = X_train.loc[:, bo_feature_names]
data_BO

In [None]:
# Class labels as an (n, 1) float32 tensor for the GP surrogate.
# The Series is converted through NumPy first, so the result does not depend on
# how torch walks a pandas object (positional lookups on a Series can fail when
# its index is not the default 0..n-1 range).
category_values = data_with_dummies["Category"].to_numpy()
Y_train_BO = torch.tensor(category_values).reshape(len(category_values), 1).float()
Y_train_BO

In [None]:
# Feature matrix for the surrogate, cast to float32 to match Y_train_BO.
X_train_BO = torch.tensor(data_BO.values).float()
# Incumbent value handed to the acquisition functions as best_f; fixed at 1.0.
best_y = 1.
X_train_BO

# Fit BO model
Physical constraints must be applied
### SingleTaskGP:
Only considers continuous inputs
### MixedSingleTaskGP:
Includes discrete inputs and combines them with the continuous inputs using a categorical kernel

In [None]:
# Alternative kept for reference: an exact GP fitted through the marginal log-likelihood.
# model=SingleTaskGP(X_train_BO, Y_train_BO)
# mll=ExactMarginalLogLikelihood(model.likelihood, model)
# fit_gpytorch_model(mll)
# Active surrogate: fully Bayesian SAAS GP on the three selected features,
# fitted with NUTS using the library's default sampler settings.
# NOTE(review): X_train_BO / Y_train_BO are float32 here; BoTorch generally
# recommends double precision for GP fitting - TODO confirm whether .double()
# should be used instead of .float().
model = fully_bayesian.SaasFullyBayesianSingleTaskGP(X_train_BO, Y_train_BO)
fit_fully_bayesian_model_nuts(model)

# Acquisition Function
## EI

In [None]:
# Monte-Carlo batch Expected Improvement measured against the incumbent `best_y`.
EI = qExpectedImprovement(model=model, best_f=best_y)

## UCB

In [None]:
# Monte-Carlo batch Upper Confidence Bound; beta weights the exploration term.
UCB = qUpperConfidenceBound(model=model, beta=0.6)

In [None]:
# Search-space (input) bounds for optimize_acqf: row 0 holds the lower limits,
# row 1 the upper limits, one column per feature in the order of data_BO
# (cLogP_predicted, composition_B1, composition_A).
# NOTE(review): the surrogate is trained on columns taken from the standardised
# X_train, while [0, 1] looks like a raw composition-fraction range - TODO
# confirm these bounds are expressed in the same (scaled) units as X_train_BO.
bounds = torch.tensor([[-5., 0., 0.],[5., 1., 1.]])

In [None]:
# Batch size (number of suggested candidates), number of optimiser restarts,
# and number of raw samples used to initialise the optimisation.
acqf_settings = {'q': 20, 'num_restarts': 200, 'raw_samples': 512}
# (fixed_features_list = [{3: 1}] was sketched here earlier but is not passed.)
X_candidates, _ = optimize_acqf(acq_function=EI, bounds=bounds, **acqf_settings)
X_candidates

## Substitute generated candidates to random forest classifier

In [None]:
# Label the proposed candidates with the tuned random forest.
# The forest was fitted on a DataFrame with named columns, so the candidate
# tensor is wrapped in a DataFrame carrying the same column names. This avoids
# scikit-learn's feature-name mismatch warning and the implicit tensor-to-array
# conversion; the predictions themselves are unchanged.
candidate_frame = pd.DataFrame(
    X_candidates.detach().cpu().numpy(),
    columns=X_train_RF.columns,
)
Y_rf = rf.predict(candidate_frame)
Y_rf

In [None]:
# Number of candidates the forest assigns to class 1.
int((Y_rf == 1).sum())

In [None]:
# Convert the forest's predictions into an (n, 1) float32 tensor so they can be
# concatenated with Y_train_BO.
n_candidates = len(Y_rf)
Y_rf = torch.tensor(Y_rf).reshape(n_candidates, 1).float()
Y_rf

## Second Round

In [None]:
# Second-round inputs: original training points followed by the new candidates.
X_train_2 = torch.cat([X_train_BO, X_candidates], dim=0)
X_train_2
# len(X_train_2)

In [None]:
# Second-round targets: measured labels followed by the forest-predicted labels.
Y_train_2 = torch.cat([Y_train_BO, Y_rf], dim=0)
Y_train_2
# len(Y_train_2)

In [None]:
# Alternative kept for reference: an exact GP fitted through the marginal log-likelihood.
# model=SingleTaskGP(X_train_2, Y_train_2)
# mll=ExactMarginalLogLikelihood(model.likelihood, model)
# fit_gpytorch_model(mll)

# Refit the SAAS surrogate on the augmented data. The labels of the added
# points are random-forest predictions (Y_rf), not measurements.
model = fully_bayesian.SaasFullyBayesianSingleTaskGP(X_train_2, Y_train_2)
fit_fully_bayesian_model_nuts(model)

In [None]:
# Rebuild both acquisition functions on top of the refitted second-round model.
EI = qExpectedImprovement(model=model, best_f=best_y)

UCB = qUpperConfidenceBound(model=model, beta=0.6)

In [None]:
# Second-round candidate batch. `bounds` is the tensor defined for the first
# round and holds the same values as the literal previously repeated here.
round2_settings = {'q': 20, 'num_restarts': 200, 'raw_samples': 512}
X_candidates, _ = optimize_acqf(acq_function=EI, bounds=bounds, **round2_settings)
X_candidates

KeyboardInterrupt: 

In [None]:
# Label the second-round candidates with the tuned random forest.
# The forest was fitted on a DataFrame with named columns, so the candidate
# tensor is wrapped in a DataFrame carrying the same column names. This avoids
# scikit-learn's feature-name mismatch warning and the implicit tensor-to-array
# conversion; the predictions themselves are unchanged.
candidate_frame = pd.DataFrame(
    X_candidates.detach().cpu().numpy(),
    columns=X_train_RF.columns,
)
Y_rf = rf.predict(candidate_frame)
Y_rf