## Bayesian Optimisation for Antimicrobial Polymer Discovery

In [4]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from matplotlib import rcParams
import torch
from botorch.models import SingleTaskGP, ModelListGP
from botorch.models.gp_regression_mixed import MixedSingleTaskGP
from botorch.fit import fit_gpytorch_model
from botorch.utils import standardize
from gpytorch.mlls import ExactMarginalLogLikelihood
from botorch.acquisition.monte_carlo import qExpectedImprovement
from botorch.acquisition.analytic import UpperConfidenceBound, ProbabilityOfImprovement, ExpectedImprovement
from botorch.optim import optimize_acqf, optimize_acqf_mixed
from botorch.cross_validation import gen_loo_cv_folds
import math
import pandas as pd

## Retrieve Training dataset and assign variables
To enable mixed search space, utilise MixedSingleTaskGP which uses a special kernel to combine continuous and categorical data.

In [5]:
data = pd.read_excel('dataset_final.xlsx', sheet_name = 'Dataset_Complete_modified')
data

Unnamed: 0,Polymer Index,type_A,type_B1,type_B2,type_C,composition_A,composition_B1,composition_B2,composition_C,block_sequence_theoretical,...,A2,B2,C2,A3,B3,C3,A4,B4,C4,cLogP_predicted
0,1,Boc-AEAm,PEAm,,HEAm,0.5,0.30,0.00,0.20,ABC,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.67630
1,2,Boc-AEAm,PEAm,,HEAm,0.5,0.30,0.00,0.20,ABC,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.67630
2,3,Boc-AEAm,PEAm,,HEAm,0.5,0.30,0.00,0.20,ABC,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.67630
3,4,Boc-AEAm,PEAm,,,0.7,0.30,0.00,0.00,AB,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,1.05270
4,5,Boc-AEAm,PEAm,,,0.7,0.30,0.00,0.00,AB,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,1.05270
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152,157,AAPTAC,PEAm,NIPAm,HEAm,0.3,0.00,0.47,0.23,ABC,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,-1.33796
153,158,AAPTAC,PEAm,NIPAm,HEAm,0.3,0.70,0.00,0.00,ABC,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,-0.16180
154,159,AAPTAC,PEAm,NIPAm,HEAm,0.3,0.47,0.23,0.00,ABC,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,-0.45137
155,160,AAPTAC,PEAm,NIPAm,HEAm,0.3,0.23,0.47,0.00,ABC,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,-0.75353


In [None]:
# hello

Top 3 numerical features: cLogP, composition_B1,composition_A

In [3]:
data = pd.read_csv('modified_data.csv',usecols=['cLogP_predicted', 'composition_B1','composition_A'])
data

Unnamed: 0,composition_A,composition_B1,cLogP_predicted
0,0.5,0.30,0.67630
1,0.5,0.30,0.67630
2,0.5,0.30,0.67630
3,0.7,0.30,1.05270
4,0.7,0.30,1.05270
...,...,...,...
152,0.3,0.00,-1.33796
153,0.3,0.70,-0.16180
154,0.3,0.47,-0.45137
155,0.3,0.23,-0.75353


In [30]:
train_y_raw = torch.tensor(data.MIC_PAO1, dtype = float)
train_y = torch.reshape(train_y_raw,(len(train_y_raw),1))
train_x = torch.tensor(data.iloc[:,0:3].values)
best_y = min(train_y)
train_x, train_y, best_y

AttributeError: 'DataFrame' object has no attribute 'MIC_PAO1'

# Generate the next point
### SingleTaskGP:
Only considers continuous inputs
### MixedSingleTaskGP:
Include discrete inputs and combines using a categorical kernel

In [6]:
#Surrogate = MixedSingleTaskGP(train_X = train_x, train_Y = train_y, cat_dims=[3,4,5,6,7,8])
Surrogate = SingleTaskGP(train_X = train_x, train_Y = train_y)
mll = ExactMarginalLogLikelihood(Surrogate.likelihood, Surrogate)
EI = qExpectedImprovement(model = Surrogate, best_f = best_y)
UCB = UpperConfidenceBound(model = Surrogate, beta = 0.2)
# new_point_analytic, _ = optimize_acqf_mixed(
#     acq_function=EI,
#     bounds=([[0.0] * 9, [1.0] * 9]),
#     q = 1,
#     fixed_features_list = [{3,1}],
#     num_restarts=20,
#     raw_samples=100,
#     options={},
# )

new_point_analytic, _ = optimize_acqf(
    acq_function=EI,
    bounds=torch.tensor([[0.0] * 3, [1.0] * 3]),
    q=1,
    num_restarts=20,
    raw_samples=100,
    options={},
)
new_point_analytic

tensor([[0.5851, 0.1811, 0.0589]])