### Machine learning for TYK2-inhibitor affinity prediction with Scikit-Learn/DeepChem/RDKit

In [1]:
import pandas as pd
import numpy as np


#### Load ChEMBL bioactivity data

In [2]:
from rdkit import Chem
from rdkit.Chem.Draw import MolsToGridImage
from rdkit.Chem import PandasTools

# Load the ChEMBL bioactivity table from a pickled DataFrame.
# NOTE(review): unpickling can execute arbitrary code; only load this file if it
# comes from a trusted source.
chembl_bioactivity_df = pd.read_pickle('../data/chembl_bioactivity_data.pkl')
# Preview the first two records.
chembl_bioactivity_df.head(2)

Unnamed: 0,molecule_chembl_id,molecule_pref_name,canonical_smiles,pchembl_value,standard_type,standard_relation,standard_value,standard_units,potential_duplicate,target_pref_name,target_organism,assay_type,assay_description,chembl_id_duplicate,mean_pchembl_value,max_pchembl_value,min_pchembl_value,core_smiles,Mol,Scaffold
0,CHEMBL10,SB-203580,C[S+]([O-])c1ccc(-c2nc(-c3ccc(F)cc3)c(-c3ccncc...,5.7,Kd,=,2000.0,nM,False,Tyrosine-protein kinase TYK2,Homo sapiens,B,Binding constant for TYK2(JH2domain-pseudokina...,False,5.7,5.7,5.7,c1ccc(-c2nc(-c3ccccc3)c(-c3ccncc3)[nH]2)cc1,"<img data-content=""rdkit/molecule"" src=""data:i...","<img data-content=""rdkit/molecule"" src=""data:i..."
1,CHEMBL1076700,,Cc1cc(Nc2nc3cccc(-c4cc(F)c(CN5CCOCC5)c(F)c4)c3...,7.01,IC50,=,97.0,nM,False,Tyrosine-protein kinase TYK2,Homo sapiens,B,Inhibition of GST-tagged TYK2 assessed as inhi...,False,7.01,7.01,7.01,c1ccc(Nc2nc3cccc(-c4ccc(CN5CCOCC5)cc4)c3o2)cc1,"<img data-content=""rdkit/molecule"" src=""data:i...","<img data-content=""rdkit/molecule"" src=""data:i..."


In [3]:
# Dimensions of the loaded table as (rows, columns).
chembl_bioactivity_df.shape

(1502, 20)

In [4]:
from rdkit.Chem.SaltRemover import SaltRemover

_DEFAULT_SALT_REMOVER = None  # created lazily on first use, then shared by all calls


def unsalt(smiles, remover=None):
    """Strip salt fragments from a SMILES string and return the resulting SMILES.

    Parameters
    ----------
    smiles : str
        SMILES string of the (possibly salted) compound.
    remover : SaltRemover, optional
        Salt remover to use. When omitted, a single default ``SaltRemover()``
        is created on the first call and reused afterwards, instead of being
        rebuilt for every molecule.

    Returns
    -------
    str
        SMILES (written with ``isomericSmiles=True``) of the molecule after
        ``StripMolWithDeleted`` has been applied.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles`` into a molecule.
    """
    global _DEFAULT_SALT_REMOVER
    if remover is None:
        if _DEFAULT_SALT_REMOVER is None:
            _DEFAULT_SALT_REMOVER = SaltRemover()
        remover = _DEFAULT_SALT_REMOVER

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None; fail here with
        # the offending input instead of with an obscure error inside the remover.
        raise ValueError(f"Could not parse SMILES: {smiles!r}")

    # The list of deleted fragments is returned as well but is not needed here.
    stripped_mol, _deleted = remover.StripMolWithDeleted(mol)
    return Chem.MolToSmiles(stripped_mol, isomericSmiles=True)

# Work on an independent copy holding only the identifier, the raw SMILES and
# the regression target.
chembl_bioactivity_ml_df = chembl_bioactivity_df[
    ['molecule_chembl_id', 'canonical_smiles', 'mean_pchembl_value']
].copy()

# Strip salts from every canonical SMILES and store the result in a new column.
smiles = [
    unsalt(raw_smiles)
    for raw_smiles in list(chembl_bioactivity_ml_df['canonical_smiles'])
]
chembl_bioactivity_ml_df['smiles'] = smiles

# Parse the desalted SMILES into RDKit molecules (MolFromSmiles sanitizes by default).
# The list holds exactly the values just written to the 'smiles' column.
mols = [Chem.MolFromSmiles(desalted) for desalted in smiles]



#### Featurize the ChEMBL dataset

#### 1. Use Molecular Descriptors

In [5]:
import deepchem as dc

# With use_fragment=True a total of 208 descriptors are returned, which also
# includes the fragment descriptors whose names start with 'fr_'.
md_featurizer = dc.feat.RDKitDescriptors(use_fragment = False)

features_md = md_featurizer.featurize(mols)
# features_md is an N x 123 array holding the 123 molecular descriptors
# (physicochemical properties) for the 1502 molecules.
print(features_md.shape)
# Preview the descriptor rows of the first five molecules.
features_md[:5]

(1502, 123)


array([[ 1.33377069e+01, -1.02754413e+00,  1.33377069e+01,
         2.88772971e-01,  5.25196404e-01,  3.77444000e+02,
         3.61316000e+02,  3.77099811e+02,  1.34000000e+02,
         0.00000000e+00,  1.51945536e-01, -6.11646138e-01,
         6.11646138e-01,  1.51945536e-01,  8.51851852e-01,
         1.48148148e+00,  2.11111111e+00,  3.22277374e+01,
         1.00631031e+01,  2.15653172e+00, -2.03749157e+00,
         2.32663597e+00, -1.95909026e+00,  7.90297649e+00,
         6.00587178e-01,  1.83223649e+00,  1.04524911e+03,
         1.88027541e+01,  1.45170915e+01,  1.53335881e+01,
         1.31141935e+01,  8.31838406e+00,  9.87646226e+00,
         6.03030029e+00,  7.40995310e+00,  4.25014686e+00,
         5.18616699e+00,  2.85853311e+00,  3.37789251e+00,
        -2.96000000e+00,  3.14577780e+00,  1.74536410e+01,
         7.34321150e+00,  3.62549170e+00,  1.58876011e+02,
         9.53672839e+00,  1.78973945e+01,  4.89548348e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+0

In [6]:
from rdkit.Chem import Descriptors

# Collect every RDKit descriptor except the fragment descriptors, i.e. those
# whose name starts with 'fr_'.
descriptors = []  # descriptor names
descList = []     # matching (name, function) pairs
for descriptor, function in Descriptors.descList:
    if not descriptor.startswith('fr_'):
        descriptors.append(descriptor)
        descList.append((descriptor, function))

print(descriptors)
print(len(descriptors))

['MaxEStateIndex', 'MinEStateIndex', 'MaxAbsEStateIndex', 'MinAbsEStateIndex', 'qed', 'MolWt', 'HeavyAtomMolWt', 'ExactMolWt', 'NumValenceElectrons', 'NumRadicalElectrons', 'MaxPartialCharge', 'MinPartialCharge', 'MaxAbsPartialCharge', 'MinAbsPartialCharge', 'FpDensityMorgan1', 'FpDensityMorgan2', 'FpDensityMorgan3', 'BCUT2D_MWHI', 'BCUT2D_MWLOW', 'BCUT2D_CHGHI', 'BCUT2D_CHGLO', 'BCUT2D_LOGPHI', 'BCUT2D_LOGPLOW', 'BCUT2D_MRHI', 'BCUT2D_MRLOW', 'BalabanJ', 'BertzCT', 'Chi0', 'Chi0n', 'Chi0v', 'Chi1', 'Chi1n', 'Chi1v', 'Chi2n', 'Chi2v', 'Chi3n', 'Chi3v', 'Chi4n', 'Chi4v', 'HallKierAlpha', 'Ipc', 'Kappa1', 'Kappa2', 'Kappa3', 'LabuteASA', 'PEOE_VSA1', 'PEOE_VSA10', 'PEOE_VSA11', 'PEOE_VSA12', 'PEOE_VSA13', 'PEOE_VSA14', 'PEOE_VSA2', 'PEOE_VSA3', 'PEOE_VSA4', 'PEOE_VSA5', 'PEOE_VSA6', 'PEOE_VSA7', 'PEOE_VSA8', 'PEOE_VSA9', 'SMR_VSA1', 'SMR_VSA10', 'SMR_VSA2', 'SMR_VSA3', 'SMR_VSA4', 'SMR_VSA5', 'SMR_VSA6', 'SMR_VSA7', 'SMR_VSA8', 'SMR_VSA9', 'SlogP_VSA1', 'SlogP_VSA10', 'SlogP_VSA11', 'Slo

#### 2. Use Fingerprints

In [7]:
# Circular fingerprints with a fixed length of 2048 per molecule.
fp_featurizer = dc.feat.CircularFingerprint(size=2048)

features_fp = fp_featurizer.featurize(mols)
# features_fp is an N x 2048 array containing the fingerprints for the 1502 molecules.
print(features_fp.shape)
# Preview the fingerprints of the first five molecules.
features_fp[:5]

(1502, 2048)


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

#### 3. Use Graph Convolutions

In [8]:
# Featurize the molecules for graph convolutions; the result is an object
# array with one ConvMol per molecule (see the output below).
gc_featurizer = dc.feat.ConvMolFeaturizer()
features_graphs = gc_featurizer.featurize(mols)
features_graphs

array([<deepchem.feat.mol_graphs.ConvMol object at 0x7f854b6b5e50>,
       <deepchem.feat.mol_graphs.ConvMol object at 0x7f85c01bdd90>,
       <deepchem.feat.mol_graphs.ConvMol object at 0x7f8622e84890>, ...,
       <deepchem.feat.mol_graphs.ConvMol object at 0x7f85410c3750>,
       <deepchem.feat.mol_graphs.ConvMol object at 0x7f85410c3650>,
       <deepchem.feat.mol_graphs.ConvMol object at 0x7f85410c3510>],
      dtype=object)

#### Dataset preparation

In [9]:
# Choose the featurization used by the model below: the circular fingerprints.
features = features_fp
# Regression target (mean pChEMBL value) and ChEMBL ids used as sample identifiers.
labels = chembl_bioactivity_ml_df['mean_pchembl_value']
ids = chembl_bioactivity_ml_df['molecule_chembl_id']

# Bundle features, labels and ids into a DeepChem NumpyDataset.
dataset = dc.data.NumpyDataset(X=features, y=labels, ids=ids)

# Random train/test split with a fixed seed; the shapes printed in the next
# cells show 1201 training and 301 test samples.
# NOTE(review): a random split may put close structural analogues into both
# sets - consider whether a scaffold-based split is more appropriate.
train_dataset, test_dataset = dc.splits.RandomSplitter().train_test_split(dataset, seed=42)

In [10]:
# Shapes of the arrays stored in the training set (four shapes are reported;
# presumably X, y, w and ids - confirm against the DeepChem documentation).
train_dataset.get_shape()

((1201, 2048), (1201,), (1201,), (1201,))

In [11]:
# Shapes of the arrays stored in the test set (four shapes are reported;
# presumably X, y, w and ids - confirm against the DeepChem documentation).
test_dataset.get_shape()

((301, 2048), (301,), (301,), (301,))

#### RandomForestRegressor model

In [12]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score

In [13]:
# Seed for the random forest so that repeated runs build the same trees.
seed = 42
rf_model = RandomForestRegressor(random_state=seed)

# Only the number of trees is searched; out-of-bag scoring is enabled for
# every candidate.
param_grid = {
    'oob_score': [True],
    'n_estimators': [50, 100, 150, 200, 250],
}

# 10-fold cross-validated grid search. With refit=True the best candidate is
# retrained on the full training data once the search has finished.
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=10,
    verbose=1,
    refit=True,
    return_train_score=True,
    n_jobs=-2,
)

grid_search.fit(train_dataset.X, train_dataset.y)


Fitting 10 folds for each of 5 candidates, totalling 50 fits


GridSearchCV(cv=10, estimator=RandomForestRegressor(random_state=42), n_jobs=-2,
             param_grid={'n_estimators': [50, 100, 150, 200, 250],
                         'oob_score': [True]},
             return_train_score=True, verbose=1)

In [14]:
# Parameter combination selected by the grid search.
grid_search.best_params_

{'n_estimators': 250, 'oob_score': True}

In [15]:
# The refit estimator built with the selected parameters (refit=True in the search).
grid_search.best_estimator_

RandomForestRegressor(n_estimators=250, oob_score=True, random_state=42)

In [16]:
# Full search log: per-candidate timings, parameters and per-split scores.
# NOTE(review): this dumps a large dict; pd.DataFrame(grid_search.cv_results_)
# would give a more readable table.
grid_search.cv_results_

{'mean_fit_time': array([12.75109618, 25.94488497, 38.75033133, 49.92226384, 57.27423413]),
 'std_fit_time': array([ 1.73536555,  3.20697271,  5.44109877,  7.77227467, 12.86719182]),
 'mean_score_time': array([0.01463439, 0.02689342, 0.03929541, 0.04694223, 0.04528439]),
 'std_score_time': array([0.0022877 , 0.00619353, 0.00864456, 0.01158299, 0.017581  ]),
 'param_n_estimators': masked_array(data=[50, 100, 150, 200, 250],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_oob_score': masked_array(data=[True, True, True, True, True],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'n_estimators': 50, 'oob_score': True},
  {'n_estimators': 100, 'oob_score': True},
  {'n_estimators': 150, 'oob_score': True},
  {'n_estimators': 200, 'oob_score': True},
  {'n_estimators': 250, 'oob_score': True}],
 'split0_test_score': array([0.58432508, 0.59453371, 0.58

In [17]:
# Mean cross-validated score of the selected candidate (no `scoring` was passed,
# so the estimator's default score is used - R2 for scikit-learn regressors).
grid_search.best_score_

0.5125287043201121

In [18]:
# Predict with the fitted search object (refit=True was requested, so a model
# refit on the training data is available) on both the training and test features.
y_pred_train = grid_search.predict(train_dataset.X)
y_pred_test = grid_search.predict(test_dataset.X)

In [19]:
# R2 of the fitted search model on the full training set and on the held-out
# test set.
# NOTE(review): despite the "cv" in the names these are not cross-validated
# scores; they are computed from the predictions made in the previous cell.
R2_cv_train = r2_score(train_dataset.y, y_pred_train)
R2_cv_test = r2_score(test_dataset.y, y_pred_test)

# NOTE(review): the large gap between the two values printed below suggests the
# forest overfits the training data - worth confirming.
print("RF Train set R2 %f" % R2_cv_train)
print("RF Test set R2 %f" % R2_cv_test)

RF Train set R2 0.919201
RF Test set R2 0.540480


In [21]:
# Report the versions of the main libraries used by this notebook so the
# results can be tied to a specific environment.
import deepchem as dc
print("DeepChem: ", dc.__version__)

# Per the original note, DeepChem runs on TensorFlow here; also list the GPUs
# that TensorFlow can see.
import tensorflow as tf
print("TensorFlow: ", tf.__version__)
print("GPUs available: ", tf.config.list_physical_devices('GPU'))

import sklearn
print("Scikit-Learn: ", sklearn.__version__)

import rdkit
print("RDKit: ", rdkit.__version__)

# Interpreter and core numeric stack versions.
from platform import python_version
print("Python: ", python_version())
print("Numpy: ", np.__version__)
print("Pandas: ", pd.__version__)

DeepChem:  2.5.0
TensorFlow:  2.4.1
GPUs available:  [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Scikit-Learn:  0.24.2
RDKit:  2021.03.1
Python:  3.7.10
Numpy:  1.19.5
Pandas:  1.2.4


In [22]:
# Shell escape: list the packages installed in the conda environment.
! conda list

# packages in environment at /home/cv/anaconda3/envs/deepchem:
#
# Name                    Version                   Build  Channel
_libgcc_mutex             0.1                 conda_forge    conda-forge
_openmp_mutex             4.5                       1_gnu    conda-forge
absl-py                   0.12.0                   pypi_0    pypi
argon2-cffi               20.1.0                   pypi_0    pypi
astunparse                1.6.3                    pypi_0    pypi
async-generator           1.10                     pypi_0    pypi
attrs                     21.1.0                   pypi_0    pypi
backcall                  0.2.0                    pypi_0    pypi
bleach                    3.3.0                    pypi_0    pypi
boost                     1.74.0           py37h6dcda5c_3    conda-forge
boost-cpp                 1.74.0               hc6e9bd1_2    conda-forge
bzip2                     1.0.8                h7f98852_4    conda-forge
ca-certificates           