In [1]:
!pip install --pre deepchem
import deepchem
deepchem.__version__



'2.4.0'

In [2]:
#for simplicity's sake, I'm going to be using the exact data set
#and code used in tutorial 13, which contains the pdbbind data

#start with common imports
import os
import numpy as np
import pandas as pd
import tempfile

from rdkit import Chem
from rdkit.Chem import AllChem
import deepchem as dc
from simtk.openmm.app import PDBFile
from pdbfixer import PDBFixer
from sklearn.ensemble import RandomForestRegressor

from deepchem.utils.vina_utils import prepare_inputs
from deepchem.utils import download_url, load_from_disk
from deepchem.utils.evaluate import Evaluator 

# Resolve DeepChem's data directory and the path where the PDBBind core-set
# table is expected to live inside it.
dataDir = dc.utils.get_data_dir()
datasetFile = os.path.join(dataDir, "pdbbind_core_df.csv.gz")

# Download the table only when it is not already present on disk.
# dest_dir is passed explicitly so the file is guaranteed to land at
# datasetFile (the path checked here and loaded below) instead of relying on
# download_url's default destination happening to be the same directory.
if not os.path.exists(datasetFile):
    print('File does not exist. Downloading file...')
    download_url(
        "https://s3-us-west-1.amazonaws.com/deepchem.io/datasets/pdbbind_core_df.csv.gz",
        dest_dir=dataDir)
    # Other archive noted by the original author (not downloaded here):
    # http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/pdbbind_v2015.tar.gz
    print('File downloaded...')

# Load the table and keep only the three columns used in this notebook.
raw_dataset = load_from_disk(datasetFile)
raw_dataset = raw_dataset[['pdb_id', 'smiles', 'label']]

# Show the first rows as a quick sanity check.
raw_dataset.head()


Unnamed: 0,pdb_id,smiles,label
0,2d3u,CC1CCCCC1S(O)(O)NC1CC(C2CCC(CN)CC2)SC1C(O)O,6.92
1,3cyx,CC(C)(C)NC(O)C1CC2CCCCC2C[NH+]1CC(O)C(CC1CCCCC...,8.0
2,3uo4,OC(O)C1CCC(NC2NCCC(NC3CCCCC3C3CCCCC3)N2)CC1,6.52
3,1p1q,CC1ONC(O)C1CC([NH3+])C(O)O,4.89
4,3ag9,NC(O)C(CCC[NH2+]C([NH3+])[NH3+])NC(O)C(CCC[NH2...,8.05


In [3]:
# Pull the PDB identifiers and the ligand SMILES strings out of the table
# (one array per column, in table row order).
pdbids, ligand_smiles = (
    raw_dataset[column].values for column in ('pdb_id', 'smiles')
)

In [4]:
# Display how many PDB ids and ligand SMILES strings were extracted; both come
# from the same table, so the two counts are expected to be equal.
len(pdbids), len(ligand_smiles)

(193, 193)

In [None]:
%%time
# For every complex: download the protein structure from RCSB, write it to
# '<pdbid>.pdb', then try to prepare RDKit molecules for the protein and the
# ligand SMILES. Complexes that fail preparation are reported and skipped
# (their downloaded '<pdbid>.pdb' stays on disk without a ligand file).
for (pdbid, ligand) in zip(pdbids, ligand_smiles):
  protein_file = '%s.pdb' % (pdbid)
  fixer = PDBFixer(url='https://files.rcsb.org/download/%s.pdb' % (pdbid))
  # Use a context manager so the handle is flushed and closed before
  # prepare_inputs reads the file back (the original never closed it).
  with open(protein_file, 'w') as pdb_handle:
    PDBFile.writeFile(fixer.topology, fixer.positions, pdb_handle)

  p, m = None, None
  # skip pdb fixing for speed
  try:
    p, m = prepare_inputs(protein_file, ligand)
  except Exception as err:
    # Catch Exception rather than using a bare except so that Ctrl-C /
    # kernel interrupts still stop this long-running loop, and report the
    # underlying error to make failures diagnosable.
    print('%s failed sanitization: %r' % (pdbid, err))

  if p is not None and m is not None:  # protein and molecule are readable by RDKit
    Chem.rdmolfiles.MolToPDBFile(p, protein_file)
    Chem.rdmolfiles.MolToPDBFile(m, 'ligand_%s.pdb' % (pdbid))

In [None]:
# List the working directory once and sort it: os.listdir returns entries in
# arbitrary order, so without sorting the protein and ligand lists would not
# be in a reproducible order, nor in the same PDB-id order as each other.
pdb_files = sorted(f for f in os.listdir('.') if f.endswith('.pdb'))
# Protein files are named '<4-char id>.pdb' (8 characters in total).
proteins = [f for f in pdb_files if len(f) == 8]
# Ligand files are named 'ligand_<id>.pdb'.
ligands = [f for f in pdb_files if f.startswith('ligand')]

In [None]:
# Display how many protein and ligand PDB files were found on disk. The counts
# can differ here: a complex whose preparation failed has a protein file but
# no ligand file.
len(proteins), len(ligands)

In [None]:
# Handle failed sanitizations
# A failure is a PDB id that has a protein file ('<id>.pdb') but no matching
# ligand file ('ligand_<id>.pdb'); drop those proteins from the list in place.
failures = {name[:-4] for name in proteins}.difference(
    name[7:-4] for name in ligands
)
print(failures)
for pdbid in failures:
  proteins.remove(pdbid + '.pdb')

In [None]:
# After dropping the failed complexes every remaining protein must have a
# ligand file; enforce that instead of relying on a visual check.
assert len(proteins) == len(ligands), (
    'protein/ligand file counts differ: %d vs %d' % (len(proteins), len(ligands)))
len(proteins), len(ligands)

In [None]:
# PDB ids of the complexes that were prepared successfully, in `proteins` order.
pdbids = [f[:-4] for f in proteins]
small_dataset = raw_dataset[raw_dataset['pdb_id'].isin(pdbids)]
# Look the labels up by PDB id so that labels[i] belongs to pdbids[i].
# Taking small_dataset.label directly would keep the table's row order, which
# is unrelated to the directory-listing order of `pdbids`, and would attach
# labels to the wrong complexes. A missing id raises KeyError rather than
# silently producing a shorter or shifted label vector.
label_by_pdbid = small_dataset.drop_duplicates('pdb_id').set_index('pdb_id')['label']
labels = label_by_pdbid.loc[pdbids]

In [None]:
# Circ fingerprint featurizer
fp_featurizer_1 = dc.feat.CircularFingerprint(size=2048)
# Build the ligand file names from `pdbids` (files are written as
# 'ligand_<id>.pdb') so that feature row i belongs to pdbids[i]; iterating the
# `ligands` directory listing would not guarantee that order.
ligand_mols = [Chem.MolFromPDBFile('ligand_%s.pdb' % pdbid) for pdbid in pdbids]
features_1 = fp_featurizer_1.featurize(ligand_mols)

In [None]:
# Wrap the ligand fingerprints, labels and PDB ids in a DeepChem dataset.
dataset_1 = dc.data.NumpyDataset(
    X=features_1,
    y=labels,
    ids=pdbids,
)
# Random train/test split with a fixed seed so the split is repeatable.
train_dataset_1, test_dataset_1 = (
    dc.splits.RandomSplitter().train_test_split(dataset_1, seed=2)
)

In [None]:
# Fit a random forest on the ligand-fingerprint training split; the random
# state is fixed so repeated runs build the same forest.
seed = 2
sklearn_model = RandomForestRegressor(
    n_estimators=100,
    max_features='sqrt',
    random_state=seed,
)
model_1 = dc.models.SklearnModel(sklearn_model)
model_1.fit(train_dataset_1)

In [None]:
# Score model_1 on both splits with DeepChem's pearson_r2_score metric
# (an R^2-style correlation score for regression, not a classification accuracy).
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)

# Training split. The third Evaluator argument is an empty list
# (presumably "no transformers to undo" -- confirm against the Evaluator docs).
evaluator = Evaluator(model_1, train_dataset_1, [])
train_r2score = evaluator.compute_model_performance([metric])
# The result is indexed by the metric's name.
print("RF Train set R^2 %f" % (train_r2score["pearson_r2_score"]))

# Test split, scored the same way.
evaluator = Evaluator(model_1, test_dataset_1, [])
test_r2score = evaluator.compute_model_performance([metric])
print("RF Test set R^2 %f" % (test_r2score["pearson_r2_score"]))

In [None]:
# Contact circular fingerprint featurizer (uses ligand + protein files).
fp_featurizer_2 = dc.feat.ContactCircularFingerprint(size=2048)
# Build each (ligand file, protein file) pair from the same PDB id so a ligand
# is never paired with another complex's protein, and so feature row i belongs
# to pdbids[i]. Zipping the two directory listings gives no such guarantee.
# A concrete list is passed instead of a one-shot zip iterator.
complexes = [('ligand_%s.pdb' % pdbid, '%s.pdb' % pdbid) for pdbid in pdbids]
features_2 = fp_featurizer_2.featurize(complexes)

In [None]:
# Wrap the contact fingerprints, labels and PDB ids in a DeepChem dataset.
dataset_2 = dc.data.NumpyDataset(
    X=features_2,
    y=labels,
    ids=pdbids,
)
# Random train/test split with a fixed seed so the split is repeatable.
train_dataset_2, test_dataset_2 = (
    dc.splits.RandomSplitter().train_test_split(dataset_2, seed=2)
)

In [None]:
# Fit a random forest on the contact-fingerprint training split; the random
# state is fixed so repeated runs build the same forest.
seed = 2
sklearn_model = RandomForestRegressor(
    n_estimators=100,
    max_features='sqrt',
    random_state=seed,
)
model_2 = dc.models.SklearnModel(sklearn_model)
model_2.fit(train_dataset_2)

In [None]:
# Score model_2 on both splits with DeepChem's pearson_r2_score metric
# (an R^2-style correlation score for regression, not a classification accuracy).
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)

# Training split. The third Evaluator argument is an empty list
# (presumably "no transformers to undo" -- confirm against the Evaluator docs).
evaluator = Evaluator(model_2, train_dataset_2, [])
train_r2score = evaluator.compute_model_performance([metric])
# The result is indexed by the metric's name.
print("RF Train set R^2 %f" % (train_r2score["pearson_r2_score"]))

# Test split, scored the same way.
evaluator = Evaluator(model_2, test_dataset_2, [])
test_r2score = evaluator.compute_model_performance([metric])
print("RF Test set R^2 %f" % (test_r2score["pearson_r2_score"]))