In [55]:
import numpy as np
import tensorflow as tf
import deepchem as dc
from deepchem.utils.save import load_from_disk
from deepchem.models import GraphConvTensorGraph

In [56]:
# Path of the CSV that every later cell reads from.
dataset_file = "tox21.csv"

# Load the file once through deepchem's generic loader.
# NOTE(review): `dataset` is rebound by the featurization cell further down,
# so this value is only useful for ad-hoc inspection -- confirm it is needed.
dataset = load_from_disk(dataset_file)

In [57]:
# The twelve label columns of the CSV that are used as prediction tasks.
tasks = [
    'NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER', 'NR-ER-LBD',
    'NR-PPAR-gamma', 'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53',
]

# Featurize the molecules in the "smiles" column with ConvMolFeaturizer,
# reading the CSV in shards of 8192 rows.
featurizer = dc.feat.ConvMolFeaturizer()
loader = dc.data.CSVLoader(tasks=tasks, smiles_field="smiles", featurizer=featurizer)
dataset = loader.featurize(dataset_file, shard_size=8192)

# Build the balancing transformer from the featurized dataset and apply it.
# The original cell bound this object to `transformers` but then called
# `transformer.transform(...)`, which raises NameError on a fresh kernel.
# NOTE(review): transform_w=True presumably rebalances the per-sample weights;
# confirm against the DeepChem documentation.
transformer = dc.trans.BalancingTransformer(dataset=dataset, transform_w=True)
transformers = transformer  # keep the original name bound to the same object
print("About to transform data")
dataset = transformer.transform(dataset)

# Split the transformed dataset into train / validation / test subsets.
# NOTE(review): no seed is passed, so the partition may differ between runs;
# confirm whether a fixed seed is wanted for reproducibility.
splitter = dc.splits.RandomSplitter()
train, valid, test = splitter.train_valid_test_split(dataset)

Loading raw samples now.
shard_size: 8192
About to start loading CSV from tox21.csv
Loading shard 1 of size 8192.
Featurizing sample 0




Featurizing sample 1000
Featurizing sample 2000
Featurizing sample 3000
Featurizing sample 4000
Featurizing sample 5000
Featurizing sample 6000
Featurizing sample 7000
TIMING: featurizing shard 0 took 15.343 s
TIMING: dataset construction took 18.133 s
Loading dataset from disk.
About to transform data
TIMING: dataset construction took 3.775 s
Loading dataset from disk.
TIMING: dataset construction took 2.996 s
Loading dataset from disk.
TIMING: dataset construction took 1.311 s
Loading dataset from disk.
TIMING: dataset construction took 1.592 s
Loading dataset from disk.


In [58]:
# Model hyper-parameters.
n_filters = 3   # value repeated for each entry of graph_conv_layers
nodes = 100     # passed as dense_layer_size

# Graph-convolution classifier with one output per task;
# graph_conv_layers receives [n_filters, n_filters].
model = GraphConvTensorGraph(
    len(tasks),
    batch_size=100,
    learning_rate=0.001,
    dense_layer_size=nodes,
    graph_conv_layers=[n_filters, n_filters],
    mode='classification',
    dropout=0.2,
)







In [79]:
# Evaluation metric: ROC AUC is computed per task and the per-task values
# are then averaged with np.mean into a single score.
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, task_averager=np.mean)

In [69]:
# Train on the training split for 10 epochs.
model.fit(train, nb_epoch=10)

# Score the fitted model on the training and validation splits.
print("Evaluating model")
train_scores = model.evaluate(train, [metric])
valid_scores = model.evaluate(valid, [metric])

# Each heading is followed by its score dict on the next line.
print("Train scores", train_scores, sep="\n")
print("Validation scores", valid_scores, sep="\n")


Evaluating model
computed_metrics: [0.8305826794536473, 0.865808674425162, 0.8439520105088381, 0.8339464031359108, 0.7293907649976132, 0.7936217324514889, 0.7657072659305648, 0.736789578431909, 0.8087769582547791, 0.7094280963325454, 0.812859046775077, 0.7826345690251015]
computed_metrics: [0.8086437440305636, 0.8249242424242424, 0.852360274548256, 0.7158308751229105, 0.7161561561561562, 0.7555782081775966, 0.6732868133834318, 0.7158840648104452, 0.8615271159518947, 0.7504789520012427, 0.7844523809523809, 0.7163072776280324]
Train scores
{'mean-roc_auc_score': 0.7927914816435532}
Validation scores
{'mean-roc_auc_score': 0.7646191754322628}


In [81]:
# Score the fitted model on the held-out test split and report the result,
# heading first and the score dict on the following line.
test_scores = model.evaluate(test, [metric])
print("Test scores", test_scores, sep="\n")

computed_metrics: [0.7910515764534354, 0.8920175883646203, 0.8106529209621993, 0.7486414614406289, 0.7209389749801596, 0.769307870858831, 0.6935921221635508, 0.7337973424929947, 0.7896354477930504, 0.5931404958677686, 0.797905544147844, 0.7527760318457992]
Test scores
{'mean-roc_auc_score': 0.7577881147809068}
