In [1]:
import pandas as pd 
from pysmiles import read_smiles
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import deepchem as dc
import tensorflow as tf
import tensorflow.keras.layers as layers
from deepchem.models.layers import GraphConv, GraphPool, GraphGather
from deepchem.metrics import to_one_hot
from deepchem.feat.mol_graphs import ConvMol
from deepchem.models import GraphConvModel

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Helper Functions

In [2]:
from deepchem.trans import NormalizationTransformer
from deepchem.splits import IndexSplitter


def Splitting_data(dataset, frac_train = 0.8, frac_valid = 0.1, frac_test = 0.1):
    """Split ``dataset`` into train/valid/test parts and normalize the labels.

    Parameters
    ----------
    dataset
        Dataset object accepted by ``IndexSplitter.train_valid_test_split``.
    frac_train, frac_valid, frac_test : float
        Fractions of ``dataset`` assigned to the training, validation and
        test splits respectively.

    Returns
    -------
    tuple
        ``((train, valid, test), transformers)`` where each split has already
        been passed through every transformer in ``transformers``.
    """
    splitter = IndexSplitter()
    # Split exactly once with the caller's fractions. A second call with the
    # default fractions used to overwrite this result, which made the
    # frac_* arguments ineffective and doubled the splitting work.
    train, valid, test = splitter.train_valid_test_split(
        dataset,
        frac_train=frac_train,
        frac_valid=frac_valid,
        frac_test=frac_test,
    )

    # The normalizer is built from the training split only and then applied
    # to all three splits, so validation/test statistics are never used.
    transformers = [NormalizationTransformer(transform_y = True, dataset = train, move_mean = True)]

    for transformer in transformers:
        train = transformer.transform(train)
        valid = transformer.transform(valid)
        test = transformer.transform(test)

    return (train, valid, test), transformers

In [3]:
def data_load(tasks, smiles, dataset_file):
    """Featurize a CSV file for graph convolutions and split it.

    Parameters
    ----------
    tasks : list
        Names of the CSV columns holding the prediction targets.
    smiles : str
        Name of the CSV column passed to the loader as ``smiles_field``.
    dataset_file : str
        Path of the CSV file to load.

    Returns
    -------
    tuple
        ``(tasks, splits, transformers)`` where ``splits`` and
        ``transformers`` are the two values returned by ``Splitting_data``.
    """
    csv_loader = dc.data.CSVLoader(
        tasks=tasks,
        smiles_field=smiles,
        featurizer=dc.feat.ConvMolFeaturizer(),
    )
    featurized = csv_loader.featurize(dataset_file, shard_size=8192)

    # Splitting_data is called with its default split fractions.
    splits, transformers = Splitting_data(featurized)
    return tasks, splits, transformers
    
    

In [4]:
def graph_conv(tasks, dataset, transformers, batch_size = 128, nb_epoch = 20):
    """Train a regression ``GraphConvModel`` and print its scores per split.

    Parameters
    ----------
    tasks : list
        Task names; only its length is used, as the model's task count.
    dataset : tuple
        ``(train, valid, test)`` datasets.
    transformers : list
        Transformers handed to ``model.evaluate`` together with the metric.
    batch_size : int, optional
        Batch size given to the model constructor.
    nb_epoch : int, optional
        Number of epochs passed to ``model.fit``. Defaults to 20, the value
        that was previously hard-coded.

    Returns
    -------
    None
        The scores are printed, not returned.
    """
    train_dataset, valid_dataset, test_dataset = dataset

    # Score every split with pearson_r2_score, aggregated via np.mean.
    metric = dc.metrics.Metric(dc.metrics.pearson_r2_score, np.mean)

    # Build the model; one output per task.
    model = GraphConvModel(len(tasks), batch_size=batch_size, mode='regression')

    # Train on the training split only.
    model.fit(train_dataset, nb_epoch=nb_epoch)

    print("Evaluating model")
    train_scores = model.evaluate(train_dataset, [metric], transformers)
    valid_scores = model.evaluate(valid_dataset, [metric], transformers)
    test_scores = model.evaluate(test_dataset, [metric], transformers)

    print("Train scores")
    print(train_scores)

    print("Validation scores")
    print(valid_scores)

    print("Test scores")
    print(test_scores)

# ESOL Dataset

In [5]:
# Featurize and split the ESOL CSV; the target is the measured log solubility column.
esol_labels, esol_dataset, esol_trans = data_load(
    tasks=['measured log solubility in mols per litre'],
    smiles='smiles',
    dataset_file='ESOL/delaney-processed.csv',
)

Loading raw samples now.
shard_size: 8192
About to start loading CSV from ESOL/delaney-processed.csv
Loading shard 1 of size 8192.
Featurizing sample 0
Featurizing sample 1000
TIMING: featurizing shard 0 took 1.350 s
TIMING: dataset construction took 1.618 s
Loading dataset from disk.
TIMING: dataset construction took 0.328 s
Loading dataset from disk.
TIMING: dataset construction took 0.115 s
Loading dataset from disk.
TIMING: dataset construction took 0.201 s
Loading dataset from disk.
TIMING: dataset construction took 0.280 s
Loading dataset from disk.
TIMING: dataset construction took 0.132 s
Loading dataset from disk.
TIMING: dataset construction took 0.125 s
Loading dataset from disk.
TIMING: dataset construction took 0.233 s
Loading dataset from disk.
TIMING: dataset construction took 0.032 s
Loading dataset from disk.
TIMING: dataset construction took 0.033 s
Loading dataset from disk.


In [6]:
# Exploratory cell: build a throwaway model just to print its documentation.
# graph_conv() constructs its own model, so nothing later depends on `model`.
model = GraphConvModel(n_tasks=len(esol_labels), batch_size=128, mode='regression')
help(model)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Help on GraphConvModel in module deepchem.models.graph_models object:

class GraphConvModel(deepchem.models.keras_model.KerasModel)
 |  GraphConvModel(n_tasks, graph_conv_layers=[64, 64], dense_layer_size=128, dropout=0.0, mode='classification', number_atom_features=75, n_classes=2, uncertainty=False, batch_size=100, **kwargs)
 |  
 |  This is a DeepChem model implemented by a Keras model.
 |  
 |  This class provides several advantages over using the Keras model's fitting
 |  and prediction methods directly.
 |  
 |  1. It provides better integration with the rest of DeepChem, such as direct
 |     support for Datasets and Transformers.
 |  
 |  2. It defines the loss in a more flexible way.  In particular, Keras does not
 |     support multidimensional weight matrices, which makes it impossible to
 |     implement most multitask models with Keras.
 |  
 |  3. It pro

In [6]:
# Train and score on ESOL with the default batch size.
graph_conv(tasks=esol_labels, dataset=esol_dataset, transformers=esol_trans)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor






Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Evaluating model
computed_metrics: [0.9536899002515603]
computed_metrics: [0.8222152933705345]
computed_metrics: [0.8094500993708863]
Train scores
{'mean-pearson_r2_score': 0.9536899002515603}
Validation scores
{'mean-pearson_r2_score': 0.8222152933705345}
Test scores
{'mean-pearson_r2_score': 0.8094500993708863}


# FreeSolv Dataset

In [7]:
# Featurize and split the FreeSolv CSV; the target column is 'expt'.
freesolv_labels, freesolv_dataset, freesolv_trans = data_load(
    tasks=['expt'],
    smiles='smiles',
    dataset_file='FreeSolv/SAMPL.csv',
)

Loading raw samples now.
shard_size: 8192
About to start loading CSV from FreeSolv/SAMPL.csv
Loading shard 1 of size 8192.
Featurizing sample 0
TIMING: featurizing shard 0 took 0.549 s
TIMING: dataset construction took 0.648 s
Loading dataset from disk.
TIMING: dataset construction took 0.118 s
Loading dataset from disk.
TIMING: dataset construction took 0.066 s
Loading dataset from disk.
TIMING: dataset construction took 0.059 s
Loading dataset from disk.
TIMING: dataset construction took 0.269 s
Loading dataset from disk.
TIMING: dataset construction took 0.063 s
Loading dataset from disk.
TIMING: dataset construction took 0.055 s
Loading dataset from disk.
TIMING: dataset construction took 0.103 s
Loading dataset from disk.
TIMING: dataset construction took 0.016 s
Loading dataset from disk.
TIMING: dataset construction took 0.017 s
Loading dataset from disk.


In [8]:
# Train and score on FreeSolv with a batch size of 50.
graph_conv(
    tasks=freesolv_labels,
    dataset=freesolv_dataset,
    transformers=freesolv_trans,
    batch_size=50,
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Evaluating model
computed_metrics: [0.9875903953321227]
computed_metrics: [0.9021377352841167]
computed_metrics: [0.9135036015596026]
Train scores
{'mean-pearson_r2_score': 0.9875903953321227}
Validation scores
{'mean-pearson_r2_score': 0.9021377352841167}
Test scores
{'mean-pearson_r2_score': 0.9135036015596026}


# Lipophilicity Dataset

In [9]:
# Featurize and split the Lipophilicity CSV; the target column is 'exp'.
lipo_labels, lipo_dataset, lipo_trans = data_load(
    tasks=['exp'],
    smiles='smiles',
    dataset_file='lipophilicity/Lipophilicity.csv',
)

Loading raw samples now.
shard_size: 8192
About to start loading CSV from lipophilicity/Lipophilicity.csv
Loading shard 1 of size 8192.
Featurizing sample 0
Featurizing sample 1000
Featurizing sample 2000
Featurizing sample 3000
Featurizing sample 4000
TIMING: featurizing shard 0 took 7.265 s
TIMING: dataset construction took 8.707 s
Loading dataset from disk.
TIMING: dataset construction took 1.779 s
Loading dataset from disk.
TIMING: dataset construction took 0.618 s
Loading dataset from disk.
TIMING: dataset construction took 0.844 s
Loading dataset from disk.
TIMING: dataset construction took 1.519 s
Loading dataset from disk.
TIMING: dataset construction took 0.870 s
Loading dataset from disk.
TIMING: dataset construction took 0.640 s
Loading dataset from disk.
TIMING: dataset construction took 1.463 s
Loading dataset from disk.
TIMING: dataset construction took 0.156 s
Loading dataset from disk.
TIMING: dataset construction took 0.173 s
Loading dataset from disk.


In [10]:
# Train and score on Lipophilicity with a batch size of 400.
graph_conv(
    tasks=lipo_labels,
    dataset=lipo_dataset,
    transformers=lipo_trans,
    batch_size=400,
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Evaluating model
computed_metrics: [0.7680606276374528]
computed_metrics: [0.63346888706968]
computed_metrics: [0.6531321969963345]
Train scores
{'mean-pearson_r2_score': 0.7680606276374528}
Validation scores
{'mean-pearson_r2_score': 0.63346888706968}
Test scores
{'mean-pearson_r2_score': 0.6531321969963345}
