In [2]:
import deepchem as dc
import numpy as np

  return f(*args, **kwds)
  from numpy.core.umath_tests import inner1d


### Create a random dataset and explore

In [3]:
x = np.random.random((4,5))
y = np.random.random((4,1))

In [4]:
x

array([[0.74082559, 0.16997139, 0.2178261 , 0.62008477, 0.64043808],
       [0.30756477, 0.66993958, 0.77062414, 0.13033649, 0.61015221],
       [0.32277995, 0.9333166 , 0.91555511, 0.54070729, 0.17170172],
       [0.27697189, 0.50193518, 0.13327517, 0.66143032, 0.84805375]])

In [5]:
y

array([[0.43059509],
       [0.16143965],
       [0.2069391 ],
       [0.79790076]])

In [6]:
dataset = dc.data.NumpyDataset(x,y)

In [9]:
print(dataset.X)

[[0.74082559 0.16997139 0.2178261  0.62008477 0.64043808]
 [0.30756477 0.66993958 0.77062414 0.13033649 0.61015221]
 [0.32277995 0.9333166  0.91555511 0.54070729 0.17170172]
 [0.27697189 0.50193518 0.13327517 0.66143032 0.84805375]]


In [12]:
print(dataset.y)

[[0.43059509]
 [0.16143965]
 [0.2069391 ]
 [0.79790076]]


In [13]:
np.array_equal(x, dataset.X)

True

In [14]:
np.array_equal(y, dataset.y)

True

### Load the tox21 test dataset (and explore)

In [17]:
tox21_tasks, tox21_datasets, transformers = dc.molnet.load_tox21()

Loading dataset from disk.
Loading dataset from disk.
Loading dataset from disk.


In [18]:
tox21_tasks

['NR-AR',
 'NR-AR-LBD',
 'NR-AhR',
 'NR-Aromatase',
 'NR-ER',
 'NR-ER-LBD',
 'NR-PPAR-gamma',
 'SR-ARE',
 'SR-ATAD5',
 'SR-HSE',
 'SR-MMP',
 'SR-p53']

In [19]:
len(tox21_tasks)

12

In [20]:
tox21_datasets

(<deepchem.data.datasets.DiskDataset at 0x1a2a200470>,
 <deepchem.data.datasets.DiskDataset at 0x113d87470>,
 <deepchem.data.datasets.DiskDataset at 0x119b40518>)

In [26]:
train_dataset, valid_dataset, test_dataset = tox21_datasets

In [27]:
train_dataset.X.shape # (samples, feature vector length)

(6264, 1024)

In [28]:
valid_dataset.X.shape

(783, 1024)

In [29]:
test_dataset.X.shape

(784, 1024)

In [30]:
np.shape(train_dataset.y) # 12 data poins, called labels, for each sample. correspond to the 12 tasks above

(6264, 12)

In [31]:
train_dataset.w.shape 

(6264, 12)

In [33]:
np.count_nonzero(train_dataset.w) # only 62166 values are nonzero/ were measured

62166

In [34]:
np.count_nonzero(train_dataset.w == 0) # there are 13002 zeros in this array of data values.

13002

In [35]:
# keep zero values around so we avoid irregularly shaped arrays. Just remember to handle missing data as we go along.

In [36]:
transformers # the data has been transformed with BalancingTransformer, which is used to correct for unbalanced data

[<deepchem.trans.transformers.BalancingTransformer at 0x119b40ef0>]

In this dataset, we're looking at which molecules bind to an array of targets. Most molecules do not bind to most of the targets, meaning that most of the labels are zero. So a model "could triviallly achieve >90% accuracy by always predicting 0." To avoid this, "BalancingTransformer adjusts the weights for individual data points so that the total weight assigned to every class is the same. That way the loss function has not systematic preference for any one class."

### Train an existing model (MultitaskClassifier) on the Tox21 dataset

In [38]:
model = dc.models.MultitaskClassifier(n_tasks=12,n_features=1024, layer_sizes =[1000])

In [39]:
model.fit(train_dataset, nb_epoch=10) #an epoch is one complete pass thorugh all samples in a dataset

859.6167750282893

There's usually not enough training data to get to a fully optimized model before running out of data. So we use multiple epochs to train models with smaller amounts of data (use multiple passes over same training dataset). However, the more epochs you use, the more likely you are to end up with an overfit model. 

### Evaluate performance of the trained model

`dc.metrics.Metric` class provides a general way to specify metrics for models

In [50]:
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean, mode="classification") # np.mean == mean of ROC-AUC across all tasks is returned

`ROC-AUC` is a popular heuristic used to summarize how well a classifier works. 

With this dataset, we want to classify molecules as "toxic" or "non-toxic", but the model outputs continuous numbers, not discrete predictions. In practice, we pick a threshold value over which a molecule is predicted to be "toxic". Choice of this threshold will affect false positive/negative rate. 

The ROC (receiver operating characteristic) curve helps visualize this trade-off. Try many diff threshold vals, plot a curve of the true positive rate vs. false positive rate as the threshold is varied.

`ROC-AUC` is the total area under teh ROC curve. If a perfect threshold exists (every sample classified correctly), ROC-AUC is 1. With completely random output, ROC-AUC is 0.5. So we can use `ROC-AUC` to summarize how well a classifier works. 

In [51]:
train_scores = model.evaluate(train_dataset, [metric], transformers)
print("Training ROC-AUC Score: %f" % train_scores["mean-roc_auc_score"])
test_scores = model.evaluate(test_dataset, [metric], transformers)
print("Test ROC-AUC Score: %f" % test_scores["mean-roc_auc_score"])

computed_metrics: [0.9910237269226294, 0.9961323859987155, 0.9597516509854918, 0.9805740353908223, 0.9050492961128287, 0.984607949740244, 0.9914808443507592, 0.9042056020017908, 0.9883149885511855, 0.9675495723678504, 0.9464931497504032, 0.9766392571460536]
Training ROC-AUC Score: 0.965985
computed_metrics: [0.7954620918649695, 0.8467374810318664, 0.8957458151188826, 0.8130625915846766, 0.7147868145275496, 0.7931598264931599, 0.7228566699768288, 0.7227619004688064, 0.8488553766727804, 0.7260629531970996, 0.8683696375124709, 0.7681079083518108]
Test ROC-AUC Score: 0.792997


In [52]:
print(train_scores)

{'mean-roc_auc_score': 0.9659852049432311}


In [53]:
print(test_scores)

{'mean-roc_auc_score': 0.7929974222334084}


### Create and train a new MNIST model

task: classify handwritten numbers

In [59]:
!mkdir MNIST_data
!cd MNIST_data
!wget http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
!wget http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
!wget http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
!wget http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
!cd ..

mkdir: cannot create directory ‘MNIST_data’: File exists
--2019-04-25 14:09:48--  http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Resolving yann.lecun.com (yann.lecun.com)... 216.165.22.6
Connecting to yann.lecun.com (yann.lecun.com)|216.165.22.6|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9912422 (9.5M) [application/x-gzip]
Saving to: ‘train-images-idx3-ubyte.gz.1’


2019-04-25 14:09:50 (5.46 MB/s) - ‘train-images-idx3-ubyte.gz.1’ saved [9912422/9912422]

--2019-04-25 14:09:50--  http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Resolving yann.lecun.com (yann.lecun.com)... 216.165.22.6
Connecting to yann.lecun.com (yann.lecun.com)|216.165.22.6|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 28881 (28K) [application/x-gzip]
Saving to: ‘train-labels-idx1-ubyte.gz.1’


2019-04-25 14:09:51 (292 KB/s) - ‘train-labels-idx1-ubyte.gz.1’ saved [28881/28881]

--2019-04-25 14:09:51--  http://yann.lecun.com/exdb/mnist/t10

In [62]:
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
#one-hot encoding = convert categorical variable into integer form. 
# In this case, 1-9 are categories
# 1 becomes [1,0,0,0,0,0,0,0,0]
# 9 becomes [0,0,0,0,0,0,0,0,1]

Successfully downloaded train-images-idx3-ubyte.gz 9912422 bytes.
Extracting MNIST_data/train-images-idx3-ubyte.gz
Successfully downloaded train-labels-idx1-ubyte.gz 28881 bytes.
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Successfully downloaded t10k-images-idx3-ubyte.gz 1648877 bytes.
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Successfully downloaded t10k-labels-idx1-ubyte.gz 4542 bytes.
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


In [64]:
train_dataset = dc.data.NumpyDataset(mnist.train.images, mnist.train.labels)
test_dataset = dc.data.NumpyDataset(mnist.test.images, mnist.test.labels)

In [65]:
model = dc.models.TensorGraph(model_dir='mnist') # model_dir is required to save model somewhere!

In [66]:
isinstance(model, dc.models.Model)

True

## the commands below here are incomplete, partly because some commands were cut off in the pdf copy

In [85]:
import deepchem.models.tensorgraph.layers as layers
import tensorflow as tf

In [80]:
feature = layers.Feature(shape=(None, 784))
label = layers.Label(shape=(None,10))

In [83]:
make_image = layers.Reshape(shape=(None, 28,28), in_layers = feature)

In [86]:
conv2d_1 = layers.Conv2D(num_outputs = 32, activation_fn = tf.nn.relu) #incomplete?
conv2d_2 = layers.Conv2D(num_outputs = 64, activation_fn = tf.nn.relu) #incomplete?

In [89]:
flatten = layers.Flatten(in_layers=conv2d_2)
dense1 = layers.Dense(out_channels =1024, activation_fn = tf.nn.relu) # incomplete?
dense2 = layers.Dense(out_channels = 10, activation_fn=None, in_layers= feature) # incomplete

In [90]:
smce = layers.SoftMaxCrossEntropy(in_layers=[label, dense2])
loss = layers.ReduceMean(in_layers=smce)
model.set_loss(loss)

In [91]:
output = layers.SoftMax(in_layers=dense2)
model.add_output(output)

In [92]:
model.fit(train_dataset, nb_epoch=10)

0.261777402784772

In [None]:
metric = dc.metrics.Metric(dc.metrics.accuracy_score)

In [None]:
train_scores = model.evaluate(train_dataset, [metric])
test_scores = model.evaluate(test_dataset, [metric])