In [1]:
import deepchem as dc
import numpy as np

  return f(*args, **kwds)
  from numpy.core.umath_tests import inner1d


### Create a random dataset and explore

In [2]:
x = np.random.random((4,5))
y = np.random.random((4,1))

In [3]:
x

array([[0.44621009, 0.51024127, 0.66014646, 0.26643113, 0.86685781],
       [0.67478791, 0.98480084, 0.42769923, 0.70659297, 0.20465776],
       [0.880069  , 0.14874706, 0.1930732 , 0.34216168, 0.98266257],
       [0.76281526, 0.37601553, 0.04442908, 0.83226447, 0.97504219]])

In [4]:
y

array([[0.74454328],
       [0.0949587 ],
       [0.44730824],
       [0.36506096]])

In [5]:
dataset = dc.data.NumpyDataset(x,y)

In [6]:
print(dataset.X) # capital bc is matrix

[[0.44621009 0.51024127 0.66014646 0.26643113 0.86685781]
 [0.67478791 0.98480084 0.42769923 0.70659297 0.20465776]
 [0.880069   0.14874706 0.1930732  0.34216168 0.98266257]
 [0.76281526 0.37601553 0.04442908 0.83226447 0.97504219]]


In [7]:
print(dataset.y) # lowercase bc is vector

[[0.74454328]
 [0.0949587 ]
 [0.44730824]
 [0.36506096]]


In [8]:
np.array_equal(x, dataset.X)

True

In [9]:
np.array_equal(y, dataset.y)

True

### Load the tox21 test dataset (and explore)

In [10]:
tox21_tasks, tox21_datasets, transformers = dc.molnet.load_tox21()

Loading raw samples now.
shard_size: 8192
About to start loading CSV from /var/folders/1c/s23lyl9x7cgd1rg4n6t867qr0000gn/T/tox21.csv.gz
Loading shard 1 of size 8192.
Featurizing sample 0
Featurizing sample 1000
Featurizing sample 2000
Featurizing sample 3000
Featurizing sample 4000
Featurizing sample 5000
Featurizing sample 6000
Featurizing sample 7000
TIMING: featurizing shard 0 took 19.596 s
TIMING: dataset construction took 19.987 s
Loading dataset from disk.
TIMING: dataset construction took 0.582 s
Loading dataset from disk.
TIMING: dataset construction took 0.509 s
Loading dataset from disk.
TIMING: dataset construction took 0.383 s
Loading dataset from disk.
TIMING: dataset construction took 0.326 s
Loading dataset from disk.


In [11]:
tox21_tasks

['NR-AR',
 'NR-AR-LBD',
 'NR-AhR',
 'NR-Aromatase',
 'NR-ER',
 'NR-ER-LBD',
 'NR-PPAR-gamma',
 'SR-ARE',
 'SR-ATAD5',
 'SR-HSE',
 'SR-MMP',
 'SR-p53']

In [12]:
len(tox21_tasks)

12

In [13]:
tox21_datasets

(<deepchem.data.datasets.DiskDataset at 0x1a2d3abf60>,
 <deepchem.data.datasets.DiskDataset at 0x10f239f28>,
 <deepchem.data.datasets.DiskDataset at 0x1a2d3944e0>)

In [14]:
train_dataset, valid_dataset, test_dataset = tox21_datasets

In [15]:
train_dataset.X.shape # (samples, feature vector length)

(6264, 1024)

In [16]:
valid_dataset.X.shape

(783, 1024)

In [17]:
test_dataset.X.shape

(784, 1024)

In [18]:
np.shape(train_dataset.y) # 12 data points, called labels, for each sample. correspond to the 12 tasks above

(6264, 12)

In [19]:
train_dataset.w.shape #weights can let you set certain labels to 0. What happens

(6264, 12)

In [20]:
np.count_nonzero(train_dataset.w) # only 62166 values are nonzero/ were measured

62166

In [21]:
np.count_nonzero(train_dataset.w == 0) # there are 13002 zeros in this array of data values.

13002

In [22]:
# keep zero values around so we avoid irregularly shaped arrays. Just remember to handle missing data as we go along.

In [23]:
transformers # the data has been transformed with BalancingTransformer, which is used to correct for unbalanced data

[<deepchem.trans.transformers.BalancingTransformer at 0x1a2d3ab5f8>]

In this dataset, we're looking at which molecules bind to an array of targets. Most molecules do not bind to most of the targets, meaning that most of the labels are zero. So a model "could triviallly achieve >90% accuracy by always predicting 0." To avoid this, "BalancingTransformer adjusts the weights for individual data points so that the total weight assigned to every class is the same. That way the loss function has not systematic preference for any one class."

### Train an existing model (MultitaskClassifier) on the Tox21 dataset

In [24]:
model = dc.models.MultitaskClassifier(n_tasks=12,n_features=1024, layer_sizes =[1000])

In [25]:
model.fit(train_dataset, nb_epoch=10) #an epoch is one complete pass thorugh all samples in a dataset

861.1133545890686

There's usually not enough training data to get to a fully optimized model before running out of data. So we use multiple epochs to train models with smaller amounts of data (use multiple passes over same training dataset). However, the more epochs you use, the more likely you are to end up with an overfit model. 

### Evaluate performance of the trained model

`dc.metrics.Metric` class provides a general way to specify metrics for models

In [26]:
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean, mode="classification") # np.mean == mean of ROC-AUC across all tasks is returned

`ROC-AUC` is a popular heuristic used to summarize how well a classifier works. 

With this dataset, we want to classify molecules as "toxic" or "non-toxic", but the model outputs continuous numbers, not discrete predictions. In practice, we pick a threshold value over which a molecule is predicted to be "toxic". Choice of this threshold will affect false positive/negative rate. 

The ROC (receiver operating characteristic) curve helps visualize this trade-off. Try many diff threshold vals, plot a curve of the true positive rate vs. false positive rate as the threshold is varied.

`ROC-AUC` is the total area under teh ROC curve. If a perfect threshold exists (every sample classified correctly), ROC-AUC is 1. With completely random output, ROC-AUC is 0.5. So we can use `ROC-AUC` to summarize how well a classifier works. 

In [27]:
train_scores = model.evaluate(train_dataset, [metric], transformers)
print("Training ROC-AUC Score: %f" % train_scores["mean-roc_auc_score"])
test_scores = model.evaluate(test_dataset, [metric], transformers)
print("Test ROC-AUC Score: %f" % test_scores["mean-roc_auc_score"])

computed_metrics: [0.9886872783244987, 0.9961725272960822, 0.9605013584685427, 0.9816490617170226, 0.9002438024313751, 0.9839427470879616, 0.9919344634776031, 0.9091163499117103, 0.9864523812837984, 0.9702288222334021, 0.9472936971856232, 0.9761481232869049]
Training ROC-AUC Score: 0.966031
computed_metrics: [0.7925843940232429, 0.8573034339346934, 0.8975370508583005, 0.7993510571488381, 0.715897949846402, 0.7789122455789121, 0.7169811320754718, 0.7193360079336459, 0.8525915359010937, 0.7127554383651944, 0.8655428999002328, 0.7809959349593496]
Test ROC-AUC Score: 0.790816


In [28]:
print(train_scores)

{'mean-roc_auc_score': 0.9660308843920437}


In [29]:
print(test_scores)

{'mean-roc_auc_score': 0.7908157567104482}


### Create and train a new MNIST model

task: classify handwritten numbers

In [30]:
!mkdir MNIST_data
!cd MNIST_data
!wget http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
!wget http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
!wget http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
!wget http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
!cd ..

mkdir: MNIST_data: File exists
/bin/sh: wget: command not found
/bin/sh: wget: command not found
/bin/sh: wget: command not found
/bin/sh: wget: command not found


In [31]:
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
#one-hot encoding = convert categorical variable into integer form. 
# In this case, 1-9 are categories
# 1 becomes [1,0,0,0,0,0,0,0,0]
# 9 becomes [0,0,0,0,0,0,0,0,1]

Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


In [32]:
train_dataset = dc.data.NumpyDataset(mnist.train.images, mnist.train.labels)
test_dataset = dc.data.NumpyDataset(mnist.test.images, mnist.test.labels)

In [33]:
model = dc.models.TensorGraph(model_dir='mnist') # model_dir is required to save model somewhere!

In [34]:
isinstance(model, dc.models.Model)

True

In [35]:
import deepchem.models.tensorgraph.layers as layers
import tensorflow as tf

In [36]:
feature = layers.Feature(shape=(None, 784))
label = layers.Label(shape=(None,10))

In [37]:
make_image = layers.Reshape(shape=(None, 28,28), in_layers = feature)

In [38]:
conv2d_1 = layers.Conv2D(num_outputs = 32, activation_fn = tf.nn.relu, in_layers = make_image)
conv2d_2 = layers.Conv2D(num_outputs = 64, activation_fn = tf.nn.relu, in_layers=conv2d_1)

In [39]:
flatten = layers.Flatten(in_layers=conv2d_2)
dense1 = layers.Dense(out_channels =1024, activation_fn = tf.nn.relu, in_layers = flatten)
dense2 = layers.Dense(out_channels = 10, activation_fn=None, in_layers= dense1)

In [40]:
smce = layers.SoftMaxCrossEntropy(in_layers=[label, dense2])
loss = layers.ReduceMean(in_layers=smce)
model.set_loss(loss)

In [41]:
output = layers.SoftMax(in_layers=dense2)
model.add_output(output)

In [42]:
model.fit(train_dataset, nb_epoch=10)

0.0036751583300895594

In [43]:
metric = dc.metrics.Metric(dc.metrics.accuracy_score)

In [44]:
train_scores = model.evaluate(train_dataset, [metric])
test_scores = model.evaluate(test_dataset, [metric])

computed_metrics: [0.9987454545454545]
computed_metrics: [0.9881]
