In [2]:
# Install DeepChem into the running kernel's environment. The version is pinned
# to the release this notebook was executed with, so re-runs use the same API.
%pip install deepchem==2.5.0

Collecting deepchem
  Downloading deepchem-2.5.0-py3-none-any.whl (552 kB)
     |████████████████████████████████| 552 kB 1.9 MB/s eta 0:00:01
Installing collected packages: deepchem
Successfully installed deepchem-2.5.0
Note: you may need to restart the kernel to use updated packages.


# Datasets 

In [8]:
import numpy as np
import deepchem as dc

# Seeded generator so the synthetic feature matrix is identical on every run.
rng = np.random.default_rng(0)

# Only X (500 samples x 5 features) is supplied; the printed summary shows the
# y, w and ids values that NumpyDataset fills in on its own.
dataset = dc.data.NumpyDataset(rng.random((500, 5)))
print(dataset)

<NumpyDataset X.shape: (500, 5), y.shape: (500, 1), w.shape: (500, 1), ids: [0 1 2 ... 497 498 499], task_names: [0]>


In [9]:
##%%capture
##!wget https://raw.githubusercontent.com/deepchem/deepchem/master/datasets/delaney-processed.csv -O delaney-processed.csv 
# The commented-out commands above show how to download the Delaney CSV dataset from its URL.

In [10]:
import numpy as np
import deepchem as dc

# Molecules written as SMILES strings; they serve as both the X column and the
# ids of the dataset built below.
mols = [
    'C1=CC2=C(C=C1)C1=CC=CC=C21',
    'O=C1C=CC(=O)C2=C1OC=CO2',
    'C1=C[N]C=C1',
    'C1=CC=CC=C[C+]1',
    'C1=[C]NC=C1',
    'N[C@@H](C)C(=O)O',
    'N[C@H](C)C(=O)O',
    'CC',
    'O=C=O',
    'C#N',
    'CCN(CC)CC',
    'CC(=O)O',
    'C1CCCCC1',
    'c1ccccc1',
]
print("Original set of molecules")
print(mols)

# TODO: switch to the simpler splitter API once it has been merged.
dataset = dc.data.NumpyDataset(X=np.array(mols), ids=mols)
splitter = dc.splits.ScaffoldSplitter()
train, valid, test = splitter.train_valid_test_split(dataset)

# Each split is a dc.data.Dataset object; printing it shows the dataset summary
# (shapes and ids) rather than just the ids.
for label, subset in (("Training set", train), ("Valid set", valid),
                      ("Test set", test)):
    print(label)
    print(subset)

Original set of molecules
['C1=CC2=C(C=C1)C1=CC=CC=C21', 'O=C1C=CC(=O)C2=C1OC=CO2', 'C1=C[N]C=C1', 'C1=CC=CC=C[C+]1', 'C1=[C]NC=C1', 'N[C@@H](C)C(=O)O', 'N[C@H](C)C(=O)O', 'CC', 'O=C=O', 'C#N', 'CCN(CC)CC', 'CC(=O)O', 'C1CCCCC1', 'c1ccccc1']
Training set
<NumpyDataset X.shape: (11,), y.shape: (11, 1), w.shape: (11, 1), ids: ['N[C@@H](C)C(=O)O' 'N[C@H](C)C(=O)O' 'CC' ... 'C1CCCCC1' 'C1=[C]NC=C1'
 'C1=CC=CC=C[C+]1'], task_names: [0]>
Valid set
<NumpyDataset X.shape: (1,), y.shape: (1, 1), w.shape: (1, 1), ids: ['C1=C[N]C=C1'], task_names: [0]>
Test set
<NumpyDataset X.shape: (2,), y.shape: (2, 1), w.shape: (2, 1), ids: ['O=C1C=CC(=O)C2=C1OC=CO2' 'C1=CC2=C(C=C1)C1=CC=CC=C21'], task_names: [0]>


# Split

In [12]:
import deepchem as dc

# Molecules written as SMILES strings; they serve as both the X column and the
# ids of the dataset built below.
mols = [
    'C1=CC2=C(C=C1)C1=CC=CC=C21', 'O=C1C=CC(=O)C2=C1OC=CO2', 'C1=C[N]C=C1',
    'C1=CC=CC=C[C+]1', 'C1=[C]NC=C1', 'N[C@@H](C)C(=O)O', 'N[C@H](C)C(=O)O',
    'CC', 'O=C=O', 'C#N', 'CCN(CC)CC', 'CC(=O)O', 'C1CCCCC1', 'c1ccccc1'
]
print("Original set of molecules")
print(mols)

# TODO: switch to the simpler splitter API once it has been merged.
dataset = dc.data.NumpyDataset(X=mols, ids=mols)
splitter = dc.splits.RandomSplitter()
# Fix the seed so the random split produces the same subsets on every run;
# without it each execution shuffles the molecules differently.
train, valid, test = splitter.train_valid_test_split(dataset, seed=42)

# The return values are dc.data.Dataset objects, so read the molecule strings
# back from their ids.
for label, subset in (("Training set", train), ("Valid set", valid),
                      ("Test set", test)):
    print(label)
    print(subset.ids)

Original set of molecules
['C1=CC2=C(C=C1)C1=CC=CC=C21', 'O=C1C=CC(=O)C2=C1OC=CO2', 'C1=C[N]C=C1', 'C1=CC=CC=C[C+]1', 'C1=[C]NC=C1', 'N[C@@H](C)C(=O)O', 'N[C@H](C)C(=O)O', 'CC', 'O=C=O', 'C#N', 'CCN(CC)CC', 'CC(=O)O', 'C1CCCCC1', 'c1ccccc1']
Training set
['C1=[C]NC=C1' 'CC' 'C#N' 'CC(=O)O' 'C1=CC=CC=C[C+]1' 'N[C@@H](C)C(=O)O'
 'N[C@H](C)C(=O)O' 'C1=CC2=C(C=C1)C1=CC=CC=C21' 'C1CCCCC1' 'CCN(CC)CC'
 'C1=C[N]C=C1']
Valid set
['O=C1C=CC(=O)C2=C1OC=CO2']
Test set
['c1ccccc1' 'O=C=O']


In [None]:
import deepchem as dc

# Molecules written as SMILES strings; they serve as both the X column and the
# ids of the dataset built below.
mols = [
    'C1=CC2=C(C=C1)C1=CC=CC=C21',
    'O=C1C=CC(=O)C2=C1OC=CO2',
    'C1=C[N]C=C1',
    'C1=CC=CC=C[C+]1',
    'C1=[C]NC=C1',
    'N[C@@H](C)C(=O)O',
    'N[C@H](C)C(=O)O',
    'CC',
    'O=C=O',
    'C#N',
    'CCN(CC)CC',
    'CC(=O)O',
    'C1CCCCC1',
    'c1ccccc1',
]
print("Original set of molecules")
print(mols)

# TODO: switch to the simpler splitter API once it has been merged.
splitter = dc.splits.ScaffoldSplitter()
dataset = dc.data.NumpyDataset(X=mols, ids=mols)
train, valid, test = splitter.train_valid_test_split(dataset)

# The splits come back as dc.data.Dataset objects, so read the molecule strings
# back from their ids.
for label, subset in (("Training set", train), ("Valid set", valid),
                      ("Test set", test)):
    print(label)
    print(subset.ids)