Build chemical descriptors/molecular representation (e.g. ECFP, CoulombMatrix, MolecularGraphs etc) as input into deep learning network

In [1]:
import numpy as np
import pandas as pd
import deepchem as dc
import os

from rdkit.Chem.Fingerprints import FingerprintMols

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import SGD
from keras import regularizers

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Import compound data

In [2]:
data = pd.read_pickle(os.path.join(os.getcwd(), 'data.pkl'))
print (data.shape)
data.head(1)

(28263, 19)


Unnamed: 0,smiles,category,mw_freebase,alogp,hba,hbd,psa,rtb,acd_logp,acd_logd,full_mwt,aromatic_rings,heavy_atoms,qed_weighted,mw_monoisotopic,hba_lipinski,hbd_lipinski,mol,agrochemical
0,Cl.O=C(NCc1ccncc1)[C@@H]2CCCN2C(=O)[C@@H]3CCCN3,toxin,302.38,0.44,4,2,74.33,4,1.04,-0.77,338.84,1,22,0.85,302.174,6,2,<rdkit.Chem.rdchem.Mol object at 0x124f75fb8>,0


Add Fingerprint descriptor using RDKIT

In [3]:
def fingerprint(mol):
    return np.ndarray.flatten(np.array(FingerprintMols.FingerprintMol(mol)))

In [4]:
data['fingerprint'] = data['mol'].apply(fingerprint)

Assign feature (X) and agrochemical classifier (Y)

In [5]:
X = data['fingerprint']
Y = data['agrochemical']

Pad molecular fingerprint with zeros to make them all into same size for conversion into numpy array later

__QUESTION: CAN WE DO PADDING ON MOLECULAR FINGERPRINT? DOES IT CHANGE THE STRUCTURE?"__

In [6]:
def molecular_fingerprint_padding(x):
    result = np.zeros((2048,))
    result[:x.shape[0]] = x
    return result
X = X.apply(molecular_fingerprint_padding)

Stack X into one numpy array

In [7]:
X = np.stack(X)
X.shape

(28263, 2048)

Standard scaling

In [8]:
st = StandardScaler()
X = st.fit_transform(X)

Split into training set and testing set

In [9]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

Build models (*Maybe we can try compare using dense neural network and random forest?*)

1) Simple sequential neural network

In [10]:
def build_simple_model():
    model = Sequential()
    model.add(Dense(64, input_dim=2048, activation='softmax', kernel_regularizer=regularizers.l2(0.)))
    model.add(Dense(1, input_dim=64, activation='linear'))
    
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    return model

model = build_simple_model()

In [11]:
history = model.fit(x_train, y_train, nb_epoch=25, batch_size=32)



Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [12]:
score, acc = model.evaluate(x_test, y_test, batch_size=32)

print ("Test score:", score)
print ("Test accuracy:", acc)

Test score: 0.4848937130113545
Test accuracy: 0.9060675747917961


2) Use random forest model

In [13]:
rfc = RandomForestClassifier(n_estimators=200, max_depth=2, random_state=0, n_jobs=-1)

In [14]:
rfc.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [15]:
rfc.score(x_test, y_test)

0.654342826817619