In [1]:
import deepchem as dc
import pandas as pd 
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
from rdkit import Chem
from tqdm import tqdm

from deepchem.feat.mol_graphs import ConvMol

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.autograd import Variable

In [3]:
esol = pd.read_csv('ESOL/delaney-processed.csv')

In [4]:
def get_fingerprint(data, name):
    smiles = data[name]
    molecules = [Chem.MolFromSmiles(smile) for smile in smiles]
    
    feat = dc.feat.CircularFingerprint(size=100)
    arr = feat.featurize(mols = molecules)
    return arr

In [5]:
X_esol = get_fingerprint(esol, 'smiles')

In [6]:
def conv(data, name):
    smiles = data[name]
    mols = [Chem.MolFromSmiles(s) for s in smiles]
    
    featurizer = dc.feat.ConvMolFeaturizer()
    x = featurizer.featurize(mols)
    return x

In [7]:
c_esol = conv(esol, 'smiles')

In [8]:
X = []
X_pad = []
for i in range(c_esol.shape[0]):
    c = c_esol[i].get_atom_features()
    X.append(c)
    pad = 55 - c.shape[0]
    c = np.pad(c,((0,pad),(0,0)), 'constant')    
    X_pad.append(c)
    
A = []
for i in range(c_esol.shape[0]):
    A.append(c_esol[i].get_adjacency_list())
X_pad = np.asarray(X_pad)

In [9]:
y_esol = esol['measured log solubility in mols per litre']
X_esol = Variable(torch.from_numpy(X_esol))
X_pad = Variable(torch.from_numpy(X_pad))
y_esol = torch.FloatTensor(y_esol)

In [53]:
def Getting_Spatial(X_f, A):
    X_s = np.zeros([X_f.shape[0], X_f.shape[0], X_f.shape[1]])
    n = len(A)
    for i in range(n):
        for j in range(n):
            if j in A[i]:
                for k in range(X_f.shape[1]):
                    X_s[i][j][k] = X_f[j][k]
    X_s = Variable(torch.from_numpy(X_s))
    
    return X_s

In [54]:
class GModel(nn.Module):
    def __init__(self):
        super(GModel, self).__init__()
        self.X_f = nn.Linear(75, 40) #Dimensionality reduction
        self.conv = nn.Conv1d(in_channels = 55, out_channels = 19, kernel_size = 19, stride = 1, padding = 9)
        self.conv1 = nn.Conv1d(in_channels = 20, out_channels = 5 , kernel_size = 11)
        self.conv2 = nn.Conv1d(in_channels = 5, out_channels = 2, kernel_size = 7)
        self.conv3 = nn.Conv1d(in_channels = 2, out_channels = 1, kernel_size = 5)
        self.opt = nn.Linear(100, 40)
        self.relu = nn.ReLU()
        self.pred = nn.Linear(100, 1)
        
    #Forward Pass    
    def forward(self, x_pad, A, x2):
        x_pad = x_pad.float()
        x2 = x2.float()
        
        x_f = self.X_f(x_pad)
        S = Getting_Spatial(x_f, A)

        graph = torch.zeros(60)
        
        for i in range(x_pad.shape[0]):
            s = S[i]
            s = s.unsqueeze(dim = 0)
            s = s.float()
            
            x_k = self.conv(s)
            x_k = x_k.view(19, 40)
        
            f = x_f[i].unsqueeze(dim = 0)
            
            out1 = torch.cat([f, x_k], dim = 0)
            
            out1 = out1.unsqueeze(dim = 0)
            
            out2 = self.conv1(out1)
            out2 = self.conv2(out2)
            out2 = self.conv3(out2)
        
            out2 = out2.view(20)

            out3 = torch.cat([x_f[i], out2], dim = -1)
        
            for j in range(60):
                graph[j] = graph[j] + out3[j]

        opti = self.opt(x2)
        opti = self.relu(opti)
        mol = torch.cat([graph, opti], dim = -1)
        pred = self.pred(mol)
        return pred

In [55]:
def train(X_esol, X_pad, A, y_esol, opt, cost):
    #Going into training mode for model
    net.train()
    loss_list = []
    g = 0
    y = 0
    m = torch.mean(y_esol)
    
    s = np.arange(0,X_esol.shape[0],1)
    np.random.shuffle(s)
    
    for i in range(X_esol.shape[0]):
        j = s[i]
        
        pred = net(X_pad[j], A[j], X_esol[j])
        labels = y_esol[j]
        labels = labels.view(1)
        loss = cost(pred, labels)
    
        loss_list.append(loss.item())
        g = g + torch.sum((labels - pred) **2)
        y = y + torch.sum(((labels - m) ** 2))
        #Backpropogation and optimization
        opt.zero_grad()
        loss.backward()
        opt.step()
    
    r_score = 1 - (g/y)
    epoch_loss = np.sum(np.asarray(loss_list))
    print("Loss is ", epoch_loss/X_esol.shape[0], ' r2_score is: ', r_score)   

In [60]:
def test(X_esol, X_pad, A, y_esol):
    #Going into network evaluation mode
    net.eval()
    
    g = 0
    y = 0
    m = torch.mean(y_esol)

    s = np.arange(0,X_esol.shape[0],1)
    np.random.shuffle(s)
    
    #No gradients will be calculated for testing mode
    with torch.no_grad():
        for i in range(X_esol.shape[0]):
            j = s[i]
            pred = net(X_pad[j], A[j], X_esol[j])
            labels = y_esol[j]
            labels = labels.view(1)
            g = g + torch.sum((labels - pred) **2)
            y = y + torch.sum(((labels - m) ** 2))
    
    r_score = 1 - (g/y)
    
    return r_score    

In [57]:
net = GModel()
#Optimizer
opt = torch.optim.Adam(net.parameters(), lr= 0.002)
#Cost
cost = nn.MSELoss() 

In [58]:
split = 903

In [59]:
for i in tqdm(range(20)):
    train(X_esol[:split,:], X_pad[:split,:,:], A[:split], y_esol[:split], opt, cost)


  0%|          | 0/20 [00:00<?, ?it/s][A

Loss is  1.7681033949425031  r2_score is:  tensor(0.6038, grad_fn=<RsubBackward1>)



  5%|▌         | 1/20 [04:02<1:16:43, 242.31s/it][A

Loss is  1.0066130166314826  r2_score is:  tensor(0.7744, grad_fn=<RsubBackward1>)



 10%|█         | 2/20 [08:01<1:12:26, 241.50s/it][A

Loss is  0.8060504324701584  r2_score is:  tensor(0.8194, grad_fn=<RsubBackward1>)



 15%|█▌        | 3/20 [12:07<1:08:47, 242.79s/it][A

Loss is  0.6072262395155632  r2_score is:  tensor(0.8639, grad_fn=<RsubBackward1>)



 20%|██        | 4/20 [16:15<1:05:06, 244.17s/it][A

Loss is  0.5235124073806163  r2_score is:  tensor(0.8827, grad_fn=<RsubBackward1>)



 25%|██▌       | 5/20 [20:47<1:03:08, 252.54s/it][A

Loss is  0.4190561263145065  r2_score is:  tensor(0.9061, grad_fn=<RsubBackward1>)



 30%|███       | 6/20 [25:09<59:38, 255.61s/it]  [A

Loss is  0.406394633923094  r2_score is:  tensor(0.9089, grad_fn=<RsubBackward1>)



 35%|███▌      | 7/20 [29:26<55:26, 255.92s/it][A

Loss is  0.3484873171931603  r2_score is:  tensor(0.9219, grad_fn=<RsubBackward1>)



 40%|████      | 8/20 [33:43<51:12, 256.07s/it][A

Loss is  0.27004145094352644  r2_score is:  tensor(0.9395, grad_fn=<RsubBackward1>)



 45%|████▌     | 9/20 [38:20<48:05, 262.36s/it][A

Loss is  0.3701926477498495  r2_score is:  tensor(0.9170, grad_fn=<RsubBackward1>)



 50%|█████     | 10/20 [42:50<44:06, 264.67s/it][A

Loss is  0.25245305470212115  r2_score is:  tensor(0.9434, grad_fn=<RsubBackward1>)



 55%|█████▌    | 11/20 [47:21<39:59, 266.61s/it][A

Loss is  0.2077914470155001  r2_score is:  tensor(0.9534, grad_fn=<RsubBackward1>)



 60%|██████    | 12/20 [52:35<37:28, 281.03s/it][A

Loss is  0.19738666250328604  r2_score is:  tensor(0.9558, grad_fn=<RsubBackward1>)



 65%|██████▌   | 13/20 [57:29<33:13, 284.73s/it][A

Loss is  0.18725094104031273  r2_score is:  tensor(0.9580, grad_fn=<RsubBackward1>)



 70%|███████   | 14/20 [1:01:59<28:02, 280.37s/it][A

Loss is  0.21663076508812792  r2_score is:  tensor(0.9515, grad_fn=<RsubBackward1>)



 75%|███████▌  | 15/20 [1:06:30<23:08, 277.70s/it][A

Loss is  0.14666402242819135  r2_score is:  tensor(0.9671, grad_fn=<RsubBackward1>)



 80%|████████  | 16/20 [1:11:11<18:33, 278.42s/it][A

Loss is  0.18011332052814144  r2_score is:  tensor(0.9596, grad_fn=<RsubBackward1>)



 85%|████████▌ | 17/20 [1:15:45<13:51, 277.31s/it][A

Loss is  0.18015858433510148  r2_score is:  tensor(0.9596, grad_fn=<RsubBackward1>)



 90%|█████████ | 18/20 [1:20:19<09:12, 276.21s/it][A

Loss is  0.14580095394378978  r2_score is:  tensor(0.9673, grad_fn=<RsubBackward1>)



 95%|█████████▌| 19/20 [1:24:51<04:34, 274.84s/it][A

Loss is  0.17538416174385665  r2_score is:  tensor(0.9607, grad_fn=<RsubBackward1>)



100%|██████████| 20/20 [1:29:24<00:00, 268.21s/it][A


In [61]:
acc_test = test(X_esol[split:1128,:], X_pad[split:1128,:,:], A[split:1128], y_esol[split:1128])
print('Testing R2 score is: ', acc_test)

Testing R2 score is:  tensor(0.7798)


In [62]:
acc_train = test(X_esol[:split,:], X_pad[:split,:,:], A[:split], y_esol[:split])
print('Training R2 score is:', acc_train)

Training R2 score is: tensor(0.9683)
