In [1]:
import deepchem as dc
import pandas as pd 
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
from rdkit import Chem
from tqdm import tqdm

from deepchem.feat.mol_graphs import ConvMol

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.autograd import Variable

In [3]:
lipo = pd.read_csv('lipophilicity/Lipophilicity.csv')

In [4]:
def get_fingerprint(data, name):
    smiles = data[name]
    molecules = [Chem.MolFromSmiles(smile) for smile in smiles]
    feat = dc.feat.CircularFingerprint(size=100)
    arr = feat.featurize(mols = molecules)
    return arr

In [5]:
X_lipo = get_fingerprint(lipo, 'smiles')

In [6]:
def conv(data, name):
    smiles = data[name]
    mols = [Chem.MolFromSmiles(s) for s in smiles]
    featurizer = dc.feat.ConvMolFeaturizer()
    x = featurizer.featurize(mols)
    return x

In [7]:
c_lipo = conv(lipo, 'smiles')

In [8]:
X = []
X_pad = []
for i in range(c_lipo.shape[0]):
    c = c_lipo[i].get_atom_features()
    X.append(c)
    pad = 115 - c.shape[0]
    c = np.pad(c,((0,pad),(0,0)), 'constant')    
    X_pad.append(c)
    
A = []
for i in range(c_lipo.shape[0]):
    A.append(c_lipo[i].get_adjacency_list())
X_pad = np.asarray(X_pad)

In [9]:
y_lipo = lipo['exp']
X_lipo = Variable(torch.from_numpy(X_lipo))
X_pad = Variable(torch.from_numpy(X_pad))
y_lipo = torch.FloatTensor(y_lipo)

In [10]:
def Getting_Spatial(X_f, A):
    X_s = np.zeros([X_f.shape[0], X_f.shape[0], X_f.shape[1]])
    n = len(A)
    for i in range(n):
        for j in range(n):
            if j in A[i]:
                for k in range(X_f.shape[1]):
                    X_s[i][j][k] = X_f[j][k]
    X_s = Variable(torch.from_numpy(X_s))
    
    return X_s

In [11]:
class GModel(nn.Module):
    def __init__(self):
        super(GModel, self).__init__()
        self.X_f = nn.Linear(75, 40) #Dimensionality reduction
        self.conv = nn.Conv1d(in_channels = 115, out_channels = 19, kernel_size = 19, stride = 1, padding = 9)
        self.conv1 = nn.Conv1d(in_channels = 20, out_channels = 5 , kernel_size = 11)
        self.conv2 = nn.Conv1d(in_channels = 5, out_channels = 2, kernel_size = 7)
        self.conv3 = nn.Conv1d(in_channels = 2, out_channels = 1, kernel_size = 5)
        self.opt = nn.Linear(100, 40)
        self.relu = nn.ReLU()
        self.pred = nn.Linear(100, 1)
        
    #Forward Pass    
    def forward(self, x_pad, A, x2):
        x_pad = x_pad.float()
        x2 = x2.float()
        
        x_f = self.X_f(x_pad)
        S = Getting_Spatial(x_f, A)

        graph = torch.zeros(60)
        
        for i in range(x_pad.shape[0]):
            s = S[i]
            s = s.unsqueeze(dim = 0)
            s = s.float()
            
            x_k = self.conv(s)
            x_k = x_k.view(19, 40)
        
            f = x_f[i].unsqueeze(dim = 0)
            
            out1 = torch.cat([f, x_k], dim = 0)
            
            out1 = out1.unsqueeze(dim = 0)
            
            out2 = self.conv1(out1)
            out2 = self.conv2(out2)
            out2 = self.conv3(out2)
        
            out2 = out2.view(20)

            out3 = torch.cat([x_f[i], out2], dim = -1)
        
            for j in range(60):
                graph[j] = graph[j] + out3[j]

        opti = self.opt(x2)
        opti = self.relu(opti)
        mol = torch.cat([graph, opti], dim = -1)
        pred = self.pred(mol)
        return pred

In [12]:
def train(X_lipo, X_pad, A, y_lipo, opt, cost):
    #Going into training mode for model
    net.train()
    loss_list = []
    g = 0
    y = 0
    m = torch.mean(y_lipo)
    
    s = np.arange(0,X_lipo.shape[0],1)
    np.random.shuffle(s)
    
    for i in range(X_lipo.shape[0]):
        j = s[i]
        
        pred = net(X_pad[j], A[j], X_lipo[j])
        labels = y_lipo[j]
        labels = labels.view(1)
        loss = cost(pred, labels)
    
        loss_list.append(loss.item())
        g = g + torch.sum((labels - pred) **2)
        y = y + torch.sum(((labels - m) ** 2))
        #Backpropogation and optimization
        opt.zero_grad()
        loss.backward()
        opt.step()
    
    r_score = 1 - (g/y)
    epoch_loss = np.sum(np.asarray(loss_list))
    print("Loss is ", epoch_loss/X_lipo.shape[0], ' r2_score is: ', r_score)   

In [18]:
def test(X_lipo, X_pad, A, y_lipo):
    #Going into network evaluation mode
    net.eval()
    
    g = 0
    y = 0
    m = torch.mean(y_lipo)

    s = np.arange(0,X_lipo.shape[0],1)
    np.random.shuffle(s)
    
    #No gradients will be calculated for testing mode
    with torch.no_grad():
        for i in range(X_lipo.shape[0]):
            j = s[i]
            pred = net(X_pad[j], A[j], X_lipo[j])
            labels = y_lipo[j]
            labels = labels.view(1)
            g = g + torch.sum((labels - pred) **2)
            y = y + torch.sum(((labels - m) ** 2))
    
    r_score = 1 - (g/y)
    
    return r_score    

In [14]:
net = GModel()
#Optimizer
opt = torch.optim.Adam(net.parameters(), lr= 0.002)
#Cost
cost = nn.MSELoss() 

In [15]:
split = 3360

In [16]:
for i in tqdm(range(15)):
    train(X_lipo[:split,:], X_pad[:split,:,:], A[:split], y_lipo[:split], opt, cost)

  0%|          | 0/10 [00:00<?, ?it/s]

Loss is  1.373202623032946  r2_score is:  tensor(0.0522, grad_fn=<RsubBackward1>)


 10%|█         | 1/10 [37:36<5:38:29, 2256.64s/it]

Loss is  1.076833652556037  r2_score is:  tensor(0.2568, grad_fn=<RsubBackward1>)


 20%|██        | 2/10 [1:18:51<5:09:37, 2322.14s/it]

Loss is  0.9598266937145994  r2_score is:  tensor(0.3375, grad_fn=<RsubBackward1>)


 30%|███       | 3/10 [1:59:15<4:34:28, 2352.69s/it]

Loss is  0.8356775903405125  r2_score is:  tensor(0.4232, grad_fn=<RsubBackward1>)


 40%|████      | 4/10 [2:42:15<4:02:05, 2420.95s/it]

Loss is  0.7572962342960703  r2_score is:  tensor(0.4773, grad_fn=<RsubBackward1>)


 50%|█████     | 5/10 [3:25:29<3:26:04, 2472.91s/it]

Loss is  0.6935727440479805  r2_score is:  tensor(0.5213, grad_fn=<RsubBackward1>)


 60%|██████    | 6/10 [4:11:27<2:50:32, 2558.22s/it]

Loss is  0.6370821292309897  r2_score is:  tensor(0.5603, grad_fn=<RsubBackward1>)


 70%|███████   | 7/10 [4:57:55<2:11:21, 2627.26s/it]

Loss is  0.6041415661320078  r2_score is:  tensor(0.5830, grad_fn=<RsubBackward1>)


 80%|████████  | 8/10 [5:44:13<1:29:05, 2672.51s/it]

Loss is  0.569227343123507  r2_score is:  tensor(0.6071, grad_fn=<RsubBackward1>)


 90%|█████████ | 9/10 [6:31:29<45:21, 2721.56s/it]  

Loss is  0.5513783152496008  r2_score is:  tensor(0.6194, grad_fn=<RsubBackward1>)


100%|██████████| 10/10 [7:19:31<00:00, 2637.14s/it]


In [19]:
acc_test = test(X_lipo[split:4200,:], X_pad[split:4200,:,:], A[split:4200], y_lipo[split:4200])
print("Testing R2 score is: ", acc_test)

Testing R2 score is:  tensor(0.4415)


In [20]:
acc_train = test(X_lipo[:split,:], X_pad[:split,:,:], A[:split], y_lipo[:split])
print("Training R2 score is: ", acc_train)

Training R2 score is:  tensor(0.6881)
