### SMILES to molecular graphs, build neural fingerprints, and train a simple feed-forward deep neural network (DNN) to predict target. 

In [1]:
# Imports
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

from rdkit import Chem

from torch_geometric.loader import DataLoader
import torch

## Data Preparation

#### Reading the csv file into a pandas dataframe, converting the SMILES to rdkit.Chem.rdchem.Mol objects, and splitting the SMILES and targets into training and test sets

In [4]:
df = pd.read_csv('./data.csv')
mol_list = [Chem.MolFromSmiles(smi) for smi in df['SMILES']]
df['mol_list'] = mol_list[0]
print(type(mol_list[0]))
X_train, X_test, y_train, y_test = train_test_split(df['mol_list'].values, df['target'].values, random_state=101,test_size=0.2,shuffle=False)

<class 'rdkit.Chem.rdchem.Mol'>


In [5]:
# print(X_train[:5])
print(X_train.shape)
# print(X_test[:5])
print(X_test.shape)
# print(y_train[:5])
print(y_train.shape)
# print(y_test[:5])
print(y_test.shape)

(548,)
(137,)
(548,)
(137,)


#### Converting RDKit objects to tensors with atom and bond features, converting these tensors into torch_geometric Data type and subsequently into a torch_geometric DataLoader. Converting the (training and test) labels or target array into torch_geometric DataLoader objects.

#### Using custom functions from ```featurize.py``` for all of these steps

In [6]:
print(type(X_train))

<class 'numpy.ndarray'>


In [7]:
from sklearn.preprocessing import StandardScaler

yscale = StandardScaler()
y_train = yscale.fit_transform(y_train.reshape(-1,1))



In [8]:
y_test = yscale.transform(y_test.reshape(-1,1))

In [9]:
y_test.shape

(137, 1)

In [10]:
y_test = y_test.reshape(-1,)

In [11]:
y_test.shape

(137,)

In [12]:
y_train = y_train.reshape(-1,)

In [13]:
from featurize import prepare_dataloader

bs = 50 # batch size

train_loader, _ = prepare_dataloader(X_train, batch_size=bs)
train_labels_loader = DataLoader(y_train, batch_size=bs)
test_loader, _ = prepare_dataloader(X_test, batch_size=y_test.shape[0])
test_labels_loader = DataLoader(y_test, batch_size=y_test.shape[0])

tensor([[6, 3, 1, 3, 4, 0],
        [6, 0, 3, 0, 3, 0],
        [8, 0, 1, 0, 3, 0],
        [7, 0, 3, 0, 3, 0],
        [6, 2, 2, 2, 4, 0],
        [6, 2, 2, 2, 4, 0],
        [6, 1, 3, 1, 4, 0],
        [7, 1, 2, 1, 3, 0],
        [6, 0, 3, 0, 3, 1],
        [7, 0, 2, 0, 3, 1],
        [6, 1, 2, 1, 3, 1],
        [6, 1, 2, 1, 3, 1],
        [6, 1, 2, 1, 3, 1],
        [6, 0, 3, 0, 3, 1],
        [6, 0, 3, 0, 3, 1],
        [6, 1, 2, 1, 3, 1],
        [7, 0, 2, 0, 3, 1],
        [6, 0, 3, 0, 3, 1],
        [7, 1, 2, 0, 3, 1],
        [6, 1, 2, 1, 3, 1],
        [6, 1, 2, 1, 3, 1],
        [6, 0, 3, 0, 3, 1],
        [7, 0, 2, 0, 3, 1],
        [6, 2, 2, 2, 4, 0]])
tensor([[6, 3, 1, 3, 4, 0],
        [6, 0, 3, 0, 3, 0],
        [8, 0, 1, 0, 3, 0],
        [7, 0, 3, 0, 3, 0],
        [6, 2, 2, 2, 4, 0],
        [6, 2, 2, 2, 4, 0],
        [6, 1, 3, 1, 4, 0],
        [7, 1, 2, 1, 3, 0],
        [6, 0, 3, 0, 3, 1],
        [7, 0, 2, 0, 3, 1],
        [6, 1, 2, 1, 3, 1],
        [6, 1, 2, 1

In [14]:
print(type(train_loader))
print(type(train_labels_loader))

<class 'torch_geometric.loader.dataloader.DataLoader'>
<class 'torch_geometric.loader.dataloader.DataLoader'>


### Build Model

#### Initializing custom DNN (from ```neural_network.py```), its parameters, the optimizer, and the loss function. The DNN model will call the ```NeuralFingerprint``` class from ```neural_fingerprints.py``` and convert the atom and bond information into neural fingerprints. 

In [15]:
# check neural_network.py for the NeuralNetwork class
from neural_network import NeuralNetwork, initialize_weights

dnn = NeuralNetwork(atom_features=6, fp_size=1024, hidden_size=256)
dnn.apply(initialize_weights)
optimizer = torch.optim.SGD(dnn.parameters(), lr=0.01, weight_decay=0.00001, momentum=0.9)
# optimizer = torch.optim.Adam(dnn.parameters(), lr=0.01, weight_decay=0.001)
lossfn = torch.nn.MSELoss()

In [16]:
dnn

NeuralNetwork(
  (neural_fp): NeuralFP(
    (loop1): NeuralLoop()
    (loop2): NeuralLoop()
    (loops): ModuleList(
      (0): NeuralLoop()
      (1): NeuralLoop()
    )
  )
  (lin1): Linear(in_features=1024, out_features=256, bias=True)
  (lin2): Linear(in_features=256, out_features=128, bias=True)
  (lin3): Linear(in_features=128, out_features=64, bias=True)
  (lin4): Linear(in_features=64, out_features=32, bias=True)
  (lin5): Linear(in_features=32, out_features=1, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

#### Training the DNN and storing the loss

In [17]:
torch.manual_seed(11)
total_epochs = 10
losses = []
for epoch in range(total_epochs):
    #forward feed
    avg_loss = []
    for idx, (batch, labels) in enumerate(zip(train_loader, train_labels_loader)):
        optimizer.zero_grad()
        y_pred = dnn.forward(batch)
        loss = lossfn(y_pred, labels.to(torch.float).reshape(-1,1))
        loss.backward()
        optimizer.step()
        avg_loss.append(loss.item())
    losses.append(np.mean(avg_loss))
    if epoch % 20 == 0:
        print(f'Epoch:{epoch}, Train loss: {np.mean(avg_loss)}')

Epoch:0, Train loss: 1.0210163037885318


In [19]:
for tl in test_loader:
    y_hat = dnn(tl)
    y_hat = y_hat.detach().numpy()

In [20]:
y_hat = yscale.inverse_transform(y_hat)

In [21]:
for trl in train_loader:
    y_train_hat = dnn(trl)
    y_train_hat = y_train_hat.detach().numpy()

In [22]:
r2 = r2_score(y_test.reshape(-1,1), y_hat)
mae = mean_absolute_error(y_test.reshape(-1,1), y_hat)
mse = np.sqrt(mean_squared_error(y_test.reshape(-1,1), y_hat))

print('Results')
print('-------')
print('MSE = ', mse)
print('MAE = ', mae)

Results
-------
MSE =  9.048681777823933
MAE =  9.040992510658514


### Saving the Model

In [None]:
torch.save(dnn.state_dict(),'./Results/NeuralFingerprints/jak1/jak1_trained.pt')
torch.save(optimizer.state_dict(),'./Results/NeuralFingerprints/jak1/jak1_optimizer.pt')