In [1]:
!pip install rdkit
!pip install networkx
!pip install torch-geometric

Collecting rdkit
  Downloading rdkit-2023.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.4/34.4 MB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2023.9.5
Collecting torch-geometric
  Downloading torch_geometric-2.5.3-py3-none-any.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch-geometric
Successfully installed torch-geometric-2.5.3


In [2]:
import gdown
import pandas as pd
import torch
from rdkit import Chem
from torch_geometric.data import Data, Batch
import networkx as nx
import matplotlib.pyplot as plt
from torch_geometric.utils import to_networkx

In [49]:
from google.colab import drive

drive.mount('/content/drive')

# Load the CSV dataset
data = pd.read_csv('/content/drive/MyDrive/CS 566/Notebooks/DSI.csv')
# data = pd.read_csv('/content/ashmika.csv')
print(data.shape)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
(9982, 4)


In [4]:
data.head()

Unnamed: 0,Drug_ID,Drug,Y,Drug_canon
0,Benzo[cd]indol-2(1H)-one,O=C1Nc2cccc3cccc1c23,-3.254767,O=C1Nc2cccc3cccc1c32
1,4-chlorobenzaldehyde,O=Cc1ccc(Cl)cc1,-2.177078,O=Cc1ccc(Cl)cc1
2,4-({4-[bis(oxiran-2-ylmethyl)amino]phenyl}meth...,c1cc(N(CC2CO2)CC2CO2)ccc1Cc1ccc(N(CC2CO2)CC2CO...,-4.662065,C(c1ccc(cc1)N(CC1CO1)CC1CO1)c1ccc(cc1)N(CC1CO1...
3,vinyltoluene,C=Cc1cccc(C)c1,-3.12315,Cc1cc(C=C)ccc1
4,3-(3-ethylcyclopentyl)propanoic acid,CCC1CCC(CCC(=O)O)C1,-3.286116,CCC1CC(CC1)CCC(O)=O


In [21]:
def molecule_to_graph(molecule):
    num_atoms = molecule.GetNumAtoms()
    x = torch.tensor([atom_feature_vector(atom) for atom in molecule.GetAtoms()], dtype=torch.float)
    edge_index = []
    edge_attr = []
    for bond in molecule.GetBonds():
        edge_index.extend([[bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()], [bond.GetEndAtomIdx(), bond.GetBeginAtomIdx()]])
        edge_attr.extend([bond_feature_vector(bond), bond_feature_vector(bond)])
    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
    edge_attr = torch.tensor(edge_attr, dtype=torch.float)
    return Data(x=x, edge_index=edge_index, edge_attr=edge_attr)

def atom_feature_vector(atom):
    return [atom.GetAtomicNum(), atom.GetDegree(), atom.GetHybridization()]

def bond_feature_vector(bond):
    return [bond.GetBondTypeAsDouble(), bond.IsInRing()]

def visualize(graph):
    nx_graph = to_networkx(graph, to_undirected=True)

    fig = plt.figure(figsize=(10, 10))
    pos = nx.spring_layout(nx_graph)

    nx.draw_networkx(nx_graph, pos, with_labels=True, node_size=500, font_size=12, font_weight='bold')

    if 'edge_attr' in next(iter(nx_graph.edges(data=True)))[-1]:
        edge_labels = {(u, v): d['edge_attr'] for u, v, d in nx_graph.edges(data=True)}
        nx.draw_networkx_edge_labels(nx_graph, pos, edge_labels=edge_labels, font_size=10)

    plt.axis('off')
    plt.show()

In [50]:
import numpy as np
graphs = []
indices = []

for _, row in data.iterrows():
    try:
        idx_dict = {}
        if isinstance(row['Drug_canon'], str):
            molecule = Chem.MolFromSmiles(row['Drug_canon'])
            graph = molecule_to_graph(molecule)

            required_feature_size = graph.x.shape[1]  # The total number of features per node

            # Create a feature vector for the "DDI score" node
            # First, initialize a vector of zeros with the required_feature_size
            ddi_score_features = torch.zeros(1, required_feature_size)
            ddi_score_features[0, 0] = row['Y']
            # ddi_score_features[0, 1] = row['Target_encoded']

            ddi_score_node_index = graph.x.shape[0]

            combined_x = torch.cat([graph.x, ddi_score_features], dim=0)
            combined_x_idx = [[], []]
            combined_x_idx[0] = np.arange(0,graph.x.shape[0]) # graph 1 node indices
            combined_x_idx[1] = np.arange(graph.x.shape[0],graph.x.shape[0]+1) # y index

            idx_dict["node"] = combined_x_idx

            additional_edge_index = torch.cat([
                torch.tensor([[i, ddi_score_node_index] for i in range(graph.x.shape[0])])], dim=0).t().contiguous()

            additional_edge_attr = torch.ones((additional_edge_index.shape[1], graph.edge_attr.shape[1]), dtype=torch.float)

            combined_edge_index = torch.cat([graph.edge_index, additional_edge_index], dim=1)

            combined_edge_index_idx = [[], []]
            combined_edge_index_idx[0] = np.arange(0,graph.edge_index.shape[1]) # graph 1 edge index' indices
            combined_edge_index_idx[1] = np.arange(graph.edge_index.shape[1],
                                graph.edge_index.shape[1]+additional_edge_index.shape[1]) # y edge index' index


            idx_dict["edge_index"] = combined_edge_index_idx

            combined_edge_attr = torch.cat([graph.edge_attr, additional_edge_attr], dim=0)
            combined_edge_attr_idx = [[], []]
            combined_edge_attr_idx[0] = np.arange(0,graph.edge_attr.shape[0]) # graph 1 edge attr' indices
            combined_edge_attr_idx[1] = np.arange(graph.edge_attr.shape[0],
                                graph.edge_attr.shape[0]+additional_edge_attr.shape[0]) # y edge attr' index

            idx_dict["edge"] = combined_edge_attr_idx
            combined_graph = Data(x=combined_x, edge_index=combined_edge_index, edge_attr=combined_edge_attr, y=ddi_score_features)
            graphs.append(combined_graph)
            indices.append(idx_dict)

    except:
          continue

# print(len(graphs))
# print(graphs)
# visualize(graphs[0])
# print(indices)

[07:20:20] Can't kekulize mol.  Unkekulized atoms: 3 5


In [23]:
import warnings
warnings.filterwarnings('ignore')

In [53]:
import torch
from torch_geometric.nn import GCNConv, NNConv, BatchNorm
from torch_geometric.data import Data

class DSIGNN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, edge_channels):
        super(DSIGNN, self).__init__()
        self.conv1 = NNConv(in_channels, hidden_channels, nn=torch.nn.Linear(edge_channels, in_channels * hidden_channels))
        self.bn1 = BatchNorm(hidden_channels)

        self.conv1b = NNConv(hidden_channels, hidden_channels, nn=torch.nn.Linear(edge_channels, hidden_channels * hidden_channels))
        self.bn1b = BatchNorm(hidden_channels)

        self.conv2 = NNConv(hidden_channels, hidden_channels, nn=torch.nn.Linear(edge_channels, hidden_channels * hidden_channels))
        self.bn2 = BatchNorm(hidden_channels)

        self.fc1 = torch.nn.Linear(hidden_channels, hidden_channels)
        self.fc2 = torch.nn.Linear(hidden_channels, out_channels)
        self.dropout = torch.nn.Dropout(p=0.5)

    def l2_regularization(self):
        l2_reg = 0
        for param in self.parameters():
            l2_reg += torch.sum(param ** 2)
        return l2_reg

    def forward(self, graph):
        v=graph.x
        edge_index = graph.edge_index
        e = graph.edge_attr
        indices = graph.indices

        v1 = v[torch.tensor(indices["node"][0][0])]

        e1 = e[torch.tensor(indices["edge"][0][0])]
        e2 = e[torch.tensor(indices["edge"][0][1])]

        eidx1 = edge_index[:, torch.tensor(indices["edge_index"][0][0])]
        eidx2 = edge_index[:, torch.tensor(indices["edge_index"][0][1])]

        graph1 = self.conv1(v1, eidx1, e1)
        graph1 = self.bn1(graph1)
        graph1 = torch.nn.functional.elu(graph1)

        graph1 = self.conv1b(graph1, eidx1, e1)
        graph1 = self.bn1(graph1)
        graph1 = torch.nn.functional.elu(graph1)

        combined_nodes = torch.cat((graph1, torch.zeros(1, 40)), dim=0)
        v2 = self.conv2(combined_nodes, eidx2, e2)
        v2 = self.bn2(v2)
        v2 = torch.nn.functional.elu(v2)
        v2 = self.dropout(v2[-1])

        v2 = self.fc1(v2)
        v2 = torch.nn.functional.elu(v2)

        v2 = self.fc2(v2)
        v2 = torch.nn.functional.elu(v2)

        return v2

In [55]:
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
import torch
import numpy as np
import torch.nn.functional as F
from sklearn.model_selection import KFold

# Assuming 'graphs' and 'indices' are provided somewhere in your code
data = []
for graph, index in zip(graphs, indices):
    graph.indices = index
    data.append(graph)

# Define the number of folds for k-fold cross-validation
num_folds = 5

# Create a KFold object
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=24)

# Define model, optimizer, and loss function
model = DSIGNN(in_channels=3, hidden_channels=40, out_channels=1, edge_channels=2)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
l2_weight = 0.01
loss_fn = torch.nn.MSELoss()

def r2_score(y_true, y_pred):
    ss_res = ((y_true - y_pred) ** 2).sum()
    ss_tot = ((y_true - y_true.mean()) ** 2).sum()
    r2 = 1 - ss_res / ss_tot
    return r2

num_epochs = 5
patience = 10  # Number of epochs to wait for improvement

# Perform k-fold cross-validation
for fold, (train_idx, val_idx) in enumerate(kfold.split(data)):
    print(f"Fold {fold+1}/{num_folds}")

    # Split the data into train and validation sets for the current fold
    train_dataset = [data[i] for i in train_idx]
    val_dataset = [data[i] for i in val_idx]

    # Create DataLoaders for train and validation sets
    train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False)

    best_val_loss = float('inf')
    counter = 0

    for epoch in range(num_epochs):
      model.train()
      total_train_loss = 0
      train_preds, train_targets = [], []
      for graph in train_loader:
          output = model(graph)
          loss = loss_fn(output, graph.y.float()) + l2_weight*model.l2_regularization()
          optimizer.zero_grad()
          loss.backward()
          with torch.no_grad():
                optimizer.step()
          total_train_loss += loss.item()
          train_preds.append(output.detach())
          train_targets.append(graph.y.detach())

      train_preds = torch.cat(train_preds).view(-1, 1)
      train_targets = torch.cat(train_targets)
      train_r2 = r2_score(train_targets, train_preds)
      train_loss = total_train_loss / len(train_loader.dataset)

      model.eval()
      total_val_loss = 0
      val_preds, val_targets = [], []
      with torch.no_grad():
        for graph in val_loader:
            output = model(graph)
            loss = loss_fn(output, graph.y.float())
            total_val_loss += loss.item()
            val_preds.append(output.detach())
            val_targets.append(graph.y.detach())

      val_preds = torch.cat(val_preds).view(-1, 1)
      val_targets = torch.cat(val_targets)
      val_r2 = r2_score(val_targets, val_preds)
      val_loss = total_val_loss / len(val_loader.dataset)

      print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Train R2: {train_r2:.4f}, Val Loss: {val_loss:.4f}, Val R2: {val_r2:.4f}')

      # Early stopping
      if val_loss < best_val_loss:
          best_val_loss = val_loss
          counter = 0
      else:
          counter += 1
          if counter >= patience:
              print(f"Early stopping at epoch {epoch+1}")
              break

    print()

Fold 1/5
Epoch 1/5, Train Loss: 20.8806, Train R2: -0.0665, Val Loss: 177.2858, Val R2: -48.1028
Epoch 2/5, Train Loss: 17.2997, Train R2: 0.0065, Val Loss: 16.5511, Val R2: -3.5842
Epoch 3/5, Train Loss: 14.4948, Train R2: 0.0103, Val Loss: 383.4103, Val R2: -105.1930
Epoch 4/5, Train Loss: 12.5342, Train R2: 0.0123, Val Loss: 310.2680, Val R2: -84.9348
Epoch 5/5, Train Loss: 11.1374, Train R2: 0.0149, Val Loss: 108.3367, Val R2: -29.0060

Fold 2/5
Epoch 1/5, Train Loss: 10.1725, Train R2: 0.0172, Val Loss: 158.7392, Val R2: -43.1548
Epoch 2/5, Train Loss: 9.4888, Train R2: 0.0191, Val Loss: 43.2473, Val R2: -11.0296
Epoch 3/5, Train Loss: 8.9979, Train R2: 0.0199, Val Loss: 10.0725, Val R2: -1.8018
Epoch 4/5, Train Loss: 8.5870, Train R2: 0.0225, Val Loss: 5.5179, Val R2: -0.5349
Epoch 5/5, Train Loss: 8.2395, Train R2: 0.0230, Val Loss: 4.0259, Val R2: -0.1198

Fold 3/5
Epoch 1/5, Train Loss: 7.9110, Train R2: 0.0251, Val Loss: 3.7281, Val R2: -0.0235
Epoch 2/5, Train Loss: 7.6376, 

In [56]:
import pickle
with open('dsi_model.pkl', 'wb') as f:
    pickle.dump(model, f)

In [37]:
import pickle
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
import torch
import numpy as np
import torch.nn.functional as F
from sklearn.model_selection import KFold

# Load the model
with open('/content/drive/MyDrive/CS 566/Pickle Files/dsi_model.pkl', 'rb') as f:
    model = pickle.load(f)

# Put the model in evaluation mode
model.eval()

DSIGNN(
  (conv1): NNConv(3, 40, aggr=add, nn=Linear(in_features=2, out_features=120, bias=True))
  (bn1): BatchNorm(40)
  (conv1b): NNConv(40, 40, aggr=add, nn=Linear(in_features=2, out_features=1600, bias=True))
  (bn1b): BatchNorm(40)
  (conv2): NNConv(40, 40, aggr=add, nn=Linear(in_features=2, out_features=1600, bias=True))
  (bn2): BatchNorm(40)
  (fc1): Linear(in_features=40, out_features=40, bias=True)
  (fc2): Linear(in_features=40, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [62]:
import numpy as np
graphs = []
mol = 'CN1C(=O)OC(C)(C)C1=O'

try:
    idx_dict = {}
    if isinstance(mol, str):
        molecule = Chem.MolFromSmiles(mol)
        graph = molecule_to_graph(molecule)

        required_feature_size = graph.x.shape[1]  # The total number of features per node

        # Initialize a vector of zeros with the required_feature_size
        # Since there is no 'Y' column, we use only zeros
        ddi_score_features = torch.zeros(1, required_feature_size)

        ddi_score_node_index = graph.x.shape[0]

        combined_x = torch.cat([graph.x, ddi_score_features], dim=0)
        combined_x_idx = [[], []]
        combined_x_idx[0] = np.arange(0, graph.x.shape[0])  # graph 1 node indices
        combined_x_idx[1] = np.arange(graph.x.shape[0], graph.x.shape[0] + 1)  # index for the additional zeros

        idx_dict["node"] = combined_x_idx

        # Creating additional edges connecting the new node to all existing nodes
        additional_edge_index = torch.cat([
            torch.tensor([[i, ddi_score_node_index] for i in range(graph.x.shape[0])])], dim=0).t().contiguous()

        # Assuming all additional edges have the same attributes, such as a weight of 1
        additional_edge_attr = torch.ones((additional_edge_index.shape[1], graph.edge_attr.shape[1]), dtype=torch.float)

        combined_edge_index = torch.cat([graph.edge_index, additional_edge_index], dim=1)
        combined_edge_index_idx = [[], []]
        combined_edge_index_idx[0] = np.arange(0, graph.edge_index.shape[1])  # Original edges' indices
        combined_edge_index_idx[1] = np.arange(graph.edge_index.shape[1], graph.edge_index.shape[1] + additional_edge_index.shape[1])  # New edges' indices

        idx_dict["edge_index"] = combined_edge_index_idx

        combined_edge_attr = torch.cat([graph.edge_attr, additional_edge_attr], dim=0)
        combined_edge_attr_idx = [[], []]
        combined_edge_attr_idx[0] = np.arange(0, graph.edge_attr.shape[0])  # Original edge attributes' indices
        combined_edge_attr_idx[1] = np.arange(graph.edge_attr.shape[0], graph.edge_attr.shape[0] + additional_edge_attr.shape[0])  # New edge attributes' indices

        idx_dict["edge"] = combined_edge_attr_idx

        # Create the final graph data object with the combined nodes, edges, and attributes
        combined_graph = Data(x=combined_x, edge_index=combined_edge_index, edge_attr=combined_edge_attr, y=ddi_score_features)
        combined_graph.indices = idx_dict
        graphs.append(combined_graph)

except Exception as e:
    print(f"Error processing row: {e}")



In [63]:
# Create a DataLoader for the test set
test_loader = DataLoader(graphs, batch_size=1, shuffle=False)

# Load the best model from the training phase
model.eval()

# Evaluate the model on the test set
test_preds, test_targets = [], []
with torch.no_grad():
    for graph in test_loader:
        print(graph)
        output = model(graph)
        print(output)
        test_preds.append(output.detach())
        test_targets.append(graph.y.detach())

test_preds = torch.cat(test_preds).view(-1, 1)
test_targets = torch.cat(test_targets)
test_r2 = r2_score(test_targets, test_preds)
test_loss = loss_fn(test_preds, test_targets.float()).item()

print(f'Test Loss: {test_loss:.4f}, Test R2: {test_r2:.4f}')

DataBatch(
  x=[11, 3],
  edge_index=[2, 30],
  edge_attr=[30, 2],
  y=[1, 3],
  indices={
    node=[1],
    edge_index=[1],
    edge=[1],
  },
  batch=[11],
  ptr=[2]
)
tensor([-1.0000])
Test Loss: 0.9999, Test R2: -inf


In [None]:
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
import torch
import numpy as np

data = []
for graph, index in zip(graphs, indices):
    graph.indices = index
    data.append(graph)

# dataset = CustomGraphDataset(graphs, indices)
loader = DataLoader(data, batch_size=1, shuffle=True)

# Define model, optimizer, and loss function
model = DSIGNN(in_channels=3, hidden_channels=8, out_channels=1, edge_channels=2)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-6)
loss_fn = torch.nn.MSELoss()

num_epochs = 10
for epoch in range(num_epochs):
    total_loss = 0
    for graph in loader:
        # Forward Pass
        output = model(graph)
        loss = loss_fn(output, graph.y.float())

        # Backward Pass
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)


        for name, param in model.named_parameters():
          if param.requires_grad and total_loss < 0.01 and name == 'conv1.nn.weight':
            print(f"Gradient for {name}: {param.grad}")


        with torch.no_grad():
          optimizer.step()
        total_loss += loss.item()

    average_loss = total_loss / len(loader.dataset)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {average_loss:.4f}')

RuntimeError: Sizes of tensors must match except in dimension 0. Expected size 8 but got size 40 for tensor number 1 in the list.