In [231]:
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch_geometric.datasets import MoleculeNet
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
import numpy as np
from rdkit.Chem import rdFingerprintGenerator
from torch_geometric.nn import GINConv, global_add_pool
from torch_geometric.loader import DataLoader


In [232]:
def compute_ecfp(smiles, radius=0, nBits=2048):
    """Compute a Morgan (ECFP-style) bit fingerprint for one molecule.

    Args:
        smiles: SMILES string describing the molecule.
        radius: Neighbourhood radius handed to the Morgan generator.
            NOTE(review): the default of 0 only encodes each atom's own
            environment; the commonly used ECFP4 corresponds to radius=2.
            The default is kept for backward compatibility - confirm intent.
        nBits: Number of bits in the fingerprint, and therefore the length
            of the returned array.

    Returns:
        A 1-D NumPy integer array of length ``nBits`` filled from the
        fingerprint bit vector.

    Raises:
        ValueError: If RDKit cannot parse ``smiles``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError("Invalid SMILES string")
    # The generator size must follow nBits. It used to be hard-coded to 2048,
    # which silently ignored the caller's requested fingerprint length.
    mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=nBits)
    fp = mfpgen.GetFingerprint(mol)
    arr = np.zeros((nBits,), dtype=int)
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr

In [233]:
class ECFPDataset(Dataset):
    """Dataset view that turns each wrapped item into (fingerprint, label).

    Every item of the wrapped dataset is expected to expose a ``smiles``
    attribute (fed to ``compute_ecfp``) and a tensor attribute ``y``.
    """

    def __init__(self, dataset):
        # Keep a reference only; fingerprints are computed on access.
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return a float fingerprint tensor and the float label for ``idx``.

        Raises:
            ValueError: If the wrapped item has no ``smiles`` attribute.
        """
        sample = self.dataset[idx]
        if hasattr(sample, 'smiles'):
            bits = torch.tensor(compute_ecfp(sample.smiles), dtype=torch.float)
            return bits, sample.y.float()
        raise ValueError("Data object is missing a 'smiles' attribute.")

In [234]:
# Load the BACE dataset through torch_geometric's MoleculeNet wrapper, using
# data/MoleculeNet as the dataset root directory. `dataset` is the shared
# source for both the fingerprint-based and the graph-based experiments below.
dataset = MoleculeNet(root='data/MoleculeNet', name='BACE')
print("Total molecules in BACE:", len(dataset))

Total molecules in BACE: 1513


In [235]:
# Hold out 20% of the molecules for testing. The split is driven by a
# fixed-seed generator so that Restart Kernel -> Run All reproduces the same
# partition; an unseeded random_split yields a different train/test split
# (and therefore different reported accuracies) on every run.
SPLIT_SEED = 42
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
print("Train set size:", len(train_dataset))
print("Test set size:", len(test_dataset))

Train set size: 1210
Test set size: 303


In [237]:
class MLPClassifier(nn.Module):
    """Fully connected classifier: two ReLU hidden layers and a linear head.

    Args:
        input_dim: Size of each input feature vector.
        hidden_dim: Width of both hidden layers.
        output_dim: Number of output units.

    The forward pass returns the head's raw output (no activation applied).
    """

    def __init__(self, input_dim=2048, hidden_dim=128, output_dim=1):
        super().__init__()
        # Attribute names are part of the state_dict layout; keep them stable.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        return self.fc3(hidden)

In [238]:
class SingleLayerGIN(nn.Module):
    """Single GINConv layer followed by sum pooling into a graph embedding.

    Args:
        in_channels: Number of input features per node.
        out_channels: Size of the node representation produced by the GIN
            layer, and therefore of the pooled graph embedding.
    """

    def __init__(self, in_channels, out_channels):
        super(SingleLayerGIN, self).__init__()
        # Two-layer MLP applied by GINConv after neighbourhood aggregation.
        nn1 = nn.Sequential(nn.Linear(in_channels, out_channels), nn.ReLU(), nn.Linear(out_channels, out_channels))
        self.gin_conv = GINConv(nn1)

    def forward(self, data):
        """Embed a batch of graphs.

        Args:
            data: Batch object exposing ``x`` (node features), ``edge_index``
                and ``batch`` (graph assignment of each node).

        Returns:
            The pooled embeddings produced by ``global_add_pool``.
        """
        # Cast explicitly: node features may be integer-typed (presumably the
        # case for MoleculeNet's categorical atom features - verify), and the
        # linear layers inside the conv need floating-point input. Without the
        # cast this only works through implicit type promotion in the conv.
        x, edge_index = data.x.float(), data.edge_index
        x = F.relu(self.gin_conv(x, edge_index))
        graph_embedding = global_add_pool(x, data.batch)
        return graph_embedding
    
class GINWithMLP(nn.Module):
    """Graph classifier: a SingleLayerGIN encoder feeding an MLPClassifier.

    Args:
        in_channels: Number of input features per node.
        gin_out_channels: Embedding size produced by the GIN encoder and
            consumed as the MLP's input dimension.
        mlp_hidden_dim: Hidden width of the MLP head.
        output_dim: Number of output units of the MLP head.
    """

    def __init__(self, in_channels, gin_out_channels, mlp_hidden_dim, output_dim):
        super().__init__()
        self.gin = SingleLayerGIN(in_channels, gin_out_channels)
        self.mlp = MLPClassifier(gin_out_channels, mlp_hidden_dim, output_dim)

    def forward(self, data):
        # Encode the graphs, then classify the resulting embeddings.
        return self.mlp(self.gin(data))

In [None]:
# Experiment 1: train and evaluate the MLP classifier on ECFP fingerprints.

# Wrap the train/test subsets so each item becomes a (fingerprint, label) pair.
ecfp_train_dataset = ECFPDataset(train_dataset)
ecfp_test_dataset = ECFPDataset(test_dataset) 

# NOTE: `DataLoader` here resolves to torch_geometric.loader.DataLoader,
# because that import comes after (and shadows) the torch.utils.data one.
train_loader = DataLoader(ecfp_train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(ecfp_test_dataset, batch_size=32, shuffle=False)

# The model returns raw outputs without a sigmoid, so the loss that applies
# the sigmoid internally (BCEWithLogitsLoss) is used.
# NOTE(review): no RNG seed is set in the visible cells, so weight init and
# batch order differ between runs.
model = MLPClassifier(input_dim=2048, hidden_dim=128, output_dim=1)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCEWithLogitsLoss()

# Training loop
num_epochs = 25
model.train()
for epoch in range(num_epochs):
    epoch_loss = 0.0
    for fingerprints, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(fingerprints)  
        # Drop the trailing size-1 dimension so the labels line up with the
        # outputs - assumes the collated labels carry one extra trailing
        # singleton dimension; TODO confirm.
        labels = labels.squeeze(-1)    
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()  
    # Reported value is the mean of the per-batch losses for this epoch.
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss/len(train_loader):.4f}")
    
# Evaluation: threshold the sigmoid of the outputs at 0.5 and count matches
# against the test labels.
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for fingerprints, labels in test_loader:
        outputs = model(fingerprints)
        predictions = (torch.sigmoid(outputs) > 0.5).float()
        labels = labels.squeeze(-1)  
        total += labels.size(0)
        correct += (predictions == labels).sum().item()

print("correct: ", correct)
print("total: ", total)
accuracy = correct / total
print(f"Test Accuracy: {accuracy:.4f}")
    

Epoch 1/25, Loss: 0.6067
Epoch 2/25, Loss: 0.4166
Epoch 3/25, Loss: 0.3418
Epoch 4/25, Loss: 0.3068
Epoch 5/25, Loss: 0.2541
Epoch 6/25, Loss: 0.2266
Epoch 7/25, Loss: 0.2020
Epoch 8/25, Loss: 0.1798
Epoch 9/25, Loss: 0.1683
Epoch 10/25, Loss: 0.1521
Epoch 11/25, Loss: 0.1485
Epoch 12/25, Loss: 0.1276
Epoch 13/25, Loss: 0.1353
Epoch 14/25, Loss: 0.1173
Epoch 15/25, Loss: 0.1126
Epoch 16/25, Loss: 0.1177
Epoch 17/25, Loss: 0.0974
Epoch 18/25, Loss: 0.0947
Epoch 19/25, Loss: 0.0947
Epoch 20/25, Loss: 0.0841
Epoch 21/25, Loss: 0.0903
Epoch 22/25, Loss: 0.0909
Epoch 23/25, Loss: 0.0784
Epoch 24/25, Loss: 0.0623
Epoch 25/25, Loss: 0.0653
correct:  247
total:  303
Test Accuracy: 0.8152


In [244]:
# Experiment 2: train and evaluate the GIN + MLP model directly on the graphs.
# This cell rebinds train_loader, test_loader, model, optimizer and criterion
# from the fingerprint experiment, so the previous model is no longer
# reachable through `model` afterwards.

# The loaders iterate over the raw graph subsets (not the ECFP wrappers).
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# NOTE(review): no RNG seed is set in the visible cells, so weight init and
# batch order differ between runs.
model = GINWithMLP(in_channels=dataset.num_features, gin_out_channels=64, mlp_hidden_dim=128, output_dim=1)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCEWithLogitsLoss()

num_epochs = 200
model.train()
for epoch in range(num_epochs):
    epoch_loss = 0.0
    for data in train_loader:
        optimizer.zero_grad()
        outputs = model(data)  
        # Labels are used as-is, without the squeeze applied in the
        # fingerprint experiment - presumably data.y already matches the
        # shape of outputs; verify.
        labels = data.y.float()
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    # Reported value is the mean of the per-batch losses for this epoch.
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss/len(train_loader):.4f}")
    
# Evaluation: threshold the sigmoid of the outputs at 0.5 and count matches
# against the test labels.
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for data in test_loader:
        outputs = model(data)
        predictions = (torch.sigmoid(outputs) > 0.5).float()
        labels = data.y.float()
        total += labels.size(0)
        correct += (predictions == labels).sum().item()

accuracy = correct / total
print(f"Test Accuracy: {accuracy:.4f}")

Epoch 1/200, Loss: 0.8695
Epoch 2/200, Loss: 0.6897
Epoch 3/200, Loss: 0.6766
Epoch 4/200, Loss: 0.7173
Epoch 5/200, Loss: 0.6807
Epoch 6/200, Loss: 0.6860
Epoch 7/200, Loss: 0.6919
Epoch 8/200, Loss: 0.6704
Epoch 9/200, Loss: 0.6708
Epoch 10/200, Loss: 0.6683
Epoch 11/200, Loss: 0.6566
Epoch 12/200, Loss: 0.6581
Epoch 13/200, Loss: 0.6597
Epoch 14/200, Loss: 0.6546
Epoch 15/200, Loss: 0.6622
Epoch 16/200, Loss: 0.6535
Epoch 17/200, Loss: 0.6618
Epoch 18/200, Loss: 0.6613
Epoch 19/200, Loss: 0.6539
Epoch 20/200, Loss: 0.6443
Epoch 21/200, Loss: 0.6312
Epoch 22/200, Loss: 0.6267
Epoch 23/200, Loss: 0.6710
Epoch 24/200, Loss: 0.6465
Epoch 25/200, Loss: 0.6494
Epoch 26/200, Loss: 0.6424
Epoch 27/200, Loss: 0.6396
Epoch 28/200, Loss: 0.6303
Epoch 29/200, Loss: 0.6388
Epoch 30/200, Loss: 0.6135
Epoch 31/200, Loss: 0.6291
Epoch 32/200, Loss: 0.6217
Epoch 33/200, Loss: 0.6099
Epoch 34/200, Loss: 0.6218
Epoch 35/200, Loss: 0.6135
Epoch 36/200, Loss: 0.6394
Epoch 37/200, Loss: 0.6045
Epoch 38/2