<a href="https://colab.research.google.com/github/aliu-7/Molecular-Property-Prediction-and-Optimization/blob/main/4_2_4_Training_a_GNN_Model_for_Molecular_Property_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installations

In [None]:
# Install RDKit (via conda-forge) using a Colab-friendly wrapper
# condacolab bootstraps a conda/mamba installation inside the Colab runtime.
!pip install -q condacolab
import condacolab
# NOTE(review): the recorded output shows "Restarting kernel..." after this call —
# confirm the commands below still run in the same cell, or re-run them after the restart.
condacolab.install()

# After runtime restarts automatically, run this:
!mamba install -c conda-forge rdkit -y

# Reinstall PyTorch Geometric (CPU version shown here; adjust for GPU if needed)
# NOTE(review): the wheel index below is hard-coded to torch 2.0.0 (CPU) — confirm it matches
# the torch version actually installed in the runtime. None of the packages are version-pinned.
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-2.0.0+cpu.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-2.0.0+cpu.html
!pip install -q torch-geometric

⏬ Downloading https://github.com/jaimergp/miniforge/releases/download/24.11.2-1_colab/Miniforge3-colab-24.11.2-1_colab-Linux-x86_64.sh...
📦 Installing...
📌 Adjusting configuration...
🩹 Patching environment...
⏲ Done in 0:00:10
🔁 Restarting kernel...

Looking for: ['rdkit']

[?25l[2K[0G[+] 0.0s
[2K[1A[2K[0G[+] 0.1s
conda-forge/linux-64  ⣾  
conda-forge/noarch    ⣾  [2K[1A[2K[1A[2K[0G[+] 0.2s
conda-forge/linux-64   2%
conda-forge/noarch     4%[2K[1A[2K[1A[2K[0G[+] 0.3s
conda-forge/linux-64   9%
conda-forge/noarch    20%[2K[1A[2K[1A[2K[0G[+] 0.4s
conda-forge/linux-64  13%
conda-forge/noarch    34%[2K[1A[2K[1A[2K[0G[+] 0.5s
conda-forge/linux-64  18%
conda-forge/noarch    45%[2K[1A[2K[1A[2K[0G[+] 0.6s
conda-forge/linux-64  23%
conda-forge/noarch    57%[2K[1A[2K[1A[2K[0G[+] 0.7s
conda-forge/linux-64  28%
conda-forge/noarch    67%[2K[1A[2K[1A[2K[0G[+] 0.8s
conda-forge/linux-64  32%
conda-forge/noarch    75%[2K[1A[2K[1A[2K[0G[+] 0.9s
conda

# Loading and Preprocessing Dataset

In [None]:
# Step 2: Load and preprocess the dataset
import pandas as pd
from rdkit import Chem
from torch_geometric.data import Data
from sklearn.model_selection import train_test_split
import torch
from torch.nn.functional import one_hot
from torch_geometric.loader import DataLoader

# Node and bond feature helpers
def atom_features(atom):
    """Encode a single atom as a 3-element float tensor.

    The entries are, in order: the atomic number, the atom's degree, and an
    aromaticity flag (1 when the atom is aromatic, 0 otherwise).
    """
    atomic_num = atom.GetAtomicNum()
    degree = atom.GetDegree()
    aromatic_flag = int(atom.GetIsAromatic())
    return torch.tensor([atomic_num, degree, aromatic_flag], dtype=torch.float)

def bond_features(bond):
    """One-hot encode a bond's type as a length-5 float tensor.

    Classes 0-3 are single, double, triple and aromatic bonds respectively;
    every other bond type falls into the catch-all class 4.
    """
    bond_type = Chem.rdchem.BondType
    known_types = (
        bond_type.SINGLE,
        bond_type.DOUBLE,
        bond_type.TRIPLE,
        bond_type.AROMATIC,
    )
    class_of = {kind: idx for idx, kind in enumerate(known_types)}
    class_idx = class_of.get(bond.GetBondType(), 4)
    encoded = one_hot(torch.tensor(class_idx), num_classes=5)
    return encoded.float()

def smiles_to_data(smiles, label):
    """Convert a SMILES string and its label into a PyTorch Geometric ``Data`` graph.

    Args:
        smiles: SMILES string describing the molecule.
        label: Numeric target value; stored as a 1-element float tensor in ``y``.

    Returns:
        A ``Data`` object with node features ``x`` (one row per atom, built by
        ``atom_features``), a bidirectional ``edge_index`` of shape
        ``[2, 2 * num_bonds]``, matching ``edge_attr`` rows (built by
        ``bond_features``) and the label ``y``. Returns ``None`` when RDKit
        cannot parse the SMILES or the parsed molecule contains no atoms.
    """
    mol = Chem.MolFromSmiles(smiles)
    # An unparsable SMILES yields None; a molecule without atoms cannot form a graph
    # (torch.stack would raise on the empty feature list), so both are skipped.
    if mol is None or mol.GetNumAtoms() == 0:
        return None

    x = torch.stack([atom_features(atom) for atom in mol.GetAtoms()])
    edge_index, edge_attr = [], []

    for bond in mol.GetBonds():
        i, j = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        feat = bond_features(bond)
        # Store every bond in both directions so the graph is undirected.
        edge_index += [[i, j], [j, i]]
        edge_attr += [feat, feat]

    if edge_index:
        edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
        edge_attr = torch.stack(edge_attr)
    else:
        # Molecules without bonds (single atoms / ions): torch.stack([]) would raise and
        # the transposed empty tensor would have shape [0], so build correctly shaped
        # empty tensors. The width 5 must match the one-hot size used by bond_features.
        edge_index = torch.empty((2, 0), dtype=torch.long)
        edge_attr = torch.empty((0, 5), dtype=torch.float)

    return Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=torch.tensor([label], dtype=torch.float))

# Step 3: Fetch the BBBP table and turn every row into a molecular graph
url = "https://raw.githubusercontent.com/Data-Chemist-Handbook/Data-Chemist-Handbook.github.io/refs/heads/master/_pages/BBBP.csv"
df = pd.read_csv(url)

# Rows whose SMILES could not be converted come back as None and are dropped here.
graph_list = [
    graph
    for graph in map(smiles_to_data, df['smiles'], df['p_np'])
    if graph is not None
]

# Step 4: 80/20 train/test split with a fixed seed, then wrap both parts in loaders
train_data, test_data = train_test_split(
    graph_list,
    test_size=0.2,
    random_state=42,
)
# Only the training loader shuffles; the test loader keeps the split order.
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32)

[05:27:15] Explicit valence for atom # 1 N, 4, is greater than permitted
[05:27:15] Explicit valence for atom # 6 N, 4, is greater than permitted
[05:27:15] Explicit valence for atom # 6 N, 4, is greater than permitted
[05:27:16] Explicit valence for atom # 11 N, 4, is greater than permitted
[05:27:16] Explicit valence for atom # 12 N, 4, is greater than permitted
[05:27:16] Explicit valence for atom # 5 N, 4, is greater than permitted
[05:27:16] Explicit valence for atom # 5 N, 4, is greater than permitted
[05:27:16] Explicit valence for atom # 5 N, 4, is greater than permitted
[05:27:16] Explicit valence for atom # 5 N, 4, is greater than permitted
[05:27:16] Explicit valence for atom # 5 N, 4, is greater than permitted
[05:27:16] Explicit valence for atom # 5 N, 4, is greater than permitted


# Define the GCN Model

In [None]:
import torch.nn as nn
from torch_geometric.nn import GCNConv, global_mean_pool

class GCNModel(nn.Module):
    """Two-layer graph convolutional network for graph-level binary classification.

    Node features pass through two ``GCNConv`` layers with ReLU activations, are
    averaged per graph with ``global_mean_pool``, and are mapped by a linear layer
    followed by a sigmoid to one probability per graph.

    Args:
        in_channels: Number of input features per node. Defaults to 3.
        hidden_channels: Width of both graph-convolution layers. Defaults to 64.
    """

    def __init__(self, in_channels=3, hidden_channels=64):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.fc = nn.Linear(hidden_channels, 1)

    def forward(self, data):
        """Return the sigmoid output of the final linear layer for each graph in ``data``.

        Only ``data.x``, ``data.edge_index`` and ``data.batch`` are read; any edge
        attributes stored on ``data`` are not used by this model.
        """
        x, edge_index, batch = data.x, data.edge_index, data.batch
        x = torch.relu(self.conv1(x, edge_index))
        x = torch.relu(self.conv2(x, edge_index))
        # Collapse node embeddings into one vector per graph.
        x = global_mean_pool(x, batch)
        return torch.sigmoid(self.fc(x))

# Train and Evaluate the Model

In [None]:
import torch.optim as optim

# Use the GPU when PyTorch can see one; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = GCNModel().to(device)

# The model already applies a sigmoid, so the loss works on probabilities.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Training loop
def train():
    """Run one optimisation pass over ``train_loader``.

    Uses the notebook-level ``model``, ``optimizer``, ``criterion`` and ``device``.

    Returns:
        The mean training loss per graph for this pass (0.0 if the loader is empty).
    """
    model.train()
    total_loss, num_graphs = 0.0, 0
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        # Flatten explicitly: a bare squeeze() turns a one-graph batch into a 0-dim
        # tensor that no longer matches the shape of the targets.
        output = model(batch).reshape(-1)
        target = batch.y.reshape(-1)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        # Weight by batch size so a smaller final batch does not skew the mean.
        total_loss += loss.item() * target.numel()
        num_graphs += target.numel()
    return total_loss / num_graphs if num_graphs else 0.0

# Evaluation loop
def test():
    """Return the fraction of graphs in ``test_loader`` classified correctly.

    Predictions are the model outputs thresholded at 0.5. Uses the notebook-level
    ``model`` and ``device``; gradients are disabled during evaluation.
    """
    model.eval()
    n_correct, n_seen = 0, 0
    with torch.no_grad():
        for graphs in test_loader:
            graphs = graphs.to(device)
            probs = model(graphs).squeeze()
            labels = graphs.y
            hard_pred = (probs > 0.5).float()
            n_correct += (hard_pred == labels).sum().item()
            n_seen += labels.size(0)
    return n_correct / n_seen

# Run training: 5 passes over the training set, reporting test accuracy after each one.
# NOTE(review): the recorded output shows the same accuracy (0.76) after every epoch —
# possibly the model is predicting a single class; confirm by inspecting the predictions.
for epoch in range(5):
    train()
    acc = test()
    # epoch is zero-based, so add 1 for display.
    print(f"Epoch {epoch+1}, Test Accuracy: {acc:.2f}")

Epoch 1, Test Accuracy: 0.76
Epoch 2, Test Accuracy: 0.76
Epoch 3, Test Accuracy: 0.76
Epoch 4, Test Accuracy: 0.76
Epoch 5, Test Accuracy: 0.76
