<a href="https://colab.research.google.com/github/aliu-7/Molecular-Property-Prediction-and-Optimization/blob/main/4_2_6_Full_GNN_Pipeline_for_Molecular_Property_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Code Example: BBBP Prediction with GCN

In [None]:
# Step 1: Install required libraries
!pip install torch-scatter -f https://data.pyg.org/whl/torch-2.0.0+cpu.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-2.0.0+cpu.html
!pip install torch-geometric
!pip install rdkit!pip install torch-scatter -f https://data.pyg.org/whl/torch-2.0.0+cpu.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-2.0.0+cpu.html
!pip install torch-geometric
!pip install rdkit

In [None]:
# Step 2: Import required libraries and load the BBBP dataset
import torch
import pandas as pd
from rdkit import Chem
from torch_geometric.data import Data
from sklearn.model_selection import train_test_split


# Location of the BBBP CSV hosted in the Data Chemist Handbook repository.
# Later steps read its 'smiles' and 'p_np' columns.
url = "https://raw.githubusercontent.com/Data-Chemist-Handbook/Data-Chemist-Handbook.github.io/refs/heads/master/_pages/BBBP.csv"

# pandas fetches the file over HTTP and parses it into a DataFrame.
df = pd.read_csv(filepath_or_buffer=url)

# Step 3: Create atom features
def atom_features(atom):
    """Describe a single atom as a list of four numbers.

    Args:
        atom: An object exposing ``GetAtomicNum()``, ``GetTotalDegree()``,
            ``GetFormalCharge()`` and ``GetIsAromatic()`` (in this notebook,
            the atoms of an RDKit molecule).

    Returns:
        list: ``[atomic number, total degree, formal charge, aromatic flag]``,
        where the aromatic flag is the aromaticity value converted with
        ``int()``.
    """
    atomic_number = atom.GetAtomicNum()
    total_degree = atom.GetTotalDegree()
    formal_charge = atom.GetFormalCharge()
    aromatic_flag = int(atom.GetIsAromatic())
    return [atomic_number, total_degree, formal_charge, aromatic_flag]

# Step 4: Build PyTorch Geometric Data objects
# Every usable row of `df` becomes one graph: atoms are the nodes (described by
# atom_features), bonds are the edges, and the 'p_np' value is the graph label.
molecules = []
for smiles, label in zip(df['smiles'], df['p_np']):
    mol = Chem.MolFromSmiles(smiles)
    # Skip SMILES that RDKit could not turn into a molecule, and molecules
    # without atoms: they would produce a graph with no node-feature matrix.
    if mol is None or mol.GetNumAtoms() == 0:
        continue

    atom_feats = [atom_features(atom) for atom in mol.GetAtoms()]

    # Store each bond in both directions so the graph is undirected.
    edge_pairs = []
    for bond in mol.GetBonds():
        start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        edge_pairs.append([start, end])
        edge_pairs.append([end, start])

    x = torch.tensor(atom_feats, dtype=torch.float)
    if edge_pairs:
        # (num_edges, 2) -> (2, num_edges), the layout passed to Data below.
        edge_index = torch.tensor(edge_pairs, dtype=torch.long).t().contiguous()
    else:
        # A molecule without bonds (e.g. a lone atom) has no edges. Build the
        # empty index explicitly: transposing an empty 1-D tensor would give
        # shape (0,) rather than (2, 0).
        edge_index = torch.empty((2, 0), dtype=torch.long)
    y = torch.tensor([label], dtype=torch.float)

    molecules.append(Data(x=x, edge_index=edge_index, y=y))

# Step 5: Train/test split
# Hold out 20% of the graphs for testing; the fixed random_state makes the
# partition repeatable from run to run.
train_data, test_data = train_test_split(
    molecules,
    test_size=0.2,
    random_state=42,
)

Looking in links: https://data.pyg.org/whl/torch-2.0.0+cpu.html
Looking in links: https://data.pyg.org/whl/torch-2.0.0+cpu.html
Collecting rdkit
  Downloading rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Downloading rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl (34.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.9/34.9 MB[0m [31m57.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2025.3.3


[05:46:45] Explicit valence for atom # 1 N, 4, is greater than permitted
[05:46:45] Explicit valence for atom # 6 N, 4, is greater than permitted
[05:46:46] Explicit valence for atom # 6 N, 4, is greater than permitted
[05:46:46] Explicit valence for atom # 11 N, 4, is greater than permitted
[05:46:46] Explicit valence for atom # 12 N, 4, is greater than permitted
[05:46:46] Explicit valence for atom # 5 N, 4, is greater than permitted
[05:46:46] Explicit valence for atom # 5 N, 4, is greater than permitted
[05:46:46] Explicit valence for atom # 5 N, 4, is greater than permitted
[05:46:46] Explicit valence for atom # 5 N, 4, is greater than permitted
[05:46:46] Explicit valence for atom # 5 N, 4, is greater than permitted
[05:46:46] Explicit valence for atom # 5 N, 4, is greater than permitted


# Building the GCN Model

In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_mean_pool
from torch_geometric.loader import DataLoader

class GCNClassifier(torch.nn.Module):
    """Graph-level binary classifier built from two GCN layers.

    Node features go through two ``GCNConv`` layers with ReLU activations, are
    averaged per graph with ``global_mean_pool``, and are mapped by a linear
    layer plus sigmoid to a single value per graph.

    Args:
        in_channels: Number of features per node. The default of 4 matches the
            four values produced per atom elsewhere in this notebook.
        hidden_channels: Output width of the first GCN layer.
        out_channels: Output width of the second GCN layer, which is also the
            input width of the final linear layer.
    """

    def __init__(self, in_channels=4, hidden_channels=32, out_channels=64):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)
        self.linear = torch.nn.Linear(out_channels, 1)

    def forward(self, x, edge_index, batch):
        """Compute one sigmoid output per graph in the batch.

        Args:
            x: Node feature matrix for all graphs in the batch.
            edge_index: Edge index for all graphs in the batch.
            batch: Vector assigning each node to its graph, used for pooling.

        Returns:
            A 1-D tensor with one value in (0, 1) per graph.
        """
        x = F.relu(self.conv1(x, edge_index))
        x = F.relu(self.conv2(x, edge_index))
        x = global_mean_pool(x, batch)
        # Drop only the trailing size-1 feature dimension. A bare squeeze()
        # would also drop the batch dimension when the batch holds one graph,
        # returning a 0-dim tensor that no longer lines up with the labels.
        return torch.sigmoid(self.linear(x)).squeeze(-1)

# Training the GCN

In [None]:
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Model, loss and optimizer.
model = GCNClassifier()
model = model.to(device)
loss_fn = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Mini-batch loaders; only the training loader shuffles.
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32)

for epoch in range(5):
    model.train()
    # Sum of the per-batch loss values for this epoch (not an average), so the
    # printed number scales with the number of batches.
    total_loss = 0
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        probs = model(batch.x, batch.edge_index, batch.batch)
        batch_loss = loss_fn(probs, batch.y)
        batch_loss.backward()
        optimizer.step()
        total_loss = total_loss + batch_loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

Epoch 1, Loss: 29.0699
Epoch 2, Loss: 27.9700
Epoch 3, Loss: 27.8474
Epoch 4, Loss: 27.7612
Epoch 5, Loss: 27.6931


# Evaluating Performance

In [None]:
# Score the trained model on the held-out graphs.
model.eval()
correct, total = 0, 0
with torch.no_grad():  # no gradients are needed for evaluation
    for batch in test_loader:
        batch = batch.to(device)
        probs = model(batch.x, batch.edge_index, batch.batch)
        # Outputs above 0.5 count as the positive class.
        preds = probs > 0.5
        labels = batch.y.bool()
        correct += (preds == labels).sum().item()
        total += batch.y.size(0)

# Fraction of test graphs whose thresholded prediction matches the label.
accuracy = correct / total
print(f"Test Accuracy: {accuracy:.2f}")

Test Accuracy: 0.76
