In [1]:
%load_ext autoreload
%autoreload 2
import os
import random
import numpy as np
import torch
import torch.backends.cudnn as cudnn
# Verify working directory and raw data files
print('Current working directory:', os.getcwd())
print('Project files:', os.listdir(os.getcwd()))
print('Raw directory files:', os.listdir(os.path.join(os.getcwd(), 'raw')))
## reproducibility seeds
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)
cudnn.deterministic = True
cudnn.benchmark = False

Current working directory: /home/calvin/code/vibe_cmpnn
Project files: ['dataset.py', '.pytest_cache', 'raw', 'vibe_test.ipynb', '__pycache__', 'processed', '0392-1.pdf', 'model.py', 'model_tests.py']
Raw directory files: ['val.csv', 'SAMPL.csv', 'train.csv', 'test.csv']


In [2]:
import torch
from torch_geometric.loader import DataLoader
from torch_geometric.data import Batch
from dataset import CMPNNDataset
from model import CMPNNEncoder, FFNHead

# 1. Prepare data
dataset = CMPNNDataset(root='.', csv_file='SAMPL.csv')
loader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=Batch.from_data_list)
# Check for None entries
none_indices = [i for i in range(len(dataset)) if dataset[i] is None]
print("Found None at indices:", none_indices)

# 2. Build model, encoder + FFN head
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = CMPNNEncoder(in_node_feats=127, in_edge_feats=12, hidden_dim=128, num_steps=5, dropout=0.05, n_tasks=1).to(device)
head = FFNHead(in_dim=256, hidden_dim=64, out_dim=1, dropout=0.1).to(device)
optimizer = torch.optim.Adam(list(model.parameters()) + list(head.parameters()), lr=1e-3)
criterion = torch.nn.MSELoss()  # or BCEWithLogitsLoss()

# 3. Training loop
model.train(); head.train()
for epoch in range(1, 51):
    total_loss = 0
    for batch in loader:
        batch = batch.to(device)
        # extract graph embeddings
        z = model.embed(batch.x, batch.edge_index, batch.edge_attr, batch.batch)
        # predict with FFN head
        y_pred = head(z).view(-1)
        # compute loss
        loss = criterion(y_pred, batch.y.view(-1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * batch.num_graphs
    print(f'Epoch {epoch:02d}  Loss: {total_loss/len(dataset):.4f}')

Processing...
Done!


Found None at indices: []


  return F.linear(input, self.weight, self.bias)


Epoch 01  Loss: 0.8556
Epoch 02  Loss: 0.6391
Epoch 03  Loss: 0.6104
Epoch 04  Loss: 0.4765
Epoch 05  Loss: 0.2860
Epoch 06  Loss: 0.1734
Epoch 07  Loss: 0.2184
Epoch 08  Loss: 0.1343
Epoch 09  Loss: 0.1082
Epoch 10  Loss: 0.1085
Epoch 11  Loss: 0.0912
Epoch 12  Loss: 0.0846
Epoch 13  Loss: 0.2057
Epoch 14  Loss: 0.1537
Epoch 15  Loss: 0.1168
Epoch 16  Loss: 0.0947
Epoch 17  Loss: 0.1033
Epoch 18  Loss: 0.1424
Epoch 19  Loss: 0.0831
Epoch 20  Loss: 0.0704
Epoch 21  Loss: 0.0653
Epoch 22  Loss: 0.0624
Epoch 23  Loss: 0.0584
Epoch 24  Loss: 0.0613
Epoch 25  Loss: 0.0604
Epoch 26  Loss: 0.0524
Epoch 27  Loss: 0.0813
Epoch 28  Loss: 0.0837
Epoch 29  Loss: 0.0567
Epoch 30  Loss: 0.0551
Epoch 31  Loss: 0.0504
Epoch 32  Loss: 0.0473
Epoch 33  Loss: 0.0506
Epoch 34  Loss: 0.0592
Epoch 35  Loss: 0.0530
Epoch 36  Loss: 0.0608
Epoch 37  Loss: 0.0412
Epoch 38  Loss: 0.0463
Epoch 39  Loss: 0.0425
Epoch 40  Loss: 0.0395
Epoch 41  Loss: 0.0410
Epoch 42  Loss: 0.0433
Epoch 43  Loss: 0.0341
Epoch 44  L

In [3]:
N, E = 40, 60
x         = torch.randn(N, 10)
edge_idx  = torch.randint(0, N, (2, E))
# make it bidirectional
edge_idx  = torch.cat([edge_idx, edge_idx[[1,0]]], dim=1)
edge_attr = torch.randn(edge_idx.size(1), 6)
batch_vec = torch.zeros(N, dtype=torch.long)

model = CMPNNEncoder(10, 6, hidden_dim=32, num_steps=3)
out   = model(x, edge_idx, edge_attr, batch_vec)
print(out.shape)            # torch.Size([1, 1])

torch.Size([1, 1])


In [4]:
# 6. Dataset & preprocessing tests
import torch, pandas as pd
from rdkit import Chem
from rdkit.Chem.SaltRemover import SaltRemover
from dataset import CMPNNDataset, atom_features, bond_features

# atom feature vector length: 100+6+5+4+5+5+1+1 = 127
mol = Chem.MolFromSmiles('CCO')
feat0 = atom_features(mol.GetAtomWithIdx(0))
assert feat0.shape == (127,), f'Atom feature length {feat0.shape} != 127'

# bond feature vector length: 4+1+1+6 = 12
bf = bond_features(mol.GetBondBetweenAtoms(0, 1))
assert bf.shape == (12,), f'Bond feature length {bf.shape} != 12'

# bidirected edges doubling test
data = CMPNNDataset(root='.', csv_file='SAMPL.csv')[0]
# raw Data stores undirected bonds duplicated in process()
orig = pd.read_csv('./raw/SAMPL.csv').iloc[0]['smiles']
mol0 = Chem.MolFromSmiles(orig)
n_bonds = mol0.GetNumBonds()
E = data.edge_index.size(1)//2
assert E == n_bonds, f'Edges per direction {E} != original bonds {n_bonds}'



Processing...
Done!


In [5]:
# 0. Scaffold-based train/val/test split in-memory
import pandas as pd
from dataset import scaffold_split_df
# read full CSV
df = pd.read_csv('raw/SAMPL.csv')
# split by Bemis–Murcko scaffold
train_df, val_df, test_df = scaffold_split_df(df, valid_ratio=0.1, test_ratio=0.1, seed=42)
# write splits back to disk for CMPNNDataset
train_df.to_csv('raw/train.csv', index=False)
val_df.to_csv  ('raw/val.csv',   index=False)
test_df.to_csv ('raw/test.csv',  index=False)
print(f'Splits: train={len(train_df)}, val={len(val_df)}, test={len(test_df)}')

Splits: train=514, val=64, test=64


In [6]:
# 1. Load datasets and create DataLoaders
from dataset import CMPNNDataset
from torch_geometric.loader import DataLoader
from torch_geometric.data import Batch
train_ds = CMPNNDataset(root='.', csv_file='train.csv')
val_ds   = CMPNNDataset(root='.', csv_file='val.csv')
test_ds  = CMPNNDataset(root='.', csv_file='test.csv')
print(f'Train/Val/Test sizes: {len(train_ds)}/{len(val_ds)}/{len(test_ds)}')
# batching
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True,  collate_fn=Batch.from_data_list)
val_loader   = DataLoader(val_ds,   batch_size=32, shuffle=False, collate_fn=Batch.from_data_list)
test_loader  = DataLoader(test_ds,  batch_size=32, shuffle=False, collate_fn=Batch.from_data_list)

Train/Val/Test sizes: 514/64/64


Processing...
Done!
Processing...
Done!
Processing...
Done!


In [7]:
# 2. Training loop with validation
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = CMPNNEncoder(in_node_feats=127, in_edge_feats=12, hidden_dim=128, num_steps=5, dropout=0.05, n_tasks=1, readout='gru', use_booster=True).to(device)
head  = FFNHead(in_dim=256, hidden_dim=64, out_dim=1, dropout=0.1).to(device)
optimizer = torch.optim.Adam(list(model.parameters()) + list(head.parameters()), lr=1e-3)
criterion = torch.nn.MSELoss()
best_val = float('inf')
best_weights = None
for epoch in range(1, 51):
    model.train(); head.train()
    train_loss = 0
    for batch in train_loader:
        batch = batch.to(device)
        z = model.embed(batch.x, batch.edge_index, batch.edge_attr, batch.batch)
        y_pred = head(z).view(-1)
        loss = criterion(y_pred, batch.y.view(-1))
        optimizer.zero_grad(); loss.backward(); optimizer.step()
        train_loss += loss.item() * batch.num_graphs
    avg_train = train_loss / len(train_ds)
    # validation
    model.eval(); head.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            batch = batch.to(device)
            z = model.embed(batch.x, batch.edge_index, batch.edge_attr, batch.batch)
            y_pred = head(z).view(-1)
            val_loss += criterion(y_pred, batch.y.view(-1)).item() * batch.num_graphs
    avg_val = val_loss / len(val_ds)
    print(f'Epoch {epoch:02d}: Train {avg_train:.4f}  Val {avg_val:.4f}')
    if avg_val < best_val:
        best_val = avg_val
        best_weights = (model.state_dict(), head.state_dict())
# load best model
model.load_state_dict(best_weights[0])
head.load_state_dict(best_weights[1])

Epoch 01: Train 0.7850  Val 0.9049
Epoch 02: Train 0.5953  Val 0.7930
Epoch 03: Train 0.5666  Val 0.7836
Epoch 04: Train 0.4378  Val 1.4682
Epoch 05: Train 0.6099  Val 1.1244
Epoch 06: Train 0.4168  Val 0.9850
Epoch 07: Train 0.4590  Val 0.7920
Epoch 08: Train 0.4062  Val 0.7474
Epoch 09: Train 0.3422  Val 0.6725
Epoch 10: Train 0.3576  Val 0.9579
Epoch 11: Train 0.3116  Val 1.1360
Epoch 12: Train 0.3127  Val 0.8146
Epoch 13: Train 0.3156  Val 0.8058
Epoch 14: Train 0.3163  Val 1.0900
Epoch 15: Train 0.3135  Val 0.5861
Epoch 16: Train 0.2900  Val 0.4939
Epoch 17: Train 0.2600  Val 0.6162
Epoch 18: Train 0.2606  Val 0.4906
Epoch 19: Train 0.3982  Val 0.9715
Epoch 20: Train 0.3573  Val 0.9137
Epoch 21: Train 0.4912  Val 0.8282
Epoch 22: Train 0.3222  Val 0.6584
Epoch 23: Train 0.2775  Val 0.5727
Epoch 24: Train 0.2503  Val 0.6441
Epoch 25: Train 0.2577  Val 0.6238
Epoch 26: Train 0.2542  Val 0.6298
Epoch 27: Train 0.3466  Val 0.8389
Epoch 28: Train 0.3432  Val 0.9546
Epoch 29: Train 0.41

<All keys matched successfully>

In [8]:
# 3. Test set evaluation
model.eval(); head.eval()
test_loss = 0
with torch.no_grad():
    for batch in test_loader:
        batch = batch.to(device)
        z = model.embed(batch.x, batch.edge_index, batch.edge_attr, batch.batch)
        y_pred = head(z).view(-1)
        test_loss += criterion(y_pred, batch.y.view(-1)).item() * batch.num_graphs
print('Test RMSE:', (test_loss / len(test_ds)) ** 0.5)

Test RMSE: 0.8999365532551704


In [9]:
train_y = train_df.y.values
mean, std = train_y.mean(), train_y.std()
# during training
y_norm = (batch.y - mean) / std
loss = criterion(pred.view(-1), y_norm)
# at eval time
mse = criterion(pred.view(-1), y_norm).item()
rmse = std * sqrt(mse)
print(f"RMSE (kcal/mol): {rmse:.3f}")

NameError: name 'pred' is not defined