In [1]:
%load_ext autoreload
%autoreload 2
import os
import random
import numpy as np
import torch
import torch.backends.cudnn as cudnn
import torch
from torch_geometric.loader import DataLoader
from torch_geometric.data import Batch
from dataset import CMPNNDataset
from model import CMPNNEncoder, FFNHead
# Verify working directory and raw data files
print('Current working directory:', os.getcwd())
print('Project files:', os.listdir(os.getcwd()))
print('Raw directory files:', os.listdir(os.path.join(os.getcwd(), 'raw')))
## reproducibility seeds
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)
cudnn.deterministic = True
cudnn.benchmark = False

Current working directory: /home/calvin/code/vibe_cmpnn
Project files: ['preprocessing.py', 'dataset.py', '.pytest_cache', 'raw', 'vibe_test.ipynb', '__pycache__', 'processed', '0392-1.pdf', 'Sample_Model_Run.ipynb', 'model.py', 'model_tests.py']
Raw directory files: ['val.csv', 'SAMPL.csv', 'train.csv', 'test.csv']


In [2]:
# 0. Scaffold-based train/val/test split in-memory
import pandas as pd
from dataset import scaffold_split_df
from preprocessing import StandardScaler

# read full CSV
df = pd.read_csv('raw/SAMPL.csv')
# split by Bemis–Murcko scaffold
train_df, val_df, test_df = scaffold_split_df(df, valid_ratio=0.1, test_ratio=0.1, seed=42)
# write splits back to disk for CMPNNDataset


scaler = StandardScaler()
scaler.fit(train_df.y.values)
train_df['y'] = scaler.transform(train_df.y.values)
val_df['y']   = scaler.transform(val_df.y.values)
test_df['y']  = scaler.transform(test_df.y.values)

train_df.to_csv('raw/train.csv', index=False)
val_df.to_csv  ('raw/val.csv',   index=False)
test_df.to_csv ('raw/test.csv',  index=False)
print(f'Splits: train={len(train_df)}, val={len(val_df)}, test={len(test_df)}')

Splits: train=514, val=64, test=64


In [3]:
# 1. Load datasets and create DataLoaders
from dataset import CMPNNDataset
from torch_geometric.loader import DataLoader
from torch_geometric.data import Batch
train_ds = CMPNNDataset(root='.', csv_file='train.csv')
val_ds   = CMPNNDataset(root='.', csv_file='val.csv')
test_ds  = CMPNNDataset(root='.', csv_file='test.csv')
print(f'Train/Val/Test sizes: {len(train_ds)}/{len(val_ds)}/{len(test_ds)}')
# batching
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True,  collate_fn=Batch.from_data_list)
val_loader   = DataLoader(val_ds,   batch_size=32, shuffle=False, collate_fn=Batch.from_data_list)
test_loader  = DataLoader(test_ds,  batch_size=32, shuffle=False, collate_fn=Batch.from_data_list)

Train/Val/Test sizes: 514/64/64


Processing...
Done!
Processing...
Done!
Processing...
Done!


In [38]:
# 2. Build model, optimizer, criterion,
device   = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model    = CMPNNEncoder(in_node_feats=127, in_edge_feats=12, hidden_dim=300, num_steps=6, dropout=0, n_tasks=1,
                        readout='sum').to(device)
head     = FFNHead(in_dim=300, hidden_dim=300, out_dim=1, dropout=0.0).to(device)
optimizer= torch.optim.Adam(list(model.parameters()) + list(head.parameters()), lr=3e-4, weight_decay=1e-5)
criterion= torch.nn.MSELoss()

# training with early‐stopping on MSE_scaled,
best_val = float('inf')
best_wts = None
for epoch in range(1, 101):
    model.train(); head.train()
    train_loss = 0.0
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        z     = model.embed(batch.x, batch.edge_index, batch.edge_attr, batch.batch)
        y_pred= head(z).view(-1)
        loss  = criterion(y_pred, batch.y.view(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(list(model.parameters()) + list(head.parameters()), 5.0)
        optimizer.step()
        train_loss += loss.item() * batch.num_graphs
    mse_train = train_loss / len(train_ds)
    # freeze BN stats after first epoch
    if epoch == 2:
        model.bn_node.eval()
        model.bn_edge.eval()

    # validation on scaled targets,
    model.eval(); head.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in val_loader:
            batch   = batch.to(device)
            z       = model.embed(batch.x, batch.edge_index, batch.edge_attr, batch.batch)
            y_pred  = head(z).view(-1)
            val_loss+= criterion(y_pred, batch.y.view(-1)).item() * batch.num_graphs
    mse_val = val_loss / len(val_ds)
    print(f'Epoch {epoch:02d}  MSE_scaled Train: {mse_train:.4f}, Val: {mse_val:.4f}')
    if mse_val < best_val:
        best_val = mse_val
        best_wts = (model.state_dict(), head.state_dict())

# load best weights,
model.load_state_dict(best_wts[0])
head.load_state_dict(best_wts[1])

Epoch 01  MSE_scaled Train: 309819512891.5402, Val: 3.9098
Epoch 02  MSE_scaled Train: 0.9010, Val: 0.6115
Epoch 03  MSE_scaled Train: 0.7806, Val: 1.5661
Epoch 04  MSE_scaled Train: 0.8214, Val: 1.3700
Epoch 03  MSE_scaled Train: 0.7806, Val: 1.5661
Epoch 04  MSE_scaled Train: 0.8214, Val: 1.3700
Epoch 05  MSE_scaled Train: 0.3598, Val: 2.7203
Epoch 06  MSE_scaled Train: 0.3205, Val: 0.8201
Epoch 05  MSE_scaled Train: 0.3598, Val: 2.7203
Epoch 06  MSE_scaled Train: 0.3205, Val: 0.8201
Epoch 07  MSE_scaled Train: 0.1477, Val: 2.9974
Epoch 08  MSE_scaled Train: 0.2667, Val: 1.8507
Epoch 07  MSE_scaled Train: 0.1477, Val: 2.9974
Epoch 08  MSE_scaled Train: 0.2667, Val: 1.8507
Epoch 09  MSE_scaled Train: 0.1305, Val: 0.3536
Epoch 10  MSE_scaled Train: 0.1036, Val: 2.0967
Epoch 09  MSE_scaled Train: 0.1305, Val: 0.3536
Epoch 10  MSE_scaled Train: 0.1036, Val: 2.0967
Epoch 11  MSE_scaled Train: 0.2031, Val: 0.7059
Epoch 12  MSE_scaled Train: 0.0838, Val: 1.2527
Epoch 11  MSE_scaled Train: 0

<All keys matched successfully>

In [39]:
model.eval(); head.eval()
test_loss = 0.0
y_true, y_pred = [], []
with torch.no_grad():
    for batch in test_loader:
        batch = batch.to(device)
        z     = model.embed(batch.x, batch.edge_index, batch.edge_attr, batch.batch)
        y_s   = head(z).view(-1).cpu().numpy()                  # scaled
        y_o   = scaler.inverse_transform(y_s)                   # real units
        y_pred.append(y_o)
        y_true.append(scaler.inverse_transform(batch.y.cpu().numpy()))
                                 # already real units now
y_pred = np.concatenate(y_pred)
y_true = np.concatenate(y_true)

rmse = np.sqrt(((y_pred - y_true) ** 2).mean())
mae  = np.abs(y_pred - y_true).mean()
r2   = 1 - ((y_pred - y_true) ** 2).sum() / ((y_true - y_true.mean()) ** 2).sum()
print(f'Test RMSE: {rmse:.3f} kcal/mol   MAE: {mae:.3f} kcal/mol   R²: {r2:.3f}')


Test RMSE: 2.666 kcal/mol   MAE: 2.172 kcal/mol   R²: 0.420


In [40]:
# after training
best_val_rmse_real = (best_val ** 0.5) * scaler.std_
print(f'Best-epoch Val RMSE = {best_val_rmse_real:.2f} kcal/mol')


Best-epoch Val RMSE = 2.08 kcal/mol


In [41]:
print('σ used for un-scaling :', float(scaler.std_))   # expect ~2.6–3.0
print('Best val MSE (scaled) :', best_val)
print('Best val RMSE (scaled):', best_val**0.5)


σ used for un-scaling : 3.5009811920179015
Best val MSE (scaled) : 0.3536395765841007
Best val RMSE (scaled): 0.5946760265759001


In [43]:
import pandas as pd
from preprocessing import (
    scaffold_cross_validation_repeated,
    StandardScaler
)
from dataset import CMPNNDataset
from model import CMPNNEncoder, FFNHead
from torch_geometric.loader import DataLoader
from torch_geometric.data import Batch
import torch, numpy as np

# load full dataset
df = pd.read_csv('raw/SAMPL.csv')

# get 5 repeats × 5 folds each
all_splits = scaffold_cross_validation_repeated(df, n_splits=5, n_repeats=5, seed=42)

results = []
for repeat_idx, folds in enumerate(all_splits, start=1):
    for fold_idx, (train_df, test_df) in enumerate(folds, start=1):
        # fit scaler only on train targets
        scaler = StandardScaler().fit(train_df.y.values)
        train_df['y'] = scaler.transform(train_df.y.values)
        test_df ['y'] = scaler.transform(test_df.y.values)
        # save splits (or pass DataFrames directly to a custom Dataset)
        train_df.to_csv('raw/train.csv', index=False)
        test_df .to_csv('raw/test.csv',  index=False)

        # load & train
        train_ds = CMPNNDataset(root='.', csv_file='train.csv')
        test_ds  = CMPNNDataset(root='.', csv_file='test.csv')
        loader   = DataLoader(train_ds, batch_size=32, shuffle=True, collate_fn=Batch.from_data_list)
        model    = CMPNNEncoder(127,12,hidden_dim=128,num_steps=5).to(device)
        head     = FFNHead(256,64,1).to(device)
        optimizer= torch.optim.Adam(list(model.parameters())+list(head.parameters()), lr=1e-3, weight_decay=1e-5)
        criterion= torch.nn.MSELoss()

        # (run your train loop here, tracking best weights)
        # …

        # final evaluation on test split (undo scaling)
        model.eval(); head.eval()
        y_pred_s, y_true_s = [], []
        with torch.no_grad():
            for batch in DataLoader(test_ds, batch_size=32, collate_fn=Batch.from_data_list):
                batch = batch.to(device)
                z     = model.embed(batch.x, batch.edge_index, batch.edge_attr, batch.batch)
                y_s   = head(z).view(-1).cpu().numpy()
                y_true_s.append(batch.y.cpu().numpy())
                y_pred_s.append(y_s)
        y_pred = scaler.inverse_transform(np.concatenate(y_pred_s))
        y_true = scaler.inverse_transform(np.concatenate(y_true_s))
        rmse = np.sqrt(((y_pred - y_true)**2).mean())
        results.append((repeat_idx, fold_idx, rmse))

# summarize
for r,f,rmse in results:
    print(f'Repeat {r} Fold {f} → RMSE: {rmse:.3f}')

Processing...
Done!
Processing...
Done!
Processing...
Done!
Processing...
Done!
Processing...
Done!
Processing...
Done!
Processing...
Done!
Processing...
Done!
Processing...
Done!
Processing...
Done!
Processing...
Done!
Processing...
Done!
Processing...
Done!
Processing...
Done!
Processing...
Done!
Processing...
Done!
Processing...
Done!
Processing...
Done!
Processing...
Done!
Processing...
Done!
Processing...
Done!
Processing...
Done!
Processing...
Done!
Processing...
Done!
Processing...
Done!
Processing...
Done!
Processing...
Done!
Processing...
Done!
Processing...
Done!
Processing...
Done!
Processing...
Done!
Processing...
Done!
Processing...
Done!
Processing...
Done!
Processing...
Done!
Processing...
Done!
Processing...
Done!
Processing...
Done!
Processing...
Done!
Processing...
Done!
Processing...
Done!
Processing...
Done!
Processing...
Done!
Processing...
Done!
Processing...
Done!
Processing...
Done!
Processing...
Done!
Processing...
Done!
Processing...
Done!
Processing...
Done!


Repeat 1 Fold 1 → RMSE: 4.371
Repeat 1 Fold 2 → RMSE: 4.323
Repeat 1 Fold 3 → RMSE: 3.990
Repeat 1 Fold 4 → RMSE: 4.069
Repeat 1 Fold 5 → RMSE: 4.293
Repeat 2 Fold 1 → RMSE: 4.403
Repeat 2 Fold 2 → RMSE: 4.279
Repeat 2 Fold 3 → RMSE: 4.150
Repeat 2 Fold 4 → RMSE: 4.182
Repeat 2 Fold 5 → RMSE: 4.183
Repeat 3 Fold 1 → RMSE: 4.365
Repeat 3 Fold 2 → RMSE: 4.267
Repeat 3 Fold 3 → RMSE: 4.122
Repeat 3 Fold 4 → RMSE: 4.011
Repeat 3 Fold 5 → RMSE: 4.125
Repeat 4 Fold 1 → RMSE: 4.400
Repeat 4 Fold 2 → RMSE: 4.264
Repeat 4 Fold 3 → RMSE: 4.019
Repeat 4 Fold 4 → RMSE: 4.105
Repeat 4 Fold 5 → RMSE: 4.243
Repeat 5 Fold 1 → RMSE: 4.361
Repeat 5 Fold 2 → RMSE: 4.247
Repeat 5 Fold 3 → RMSE: 4.137
Repeat 5 Fold 4 → RMSE: 4.101
Repeat 5 Fold 5 → RMSE: 4.209


Done!
Processing...
Done!


In [49]:
import os, uuid
from pathlib import Path
Path('raw').mkdir(exist_ok=True)          # make sure ./raw exists

def df_to_dataset(df):
    """Write *inside* ./raw and build a CMPNNDataset that can find it."""
    fname = f'tmp_{uuid.uuid4().hex}.csv'
    path  = os.path.join('raw', fname)    # <-- write into ./raw
    df.to_csv(path, index=False)
    return CMPNNDataset(root='.', csv_file=fname)


In [51]:
import pandas as pd, numpy as np, torch, torch.nn as nn
from torch_geometric.loader import DataLoader
from torch_geometric.data import Batch
from preprocessing import scaffold_cross_validation_repeated, StandardScaler
from dataset       import CMPNNDataset
from model         import CMPNNEncoder, FFNHead

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def split_train_val(df, valid_ratio=0.1, seed=0):
    """Random 90/10 split that keeps scaffolds in train (ok for inner val)."""
    rng = np.random.RandomState(seed)
    perm = rng.permutation(len(df))
    k = int(len(df) * (1 - valid_ratio))
    return df.iloc[perm[:k]].reset_index(drop=True), df.iloc[perm[k:]].reset_index(drop=True)

def train_one_fold(train_df, test_df, seed):
    # ── scale targets on train only ────────────────────────────────
    scaler = StandardScaler().fit(train_df.y.values)
    for df in (train_df, test_df):
        df.loc[:, 'y'] = scaler.transform(df.y.values)

    # inner val split
    train_df, val_df = split_train_val(train_df, valid_ratio=0.1, seed=seed)

    # ── datasets & loaders ─────────────────────────────────────────
    def ds_from(df): return CMPNNDataset(dataframe=df)
    ds_tr, ds_val, ds_te = map(df_to_dataset, (train_df, val_df, test_df))


    L  = lambda d, bs, sh: DataLoader(d, bs, sh, collate_fn=Batch.from_data_list)
    ld_tr, ld_val, ld_te = L(ds_tr, 32, True), L(ds_val, 64, False), L(ds_te, 64, False)

    # ── model, head, optimiser ─────────────────────────────────────
    torch.manual_seed(seed)
    model = CMPNNEncoder(127, 12, hidden_dim=256, num_steps=4,
                         dropout=0.0, readout='sum',
                         use_booster=False).to(device)
    head  = FFNHead(256, 256, 1, dropout=0.0).to(device)
    opt   = torch.optim.Adam(
             list(model.parameters())+list(head.parameters()),
             lr=2e-4, weight_decay=5e-6)
    criterion = nn.MSELoss()

    # ── train with early-stopping ──────────────────────────────────
    best_val, best_w, patience, PATIENCE = float('inf'), None, 0, 25
    for epoch in range(200):                    # hard cap
        model.train(); head.train()
        for batch in ld_tr:
            batch = batch.to(device)
            opt.zero_grad()
            z = model.embed(batch.x, batch.edge_index, batch.edge_attr, batch.batch)
            loss = criterion(head(z).view(-1), batch.y.view(-1))
            loss.backward(); torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
            opt.step()
        # validation
        model.eval(); head.eval()
        val_mse, n = 0.0, 0
        with torch.no_grad():
            for batch in ld_val:
                batch = batch.to(device)
                z = model.embed(batch.x, batch.edge_index, batch.edge_attr, batch.batch)
                val_mse += criterion(head(z).view(-1), batch.y.view(-1)).item()*batch.num_graphs
                n += batch.num_graphs
        val_mse /= n
        if val_mse < best_val:
            best_val, patience, best_w = val_mse, 0, (model.state_dict(), head.state_dict())
        else:
            patience += 1
            if patience >= PATIENCE:
                break

    # ── test evaluation (real units) ───────────────────────────────
    model.load_state_dict(best_w[0]); head.load_state_dict(best_w[1])
    model.eval(); head.eval()
    yp, yt = [], []
    with torch.no_grad():
        for batch in ld_te:
            batch = batch.to(device)
            z = model.embed(batch.x, batch.edge_index, batch.edge_attr, batch.batch)
            yp.append(head(z).view(-1).cpu().numpy())
            yt.append(batch.y.cpu().numpy())
    yp = scaler.inverse_transform(np.concatenate(yp))
    yt = scaler.inverse_transform(np.concatenate(yt))
    rmse = np.sqrt(((yp - yt) ** 2).mean())
    return rmse

# ── 5 × 5 scaffold CV ──────────────────────────────────────────────
full_df = pd.read_csv('raw/SAMPL.csv')
results = []
for rep, folds in enumerate(scaffold_cross_validation_repeated(full_df, 5, 5), 1):
    for fold, (train_df, test_df) in enumerate(folds, 1):
        rmse = train_one_fold(train_df, test_df, seed=1000*rep+fold)
        results.append(rmse)
        print(f'Repeat {rep}  Fold {fold}  RMSE = {rmse:.3f} kcal/mol')

print('── summary ───────────────────────────────────────────────────')
print(f'Mean RMSE  {np.mean(results):.3f} ± {np.std(results):.3f} kcal/mol')


Processing...
Done!
Processing...
Done!
Processing...
Done!


Repeat 1  Fold 1  RMSE = 2.065 kcal/mol


Processing...
Done!
Processing...
Done!
Processing...
Done!


Repeat 1  Fold 2  RMSE = 2.047 kcal/mol


Processing...
Done!
Processing...
Done!
Processing...
Done!


Repeat 1  Fold 3  RMSE = 2.407 kcal/mol


Processing...
Done!
Processing...
Done!
Processing...
Done!
Done!


Repeat 1  Fold 4  RMSE = 3.925 kcal/mol


Processing...
Done!
Processing...
Done!
Processing...
Done!


Repeat 1  Fold 5  RMSE = 3.053 kcal/mol


Processing...
Done!
Processing...
Done!
Processing...
Done!
Done!


Repeat 2  Fold 1  RMSE = 2.837 kcal/mol


Processing...
Done!
Processing...
Done!
Processing...
Done!


Repeat 2  Fold 2  RMSE = 2.138 kcal/mol


Processing...
Done!
Processing...
Done!
Processing...
Done!


Repeat 2  Fold 3  RMSE = 3.176 kcal/mol


Processing...
Done!
Processing...
Done!
Processing...
Done!


Repeat 2  Fold 4  RMSE = 3.432 kcal/mol


Processing...
Done!
Processing...
Done!
Processing...
Done!


Repeat 2  Fold 5  RMSE = 2.522 kcal/mol


Processing...
Done!
Processing...
Done!
Processing...
Done!


Repeat 3  Fold 1  RMSE = 2.485 kcal/mol


Processing...
Done!
Processing...
Done!
Processing...
Done!


Repeat 3  Fold 2  RMSE = 2.197 kcal/mol


Processing...
Done!
Processing...
Done!
Processing...
Done!


Repeat 3  Fold 3  RMSE = 2.823 kcal/mol


Processing...
Done!
Processing...
Done!
Processing...
Done!


Repeat 3  Fold 4  RMSE = 3.820 kcal/mol


Processing...
Done!
Processing...
Done!
Processing...
Done!


Repeat 3  Fold 5  RMSE = 2.493 kcal/mol


Processing...
Done!
Processing...
Done!
Processing...
Done!


Repeat 4  Fold 1  RMSE = 2.991 kcal/mol


Processing...
Done!
Processing...
Done!
Processing...
Done!


Repeat 4  Fold 2  RMSE = 2.268 kcal/mol


Processing...
Done!
Processing...
Done!
Processing...
Done!


Repeat 4  Fold 3  RMSE = 2.870 kcal/mol


Processing...
Done!
Processing...
Done!
Processing...
Done!


Repeat 4  Fold 4  RMSE = 3.778 kcal/mol


Processing...
Done!
Processing...
Done!
Processing...
Done!


Repeat 4  Fold 5  RMSE = 2.267 kcal/mol


Processing...
Done!
Processing...
Done!
Processing...
Done!


Repeat 5  Fold 1  RMSE = 2.243 kcal/mol


Processing...
Done!
Processing...
Done!
Processing...
Done!


Repeat 5  Fold 2  RMSE = 2.112 kcal/mol


Processing...
Done!
Processing...
Done!
Processing...
Done!


Repeat 5  Fold 3  RMSE = 3.023 kcal/mol


Processing...
Done!
Processing...
Done!
Processing...
Done!


Repeat 5  Fold 4  RMSE = 3.594 kcal/mol


Processing...
Done!
Processing...
Done!
Processing...
Done!


Repeat 5  Fold 5  RMSE = 2.336 kcal/mol
── summary ───────────────────────────────────────────────────
Mean RMSE  2.756 ± 0.581 kcal/mol
