In [None]:
%load_ext autoreload
%autoreload 2
import os

# Move from the notebook's folder to the project root so that the `modules`
# package and the relative `datasets/...` / `notebooks/...` paths used in later
# cells resolve. The guard makes this cell safe to re-run: without it, every
# re-execution would climb two more directories and break those paths.
if not os.path.isdir("modules"):
    os.chdir("../..")
print(os.getcwd())
import torch
import numpy as np
import random

# cuBLAS workspace setting that PyTorch asks for when deterministic algorithms
# are enabled. NOTE(review): determinism is switched OFF just below, so this
# variable currently has no practical effect - confirm which mode is intended.
os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'

# Seed every RNG source used in this notebook (NumPy, stdlib, torch CPU/CUDA).
seed = 21
np.random.seed(seed)
random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Speed over bit-exact reproducibility: non-deterministic kernels are allowed
# and cuDNN autotuning is on, so runs can differ slightly despite fixed seeds.
torch.use_deterministic_algorithms(False)
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True

In [None]:
import modules.datasplit_module as dsm
import modules.trainer_module as tm
import modules.dtmpnn as gm
from torch_geometric.loader import DataLoader
from sklearn.model_selection import GroupKFold, GroupShuffleSplit
from pathlib import Path

# --- 1. Setup & Seeding ---
seed = 21

# Apply the same seed to each RNG source, in this fixed order:
# NumPy, stdlib `random`, torch CPU, torch CUDA (current device, then all devices).
for seed_fn in (
    np.random.seed,
    random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
):
    seed_fn(seed)

# cuDNN: allow non-deterministic kernels and enable autotuning.
for flag_name, flag_value in (('deterministic', False), ('benchmark', True)):
    setattr(torch.backends.cudnn, flag_name, flag_value)

# Prefer the GPU whenever one is visible to torch.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

In [None]:
# --- 2. Hard-code final HPO Params ---
# Fixed hyperparameters shared by the cross-validation folds and the final
# training run. Key order is the order in which they are printed below.
final_params = dict(
    lr=0.001,
    weight_decay=1e-07,
    gd_weight=1,
    graph_hidden_dim=16,
    latent_dim=16,
    context_dim=16,
    graph_layers=2,
    constraint_type='soft',
    include_gd=False,
)

print("\n--- final Hyperparameters ---")
for param_name in final_params:
    print(f"  {param_name}: {final_params[param_name]}")

In [None]:
# --- 3. Load Full Dataset ---
print("\n--- Loading Full Dataset ---")
from modules.data_pipeline import DataPipeline
pipeline = DataPipeline(components_csv='datasets/components.csv')
canonical_data, graph_list = pipeline.run_pipeline(raw_csv='datasets/dataset.csv')
pipeline.save_canonical_df(canonical_data, 'datasets/canonical_data.csv')

# Upper bound on how many graphs (taken from the front of `graph_list`) enter
# the split. Set to None to use the whole dataset (`graph_list[:None]` is the
# full list). Kept at 1000 to preserve the notebook's current behaviour.
MAX_GRAPHS = 1000
if MAX_GRAPHS is not None and len(graph_list) > MAX_GRAPHS:
    # Make the truncation visible: the banner above says "Full Dataset".
    print(f"Note: using only the first {MAX_GRAPHS} of {len(graph_list)} graphs.")

# Split into train / val / test via the project's system-disjoint splitter.
train, val, test = dsm.system_disjoint_split(
    graph_list[:MAX_GRAPHS],
    random_state=seed,
    stratify_by_components=True,
)
# Cross-validation re-splits train+val itself; `test` is held out.
full_cv_dataset = train + val

final_test_loader = DataLoader(
    dataset=test,
    batch_size=1024,
    shuffle=False,
    follow_batch=['component_batch']
)
print(f"Loaded {len(full_cv_dataset)} total points for K-Fold CV.")
print(f"Loaded {len(test)} total 'unseen' test data points.")

# Group labels for GroupKFold: prefer `.system_id`, then `.system`.
try:
    groups = [g.system_id for g in full_cv_dataset]
except AttributeError:
    print("Warning: '.system_id' not found, trying '.system'.")
    try:
        groups = [g.system for g in full_cv_dataset]
    except Exception as e:
        print(f"CRITICAL ERROR: Could not get group IDs for splitting. {e}")
        # Last-resort fallback: every sample becomes its own group, so the
        # grouped split can no longer keep a system within a single fold.
        print("WARNING: falling back to one group per sample; "
              "CV folds will NOT be system-disjoint.")
        groups = np.arange(len(full_cv_dataset))

In [None]:
# --- 4. The K-Fold CV  ---
print("\n--- STARTING K-FOLD CROSS-VALIDATION ---")

# Number of folds and the group-aware splitter that produces them.
K_FOLDS = 5
gkf = GroupKFold(n_splits=K_FOLDS)

# Per-fold accumulators: one entry is appended to each list for every fold.
fold_results = {
    result_key: [] for result_key in ('train_rmse', 'val_rmse', 'history')
}

def create_fresh_model():
    """Build a new, untrained DTMPNN and move it to the active device.

    Input feature sizes are read from the first graph of the global `train`
    list; the remaining constructor arguments come from `final_params`.
    """
    sample_graph = train[0]
    architecture_keys = (
        'graph_hidden_dim',
        'latent_dim',
        'context_dim',
        'graph_layers',
        'constraint_type',
    )
    architecture = {key: final_params[key] for key in architecture_keys}

    network = gm.DTMPNN(
        node_dim=sample_graph.x.size(1),
        edge_dim=sample_graph.edge_attr.size(1),
        **architecture,
    )
    return network.to(device)

for fold, (train_idx, val_idx) in enumerate(gkf.split(full_cv_dataset, groups=groups)):
    fold_no = fold + 1  # human-facing fold number used in messages and paths
    print(f"\n--- Running Fold {fold_no}/{K_FOLDS} ---")

    # Run tags and the output directory for this fold's artefacts.
    c_type = final_params['constraint_type']
    gd_inc = final_params['include_gd']
    fold_dir = f'notebooks/training_phase/crossval/log/{c_type}_gd_{gd_inc}_cv/fold_{fold_no}'

    # Materialise this fold's train/validation subsets from the index arrays.
    train_fold_data = [full_cv_dataset[i] for i in train_idx]
    val_fold_data = [full_cv_dataset[i] for i in val_idx]

    # Only the training loader shuffles.
    train_loader = DataLoader(
        train_fold_data,
        batch_size=512,
        shuffle=True,
        follow_batch=['component_batch'],
    )
    val_loader = DataLoader(
        val_fold_data,
        batch_size=512,
        shuffle=False,
        follow_batch=['component_batch'],
    )

    # Every fold starts from a newly initialised model.
    model = create_fresh_model()

    trainer = tm.DTMPNNTrainer(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        test_loader=None,
        include_gd=final_params['include_gd'],
        device=device,
        lr=final_params['lr'],
        weight_decay=final_params['weight_decay'],
        gd_weight=final_params['gd_weight'],
    )

    # Train, writing the log (and the best checkpoint) under `fold_dir`.
    history = trainer.train(
        epochs=5,
        save_dir=fold_dir,
        log_file_path=f'{fold_dir}/train.log',
        save_best=True,
        save_every=None,
        patience=25,
    )
    trainer.plot_history(save_path=f'{fold_dir}/train.png')

    # Read both RMSE curves at the trainer's best epoch.
    # NOTE(review): the `- 1` assumes `best_epoch` is 1-based - confirm in trainer_module.
    best_epoch = trainer.best_epoch
    best_idx = best_epoch - 1
    train_rmse_at_best = history['train_rmse'][best_idx]
    val_rmse_at_best = history['val_rmse'][best_idx]

    print(f"  Fold {fold_no} Best Epoch: {best_epoch}")
    print(f"  Fold {fold_no} Train RMSE: {train_rmse_at_best:.6f}")
    print(f"  Fold {fold_no} Val RMSE:   {val_rmse_at_best:.6f}")

    # Record this fold's outcome in the shared accumulators.
    for result_key, result_value in (
        ('train_rmse', train_rmse_at_best),
        ('val_rmse', val_rmse_at_best),
        ('history', history),
    ):
        fold_results[result_key].append(result_value)

In [None]:
# --- 5. Aggregate CV Results (for Table 1) ---
# Mean and spread of the per-fold best-epoch RMSEs.
# `np.std` is used with its default ddof=0 (population standard deviation).
train_scores = fold_results['train_rmse']
val_scores = fold_results['val_rmse']

train_mu, train_sig = np.mean(train_scores), np.std(train_scores)
val_mu, val_sig = np.mean(val_scores), np.std(val_scores)

summary_lines = (
    f"\n\n--- {K_FOLDS}-FOLD CV AGGREGATE RESULTS ---",
    f"  CV Train RMSE: {train_mu:.6f} ± {train_sig:.6f}",
    f"  CV Val RMSE:   {val_mu:.6f} ± {val_sig:.6f}",
)
print("\n".join(summary_lines))

In [None]:
# --- 6. Train Final Model ---
print("\n\n--- STARTING FINAL MODEL TRAINING ---")

# Derive the run tags here instead of relying on variables left over from the
# CV loop, so this cell does not depend on that loop's leaked state.
c_type = final_params['constraint_type']
gd_inc = final_params['include_gd']
final_run_dir = f'notebooks/training_phase/final_model/log/{c_type}_gd_{gd_inc}'

final_train_data = train
# Validate (best-checkpoint selection / early stopping) on `val`, NOT on
# `test`: picking the best epoch on the test set would leak it into model
# selection and bias the "unseen test set" evaluation done afterwards.
final_val_data = val
print(f"Final Model: Training on {len(final_train_data)}, Validating on {len(final_val_data)}")

final_train_loader = DataLoader(final_train_data, batch_size=1024, shuffle=True, follow_batch=['component_batch'])
final_val_loader = DataLoader(final_val_data, batch_size=1024, shuffle=False, follow_batch=['component_batch'])

# Fresh, untrained network for the final run. It must be this instance that is
# handed to the trainer - not `model`, which is the already-trained network
# left over from the last CV fold.
trained_model = create_fresh_model()
trained_trainer = tm.DTMPNNTrainer(
        model=trained_model,
        train_loader=final_train_loader,
        val_loader=final_val_loader,
        test_loader=final_test_loader,
        include_gd=final_params['include_gd'],
        device=device,
        lr=final_params['lr'],
        weight_decay=final_params['weight_decay'],
        gd_weight=final_params['gd_weight'],
    )

# NOTE(review): patience (50) exceeds epochs (5), so early stopping likely
# cannot trigger with these settings - confirm the intended values.
trained_history = trained_trainer.train(
    epochs=5,
    save_dir=final_run_dir,
    log_file_path=f'{final_run_dir}/train.log',
    save_best=True,
    save_every=None,
    patience=50
)
save_path = f'{final_run_dir}/final_training_history.png'
trained_trainer.plot_history(save_path=save_path)
print(f"\ntrained model training history saved to {save_path}")

In [None]:
# --- 7. Final Unbiased Evaluation  ---
print("\n--- trained Training Complete ---")
print(f"Loading best trained model from epoch: {trained_trainer.best_epoch}")

# Rebuild the run tags locally so the checkpoint path does not depend on
# variables leaked from an earlier loop.
c_type = final_params['constraint_type']
gd_inc = final_params['include_gd']
model_path = f'notebooks/training_phase/final_model/log/{c_type}_gd_{gd_inc}/best_model.pt'
print("Load location:", model_path)
trained_trainer.load_checkpoint(model_path)

print("\n--- EVALUATING trained MODEL ON UNSEEN TEST SET ---")
# NOTE(review): the unpacking order (total, data-driven, GD, RMSE, MAE) is
# taken from the variable names used here - confirm against
# DTMPNNTrainer.validate.
test_loss, test_data_driven, test_gd, test_rmse, test_mae = trained_trainer.validate(final_test_loader)

print("\n--- FINAL TEST METRICS (TABLE 1) ---")
# Report the total loss and the data-driven term under their own labels
# (previously the total loss was printed under the "Data-driven" label and
# `test_data_driven` was never shown).
print(f"  Test Loss (Total): {test_loss:.6f}")
print(f"  Test Loss (Data-driven): {test_data_driven:.6f}")
print(f"  Test GD Loss (monitored): {test_gd:.6f}")
print(f"  Test RMSE: {test_rmse:.6f}")
print(f"  Test MAE:  {test_mae:.6f}")
print("\n--- RUN COMPLETE ---")
# Release cached GPU memory held by torch's allocator.
torch.cuda.empty_cache()