In [1]:
%load_ext autoreload
%autoreload 2
import os
import torch
import numpy as np
import random

# Move to the project root so that `modules/` and `datasets/` resolve as relative
# paths. The guard keeps this cell idempotent: an unconditional chdir("../..")
# climbs two more levels every time the cell is re-run, which breaks every
# relative path used afterwards.
if not os.path.isdir("modules"):
    os.chdir("../..")
print(os.getcwd())

# cuBLAS workspace size that PyTorch requires for deterministic cuBLAS kernels.
# NOTE: deterministic algorithms are switched off below, so this setting does
# not by itself make runs bit-for-bit reproducible.
os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'

# Seed every RNG the notebook touches: NumPy, Python's `random`, and PyTorch
# (CPU, the current CUDA device, and all CUDA devices).
seed = 21
np.random.seed(seed)
random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Speed is favoured over strict determinism: non-deterministic kernels are
# allowed and cuDNN autotuning (benchmark mode) is enabled.
torch.use_deterministic_algorithms(False)
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True

d:\aw_workspace\main_project\vle_dtmpnn


In [2]:
from modules.data_pipeline import DataPipeline
# Input files, given relative to the current working directory.
components_csv_path = 'datasets/components.csv'
raw_dataset_path = 'datasets/dataset.csv'

# Build the pipeline from the component table, then run it on the raw dataset.
# run_pipeline returns a pair: the canonical data and the list of graphs.
pipeline = DataPipeline(components_csv=components_csv_path)
canonical_data, graph_list = pipeline.run_pipeline(raw_csv=raw_dataset_path)

Loaded 859 components from registry
EXECUTION STARTED

Step 1: Parsing raw data...

Step 2: Parsing systems...
Identified 80000 unique systems

Step 3: Parsing mole fractions and activity coefficients...

Step 4: Constructing molecular graphs...

=== DIAGNOSTIC SUMMARY ===
Total input rows: 440000
Graphs created: 440000
Graphs missing: 0

Skip reasons:
  - Systems with no valid components: 0
  - Component instances missing SMILES: 0
  - Component instances with invalid molecules: 0

First 10 skipped systems:

PIPELINE COMPLETE!
Processed 440000 data points
Created 440000 molecular graphs
Unique systems: 80000


In [3]:
import modules.datasplit_module as dsm
# Shuffle a copy rather than `graph_list` itself, so the pipeline output keeps
# its original order and re-running this cell does not shuffle an already
# shuffled list. The global `random` state (seeded in the setup cell) is still
# used, so a fresh Restart & Run All yields the same permutation as an in-place
# shuffle of `graph_list` would.
sampled_graph_list = list(graph_list)
random.shuffle(sampled_graph_list)

# Split into train/val/test with the same seed used for the other RNGs.
train, val, test = dsm.system_disjoint_split(
    sampled_graph_list,
    random_state=seed,
    stratify_by_components=True,
)

Component distribution:
  Train: {2: 196000, 3: 112000}
  Val:   {2: 42000, 3: 24000}
  Test:  {2: 42000, 3: 24000}

Datapoints -> Train: 308000, Val: 66000, Test: 66000
Unique systems -> Train: 56000, Val: 12000, Test: 12000


In [4]:
from torch_geometric.loader import DataLoader

def _build_loader(split, shuffle):
    """Wrap one data split in a DataLoader using the settings shared by all splits."""
    return DataLoader(
        dataset=split,
        batch_size=1024,
        shuffle=shuffle,
        follow_batch=['component_batch'],
    )


# Only the training split is shuffled; validation and test keep their order.
train_loader = _build_loader(train, shuffle=True)
val_loader = _build_loader(val, shuffle=False)
test_loader = _build_loader(test, shuffle=False)

In [10]:
# --- 1. Configuration ---
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Gradient-tracking switch (the DTMPNN constructor takes a `track_grad` argument).
track_grad = True
# Passed to the trainer as `include_gd`; also part of the log, checkpoint and study names.
include_gd = False
# Number of trials requested from Optuna per call to study.optimize.
N_TRIALS = 20
# Number of training epochs for each trial.
N_EPOCHS = 50

In [11]:
import optuna
from optuna.pruners import MedianPruner
import modules.trainer_module as tm
import modules.dtmpnn as gm
from pathlib import Path
import joblib
import sys


# Echo the run configuration so it is recorded next to the cell output.
print(
    f"Device: {device}",
    f"Include GD: {include_gd}",
    f"Trials: {N_TRIALS}",
    f"Epochs per trial: {N_EPOCHS}",
    sep="\n",
)

def objective(trial: optuna.trial.Trial) -> float:
    """
    Optuna objective: sample hyperparameters, build and train a DTMPNN, and
    return the best validation loss.

    Args:
        trial: Optuna trial used to sample the hyperparameters. It is also
            handed to the trainer as ``optuna_trial`` (presumably for
            intermediate reporting and pruning; confirm in ``trainer_module``).

    Returns:
        ``trainer.best_val_loss`` when training finishes, or ``float('nan')``
        when the trial raised an unexpected exception. Optuna records a NaN
        objective as a FAILED trial and carries on with the study, whereas a
        returned ``inf`` would be stored as a COMPLETE trial and be counted
        among the successful ones.

    Raises:
        optuna.TrialPruned: Re-raised unchanged so Optuna marks the trial as pruned.
    """
    # One log file per trial, in a directory keyed by the include_gd setting.
    log_dir = Path(f"notebooks/hyperparams_search/HPO_reports/optuna_logs_{include_gd}")
    log_dir.mkdir(exist_ok=True, parents=True)
    log_file_path = log_dir / f'trial_{trial.number}.log'

    # --- 1. DEFINE THE HYPERPARAMETER SEARCH SPACE ---
    # Learning rate and weight decay are sampled on a log scale.
    lr = trial.suggest_float('lr', 1e-5, 1e-2, log=True)
    weight_decay = trial.suggest_float('weight_decay', 1e-7, 1e-4, log=True)
    gd_weight = 1.0  # held fixed; not part of the search space
    graph_hidden_dim = trial.suggest_categorical('graph_hidden_dim', [16, 32, 64, 128])
    latent_dim = trial.suggest_categorical('latent_dim', [16, 32, 64])
    context_dim = trial.suggest_categorical('context_dim', [16, 32, 64])
    graph_layers = trial.suggest_int('graph_layers', 2, 5)

    # --- 2. RUN THE TRIAL ---
    try:
        # Feature widths are read from the first training graph.
        first_graph = train[0]
        model = gm.DTMPNN(
            node_dim=first_graph.x.shape[1],
            edge_dim=first_graph.edge_attr.shape[1],
            graph_hidden_dim=graph_hidden_dim,
            latent_dim=latent_dim,
            context_dim=context_dim,
            graph_layers=graph_layers,
            # Taken from the configuration cell instead of a hard-coded True.
            track_grad=track_grad,
        ).to(device)

        trainer = tm.DTMPNNTrainer(
            model=model,
            train_loader=train_loader,
            val_loader=val_loader,
            test_loader=test_loader,
            include_gd=include_gd,
            device=device,
            lr=lr,
            weight_decay=weight_decay,
            gd_weight=gd_weight
        )

        # Train the model; checkpoint saving is switched off for HPO runs.
        trainer.train(
            epochs=N_EPOCHS,
            save_dir=f'notebooks/hyperparams_search/HPO_reports/optuna_checkpoints_{include_gd}/trial_{trial.number}',
            log_file_path=log_file_path,
            save_best=False,
            save_every=None,
            optuna_trial=trial
        )

        # --- 3. RETURN THE METRIC TO MINIMIZE ---
        return trainer.best_val_loss

    except optuna.TrialPruned:
        print(f"Trial {trial.number} was pruned.")
        raise

    except Exception as e:
        # Best effort: report the error and keep the study running. NaN makes
        # Optuna mark this trial as FAILED rather than as a completed trial
        # with an infinite loss.
        print(f"Trial {trial.number} failed with error: {e}", file=sys.stderr)
        return float('nan')

    finally:
        # Release cached CUDA memory on every exit path (success, pruned, failed).
        torch.cuda.empty_cache()

Device: cuda
Include GD: False
Trials: 20
Epochs per trial: 50


In [12]:
# --- 5. Study Runner ---
def run_study():
    """
    Create or resume the Optuna study for the current ``include_gd`` setting,
    run ``N_TRIALS`` more trials of ``objective``, and print a summary.

    The study is stored in a SQLite database and opened with
    ``load_if_exists=True``, so each call adds trials to the same study.

    Returns:
        The ``optuna.Study`` object, also when the run was stopped with a
        keyboard interrupt.
    """
    # Median pruning: no pruning until 5 trials have finished, none before
    # step 5 within a trial, and a pruning check at every step after that.
    pruner = MedianPruner(
        n_startup_trials=5,
        n_warmup_steps=5,
        interval_steps=1
    )

    study_name = f"DTMPNN_hpo_{'gd' if include_gd else 'no_gd'}"

    # SQLite does not create missing parent directories, so make sure the
    # database directory exists here instead of relying on another cell.
    dashboard_dir = Path("notebooks/hyperparams_search/HPO_reports/dashboard")
    dashboard_dir.mkdir(parents=True, exist_ok=True)
    study_db_path = f"sqlite:///{dashboard_dir.as_posix()}/master_hpo_study.db"

    study = optuna.create_study(
        study_name=study_name,
        storage=study_db_path,
        load_if_exists=True,
        direction='minimize',
        pruner=pruner
    )

    print(f"--- Starting/Resuming study: {study_name} ---")
    print(f"--- Database at: {study_db_path} ---")

    try:
        study.optimize(
            objective,
            n_trials=N_TRIALS,
            timeout=None,
            gc_after_trial=True
        )
    except KeyboardInterrupt:
        # Finished trials are already in the database; fall through to the summary.
        print("--- HPO interrupted by user. Study is saved. ---")

    print("--- Study complete ---")

    # --- PRINT RESULTS ---
    # Read the trial list once and bucket it by final state.
    all_trials = study.trials
    trial_state = optuna.trial.TrialState
    pruned_trials = [t for t in all_trials if t.state == trial_state.PRUNED]
    completed_trials = [t for t in all_trials if t.state == trial_state.COMPLETE]
    failed_trials = [t for t in all_trials if t.state == trial_state.FAIL]
    # Anything that is neither complete, pruned nor failed (e.g. still running).
    other_count = (
        len(all_trials) - len(completed_trials) - len(pruned_trials) - len(failed_trials)
    )

    print(f"Total trials: {len(all_trials)}")
    print(f"  Completed: {len(completed_trials)}")
    print(f"  Pruned:    {len(pruned_trials)}")
    print(f"  Failed:    {len(failed_trials)}")
    if other_count:
        print(f"  Other:     {other_count}")

    if completed_trials:
        print("\nBest trial:")
        best_trial = study.best_trial
        print(f"  Value (min val_loss): {best_trial.value:.6f}")
        print("  Params: ")
        for key, value in best_trial.params.items():
            print(f"    {key}: {value}")
    else:
        print("No trials completed successfully.")

    return study

In [13]:
# --- 6. Run the HPO Study ---
if __name__ == "__main__":
    # parents=True creates HPO_reports and dashboard in one call and does not
    # fail when an intermediate directory is missing.
    Path("notebooks/hyperparams_search/HPO_reports/dashboard").mkdir(parents=True, exist_ok=True)
    study = run_study()

[I 2025-11-18 14:39:08,452] Using an existing study with name 'DTMPNN_hpo_no_gd' instead of creating a new one.


--- Starting/Resuming study: DTMPNN_hpo_no_gd ---
--- Database at: sqlite:///notebooks/hyperparams_search/HPO_reports/dashboard/master_hpo_study.db ---


[I 2025-11-18 15:17:30,280] Trial 51 finished with value: 0.16409032585529182 and parameters: {'lr': 0.0014414995626272722, 'weight_decay': 2.2879840314182504e-07, 'graph_hidden_dim': 128, 'latent_dim': 64, 'context_dim': 16, 'graph_layers': 5}. Best is trial 48 with value: 0.15843770481072939.



--- Full log successfully finalized and written to: trial_51.log ---


[I 2025-11-18 15:49:57,408] Trial 52 pruned.               


--- Trial pruned at epoch 43. ---
Trial 52 was pruned.


[I 2025-11-18 16:31:32,362] Trial 53 finished with value: 0.1512628116286718 and parameters: {'lr': 0.0012293526076691823, 'weight_decay': 2.429199151209364e-07, 'graph_hidden_dim': 128, 'latent_dim': 64, 'context_dim': 16, 'graph_layers': 5}. Best is trial 53 with value: 0.1512628116286718.



--- Full log successfully finalized and written to: trial_53.log ---


[I 2025-11-18 17:14:20,930] Trial 54 finished with value: 0.15216840482675112 and parameters: {'lr': 0.0010411605698050982, 'weight_decay': 1.0060964951517347e-07, 'graph_hidden_dim': 128, 'latent_dim': 64, 'context_dim': 16, 'graph_layers': 5}. Best is trial 53 with value: 0.1512628116286718.



--- Full log successfully finalized and written to: trial_54.log ---


[I 2025-11-18 17:18:38,996] Trial 55 pruned.               


--- Trial pruned at epoch 5. ---
Trial 55 was pruned.


[I 2025-11-18 17:23:45,934] Trial 56 pruned.               


--- Trial pruned at epoch 6. ---
Trial 56 was pruned.


[I 2025-11-18 17:28:03,479] Trial 57 pruned.               


--- Trial pruned at epoch 5. ---
Trial 57 was pruned.


[I 2025-11-18 18:10:54,458] Trial 58 finished with value: 0.15141738997055934 and parameters: {'lr': 0.0010221095157197783, 'weight_decay': 4.360182667707147e-07, 'graph_hidden_dim': 128, 'latent_dim': 64, 'context_dim': 16, 'graph_layers': 5}. Best is trial 53 with value: 0.1512628116286718.



--- Full log successfully finalized and written to: trial_58.log ---


[I 2025-11-18 18:15:12,059] Trial 59 pruned.               


--- Trial pruned at epoch 5. ---
Trial 59 was pruned.


[I 2025-11-18 18:19:31,444] Trial 60 pruned.               


--- Trial pruned at epoch 5. ---
Trial 60 was pruned.


[I 2025-11-18 19:02:59,054] Trial 61 finished with value: 0.15181168409494253 and parameters: {'lr': 0.0010151099487462466, 'weight_decay': 2.615389453337195e-07, 'graph_hidden_dim': 128, 'latent_dim': 64, 'context_dim': 16, 'graph_layers': 5}. Best is trial 53 with value: 0.1512628116286718.



--- Full log successfully finalized and written to: trial_61.log ---


[I 2025-11-18 19:39:27,573] Trial 62 finished with value: 0.15352897632580537 and parameters: {'lr': 0.0010660069123260318, 'weight_decay': 2.443970189545713e-07, 'graph_hidden_dim': 128, 'latent_dim': 64, 'context_dim': 16, 'graph_layers': 5}. Best is trial 53 with value: 0.1512628116286718.



--- Full log successfully finalized and written to: trial_62.log ---


[I 2025-11-18 19:42:30,380] Trial 63 pruned.               


--- Trial pruned at epoch 5. ---
Trial 63 was pruned.


[I 2025-11-18 19:45:32,704] Trial 64 pruned.               


--- Trial pruned at epoch 5. ---
Trial 64 was pruned.


[I 2025-11-18 19:49:47,735] Trial 65 pruned.               


--- Trial pruned at epoch 7. ---
Trial 65 was pruned.


[I 2025-11-18 19:52:51,263] Trial 66 pruned.               


--- Trial pruned at epoch 5. ---
Trial 66 was pruned.


[I 2025-11-18 19:55:53,530] Trial 67 pruned.               


--- Trial pruned at epoch 5. ---
Trial 67 was pruned.


[I 2025-11-18 20:07:58,457] Trial 68 pruned.               


--- Trial pruned at epoch 20. ---
Trial 68 was pruned.


[I 2025-11-18 20:11:02,247] Trial 69 pruned.               


--- Trial pruned at epoch 5. ---
Trial 69 was pruned.


[I 2025-11-18 20:14:05,798] Trial 70 pruned.               


--- Trial pruned at epoch 5. ---
Trial 70 was pruned.
--- Study complete ---
Total trials: 71
  Completed: 35
  Pruned:    35

Best trial:
  Value (min val_loss): 0.151263
  Params: 
    lr: 0.0012293526076691823
    weight_decay: 2.429199151209364e-07
    graph_hidden_dim: 128
    latent_dim: 64
    context_dim: 16
    graph_layers: 5
