# INITIALIZATION

In [None]:
# IMPORTING LIBRARIES
import copy
import csv
import os
import random
import time
os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'  # or ':16:8'
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdmolops
from rdkit.Chem import rdDistGeom as molDG
from rdkit.Chem import Descriptors
from rdkit.Chem.rdchem import GetPeriodicTable
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as init
from torch_geometric.nn import MessagePassing, GCNConv, global_mean_pool, GATConv
from torch_geometric.utils import add_self_loops, degree
from torch_geometric.data import Data, Dataset
from torch_geometric.loader import DataLoader
from torch_geometric.nn.inits import reset
import networkx as nx
import matplotlib
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
# FUNCTIONS
from data_processing import load_dataset, smiles_to_graph, process_dataset, generate_graphs
from path_helpers import get_path
from stats_compute import compute_statistics, scale_graphs
from mol_visualize import recon_3d, viz_3d
from smart_loader import load_model_for_inference
import ModelArchitecture
from EnhancedDataSplit import DataSplitter
from collections import defaultdict
from typing import Tuple, List
# DIRECTORY SETUP
# current_directory is the process working directory at the time this cell runs;
# parent_directory is the directory one level above it.
current_directory = os.getcwd()
parent_directory = os.path.dirname(current_directory)

In [None]:
# HYPERPARAMETER SETTINGS
from datetime import datetime
# Wall-clock stamp (YYYYmmdd_HHMMSS) captured once when this cell runs.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# Reproducibility settings
seed = 21          # seeds numpy, random and torch below
split_seed = 42    # passed to DataSplitter as random_state in the data-loading cells
batch_size = 32    # batch size used for every DataLoader in this notebook
runtime = timestamp
selected_device = 'cuda' # either 'cuda' or 'cpu'
# NOTE(review): the device is hard-coded; on a machine without CUDA the later
# `.to(device)` calls are expected to fail - switch selected_device to 'cpu' there.
device = torch.device(selected_device)

# CUDA Deterministic (ON/OFF SETTING)
# For PyTorch
# Seed every random number generator used in this notebook.
np.random.seed(seed)
random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
# Determinism is currently switched OFF: deterministic algorithms are disabled and
# cuDNN benchmark mode is enabled. NOTE(review): with these values GPU results are
# presumably not bit-for-bit reproducible despite the fixed seeds; flip all three
# (True / True / False) to turn determinism ON.
torch.use_deterministic_algorithms(False)
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.benchmark = True
print('device           :', device)
print('seed             :', seed)
print('split seed       :', split_seed)

# PARITY PLOT GENERATION (MEDIAN MODEL)

In [None]:
model_inference_dir = os.path.join(
    os.path.dirname(os.getcwd()),
    "models",
    "models_root",
    "model_for_inference",
    "ras_baseline"
)
# Position of the checkpoint to load within the sorted directory listing.
# NOTE(review): index 1 is the SECOND entry, not the first; presumably this is
# the "median" model named in the section title - confirm against the directory.
model_index = 1
# List files in a sorted (deterministic) order and pick the entry at model_index
files = sorted(os.listdir(model_inference_dir))
if len(files) == 0:
    raise FileNotFoundError(f"No files found in {model_inference_dir}")
# Guard the index as well: a directory with too few files would otherwise
# fail with a bare IndexError on the lookup below.
if len(files) <= model_index:
    raise FileNotFoundError(
        f"Expected at least {model_index + 1} files in {model_inference_dir}, found {len(files)}"
    )
model_file = files[model_index]
print(f"Using model file: {model_file}")
# Full path
path = os.path.join(model_inference_dir, model_file)
########### IMPORTING MODEL ###############
selected_device = 'cuda'
model = load_model_for_inference(path, device=device)
############################################

In [None]:
# LOAD & GRAPH GENERATION FOR EITHER SRS OR RAS
# Component table: map every abbreviation to its SMILES string.
df_components = load_dataset(get_path(file_name='components_set.csv', folder_name='datasets'))
smiles_dict = {
    abbreviation: smiles
    for abbreviation, smiles in zip(df_components['Abbreviation'], df_components['SMILES'])
}
df_systems = load_dataset(get_path(file_name='systems_set.csv', folder_name='datasets'))
smiles_list = df_components["SMILES"].dropna().tolist()
mol_name_dict = dict(smiles_dict)
# GRAPH
system_graphs = process_dataset(df_systems, smiles_dict)
# SPLIT (options: rarity_aware_unseen_amine_split, stratified_random_split)
splitter = DataSplitter(system_graphs, random_state=split_seed)
splitter.print_dataset_stats()
train_data, val_data, test_data = splitter.rarity_aware_unseen_amine_split()
# Scaling statistics are computed from the training split only.
stats = compute_statistics(train_data)
conc_mean, conc_std, temp_mean, temp_std, pco2_mean, pco2_std = (stats[i] for i in range(6))
# Deep-copy each split before scale_graphs is applied, so the pre-scaling
# values remain available to later cells.
original_train_data, original_val_data, original_test_data = (
    copy.deepcopy(split) for split in (train_data, val_data, test_data)
)
combined_original_data = original_train_data + original_val_data + original_test_data
# Apply the train-derived statistics to all three splits.
train_data, val_data, test_data = (
    scale_graphs(split, conc_mean, conc_std, temp_mean, temp_std, pco2_mean, pco2_std)
    for split in (train_data, val_data, test_data)
)
# Batch the splits; only the training loader shuffles.
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_data, shuffle=False, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

In [None]:
# Computing the variance of alpha_co2 on the datasets
def get_alpha_stats(dataset):
    """Summarise the ``aco2`` attribute over every item of ``dataset``.

    Each item's ``aco2`` is wrapped in a one-element tensor and all of them are
    concatenated (assumes ``aco2`` is a scalar value - TODO confirm).

    Returns:
        dict with keys ``sample_var`` (N-1 denominator), ``population_var``
        (N denominator), ``min``, ``max``, ``range`` (max - min), ``mean``,
        ``std`` (N-1 denominator), ``median`` and ``count``. All entries are
        Python numbers.
    """
    values = torch.cat([torch.tensor([data.aco2]) for data in dataset])

    lowest = values.min()
    highest = values.max()

    summary = {
        'sample_var': values.var(unbiased=True).item(),       # N-1
        'population_var': values.var(unbiased=False).item(),  # N
        'min': lowest.item(),
        'max': highest.item(),
        'range': (highest - lowest).item(),
        'mean': values.mean().item(),
        'std': values.std(unbiased=True).item(),
        'median': values.median().item(),
        'count': len(values),
    }
    return summary

# Calculate statistics for all datasets (on the copies taken before scaling)
train_stats = get_alpha_stats(original_train_data)
val_stats = get_alpha_stats(original_val_data)
test_stats = get_alpha_stats(original_test_data)

# Print comprehensive statistics
print("=" * 80)
print("α_CO2 DATASET STATISTICS")
print("=" * 80)

datasets = [('Train', train_stats), ('Val', val_stats), ('Test', test_stats)]

# The loop variable is deliberately NOT called `stats`: that name already holds
# the scaling statistics returned by compute_statistics in the data-loading
# cell, and reusing it here would silently overwrite them.
for name, split_stats in datasets:
    print(f"\n{name:>5} Dataset (n={split_stats['count']}):")
    print(f"  Range:     [{split_stats['min']:.4f}, {split_stats['max']:.4f}]  (span: {split_stats['range']:.4f})")
    print(f"  Mean:      {split_stats['mean']:.4f}  ±  {split_stats['std']:.4f}")
    print(f"  Median:    {split_stats['median']:.4f}")
    print(f"  Variance:  {split_stats['sample_var']:.6f} (sample), {split_stats['population_var']:.6f} (population)")

print("\n" + "=" * 80)
print("RELATIVE COMPARISONS (vs Train)")
print("=" * 80)

# Compare validation and test against the training split; the "overlap" lines
# report whether the split's min/max lies inside the training range.
for name, split_stats in datasets[1:]:  # Skip train for comparison
    print(f"\n{name} vs Train:")
    print(f"  Range ratio:    {split_stats['range']/train_stats['range']:.3f}")
    print(f"  Mean ratio:     {split_stats['mean']/train_stats['mean']:.3f}")
    print(f"  Variance ratio: {split_stats['sample_var']/train_stats['sample_var']:.3f}")
    print(f"  Min overlap:    {'Yes' if split_stats['min'] >= train_stats['min'] else 'No'} ({split_stats['min']:.4f} vs {train_stats['min']:.4f})")
    print(f"  Max overlap:    {'Yes' if split_stats['max'] <= train_stats['max'] else 'No'} ({split_stats['max']:.4f} vs {train_stats['max']:.4f})")

In [None]:
""" # LOAD DATASET FOR RASHYB
df_components = load_dataset(get_path(file_name = 'components_set.csv', folder_name='datasets'))
smiles_dict = dict(zip(df_components['Abbreviation'], df_components['SMILES']))
df_systems = load_dataset(get_path(file_name = 'systems_set.csv', folder_name='datasets'))
smiles_list = df_components["SMILES"].dropna().tolist()
mol_name_dict = smiles_dict.copy()
# GRAPH
system_graphs = process_dataset(df_systems, smiles_dict)
splitter_1 = DataSplitter(system_graphs, random_state=split_seed)
RASset1, RASset2, RASset3 = splitter_1.rarity_aware_unseen_amine_split()
opt_data = RASset1 + RASset2

# HYBRID
splitter_2 = DataSplitter(opt_data, random_state=split_seed)
SRSset1, SRSset2, SRSset3 = splitter_2.stratified_random_split()
train_data = SRSset1
val_data = SRSset2 + SRSset3
test_data = RASset3
#Retrieve the statistics of train_data
stats = compute_statistics(train_data)
conc_mean = stats[0]
conc_std = stats[1]
temp_mean = stats[2]
temp_std = stats[3]
pco2_mean = stats[4]
pco2_std = stats[5]
#Apply the scaling to validation and test
original_train_data = copy.deepcopy(train_data)
original_val_data = copy.deepcopy(val_data)
original_test_data = copy.deepcopy(test_data)
combined_original_data = original_train_data + original_val_data + original_test_data
train_data = scale_graphs(train_data, conc_mean, conc_std, temp_mean, temp_std, pco2_mean, pco2_std)
val_data = scale_graphs(val_data, conc_mean, conc_std, temp_mean, temp_std, pco2_mean, pco2_std)
test_data = scale_graphs(test_data, conc_mean, conc_std, temp_mean, temp_std, pco2_mean, pco2_std)
#Load the data into DataLoader
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False) """

In [None]:
# PARITY PLOT GENERATION
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import numpy as np
from sklearn.metrics import r2_score, mean_squared_error
import torch

def collect_predictions_and_true_values(model, data_loader, device):
    """Run ``model`` over every batch of ``data_loader`` without gradients.

    The model is switched to eval mode first. Each batch is moved to
    ``device``; the model output and the batch's ``aco2`` attribute are moved
    to the CPU, converted to NumPy and appended element-wise (along the first
    axis) to the result lists.

    Returns:
        (predictions, true_values): two lists in loader order.
    """
    model.eval()
    predicted, observed = [], []

    with torch.no_grad():
        for batch in data_loader:
            batch = batch.to(device)
            batch_output = model(batch)
            predicted += list(batch_output.cpu().numpy())
            observed += list(batch.aco2.cpu().numpy())

    return predicted, observed

# Function to calculate R² and RMSE
def calculate_metrics(true_values, predictions):
    """Return ``(r2, rmse)`` for the given targets and predictions.

    R² comes from ``r2_score``; RMSE is the square root of
    ``mean_squared_error``.
    """
    coefficient = r2_score(true_values, predictions)
    mse = mean_squared_error(true_values, predictions)
    return coefficient, np.sqrt(mse)

# Function to save metrics to CSV
def save_metrics_to_csv(r2_train, rmse_train, r2_val, rmse_val, r2_test, rmse_test, parent_directory,
                        file_name='parity_metrics.csv'):
    """Write the R² / RMSE of the three splits to a CSV file.

    The file has a header row ``Dataset,R2,RMSE`` followed by one row each for
    Training, Validation and Test.

    Args:
        r2_train, rmse_train, r2_val, rmse_val, r2_test, rmse_test: metric
            values; each is converted with ``float()`` before writing.
        parent_directory: directory that receives the file; created if missing.
        file_name: name of the CSV file inside ``parent_directory``.

    Returns:
        The path of the written file.
    """
    # Create the metrics dictionary (column name -> column values)
    metrics_data = {
        'Dataset': ['Training', 'Validation', 'Test'],
        'R2': [float(r2_train), float(r2_val), float(r2_test)],
        'RMSE': [float(rmse_train), float(rmse_val), float(rmse_test)]
    }

    os.makedirs(parent_directory, exist_ok=True)
    output_path = os.path.join(parent_directory, file_name)
    # newline='' lets the csv module control line endings itself.
    with open(output_path, 'w', newline='', encoding='utf-8') as handle:
        writer = csv.writer(handle)
        writer.writerow(metrics_data.keys())
        # Transpose the column lists into one row per dataset.
        writer.writerows(zip(*metrics_data.values()))
    return output_path

# Function to plot the parity plot with marginal histograms
def plot_parity_plot(train_true_values, train_predictions,
                     val_true_values, val_predictions,
                     test_true_values, test_predictions,
                     parent_directory=None):
    """Draw a predicted-vs-actual scatter with stacked marginal histograms.

    Train / validation / test points are drawn in blue / green / red and their
    R² and RMSE (from ``calculate_metrics``) appear in the legend. Both axes
    are fixed to [-0.1, 2.5]. The top histogram shows the true values, the
    right one the predictions. The figure is shown with ``plt.show()``.

    Args:
        *_true_values, *_predictions: per-split targets and model outputs.
        parent_directory: when truthy, the metrics are additionally passed to
            ``save_metrics_to_csv`` together with this directory.
    """
    fontsize = 16
    matplotlib.rcParams['font.family'] = 'Times New Roman'

    # Metrics per split
    r2_train, rmse_train = calculate_metrics(train_true_values, train_predictions)
    r2_val, rmse_val = calculate_metrics(val_true_values, val_predictions)
    r2_test, rmse_test = calculate_metrics(test_true_values, test_predictions)

    # 2x2 grid: scatter bottom-left, histograms on top and on the right
    fig = plt.figure(figsize=(8, 8))
    gs = gridspec.GridSpec(2, 2, width_ratios=[4, 1], height_ratios=[1, 4],
                           hspace=0.00, wspace=0.00)
    ax = fig.add_subplot(gs[1, 0])
    ax_histx = fig.add_subplot(gs[0, 0], sharex=ax)
    ax_histy = fig.add_subplot(gs[1, 1], sharey=ax)

    # One scatter per split: (actual, predicted, colour, marker, legend label)
    scatter_series = (
        (train_true_values, train_predictions, 'b', 'o',
         f'Train   (R² = {r2_train:.4f},  RMSE = {rmse_train:.4f})'),
        (val_true_values, val_predictions, 'g', '^',
         f'Val      (R² = {r2_val:.4f},  RMSE = {rmse_val:.4f})'),
        (test_true_values, test_predictions, 'r', 'v',
         f'Test     (R² = {r2_test:.4f},  RMSE = {rmse_test:.4f})'),
    )
    for actual, predicted, colour, marker, legend_label in scatter_series:
        ax.scatter(actual, predicted,
                   edgecolors=colour, alpha=0.5, c=colour, marker=marker,
                   label=legend_label)

    # Dashed parity line, extended slightly beyond the largest true value
    line_end = max(max(train_true_values), max(val_true_values), max(test_true_values)) + 0.5
    ax.plot([-0.1, line_end], [-0.1, line_end], '--', linewidth=1.5, color='black')

    # Labels, limits, ticks and legend of the main axes
    ax.set_xlabel('Actual Solubility', fontsize=fontsize)
    ax.set_ylabel('Predicted Solubility', fontsize=fontsize)
    ax.set_xlim(-0.1, 2.5)
    ax.set_ylim(-0.1, 2.5)
    ax.tick_params(axis='both', which='major', length=6, width=0.8, labelsize=fontsize)
    ax.tick_params(axis='both', which='minor', length=4, width=0.8)
    ax.minorticks_on()
    ax.legend(fontsize=fontsize-3, loc='upper left', frameon=False)
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)

    # Settings shared by both marginal histograms
    bins = np.linspace(-0.1, 2.5, 27)
    hist_style = dict(bins=bins, color=['b', 'g', 'r'], alpha=0.5,
                      stacked=True, edgecolor='black', linewidth=0.5)

    # Top histogram: true values, stacked by split
    ax_histx.hist(
        [np.array(values).flatten()
         for values in (train_true_values, val_true_values, test_true_values)],
        **hist_style)
    ax_histx.tick_params(labelbottom=False, labelleft=False, left=False)
    for side in ('top', 'right', 'left'):
        ax_histx.spines[side].set_visible(False)

    # Right histogram: predictions, stacked by split
    ax_histy.hist(
        [np.array(values).flatten()
         for values in (train_predictions, val_predictions, test_predictions)],
        orientation='horizontal', **hist_style)
    ax_histy.tick_params(labelbottom=False, labelleft=False, bottom=False)
    for side in ('top', 'right', 'bottom'):
        ax_histy.spines[side].set_visible(False)

    plt.show()

    # Optionally hand the metrics over for saving
    if parent_directory:
        save_metrics_to_csv(r2_train, rmse_train, r2_val, rmse_val, r2_test, rmse_test, parent_directory)


# Run the loaded model over each split; every call returns (predictions, true values)
train_predictions, train_true_values = collect_predictions_and_true_values(
    model=model, data_loader=train_loader, device=device)
val_predictions, val_true_values = collect_predictions_and_true_values(
    model=model, data_loader=val_loader, device=device)
test_predictions, test_true_values = collect_predictions_and_true_values(
    model=model, data_loader=test_loader, device=device)

# Draw the parity plot (no parent_directory given, so no metrics are saved)
plot_parity_plot(
    train_true_values=train_true_values, train_predictions=train_predictions,
    val_true_values=val_true_values, val_predictions=val_predictions,
    test_true_values=test_true_values, test_predictions=test_predictions,
)

In [None]:
# Extracting embeddings
# Keep one graph per system name (the first occurrence in test_data).
unique_named_graphs = {}
for graph in test_data:
    unique_named_graphs.setdefault(graph['name'], graph)

# Convert to a list
unique_graph_list = list(unique_named_graphs.values())

unique_list = DataLoader(unique_graph_list, batch_size=batch_size, shuffle=False)
graph_representations = []
# Make sure the model is in eval mode even if this cell runs on its own.
model.eval()
with torch.no_grad():
    for data in unique_list:
        data = data.to(device)
        # A single forward pass with extract_embeddings=True; the plain
        # prediction pass that used to precede it was never used.
        graph_rep = model(data, extract_embeddings=True)
        graph_representations.append(graph_rep.cpu().numpy())
# One row per unique system, one column (dim_0, dim_1, ...) per embedding dimension.
graph_representations_df = pd.DataFrame(np.vstack(graph_representations))
graph_representations_df.columns = [f'dim_{i}' for i in range(graph_representations_df.shape[1])]
embeddings_df = graph_representations_df

# MODEL COMPARISON

In [None]:
# LOAD & GRAPH GENERATION FOR EITHER SRS OR RAS
# Component table: map every abbreviation to its SMILES string.
df_components = load_dataset(get_path(file_name='components_set.csv', folder_name='datasets'))
smiles_dict = {
    abbreviation: smiles
    for abbreviation, smiles in zip(df_components['Abbreviation'], df_components['SMILES'])
}
df_systems = load_dataset(get_path(file_name='systems_set.csv', folder_name='datasets'))
smiles_list = df_components["SMILES"].dropna().tolist()
mol_name_dict = dict(smiles_dict)
# GRAPH
system_graphs = process_dataset(df_systems, smiles_dict)
# SPLIT (options: rarity_aware_unseen_amine_split, stratified_random_split)
splitter = DataSplitter(system_graphs, random_state=split_seed)
splitter.print_dataset_stats()
train_data, val_data, test_data = splitter.rarity_aware_unseen_amine_split()
# Scaling statistics are computed from the training split only.
stats = compute_statistics(train_data)
conc_mean, conc_std, temp_mean, temp_std, pco2_mean, pco2_std = (stats[i] for i in range(6))
# Deep-copy each split before scale_graphs is applied, so the pre-scaling
# values remain available to the comparison cells.
original_train_data, original_val_data, original_test_data = (
    copy.deepcopy(split) for split in (train_data, val_data, test_data)
)
combined_original_data = original_train_data + original_val_data + original_test_data
# Apply the train-derived statistics to all three splits.
train_data, val_data, test_data = (
    scale_graphs(split, conc_mean, conc_std, temp_mean, temp_std, pco2_mean, pco2_std)
    for split in (train_data, val_data, test_data)
)
# Batch the splits; only the training loader shuffles.
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_data, shuffle=False, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

In [None]:
""" # LOAD DATASET FOR RASHYB
df_components = load_dataset(get_path(file_name = 'components_set.csv', folder_name='datasets'))
smiles_dict = dict(zip(df_components['Abbreviation'], df_components['SMILES']))
df_systems = load_dataset(get_path(file_name = 'systems_set.csv', folder_name='datasets'))
smiles_list = df_components["SMILES"].dropna().tolist()
mol_name_dict = smiles_dict.copy()
# GRAPH
system_graphs = process_dataset(df_systems, smiles_dict)
splitter_1 = DataSplitter(system_graphs, random_state=split_seed)
RASset1, RASset2, RASset3 = splitter_1.rarity_aware_unseen_amine_split()
opt_data = RASset1 + RASset2

# HYBRID
splitter_2 = DataSplitter(opt_data, random_state=split_seed)
SRSset1, SRSset2, SRSset3 = splitter_2.stratified_random_split()
train_data = SRSset1
val_data = SRSset2 + SRSset3
test_data = RASset3
#Retrieve the statistics of train_data
stats = compute_statistics(train_data)
conc_mean = stats[0]
conc_std = stats[1]
temp_mean = stats[2]
temp_std = stats[3]
pco2_mean = stats[4]
pco2_std = stats[5]
#Apply the scaling to validation and test
original_train_data = copy.deepcopy(train_data)
original_val_data = copy.deepcopy(val_data)
original_test_data = copy.deepcopy(test_data)
combined_original_data = original_train_data + original_val_data + original_test_data
train_data = scale_graphs(train_data, conc_mean, conc_std, temp_mean, temp_std, pco2_mean, pco2_std)
val_data = scale_graphs(val_data, conc_mean, conc_std, temp_mean, temp_std, pco2_mean, pco2_std)
test_data = scale_graphs(test_data, conc_mean, conc_std, temp_mean, temp_std, pco2_mean, pco2_std)
#Load the data into DataLoader
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False) """

In [None]:
# Baseline: first file (sorted order) of the ras_baseline directory
baseline_dir = os.path.join(
    os.path.dirname(os.getcwd()), "models", "models_root", "model_for_inference", "ras_baseline"
)
files = sorted(os.listdir(baseline_dir))
if not files:
    raise FileNotFoundError(f"No files found in {baseline_dir}")
model_file = files[0]
print(f"Using model file: {model_file}")
path = os.path.join(baseline_dir, model_file)
selected_device = 'cuda'
model_1 = load_model_for_inference(path, device=device)

# PIGNN: first file (sorted order) of the ras_pinn directory
pinn_dir = os.path.join(
    os.path.dirname(os.getcwd()), "models", "models_root", "model_for_inference", "ras_pinn"
)
files = sorted(os.listdir(pinn_dir))
if not files:
    raise FileNotFoundError(f"No files found in {pinn_dir}")
model_file = files[0]
print(f"Using model file: {model_file}")
path = os.path.join(pinn_dir, model_file)
selected_device = 'cuda'
model_2 = load_model_for_inference(path, device=device)

# Pair each model with a scaler that applies the training-set statistics
models = {
    "Baseline": (
        model_1,
        lambda g: scale_graphs(g, conc_mean, conc_std, temp_mean, temp_std, pco2_mean, pco2_std),
    ),
    "PINN": (
        model_2,
        lambda g: scale_graphs(g, conc_mean, conc_std, temp_mean, temp_std, pco2_mean, pco2_std),
    ),
}

In [None]:
# AAD CALCULATION
# Each entry maps a model label to (model, scaler); the scaler applies the
# training-set statistics to a graph before inference.
models = {
    "Baseline": (model_1, lambda g: scale_graphs(g, conc_mean, conc_std, temp_mean, temp_std, pco2_mean, pco2_std)),
    "PINN": (model_2, lambda g: scale_graphs(g, conc_mean, conc_std, temp_mean, temp_std, pco2_mean, pco2_std)),
}

# Min points per temperature (filtering)
min_points_per_temp = 2

# Errors are evaluated on the test split as it was before scaling.
used_data_for_inference = original_test_data
# Extract unique values from the data
unique_systems = list(set(graph.name for graph in used_data_for_inference))
unique_concentrations = list(set(graph.conc for graph in used_data_for_inference))
unique_temperatures = list(set(graph.temp for graph in used_data_for_inference))
unique_references = list(set(graph.ref for graph in used_data_for_inference))
# Store AAD results: {model label: {amine name: AAD}}
aad_results = {model_name: {} for model_name in models.keys()}

for amine in unique_systems:
    # Collect all graphs for this amine across all refs and concentrations
    graphs_amine = [g for g in used_data_for_inference if g.name == amine]
    
    # Temperature filtering: keep temperatures with at least min_points_per_temp graphs
    temp_counts = {temp: sum(1 for g in graphs_amine if g.temp == temp) for temp in set(g.temp for g in graphs_amine)}
    filtered_temps = sorted([t for t, count in temp_counts.items() if count >= min_points_per_temp])
    
    if not filtered_temps:
        continue  # skip amines without enough points
    
    # Keep only graphs in filtered temperatures
    graphs_filtered = [g for g in graphs_amine if g.temp in filtered_temps]
    
    # For each model. The loop target is named `family_model`, not `model`, so
    # the notebook-global `model` used by the parity-plot and embedding cells
    # is not silently rebound to the last entry of `models`.
    for model_name, (family_model, scaler) in models.items():
        abs_errors = []
        for g in graphs_filtered:
            g_pred = g.clone()
            
            # Ensure all scalar features are tensors
            g_pred.temp = torch.tensor([g_pred.temp], dtype=torch.float)
            g_pred.conc = torch.tensor([g_pred.conc], dtype=torch.float)
            g_pred.pco2 = torch.tensor([g_pred.pco2], dtype=torch.float)
            
            g_pred = scaler(g_pred).to(device)
            with torch.no_grad():
                pred = family_model(g_pred).cpu().numpy().flatten()
            
            abs_errors.append(np.abs(pred - g_pred.aco2))  # g_pred.aco2 can stay float
            
        # Average absolute deviation
        aad_results[model_name][amine] = np.mean(abs_errors)

# Print AAD per amine per model
for model_name, amines in aad_results.items():
    print(f"\n=== {model_name} AAD per amine ===")
    for amine, aad in amines.items():
        print(f"{amine}: {aad:.4f}")

In [None]:
# MAPE CALCULATION
# Each entry maps a model label to (model, scaler); the scaler applies the
# training-set statistics to a graph before inference.
models = {
    "Baseline": (model_1, lambda g: scale_graphs(g, conc_mean, conc_std, temp_mean, temp_std, pco2_mean, pco2_std)),
    "PINN": (model_2, lambda g: scale_graphs(g, conc_mean, conc_std, temp_mean, temp_std, pco2_mean, pco2_std)),
}

# Min points per temperature (filtering)
min_points_per_temp = 2
# Errors are evaluated on the test split as it was before scaling.
used_data_for_inference = original_test_data
# Extract unique values from the data
unique_systems = list(set(graph.name for graph in used_data_for_inference))
unique_concentrations = list(set(graph.conc for graph in used_data_for_inference))
unique_temperatures = list(set(graph.temp for graph in used_data_for_inference))
unique_references = list(set(graph.ref for graph in used_data_for_inference))
# Store MAPE results: {model label: {amine name: MAPE in percent}}
mape_results = {model_name: {} for model_name in models.keys()}

for amine in unique_systems:
    # Collect all graphs for this amine across all refs and concentrations
    graphs_amine = [g for g in used_data_for_inference if g.name == amine]
    
    # Temperature filtering: keep temperatures with at least min_points_per_temp graphs
    temp_counts = {temp: sum(1 for g in graphs_amine if g.temp == temp) for temp in set(g.temp for g in graphs_amine)}
    filtered_temps = sorted([t for t, count in temp_counts.items() if count >= min_points_per_temp])
    
    if not filtered_temps:
        continue  # skip amines without enough points
    
    # Keep only graphs in filtered temperatures
    graphs_filtered = [g for g in graphs_amine if g.temp in filtered_temps]
    
    # For each model. The loop target is named `family_model`, not `model`, so
    # the notebook-global `model` used by the parity-plot and embedding cells
    # is not silently rebound to the last entry of `models`.
    for model_name, (family_model, scaler) in models.items():
        percentage_errors = []
        for g in graphs_filtered:
            g_pred = g.clone()
            
            # Ensure all scalar features are tensors
            g_pred.temp = torch.tensor([g_pred.temp], dtype=torch.float)
            g_pred.conc = torch.tensor([g_pred.conc], dtype=torch.float)
            g_pred.pco2 = torch.tensor([g_pred.pco2], dtype=torch.float)
            
            g_pred = scaler(g_pred).to(device)
            with torch.no_grad():
                pred = family_model(g_pred).cpu().numpy().flatten()
            
            # Avoid division by zero: points with a zero target are left out
            if g_pred.aco2 != 0:
                percentage_errors.append(np.abs(pred - g_pred.aco2) / np.abs(g_pred.aco2))
        
        # Mean Absolute Percentage Error
        mape_results[model_name][amine] = np.mean(percentage_errors) * 100

# Print MAPE per amine per model
for model_name, amines in mape_results.items():
    print(f"\n=== {model_name} MAPE per amine (%) ===")
    for amine, mape in amines.items():
        print(f"{amine}: {mape:.2f}%")


# ENSEMBLE PREDICTION

In [None]:
# LOAD & GRAPH GENERATION FOR EITHER SRS OR RAS
# Component table: map every abbreviation to its SMILES string.
df_components = load_dataset(get_path(file_name='components_set.csv', folder_name='datasets'))
smiles_dict = {
    abbreviation: smiles
    for abbreviation, smiles in zip(df_components['Abbreviation'], df_components['SMILES'])
}
df_systems = load_dataset(get_path(file_name='systems_set.csv', folder_name='datasets'))
smiles_list = df_components["SMILES"].dropna().tolist()
mol_name_dict = dict(smiles_dict)
# GRAPH
system_graphs = process_dataset(df_systems, smiles_dict)
# SPLIT (options: rarity_aware_unseen_amine_split, stratified_random_split)
splitter = DataSplitter(system_graphs, random_state=split_seed)
splitter.print_dataset_stats()
train_data, val_data, test_data = splitter.rarity_aware_unseen_amine_split()
# Scaling statistics are computed from the training split only.
stats = compute_statistics(train_data)
conc_mean, conc_std, temp_mean, temp_std, pco2_mean, pco2_std = (stats[i] for i in range(6))
# Deep-copy each split before scale_graphs is applied, so the pre-scaling
# values remain available to the ensemble cells.
original_train_data, original_val_data, original_test_data = (
    copy.deepcopy(split) for split in (train_data, val_data, test_data)
)
combined_original_data = original_train_data + original_val_data + original_test_data
# Apply the train-derived statistics to all three splits.
train_data, val_data, test_data = (
    scale_graphs(split, conc_mean, conc_std, temp_mean, temp_std, pco2_mean, pco2_std)
    for split in (train_data, val_data, test_data)
)
# Batch the splits; only the training loader shuffles.
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_data, shuffle=False, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size)

In [None]:
import glob
import torch

def load_ensemble_models(model_dir, device, scaler_fn):
    """
    Load all .pth models in a directory and return a list of (model, scaler_fn).

    Checkpoints are loaded in sorted path order with
    ``load_model_for_inference``; every model is paired with the same
    ``scaler_fn``.

    Args:
        model_dir: directory that is searched (non-recursively) for ``*.pth``.
        device: forwarded to ``load_model_for_inference``.
        scaler_fn: callable stored next to each model.

    Raises:
        FileNotFoundError: if ``model_dir`` contains no ``.pth`` file. An empty
            ensemble would otherwise only surface later as NaN averages.
    """
    # Escape the directory so glob metacharacters in its name are taken literally.
    model_paths = sorted(glob.glob(f"{glob.escape(str(model_dir))}/*.pth"))
    if not model_paths:
        raise FileNotFoundError(f"No .pth files found in {model_dir}")
    models = []
    for path in model_paths:
        model = load_model_for_inference(path, device=device)
        models.append((model, scaler_fn))
        print(f"Loaded model: {path}")
    return models
# BASELINE: every .pth checkpoint in the ras_baseline directory
model_dir_baseline = os.path.join(
    os.path.dirname(os.getcwd()), "models", "models_root", "model_for_inference", "ras_baseline"
)
baseline_models = load_ensemble_models(
    model_dir=model_dir_baseline,
    device=device,
    scaler_fn=lambda g: scale_graphs(g, conc_mean, conc_std, temp_mean, temp_std, pco2_mean, pco2_std),
)
# PIGNN: every .pth checkpoint in the ras_pinn directory
model_dir_pinn = os.path.join(
    os.path.dirname(os.getcwd()), "models", "models_root", "model_for_inference", "ras_pinn"
)
pinn_models = load_ensemble_models(
    model_dir=model_dir_pinn,
    device=device,
    scaler_fn=lambda g: scale_graphs(g, conc_mean, conc_std, temp_mean, temp_std, pco2_mean, pco2_std),
)
# Model family label -> list of (model, scaler) pairs
models = {"Baseline": baseline_models, "PINN": pinn_models}

In [None]:
# AAD CALCULATION WITH ENSEMBLE
# Minimum number of graphs a temperature needs to be kept for an amine
min_points_per_temp = 2

# NOTE(review): this cell evaluates the TRAINING split (pre-scaling copy),
# whereas the single-model AAD cell uses original_test_data - confirm intended.
used_data_for_inference = original_train_data

# Distinct names / concentrations / temperatures / references in the data
unique_systems = list({graph.name for graph in used_data_for_inference})
unique_concentrations = list({graph.conc for graph in used_data_for_inference})
unique_temperatures = list({graph.temp for graph in used_data_for_inference})
unique_references = list({graph.ref for graph in used_data_for_inference})

# AAD results: {model family: {amine name: AAD}}
aad_results = {family: {} for family in models}

def predict_with_ensemble(model_list, graph, device):
    """
    Average the predictions of every (model, scaler) pair for one graph.

    For each pair the graph is cloned, its ``temp``, ``conc`` and ``pco2``
    attributes are wrapped into one-element float tensors, the scaler is
    applied and the result is moved to ``device``. The model output is
    flattened to a NumPy array; the element-wise mean over all pairs is
    returned.
    """
    member_outputs = []
    for member, rescale in model_list:
        sample = graph.clone()

        # Wrap the scalar features into one-element float tensors
        for field in ('temp', 'conc', 'pco2'):
            setattr(sample, field, torch.tensor([getattr(sample, field)], dtype=torch.float))

        sample = rescale(sample).to(device)

        with torch.no_grad():
            member_outputs.append(member(sample).cpu().numpy().flatten())

    return np.mean(member_outputs, axis=0)  # average across ensemble

# === Main loop ===
for amine in unique_systems:
    # All graphs of this amine, across every reference and concentration
    graphs_amine = [g for g in used_data_for_inference if g.name == amine]

    # Count the graphs per temperature and keep the sufficiently populated ones
    temp_counts = {}
    for temp in set(g.temp for g in graphs_amine):
        temp_counts[temp] = sum(1 for g in graphs_amine if g.temp == temp)
    filtered_temps = sorted(t for t, count in temp_counts.items() if count >= min_points_per_temp)

    if filtered_temps:
        graphs_filtered = [g for g in graphs_amine if g.temp in filtered_temps]

        # One averaged absolute deviation per model family (Baseline / PINN)
        for model_name, model_list in models.items():
            abs_errors = [
                np.abs(predict_with_ensemble(model_list, g, device) - g.aco2)  # g.aco2 can stay float
                for g in graphs_filtered
            ]
            aad_results[model_name][amine] = np.mean(abs_errors)

# === Print results ===
for model_name, amines in aad_results.items():
    print(f"\n=== {model_name} AAD per amine ===")
    for amine in amines:
        print(f"{amine}: {amines[amine]:.4f}")

In [None]:
# MAPE CALCULATION WITH ENSEMBLE
# Minimum number of graphs a temperature needs to be kept for an amine
min_points_per_temp = 2
# NOTE(review): this cell evaluates the TRAINING split (pre-scaling copy),
# whereas the single-model MAPE cell uses original_test_data - confirm intended.
used_data_for_inference = original_train_data

# Distinct names / concentrations / temperatures / references in the data
unique_systems = list({graph.name for graph in used_data_for_inference})
unique_concentrations = list({graph.conc for graph in used_data_for_inference})
unique_temperatures = list({graph.temp for graph in used_data_for_inference})
unique_references = list({graph.ref for graph in used_data_for_inference})

# MAPE results: {model family: {amine name: MAPE in percent}}
mape_results = {family: {} for family in models}

def predict_with_ensemble(model_list, graph, device):
    """
    Average the predictions of an ensemble for a single graph.

    Args:
        model_list: iterable of ``(model, scaler)`` pairs. Each scaler is
            called on the prepared graph and each model on the scaled graph.
        graph: object exposing ``clone()`` and the attributes ``temp``,
            ``conc`` and ``pco2``. Only clones are modified.
        device: target passed to ``.to(...)`` on the scaled graph.

    Returns:
        The element-wise mean (``np.mean(..., axis=0)``) of the flattened
        NumPy outputs of all ensemble members.
    """
    scalar_fields = ("temp", "conc", "pco2")
    member_outputs = []

    for model, scaler in model_list:
        sample = graph.clone()

        # Wrap each scalar condition in a one-element float tensor
        for field in scalar_fields:
            wrapped = torch.tensor([getattr(sample, field)], dtype=torch.float)
            setattr(sample, field, wrapped)

        sample = scaler(sample).to(device)

        # Inference only: no gradient tracking
        with torch.no_grad():
            output = model(sample).cpu().numpy().flatten()
        member_outputs.append(output)

    # Average across ensemble members
    return np.mean(member_outputs, axis=0)

# === Main loop ===
# Fills mape_results[model_name][amine] with the mean absolute percentage error
# (in percent) between the ensemble prediction and g.aco2.

# Group the graphs by amine once instead of rescanning the dataset per amine
graphs_by_amine = defaultdict(list)
for g in used_data_for_inference:
    graphs_by_amine[g.name].append(g)

for amine in unique_systems:
    # All graphs for this amine across all refs and concentrations
    graphs_amine = graphs_by_amine.get(amine, [])

    # Temperature filtering: count the points per temperature in a single pass
    temp_counts = defaultdict(int)
    for g in graphs_amine:
        temp_counts[g.temp] += 1
    filtered_temps = sorted(
        t for t, count in temp_counts.items() if count >= min_points_per_temp
    )

    if not filtered_temps:
        continue  # skip amines without enough points

    # Keep only graphs in filtered temperatures (set gives O(1) membership)
    kept_temps = set(filtered_temps)
    graphs_filtered = [g for g in graphs_amine if g.temp in kept_temps]

    # For each model family (Baseline / PINN)
    for model_name, model_list in models.items():
        percentage_errors = []
        for g in graphs_filtered:
            # The relative error is undefined for a zero reference value, so
            # such points are skipped before running the ensemble at all.
            if g.aco2 != 0:
                pred = predict_with_ensemble(model_list, g, device)
                percentage_errors.append(np.abs(pred - g.aco2) / np.abs(g.aco2))

        # Mean Absolute Percentage Error. When every reference value was zero
        # there is nothing to average: store NaN explicitly instead of letting
        # np.mean([]) emit a RuntimeWarning.
        if percentage_errors:
            mape_results[model_name][amine] = np.mean(percentage_errors) * 100
        else:
            mape_results[model_name][amine] = float("nan")

# === Print results ===
for model_name, amines in mape_results.items():
    print(f"\n=== {model_name} MAPE per amine (%) ===")
    for amine, mape in amines.items():
        print(f"{amine}: {mape:.2f}%")

In [None]:
# Organize original_test_data to see available combinations for isotherm visualization
# One representative graph per name: the first occurrence in test_data wins
unique_named_graphs = {}
for graph in test_data:
    unique_named_graphs.setdefault(graph['name'], graph)

# List and loader views over the representative graphs
unique_graph_list = list(unique_named_graphs.values())
unique_list = DataLoader(unique_graph_list, batch_size=batch_size, shuffle=False)

# One record per data point of original_test_data
data_summary = [
    {
        'name': point['name'],
        'temp': point.temp,
        'conc': point.conc,
        'pco2': point.pco2,
        'aco2': point.aco2,
    }
    for point in original_test_data
]

# Tabular form for easier analysis
df_summary = pd.DataFrame(data_summary)

# Report the available conditions, amine by amine (order of first appearance)
print("Available experimental conditions by amine:")
for amine in df_summary['name'].unique():
    amine_data = df_summary.loc[df_summary['name'] == amine]
    print(f"\n{amine}:")

    # Distinct temperatures, ascending
    unique_temps = sorted(amine_data['temp'].unique())
    print(f"  Temperatures (K): {[float(t) for t in unique_temps]}")

    # Distinct concentrations, ascending
    unique_concs = sorted(amine_data['conc'].unique())
    print(f"  Concentrations (M): {[float(c) for c in unique_concs]}")

    print(f"  pCO2 range: {amine_data['pco2'].min():.2e} kPa - {amine_data['pco2'].max():.2e} kPa")
    print(f"  Total data points: {len(amine_data)}")

In [None]:
# Choose specific conditions based on data availability
# Simply type the amine name you want to analyze
target_amine = "1DMA2P"  # Replace with the amine name from the list above
amine_data = df_summary[df_summary['name'] == target_amine]
selected_temp = 313.15  # Replace with desired temperature from the list
selected_conc = 2    # Replace with desired concentration from the list

# Show all available combinations for this amine
print(f"\nAll available temperature-concentration combinations for {target_amine}:")
temp_conc_combinations = amine_data[['temp', 'conc']].drop_duplicates().sort_values(['temp', 'conc'])
for _, row in temp_conc_combinations.iterrows():
    count = len(amine_data[(amine_data['temp'] == row['temp']) & (amine_data['conc'] == row['conc'])])
    print(f"  T={float(row['temp'])} K, C={float(row['conc'])} M ({count} data points)")

# Filter experimental data for the selected conditions.
# The hand-typed selection is matched with a tolerance (np.isclose defaults)
# rather than exact ==, so that a stored value carrying floating-point rounding
# (e.g. from a unit conversion or a float32 round trip) is not silently dropped.
temp_match = np.isclose(amine_data['temp'].to_numpy(dtype=float), selected_temp)
conc_match = np.isclose(amine_data['conc'].to_numpy(dtype=float), selected_conc)
exp_points = amine_data[temp_match & conc_match]

print(f"\nExperimental points for {target_amine} at T={selected_temp} K, C={selected_conc} M: {len(exp_points)}")
if len(exp_points) > 0:
    print("pCO2 values:")
    print(f"  {[float(p) for p in sorted(exp_points['pco2'].unique())]}")

In [None]:
import numpy as np
import torch
import matplotlib.pyplot as plt

# Use the conditions determined from data organization above
temp = selected_temp  # From your previous analysis
conc = selected_conc  # From your previous analysis

# Create pCO2 grid based on experimental data range + extra coverage.
# A log-spaced grid needs a strictly positive lower bound, so that bound is
# derived from the smallest positive experimental pCO2.
exp_pco2_column = exp_points['pco2']
positive_pco2 = exp_pco2_column[exp_pco2_column > 0]

if len(positive_pco2) > 0:
    exp_pco2_min = exp_pco2_column.min()
    exp_pco2_max = exp_pco2_column.max()

    # Extend the range by 20% on each side
    pco2_min = positive_pco2.min() * 0.8
    pco2_max = exp_pco2_max * 1.2

    # Create log-spaced grid that covers the experimental points
    pco2_values = np.logspace(np.log10(pco2_min), np.log10(pco2_max), 200)

    print(f"Experimental pCO2 range: {exp_pco2_min:.2e} - {exp_pco2_max:.2e}")
    print(f"Prediction pCO2 range: {pco2_min:.2e} - {pco2_max:.2e}")
else:
    # Fallback to the default range. The limits are assigned explicitly so the
    # axis limits below never pick up stale values left by an earlier run.
    pco2_min, pco2_max = 1e-2, 1e4
    pco2_values = np.logspace(-2, 4, 200)
    if len(exp_points) > 0:
        print("No positive experimental pCO2 values - using default pCO2 range: 1e-2 to 1e4")
    else:
        print("No experimental data - using default pCO2 range: 1e-2 to 1e4")

# Use the already filtered experimental points from above
current_amine = target_amine

# Extract experimental pco2 and aco2 values
exp_pco2 = exp_points['pco2'].tolist()
exp_aco2 = exp_points['aco2'].tolist()

# Find the graph object for current_amine once; it does not depend on the
# model or on the grid point.
graph = next((g for g in unique_graph_list if g['name'] == current_amine), None)
if graph is None:
    raise ValueError(f"No graph named {current_amine!r} found in unique_graph_list")

for model_name, model_list in models.items():  # models = {"Baseline": [...], "PINN": [...]}
    all_preds = []

    # Collect predictions from all models in the ensemble
    for model, scaler in model_list:
        preds = []
        for pco2 in pco2_values:
            # Fresh clone per grid point so the shared graph is never modified
            g = graph.clone()
            g.temp = torch.tensor([temp], dtype=torch.float)
            g.conc = torch.tensor([conc], dtype=torch.float)
            g.pco2 = torch.tensor([pco2], dtype=torch.float)
            g = scaler(g).to(device)
            with torch.no_grad():
                pred = model(g).cpu().numpy().flatten()
            preds.append(pred)
        all_preds.append(np.array(preds).flatten())

    all_preds = np.array(all_preds)  # shape = (num_models, num_points)
    mean_pred = all_preds.mean(axis=0)
    std_pred = all_preds.std(axis=0)

    # Plot with experimental data
    plt.figure(figsize=(8, 6))
    plt.plot(pco2_values, mean_pred, color='blue', label=f'{model_name} Mean')
    plt.fill_between(pco2_values, mean_pred - std_pred, mean_pred + std_pred,
                     color='blue', alpha=0.3, label='±1 std')

    # Add experimental points
    if exp_pco2:
        plt.scatter(exp_pco2, exp_aco2, color='red', s=50, zorder=5,
                    label='Experimental data', edgecolors='black', linewidth=0.5)

    plt.xlabel('pCO2')
    plt.ylabel('αCO2')
    plt.xscale('log')
    plt.xlim(pco2_min, pco2_max)
    plt.title(f'{current_amine}, {conc:.1f} M, {temp:.1f} K - {model_name}')
    plt.legend()
    plt.show()

    print(f"{model_name} ensemble mean ±1 std calculated from {len(model_list)} models")
    if exp_pco2:
        print(f"Experimental points plotted: {len(exp_pco2)} at {temp:.1f}K, {conc:.1f}M")
    else:
        print(f"No experimental data found for {current_amine} at these conditions")

In [None]:
import numpy as np
import torch
import matplotlib.pyplot as plt

# Use the conditions determined from data organization above
temp = selected_temp  # From your previous analysis
conc = selected_conc  # From your previous analysis

# Create pCO2 grid based on experimental data range + extra coverage
if len(exp_points) > 0:
    exp_pco2_min = exp_points['pco2'].min()
    exp_pco2_max = exp_points['pco2'].max()

    # Extend the range by 20% on each side
    pco2_min = exp_pco2_min * 0.8
    pco2_max = exp_pco2_max * 1.2

    # Linearly spaced grid from 0 up to the upper plot limit, so the predicted
    # curves span exactly the plotted x-range (a fixed upper bound would stop
    # short of, or run past, the experimental data).
    pco2_values = np.linspace(0, pco2_max, 200)

    print(f"Experimental pCO2 range: {exp_pco2_min:.2e} - {exp_pco2_max:.2e}")
    print(f"Prediction pCO2 range: {pco2_values[0]:.2e} - {pco2_values[-1]:.2e}")
else:
    # Fallback to the default range. The limits are assigned explicitly so the
    # axis limit below never picks up a stale value left by an earlier run.
    pco2_min, pco2_max = 1e-2, 1e4
    pco2_values = np.logspace(-2, 4, 200)
    print("No experimental data - using default pCO2 range: 1e-2 to 1e4")

# Extract experimental data once
current_amine = target_amine
exp_pco2 = exp_points['pco2'].tolist() if len(exp_points) > 0 else []
exp_aco2 = exp_points['aco2'].tolist() if len(exp_points) > 0 else []

# Find the graph object for current_amine once; it does not depend on the
# model or on the grid point.
graph = next((g for g in unique_graph_list if g['name'] == current_amine), None)
if graph is None:
    raise ValueError(f"No graph named {current_amine!r} found in unique_graph_list")

# Create single figure for both models
plt.figure(figsize=(10, 7))

# Define colors for each model type (any other name falls back to black)
colors = {'Baseline': 'blue', 'PINN': 'red'}

# Per-model ensemble mean / std, kept for the summary that follows the plot
model_predictions = {}

for model_name, model_list in models.items():  # models = {"Baseline": [...], "PINN": [...]}
    all_preds = []

    # Collect predictions from all models in the ensemble
    for model, scaler in model_list:
        preds = []
        for pco2 in pco2_values:
            # Fresh clone per grid point so the shared graph is never modified
            g = graph.clone()
            g.temp = torch.tensor([temp], dtype=torch.float)
            g.conc = torch.tensor([conc], dtype=torch.float)
            g.pco2 = torch.tensor([pco2], dtype=torch.float)
            g = scaler(g).to(device)
            with torch.no_grad():
                pred = model(g).cpu().numpy().flatten()
            preds.append(pred)
        all_preds.append(np.array(preds).flatten())

    all_preds = np.array(all_preds)  # shape = (num_models, num_points)
    mean_pred = all_preds.mean(axis=0)
    std_pred = all_preds.std(axis=0)

    # Store predictions for this model type
    model_predictions[model_name] = {
        'mean': mean_pred,
        'std': std_pred,
        'num_models': len(model_list)
    }

    # Plot mean prediction line
    color = colors.get(model_name, 'black')
    plt.plot(pco2_values, mean_pred, color=color, label=f'{model_name} Mean', linewidth=2)

    # Plot uncertainty band
    plt.fill_between(pco2_values, mean_pred - std_pred, mean_pred + std_pred,
                     color=color, alpha=0.2, label=f'{model_name} ±1 std')

    print(f"{model_name} ensemble mean ±1 std calculated from {len(model_list)} models")

# Add experimental points (only once, after both models are plotted)
if exp_pco2:
    plt.scatter(exp_pco2, exp_aco2, color='black', s=60, zorder=10,
                label='Experimental data', edgecolors='white', linewidth=1)
    print(f"Experimental points plotted: {len(exp_pco2)} at {temp:.1f}K, {conc:.1f}M")
else:
    print(f"No experimental data found for {current_amine} at these conditions")

# Formatting
plt.xlabel('pCO2', fontsize=12)
plt.ylabel('αCO2', fontsize=12)
plt.xscale('linear')
plt.xlim(0, pco2_max)
plt.title(f'{current_amine}, {conc:.1f} M, {temp:.1f} K - Model Comparison', fontsize=14)
plt.legend(loc='best', fontsize=10)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Print summary statistics for every model family stored in model_predictions
print("\n" + "="*50)
print("MODEL COMPARISON SUMMARY")
print("="*50)
for model_name, pred_data in model_predictions.items():
    mean_values = pred_data['mean']
    print(f"{model_name}:")
    print(f"  - Ensemble size: {pred_data['num_models']} models")
    print(f"  - αCO2 range: {mean_values.min():.4f} - {mean_values.max():.4f}")
    print(f"  - Mean uncertainty (std): {pred_data['std'].mean():.4f}")

# If both models exist, calculate and show differences
if 'Baseline' in model_predictions and 'PINN' in model_predictions:
    baseline_mean = model_predictions['Baseline']['mean']
    pinn_mean = model_predictions['PINN']['mean']

    abs_diff = np.abs(baseline_mean - pinn_mean)
    # Relative to the magnitude of the baseline prediction. Taking the absolute
    # value first keeps a negative baseline value from being clamped to 1e-10,
    # which would blow the percentage up; the floor only guards against
    # division by zero.
    rel_diff = abs_diff / np.maximum(np.abs(baseline_mean), 1e-10) * 100

    print("\nModel Differences:")
    print(f"  - Maximum absolute difference: {abs_diff.max():.4f}")
    print(f"  - Mean absolute difference: {abs_diff.mean():.4f}")
    print(f"  - Maximum relative difference: {rel_diff.max():.2f}%")
    print(f"  - Mean relative difference: {rel_diff.mean():.2f}%")