# Run Sample Configuration

In [None]:
import pandas as pd

from main import main

val_best_params_file = "global_best_params.csv"

# Columns present in the best-params CSV that are not passed on to `main`.
unused_columns = ["mean_val_loss", "std_val_loss", "source_file", "mae_test_scaffold"]

# Load the parameter table in one chain:
#   * radius arrives as float, so cast it to the nullable integer dtype
#   * drop the columns listed above
df = (
    pd.read_csv(val_best_params_file)
    .astype({"radius": "Int64"})
    .drop(columns=unused_columns)
)

# One dict per CSV row, in file order.
dic = df.to_dict(orient="records")

# Row of the parameter table used for this sample run.
sample_index = 50
sample_config = dic[sample_index]
print(sample_config)

# Run `main` with the selected configuration.
main(sample_config)


{'task': 'potency', 'target_task': 'pIC50 (SARS-CoV-2 Mpro)', 'batch_size': 64, 'dropout': 0.1, 'epochs': 100, 'lr': 0.001, 'weight_decay': 0.0001, 'repr_model': 'HIMP', 'num_layers': 3, 'encoding_dim': 8, 'hidden_channels': 16, 'out_channels': 16, 'proj_hidden_dim': 32, 'out_dim': 1, 'num_cv_folds': 5, 'num_cv_bins': 10, 'scaffold_split_val_sz': 0.1, 'use_erg': False, 'use_jt': False, 'jt_coarsity': 1, 'rg_embedding_dim': 8, 'radius': None}


Processing...
Done!
Processing...
Done!


Validation losses: [0.4443412125110626, 0.47676292061805725, 0.5799012581507365, 0.46699686845143634, 0.5134066939353943]
Average validation loss: 0.4962817907333374
Mean absolute error for pIC50 (SARS-CoV-2 Mpro) on test_scaffold: 0.367


# Evaluation of Results

In [17]:
import glob
import os

import pandas as pd

# Folder with the result CSV files to evaluate.
folder_path = "results/global_best_params"

# glob returns matches in arbitrary (filesystem-dependent) order; sort them so
# the concatenated row order is reproducible across runs and machines.
csv_files = sorted(glob.glob(os.path.join(folder_path, "*.csv")))
if not csv_files:
    # Without this guard pd.concat fails with an opaque
    # "No objects to concatenate" error that hides the real cause.
    raise FileNotFoundError(f"No CSV files found in {folder_path!r}")

# Stack all result files into a single frame with a fresh 0..n-1 index.
df = pd.concat((pd.read_csv(f) for f in csv_files), ignore_index=True)

# display(df[["target_task", "repr_model", "seed", "mae_test_scaffold"]])

# Columns that are NOT part of a row's identity: two rows that differ only in
# these columns are treated as the same configuration and pooled together.
discard = {"seed", "mean_val_loss", "std_val_loss", "mae_test_scaffold"}
group_cols = [col for col in df.columns if col not in discard]

# Pool rows per configuration and summarise mae_test_scaffold by its mean and
# standard deviation. dropna=False keeps configurations whose key columns
# contain missing values instead of silently dropping them.
df = (
    df.groupby(group_cols, dropna=False)["mae_test_scaffold"]
    .agg(mae_test_scaffold_mean="mean", mae_test_scaffold_std="std")
    .reset_index()
)

# display(df[["target_task", "repr_model", "mae_test_scaffold_mean", "mae_test_scaffold_std"]])

# Representation models that are reported together under a single "GNN" label.
gnn_models = ["GIN", "GCN", "GraphSAGE", "GAT"]

# Partition the aggregated rows into GNN variants and everything else.
is_gnn = df["repr_model"].isin(gnn_models)
df_gnn = df[is_gnn]
df_other = df[~is_gnn]

# For every target_task keep only the GNN row with the lowest mean test MAE,
# then relabel that winner as the generic "GNN" entry.
best_idx = df_gnn.groupby("target_task")["mae_test_scaffold_mean"].idxmin()
df_gnn_best = df_gnn.loc[best_idx].assign(repr_model="GNN")

# Reassemble: best GNN rows first, followed by the untouched non-GNN rows.
df = pd.concat([df_gnn_best, df_other]).reset_index(drop=True)


# Human-readable "mean ± std" cell text for the final table.
# Both numbers are formatted with a fixed three decimals: rounding and then
# converting to str drops trailing zeros (e.g. "0.42 ± 0.023", "2.832 ± 0.07"),
# which makes the table columns inconsistent. NaN values still render as "nan".
df["mae_test_scaffold"] = [
    f"{mean:.3f} ± {std:.3f}"
    for mean, std in zip(df["mae_test_scaffold_mean"], df["mae_test_scaffold_std"])
]

# display(df[["target_task", "repr_model", "mae_test_scaffold"]])

# Left-to-right order of the task columns in the final table.
column_order = [
    "HLM",
    "KSOL",
    "LogD",
    "MDR1-MDCKII",
    "MLM",
    "pIC50 (MERS-CoV Mpro)",
    "pIC50 (SARS-CoV-2 Mpro)",
    "ESOL",
    "FreeSolv",
    "Lipo",
]

# One row per representation model, one column per task. The cell values are
# strings, so "first" simply picks the first entry for each (model, task) pair.
# Reindexing fixes the column order; tasks absent from the data show up as
# all-NaN columns and tasks not listed above are left out.
pivot = df.pivot_table(
    values="mae_test_scaffold",
    index="repr_model",
    columns="target_task",
    aggfunc="first",
).reindex(columns=column_order)

pivot

target_task,HLM,KSOL,LogD,MDR1-MDCKII,MLM,pIC50 (MERS-CoV Mpro),pIC50 (SARS-CoV-2 Mpro),ESOL,FreeSolv,Lipo
repr_model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ECFP,0.476 ± 0.015,0.377 ± 0.024,0.725 ± 0.014,0.364 ± 0.018,0.545 ± 0.017,0.753 ± 0.011,0.519 ± 0.023,1.161 ± 0.053,2.832 ± 0.07,0.731 ± 0.024
GNN,0.536 ± 0.05,0.455 ± 0.079,0.674 ± 0.03,0.386 ± 0.026,0.562 ± 0.046,0.712 ± 0.016,0.42 ± 0.023,0.705 ± 0.035,1.615 ± 0.139,0.523 ± 0.015
HIMP,0.492 ± 0.042,0.344 ± 0.059,0.809 ± 0.073,0.331 ± 0.025,0.567 ± 0.064,0.638 ± 0.031,0.412 ± 0.058,0.793 ± 0.072,1.819 ± 0.105,0.536 ± 0.023
HOIMP,0.485 ± 0.069,0.344 ± 0.046,0.687 ± 0.055,0.319 ± 0.015,0.556 ± 0.057,0.68 ± 0.043,0.419 ± 0.051,0.866 ± 0.105,1.93 ± 0.245,0.525 ± 0.011
