# Model Selection
This notebook selects the best hyperparameter configurations for the DGN models.

# Imports

In [2]:
import pandas as pd
import pickle
from pathlib import Path
import wandb
import numpy as np

# Load data from wandb

In [None]:
api = wandb.Api()

def get_project_df(project: str, out_path: Path = None) -> pd.DataFrame:
    """Collect every run of a W&B project into one flat DataFrame.

    Each run contributes its name, its hyperparameters (config entries whose
    key does not start with ``_``) and its summary metrics, each expanded
    into its own column. Columns whose name contains ``parameters/`` or
    ``gradients/`` are removed, and the ``hold_out_by`` values
    ``random`` / ``protein`` / ``model`` are renamed to ``UC1`` / ``UC2`` /
    ``UC3``.

    Args:
        project: W&B project path passed to ``api.runs``.
        out_path: when given, the resulting table is also written there as CSV
            (without the index).

    Returns:
        One row per run.
    """
    summaries, configs, names = [], [], []
    for run in api.runs(project):
        # `_json_dict` exposes the summary metrics while omitting large files.
        summaries.append(run.summary._json_dict)

        # Keep the hyperparameters, skipping the special keys that start with "_".
        public_config = {}
        for key, value in run.config.items():
            if key.startswith('_'):
                continue
            public_config[key] = value
        configs.append(public_config)

        # Human-readable run name.
        names.append(run.name)

    runs_df = pd.DataFrame({"summary": summaries, "config": configs, "name": names})

    # Expand the dict-valued columns into one column per key: config first,
    # then summary, so the final order is name, config keys, summary keys.
    for nested in ("config", "summary"):
        expanded = runs_df[nested].apply(pd.Series)
        runs_df = pd.concat([runs_df.drop(columns=[nested]), expanded], axis=1)

    # Drop all the "parameters/..." and "gradients/..." columns.
    logged_tensors = [
        col for col in runs_df.columns
        if "parameters/" in col or "gradients/" in col
    ]
    runs_df = runs_df.drop(columns=logged_tensors)

    use_case_names = {'model': 'UC3', 'protein': 'UC2', 'random': 'UC1'}
    runs_df['hold_out_by'] = runs_df['hold_out_by'].replace(use_case_names)

    if out_path:
        runs_df.to_csv(out_path, index=False)
    return runs_df

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

In [3]:
# Fetch every run of the W&B project and cache the flattened table as a CSV file.
# NOTE(review): this cell writes "results_final.csv" but the "Load runs data" cell
# reads "results.csv" — confirm which file name is intended.
results_df = get_project_df("tesi-gnn/peppina-final", out_path=Path("results_final.csv"))

In [None]:
# all_runs_df = get_project_df("tesi-gnn/peppina-final", out_path=Path("all_runs.csv"))

# Load runs data

In [5]:
# Load the runs table from the local CSV file. low_memory=False makes pandas
# infer each column's dtype from the whole file instead of chunk by chunk.
results_df = pd.read_csv("results.csv", low_memory=False)

In [6]:
# `all_runs_df` must be defined in this notebook, otherwise the cell only works
# with a variable left over in the kernel (Restart & Run All fails with NameError).
# NOTE(review): "all_runs.csv" is the file name used by the earlier version of
# this cell and by the commented-out export above — confirm it is the intended source.
all_runs_df = pd.read_csv("all_runs.csv", low_memory=False)

# DeepSets runs are the baselines. `.copy()` makes this an independent frame,
# so later column assignments do not raise SettingWithCopyWarning.
baselines_df = all_runs_df[all_runs_df["model"] == "deepsets"].copy()

  all_runs_df = pd.read_csv("all_runs.csv")


In [43]:
# Store the feature-set identifier as a string in both tables so that it can be
# compared against "0" / "128" / "onehot".
# `assign` returns a new frame instead of writing into what may be a slice of
# another DataFrame, which is what triggered SettingWithCopyWarning.
baselines_df = baselines_df.assign(embeddings_len=baselines_df["embeddings_len"].apply(str))
results_df = results_df.assign(embeddings_len=results_df["embeddings_len"].apply(str))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  baselines_df.embeddings_len = baselines_df.embeddings_len.apply(str)


In [44]:
# Metric names logged for every data split.
metrics = ["acc", "auroc", "f1", "mcc"]

# One column per (split, metric) pair, e.g. "train_acc", ..., "val_mcc".
metrics_cols = [
    "_".join((split, metric_name))
    for split in ("train", "test", "val")
    for metric_name in metrics
]

# Hyperparameters that together identify a model configuration.
config_cols = [
    'model',
    'conv',
    'layers',
    'hidden_dim',
    'batch_size',
    'lr',
    'pool_from',
    'weight_decay',
    'warmup_steps',
    'undirected',
    'dirgnn_alpha',
    'dirgnn_conv',
    'es_eps',
    'patience',
    'weight_initializer',
    'aggr',
    'weighted_sampler',
    'uniform_bound',
    'dropout',
]

In [45]:
# Keep only the runs with biogrid_ver "2024-10", then drop every row (run)
# that has a NaN in any of the metric columns.
results_df = (
    results_df
    .loc[lambda runs: runs.biogrid_ver == "2024-10"]
    .dropna(axis=0, how='any', subset=metrics_cols)
)

In [46]:
# Runs of the DGN model (logged with model == "gcn"). `.copy()` detaches the
# selection from `results_df`, so assigning columns on `gnn_df` later on does
# not raise SettingWithCopyWarning.
gnn_df = results_df[results_df["model"] == "gcn"].copy()

In [47]:
# Make sure the feature-set identifier is a string. `results_df` is a filtered
# selection at this point, so build a new frame with `assign` instead of
# assigning an attribute on it (which can raise SettingWithCopyWarning).
results_df = results_df.assign(embeddings_len=results_df["embeddings_len"].apply(str))

In [None]:
def select_bests(df, selection_metric='val_f1', features=("0","128","onehot")):
    """Pick the best configuration for every use case, feature set and test fold.

    For each ``hold_out_by`` in UC1/UC2/UC3, each ``embeddings_len`` in
    ``features`` and each ``test_fold`` in 0..3, the matching runs are grouped
    by configuration (``config_cols`` + ``embeddings_len`` + ``hold_out_by``),
    aggregated with ``max`` and the group with the highest
    ``selection_metric`` is kept.

    Relies on the module-level lists ``metrics_cols`` and ``config_cols``.

    Args:
        df: runs table containing the configuration, metric, ``name``,
            ``hold_out_by``, ``embeddings_len`` and ``test_fold`` columns.
        selection_metric: column used to rank the configurations.
        features: values of ``embeddings_len`` to consider.

    Returns:
        DataFrame with one row per (hold_out_by, embeddings_len, test_fold)
        combination that had at least one run.

    Raises:
        KeyError: if ``df`` lacks one of the required columns (previously this
            was swallowed and reported as "no results").
        ValueError: if no combination had any run.
    """
    group_cols = config_cols + ['embeddings_len', 'hold_out_by']
    keep_cols = metrics_cols + config_cols + ['embeddings_len', 'hold_out_by', 'name', 'test_fold']

    all_the_bests = []
    for hold_out_by in ["UC1","UC2","UC3"]:
        for embeddings_len in features:
            best_per_fold = []
            for fold in range(4):
                in_fold = (
                    (df.hold_out_by == hold_out_by)
                    & (df.embeddings_len == embeddings_len)
                    & (df.test_fold == fold)
                )
                candidates = df.loc[in_fold, keep_cols]
                # An empty selection is the only expected "failure": report it
                # explicitly instead of hiding every exception behind a bare except.
                if candidates.empty:
                    print(f"no results for {hold_out_by}, {embeddings_len}, {fold}")
                    continue

                # NOTE(review): `max` is taken column by column, so if several
                # runs share one configuration within a fold, the metrics and
                # `name` of a row may come from different runs — confirm that
                # configurations are unique per fold.
                best = (
                    candidates
                    .groupby(group_cols, dropna=False)  # keep configs with NaN hyperparameters
                    .agg('max')
                    .sort_values(selection_metric, ascending=False)
                    .reset_index()
                    .loc[0]
                )
                best_per_fold.append(best)

            if best_per_fold:
                # Each `best` is a Series; stack them as rows.
                all_the_bests.append(pd.concat(best_per_fold, axis=1).T)

    if not all_the_bests:
        raise ValueError("no runs matched any (hold_out_by, embeddings_len, test_fold) combination")
    return pd.concat(all_the_bests, axis=0)

In [50]:
def format_value(s):
    """Format a number with two decimals (e.g. 3.14159 -> "3.14")."""
    return f"{s:.2f}"

# Metrics shown in the exported tables, in column order.
table_metrics=["acc","f1","auroc","mcc"]

def format_metrics(df, metrics=table_metrics, sets=['test']):
    r"""Build a LaTeX-ready table with "mean$\pm$std" percentage columns.

    Args:
        df: aggregated frame with two-level columns: the identifier columns
            ``model``, ``hold_out_by`` and ``embeddings_len`` plus, for every
            requested ``{set}_{metric}``, the sub-columns ``mean`` and ``std``.
        metrics: metric names to format (default: ``table_metrics``).
        sets: data splits to format.

    Returns:
        A new DataFrame with flat columns ``Model``, ``Hold-out by``,
        ``Embeddings length`` and one ``"{set} {METRIC} (\%)"`` column per
        requested metric. ``df`` itself is left untouched.
    """
    # Work on a copy: the caller's frame must not gain or lose columns.
    out = df.copy()
    formatted_metrics = []
    for m in metrics:
        for s in sets:
            # Raw strings keep the backslashes LaTeX needs without relying on
            # invalid escape sequences ("\%", "\p").
            label = rf"{s} {m.upper()} (\%)"
            mean_pct = (out[(f"{s}_{m}", "mean")] * 100).apply(format_value)
            std_pct = (out[(f"{s}_{m}", "std")] * 100).apply(format_value)
            out[label] = mean_pct + r"$\pm$" + std_pct
            formatted_metrics.append(label)

    # Selecting the wanted columns also discards the raw mean/std columns;
    # `.copy()` makes the result an independent frame that is safe to modify.
    out = out[["model","hold_out_by","embeddings_len"] + formatted_metrics].copy()

    # rename columns to be less pythonish
    out.columns = ['Model','Hold-out by','Embeddings length'] + formatted_metrics

    out['Model'] = out['Model'].replace({"gcn":"DGN","deepsets":"DeepSets"})
    return out

In [None]:
# Best DeepSets run for every (hold-out scheme, feature set, test fold), ranked
# on validation F1; the column list puts identifiers first, then the
# hyperparameters, then the metrics.
baselines_perf = select_bests(baselines_df, 'val_f1')[['hold_out_by','embeddings_len','name','test_fold']+config_cols+metrics_cols]

In [None]:
# Mean / std / count over the test folds of the selected DeepSets runs,
# restricted to the test metrics and formatted for display.
baselines_perf_disp = (
    baselines_perf[metrics_cols+['hold_out_by','embeddings_len','test_fold','model']]
    .groupby(['hold_out_by','embeddings_len','model'])
    .agg(['mean','std','count'])
    .reset_index()
    [['hold_out_by','embeddings_len','model','test_acc','test_f1','test_auroc','test_mcc']]
    .pipe(format_metrics)
)

baselines_perf_disp['Model'] = "DeepSets"
baselines_perf_disp

  df.drop(columns=f"{s}_{m}", inplace=True)
  df.drop(columns=f"{s}_{m}", inplace=True)
  df.drop(columns=f"{s}_{m}", inplace=True)
  df.drop(columns=f"{s}_{m}", inplace=True)


Unnamed: 0,Model,Hold-out by,Embeddings length,test ACC (\%),test F1 (\%),test AUROC (\%),test MCC (\%)
0,DeepSets,UC1,0,57.42$\pm$16.71,28.36$\pm$14.00,65.01$\pm$0.40,6.16$\pm$4.32
1,DeepSets,UC1,128,76.35$\pm$0.36,64.72$\pm$0.38,81.75$\pm$0.24,47.18$\pm$0.58
2,DeepSets,UC1,onehot,77.77$\pm$0.72,66.10$\pm$0.81,83.37$\pm$0.30,49.78$\pm$0.92
3,DeepSets,UC2,0,57.22$\pm$16.88,21.83$\pm$19.95,56.01$\pm$14.20,2.12$\pm$3.61
4,DeepSets,UC2,128,74.09$\pm$1.30,60.49$\pm$3.84,78.30$\pm$3.27,41.50$\pm$3.12
5,DeepSets,UC2,onehot,75.90$\pm$1.53,59.48$\pm$4.70,79.39$\pm$2.65,42.74$\pm$3.19
6,DeepSets,UC3,0,56.49$\pm$17.39,25.88$\pm$16.91,63.38$\pm$3.70,4.85$\pm$3.71
7,DeepSets,UC3,128,63.70$\pm$4.68,31.99$\pm$18.61,55.37$\pm$8.71,10.39$\pm$9.35
8,DeepSets,UC3,onehot,67.73$\pm$2.75,28.43$\pm$10.75,62.18$\pm$4.20,14.59$\pm$7.57


In [None]:
# Metric used to rank the DGN configurations.
metric = 'val_f1'

# Cast the hold-out labels to strings *before* selecting: the cast used to run
# after `select_bests`, too late to affect the selection it was meant to
# prepare. `assign` builds a new frame, avoiding attribute assignment on a
# slice of `results_df`.
gnn_df = gnn_df.assign(hold_out_by=gnn_df["hold_out_by"].apply(str))

# Best DGN run for every (hold-out scheme, feature set, test fold).
gnn_perf = select_bests(gnn_df, metric)

In [54]:
# Show every column when displaying wide frames.
pd.set_option("display.max_columns", None)

# Mean train / validation / test F1 of the selected DGN runs, averaged over folds.
(
    gnn_perf[metrics_cols+['hold_out_by','embeddings_len','test_fold']]
    .groupby(['hold_out_by','embeddings_len'])
    .agg(['mean'])
    .reset_index()
    [['hold_out_by','embeddings_len','train_f1','val_f1','test_f1']]
)


Unnamed: 0_level_0,hold_out_by,embeddings_len,train_f1,val_f1,test_f1
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,mean,mean
0,UC1,0,0.819709,0.747035,0.750113
1,UC1,128,0.871954,0.830135,0.831219
2,UC1,onehot,0.899549,0.839345,0.841645
3,UC2,0,0.751345,0.645942,0.604067
4,UC2,128,0.831149,0.698766,0.666694
5,UC2,onehot,0.860387,0.717582,0.690009
6,UC3,0,0.754066,0.606494,0.527726
7,UC3,128,0.789202,0.632069,0.515358
8,UC3,onehot,0.836639,0.62763,0.48568


In [55]:
# Per-fold metrics of the selected DGN runs: one row per hold-out scheme,
# feature set and test fold.
gnn_perf[metrics_cols+['hold_out_by','embeddings_len','test_fold']]

Unnamed: 0,train_acc,train_auroc,train_f1,train_mcc,test_acc,test_auroc,test_f1,test_mcc,val_acc,val_auroc,val_f1,val_mcc,hold_out_by,embeddings_len,test_fold
0,0.874023,0.946074,0.80245,0.710485,0.832285,0.907119,0.746607,0.621166,0.840062,0.908387,0.752106,0.634066,UC1,0,0
0,0.849854,0.931418,0.852765,0.70084,0.819198,0.902227,0.746751,0.613799,0.813276,0.895923,0.74374,0.608996,UC1,0,1
0,0.862429,0.929304,0.776235,0.677211,0.840867,0.907519,0.748672,0.632695,0.836568,0.905222,0.743761,0.624013,UC1,0,2
0,0.843994,0.926291,0.847385,0.688117,0.826148,0.912407,0.758421,0.631464,0.817158,0.90577,0.748532,0.616407,UC1,0,3
0,0.917725,0.976603,0.872009,0.811558,0.890519,0.949871,0.82746,0.748195,0.897904,0.952826,0.837954,0.764138,UC1,128,0
0,0.91748,0.97838,0.867347,0.808339,0.889562,0.955801,0.833485,0.751013,0.884705,0.951345,0.828423,0.742894,UC1,128,1
0,0.920304,0.97766,0.879484,0.820621,0.888164,0.955966,0.829853,0.746664,0.889752,0.954485,0.833918,0.751936,UC1,128,2
0,0.917496,0.976594,0.868976,0.810458,0.889769,0.952746,0.834077,0.751797,0.880047,0.947117,0.820244,0.731014,UC1,128,3
0,0.935791,0.987526,0.897864,0.851046,0.899138,0.951327,0.846006,0.770973,0.893245,0.949571,0.835821,0.756746,UC1,onehot,0
0,0.933112,0.98534,0.898342,0.848514,0.893523,0.952755,0.840858,0.761334,0.887034,0.945434,0.833238,0.749744,UC1,onehot,1


# Tables export

In [59]:
# Output folder for the exported LaTeX tables. The trailing slash matters:
# file names are appended to this string by plain concatenation.
tables_path = "tables/"
# Create the folder if needed; exist_ok=True makes re-running the cell harmless.
Path(tables_path).mkdir(exist_ok=True)

In [60]:
# Mean / std / count over the test folds of the selected DGN runs, restricted
# to the test metrics, labelled as "DGN" and formatted for display.
gnn_perf_disp = (
    gnn_perf[metrics_cols+['hold_out_by','embeddings_len','test_fold']]
    .groupby(['hold_out_by','embeddings_len'])
    .agg(['mean','std','count'])
    .reset_index()
    [['hold_out_by','embeddings_len']+[c for c in metrics_cols if 'test' in c]]
    .assign(model="DGN")
    .pipe(format_metrics)
)
gnn_perf_disp

  df.drop(columns=f"{s}_{m}", inplace=True)
  df.drop(columns=f"{s}_{m}", inplace=True)
  df.drop(columns=f"{s}_{m}", inplace=True)
  df.drop(columns=f"{s}_{m}", inplace=True)


Unnamed: 0,Model,Hold-out by,Embeddings length,test ACC (\%),test F1 (\%),test AUROC (\%),test MCC (\%)
0,DGN,UC1,0,82.96$\pm$0.92,75.01$\pm$0.56,90.73$\pm$0.42,62.48$\pm$0.90
1,DGN,UC1,128,88.95$\pm$0.10,83.12$\pm$0.31,95.36$\pm$0.29,74.94$\pm$0.24
2,DGN,UC1,onehot,89.57$\pm$0.40,84.16$\pm$0.51,95.22$\pm$0.06,76.41$\pm$0.81
3,DGN,UC2,0,72.80$\pm$5.22,60.41$\pm$4.32,79.07$\pm$3.85,40.41$\pm$7.68
4,DGN,UC2,128,78.81$\pm$1.89,66.67$\pm$3.20,84.40$\pm$2.52,51.24$\pm$4.05
5,DGN,UC2,onehot,80.09$\pm$1.78,69.00$\pm$3.89,85.14$\pm$1.35,54.32$\pm$4.33
6,DGN,UC3,0,58.10$\pm$9.82,52.77$\pm$2.90,63.13$\pm$8.60,23.03$\pm$7.42
7,DGN,UC3,128,68.18$\pm$2.99,51.54$\pm$4.01,65.72$\pm$4.14,27.71$\pm$5.78
8,DGN,UC3,onehot,67.70$\pm$2.73,48.57$\pm$4.72,65.09$\pm$4.88,25.33$\pm$5.76


In [61]:
# Run name and hyperparameters of every selected DGN run (one row per
# hold-out scheme, feature set and test fold).
hyperparams = gnn_perf[['hold_out_by','embeddings_len','name']+config_cols]
hyperparams

Unnamed: 0,hold_out_by,embeddings_len,name,model,conv,layers,hidden_dim,batch_size,lr,pool_from,weight_decay,warmup_steps,undirected,dirgnn_alpha,dirgnn_conv,es_eps,patience,weight_initializer,aggr,weighted_sampler,uniform_bound,dropout
0,UC1,0,train_gcn_cec96_00000,gcn,DirGNNConv,4,512,4096,0.0005,last,1e-05,0,False,0.5,GraphConv,0.0,100,uniform,add,,,0.5
0,UC1,0,train_gcn_eb86c_00026,gcn,DirGNNConv,4,1024,4096,0.0005,last,1e-05,0,False,0.5,GraphConv,0.0,100,uniform,add,class,,0.5
0,UC1,0,train_gcn_cec96_00005,gcn,DirGNNConv,4,512,4096,0.0005,last,1e-05,0,False,0.5,GraphConv,0.0,100,uniform,mean,,,0.5
0,UC1,0,train_gcn_eb86c_00059,gcn,DirGNNConv,4,1024,4096,0.0005,last,1e-05,0,False,0.5,GraphConv,0.0,100,uniform,mean,class,,0.5
0,UC1,128,train_gcn_499a3_00011,gcn,DirGNNConv,6,512,4096,0.0005,all,0.0001,0,False,0.5,GraphConv,0.0,100,uniform,mean,,,0.5
0,UC1,128,train_gcn_14f2a_00003,gcn,DirGNNConv,4,512,4096,0.0005,last,1e-05,0,False,0.5,GraphConv,0.0,100,kaiming_uniform,mean,,,0.5
0,UC1,128,train_gcn_14f2a_00012,gcn,DirGNNConv,4,512,4096,0.0005,last,1e-05,0,False,0.5,GraphConv,0.0,100,uniform,add,,,0.5
0,UC1,128,train_gcn_14f2a_00006,gcn,DirGNNConv,4,512,4096,0.0005,last,1e-05,0,False,0.5,GraphConv,0.0,100,kaiming_uniform,add,,,0.5
0,UC1,onehot,train_gcn_64d90_00003,gcn,DirGNNConv,4,512,4096,0.0005,last,1e-05,0,False,0.5,GraphConv,0.0,100,uniform,mean,,,0.5
0,UC1,onehot,train_gcn_64d90_00006,gcn,DirGNNConv,4,512,4096,0.0005,last,1e-05,0,False,0.5,GraphConv,0.0,100,uniform,add,,,0.5


In [62]:
# Export the hyperparameter table as LaTeX, without the DataFrame index.
hyperparams.to_latex(tables_path+"hyperparams.tex", index=False)

In [63]:
# Map (hold-out scheme, feature set, test fold) -> name of the selected DGN run.
# `row['name']` is used on purpose: `row.name` would return the row's index label.
best_trials_gcn = {
    (row.hold_out_by, row.embeddings_len, row.test_fold): row['name']
    for _, row in gnn_perf.iterrows()
}

# Use a context manager so the file is flushed and closed even if pickling fails
# (the handle was previously opened inline and never closed).
# NOTE(review): the file name says "2024-4" while the runs are filtered on
# biogrid_ver == "2024-10" — confirm the name is intended.
with open("gcn_2024-4_best_trial_names.pkl", 'wb') as best_trials_file:
    pickle.dump(best_trials_gcn, best_trials_file)

In [65]:
# Export the DGN performance table as LaTeX, leaving out the "onehot" rows.
gnn_perf_disp[gnn_perf_disp["Embeddings length"]!='onehot'].to_latex(tables_path+"gcn_performances.tex", index=False)

In [66]:
# merge the tables by hold out
for hold_out_by in ["UC1","UC2","UC3"]:
    table = pd.concat([
        baselines_perf[baselines_perf["hold_out_by"]==hold_out_by],
        gnn_perf[gnn_perf["hold_out_by"]==hold_out_by]
    ])[['model','embeddings_len','hold_out_by']+metrics_cols].groupby(['model','embeddings_len','hold_out_by']).agg(['mean','std']).reset_index()

    # # remove the onehot related rows
    table = table[table.embeddings_len!="onehot"]
    table.embeddings_len = table.embeddings_len.replace({"0":"", "128":"+emb","onehot":"+onehot"})

    table = format_metrics(table, metrics=["f1","acc","auroc"], sets=["test"]).drop(columns=['Hold-out by'])
    # concatenate embedding len and model name
    table['Model'].replace({"deepsets":"DeepSets","gcn":"DGN"}, inplace=True)
    table['Model'] = table['Model']+table['Embeddings length']

    
    table = table.drop(columns=['Embeddings length'])
    
    table.to_latex(f"{tables_path}{hold_out_by}.tex", index=False)


  df.drop(columns=f"{s}_{m}", inplace=True)
  df.drop(columns=f"{s}_{m}", inplace=True)
  df.drop(columns=f"{s}_{m}", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.Model = df.Model.replace({"gcn":"DGN","deepsets":"DeepSets"})
  df.drop(columns=f"{s}_{m}", inplace=True)
  df.drop(columns=f"{s}_{m}", inplace=True)
  df.drop(columns=f"{s}_{m}", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.Model = df.Model.replace({"gcn":"DGN","deepsets":"DeepSets"})
  df.drop(columns=f"{s}_{m}", inplace=True)
  df.drop(columns=f"{s}_{m}", in

In [67]:
# create unique table with onehot results
table = pd.concat([
    baselines_perf[baselines_perf["embeddings_len"]=="onehot"],
    gnn_perf[gnn_perf["embeddings_len"]=="onehot"]
])[['model','hold_out_by','embeddings_len']+metrics_cols].groupby(['hold_out_by','model','embeddings_len']).agg(['mean','std']).reset_index().sort_values(by=['hold_out_by','model'])
# table.columns = table.columns.droplevel(1)

format_metrics(table, metrics=["f1","acc","auroc"], sets=["test"]).drop(columns=['Embeddings length']).to_latex(tables_path+f'onehot_performances.tex', index=False)

  df.drop(columns=f"{s}_{m}", inplace=True)
  df.drop(columns=f"{s}_{m}", inplace=True)
  df.drop(columns=f"{s}_{m}", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.Model = df.Model.replace({"gcn":"DGN","deepsets":"DeepSets"})
