# Model Selection
This notebook selects the best hyperparameters configurations for the DGN models.

# Imports

In [4]:
import pandas as pd
import pickle
from pathlib import Path
import wandb
import numpy as np
import os

# Load data from wandb

In [2]:
api = wandb.Api()

def get_project_df(project: str, out_path: Path = None) -> pd.DataFrame:
    runs = api.runs(project)
    summary_list, config_list, name_list = [], [], []
    for run in runs: 
        # .summary contains the output keys/values for metrics like accuracy.
        #  We call ._json_dict to omit large files 
        summary_list.append(run.summary._json_dict)

        # .config contains the hyperparameters.
        #  We remove special values that start with _.
        config_list.append(
            {k: v for k,v in run.config.items()
            if not k.startswith('_')})

        # .name is the human-readable name of the run.
        name_list.append(run.name)

    runs_df = pd.DataFrame({
        "summary": summary_list,
        "config": config_list,
        "name": name_list
        })
    
    # expand the config column that contains dictionaries
    runs_df = pd.concat([runs_df.drop(['config'], axis=1), runs_df['config'].apply(pd.Series)], axis=1)
    runs_df = pd.concat([runs_df.drop(['summary'], axis=1), runs_df['summary'].apply(pd.Series)], axis=1)
    # drop all the "parameters/..." columns
    runs_df = runs_df.drop(columns=[col for col in runs_df.columns if "parameters/" in col])
    runs_df = runs_df.drop(columns=[col for col in runs_df.columns if "gradients/" in col])

    runs_df['hold_out_by'] = runs_df['hold_out_by'].replace({'model':'UC3','protein':'UC2','random':'UC1'})

    if out_path:
        runs_df.to_csv(out_path, index=False)
    return runs_df

In [6]:
os.makedirs("results", exist_ok=True)

In [9]:
results_df = get_project_df("tesi-gnn/peppina-final", out_path=Path("results/results_final.csv"))

# Load runs data

In [13]:
gnn_df = pd.read_csv("results/gcn.csv", low_memory=False)

In [14]:
baselines_df = pd.read_csv("results/baselines.csv")
baselines_df = baselines_df[baselines_df["model"] == "deepsets"]

In [15]:
baselines_df.embeddings_len = baselines_df.embeddings_len.apply(str)
results_df.embeddings_len = results_df.embeddings_len.apply(str)

In [16]:
metrics = ["acc","auroc","f1","mcc"]
metrics_cols = [f"{s}_{m}" for s in ["train","test","val"] for m in metrics]

config_cols = ['model','conv', 'layers','hidden_dim', 'batch_size','lr', 'pool_from', 'weight_decay','warmup_steps','undirected','dirgnn_alpha','dirgnn_conv','es_eps','patience','weight_initializer','aggr','weighted_sampler','uniform_bound','dropout']

In [17]:
results_df.embeddings_len = results_df.embeddings_len.apply(str)

In [18]:
def select_bests(df, selection_metric='val_f1', features=["0","128","onehot"]):
    #select the best config for each fold
    baselines_table=[]
    agg_dict = {m: ['mean','std','count'] for m in metrics_cols}
    # for the trial names, concatenate them
    agg_dict['name'] = list
    all_the_bests = []
    for hold_out_by in ["UC1","UC2","UC3"]:
        for embeddings_len in features:
            best_per_fold = []
            for fold in range(4):
                try:
                    best = df[(df.hold_out_by==hold_out_by) & (df.embeddings_len==embeddings_len) & (df.test_fold==fold)][metrics_cols+config_cols+['embeddings_len','hold_out_by','name','test_fold']]\
                            .groupby(config_cols+['embeddings_len','hold_out_by'], dropna=False)\
                            .agg('max')\
                            .sort_values((selection_metric), ascending=False).reset_index().loc[0]
                    best_per_fold.append(best)
                except:
                    print(f"no results for {hold_out_by}, {embeddings_len}, {fold}")
                    
            if len(best_per_fold)>0:
                all_the_bests.append(pd.concat(best_per_fold, axis=1).T)

    return pd.concat(all_the_bests, axis=0)

In [75]:
def format_value(s):
    # format a float to have 3 decimal digits without zero before the decimal point
    return f"{s:.3f}".replace("0.",".")

table_metrics=["mcc","acc","f1","auroc"]

def format_metrics(df, metrics=table_metrics, sets=['test']):
    formatted_metrics = []
    for m in metrics:
        for s in sets:
            sorted= df[(f"{s}_{m}","mean")].sort_values(ascending=False)
            best = sorted.index[0]
            second = sorted.index[1]

            col = "\\textbf{" + f"{m.upper()}" + "}"
            df[col] = "$" + (df[(f"{s}_{m}","mean")]).apply(format_value) + "\scriptstyle \pm " + (df[(f"{s}_{m}","std")]).apply(format_value) +"$"
            
            # put the best in bold
            def mathbf(x):
                # insert \mathbf{ after the first $
                x = x.replace("$","$\\mathbf{",1)
                # insert } befort the last character
                x = x[:-1] + "}$"
                return x
            
            df.loc[best, col] = df.loc[best, col].apply(lambda x: mathbf(x))[0]
            # put the second in underline
            # df.loc[second, col] = df.loc[second, col].apply(lambda x: "\\underline{" + x + "}")[0]
            
            df.drop(columns=f"{s}_{m}", inplace=True)

            formatted_metrics.append(col)
        
    df = df[["model","hold_out_by","embeddings_len"]+formatted_metrics]

    # rename columns to be less pythonish
    df.columns = ['Model','Hold-out by','Embeddings length'] + formatted_metrics

    df.Model = df.Model.replace({"gcn":"DGN","deepsets":"DeepSets"})
    return df

In [76]:
baselines_perf = select_bests(baselines_df, 'val_f1')[['hold_out_by','embeddings_len','name','test_fold']+config_cols+metrics_cols]

In [77]:
baselines_perf_disp = baselines_perf[metrics_cols+['hold_out_by','embeddings_len','test_fold','model']].groupby(['hold_out_by','embeddings_len','model']).agg(['mean','std','count']).reset_index()

baselines_perf_disp = baselines_perf_disp[['hold_out_by','embeddings_len','model']+['test_acc','test_f1','test_auroc','test_mcc']]

baselines_perf_disp = format_metrics(baselines_perf_disp)
baselines_perf_disp['Model'] = "DeepSets"
baselines_perf_disp

  df.drop(columns=f"{s}_{m}", inplace=True)
  df.drop(columns=f"{s}_{m}", inplace=True)
  df.drop(columns=f"{s}_{m}", inplace=True)
  df.drop(columns=f"{s}_{m}", inplace=True)


Unnamed: 0,Model,Hold-out by,Embeddings length,\textbf{MCC},\textbf{ACC},\textbf{F1},\textbf{AUROC}
0,DeepSets,UC1,0,$.062\scriptstyle \pm .043$,$.574\scriptstyle \pm .167$,$.284\scriptstyle \pm .140$,$.650\scriptstyle \pm .004$
1,DeepSets,UC1,128,$.472\scriptstyle \pm .006$,$.764\scriptstyle \pm .004$,$.647\scriptstyle \pm .004$,$.817\scriptstyle \pm .002$
2,DeepSets,UC1,onehot,$\mathbf{.498\scriptstyle \pm .009}$,$\mathbf{.778\scriptstyle \pm .007}$,$\mathbf{.661\scriptstyle \pm .008}$,$\mathbf{.834\scriptstyle \pm .003}$
3,DeepSets,UC2,0,$.021\scriptstyle \pm .036$,$.572\scriptstyle \pm .169$,$.218\scriptstyle \pm .200$,$.560\scriptstyle \pm .142$
4,DeepSets,UC2,128,$.415\scriptstyle \pm .031$,$.741\scriptstyle \pm .013$,$.605\scriptstyle \pm .038$,$.783\scriptstyle \pm .033$
5,DeepSets,UC2,onehot,$.427\scriptstyle \pm .032$,$.759\scriptstyle \pm .015$,$.595\scriptstyle \pm .047$,$.794\scriptstyle \pm .026$
6,DeepSets,UC3,0,$.048\scriptstyle \pm .037$,$.565\scriptstyle \pm .174$,$.259\scriptstyle \pm .169$,$.634\scriptstyle \pm .037$
7,DeepSets,UC3,128,$.104\scriptstyle \pm .094$,$.637\scriptstyle \pm .047$,$.320\scriptstyle \pm .186$,$.554\scriptstyle \pm .087$
8,DeepSets,UC3,onehot,$.146\scriptstyle \pm .076$,$.677\scriptstyle \pm .028$,$.284\scriptstyle \pm .107$,$.622\scriptstyle \pm .042$


In [78]:
metric = 'val_f1'
gnn_perf = select_bests(gnn_df, metric)
gnn_df.hold_out_by = gnn_df.hold_out_by.apply(str)

In [79]:
pd.set_option("display.max_columns", None)
gnn_perf[metrics_cols+['hold_out_by','embeddings_len','test_fold']].groupby(['hold_out_by','embeddings_len']).agg(['mean']).reset_index()[['hold_out_by','embeddings_len']+['train_f1','val_f1','test_f1']]


Unnamed: 0_level_0,hold_out_by,embeddings_len,train_f1,val_f1,test_f1
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,mean,mean
0,UC1,0,0.819709,0.747035,0.750113
1,UC1,128,0.89638,0.840223,0.843225
2,UC1,onehot,0.899549,0.839345,0.841645
3,UC2,0,0.751345,0.645942,0.604067
4,UC2,128,0.831149,0.698766,0.666694
5,UC2,onehot,0.860387,0.717582,0.690009
6,UC3,0,0.754066,0.606494,0.527726
7,UC3,128,0.789202,0.632069,0.515358
8,UC3,onehot,0.836639,0.62763,0.48568


In [80]:
gnn_perf[metrics_cols+['hold_out_by','embeddings_len','test_fold']]

Unnamed: 0,train_acc,train_auroc,train_f1,train_mcc,test_acc,test_auroc,test_f1,test_mcc,val_acc,val_auroc,val_f1,val_mcc,hold_out_by,embeddings_len,test_fold
0,0.874023,0.946074,0.80245,0.710485,0.832285,0.907119,0.746607,0.621166,0.840062,0.908387,0.752106,0.634066,UC1,0,0
0,0.849854,0.931418,0.852765,0.70084,0.819198,0.902227,0.746751,0.613799,0.813276,0.895923,0.74374,0.608996,UC1,0,1
0,0.862429,0.929304,0.776235,0.677211,0.840867,0.907519,0.748672,0.632695,0.836568,0.905222,0.743761,0.624013,UC1,0,2
0,0.843994,0.926291,0.847385,0.688117,0.826148,0.912407,0.758421,0.631464,0.817158,0.90577,0.748532,0.616407,UC1,0,3
0,0.93042,0.981686,0.893058,0.84151,0.894479,0.954717,0.842584,0.763803,0.897516,0.955005,0.845433,0.769313,UC1,128,0
0,0.928368,0.981176,0.890817,0.837669,0.898416,0.957806,0.842052,0.767323,0.890916,0.951582,0.832439,0.751603,UC1,128,1
0,0.939209,0.987025,0.906846,0.862129,0.900513,0.959449,0.846363,0.772893,0.899068,0.958582,0.844869,0.770082,UC1,128,2
0,0.932373,0.981723,0.894797,0.845432,0.894663,0.957216,0.841903,0.763382,0.891304,0.956359,0.83815,0.757507,UC1,128,3
0,0.935791,0.987526,0.897864,0.851046,0.899138,0.951327,0.846006,0.770973,0.893245,0.949571,0.835821,0.756746,UC1,onehot,0
0,0.933112,0.98534,0.898342,0.848514,0.893523,0.952755,0.840858,0.761334,0.887034,0.945434,0.833238,0.749744,UC1,onehot,1


In [86]:
best_trials_gcn = {}
for i, r in gnn_perf.iterrows():
    best_trials_gcn[(r.hold_out_by, r.embeddings_len, r.test_fold)] = r['name']

pickle.dump(best_trials_gcn, open("gcn_best_trial_names.pkl",'wb'))

# Tables export

In [81]:
tables_path = "tables/"
Path(tables_path).mkdir(exist_ok=True)

In [82]:
gnn_perf_disp = gnn_perf[metrics_cols+['hold_out_by','embeddings_len','test_fold']].groupby(['hold_out_by','embeddings_len']).agg(['mean','std','count']).reset_index()
gnn_perf_disp = gnn_perf_disp[['hold_out_by','embeddings_len']+[c for c in metrics_cols if 'test' in c]]
gnn_perf_disp['model'] = "DGN"
gnn_perf_disp = format_metrics(gnn_perf_disp)

  df.drop(columns=f"{s}_{m}", inplace=True)
  df.drop(columns=f"{s}_{m}", inplace=True)
  df.drop(columns=f"{s}_{m}", inplace=True)
  df.drop(columns=f"{s}_{m}", inplace=True)


In [83]:
null_models_rows = {
    "UC1": ["Null","$.000{\scriptstyle \pm .000}$","$.000{\scriptstyle \pm .000}$","$.676{\scriptstyle \pm .000}$","$.500{\scriptstyle \pm .000}$"],
    "UC2": ["Null","$.000{\scriptstyle \pm .000}$","$.000{\scriptstyle \pm .000}$","$.680{\scriptstyle \pm .024}$","$.500{\scriptstyle \pm .000}$"],
    "UC3": ["Null","$.000{\scriptstyle \pm .000}$","$.000{\scriptstyle \pm .000}$","$.678{\scriptstyle \pm .014}$","$.500{\scriptstyle \pm .000}$"]
}

In [84]:
# merge the tables by hold out
tables = []
for hold_out_by in ["UC1","UC2","UC3"]:
    table = pd.concat([
        baselines_perf[baselines_perf["hold_out_by"]==hold_out_by],
        gnn_perf[gnn_perf["hold_out_by"]==hold_out_by]
    ])[['model','embeddings_len','hold_out_by']+metrics_cols].groupby(['model','embeddings_len','hold_out_by']).agg(['mean','std']).reset_index()

    # # remove the onehot related rows
    table = table[table.embeddings_len!="onehot"]
    table.embeddings_len = table.embeddings_len.replace({"0":"", "128":"+emb","onehot":"+onehot"})

    table = format_metrics(table.reset_index(), metrics=["mcc","f1","acc","auroc"], sets=["test"]).drop(columns=['Hold-out by'])
    # instert the null model row as first row, without deleting the previous one

    # concatenate embedding len and model name
    table['Model'].replace({"deepsets":"DeepSets","gcn":"DGN"}, inplace=True)
    table['Model'] = table['Model']+table['Embeddings length']

    
    table = table.drop(columns=['Embeddings length'])
    null_row = pd.DataFrame([null_models_rows[hold_out_by]], columns=table.columns)
    table = pd.concat([null_row, table], ignore_index=True)
    tables.append(table)
    table.to_latex(f"{tables_path}{hold_out_by}.tex", index=False)


  df.drop(columns=f"{s}_{m}", inplace=True)
  df.drop(columns=f"{s}_{m}", inplace=True)
  df.drop(columns=f"{s}_{m}", inplace=True)
  df.drop(columns=f"{s}_{m}", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.Model = df.Model.replace({"gcn":"DGN","deepsets":"DeepSets"})
  df.drop(columns=f"{s}_{m}", inplace=True)
  df.drop(columns=f"{s}_{m}", inplace=True)
  df.drop(columns=f"{s}_{m}", inplace=True)
  df.drop(columns=f"{s}_{m}", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.Model = df.Model.replace({"gcn":"DGN","deepsets":"

In [85]:
# create unique table with onehot results
table = pd.concat([
    baselines_perf[baselines_perf["embeddings_len"]=="onehot"],
    gnn_perf[gnn_perf["embeddings_len"]=="onehot"]
])[['model','hold_out_by','embeddings_len']+metrics_cols].groupby(['hold_out_by','model','embeddings_len']).agg(['mean','std']).reset_index().sort_values(by=['hold_out_by','model'])
# table.columns = table.columns.droplevel(1)

onehot_table = format_metrics(table, metrics=["mcc","f1","acc","auroc"], sets=["test"]).drop(columns=['Embeddings length'])

onehot_table['Hold-out by'] = onehot_table['Hold-out by'].replace({"UC1":"1", "UC2":"2", "UC3":"3"})
onehot_table.rename(columns={"Hold-out by":"UC"}, inplace=True)

onehot_table.to_latex(tables_path+f'onehot_performances.tex', index=False)

  df.drop(columns=f"{s}_{m}", inplace=True)
  df.drop(columns=f"{s}_{m}", inplace=True)
  df.drop(columns=f"{s}_{m}", inplace=True)
  df.drop(columns=f"{s}_{m}", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.Model = df.Model.replace({"gcn":"DGN","deepsets":"DeepSets"})
