In [2]:
import json
import os
import numpy as np
import pandas as pd
from IPython.display import display,HTML
from sklearn.metrics import classification_report, f1_score, confusion_matrix
from tqdm import tqdm
import seaborn as sns

In [9]:
# Run names to evaluate. For every backbone configuration the baseline run
# comes first and its `_frm_sbdh_gpt4_v2` counterpart immediately after, so
# consecutive pairs in this list are (baseline, augmented).
all_models = [
    f'{backbone}{augmentation}_mimic_sbdh'
    for backbone in (
        'roberta', 'roberta_prompt',
        'cliroberta', 'cliroberta_prompt',
        'mamba', 'mamba_prompt',
        'climamba', 'climamba_prompt',
        'llama_prompt',
    )
    for augmentation in ('', '_frm_sbdh_gpt4_v2')
]

In [10]:
def get_thresh(model, seed):
    """Return the decision thresholds stored for one training run.

    Reads three JSON files below ``./saved_models/{model}_{seed}/``:
    ``trainer_state.json`` (to find the best checkpoint), ``all_results.json``
    and the ``trainer_state.json`` inside the best checkpoint's directory.

    Args:
        model: Run name; together with ``seed`` it forms the run directory
            ``./saved_models/{model}_{seed}``.
        seed: Seed suffix of the run directory.

    Returns:
        A tuple ``(eval_threshold, test_threshold)``. ``eval_threshold`` is
        read from the last ``log_history`` entry of the best checkpoint's
        trainer state; ``test_threshold`` is read from ``all_results.json``.

    Raises:
        ValueError: If ``best_model_checkpoint`` is present but empty/``None``.
        KeyError: If an expected key is missing from one of the JSON files.
        FileNotFoundError: If one of the JSON files does not exist.
    """
    run_dir = f'./saved_models/{model}_{seed}'
    with open(f'{run_dir}/trainer_state.json') as f:
        tr_state = json.load(f)
    with open(f'{run_dir}/all_results.json') as f:
        all_results = json.load(f)

    best_ckpt = tr_state['best_model_checkpoint']
    if not best_ckpt:
        raise ValueError(
            f'{run_dir}/trainer_state.json does not record a best_model_checkpoint'
        )
    # Only the checkpoint's directory name is used; it is re-rooted under
    # run_dir. normpath strips any trailing separator first: with a trailing
    # '/', a plain split('/')[-1] yields '' and the path below would collapse
    # to the run's own trainer_state.json, silently reading the wrong file.
    ckpt_name = os.path.basename(os.path.normpath(best_ckpt))
    with open(f'{run_dir}/{ckpt_name}/trainer_state.json') as f:
        best_state = json.load(f)
    return best_state['log_history'][-1]['eval_threshold'], all_results['test_threshold']

In [11]:
# Using thresholds from the val set for the best checkpoint
#
# For every model, binarise each seed's saved test predictions with the
# validation threshold returned by get_thresh, collect micro/macro F1 from
# sklearn's classification_report, and summarise them as "mean ± std" (in %).
# Runs whose name carries one of the augmentation tags below are additionally
# compared against the run of the same name without the tag.
#
# result_dict[model]          -> {'micro-f', 'macro-f', '∆(micro-f)', '∆(macro-f)'}
# clf_report_dict[model][seed] -> full classification_report dict for that seed
augmentation_tags = ('_frm_sbdh_gpt4_v2', '_frm_sbdh_gpt4_msf_v3')

result_dict, clf_report_dict = {}, {}
for model in all_models:
    f1_by_metric = {'micro-f': [], 'macro-f': []}
    clf_report_dict[model] = {}
    for seed in range(3):
        run_dir = f'./saved_models/{model}_{seed}'
        labels = np.load(f'{run_dir}/labels.npy')
        val_thresh, _ = get_thresh(model, seed)
        preds = np.load(f'{run_dir}/predictions.npy') > val_thresh
        clf_report = classification_report(labels, preds, zero_division=0, output_dict=True)
        f1_by_metric['micro-f'].append(clf_report['micro avg']['f1-score'])
        f1_by_metric['macro-f'].append(clf_report['macro avg']['f1-score'])
        clf_report_dict[model][seed] = clf_report

    # Build the row in a fresh dict (metrics first, then deltas) instead of
    # rewriting a dict while iterating over it.
    summary = {
        metric: f'{np.average(scores)*100:.2f} ± {np.std(scores)*100:.2f}'
        for metric, scores in f1_by_metric.items()
    }

    tag = next((t for t in augmentation_tags if t in model), None)
    for metric in f1_by_metric:
        if tag is None:
            summary[f'∆({metric})'] = '-'
            continue
        baseline = model.replace(tag, '')
        if baseline not in result_dict:
            # The baseline row must already exist, i.e. the baseline has to
            # appear before its augmented counterpart in all_models.
            raise KeyError(
                f'baseline {baseline!r} must be listed before {model!r} in all_models'
            )
        # Deltas are taken between the 2-decimal means shown in the table so
        # that the reported difference always matches the displayed numbers.
        baseline_mean = float(result_dict[baseline][metric].split('±')[0])
        diff = float(summary[metric].split('±')[0]) - baseline_mean
        if baseline_mean == 0:
            # Relative gain is undefined against a zero baseline.
            summary[f'∆({metric})'] = '{:.2f} (n/a)'.format(diff)
        else:
            summary[f'∆({metric})'] = '{:.2f} ({:.2f}%)'.format(
                diff, diff*100/baseline_mean
                )
    result_dict[model] = summary

res_df = pd.DataFrame(result_dict)

# One table per consecutive pair of rows, i.e. per (baseline, augmented) pair
# given the ordering of all_models.
res_table = res_df.T
for start in range(0, len(res_table), 2):
    display(res_table.iloc[start:start + 2])

Unnamed: 0,micro-f,macro-f,∆(micro-f),∆(macro-f)
roberta_mimic_sbdh,83.00 ± 2.01,59.27 ± 3.22,-,-
roberta_frm_sbdh_gpt4_v2_mimic_sbdh,87.32 ± 0.86,65.05 ± 0.90,4.32 (5.20%),5.78 (9.75%)


Unnamed: 0,micro-f,macro-f,∆(micro-f),∆(macro-f)
roberta_prompt_mimic_sbdh,79.32 ± 0.92,55.64 ± 1.30,-,-
roberta_prompt_frm_sbdh_gpt4_v2_mimic_sbdh,91.11 ± 0.07,91.11 ± 0.15,11.79 (14.86%),35.47 (63.75%)


Unnamed: 0,micro-f,macro-f,∆(micro-f),∆(macro-f)
cliroberta_mimic_sbdh,85.11 ± 0.48,61.14 ± 1.32,-,-
cliroberta_frm_sbdh_gpt4_v2_mimic_sbdh,89.32 ± 0.72,66.90 ± 0.52,4.21 (4.95%),5.76 (9.42%)


Unnamed: 0,micro-f,macro-f,∆(micro-f),∆(macro-f)
cliroberta_prompt_mimic_sbdh,86.08 ± 0.89,63.64 ± 1.50,-,-
cliroberta_prompt_frm_sbdh_gpt4_v2_mimic_sbdh,91.26 ± 0.20,90.81 ± 0.73,5.18 (6.02%),27.17 (42.69%)


Unnamed: 0,micro-f,macro-f,∆(micro-f),∆(macro-f)
mamba_mimic_sbdh,87.43 ± 1.98,76.91 ± 11.01,-,-
mamba_frm_sbdh_gpt4_v2_mimic_sbdh,89.95 ± 0.68,88.49 ± 1.64,2.52 (2.88%),11.58 (15.06%)


Unnamed: 0,micro-f,macro-f,∆(micro-f),∆(macro-f)
mamba_prompt_mimic_sbdh,89.60 ± 0.33,85.44 ± 2.63,-,-
mamba_prompt_frm_sbdh_gpt4_v2_mimic_sbdh,90.32 ± 0.09,89.89 ± 0.34,0.72 (0.80%),4.45 (5.21%)


Unnamed: 0,micro-f,macro-f,∆(micro-f),∆(macro-f)
climamba_mimic_sbdh,91.35 ± 0.07,89.40 ± 0.52,-,-
climamba_frm_sbdh_gpt4_v2_mimic_sbdh,90.64 ± 0.44,86.13 ± 1.00,-0.71 (-0.78%),-3.27 (-3.66%)


Unnamed: 0,micro-f,macro-f,∆(micro-f),∆(macro-f)
climamba_prompt_mimic_sbdh,91.01 ± 0.70,82.77 ± 2.91,-,-
climamba_prompt_frm_sbdh_gpt4_v2_mimic_sbdh,91.52 ± 0.39,91.63 ± 0.46,0.51 (0.56%),8.86 (10.70%)


Unnamed: 0,micro-f,macro-f,∆(micro-f),∆(macro-f)
llama_prompt_mimic_sbdh,91.99 ± 0.23,90.94 ± 0.43,-,-
llama_prompt_frm_sbdh_gpt4_v2_mimic_sbdh,92.27 ± 0.25,91.02 ± 0.65,0.28 (0.30%),0.08 (0.09%)


In [12]:
# Performance gain for all 4 classes
#
# models_1 holds the baseline prompt runs and models_2 the corresponding
# `_frm_sbdh_gpt4_v2` runs; the two lists are paired position by position.
models_1 = [
    'roberta_prompt_mimic_sbdh',
    'cliroberta_prompt_mimic_sbdh',
    'mamba_prompt_mimic_sbdh',
    'climamba_prompt_mimic_sbdh',
    'llama_prompt_mimic_sbdh',
]
models_2 = [
    'roberta_prompt_frm_sbdh_gpt4_v2_mimic_sbdh',
    'cliroberta_prompt_frm_sbdh_gpt4_v2_mimic_sbdh',
    'mamba_prompt_frm_sbdh_gpt4_v2_mimic_sbdh',
    'climamba_prompt_frm_sbdh_gpt4_v2_mimic_sbdh',
    'llama_prompt_frm_sbdh_gpt4_v2_mimic_sbdh',
]

for model1, model2 in zip(models_1, models_2):
    # First four f1-score entries of every seed's report: the three seeds of
    # the augmented run ("af") first, then the three seeds of the baseline
    # run ("b4"). After the transpose each seed is one column.
    f1_per_seed = [
        pd.DataFrame(clf_report_dict[run][seed]).T['f1-score'][:4]
        for run in (model2, model1)
        for seed in (0, 1, 2)
    ]
    df = pd.DataFrame(f1_per_seed).T
    df = df.set_axis(
        [f'seed_{seed}_{tag}' for tag in ('af', 'b4') for seed in (0, 1, 2)],
        axis=1,
    )

    # Mean ± std across seeds, expressed in percent: baseline columns are
    # positions 3-5, augmented columns are positions 0-2.
    b4_mean = np.average(df.iloc[:, 3:6], axis=1) * 100
    b4_std = np.std(df.iloc[:, 3:6], axis=1) * 100
    df['mean ± std b4'] = [f'{m:.2f} ± {s:.2f}' for m, s in zip(b4_mean, b4_std)]

    af_mean = np.average(df.iloc[:, :3], axis=1) * 100
    af_std = np.std(df.iloc[:, :3], axis=1) * 100
    df['mean ± std af'] = [f'{m:.2f} ± {s:.2f}' for m, s in zip(af_mean, af_std)]

    # Gain is the difference between the displayed (2-decimal) means.
    df['gain'] = [
        float(after.split(' ± ')[0]) - float(before.split(' ± ')[0])
        for after, before in zip(df['mean ± std af'], df['mean ± std b4'])
    ]

    # Show only the three summary columns, one row per summary.
    display(df.iloc[:, -3:].T)

Unnamed: 0,0,1,2,3
mean ± std b4,87.10 ± 0.13,0.00 ± 0.00,79.66 ± 0.84,55.79 ± 4.36
mean ± std af,90.91 ± 0.12,94.44 ± 0.00,93.44 ± 0.04,85.64 ± 0.66
gain,3.81,94.44,13.78,29.85


Unnamed: 0,0,1,2,3
mean ± std b4,89.35 ± 0.29,3.33 ± 4.71,87.89 ± 1.73,74.00 ± 0.95
mean ± std af,91.07 ± 0.26,90.74 ± 2.62,92.52 ± 0.13,88.89 ± 0.32
gain,1.72,87.41,4.63,14.89


Unnamed: 0,0,1,2,3
mean ± std b4,89.71 ± 0.55,75.14 ± 9.74,91.71 ± 0.53,85.20 ± 0.93
mean ± std af,89.65 ± 0.60,90.63 ± 2.46,92.80 ± 0.40,86.48 ± 2.41
gain,-0.06,15.49,1.09,1.28


Unnamed: 0,0,1,2,3
mean ± std b4,91.76 ± 0.25,60.84 ± 8.57,93.11 ± 0.70,85.39 ± 2.35
mean ± std af,91.54 ± 0.27,94.34 ± 0.15,92.85 ± 0.43,87.81 ± 1.31
gain,-0.22,33.5,-0.26,2.42


Unnamed: 0,0,1,2,3
mean ± std b4,92.03 ± 0.24,90.89 ± 1.42,94.15 ± 0.24,86.67 ± 0.50
mean ± std af,92.50 ± 0.36,90.53 ± 3.62,94.21 ± 0.14,86.85 ± 0.58
gain,0.47,-0.36,0.06,0.18
