In [1]:
import json
import os
import numpy as np
import pandas as pd
from IPython.display import display,HTML
from sklearn.metrics import classification_report, f1_score, confusion_matrix
from tqdm import tqdm
import seaborn as sns

In [3]:
all_models = [
    'roberta_mimic_sbdh', 'roberta_frm_sbdh_gpt4_v2_mimic_sbdh', 
    # 'roberta_frm_sbdh_gpt4_msf_v3_mimic_sbdh', 
    'roberta_prompt_mimic_sbdh', 'roberta_prompt_frm_sbdh_gpt4_v2_mimic_sbdh', 
    # 'roberta_prompt_frm_sbdh_gpt4_msf_v3_mimic_sbdh', 
    'cliroberta_mimic_sbdh', 'cliroberta_frm_sbdh_gpt4_v2_mimic_sbdh', 
    # 'cliroberta_frm_sbdh_gpt4_msf_v3_mimic_sbdh', 
    'cliroberta_prompt_mimic_sbdh', 'cliroberta_prompt_frm_sbdh_gpt4_v2_mimic_sbdh', 
    # 'cliroberta_prompt_frm_sbdh_gpt4_msf_v3_mimic_sbdh', 
    'mamba_mimic_sbdh', 'mamba_frm_sbdh_gpt4_v2_mimic_sbdh', 
    # 'mamba_frm_sbdh_gpt4_msf_v3_mimic_sbdh',
    'mamba_prompt_mimic_sbdh', 'mamba_prompt_frm_sbdh_gpt4_v2_mimic_sbdh', 
    # 'mamba_prompt_frm_sbdh_gpt4_msf_v3_mimic_sbdh',
    'climamba_mimic_sbdh', 'climamba_frm_sbdh_gpt4_v2_mimic_sbdh', 
    # 'climamba_frm_sbdh_gpt4_msf_v3_mimic_sbdh',
    'climamba_prompt_mimic_sbdh', 'climamba_prompt_frm_sbdh_gpt4_v2_mimic_sbdh', 
    # 'climamba_prompt_frm_sbdh_gpt4_msf_v3_mimic_sbdh'
]

In [4]:
def get_thresh(model, seed):
    with open(f'/home/avijit/playground/sdoh/saved_models/{model}_{seed}/trainer_state.json') as f:
        tr_state = json.load(f)
    with open(f'/home/avijit/playground/sdoh/saved_models/{model}_{seed}/all_results.json') as f:
        all_results = json.load(f)
    with open(f"/home/avijit/playground/sdoh/saved_models/{model}_{seed}/{tr_state['best_model_checkpoint'].split('/')[-1]}/trainer_state.json") as f:
        best_state = json.load(f)
    return best_state['log_history'][-1]['eval_threshold'], all_results['test_threshold']

In [11]:
# Using thresholds from the val set for the best checkpoint
result_dict, clf_report_dict = {}, {}
for model in all_models:
    result_dict[model] = {
        'micro-auc':[],
        'macro-auc':[],
        'micro-f':[],
        'macro-f':[],
        '∆(micro-f)':0,
        '∆(macro-f)':0,
        # 'ori_micro-f':[],
        # 'ori_macro-f':[],
    }
    clf_report_dict[model] = {}
    for seed in range(3):
        labels = np.load(f'/home/avijit/playground/sdoh/saved_models/{model}_{seed}/labels.npy')
        preds = np.load(f'/home/avijit/playground/sdoh/saved_models/{model}_{seed}/predictions.npy')
        with open(f'/home/avijit/playground/sdoh/saved_models/{model}_{seed}/predict_results.json') as f:
            test_res = json.load(f)
        preds = preds>get_thresh(model, seed)[0]
        clf_report = classification_report(labels, preds, zero_division=0, output_dict=True)
        # result_dict[model]['macro-f'] += [f1_score(labels, preds, zero_division=0, average='macro')]
        # result_dict[model]['micro-f'] += [f1_score(labels, preds, zero_division=0, average='micro')]
        result_dict[model]['macro-f'] += [clf_report['macro avg']['f1-score']]
        result_dict[model]['micro-f'] += [clf_report['micro avg']['f1-score']]
        result_dict[model]['macro-auc'] += [test_res['test_auc_micro']]
        result_dict[model]['micro-auc'] += [test_res['test_auc_macro']]
        clf_report_dict[model][seed] = clf_report
          
        # with open(f'./saved_models/{model}_{seed}/predict_results.json') as f:
        #     result = json.load(f)
        #     result_dict[model]['ori_macro-f'] += [result['test_f1_macro']]
        #     result_dict[model]['ori_micro-f'] += [result['test_f1_micro']]
        
    for k,v in result_dict[model].items():
        if k.startswith('∆'):continue
        result_dict[model][k] = f'{np.average(result_dict[model][k])*100:.2f} ± {np.std(result_dict[model][k])*100:.2f}'
        if k.endswith('auc'):continue
        if '_frm_sbdh_gpt4_v2' in model:
            diff = float(result_dict[model][k].split('±')[0])-float(result_dict[model.replace('_frm_sbdh_gpt4_v2','')][k].split('±')[0])
            result_dict[model][f'∆({k})'] = '{:.2f} ({:.2f}%)'.format(
                diff,diff*100/float(result_dict[model.replace('_frm_sbdh_gpt4_v2','')][k].split('±')[0])
                )
        elif '_frm_sbdh_gpt4_msf_v3' in model:
            diff = float(result_dict[model][k].split('±')[0])-float(result_dict[model.replace('_frm_sbdh_gpt4_msf_v3','')][k].split('±')[0])
            result_dict[model][f'∆({k})'] = '{:.2f} ({:.2f}%)'.format(
                diff,diff*100/float(result_dict[model.replace('_frm_sbdh_gpt4_msf_v3','')][k].split('±')[0])
                )
        else:
            result_dict[model][f'∆({k})'] = '-'
        
res_df = pd.DataFrame(result_dict)
# display(res_df.T.iloc[:3])
# display(res_df.T.iloc[3:6])
# display(res_df.T.iloc[6:9])
# display(res_df.T.iloc[9:12])
# display(res_df.T.iloc[12:15])
# display(res_df.T.iloc[15:18])
# display(res_df.T.iloc[18:21])
# display(res_df.T.iloc[21:24])
display(res_df.T.iloc[:2])
display(res_df.T.iloc[2:4])
display(res_df.T.iloc[4:6])
display(res_df.T.iloc[6:8])
display(res_df.T.iloc[8:10])
display(res_df.T.iloc[10:12])
display(res_df.T.iloc[12:14])
display(res_df.T.iloc[14:16])

Unnamed: 0,micro-auc,macro-auc,micro-f,macro-f,∆(micro-f),∆(macro-f)
roberta_mimic_sbdh,84.04 ± 1.04,95.69 ± 0.39,83.26 ± 2.05,58.12 ± 5.14,-,-
roberta_frm_sbdh_gpt4_v2_mimic_sbdh,91.04 ± 2.08,97.46 ± 0.31,87.85 ± 0.61,65.50 ± 0.67,4.59 (5.51%),7.38 (12.70%)


Unnamed: 0,micro-auc,macro-auc,micro-f,macro-f,∆(micro-f),∆(macro-f)
roberta_prompt_mimic_sbdh,89.38 ± 1.50,95.17 ± 0.42,79.75 ± 1.03,56.17 ± 1.66,-,-
roberta_prompt_frm_sbdh_gpt4_v2_mimic_sbdh,98.77 ± 0.04,98.95 ± 0.01,91.32 ± 0.07,91.20 ± 0.34,11.57 (14.51%),35.03 (62.36%)


Unnamed: 0,micro-auc,macro-auc,micro-f,macro-f,∆(micro-f),∆(macro-f)
cliroberta_mimic_sbdh,88.94 ± 0.91,96.69 ± 0.10,85.55 ± 0.44,61.15 ± 1.39,-,-
cliroberta_frm_sbdh_gpt4_v2_mimic_sbdh,93.88 ± 2.20,98.06 ± 0.18,89.63 ± 0.58,66.98 ± 0.47,4.08 (4.77%),5.83 (9.53%)


Unnamed: 0,micro-auc,macro-auc,micro-f,macro-f,∆(micro-f),∆(macro-f)
cliroberta_prompt_mimic_sbdh,92.69 ± 0.21,97.43 ± 0.15,86.63 ± 0.61,65.90 ± 0.36,-,-
cliroberta_prompt_frm_sbdh_gpt4_v2_mimic_sbdh,98.79 ± 0.04,99.01 ± 0.02,91.41 ± 0.12,91.13 ± 0.59,4.78 (5.52%),25.23 (38.29%)


Unnamed: 0,micro-auc,macro-auc,micro-f,macro-f,∆(micro-f),∆(macro-f)
mamba_mimic_sbdh,94.68 ± 3.74,97.42 ± 1.16,87.79 ± 2.18,79.44 ± 12.54,-,-
mamba_frm_sbdh_gpt4_v2_mimic_sbdh,98.13 ± 0.49,98.45 ± 0.20,90.16 ± 0.63,89.29 ± 1.60,2.37 (2.70%),9.85 (12.40%)


Unnamed: 0,micro-auc,macro-auc,micro-f,macro-f,∆(micro-f),∆(macro-f)
mamba_prompt_mimic_sbdh,98.22 ± 0.05,98.61 ± 0.16,89.95 ± 0.28,86.61 ± 1.17,-,-
mamba_prompt_frm_sbdh_gpt4_v2_mimic_sbdh,98.18 ± 0.10,98.55 ± 0.07,90.61 ± 0.12,90.18 ± 0.54,0.66 (0.73%),3.57 (4.12%)


Unnamed: 0,micro-auc,macro-auc,micro-f,macro-f,∆(micro-f),∆(macro-f)
climamba_mimic_sbdh,98.16 ± 0.34,98.77 ± 0.06,91.54 ± 0.09,90.49 ± 0.65,-,-
climamba_frm_sbdh_gpt4_v2_mimic_sbdh,98.30 ± 0.12,98.76 ± 0.11,91.48 ± 0.40,90.07 ± 1.01,-0.06 (-0.07%),-0.42 (-0.46%)


Unnamed: 0,micro-auc,macro-auc,micro-f,macro-f,∆(micro-f),∆(macro-f)
climamba_prompt_mimic_sbdh,98.72 ± 0.07,98.86 ± 0.14,91.26 ± 0.75,84.66 ± 2.98,-,-
climamba_prompt_frm_sbdh_gpt4_v2_mimic_sbdh,98.93 ± 0.04,99.04 ± 0.03,91.89 ± 0.18,91.85 ± 0.42,0.63 (0.69%),7.19 (8.49%)


In [12]:
# Performance gain for all 4 classes
models_1 = [
    'roberta_prompt_mimic_sbdh','cliroberta_prompt_mimic_sbdh', 'mamba_prompt_mimic_sbdh','climamba_prompt_mimic_sbdh'
]
models_2 = [
    'roberta_prompt_frm_sbdh_gpt4_v2_mimic_sbdh','cliroberta_prompt_frm_sbdh_gpt4_v2_mimic_sbdh', 'mamba_prompt_frm_sbdh_gpt4_v2_mimic_sbdh', 'climamba_prompt_frm_sbdh_gpt4_v2_mimic_sbdh'
]
for model1,model2 in zip(models_1, models_2):
    df = pd.DataFrame([
            pd.DataFrame(clf_report_dict[model2][0]).T['f1-score'][:4],
            pd.DataFrame(clf_report_dict[model2][1]).T['f1-score'][:4],
            pd.DataFrame(clf_report_dict[model2][2]).T['f1-score'][:4],
            pd.DataFrame(clf_report_dict[model1][0]).T['f1-score'][:4],
            pd.DataFrame(clf_report_dict[model1][1]).T['f1-score'][:4],
            pd.DataFrame(clf_report_dict[model1][2]).T['f1-score'][:4]
        ]).T
    df = df.set_axis(['seed_0_af', 'seed_1_af', 'seed_2_af', 'seed_0_b4', 'seed_1_b4', 'seed_2_b4'], axis=1)
    df ['mean ± std b4'] = [f'{i:.2f} ± {j:.2f}' for i,j in zip(np.average(df.iloc[:,3:6],axis=1)*100,np.std(df.iloc[:,3:6],axis=1)*100)]
    df ['mean ± std af'] = [f'{i:.2f} ± {j:.2f}' for i,j in zip(np.average(df.iloc[:,:3],axis=1)*100,np.std(df.iloc[:,:3],axis=1)*100)]
    df ['gain'] = df.apply(lambda x: float(x['mean ± std af'].split(' ± ')[0]) - float(x['mean ± std b4'].split(' ± ')[0]), axis=1)
    display(df.iloc[:,-3:].T)

Unnamed: 0,0,1,2,3
mean ± std b4,86.93 ± 0.50,0.00 ± 0.00,80.19 ± 1.17,57.57 ± 5.73
mean ± std af,91.04 ± 0.13,93.59 ± 1.20,93.53 ± 0.18,86.63 ± 0.19
gain,4.11,93.59,13.34,29.06


Unnamed: 0,0,1,2,3
mean ± std b4,89.50 ± 0.48,10.35 ± 0.25,88.77 ± 0.90,74.97 ± 0.15
mean ± std af,91.31 ± 0.16,91.59 ± 2.27,92.44 ± 0.28,89.20 ± 0.22
gain,1.81,81.24,3.67,14.23


Unnamed: 0,0,1,2,3
mean ± std b4,90.02 ± 0.69,78.43 ± 5.20,91.85 ± 0.78,86.14 ± 0.60
mean ± std af,90.18 ± 0.52,90.63 ± 2.46,92.49 ± 0.48,87.44 ± 1.68
gain,0.16,12.2,0.64,1.3


Unnamed: 0,0,1,2,3
mean ± std b4,91.93 ± 0.20,67.61 ± 8.45,93.21 ± 0.76,85.88 ± 2.61
mean ± std af,91.75 ± 0.23,93.44 ± 1.42,93.19 ± 0.49,89.03 ± 0.06
gain,-0.18,25.83,-0.02,3.15
