In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import tikzplotlib


# ECE averaged over model families

In [None]:
# Build the per-model table of `metric` for one dataset, aggregated over seeds.
# Produces `df` (aggregated long table), `dict_ref`, `columns` and `df_metric`
# (rows = models, columns = methods).
metric = 'ECE_15'
dataset = 'yahoo_answers_topics' # 'CIFAR10', 'CIFAR100', 'ImageNet', amazon_food, dynasent, mnli, yahoo_answers_topics
valid_size_IN = 25000
seeds = 5
mean_or_std = 'mean' # 'mean' 'std'

# Row order of the final table. Resolved before any file is read so that an
# unsupported dataset fails immediately with a clear message instead of a
# NameError on `new_idx` further down.
if 'CIFAR' in dataset:
    new_idx = [
        'ResNet-50',
        'ResNet-110',
        'WRN',
        'DenseNet',
        'clip-vit-base-patch32',
        'clip-vit-base-patch16',
        'clip-vit-large-patch14'
        ]
elif dataset == 'ImageNet':
    new_idx = [
        'ResNet-18','ResNet-34','ResNet-50','ResNet-101','EffNet-B7','EffNetV2-S','EffNetV2-M','EffNetV2-L','ConvNeXt-T','ConvNeXt-S','ConvNeXt-B','ConvNeXt-L',
        'ViT-B/32','ViT-B/16','ViT-L/32','ViT-L/16','ViT-H/14','Swin-T','Swin-S','Swin-B','SwinV2-T','SwinV2-S','SwinV2-B', 'clip-vit-base-patch32', 'clip-vit-base-patch16', 'clip-vit-large-patch14']
elif dataset in ['amazon_food', 'dynasent', 'mnli', 'yahoo_answers_topics']:
    new_idx = ['t5', 't5-large', 'roberta', 'roberta-large']
else:
    raise ValueError(f'No model ordering defined for dataset {dataset!r}')

# Read one results file per seed. Frames are collected in a list and
# concatenated once instead of growing a DataFrame inside the loop.
seed_frames = []
for seed in range(seeds):
    if dataset == 'ImageNet':
        df_s = pd.read_csv(f'../results/benchmark_calibration_{dataset}_calibSize{valid_size_IN}_seed{seed}.csv')
    else:
        df_s = pd.read_csv(f'../results/benchmark_calibration_{dataset}_seed{seed}.csv')
    # choose best of eqsize and eqmass for HB
    hb_rows = []
    for m in df_s['model'].unique():
        hb_tva_eqmass = df_s[(df_s['model'] == m) & (df_s['method'] == 'netcal_HB_tva_eqmass')]
        hb_tva_eqsize = df_s[(df_s['model'] == m) & (df_s['method'] == 'netcal_HB_tva_eqsize')]
        # .item() requires exactly one row per (model, method) in the file.
        if hb_tva_eqmass['ECE_15'].item() < hb_tva_eqsize['ECE_15'].item():
            hb_tva = hb_tva_eqmass.copy()
        else:
            hb_tva = hb_tva_eqsize.copy()
        hb_tva['method'] = 'netcal_HB_tva'
        hb_rows.append(hb_tva)
    seed_frames.append(pd.concat([df_s] + hb_rows, axis=0))
df = pd.concat(seed_frames, axis=0)

# get mean, std values across seeds
# Group on the identifying columns only. The synthetic 'netcal_HB_tva' row keeps
# the index label of whichever variant it was copied from, so using the row
# index as an extra grouping key splits that method into several groups as soon
# as two seeds pick different variants, and the statistic is no longer taken
# over all seeds.
per_method = df.groupby(['dataset', 'model', 'method'])
if mean_or_std == 'mean':
    df = per_method.mean().reset_index()
elif mean_or_std == 'std':
    df = per_method.std().reset_index()

# choose dataset & metric
df_metric = pd.pivot_table(df, index=['dataset', 'model'], columns=['method'], values=[metric])[metric]
if metric != 'Accuracy':
    df_metric = df_metric * 100 # in %
df_metric = df_metric.loc[dataset]

# Maps each `*_tva` method (key) to the reference method it is compared with (value).
dict_ref = {'TS_tva': 'TS', 'VS_reg_tva': 'VS', 'Dir-ODIR_reg_tva': 'Dir-ODIR', 'netcal_Iso_tva': 'netcal_Iso', 'netcal_BBQ_tva': 'netcal_BBQ', 'netcal_HB_tva': 'netcal_HB_eqsize'}

# Column order: stand-alone methods first, then every reference method directly
# followed by its `*_tva` counterpart.
columns = ['original', 'IRM', 'Patel2021_sCW_imax']
for method_tva, method_ref in dict_ref.items():
    columns.append(method_ref)
    columns.append(method_tva)
df_metric = df_metric[columns]

# Put the models in table order, then swap raw identifiers for display names.
df_metric = df_metric.reindex(index=new_idx)
for raw_name, display_name in [
        ('clip-vit-base-patch32', 'CLIP (ViT-B/32)'),
        ('clip-vit-base-patch16', 'CLIP (ViT-B/16)'),
        ('clip-vit-large-patch14', 'CLIP (ViT-L/14)'),
        ('t5', 'T5'),
        ('roberta', 'RoBERTa')]:
    df_metric.index = df_metric.index.map(lambda x: x.replace(raw_name, display_name))

df_metric = df_metric.map(lambda x: float(f'{x:.2f}')) # to consider all close values as min/max

In [None]:
# Average the rows of `df_metric` per model family and print the result as a
# LaTeX table. Each family is described by a boolean mask over the row labels.
if 'CIFAR' in dataset:
    family_masks = {
        'ConvNets': df_metric.index.str.contains('ResNet|WRN|DenseNet'),
        'CLIP': df_metric.index.str.contains('CLIP'),
    }
elif dataset == 'ImageNet':
    family_masks = {}
    for family_name in ['ResNet', 'EffNet', 'ConvNeXt', 'ViT', 'Swin', 'CLIP']:
        if family_name != 'ViT':
            family_masks[family_name] = df_metric.index.str.contains(family_name)
        else:
            # explicit list, otherwise includes CLIP
            family_masks[family_name] = df_metric.index.isin(
                ['ViT-B/32','ViT-B/16','ViT-L/32','ViT-L/16','ViT-H/14'])
elif dataset in ['amazon_food', 'dynasent', 'mnli', 'yahoo_answers_topics']:
    family_masks = {}
    for family_name in ['T5', 'RoBERTa']:
        family_masks[family_name] = df_metric.index.str.contains(family_name)

model_families = list(family_masks)
df_avg = pd.DataFrame(index=model_families, columns=df_metric.columns)
for model_family, row_mask in family_masks.items():
    # column-wise mean over the models belonging to this family
    df_avg.loc[model_family, :] = df_metric.loc[row_mask].mean()

# min value per row in bold, two decimals
s = df_avg.style.highlight_min(axis=1, props="textbf:--rwrap;").format('{:.2f}')
print(s.to_latex())

# ECE

In [None]:
# Load the per-seed result files for one dataset and build `df_metric`
# (rows = models, columns = methods) holding the mean or the standard deviation
# of `metric` across seeds.
metric = 'ECE_15'
dataset = 'yahoo_answers_topics' # 'CIFAR10', 'CIFAR100', 'ImageNet', 'ImageNet21k', amazon_food, dynasent, mnli, yahoo_answers_topics
valid_size_IN = 78741 # 25000 for IN, 261250 for IN21k , 14001 for yahoo, 19635 mnli, 11160 dynasent, 78741 amazon
seeds = 5
mean_or_std = 'std' # 'mean' 'std'

# Frames are collected in a list and concatenated once instead of growing a
# DataFrame inside the loop.
seed_frames = []
for seed in range(seeds):
    if 'ImageNet' in dataset:
        df_s = pd.read_csv(f'../results/benchmark_calibration_{dataset}_calibSize{valid_size_IN}_seed{seed}.csv')
    else:
        df_s = pd.read_csv(f'../results/benchmark_calibration_{dataset}_seed{seed}.csv')
    # choose best of eqsize and eqmass for HB
    hb_rows = []
    for m in df_s['model'].unique():
        hb_tva_eqmass = df_s[(df_s['model'] == m) & (df_s['method'] == 'netcal_HB_tva_eqmass')]
        hb_tva_eqsize = df_s[(df_s['model'] == m) & (df_s['method'] == 'netcal_HB_tva_eqsize')]
        # .item() requires exactly one row per (model, method) in the file.
        if hb_tva_eqmass['ECE_15'].item() < hb_tva_eqsize['ECE_15'].item():
            hb_tva = hb_tva_eqmass.copy()
        else:
            hb_tva = hb_tva_eqsize.copy()
        hb_tva['method'] = 'netcal_HB_tva'
        hb_rows.append(hb_tva)
    seed_frames.append(pd.concat([df_s] + hb_rows, axis=0))
df = pd.concat(seed_frames, axis=0)

# get mean, std values across seeds
# Group on the identifying columns only. The synthetic 'netcal_HB_tva' row keeps
# the index label of whichever variant it was copied from, so using the row
# index as an extra grouping key splits that method into several groups as soon
# as two seeds pick different variants (a group of one seed even yields a NaN
# standard deviation).
per_method = df.groupby(['dataset', 'model', 'method'])
if mean_or_std == 'mean':
    df = per_method.mean().reset_index()
elif mean_or_std == 'std':
    df = per_method.std().reset_index()

# choose dataset & metric
df_metric = pd.pivot_table(df, index=['dataset', 'model'], columns=['method'], values=[metric])[metric].reindex(columns=df['method'].unique())
if metric != 'Accuracy':
    df_metric = df_metric * 100 # in %
df_metric = df_metric.loc[dataset]

In [None]:
# Arrange `df_metric` for the ECE table (column order, row order, display names)
# and print it as LaTeX.

# Maps each `*_tva` method (key) to the reference method it is compared with (value).
dict_ref = {
    'TS_tva': 'TS',
    'VS_reg_tva': 'VS',
    'Dir-ODIR_reg_tva': 'Dir-ODIR',
    'netcal_Beta_tva': 'netcal_Beta',
    'netcal_Iso_tva': 'netcal_Iso',
    'netcal_BBQ_tva': 'netcal_BBQ',
    'netcal_HB_tva': 'netcal_HB_eqsize',
}

# Stand-alone methods first, then each reference method directly followed by
# its `*_tva` counterpart.
columns = ['original', 'IRM', 'Patel2021_sCW_imax']
for method_tva, method_ref in dict_ref.items():
    columns.extend([method_ref, method_tva])
df_metric = df_metric[columns]

# Row order of the table, depending on the dataset.
if 'CIFAR' in dataset:
    new_idx = ['ResNet-50', 'ResNet-110', 'WRN', 'DenseNet',
               'clip-vit-base-patch32', 'clip-vit-base-patch16', 'clip-vit-large-patch14']
elif dataset == 'ImageNet':
    new_idx = [
        'ResNet-18', 'ResNet-34', 'ResNet-50', 'ResNet-101',
        'EffNet-B7', 'EffNetV2-S', 'EffNetV2-M', 'EffNetV2-L',
        'ConvNeXt-T', 'ConvNeXt-S', 'ConvNeXt-B', 'ConvNeXt-L',
        'ViT-B/32', 'ViT-B/16', 'ViT-L/32', 'ViT-L/16', 'ViT-H/14',
        'Swin-T', 'Swin-S', 'Swin-B', 'SwinV2-T', 'SwinV2-S', 'SwinV2-B',
        'clip-vit-base-patch32', 'clip-vit-base-patch16', 'clip-vit-large-patch14',
    ]
elif dataset == 'ImageNet21k':
    new_idx = ['mobilenetv3_large_100_miil_in21k', 'vit_base_patch16_224_miil_in21k']
elif dataset in ['amazon_food', 'dynasent', 'mnli', 'yahoo_answers_topics']:
    new_idx = ['t5', 't5-large', 'roberta', 'roberta-large']

df_metric = df_metric.reindex(index=new_idx)

# Replace raw model identifiers by their display names, one substitution at a time.
display_names = [
    ('clip-vit-base-patch32', 'CLIP (ViT-B/32)'),
    ('clip-vit-base-patch16', 'CLIP (ViT-B/16)'),
    ('clip-vit-large-patch14', 'CLIP (ViT-L/14)'),
    ('vit_base_patch16_224_miil_in21k', 'ViT-B/16'),
    ('mobilenetv3_large_100_miil_in21k', 'MN3'),
    ('t5', 'T5'),
    ('roberta', 'RoBERTa'),
]
for raw_name, display_name in display_names:
    df_metric.index = df_metric.index.map(lambda label: label.replace(raw_name, display_name))

# Round through a 2-decimal string so that all close values count as min/max.
df_metric = df_metric.map(lambda value: float(f'{value:.2f}'))

if mean_or_std == 'mean':
    # min value per row in bold
    s = df_metric.style.highlight_min(axis=1, props="textbf:--rwrap;")
elif mean_or_std == 'std':
    s = df_metric.style
s = s.format('{:.2f}') # float format
latex_table = s.to_latex()
print(latex_table.replace('nan', 'err.'))

In [None]:
# Relative change (in %) of every `*_tva` method with respect to its reference
# method, averaged over the models of each family, printed as LaTeX table rows.
# Families are substrings that are matched against the row labels of `df_metric`.
if 'CIFAR' in dataset:
    # 'N' occurs in the ResNet / WRN / DenseNet row labels but in none of the
    # 'CLIP (ViT-...)' labels, so it selects the non-CLIP models.
    model_families = ['N', 'CLIP']
elif dataset == 'ImageNet':
    model_families = ['ResNet', 'EffNet', 'ConvNeXt', 'ViT', 'Swin', 'CLIP']
elif dataset == 'ImageNet21k':
    model_families = [''] # contained in all models: global improvement
elif dataset in ['amazon_food', 'dynasent', 'mnli', 'yahoo_answers_topics']:
    model_families = ['T5', 'RoBERTa']
else:
    raise ValueError(f'No model families defined for dataset {dataset!r}')

df_improv = pd.DataFrame(index=model_families, columns=list(dict_ref.values()))
for model_family in model_families:
    if model_family == 'ViT': # otherwise includes CLIP
        family_rows = df_metric.index.isin(['ViT-B/32','ViT-B/16','ViT-L/32','ViT-L/16','ViT-H/14'])
    else:
        family_rows = df_metric.index.str.contains(model_family)
    for method_new, method_ref in dict_ref.items():
        new_val = df_metric.loc[family_rows, method_new]
        ref_val = df_metric.loc[family_rows, method_ref]
        # per-model relative change, then mean over the family
        improv = (100 * (new_val - ref_val) / ref_val).mean()

        df_improv.loc[model_family, method_ref] = improv

for model_family in model_families:
    print('\n', model_family)
    report_methods = list(df_improv.columns)
    cells = []
    for position, method_ref in enumerate(report_methods):
        # float() accepts both numpy and plain Python scalars, so no
        # try/except around `.item()` is needed and real errors are not hidden.
        value = float(df_improv.loc[model_family, method_ref])
        # the last cell closes the row and therefore has no trailing column rule
        align = 'c' if position == len(report_methods) - 1 else 'c|'
        cells.append(rf"\multicolumn{{2}}{{{align}}}{{{value:.0f}\%}}")
    row = rf"\multicolumn{{4}}{{c|}}{{Mean improvement {model_family}}}"
    row += " & " + cells[0]
    if len(cells) > 1:
        row += " & " + "& ".join(cells[1:])
    row += r" \\"
    print(row)


# Appendix

In [None]:
# Appendix table: per-model values of `metric` for one dataset, printed as LaTeX.
# Averaging over seeds is done by `pd.pivot_table` (default aggregation: mean).
# Relies on `columns` defined in an earlier cell.
metric = 'Brier_top' # Accuracy AdaECE_15 Brier_top ECE_100 AdaECE_100
dataset = 'yahoo_answers_topics' # 'CIFAR10', 'CIFAR100', 'ImageNet', 'ImageNet21k' amazon_food, dynasent, mnli, yahoo_answers_topics
valid_size_IN = 261250 # 25000 for IN, 261250 for IN21k 
seeds = 5

# Row order of the final table. Resolved before any file is read so that an
# unsupported dataset fails immediately instead of with a NameError on `new_idx`.
if 'CIFAR' in dataset:
    new_idx = [
        'ResNet-50',
        'ResNet-110',
        'WRN',
        'DenseNet',
        'clip-vit-base-patch32',
        'clip-vit-base-patch16',
        'clip-vit-large-patch14']
elif dataset == 'ImageNet':
    new_idx = [
        'ResNet-18','ResNet-34','ResNet-50','ResNet-101','EffNet-B7','EffNetV2-S','EffNetV2-M','EffNetV2-L','ConvNeXt-T','ConvNeXt-S','ConvNeXt-B','ConvNeXt-L',
        'ViT-B/32','ViT-B/16','ViT-L/32','ViT-L/16','ViT-H/14','Swin-T','Swin-S','Swin-B','SwinV2-T','SwinV2-S','SwinV2-B', 'clip-vit-base-patch32', 'clip-vit-base-patch16', 'clip-vit-large-patch14']
elif dataset == 'ImageNet21k':
    new_idx = ['mobilenetv3_large_100_miil_in21k', 'vit_base_patch16_224_miil_in21k']
elif dataset in ['amazon_food', 'dynasent', 'mnli', 'yahoo_answers_topics']:
    new_idx = ['t5', 't5-large', 'roberta', 'roberta-large']
else:
    raise ValueError(f'No model ordering defined for dataset {dataset!r}')

# The HB variant (eqmass vs. eqsize) is selected on the reported metric itself
# for the metrics listed here, and on ECE_15 for every other metric.
if metric in ['AdaECE_15', 'ECE_100', 'AdaECE_100', 'Brier_top']:
    hb_selection_metric = metric
else:
    hb_selection_metric = 'ECE_15'

# Frames are collected in a list and concatenated once instead of growing a
# DataFrame inside the loop.
seed_frames = []
for seed in range(seeds):
    if 'ImageNet' in dataset:
        df_s = pd.read_csv(f'../results/benchmark_calibration_{dataset}_calibSize{valid_size_IN}_seed{seed}.csv')
    else:
        df_s = pd.read_csv(f'../results/benchmark_calibration_{dataset}_seed{seed}.csv')
    # choose best of eqsize and eqmass for HB
    hb_rows = []
    for m in df_s['model'].unique():
        hb_tva_eqmass = df_s[(df_s['model'] == m) & (df_s['method'] == 'netcal_HB_tva_eqmass')]
        hb_tva_eqsize = df_s[(df_s['model'] == m) & (df_s['method'] == 'netcal_HB_tva_eqsize')]
        # .item() requires exactly one row per (model, method) in the file.
        if hb_tva_eqmass[hb_selection_metric].item() < hb_tva_eqsize[hb_selection_metric].item():
            hb_tva = hb_tva_eqmass.copy()
        else:
            hb_tva = hb_tva_eqsize.copy()
        hb_tva['method'] = 'netcal_HB_tva'
        hb_rows.append(hb_tva)
    seed_frames.append(pd.concat([df_s] + hb_rows, axis=0))
df = pd.concat(seed_frames, axis=0)


# choose dataset & metric
df_metric = pd.pivot_table(df, index=['dataset', 'model'], columns=['method'], values=[metric])[metric].reindex(columns=df['method'].unique())
if metric != 'Accuracy':
    df_metric = df_metric * 100 # in %
df_metric = df_metric.loc[dataset]

df_metric = df_metric[columns]

# Put the models in table order, then swap raw identifiers for display names.
df_metric = df_metric.reindex(index=new_idx)
for raw_name, display_name in [
        ('clip-vit-base-patch32', 'CLIP (ViT-B/32)'),
        ('clip-vit-base-patch16', 'CLIP (ViT-B/16)'),
        ('clip-vit-large-patch14', 'CLIP (ViT-L/14)'),
        ('vit_base_patch16_224_miil_in21k', 'ViT-B/16'),
        ('mobilenetv3_large_100_miil_in21k', 'MN3'),
        ('t5', 'T5'),
        ('roberta', 'RoBERTa')]:
    df_metric.index = df_metric.index.map(lambda x: x.replace(raw_name, display_name))

# `DataFrame.map` replaces the deprecated `applymap` and matches the other
# table cells of this notebook; rounding makes all close values count as min/max.
df_metric = df_metric.map(lambda x: float(f'{x:.2f}'))

s = df_metric.style
if metric == 'Accuracy':
    s = s.format('{:.2f}').highlight_max(axis=1, props="textbf:--rwrap;") # float format
else:
    s = s.format('{:.2f}').highlight_min(axis=1, props="textbf:--rwrap;") # float format
print(s.to_latex().replace('nan', 'err.'))

# AUROC

In [None]:
# AUROC table: mean over seeds per model and method, printed as LaTeX with each
# value coloured by how it compares with the 'original' column.
# Relies on `columns` defined in an earlier cell.
metric = 'AUROC'
dataset = 'yahoo_answers_topics' # 'CIFAR10', 'CIFAR100', 'ImageNet', 'ImageNet21k' amazon_food, dynasent, mnli, yahoo_answers_topics
valid_size_IN = 261250 # 25000 for IN, 261250 for IN21k 
seeds = 5

# Frames are collected in a list and concatenated once instead of growing a
# DataFrame inside the loop.
seed_frames = []
for seed in range(seeds):
    if 'ImageNet' in dataset:
        df_s = pd.read_csv(f'../results/benchmark_calibration_{dataset}_calibSize{valid_size_IN}_seed{seed}.csv')
    else:
        df_s = pd.read_csv(f'../results/benchmark_calibration_{dataset}_seed{seed}.csv')
    # choose best of eqsize and eqmass for HB
    hb_rows = []
    for m in df_s['model'].unique():
        hb_tva_eqmass = df_s[(df_s['model'] == m) & (df_s['method'] == 'netcal_HB_tva_eqmass')]
        hb_tva_eqsize = df_s[(df_s['model'] == m) & (df_s['method'] == 'netcal_HB_tva_eqsize')]
        # .item() requires exactly one row per (model, method) in the file.
        if hb_tva_eqmass['ECE_15'].item() < hb_tva_eqsize['ECE_15'].item():
            hb_tva = hb_tva_eqmass.copy()
        else:
            hb_tva = hb_tva_eqsize.copy()
        hb_tva['method'] = 'netcal_HB_tva'
        hb_rows.append(hb_tva)
    seed_frames.append(pd.concat([df_s] + hb_rows, axis=0))
df = pd.concat(seed_frames, axis=0)

# get mean, std values across seeds
# Group on the identifying columns only. The synthetic 'netcal_HB_tva' row keeps
# the index label of whichever variant it was copied from, so using the row
# index as an extra grouping key splits that method into several groups as soon
# as two seeds pick different variants.
per_method = df.groupby(['dataset', 'model', 'method'])
df_std = per_method.std().reset_index()
df = per_method.mean().reset_index()


# choose dataset & metric
df_metric = pd.pivot_table(df, index=['dataset', 'model'], columns=['method'], values=[metric])[metric].reindex(columns=df['method'].unique())
if metric != 'Accuracy':
    df_metric = df_metric * 100 # in %
df_metric = df_metric.loc[dataset]


df_metric = df_metric[columns]

if 'CIFAR' in dataset:
    new_idx = [
        'ResNet-50',
        'ResNet-110',
        'WRN',
        'DenseNet',
        'clip-vit-base-patch32',
        'clip-vit-base-patch16',
        'clip-vit-large-patch14']
elif dataset == 'ImageNet':
    new_idx = [
        'ResNet-18','ResNet-34','ResNet-50','ResNet-101','EffNet-B7','EffNetV2-S','EffNetV2-M','EffNetV2-L','ConvNeXt-T','ConvNeXt-S','ConvNeXt-B','ConvNeXt-L',
        'ViT-B/32','ViT-B/16','ViT-L/32','ViT-L/16','ViT-H/14','Swin-T','Swin-S','Swin-B','SwinV2-T','SwinV2-S','SwinV2-B', 'clip-vit-base-patch32', 'clip-vit-base-patch16', 'clip-vit-large-patch14']
elif dataset == 'ImageNet21k':
    new_idx = ['mobilenetv3_large_100_miil_in21k', 'vit_base_patch16_224_miil_in21k']
elif dataset in ['amazon_food', 'dynasent', 'mnli', 'yahoo_answers_topics']:
    new_idx = ['t5', 't5-large', 'roberta', 'roberta-large']
else:
    raise ValueError(f'No model ordering defined for dataset {dataset!r}')

# Put the models in table order, then swap raw identifiers for display names.
df_metric = df_metric.reindex(index=new_idx)
for raw_name, display_name in [
        ('clip-vit-base-patch32', 'CLIP (ViT-B/32)'),
        ('clip-vit-base-patch16', 'CLIP (ViT-B/16)'),
        ('clip-vit-large-patch14', 'CLIP (ViT-L/14)'),
        ('vit_base_patch16_224_miil_in21k', 'ViT-B/16'),
        ('mobilenetv3_large_100_miil_in21k', 'MN3'),
        ('t5', 'T5'),
        ('roberta', 'RoBERTa')]:
    df_metric.index = df_metric.index.map(lambda x: x.replace(raw_name, display_name))

# Cast to object first: the cells are overwritten with LaTeX strings below, and
# writing strings into float64 columns is an incompatible-dtype assignment.
df_metric_color = df_metric.astype(object)
for col in columns[1:]:
    # Comparisons are made on the numeric table; rows where the comparison
    # involves NaN match neither mask and keep their NaN.
    at_least_original = df_metric[col] >= df_metric['original']
    below_original = df_metric[col] < df_metric['original']
    df_metric_color.loc[at_least_original, col] = df_metric.loc[at_least_original, col].apply(lambda x: rf'\textcolor{{blue}}{{{x:.2f}}}')
    df_metric_color.loc[below_original, col] = df_metric.loc[below_original, col].apply(lambda x: rf'\textcolor{{orange}}{{{x:.2f}}}')
df_metric_color['original'] = df_metric_color['original'].apply(lambda x: f'{x:.2f}')

s = df_metric_color.style
print(s.to_latex().replace('nan', 'err.'))

# Underconfidence

In [None]:
# Average-confidence table: mean over seeds per model and method, next to the
# accuracy of the 'original' method, printed as LaTeX with each confidence
# coloured by whether it lies above or below that accuracy.
# Relies on `columns` defined in an earlier cell.
metric = 'Average_Confidence'
dataset = 'yahoo_answers_topics' # 'CIFAR10', 'CIFAR100', 'ImageNet', 'ImageNet21k' amazon_food, dynasent, mnli, yahoo_answers_topics
valid_size_IN = 261250 # 25000 for IN, 261250 for IN21k 
seeds = 5

# Frames are collected in a list and concatenated once instead of growing a
# DataFrame inside the loop.
seed_frames = []
for seed in range(seeds):
    if 'ImageNet' in dataset:
        df_s = pd.read_csv(f'../results/benchmark_calibration_{dataset}_calibSize{valid_size_IN}_seed{seed}.csv')
    else:
        df_s = pd.read_csv(f'../results/benchmark_calibration_{dataset}_seed{seed}.csv')
    # choose best of eqsize and eqmass for HB
    hb_rows = []
    for m in df_s['model'].unique():
        hb_tva_eqmass = df_s[(df_s['model'] == m) & (df_s['method'] == 'netcal_HB_tva_eqmass')]
        hb_tva_eqsize = df_s[(df_s['model'] == m) & (df_s['method'] == 'netcal_HB_tva_eqsize')]
        # .item() requires exactly one row per (model, method) in the file.
        if hb_tva_eqmass['ECE_15'].item() < hb_tva_eqsize['ECE_15'].item():
            hb_tva = hb_tva_eqmass.copy()
        else:
            hb_tva = hb_tva_eqsize.copy()
        hb_tva['method'] = 'netcal_HB_tva'
        hb_rows.append(hb_tva)
    seed_frames.append(pd.concat([df_s] + hb_rows, axis=0))
df = pd.concat(seed_frames, axis=0)

# get mean, std values across seeds
# Group on the identifying columns only. The synthetic 'netcal_HB_tva' row keeps
# the index label of whichever variant it was copied from, so using the row
# index as an extra grouping key splits that method into several groups as soon
# as two seeds pick different variants.
per_method = df.groupby(['dataset', 'model', 'method'])
df_std = per_method.std().reset_index()
df = per_method.mean().reset_index()


# choose dataset & metric
df_metric = pd.pivot_table(df, index=['dataset', 'model'], columns=['method'], values=[metric])[metric].reindex(columns=df['method'].unique())
if metric != 'Accuracy':
    df_metric = df_metric * 100 # in %
df_metric = df_metric.loc[dataset]

# Attach the accuracy of the uncalibrated model. The Series is indexed by model
# name so that pandas aligns it with the rows of `df_metric` by label; a plain
# list would silently depend on both tables having the same row order and length.
is_original = (df['dataset'] == dataset) & (df['method'] == 'original')
df_metric['Accuracy'] = df.loc[is_original].set_index('model')['Accuracy']
df_metric = df_metric[['Accuracy']+columns]

if 'CIFAR' in dataset:
    new_idx = [
        'ResNet-50',
        'ResNet-110',
        'WRN',
        'DenseNet',
        'clip-vit-base-patch32',
        'clip-vit-base-patch16',
        'clip-vit-large-patch14']
elif dataset == 'ImageNet':
    new_idx = [
        'ResNet-18','ResNet-34','ResNet-50','ResNet-101','EffNet-B7','EffNetV2-S','EffNetV2-M','EffNetV2-L','ConvNeXt-T','ConvNeXt-S','ConvNeXt-B','ConvNeXt-L',
        'ViT-B/32','ViT-B/16','ViT-L/32','ViT-L/16','ViT-H/14','Swin-T','Swin-S','Swin-B','SwinV2-T','SwinV2-S','SwinV2-B', 'clip-vit-base-patch32', 'clip-vit-base-patch16', 'clip-vit-large-patch14']
elif dataset == 'ImageNet21k':
    new_idx = ['mobilenetv3_large_100_miil_in21k', 'vit_base_patch16_224_miil_in21k']
elif dataset in ['amazon_food', 'dynasent', 'mnli', 'yahoo_answers_topics']:
    new_idx = ['t5', 't5-large', 'roberta', 'roberta-large']
else:
    raise ValueError(f'No model ordering defined for dataset {dataset!r}')

# Put the models in table order, then swap raw identifiers for display names.
df_metric = df_metric.reindex(index=new_idx)
for raw_name, display_name in [
        ('clip-vit-base-patch32', 'CLIP (ViT-B/32)'),
        ('clip-vit-base-patch16', 'CLIP (ViT-B/16)'),
        ('clip-vit-large-patch14', 'CLIP (ViT-L/14)'),
        ('vit_base_patch16_224_miil_in21k', 'ViT-B/16'),
        ('mobilenetv3_large_100_miil_in21k', 'MN3'),
        ('t5', 'T5'),
        ('roberta', 'RoBERTa')]:
    df_metric.index = df_metric.index.map(lambda x: x.replace(raw_name, display_name))

# Cast to object first: the cells are overwritten with LaTeX strings below, and
# writing strings into float64 columns is an incompatible-dtype assignment.
df_metric_color = df_metric.astype(object)
for col in columns:
    # Comparisons are made on the numeric table; values equal to the accuracy
    # (or NaN) match neither mask and are left untouched.
    above_accuracy = df_metric[col] > df_metric['Accuracy']
    below_accuracy = df_metric[col] < df_metric['Accuracy']
    df_metric_color.loc[above_accuracy, col] = df_metric.loc[above_accuracy, col].apply(lambda x: f'\\textcolor{{violet}}{{{x:.1f}}}')
    df_metric_color.loc[below_accuracy, col] = df_metric.loc[below_accuracy, col].apply(lambda x: f'\\textcolor{{brown}}{{{x:.1f}}}')
df_metric_color['Accuracy'] = df_metric_color['Accuracy'].apply(lambda x: f'{x:.1f}')

s = df_metric_color.style
# s = s.format('{:.1f}') # float format
print(s.to_latex().replace('nan', 'err.'))

# Calib size

In [None]:
# Plot `metric` against the calibration set size for one model, comparing each
# method in `methods` (mean over seeds with a +/- one std band), and export the
# figure with tikzplotlib.
model = 'ResNet-101'
methods = ['TS', 'TS_tva', 'VS', 'VS_reg_tva', 'Dir-ODIR', 'Dir-ODIR_reg_tva']
metric = 'ECE_15'
dataset = 'ImageNet'
load_many_seeds = True


if load_many_seeds:
    # Start from an empty frame and add one results file per (seed, size).
    loaded_frames = [pd.DataFrame()]
    for seed in range(5):
        if dataset != 'ImageNet':
            continue
        for valid_size_IN in [5000, 10000, 15000, 20000, 25000]:
            df_s = pd.read_csv(f'../results/benchmark_calibration_ImageNet_calibSize{valid_size_IN}_seed{seed}_final.csv')
            loaded_frames.append(df_s)
    df = pd.concat(loaded_frames, axis=0)

    # get mean, std values across seeds
    # (the unnamed row index is the fifth grouping key and shows up as 'level_4')
    seed_groups = df.groupby(['dataset', 'model', 'method', 'valid_size', df.index])
    df_std = seed_groups.std().reset_index().drop(columns='level_4')
    df = seed_groups.mean().reset_index().drop(columns='level_4')


def _metric_by_calib_size(long_table):
    """Pivot `long_table` to rows = valid_size, columns = method for the chosen model."""
    wide = pd.pivot_table(long_table, index=['dataset', 'model', 'valid_size'], columns=['method'], values=[metric])[metric]
    if metric != 'Accuracy':
        wide = wide * 100 # in %
    return wide.loc[(dataset, model)]


# choose metric & model
df_metric = _metric_by_calib_size(df)
df_metric_std = _metric_by_calib_size(df_std)

# Legend text per method.
labels = {
    'TS': 'TS',
    'TS_tva': r'TS\textsubscript{TvA}',
    'VS': 'VS',
    'VS_reg_tva': r'VS\textsubscript{reg\_TvA}',
    'Dir-ODIR': 'DC',
    'Dir-ODIR_reg_tva': r'DC\textsubscript{reg\_TvA}'}

plt.figure()
for i, method in enumerate(methods):
    # consecutive pairs in `methods` share a colour; `*tva*` methods are drawn solid
    pair_color = f'C{i // 2}'
    ls = '--' if 'tva' not in method else '-'
    center = df_metric[method]
    spread = df_metric_std[method]
    plt.plot(df_metric.index, center, color=pair_color, ls=ls, label=labels[method])
    plt.fill_between(df_metric.index, center - spread, center + spread, color=pair_color, alpha=0.2)
plt.legend()
plt.xlabel('calibration set size')
plt.ylabel('ECE test [%]')
tikzplotlib.save('calib_size_scaling.tikz')

In [None]:
# Same plot as for the scaling methods, but for the methods listed below:
# `metric` against the calibration set size for one model (mean over seeds with
# a +/- one std band), exported with tikzplotlib.
model = 'ResNet-101'
methods = [
    'Patel2021_sCW_imax',
    'netcal_HB_eqsize', 'netcal_HB_tva_eqmass',
    'netcal_Iso', 'netcal_Iso_tva',
    #    'netcal_Beta', 'netcal_Beta_tva',
    'netcal_BBQ', 'netcal_BBQ_tva']

metric = 'ECE_15'
dataset = 'ImageNet'
load_many_seeds = True


if load_many_seeds:
    df = pd.DataFrame()
    calib_sizes = [5000, 10000, 15000, 20000, 25000]
    for seed in range(5):
        if dataset == 'ImageNet':
            # one results file per calibration set size
            for valid_size_IN in calib_sizes:
                results_path = f'../results/benchmark_calibration_ImageNet_calibSize{valid_size_IN}_seed{seed}_final.csv'
                df_s = pd.read_csv(results_path)
                df = pd.concat([df, df_s], axis=0)

    # get mean, std values across seeds
    # (the unnamed row index is the fifth grouping key and shows up as 'level_4')
    group_keys = ['dataset', 'model', 'method', 'valid_size', df.index]
    df_std = df.groupby(group_keys).std().reset_index().drop(columns='level_4')
    df = df.groupby(group_keys).mean().reset_index().drop(columns='level_4')

# choose metric & model
pivot_index = ['dataset', 'model', 'valid_size']
scale = 100 if metric != 'Accuracy' else None # in %

df_metric = pd.pivot_table(df, index=pivot_index, columns=['method'], values=[metric])[metric]
if scale is not None:
    df_metric = df_metric * scale
df_metric = df_metric.loc[(dataset, model)]

df_metric_std = pd.pivot_table(df_std, index=pivot_index, columns=['method'], values=[metric])[metric]
if scale is not None:
    df_metric_std = df_metric_std * scale
df_metric_std = df_metric_std.loc[(dataset, model)]

# Legend text per method.
labels = {
    'netcal_HB_eqsize': 'HB',
    'netcal_HB_tva_eqmass': r'HB\textsubscript{TvA}',
    'netcal_Iso': 'Iso',
    'netcal_Iso_tva': r'Iso\textsubscript{TvA}',
    'netcal_Beta': 'Beta',
    'netcal_Beta_tva': r'Beta\textsubscript{TvA}',
    'netcal_BBQ': 'BBQ',
    'netcal_BBQ_tva': r'BBQ\textsubscript{TvA}',
    'Patel2021_sCW_imax': 'I-Max'}

plt.figure()
for i, method in enumerate(methods):
    # `*tva*` methods are drawn solid, everything else (including I-Max) dashed
    ls = '-' if 'tva' in method else '--'
    # I-Max is black; the remaining methods share one colour per consecutive
    # pair, hence the shift by one position.
    color = 'k' if method == 'Patel2021_sCW_imax' else f'C{(i-1) // 2}'
    lower_band = df_metric[method] - df_metric_std[method]
    upper_band = df_metric[method] + df_metric_std[method]
    plt.plot(df_metric.index, df_metric[method], color=color, ls=ls, label=labels[method])
    plt.fill_between(df_metric.index, lower_band, upper_band, color=color, alpha=0.2)
plt.xlabel('calibration set size')
plt.ylabel('ECE test [%]')
# # reorder legend (does not work with tikz)
# handles, labels = plt.gca().get_legend_handles_labels()
# labels = [labels[5], labels[1], labels[3], labels[6], labels[2], labels[4], labels[0]]
# handles = [handles[5], handles[1], handles[3], handles[6], handles[2], handles[4], handles[0]]
# plt.legend(handles, labels)
plt.legend()

tikzplotlib.save('calib_size_binary.tikz')

# Compute time

In [None]:
# Load the execution-time results of a single run and pivot them to
# rows = models, columns = methods for the selected dataset.
dataset = 'ImageNet' # 'CIFAR10', 'CIFAR100', 'ImageNet'
valid_size_IN = 25000


# only load single file
compute_time_csv = f'../results/benchmark_calibration_{dataset}_calibSize{valid_size_IN}_seed0_computeTime.csv'
df = pd.read_csv(compute_time_csv)

# harmonise the spelling of this model name
df = df.replace({'Effnet-B7': 'EffNet-B7'})

# choose dataset & metric
time_table = pd.pivot_table(
    df,
    index=['dataset', 'model'],
    columns=['method'],
    values=['execution_time'],
)
df_metric = time_table['execution_time'].loc[dataset]

df_metric

In [None]:
# Arrange the execution-time table (column and row order) and print it as LaTeX.

# Maps each `*_tva` method (key) to the reference method it is compared with (value).
dict_ref = {
    'TS_tva': 'TS',
    'VS_reg_tva': 'VS',
    'Dir-ODIR_reg_tva': 'Dir-ODIR',
    'netcal_Beta_tva': 'netcal_Beta',
    'netcal_Iso_tva': 'netcal_Iso',
    'netcal_BBQ_tva': 'netcal_BBQ',
    'netcal_HB_tva': 'netcal_HB_eqsize',
}

# Stand-alone methods first, then each reference method directly followed by
# its `*_tva` counterpart.
columns = ['original', 'Patel2021_sCW_imax'] + [
    method_name
    for method_tva, method_ref in dict_ref.items()
    for method_name in (method_ref, method_tva)
]
df_metric = df_metric[columns]

# Models shown in the table, in display order.
if 'CIFAR' in dataset:
    new_idx = ['ResNet-50', 'ResNet-110', 'WRN', 'DenseNet']
elif dataset == 'ImageNet':
    new_idx = ['ResNet-50', 'ViT-B/16']
df_metric = df_metric.reindex(index=new_idx)

# values are printed without decimals
s = df_metric.style.format('{:.0f}')
print(s.to_latex())