# Synthetic Experiments with Multiple Sources in Each Batch

In [1]:
output_path = './outputs/graphs/'
results_path = './outputs/synthetic_results/'

In [2]:
import json
import os
import pandas as pd
import numpy as np
import graph_code

In [3]:
results_list = []
for file in [
    f for f in os.listdir(results_path) if f.endswith("json")
]:
    with open(os.path.join(results_path, file)) as f:
        results_temp = json.load(f)
    results_list.append(results_temp)

In [4]:
results = {}

In [5]:
for results_dict in results_list:
    for dataset in results_dict:
        if dataset not in results:
            results[dataset] = {}
        for corruption_type in results_dict[dataset]:
            if corruption_type not in results[dataset]:
                results[dataset][corruption_type] = {}

            for run in results_dict[dataset][corruption_type]:
                if run not in results[dataset][corruption_type]:
                    results[dataset][corruption_type][run] = {}

                for depression in results_dict[dataset][corruption_type][run]:

                    results[dataset][corruption_type][run][depression] = [
                        dict(epoch=int(epoch), **metrics) 
                        for epoch, metrics in results_dict[dataset][corruption_type][run][depression].items() 
                        if epoch != 'corrupt_sources'
                    ]

In [6]:
results_df = []

for dataset in results:
    for corruption_type in results[dataset]:
        for run in results[dataset][corruption_type]:
            for method in results[dataset][corruption_type][run]:

                if "lap" in method:
                    depression_strength = float(method.split("dep_")[1].split("_")[0])
                    leniency = float(method.split("len_")[1].split("-")[0])
                    history_length = float(method.split("his_")[1].split("-")[0])
                    
                    if depression_strength < 0.5: 
                        continue
                    # if leniency < 0.5:
                    #     continue

                results_df.append(
                    pd.json_normalize(
                        results[dataset][corruption_type][run][method]
                    )
                    .assign(
                        dataset=dataset,
                        corruption_type=corruption_type,
                        run=run,
                        method=method,
                    )
                    .assign(epoch=lambda x: x['epoch']+1)
                )

results_df = pd.concat(results_df)
results_df = (
    results_df
    .astype(
        {
            "run": "int64",
        }
    )
)
results_df = results_df.assign(run=lambda x: x['run']-1)

work out what to do here as greatest val acc is bad test acc

In [7]:
corruption_order = ['no_c', 'c_cs', 'c_rl', 'c_lbs', 'c_lbf', 'c_ns', 'c_no']

In [8]:
### choosing the c_rl model on the val set

lap_val_accuracy_results_df = (
    results_df
    .loc[lambda x: x['method'].str.contains('lap')]
    .loc[lambda x: x['method'].str.contains('val')]
    [[
        "dataset", "corruption_type", "run", "method", "epoch", "val_top1acc", "val_top5acc"
    ]]
    # if cifar10 and fmnist, choose val_top1acc and if cifar100 choose val_top5acc
    .assign(
        accuracy = lambda x: np.where(
            x['dataset'].isin(['cifar10', 'fmnist']),
            x['val_top1acc'],
            x['val_top5acc']
        )
    )
    [
        ['dataset', 'corruption_type', 'run', 'method', 'epoch', 'accuracy']
    ]
    .melt(
        id_vars=['dataset', 'corruption_type', 'run', 'method', 'epoch'],
        var_name='metric',
        value_name='value'
    )
)

lap_test_accuracy_results_df = (
    results_df
    .loc[lambda x: x['method'].str.contains('lap')]
    .loc[lambda x: x['method'].str.contains('test')]
    [[
        "dataset", "corruption_type", "run", "method", "epoch", "test_top1acc", "test_top5acc"
    ]]
    # if cifar10 and fmnist, choose test_top1acc and if cifar100 choose test_top5acc
    .assign(
        accuracy = lambda x: np.where(
            x['dataset'].isin(['cifar10', 'fmnist']),
            x['test_top1acc'],
            x['test_top5acc']
        )
    )
    [
        ['dataset', 'corruption_type', 'run', 'method', 'epoch', 'accuracy']
    ]
    .melt(
        id_vars=['dataset', 'corruption_type', 'run', 'method', 'epoch'],
        var_name='metric',
        value_name='value'
    )
)

def find_best(df):
    best_idx = (
        df
        .groupby(['dataset', 'method'])
        ['value']
        .apply(lambda x: x.mean() - x.std())
        .groupby(['dataset', ])
        .idxmax()
        .values
    )
    return best_idx

lap_best_idx = (
    lap_val_accuracy_results_df
    .loc[lambda x: x['corruption_type'] == "c_rl"]
    .loc[lambda x: x['method'].str.contains("lap")]
    .loc[lambda x: x['method'].str.contains("val")]
    .assign(value = lambda x: x['value']*100)
    .groupby(['dataset', 'method', 'run'])
    ['value']
    .max()
    .reset_index()
    .pipe(
        find_best
    )
)

lap_best = (
    lap_test_accuracy_results_df
    .set_index(['dataset', 'method'])
    .loc[[(d, m.replace("val", "test")) for d, m in lap_best_idx]]
    .reset_index()
    .assign(method = 'lap')
    .assign(value = lambda x: x['value']*100)
    .groupby(['corruption_type', 'dataset', 'method', 'run'])
    ['value']
    .max()
    .reset_index()
    .groupby(['corruption_type', 'dataset', 'method'])
    ['value']
    .agg(['mean', 'std'])
    .assign(
        mean_std = lambda x: 
            np.round(x['mean'].astype(float), 2).astype(str) 
            + " ± "
            + np.round(x['std'].astype(float), 2).astype(str),
    )
    ['mean_std']
    .unstack(0)
    .reset_index()
)
lap_best[["dataset", "method"] + corruption_order]

corruption_type,dataset,method,no_c,c_cs,c_rl,c_lbs,c_lbf,c_ns,c_no
0,cifar10,lap,77.52 ± 0.9,74.29 ± 1.43,73.44 ± 0.9,73.94 ± 0.33,73.85 ± 0.92,72.63 ± 1.24,73.84 ± 0.55
1,cifar100,lap,76.01 ± 0.54,70.41 ± 0.96,69.05 ± 0.38,69.31 ± 1.4,69.82 ± 0.78,67.69 ± 1.17,68.71 ± 1.22
2,fmnist,lap,83.52 ± 0.21,82.18 ± 1.26,81.8 ± 1.37,82.45 ± 0.38,79.87 ± 0.7,78.72 ± 2.02,82.94 ± 0.31


In [9]:
lap_best_idx = lap_best_idx.tolist() + [(d, m.replace("val", "test")) for d, m in lap_best_idx]

In [10]:
lap_best_idx

[('cifar10', 'lap-len_1.0-his_50-dep_1.0_val'),
 ('cifar100', 'lap-len_1.0-his_25-dep_0.5_val'),
 ('fmnist', 'lap-len_0.5-his_25-dep_0.5_val'),
 ('cifar10', 'lap-len_1.0-his_50-dep_1.0_test'),
 ('cifar100', 'lap-len_1.0-his_25-dep_0.5_test'),
 ('fmnist', 'lap-len_0.5-his_25-dep_0.5_test')]

In [11]:
results_df = pd.concat([
    results_df.loc[lambda x: x['method'] == "standard"],
    (
        results_df
        .set_index(['dataset', 'method'])
        .loc[lap_best_idx]
        .reset_index()
        .assign(method = 'lap')
    )
])

In [12]:
tags = [
    'weighted_average_test_results_accuracy', 
    'weighted_average_test_results_top_5_acc'
]

results_fed_arfl_df = []

for d in ['cifar10', 'cifar100', 'fmnist']:
    fed_arfl_folder = os.path.join(results_path, 'baseline', 'arfl', d)
    tbl = graph_code.TensorboardLoad(fed_arfl_folder, level=1)
    fed_arfl_results = tbl.scalars(tags=tags)[1]

    fed_arfl_results = (fed_arfl_results
    .drop(['run', 'level_0', 'level_1', 'level_2', 'level_3', 'level_4', 'level_5'], axis=1)
    .rename(
        {'level_6': 'Run', 'value': 'Value', 'step': 'Step', 'tag': 'Metric'},
        axis=1)
    )

    fed_arfl_results['corruption_type'] = fed_arfl_results['Run'].str.split('-').str[1]
    fed_arfl_results['run'] = (
        fed_arfl_results['Run'].str.split('-').str[2].str.split('_').str[1].astype(int) - 1
    )
    fed_arfl_results['method'] = 'ARFL'
    fed_arfl_results['epoch'] = fed_arfl_results['Step']
    fed_arfl_results['dataset'] = d
    fed_arfl_results = fed_arfl_results[[
         'dataset', 'method', 'corruption_type', 'run', 'Value', 'Metric', 'epoch'
    ]]
    fed_arfl_results = fed_arfl_results.pivot(
        index=['dataset', 'method', 'corruption_type', 'run', 'epoch'], 
        columns='Metric', 
        values='Value'
    ).reset_index()
    fed_arfl_results.columns.name = None
    fed_arfl_results = fed_arfl_results.rename(
        columns={
            'weighted_average_test_results_accuracy': 'test_top1acc',
            'weighted_average_test_results_top_5_acc': 'test_top5acc'
        }
    )
    if d in ['cifar10', 'cifar100']:
        fed_arfl_results = fed_arfl_results.query("epoch <= 15040").query("epoch > 0")


    results_fed_arfl_df.append(fed_arfl_results)

results_fed_arfl_df = pd.concat(results_fed_arfl_df)

Loading Files: 100%|▉▉▉▉▉▉▉▉▉▉| 35/35 [00:00<00:00, 73.03it/s]
Loading Files: 100%|▉▉▉▉▉▉▉▉▉▉| 35/35 [00:00<00:00, 72.35it/s] 
Loading Files: 100%|▉▉▉▉▉▉▉▉▉▉| 36/36 [00:00<00:00, 119.55it/s]


In [None]:
with open(os.path.join(results_path, "baseline", "co-teaching", "results.json")) as f:
    results_cot = json.load(f)


with open(os.path.join(results_path, "baseline", "idpa", "results.json")) as f:
    results_idpa = json.load(f)


with open(os.path.join(results_path, "baseline", "nls", "results.json")) as f:
    results_nls = json.load(f)


with open(os.path.join(results_path, "baseline", "cdr", "results.json")) as f:
    results_cdr = json.load(f)

In [14]:
results_cot_df = []

for dataset in results_cot:
    for corruption_type in results_cot[dataset]:
        for run in results_cot[dataset][corruption_type]:
            results_cot_df.append(
                pd.DataFrame(
                    results_cot[dataset][corruption_type][run]
                )
                .T
                .reset_index()
                .rename(columns={"index": "epoch"})
                .assign(
                    dataset=dataset,
                    corruption_type=corruption_type,
                    run=run,
                )
                .assign(epoch=lambda x: x['epoch'].astype(int)+1)
            )

results_cot_df = pd.concat(results_cot_df)
results_cot_df = (
    results_cot_df
    .astype(
        {
            "run": "int64",
        }
    )
)

In [15]:
results_idpa_df = []

for dataset in results_idpa:
    for corruption_type in results_idpa[dataset]:
        for run in results_idpa[dataset][corruption_type]:
            results_idpa_df.append(
                pd.DataFrame(
                    results_idpa[dataset][corruption_type][run]
                )
                .T
                .reset_index()
                .rename(columns={"index": "epoch"})
                .assign(
                    dataset=dataset,
                    corruption_type=corruption_type,
                    run=run,
                )
                .assign(epoch=lambda x: x['epoch'].astype(int)+1)
            )

results_idpa_df = pd.concat(results_idpa_df)
results_idpa_df = (
    results_idpa_df
    .astype(
        {
            "run": "int64",
        }
    )
)

In [16]:
results_nls_df = []

for dataset in results_nls:
    for corruption_type in results_nls[dataset]:
        for run in results_nls[dataset][corruption_type]:
            results_nls_df.append(
                pd.DataFrame(
                    {k: v for k,v in results_nls[dataset][corruption_type][run].items() if k != 'corrupt_sources'}
                )
                .T
                .reset_index()
                .rename(columns={"index": "epoch"})
                .assign(
                    dataset=dataset,
                    corruption_type=corruption_type,
                    run=run,
                )
                .assign(epoch=lambda x: x['epoch'].astype(int)+1)
            )

results_nls_df = pd.concat(results_nls_df)
results_nls_df = (
    results_nls_df
    .astype(
        {
            "run": "int64",
        }
    )
)

In [17]:
results_cdr_df = []

for dataset in results_cdr:
    for corruption_type in results_cdr[dataset]:
        for run in results_cdr[dataset][corruption_type]:
            results_cdr_df.append(
                pd.DataFrame(
                    {k: v for k,v in results_cdr[dataset][corruption_type][run].items() if k != 'corrupt_sources'}
                )
                .T
                .reset_index()
                .rename(columns={"index": "epoch"})
                .assign(
                    dataset=dataset,
                    corruption_type=corruption_type,
                    run=run,
                )
                .assign(epoch=lambda x: x['epoch'].astype(int)+1)
            )

results_cdr_df = pd.concat(results_cdr_df)
results_cdr_df = (
    results_cdr_df
    .astype(
        {
            "run": "int64",
        }
    )
)

In [18]:
results_cdr_df = results_cdr_df.assign(method="CDR")
results_nls_df = results_nls_df.assign(method="Label Smoothing")
results_cot_df = results_cot_df.assign(method="Co-teaching")
results_idpa_df = results_idpa_df.assign(method="IDPA")
results_df = results_df.assign(
    method=lambda df: df['method'].map({"lap": "LAP", "standard": "Standard"})
)

percentage improvement results

In [19]:
columns_intersection = [
    'dataset', 'corruption_type', 'run', 'epoch', 
    'method', 'test_top1acc', 'test_top5acc'
]

results_all_perc_improvement = (
    pd.concat([
        results_df[columns_intersection].reset_index(drop=True),
        results_nls_df[columns_intersection].reset_index(drop=True),
        results_cdr_df[columns_intersection].reset_index(drop=True),
        results_cot_df[columns_intersection].reset_index(drop=True),
        results_idpa_df[columns_intersection].reset_index(drop=True),
        results_fed_arfl_df[columns_intersection].reset_index(drop=True)
    ])
    .assign(test_top1acc=lambda x: x['test_top1acc']*100)
    .assign(test_top5acc=lambda x: x['test_top5acc']*100)
    .groupby(['dataset', 'corruption_type', 'run', 'method'])
    .agg({'test_top1acc': 'max', 'test_top5acc': 'max'})
    .unstack()
    .reset_index()
    .set_index(['dataset', 'corruption_type', 'run'])
    .stack(0)
    .apply(
        lambda x: x[['ARFL', 'Co-teaching', 'IDPA', "CDR", "Label Smoothing", 'LAP']]/x['Standard']-1, 
        axis=1
    )
    .assign(Standard=lambda x: 0)
    .unstack(-1)
    .reset_index()
    .drop(columns='run')
    .groupby(['dataset', 'corruption_type'])
    .agg(['mean', 'std'])
    .stack(0)
    .stack(0)
    .assign(
        mean_std = lambda x: 
            (x['mean']*100).map("{:.2f}\%".format).astype(str)
            + " ± "
            + (x['std']*100).map("{:.2f}".format).astype(str)
    )
    .drop(columns=['mean', 'std'])
    .unstack()
    .unstack()

)
results_all_perc_improvement

  .drop(columns='run')
  .agg(['mean', 'std'])


Unnamed: 0_level_0,Unnamed: 1_level_0,mean_std,mean_std,mean_std,mean_std,mean_std,mean_std,mean_std,mean_std,mean_std,mean_std,mean_std,mean_std,mean_std,mean_std
Unnamed: 0_level_1,Unnamed: 1_level_1,test_top1acc,test_top1acc,test_top1acc,test_top1acc,test_top1acc,test_top1acc,test_top1acc,test_top5acc,test_top5acc,test_top5acc,test_top5acc,test_top5acc,test_top5acc,test_top5acc
Unnamed: 0_level_2,method,ARFL,CDR,Co-teaching,IDPA,LAP,Label Smoothing,Standard,ARFL,CDR,Co-teaching,IDPA,LAP,Label Smoothing,Standard
dataset,corruption_type,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3
cifar10,c_cs,-5.12\% ± 1.59,-2.89\% ± 3.05,3.68\% ± 1.50,0.77\% ± 0.49,3.02\% ± 2.22,-2.79\% ± 1.84,0.00\% ± 0.00,-0.54\% ± 0.15,-0.31\% ± 0.34,0.34\% ± 0.19,-0.30\% ± 0.11,0.27\% ± 0.27,-0.70\% ± 0.26,0.00\% ± 0.00
cifar10,c_lbf,-1.40\% ± 3.67,-0.77\% ± 3.30,6.06\% ± 1.05,3.77\% ± 2.10,10.04\% ± 0.85,1.90\% ± 3.59,0.00\% ± 0.00,1.83\% ± 0.72,0.37\% ± 0.45,2.62\% ± 0.41,1.61\% ± 0.51,3.16\% ± 0.20,0.31\% ± 0.78,0.00\% ± 0.00
cifar10,c_lbs,-2.31\% ± 3.95,-0.76\% ± 2.18,4.29\% ± 2.08,-0.80\% ± 1.68,7.59\% ± 2.57,-2.00\% ± 2.74,0.00\% ± 0.00,0.16\% ± 0.58,-0.20\% ± 0.27,0.85\% ± 0.19,-0.78\% ± 1.00,1.41\% ± 0.27,-0.49\% ± 0.39,0.00\% ± 0.00
cifar10,c_no,-7.92\% ± 2.26,-2.42\% ± 1.50,1.50\% ± 0.42,-1.12\% ± 2.70,1.46\% ± 1.63,0.26\% ± 1.52,0.00\% ± 0.00,-1.16\% ± 0.53,-0.42\% ± 0.31,0.03\% ± 0.19,-0.62\% ± 0.37,-0.01\% ± 0.18,-0.66\% ± 0.49,0.00\% ± 0.00
cifar10,c_ns,-2.33\% ± 1.51,-2.72\% ± 2.79,2.94\% ± 0.91,1.02\% ± 2.23,3.43\% ± 2.41,-1.05\% ± 2.28,0.00\% ± 0.00,-0.14\% ± 0.45,-0.22\% ± 0.38,0.27\% ± 0.14,-0.03\% ± 0.32,0.39\% ± 0.24,-0.33\% ± 0.24,0.00\% ± 0.00
cifar10,c_rl,3.10\% ± 2.45,-3.22\% ± 3.44,1.15\% ± 1.43,-17.90\% ± 3.40,8.34\% ± 3.93,0.15\% ± 4.01,0.00\% ± 0.00,3.62\% ± 1.14,-0.75\% ± 1.44,1.74\% ± 1.34,-3.18\% ± 1.87,4.18\% ± 1.15,-0.01\% ± 1.13,0.00\% ± 0.00
cifar10,no_c,-3.76\% ± 2.40,-0.79\% ± 2.03,2.86\% ± 0.45,2.66\% ± 1.53,-0.38\% ± 1.53,0.84\% ± 1.23,0.00\% ± 0.00,-0.27\% ± 0.16,-0.09\% ± 0.24,0.06\% ± 0.10,0.03\% ± 0.08,-0.02\% ± 0.11,-0.35\% ± 0.24,0.00\% ± 0.00
cifar100,c_cs,-29.48\% ± 5.61,-0.71\% ± 4.11,3.41\% ± 1.35,-3.03\% ± 2.80,3.78\% ± 3.85,-1.82\% ± 1.57,0.00\% ± 0.00,-17.59\% ± 3.31,-0.29\% ± 1.84,1.06\% ± 1.15,-2.95\% ± 1.83,2.26\% ± 1.72,-1.03\% ± 1.33,0.00\% ± 0.00
cifar100,c_lbf,-16.04\% ± 9.31,-2.24\% ± 5.54,21.62\% ± 6.80,11.73\% ± 6.79,23.82\% ± 6.48,-3.03\% ± 6.69,0.00\% ± 0.00,-8.64\% ± 6.29,-3.03\% ± 2.58,11.57\% ± 2.15,4.41\% ± 3.94,12.84\% ± 2.60,-2.00\% ± 3.63,0.00\% ± 0.00
cifar100,c_lbs,-19.73\% ± 10.44,1.58\% ± 3.45,13.00\% ± 1.94,3.06\% ± 7.18,10.06\% ± 1.69,-0.29\% ± 3.32,0.00\% ± 0.00,-10.15\% ± 7.25,1.38\% ± 1.39,7.03\% ± 1.49,-0.68\% ± 3.85,6.96\% ± 0.93,-0.08\% ± 2.65,0.00\% ± 0.00


In [20]:
results_all_perc_improvement = pd.concat(
    [
        results_all_perc_improvement.loc["cifar10", ("mean_std", "test_top1acc")].assign(dataset="cifar10"),
        results_all_perc_improvement.loc["cifar100", ("mean_std", "test_top5acc")].assign(dataset="cifar100"),
        results_all_perc_improvement.loc["fmnist", ("mean_std", "test_top1acc")].assign(dataset="fmnist"),
    ]
).reset_index().set_index(["dataset", "corruption_type"])

In [21]:
def bold_max_value(x, model_names):
    x = x.copy()
    len_cols = x.shape[0]
    n_models = len(model_names)
    idx_bold = (
        x
        [-n_models:]
        .str.replace(" ", "")
        .str.replace("\%", "")
        .str.split("±")
        .str[0]
        .astype(float)
        .argmax()
    )
    max_val = x.iloc[idx_bold+len_cols-n_models]
    max_val, max_std = map(float, max_val.replace("\%", "").split("±"))
    #x.iloc[idx_bold+len_cols-n_models] = '\\textbf{' + x.iloc[idx_bold+len_cols-n_models] + '}'
    # also bold those that are within one std of the max

    for i in range(n_models+1):
        val = x.iloc[i]
        val, std = map(float, val.replace("\%", "").split("±"))
        if val >= max_val - max_std and val <= max_val + max_std:
            x.iloc[i] = '\\textbf{' + x.iloc[i] + '}'
    
    return x

dataset_map = {
    'cifar10': 'CIFAR-10',
    'cifar100': 'CIFAR-100',
    'fmnist': 'F-MNIST',
}


dataset_order = [
    'CIFAR-10',
    'CIFAR-100',
    'F-MNIST',
]

corruption_type_map = {
    "no_c": "Original Data",
    "c_cs": "Chunk Shuffle",
    "c_rl": "Random Label",
    "c_lbs": "Batch Label Shuffle",
    "c_lbf": "Batch Label Flip",
    "c_ns": "Added Noise",
    "c_no": "Replace With Noise",
}

corruption_type_order = [
    "Original Data",
    "Chunk Shuffle",
    "Random Label",
    "Batch Label Shuffle",
    "Batch Label Flip",
    "Added Noise",
    "Replace With Noise",
]



model_order = [
    "Standard",
    "ARFL",
    "IDPA",
    "Co-teaching",
    "CDR",
    "Label Smoothing",
    "LAP",
]



results_formatted = (
    results_all_perc_improvement
    .apply(
        bold_max_value,
        model_names = ["Co-teaching", "IDPA", "CDR", "Label Smoothing", "LAP", "Standard"],
        axis=1
    )
    .reset_index()
    .rename(columns={"corruption_type": "Corruption Type", "dataset": "Dataset"})
    .replace({'Dataset': dataset_map, "Corruption Type": corruption_type_map})
    .reset_index()
    .drop(columns=['index'])
    .rename_axis(index=None, columns=None)
    .set_index(['Dataset', 'Corruption Type'])
    .reindex(
        [
            (ds, ct)  for ds in dataset_order for ct in corruption_type_order
        ]
    )
    .reset_index()
    [
        ['Dataset', 'Corruption Type'] + model_order
    ]
    # change standard to "-" if not bold and \textbf{-} if bold
    .assign(
        Standard=lambda x: x['Standard'].apply(
            lambda y: '\\textbf{-}' if y != '0.00\% ± 0.00' else '-'
        )
    )
)

results_formatted

Unnamed: 0,Dataset,Corruption Type,Standard,ARFL,IDPA,Co-teaching,CDR,Label Smoothing,LAP
0,CIFAR-10,Original Data,-,-3.76\% ± 2.40,\textbf{2.66\% ± 1.53},\textbf{2.86\% ± 0.45},-0.79\% ± 2.03,0.84\% ± 1.23,-0.38\% ± 1.53
1,CIFAR-10,Chunk Shuffle,-,-5.12\% ± 1.59,0.77\% ± 0.49,\textbf{3.68\% ± 1.50},-2.89\% ± 3.05,-2.79\% ± 1.84,\textbf{3.02\% ± 2.22}
2,CIFAR-10,Random Label,-,3.10\% ± 2.45,-17.90\% ± 3.40,1.15\% ± 1.43,-3.22\% ± 3.44,0.15\% ± 4.01,\textbf{8.34\% ± 3.93}
3,CIFAR-10,Batch Label Shuffle,-,-2.31\% ± 3.95,-0.80\% ± 1.68,4.29\% ± 2.08,-0.76\% ± 2.18,-2.00\% ± 2.74,\textbf{7.59\% ± 2.57}
4,CIFAR-10,Batch Label Flip,-,-1.40\% ± 3.67,3.77\% ± 2.10,6.06\% ± 1.05,-0.77\% ± 3.30,1.90\% ± 3.59,\textbf{10.04\% ± 0.85}
5,CIFAR-10,Added Noise,-,-2.33\% ± 1.51,\textbf{1.02\% ± 2.23},\textbf{2.94\% ± 0.91},-2.72\% ± 2.79,-1.05\% ± 2.28,\textbf{3.43\% ± 2.41}
6,CIFAR-10,Replace With Noise,-,-7.92\% ± 2.26,-1.12\% ± 2.70,\textbf{1.50\% ± 0.42},-2.42\% ± 1.50,0.26\% ± 1.52,\textbf{1.46\% ± 1.63}
7,CIFAR-100,Original Data,-,-20.42\% ± 3.05,\textbf{2.42\% ± 0.77},0.41\% ± 1.84,-0.02\% ± 2.36,-0.67\% ± 1.91,0.10\% ± 1.91
8,CIFAR-100,Chunk Shuffle,-,-17.59\% ± 3.31,-2.95\% ± 1.83,\textbf{1.06\% ± 1.15},-0.29\% ± 1.84,-1.03\% ± 1.33,\textbf{2.26\% ± 1.72}
9,CIFAR-100,Random Label,-,-16.20\% ± 6.51,-15.13\% ± 1.72,4.93\% ± 1.96,-2.06\% ± 1.89,-0.07\% ± 0.90,\textbf{18.40\% ± 2.38}


In [22]:
results_all_perc_improvement

Unnamed: 0_level_0,method,ARFL,CDR,Co-teaching,IDPA,LAP,Label Smoothing,Standard
dataset,corruption_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
cifar10,c_cs,-5.12\% ± 1.59,-2.89\% ± 3.05,3.68\% ± 1.50,0.77\% ± 0.49,3.02\% ± 2.22,-2.79\% ± 1.84,0.00\% ± 0.00
cifar10,c_lbf,-1.40\% ± 3.67,-0.77\% ± 3.30,6.06\% ± 1.05,3.77\% ± 2.10,10.04\% ± 0.85,1.90\% ± 3.59,0.00\% ± 0.00
cifar10,c_lbs,-2.31\% ± 3.95,-0.76\% ± 2.18,4.29\% ± 2.08,-0.80\% ± 1.68,7.59\% ± 2.57,-2.00\% ± 2.74,0.00\% ± 0.00
cifar10,c_no,-7.92\% ± 2.26,-2.42\% ± 1.50,1.50\% ± 0.42,-1.12\% ± 2.70,1.46\% ± 1.63,0.26\% ± 1.52,0.00\% ± 0.00
cifar10,c_ns,-2.33\% ± 1.51,-2.72\% ± 2.79,2.94\% ± 0.91,1.02\% ± 2.23,3.43\% ± 2.41,-1.05\% ± 2.28,0.00\% ± 0.00
cifar10,c_rl,3.10\% ± 2.45,-3.22\% ± 3.44,1.15\% ± 1.43,-17.90\% ± 3.40,8.34\% ± 3.93,0.15\% ± 4.01,0.00\% ± 0.00
cifar10,no_c,-3.76\% ± 2.40,-0.79\% ± 2.03,2.86\% ± 0.45,2.66\% ± 1.53,-0.38\% ± 1.53,0.84\% ± 1.23,0.00\% ± 0.00
cifar100,c_cs,-17.59\% ± 3.31,-0.29\% ± 1.84,1.06\% ± 1.15,-2.95\% ± 1.83,2.26\% ± 1.72,-1.03\% ± 1.33,0.00\% ± 0.00
cifar100,c_lbf,-8.64\% ± 6.29,-3.03\% ± 2.58,11.57\% ± 2.15,4.41\% ± 3.94,12.84\% ± 2.60,-2.00\% ± 3.63,0.00\% ± 0.00
cifar100,c_lbs,-10.15\% ± 7.25,1.38\% ± 1.39,7.03\% ± 1.49,-0.68\% ± 3.85,6.96\% ± 0.93,-0.08\% ± 2.65,0.00\% ± 0.00


In [23]:
print(results_formatted.to_latex(index=False))

\begin{tabular}{lllllllll}
\toprule
Dataset & Corruption Type & Standard & ARFL & IDPA & Co-teaching & CDR & Label Smoothing & LAP \\
\midrule
CIFAR-10 & Original Data & - & -3.76\% ± 2.40 & \textbf{2.66\% ± 1.53} & \textbf{2.86\% ± 0.45} & -0.79\% ± 2.03 & 0.84\% ± 1.23 & -0.38\% ± 1.53 \\
CIFAR-10 & Chunk Shuffle & - & -5.12\% ± 1.59 & 0.77\% ± 0.49 & \textbf{3.68\% ± 1.50} & -2.89\% ± 3.05 & -2.79\% ± 1.84 & \textbf{3.02\% ± 2.22} \\
CIFAR-10 & Random Label & - & 3.10\% ± 2.45 & -17.90\% ± 3.40 & 1.15\% ± 1.43 & -3.22\% ± 3.44 & 0.15\% ± 4.01 & \textbf{8.34\% ± 3.93} \\
CIFAR-10 & Batch Label Shuffle & - & -2.31\% ± 3.95 & -0.80\% ± 1.68 & 4.29\% ± 2.08 & -0.76\% ± 2.18 & -2.00\% ± 2.74 & \textbf{7.59\% ± 2.57} \\
CIFAR-10 & Batch Label Flip & - & -1.40\% ± 3.67 & 3.77\% ± 2.10 & 6.06\% ± 1.05 & -0.77\% ± 3.30 & 1.90\% ± 3.59 & \textbf{10.04\% ± 0.85} \\
CIFAR-10 & Added Noise & - & -2.33\% ± 1.51 & \textbf{1.02\% ± 2.23} & \textbf{2.94\% ± 0.91} & -2.72\% ± 2.79 & -1.05\% ± 2.28 & 

raw results:

In [24]:
columns_intersection = [
    'dataset', 'corruption_type', 'run', 'epoch', 
    'method', 'test_top1acc', 'test_top5acc'
]

results_all = (
    pd.concat([
        results_df[columns_intersection],
        results_nls_df[columns_intersection],
        results_cdr_df[columns_intersection],
        results_cot_df[columns_intersection],
        results_idpa_df[columns_intersection],
        results_fed_arfl_df[columns_intersection]
    ])
    .assign(test_top1acc=lambda x: x['test_top1acc']*100)
    .assign(test_top5acc=lambda x: x['test_top5acc']*100)
    .groupby(['dataset', 'corruption_type', 'run', 'method'])
    .agg({'test_top1acc': 'max', 'test_top5acc': 'max'})
    .unstack()
    .swaplevel(axis=1)   
    .reset_index()
    .drop(columns='run')
    .groupby(['dataset', 'corruption_type'])
    .agg(['mean', 'std'])
    .stack(0)
    .stack(0)
    .assign(
        mean_std = lambda x: 
            np.round(x['mean'], 2).astype(str) 
            + " ± "
            + np.round(x['std'], 2).astype(str),
    )
    .drop(columns=['mean', 'std'])
    .unstack()
    .unstack()
)
results_all

  .drop(columns='run')
  .agg(['mean', 'std'])


Unnamed: 0_level_0,Unnamed: 1_level_0,mean_std,mean_std,mean_std,mean_std,mean_std,mean_std,mean_std,mean_std,mean_std,mean_std,mean_std,mean_std,mean_std,mean_std
Unnamed: 0_level_1,Unnamed: 1_level_1,test_top1acc,test_top1acc,test_top1acc,test_top1acc,test_top1acc,test_top1acc,test_top1acc,test_top5acc,test_top5acc,test_top5acc,test_top5acc,test_top5acc,test_top5acc,test_top5acc
Unnamed: 0_level_2,method,ARFL,CDR,Co-teaching,IDPA,LAP,Label Smoothing,Standard,ARFL,CDR,Co-teaching,IDPA,LAP,Label Smoothing,Standard
dataset,corruption_type,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3
cifar10,c_cs,68.42 ± 0.71,70.02 ± 1.35,74.77 ± 0.3,72.68 ± 0.96,74.29 ± 1.43,70.1 ± 0.97,72.12 ± 0.95,97.47 ± 0.15,97.69 ± 0.33,98.33 ± 0.16,97.7 ± 0.11,98.26 ± 0.27,97.31 ± 0.28,98.0 ± 0.03
cifar10,c_lbf,66.16 ± 1.96,66.59 ± 1.79,71.18 ± 0.76,69.65 ± 1.54,73.85 ± 0.92,68.39 ± 2.14,67.12 ± 0.7,96.85 ± 0.59,95.46 ± 0.47,97.59 ± 0.37,96.63 ± 0.3,98.11 ± 0.09,95.4 ± 0.56,95.11 ± 0.23
cifar10,c_lbs,67.12 ± 1.93,68.21 ± 1.06,71.68 ± 0.92,68.19 ± 1.48,73.94 ± 0.33,67.35 ± 1.23,68.75 ± 1.36,96.99 ± 0.53,96.65 ± 0.23,97.67 ± 0.3,96.08 ± 1.0,98.21 ± 0.12,96.36 ± 0.32,96.84 ± 0.27
cifar10,c_no,67.01 ± 1.02,71.02 ± 1.04,73.87 ± 0.57,71.96 ± 1.53,73.84 ± 0.55,72.97 ± 0.83,72.79 ± 0.67,97.03 ± 0.39,97.76 ± 0.26,98.2 ± 0.15,97.56 ± 0.26,98.16 ± 0.07,97.53 ± 0.35,98.17 ± 0.23
cifar10,c_ns,68.59 ± 1.38,68.3 ± 1.32,72.29 ± 0.61,70.94 ± 1.61,72.63 ± 1.24,69.48 ± 1.33,70.23 ± 0.76,97.41 ± 0.44,97.33 ± 0.29,97.8 ± 0.11,97.51 ± 0.28,97.93 ± 0.21,97.22 ± 0.23,97.54 ± 0.09
cifar10,c_rl,69.93 ± 1.88,65.61 ± 0.97,68.61 ± 1.78,55.66 ± 1.54,73.44 ± 0.9,67.92 ± 2.66,67.84 ± 1.84,97.56 ± 0.48,93.45 ± 1.33,95.79 ± 0.69,91.15 ± 0.89,98.09 ± 0.13,94.15 ± 0.75,94.16 ± 1.08
cifar10,no_c,74.89 ± 1.67,77.2 ± 1.59,80.04 ± 0.49,79.89 ± 1.01,77.52 ± 0.9,78.47 ± 0.72,77.82 ± 0.37,98.42 ± 0.22,98.6 ± 0.29,98.74 ± 0.11,98.71 ± 0.04,98.66 ± 0.15,98.34 ± 0.21,98.69 ± 0.06
cifar100,c_cs,26.14 ± 2.12,36.79 ± 1.13,38.33 ± 0.59,35.93 ± 0.62,38.46 ± 1.33,36.39 ± 0.59,37.07 ± 0.61,56.76 ± 2.69,68.65 ± 0.91,69.58 ± 0.51,66.82 ± 0.85,70.41 ± 0.96,68.15 ± 0.95,68.86 ± 0.74
cifar100,c_lbf,25.78 ± 2.39,30.03 ± 0.78,37.38 ± 1.37,34.34 ± 1.45,38.04 ± 0.63,29.8 ± 1.63,30.78 ± 1.27,56.51 ± 3.36,60.01 ± 1.16,69.04 ± 1.19,64.61 ± 2.1,69.82 ± 0.78,60.64 ± 2.05,61.89 ± 0.97
cifar100,c_lbs,27.03 ± 3.47,34.21 ± 1.05,38.07 ± 0.89,34.7 ± 2.14,37.09 ± 1.21,33.58 ± 0.72,33.69 ± 0.68,58.2 ± 4.45,65.69 ± 1.15,69.34 ± 0.69,64.34 ± 2.09,69.31 ± 1.4,64.73 ± 1.12,64.8 ± 0.97


In [25]:
results_all = pd.concat(
    [
        results_all.loc["cifar10", ("mean_std", "test_top1acc")].assign(dataset="cifar10"),
        results_all.loc["cifar100", ("mean_std", "test_top5acc")].assign(dataset="cifar100"),
        results_all.loc["fmnist", ("mean_std", "test_top1acc")].assign(dataset="fmnist"),
    ]
).reset_index().set_index(["dataset", "corruption_type"])

In [26]:
def bold_max_value(x, model_names):
    x = x.copy()
    len_cols = x.shape[0]
    n_models = len(model_names)
    idx_bold = (
        x
        [-n_models:]
        .str.replace(" ", "")
        .str.split("±")
        .str[0]
        .astype(float)
        .argmax()
    )
    max_val = x.iloc[idx_bold+len_cols-n_models]
    max_val, max_std = map(float, max_val.split("±"))
    #x.iloc[idx_bold+len_cols-n_models] = '\\textbf{' + x.iloc[idx_bold+len_cols-n_models] + '}'
    # also bold those that are within one std of the max

    for i in range(n_models+1):
        val = x.iloc[i]
        val, std = map(float, val.split("±"))
        if val >= max_val - max_std and val <= max_val + max_std:
            x.iloc[i] = '\\textbf{' + x.iloc[i] + '}'
    
    return x

dataset_map = {
    'cifar10': 'CIFAR-10',
    'cifar100': 'CIFAR-100',
    'fmnist': 'F-MNIST',
}


dataset_order = [
    'CIFAR-10',
    'CIFAR-100',
    'F-MNIST',
]

corruption_type_map = {
    "no_c": "Original Data",
    "c_cs": "Chunk Shuffle",
    "c_rl": "Random Label",
    "c_lbs": "Batch Label Shuffle",
    "c_lbf": "Batch Label Flip",
    "c_ns": "Added Noise",
    "c_no": "Replace With Noise",
}

corruption_type_order = [
    "Original Data",
    "Chunk Shuffle",
    "Random Label",
    "Batch Label Shuffle",
    "Batch Label Flip",
    "Added Noise",
    "Replace With Noise",
]




model_order = [
    "Standard",
    "ARFL",
    "IDPA",
    "Co-teaching",
    "CDR",
    "Label Smoothing",
    "LAP",
]


results_formatted = (
    results_all
    .apply(
        bold_max_value,
        model_names = ["Co-teaching", "IDPA", "CDR", "Label Smoothing", "LAP", "Standard"],
        axis=1
    )
    .reset_index()
    .rename(columns={"corruption_type": "Corruption Type", "dataset": "Dataset"})
    .replace({'Dataset': dataset_map, "Corruption Type": corruption_type_map})
    .reset_index()
    .drop(columns=['index'])
    .rename_axis(index=None, columns=None)
    .set_index(['Dataset', 'Corruption Type'])
    .reindex(
        [
            (ds, ct)  for ds in dataset_order for ct in corruption_type_order
        ]
    )
    .reset_index()
    [
        ['Dataset', 'Corruption Type'] + model_order
    ]
)

results_formatted

Unnamed: 0,Dataset,Corruption Type,Standard,ARFL,IDPA,Co-teaching,CDR,Label Smoothing,LAP
0,CIFAR-10,Original Data,77.82 ± 0.37,74.89 ± 1.67,\textbf{79.89 ± 1.01},\textbf{80.04 ± 0.49},77.2 ± 1.59,78.47 ± 0.72,77.52 ± 0.9
1,CIFAR-10,Chunk Shuffle,72.12 ± 0.95,68.42 ± 0.71,72.68 ± 0.96,\textbf{74.77 ± 0.3},70.02 ± 1.35,70.1 ± 0.97,74.29 ± 1.43
2,CIFAR-10,Random Label,67.84 ± 1.84,69.93 ± 1.88,55.66 ± 1.54,68.61 ± 1.78,65.61 ± 0.97,67.92 ± 2.66,\textbf{73.44 ± 0.9}
3,CIFAR-10,Batch Label Shuffle,68.75 ± 1.36,67.12 ± 1.93,68.19 ± 1.48,71.68 ± 0.92,68.21 ± 1.06,67.35 ± 1.23,\textbf{73.94 ± 0.33}
4,CIFAR-10,Batch Label Flip,67.12 ± 0.7,66.16 ± 1.96,69.65 ± 1.54,71.18 ± 0.76,66.59 ± 1.79,68.39 ± 2.14,\textbf{73.85 ± 0.92}
5,CIFAR-10,Added Noise,70.23 ± 0.76,68.59 ± 1.38,70.94 ± 1.61,\textbf{72.29 ± 0.61},68.3 ± 1.32,69.48 ± 1.33,\textbf{72.63 ± 1.24}
6,CIFAR-10,Replace With Noise,72.79 ± 0.67,67.01 ± 1.02,71.96 ± 1.53,\textbf{73.87 ± 0.57},71.02 ± 1.04,72.97 ± 0.83,\textbf{73.84 ± 0.55}
7,CIFAR-100,Original Data,75.95 ± 1.01,60.42 ± 1.91,\textbf{77.78 ± 0.95},76.25 ± 0.54,75.91 ± 1.11,75.42 ± 0.49,76.01 ± 0.54
8,CIFAR-100,Chunk Shuffle,68.86 ± 0.74,56.76 ± 2.69,66.82 ± 0.85,\textbf{69.58 ± 0.51},68.65 ± 0.91,68.15 ± 0.95,\textbf{70.41 ± 0.96}
9,CIFAR-100,Random Label,58.34 ± 0.91,48.85 ± 3.2,49.5 ± 1.03,61.2 ± 0.54,57.12 ± 0.76,58.3 ± 0.93,\textbf{69.05 ± 0.38}


In [27]:
print(
    results_formatted
    [['Dataset', 'Corruption Type'] + model_order]
    .to_latex(index=False)
)

\begin{tabular}{lllllllll}
\toprule
Dataset & Corruption Type & Standard & ARFL & IDPA & Co-teaching & CDR & Label Smoothing & LAP \\
\midrule
CIFAR-10 & Original Data & 77.82 ± 0.37 & 74.89 ± 1.67 & \textbf{79.89 ± 1.01} & \textbf{80.04 ± 0.49} & 77.2 ± 1.59 & 78.47 ± 0.72 & 77.52 ± 0.9 \\
CIFAR-10 & Chunk Shuffle & 72.12 ± 0.95 & 68.42 ± 0.71 & 72.68 ± 0.96 & \textbf{74.77 ± 0.3} & 70.02 ± 1.35 & 70.1 ± 0.97 & 74.29 ± 1.43 \\
CIFAR-10 & Random Label & 67.84 ± 1.84 & 69.93 ± 1.88 & 55.66 ± 1.54 & 68.61 ± 1.78 & 65.61 ± 0.97 & 67.92 ± 2.66 & \textbf{73.44 ± 0.9} \\
CIFAR-10 & Batch Label Shuffle & 68.75 ± 1.36 & 67.12 ± 1.93 & 68.19 ± 1.48 & 71.68 ± 0.92 & 68.21 ± 1.06 & 67.35 ± 1.23 & \textbf{73.94 ± 0.33} \\
CIFAR-10 & Batch Label Flip & 67.12 ± 0.7 & 66.16 ± 1.96 & 69.65 ± 1.54 & 71.18 ± 0.76 & 66.59 ± 1.79 & 68.39 ± 2.14 & \textbf{73.85 ± 0.92} \\
CIFAR-10 & Added Noise & 70.23 ± 0.76 & 68.59 ± 1.38 & 70.94 ± 1.61 & \textbf{72.29 ± 0.61} & 68.3 ± 1.32 & 69.48 ± 1.33 & \textbf{72.63