# Table 2 CIFAR-10 benchmarking (10 trials for each table entry) in the Confident Learning paper.

# Benchmarking Confident Learning with TEN TRIALS EACH
* uses co-teaching
* reports learning-with-noisy-labels accuracy on CIFAR-10

In [1]:
import pandas as pd

In [2]:
# Root directory that holds one `trial<N>/` sub-directory per trial.
base = '/home/cgn/cifar10_cl_coteaching/'

# Names of the experiment sub-directories found inside every trial directory.
experiments = [
    'argmax',
    'pbc',
    'pbnr',
    'both',
    'cj_only',
]

# Trials are numbered 1..num_trials.
num_trials = 10

In [3]:
# For every trial, read the training log of each experiment under every
# (noise, sparsity) setting, keep the best accuracy found in the log, and build
# one table per trial (rows: experiments, columns: settings). The per-trial
# tables are then averaged (`cl`) and their spread computed (`cl_std`).
dfs = {}
for seed in range(1, num_trials + 1):
    results = []
    # Use the shared `experiments` list rather than a duplicated literal.
    for experiment in experiments:
        for noise in [2, 4, 6]:
            for sparsity in [0, 2, 4, 6]:
                f = str(sparsity) + "_" + str(noise)
                path = base + "trial{}/".format(seed) + experiment + "/" + f + "/out.log"
                # Reset before every read so the failure report below can never
                # hit an undefined name or show lines from a previous log.
                lines = []
                try:
                    with open(path, 'r') as rf:
                        lines = rf.readlines()
                    # Each line starting with ' * Acc' is split on 'Acc'; every
                    # piece after the first contributes one float (the piece
                    # minus its first three characters and its last character).
                    acc = [[float(z[3:-1]) for z in l.split('Acc')[1:]] for l in lines if l.startswith(' * Acc')]
                    # Matching lines alternate between two models
                    # (presumably the two co-teaching networks -- TODO confirm).
                    model1, model2 = acc[::2], acc[1::2]
                    # Best value over all logged pairs, taking the better model each time.
                    acc1 = max([max(model2[i][0], a[0]) for i, a in enumerate(model1)])
                    acc5 = max([max(model2[i][1], a[1]) for i, a in enumerate(model1)])
                except (OSError, ValueError, IndexError):
                    # Missing/unreadable log, unparsable number, or no/unpaired
                    # accuracy lines: report the file and record the entry as missing.
                    # Unrelated errors (e.g. KeyboardInterrupt) are no longer swallowed.
                    print(path)
                    print(" ".join(lines[-5:-4]))
                    # float('nan') avoids depending on numpy, which this notebook never imports.
                    acc1, acc5 = float('nan'), float('nan')
                results.append({
                    'experiment': experiment,
                    'frac_zero_noise_rates': sparsity / 10.,
                    'noise_amount': noise / 10.,
                    'acc1': acc1,
                    'acc5': acc5,
                    'forget_rate': noise / 10. / 2,
                })

    df = pd.DataFrame(results)
    # One single-row frame per experiment: columns are the
    # (noise_amount, frac_zero_noise_rates) pairs, the row is the 'acc1' value
    # relabelled with the experiment name.
    df_results = pd.concat([
        z.sort_values(by=['noise_amount', 'frac_zero_noise_rates']).set_index(
            ['noise_amount', 'frac_zero_noise_rates']).drop(
            ['experiment', 'forget_rate', 'acc5'], axis=1).T.rename(index={'acc1': i})
        for i, z in df.groupby('experiment')
    ])
    dfs[seed] = df_results

# `mean(level=0)` / `std(level=0)` were removed in pandas 2.0; grouping on the
# index level is the equivalent, version-independent spelling.
all_trials = pd.concat(dfs.values())
cl = all_trials.groupby(level=0).mean()
cl_std = all_trials.groupby(level=0).std()
# Row order used by all later tables.
method_order = ['argmax', 'pbc', 'cj_only', 'both', 'pbnr']
cl = cl.reindex(method_order)
cl_std = cl_std.reindex(method_order)
cl.round(2)

noise_amount,0.2,0.2,0.2,0.2,0.4,0.4,0.4,0.4,0.6,0.6,0.6,0.6
frac_zero_noise_rates,0.0,0.2,0.4,0.6,0.0,0.2,0.4,0.6,0.0,0.2,0.4,0.6
argmax,89.64,89.41,90.19,89.86,83.91,83.86,83.24,84.21,31.46,39.28,33.72,30.56
pbc,90.48,90.14,90.64,90.74,84.78,85.54,85.31,86.17,33.71,40.71,35.13,31.39
cj_only,91.06,90.88,91.1,91.26,86.73,86.66,86.61,86.91,32.43,41.81,34.38,34.46
both,90.82,90.66,91.02,91.07,87.06,86.88,86.72,87.15,41.11,41.67,39.05,32.86
pbnr,90.66,90.54,90.89,90.87,87.12,86.83,86.61,87.22,41.04,41.84,39.09,36.37


## Results on other models -- run on Google's servers by co-author Lu Jiang

In [4]:
def _baseline_row(name, accuracies):
    """Wrap one method's accuracies in a single-row frame aligned to `cl`.

    The row is labelled `name` and its columns are `cl.columns`, so the
    values must be listed in the same order as the columns of `cl`.
    """
    return pd.DataFrame([accuracies], columns=cl.columns, index=[name])


mentornet = _baseline_row('mentornet', [
#     0.9378,  # 0 noise
    0.8493, 0.8514, 0.8319, 0.8342,  # 0.2 noise
    0.6444, 0.6423, 0.6238, 0.6146,  # 0.4 noise
    0.2996, 0.3160, 0.2930, 0.2786,  # 0.6 noise
])

smodel = _baseline_row('smodel', [
#     0.9375,  # 0 noise
    0.8000, 0.7996, 0.7974, 0.7910,  # 0.2 noise
    0.5856, 0.6121, 0.5913, 0.5752,  # 0.4 noise
    0.2845, 0.2853, 0.2793, 0.2726,  # 0.6 noise
])

reed = _baseline_row('reed', [
#     0.9372,  # 0 noise
    0.7809, 0.7892, 0.8076, 0.7927,  # 0.2 noise
    0.6048, 0.6041, 0.6124, 0.5860,  # 0.4 noise
    0.2904, 0.2939, 0.2913, 0.2677,  # 0.6 noise
])

vanilla = _baseline_row('vanilla', [
#     0.935,  # 0 noise
    0.7843, 0.7916, 0.7901, 0.7825,  # 0.2 noise
    0.6022, 0.6077, 0.5963, 0.5727,  # 0.4 noise
    0.2696, 0.2966, 0.2824, 0.2681,  # 0.6 noise
])

In [5]:
# Mean accuracy and its standard deviation side by side, both rounded to one
# decimal and joined as text ("mean±std") for display.
mean_text = cl.round(1).astype(str)
std_text = cl_std.round(1).astype(str)
mean_text + '±' + std_text

noise_amount,0.2,0.2,0.2,0.2,0.4,0.4,0.4,0.4,0.6,0.6,0.6,0.6
frac_zero_noise_rates,0.0,0.2,0.4,0.6,0.0,0.2,0.4,0.6,0.0,0.2,0.4,0.6
argmax,89.6±0.1,89.4±0.1,90.2±0.2,89.9±0.1,83.9±0.2,83.9±0.2,83.2±0.2,84.2±0.2,31.5±0.9,39.3±0.2,33.7±0.1,30.6±0.3
pbc,90.5±0.1,90.1±0.1,90.6±0.1,90.7±0.1,84.8±0.1,85.5±0.2,85.3±0.2,86.2±0.1,33.7±0.1,40.7±0.2,35.1±0.1,31.4±0.3
cj_only,91.1±0.2,90.9±0.1,91.1±0.2,91.3±0.1,86.7±0.1,86.7±0.2,86.6±0.1,86.9±0.1,32.4±1.0,41.8±0.2,34.4±0.2,34.5±1.6
both,90.8±0.1,90.7±0.1,91.0±0.1,91.1±0.1,87.1±0.1,86.9±0.1,86.7±0.2,87.2±0.1,41.1±0.4,41.7±0.3,39.0±0.3,32.9±1.9
pbnr,90.7±0.1,90.5±0.1,90.9±0.1,90.9±0.1,87.1±0.2,86.8±0.1,86.6±0.2,87.2±0.1,41.0±0.3,41.8±0.3,39.1±0.2,36.4±1.4


In [6]:
# Final table of results.
# `cl` is divided by 100 before the baseline rows are stacked under it, and the
# combined table is multiplied by 100 afterwards, so all rows share one scale.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` stacks the same
# frames in the same order.
cifar10_final_benchmarks = pd.concat([cl / 100, mentornet, smodel, reed, vanilla]) * 100
cifar10_final_benchmarks.round(1)

noise_amount,0.2,0.2,0.2,0.2,0.4,0.4,0.4,0.4,0.6,0.6,0.6,0.6
frac_zero_noise_rates,0.0,0.2,0.4,0.6,0.0,0.2,0.4,0.6,0.0,0.2,0.4,0.6
argmax,89.6,89.4,90.2,89.9,83.9,83.9,83.2,84.2,31.5,39.3,33.7,30.6
pbc,90.5,90.1,90.6,90.7,84.8,85.5,85.3,86.2,33.7,40.7,35.1,31.4
cj_only,91.1,90.9,91.1,91.3,86.7,86.7,86.6,86.9,32.4,41.8,34.4,34.5
both,90.8,90.7,91.0,91.1,87.1,86.9,86.7,87.2,41.1,41.7,39.0,32.9
pbnr,90.7,90.5,90.9,90.9,87.1,86.8,86.6,87.2,41.0,41.8,39.1,36.4
mentornet,84.9,85.1,83.2,83.4,64.4,64.2,62.4,61.5,30.0,31.6,29.3,27.9
smodel,80.0,80.0,79.7,79.1,58.6,61.2,59.1,57.5,28.4,28.5,27.9,27.3
reed,78.1,78.9,80.8,79.3,60.5,60.4,61.2,58.6,29.0,29.4,29.1,26.8
vanilla,78.4,79.2,79.0,78.2,60.2,60.8,59.6,57.3,27.0,29.7,28.2,26.8


In [7]:
# Latex of the final table in the paper
# Maps row labels as they appear in the generated LaTeX to the method names
# used in the paper. Every LaTeX fragment is a raw string so that backslashes
# are never read as (invalid) Python escape sequences.
method_name_map = {
    'argmax': r'CL: $\bm{C}_{\text{confusion}}$',
    'pbc': 'CL: PBC',
    # The underscore is escaped because to_latex(escape=True) emits `cj\_only`.
    r'cj\_only': r'CL: $\cj$',
    'both': 'CL: C+NR',
    'pbnr': 'CL: PBNR',
    'vanilla': 'Baseline',
}
# escape=True is passed explicitly: the `cj\_only` key above only matches when
# special characters are escaped, and newer pandas no longer does so by default.
tex = cifar10_final_benchmarks.round(1).to_latex(escape=True).replace('±', r' \pm ').replace('nan', '0.1')
for k, v in method_name_map.items():
    tex = tex.replace(k, v)
print(tex)

\begin{tabular}{lrrrrrrrrrrrr}
\toprule
noise\_amount & \multicolumn{4}{l}{0.2} & \multicolumn{4}{l}{0.4} & \multicolumn{4}{l}{0.6} \\
frac\_zero\_noise\_rates &   0.0 &   0.2 &   0.4 &   0.6 &   0.0 &   0.2 &   0.4 &   0.6 &   0.0 &   0.2 &   0.4 &   0.6 \\
\midrule
CL: $\bm{C}_{\text{confusion}}$    &  89.6 &  89.4 &  90.2 &  89.9 &  83.9 &  83.9 &  83.2 &  84.2 &  31.5 &  39.3 &  33.7 &  30.6 \\
CL: PBC       &  90.5 &  90.1 &  90.6 &  90.7 &  84.8 &  85.5 &  85.3 &  86.2 &  33.7 &  40.7 &  35.1 &  31.4 \\
CL: $\cj$   &  91.1 &  90.9 &  91.1 &  91.3 &  86.7 &  86.7 &  86.6 &  86.9 &  32.4 &  41.8 &  34.4 &  34.5 \\
CL: C+NR      &  90.8 &  90.7 &  91.0 &  91.1 &  87.1 &  86.9 &  86.7 &  87.2 &  41.1 &  41.7 &  39.0 &  32.9 \\
CL: PBNR      &  90.7 &  90.5 &  90.9 &  90.9 &  87.1 &  86.8 &  86.6 &  87.2 &  41.0 &  41.8 &  39.1 &  36.4 \\
mentornet &  84.9 &  85.1 &  83.2 &  83.4 &  64.4 &  64.2 &  62.4 &  61.5 &  30.0 &  31.6 &  29.3 &  27.9 \\
smodel    &  80.0 &  80.0 &  79.7 &  79

In [28]:
# Average the per-entry standard deviations over each group of four
# consecutive columns of `cl_std` and print one summary line per group.
# NOTE(review): the last label says 70% -- confirm it matches the noise level
# of the third column group before changing it.
noise_reports = [
    (' * 20% noise: {:.2f}%', slice(None, 4)),
    (' * 40% noise: {:.2f}%', slice(4, 8)),
    (' * 70% noise: {:.2f}%', slice(8, 12)),
]
print('Mean std. dev. of CL methods across all sparsities for:')
for template, column_group in noise_reports:
    group_mean = cl_std.iloc[:, column_group].mean().mean()
    print(template.format(group_mean))

Mean std. dev. of CL methods across all sparsities for:
 * 20% noise: 0.11%
 * 40% noise: 0.15%
 * 70% noise: 0.52%


In [10]:
# LaTeX of the standard-deviation table, with row labels replaced by the
# names in `method_name_map`.
# escape=True is passed explicitly: the map's `cj\_only` key only matches when
# underscores are escaped, and newer pandas no longer escapes by default.
tex = cl_std.round(2).to_latex(escape=True)
for k, v in method_name_map.items():
    tex = tex.replace(k, v)
print(tex)

\begin{tabular}{lrrrrrrrrrrrr}
\toprule
{} & \multicolumn{4}{l}{0.2} & \multicolumn{4}{l}{0.4} & \multicolumn{4}{l}{0.6} \\
{} &   0.0 &   0.2 &   0.4 &   0.6 &   0.0 &   0.2 &   0.4 &   0.6 &   0.0 &   0.2 &   0.4 &   0.6 \\
\midrule
CL: $\bm{C}_{\text{confusion}}$  &  0.07 &  0.10 &  0.17 &  0.08 &  0.19 &  0.22 &  0.23 &  0.20 &  0.93 &  0.24 &  0.13 &  0.26 \\
CL: PBC     &  0.14 &  0.12 &  0.11 &  0.10 &  0.15 &  0.17 &  0.16 &  0.10 &  0.12 &  0.22 &  0.11 &  0.30 \\
CL: $\cj$ &  0.17 &  0.09 &  0.17 &  0.11 &  0.10 &  0.20 &  0.09 &  0.13 &  1.02 &  0.15 &  0.18 &  1.63 \\
CL: C+NR    &  0.09 &  0.10 &  0.08 &  0.08 &  0.11 &  0.14 &  0.16 &  0.10 &  0.42 &  0.33 &  0.26 &  1.90 \\
CL: PBNR    &  0.15 &  0.09 &  0.09 &  0.10 &  0.18 &  0.10 &  0.15 &  0.12 &  0.26 &  0.28 &  0.24 &  1.43 \\
\bottomrule
\end{tabular}

