In [1]:
import os
from pathlib import Path
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import torch

# from torchvision.datasets import CIFAR10
# import sys
# sys.path.append('./stylegan/stylegan2')
# from benchmark_calibration import postprocess_synthetic_images, preprocess_images_classifier

In [2]:
# Locate the project directory. Each candidate is the parent folder used on one machine.
candidate_roots = [
    Path('/d/alecoz/projects'),  # DeepLab
    Path(os.path.expandvars('$WORK')),  # Jean Zay
    Path('w:/'),  # local
]
existing_roots = [root for root in candidate_roots if os.path.exists(root)]
if not existing_roots:
    # Fail here with an explicit message rather than with a NameError on `path_main` below.
    raise FileNotFoundError(
        'None of the known project roots exists: '
        + ', '.join(str(root) for root in candidate_roots)
    )
# When several candidates exist, the last one in the list takes precedence.
path_main = existing_roots[-1] / 'calibration-with-synthetic-data'
path_results = path_main / 'results'
path_models = path_main / 'models/CIFAR10'

In [3]:
# Values of the 'dataset origin' column used by the analysis cells.
dataset_origins = [
    'synthetic',
    'synthetic filtered',
    'validation',
    'validation augmented',
    'validation high confidence',
    'validation low confidence',
]

# Left empty: no size list is filled in here.
dataset_sizes = []

# Classifier architectures iterated over by the per-model cells.
model_names = [
    'densenet121',
    'densenet161',
    'densenet169',
    'googlenet',
    'inception_v3',
    'mobilenet_v2',
    'resnet18',
    'resnet34',
    'resnet50',
    'vgg11_bn',
    'vgg13_bn',
    'vgg16_bn',
    'vgg19_bn',
]

# Values of the 'method' column; the first entry is the uncalibrated baseline,
# so cells that only compare calibration methods iterate over methods[1:].
methods = [
    'baseline (no calibration)',
    'temperature scaling',
    'vector scaling',
]

In [4]:
# Read the benchmark results and average every remaining column over the seeds,
# leaving one row per (dataset origin, dataset size, model, method) combination.
df = (
    pd.read_csv(path_results / 'benchmark_calibration_test.csv')
    .groupby(['dataset origin', 'dataset size', 'model', 'method'])
    .mean()
    .reset_index()
)
df

Unnamed: 0,dataset origin,dataset size,model,method,seed,ECE,SCE,RMSCE,ACE,TACE,Accuracy,AUROC,Coverage_for_Accuracy_99,Coverage_for_Accuracy_95,Coverage_for_Accuracy_90,AURC
0,synthetic,1000,densenet121,baseline (no calibration),2.0,0.022205,0.003330,0.057408,0.007698,0.044101,94.060001,0.871927,0.002200,0.009478,0.001880,0.015375
1,synthetic,1000,densenet121,temperature scaling,2.0,0.030767,0.004915,0.054757,0.008115,0.041341,94.060001,0.871548,0.002240,0.012398,0.002360,0.015407
2,synthetic,1000,densenet121,vector scaling,2.0,0.021055,0.004355,0.042191,0.008588,0.037915,93.940000,0.843663,0.005199,0.035433,0.002799,0.017250
3,synthetic,1000,densenet161,baseline (no calibration),2.0,0.021215,0.003088,0.062704,0.008083,0.047961,93.988000,0.859910,0.002120,0.076745,0.000240,0.018076
4,synthetic,1000,densenet161,temperature scaling,2.0,0.027246,0.004227,0.060249,0.008384,0.047564,93.988000,0.859682,0.001840,0.002719,0.003759,0.018105
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1397,validation low confidence,5000,vgg16_bn,temperature scaling,2.0,0.017531,0.002969,0.049991,0.006644,0.040182,93.936001,0.917471,0.001560,0.010278,0.006039,0.009400
1398,validation low confidence,5000,vgg16_bn,vector scaling,2.0,0.018333,0.002901,0.048255,0.006255,0.036687,93.844000,0.911564,0.002160,0.018196,0.001120,0.010093
1399,validation low confidence,5000,vgg19_bn,baseline (no calibration),2.0,0.020657,0.004080,0.061233,0.006514,0.044683,94.051999,0.894471,0.001880,0.034193,0.000400,0.012947
1400,validation low confidence,5000,vgg19_bn,temperature scaling,2.0,0.021623,0.003105,0.052763,0.006622,0.041402,94.051999,0.894053,0.001560,0.009878,0.000160,0.012935


# Main table

In [None]:
# Main LaTeX table: one row per model, one column per
# (calibration method, calibration data) pair, for a single metric.
metric = 'ECE'

rows_5000 = df['dataset size'] == 5000
rows_baseline = df['method'] == 'baseline (no calibration)'

# Calibrated results come from the 5000-sample calibration sets only
pivot_df = df[rows_5000].pivot_table(index='model', columns=['method', 'dataset origin'], values=metric)
# The baseline column is averaged over every baseline row of the model
pivot_df['No calibration'] = df[rows_baseline].pivot_table(index='model', values=metric)
pivot_df = pivot_df[['No calibration', 'temperature scaling', 'vector scaling']]
# Drop the confidence-based and filtered calibration sets from this table
pivot_df = pivot_df[[
    col for col in pivot_df.columns
    if 'confidence' not in col[1] and 'filtered' not in col[1]
]]

# format
# NOTE(review): the columns are a two-level index at this point; confirm that setting
# `.name` (rather than `.names`) has the intended effect on the rendered header.
pivot_df.columns.name = 'Calibration data'
column_order = pd.MultiIndex.from_tuples([
    ('No calibration', ''),
    ('temperature scaling', 'validation'),
    ('temperature scaling', 'validation augmented'),
    ('temperature scaling', 'synthetic'),
    ('vector scaling', 'validation'),
    ('vector scaling', 'validation augmented'),
    ('vector scaling', 'synthetic'),
])
pivot_df = pivot_df[column_order]
pivot_df = pivot_df * 100  # scale by 100 for display

# Two decimals everywhere, with the smallest value of each row wrapped in \textbf
s = (
    pivot_df.round(2).style
    .format('{:.2f}')
    .highlight_min(axis=1, props="textbf:--rwrap;")
)
print(s.to_latex())

# Images

In [None]:
# Grid of generated images: one column per class, `n_images` samples per class.
# NOTE(review): `postprocess_synthetic_images` is only imported in the commented-out
# lines of the import cell, so this cell raises NameError on a fresh kernel until that
# import is restored - confirm which environment is expected to provide it.
# NOTE(security): pickle.load can execute arbitrary code contained in the file;
# only load a checkpoint that comes from a trusted source.
with open(path_models/'cifar10.pkl', 'rb') as f:
    G = pickle.load(f)['G_ema'].cuda()

n_images = 3
n_classes = G.c_dim
# One column per class (rather than a fixed 10) so the grid always matches the loop;
# squeeze=False keeps `axs` two-dimensional even for a single row or column.
fig, axs = plt.subplots(n_images, n_classes, figsize=(n_classes, n_images), squeeze=False)

# Pure inference: no autograd graph is needed, and tensors stay convertible for imshow
with torch.no_grad():
    for label in range(n_classes):

        z = torch.randn([n_images, G.z_dim]).cuda() # latent codes
        labels = torch.full((n_images,), label, dtype=torch.long).cuda()
        c = torch.nn.functional.one_hot(labels, num_classes=n_classes) # class labels
        img = G(z, c, truncation_psi=1) # NCHW, float32, dynamic range [-1, +1]
        img = postprocess_synthetic_images(img).cpu()

        for i in range(n_images):
            ax = axs[i, label]
            ax.imshow(img[i].permute(1,2,0))  # CHW -> HWC for matplotlib
            ax.axis('off')  # Hide the axis
plt.subplots_adjust(wspace=0, hspace=0)
plt.savefig(path_results / 'synthetic_images.png')

In [None]:
# Grid of real CIFAR-10 training images: one column per class, `n_images` rows
# (`n_images` is the value set in the synthetic-image cell).
# NOTE(review): `CIFAR10` is only imported in a commented-out line of the import cell,
# so this cell raises NameError on a fresh kernel until that import is restored.
dataset_train = CIFAR10(root=os.path.expandvars('$DSDIR'), train=True)

# Take the class count from the dataset itself, so this cell does not need the generator
n_classes = len(dataset_train.classes)
targets = torch.as_tensor(dataset_train.targets)

fig, axs = plt.subplots(n_images, n_classes, figsize=(n_classes, n_images), squeeze=False)
for label in range(n_classes):

    # Draw uniformly (with replacement) among the images of this class. Looking the
    # indices up front avoids decoding and discarding images of the other classes.
    class_indices = torch.nonzero(targets == label).flatten()
    picks = class_indices[torch.randint(len(class_indices), (n_images,))]
    imgs = [dataset_train[int(idx)][0] for idx in picks]

    for i in range(n_images):
        ax = axs[i, label]
        ax.imshow(imgs[i])
        ax.axis('off')  # Hide the axis
plt.subplots_adjust(wspace=0, hspace=0)
plt.savefig(path_results / 'real_images.png')

# Confidence values

In [19]:
# LaTeX table comparing calibration on the validation data with calibration on its
# low- / high-confidence variants, using the rows whose dataset size is 2500.
metric = 'ECE'

rows_2500 = df['dataset size'] == 2500
rows_baseline = df['method'] == 'baseline (no calibration)'

pivot_df = df[rows_2500].pivot_table(index='model', columns=['method', 'dataset origin'], values=metric)
# The baseline column is averaged over every baseline row of the model
pivot_df['No calibration'] = df[rows_baseline].pivot_table(index='model', values=metric)
pivot_df = pivot_df[['No calibration', 'temperature scaling', 'vector scaling']]

# format
pivot_df.columns.name = 'Calibration data'
column_order = pd.MultiIndex.from_tuples([
    ('No calibration', ''),
    ('temperature scaling', 'validation'),
    ('temperature scaling', 'validation low confidence'),
    ('temperature scaling', 'validation high confidence'),
    ('vector scaling', 'validation'),
    ('vector scaling', 'validation low confidence'),
    ('vector scaling', 'validation high confidence'),
])
pivot_df = pivot_df[column_order]
pivot_df = pivot_df * 100  # scale by 100 for display

# Two decimals everywhere, with the smallest value of each row wrapped in \textbf
s = (
    pivot_df.round(2).style
    .format('{:.2f}')
    .highlight_min(axis=1, props="textbf:--rwrap;")
)
print(s.to_latex())

\begin{tabular}{lrrrrrrr}
method & No calibration & \multicolumn{3}{r}{temperature scaling} & \multicolumn{3}{r}{vector scaling} \\
dataset origin &  & validation & validation low confidence & validation high confidence & validation & validation low confidence & validation high confidence \\
model &  &  &  &  &  &  &  \\
densenet121 & 2.22 & \textbf{1.73} & 3.00 & 2.80 & 1.82 & 2.81 & 3.75 \\
densenet161 & 2.12 & 1.99 & 2.80 & 2.36 & \textbf{1.88} & 2.45 & 3.48 \\
densenet169 & 2.54 & 2.29 & 2.93 & 2.69 & \textbf{2.13} & 2.51 & 3.80 \\
googlenet & 1.47 & 1.22 & \textbf{1.02} & 2.01 & 1.25 & 1.13 & 5.31 \\
inception_v3 & 1.98 & 1.58 & 2.29 & 2.61 & \textbf{1.53} & 2.44 & 3.01 \\
mobilenet_v2 & 2.59 & \textbf{1.49} & 2.52 & 3.44 & \textbf{1.49} & 2.83 & 8.99 \\
resnet18 & 2.03 & 1.74 & 2.30 & 2.64 & \textbf{1.56} & 1.87 & 4.25 \\
resnet34 & 2.71 & 2.19 & 3.31 & 3.49 & \textbf{2.17} & 2.81 & 5.42 \\
resnet50 & 2.27 & \textbf{1.62} & 2.79 & 3.10 & 1.81 & 2.77 & 5.39 \\
vgg11_bn & 1.59 & 1.

# OTHER

# Influence of dataset size

Vector scaling is always better than temperature scaling when looking at ACE. ACE also decreases with more data.

In [None]:
# For every model, plot the chosen metric against the calibration set size,
# with one curve per calibration method.
# dataset_origin = 'validation'
dataset_origin = 'synthetic'
# dataset_origin = 'synthetic filtered'
metric = 'ECE'

n_cols = 3
# Ceiling division: exactly enough rows, with no spare row when the model count
# is a multiple of n_cols (at least one row so the figure can always be created).
n_rows = max(1, (len(model_names) + n_cols - 1) // n_cols)
fig, axs = plt.subplots(n_rows, n_cols, figsize=(10, len(model_names)), squeeze=False)
fig.suptitle(f'Dataset origin: {dataset_origin}')

for ax, model_name in zip(axs.flatten(), model_names):

    for method in methods:
        curve = df[
            (df['dataset origin'] == dataset_origin) & (df['model'] == model_name) & (df['method'] == method)]
        ax.plot(curve['dataset size'], curve[metric], label=method)

    # Per-axis decoration is set once per model, not once per curve
    ax.set_title(model_name)
    ax.set_ylabel(metric)
    ax.set_xlabel('calib subset size')
    ax.legend()

# Hide the grid cells left over when the models do not fill the last row
# (calling legend() on them would only produce an empty-legend warning).
for ax in axs.flatten()[len(model_names):]:
    ax.axis('off')

fig.tight_layout()

## Synthetic data

Synthetic data never improves ACE, rarely improves ECE. Filtering synthetic data is even worse.

In [None]:
# Count, per calibration method, the models for which
#   - synthetic data (10000 samples) beats validation data (5000 samples), and
#   - filtered synthetic data beats unfiltered synthetic data (both 10000 samples),
# where "beats" means a strictly lower value of `metric`.
metric = 'ACE'

for method in methods[1:]:
    nb_synth_better = 0
    nb_synth_filtered_better = 0
    for model_name in model_names:

        # Narrow down to this (method, model) pair once, then pick each calibration set
        rows = df[(df['method'] == method) & (df['model'] == model_name)]
        origin = rows['dataset origin']
        size = rows['dataset size']

        metric_value_validation = rows.loc[(origin == 'validation') & (size == 5000), metric]
        metric_value_synthetic = rows.loc[(origin == 'synthetic') & (size == 10000), metric]
        metric_value_synthetic_filtered = rows.loc[(origin == 'synthetic filtered') & (size == 10000), metric]

        # .item() requires each selection to hold exactly one row
        if metric_value_synthetic.item() < metric_value_validation.item():
            nb_synth_better += 1
        if metric_value_synthetic_filtered.item() < metric_value_synthetic.item():
            nb_synth_filtered_better += 1

    print(f'{method}: {nb_synth_better} / {len(model_names)} models better with synthetic data than with validation data')
    print(f'{method}: {nb_synth_filtered_better} / {len(model_names)} models better with filtered synthetic data than with synthetic data')

## Correlations

In [None]:
# Scatter ECE against ACE for one calibration method on validation data
# (every model and dataset size contributes one point).
method = 'vector scaling'

# Build the row selection once and reuse it for both axes
rows_selected = (df['dataset origin'] == 'validation') & (df['method'] == method)

plt.figure()
plt.scatter(df.loc[rows_selected, 'ECE'], df.loc[rows_selected, 'ACE'])
# Label the figure so it can be read without looking at the code
plt.xlabel('ECE')
plt.ylabel('ACE')
plt.title(f'ACE vs ECE ({method}, validation data)')

In [None]:
# Scatter AUROC against ECE on validation data, for the `method` chosen in the
# previous cell (every model and dataset size contributes one point).
rows_selected = (df['dataset origin'] == 'validation') & (df['method'] == method)

plt.figure()
plt.scatter(df.loc[rows_selected, 'AUROC'], df.loc[rows_selected, 'ECE'])
# Label the figure so it can be read without looking at the code
plt.xlabel('AUROC')
plt.ylabel('ECE')
plt.title(f'ECE vs AUROC ({method}, validation data)')

## Is data augmentation useful?

No, it never improves over non-augmented validation data (at the same dataset size).

In [None]:
# Count, per calibration method, the models for which augmented validation data gives a
# strictly lower `metric` than plain validation data (both with 5000 samples).
metric = 'ECE'

for method in methods[1:]:
    nb_aug_better = 0
    for model_name in model_names:

        metric_value_validation = df.loc[
            (df['dataset origin'] == 'validation') & (df['dataset size'] == 5000) & (df['method'] == method) & (df['model'] == model_name),
            metric]
        metric_value_augmented = df.loc[
            (df['dataset origin'] == 'validation augmented') & (df['dataset size'] == 5000) & (df['method'] == method) & (df['model'] == model_name),
            metric]

        # .item() requires each selection to hold exactly one row
        nb_aug_better += (metric_value_augmented.item() < metric_value_validation.item())

    # Report the counter computed by this loop (not the one left over from the synthetic-data cell)
    print(f'{method}: {nb_aug_better} / {len(model_names)} models better with augmented data than with validation data')


In [None]:
# Rows for plain validation data at 5000 samples, for the model currently held in
# `model_name` (this cell does not set it; it uses whatever an earlier loop left behind).
df[(df['dataset origin'] == 'validation') & (df['dataset size'] == 5000) & (df['model'] == model_name)]

In [None]:
# Rows for augmented validation data at 5000 samples, for the model currently held in
# `model_name` (this cell does not set it; it uses whatever an earlier loop left behind).
df[(df['dataset origin'] == 'validation augmented') & (df['dataset size'] == 5000) & (df['model'] == model_name)]

## Comparison of methods

In [None]:
# Select one metric for one method on plain validation data (5000 samples).
# `model_name` is not set here: it is whatever an earlier loop cell left behind.
metric = 'ACE'
method = 'temperature scaling'

selected = (
    (df['dataset origin'] == 'validation')
    & (df['dataset size'] == 5000)
    & (df['method'] == method)
    & (df['model'] == model_name)
)
metric_value_validation = df.loc[selected, metric]


In [None]:
# Display the selection assigned to `metric_value_validation` in the previous cell
metric_value_validation

In [None]:
# Relative change (in percent) of `metric` after calibration on validation data
# (5000 samples), for each calibration method and each model.
metric = 'AUROC'

metric_changes = {'temperature scaling': [], 'vector scaling': []}
for method in metric_changes:

    for model_name in model_names:

        # Rows for this model on the 5000-sample validation calibration set
        rows = df[
            (df['dataset origin'] == 'validation') & (df['dataset size'] == 5000) & (df['model'] == model_name)]

        # .item() requires each selection to hold exactly one row
        metric_after_calib = rows.loc[rows['method'] == method, metric].item()
        metric_before_calib = rows.loc[rows['method'] == 'baseline (no calibration)', metric].item()

        relative_change = 100*(metric_after_calib - metric_before_calib) / metric_before_calib
        metric_changes[method].append(relative_change)

metric_changes

In [None]:
# Display `metric_changes` again (the cell that builds it already ends by displaying it)
metric_changes