# This notebook visualizes standard results for different metrics (ECE, Error, Loss, Brier Score, Temperature) on different models (ResNet, EfficientNet) with different datasets (Cifar10C, OfficeHome, DomainNet).

In [4]:
import pandas as pd
import os
import math

### Change the parameter 'model' to ResNet or EfficientNet to select the model
### Change the parameter 'dataset' to Cifar10C, OfficeHome or DomainNet to select the dataset
### Change the parameter 'loss' to crossentropy or focal to select the loss

In [5]:
# Network whose stored results are shown: "ResNet" or "EfficientNet".
model = "ResNet" # "EfficientNet" # "ResNet"
# Dataset whose results are shown: "Cifar10C", "OfficeHome" or "DomainNet".
dataset = "DomainNet" # "OfficeHome" # "Cifar10C", "DomainNet"
# Training loss of the stored run: "crossentropy" or "focal".
loss = "crossentropy" # "crossentropy", "focal"

In [6]:
# Reject configurations for which no results exist: the focal loss and the
# EfficientNet network are only available for the OfficeHome dataset.
if dataset != "OfficeHome" and (loss == "focal" or model == "EfficientNet"):
    # ValueError is a subclass of Exception, so anything catching the old
    # generic Exception still catches this.
    raise ValueError(
        "EfficientNet network and focal loss experiments are done only on the "
        "OfficeHome dataset. If you want to test EfficientNet or the focal loss "
        "on other datasets, please first train and calibrate the methods using "
        "the train and calibrate scripts."
    )

### Change the parameter 'metric' to "Error", "Loss", "Temperature", "Brier" or "ECE"
### Change the parameter 'calib' to "in" or "out": "in" means the source domains are included in the calibration domains (the default setting of the paper); "out" means the methods are calibrated using only the calibration domains.
### Change the parameter 'average_by' to 'Source' or 'Target': 'Target' gives results on unseen domains and 'Source' gives results on the training domains

In [7]:
# Metric to tabulate; it selects which "<calib>_<metric>_mean.csv" file is read.
metric = 'Error' # "Error", "Loss", "Temperature", "Brier", "ECE"
# Calibration setting; also part of the results file name.
calib = 'in' # in: source domains included, out: only calibration domains
# Which rows are averaged per domain: 'Target' keeps rows that are neither
# Train nor Valid, 'Source' keeps Train rows.
average_by = 'Target' # 'Source', 'Target'

In [8]:
# Bring the helpers of the selected dataset into the notebook namespace.
# NOTE(review): the star imports hide which names are provided; later cells use
# `domains`, `best_rho_calib`, `get_level_filters`, `get_rho_df`, `get_res_df`,
# `get_std` and `MAIN_COLUMNS`, which presumably come from these modules.
if dataset == "Cifar10C":
    from Cifar10C.utils import *
elif dataset == "OfficeHome":
    from OfficeHome.utils import *
elif dataset == "DomainNet":
    from DomainNet.utils import *
else:
    # Without this branch a misspelt dataset name imports nothing and only
    # surfaces later as a confusing NameError on `domains`.
    raise ValueError(
        f"Unknown dataset {dataset!r}; expected 'Cifar10C', 'OfficeHome' or 'DomainNet'."
    )

In [9]:
# Column order of the final table: calibration label, one column per domain,
# then the cross-domain average.
res_columns = ['Calibrated on', *domains, 'Average']

## Load the {metric} Dataframe when source domain is {calib} calibration domains.

### KFold results of best Rho based on both Error and ECE

In [10]:
# Best rho for the chosen model / calibration setting / loss, selected once by
# Error and once by ECE (in that order).
best_rho_er, best_rho_ece = (
    best_rho_calib(model, criterion, calib, loss) for criterion in ('Error', 'ECE')
)

In [11]:
# Locate and load the CSV holding the mean results for the chosen configuration.
# Cifar10C results use a lower-cased dataset directory and sit one level deeper,
# inside a '4filter_1' folder.
if dataset == "Cifar10C":
    result_dir = os.path.join('..', 'results', model.lower(), dataset.lower(), loss, '4filter_1')

    filters_calib = ['fog', 'contrast', 'elastic_transform', 'saturate']
    filters_train = ['gaussian_noise', 'brightness', 'pixelate', 'gaussian_blur']
    filter_rest = ['shot_noise', 'impulse_noise', 'defocus_blur', 'glass_blur', 'zoom_blur', 'jpeg_compression', 'speckle_noise']

    # NOTE(review): `filters_calib` is never used and `filters_train` is passed
    # twice. This looks like the second argument was meant to be
    # `filters_calib` -- confirm against get_level_filters before changing it.
    train_f, valid_f, rest_f = get_level_filters('4filter_1', filters_train, filters_train, filter_rest)
else:
    result_dir = os.path.join('..', 'results', model.lower(), dataset, loss)

res_file = os.path.join(result_dir, f"{calib}_{metric}_mean.csv")
res = pd.read_csv(res_file)

In [12]:
# Echo the resolved CSV path so the loaded configuration can be checked.
res_file

'../results/resnet/DomainNet/crossentropy/in_Error_mean.csv'

### Average the domain wise results for target domains
We have also stored results for the standard deviation (std). Note that we are abusing the statistics here by taking the standard deviation over columns: we ran 20 iterations over 500 randomly selected samples, and in this abuse we treat these columns as single-iteration results.

In [13]:
# Build the per-domain mean and std tables (methods as rows, domains as columns).
#
# For the "Temperature" metric only the temperature-based methods are tabulated
# and the rho-selection helpers are skipped; for every other metric the raw
# frame is first passed through the dataset's rho-selection helper.
if metric == "Temperature":
    MAIN_COLUMNS = ['TS Source', 'TS Oracle',  'CPCS', 'TransCal', 'TS', 'Cluster NN', 'Cluster LR']

if dataset == "Cifar10C":
    if metric != "Temperature":
        res = get_rho_df(res, best_rho_er, best_rho_ece)
    # The Cifar10C helper returns the mean and std tables directly.
    # NOTE(review): `average_by` is not consulted on this path, and 'Average'
    # is presumably filled in by get_res_df -- confirm in Cifar10C/utils.py.
    res_mean, res_std = get_res_df(res, '4filter_1', train_f, valid_f, rest_f)
else:
    # Validate before touching `res`: previously an unknown value left
    # `res_mean` undefined (or stale from an earlier run) and failed further
    # down with a NameError.
    if average_by not in ('Target', 'Source'):
        raise ValueError(f"average_by must be 'Target' or 'Source', got {average_by!r}")

    if metric != "Temperature":
        res = get_res_df(res, best_rho_er, best_rho_ece)

    if average_by == 'Target':
        # Unseen domains: rows used neither for training nor for calibration.
        # The explicit comparisons are kept (rather than `~`) because the
        # column dtypes are not visible here.
        selected_rows = (res['Valid'] == False) & (res['Train'] == False)
    else:
        # Source domains: rows belonging to the training domains.
        selected_rows = res['Train'] == True

    per_domain = res.loc[selected_rows, MAIN_COLUMNS + ['Domain']].groupby(['Domain'])
    res_mean = per_domain.mean().T
    res_std = per_domain.std().T
    res_mean['Average'] = res_mean[domains].mean(axis=1)

### CaliGen Er is the method where the best value of rho is selected by the minimum cross-validation error. This is the one presented in the paper. For comparison, we also present the results with the best rho value selected by minimum ECE (displayed as CaliGen ECE)

#### Get the standard deviations for Average. The function is in utils.py

In [14]:
# Standard deviation that accompanies the 'Average' column; it is stored under
# 'std' and read back when the "mean±std" strings are built.
# get_std is not defined in this notebook (presumably it comes from the
# dataset's utils module).
res_std['std'] = get_std(res_mean, res_std)

In [15]:
# Keep two decimals in both tables before they are rendered as text.
res_mean, res_std = (table.round(2) for table in (res_mean, res_std))

In [16]:
# Turn every numeric cell into a "mean±std" string. Each domain column is
# paired with the std column of the same name; 'Average' is paired with 'std'.
for mean_col, std_col in zip([*domains, 'Average'], [*domains, 'std']):
    mean_text = res_mean[mean_col].astype(str)
    std_text = res_std[std_col].astype(str)
    res_mean[mean_col] = mean_text + '±' + std_text

#### Show the results

In [17]:
# Tag every row with the calibration setting and name the column axis 'Method'.
res_mean['Calibrated on'] = [calib for _ in range(len(res_mean))]
res_mean.rename_axis(columns='Method', inplace=True)

In [21]:
# One-line summary of the configuration the table below was produced with.
print(f"Result for model: {model}; loss: {loss}; metric {metric}; calibration: {calib}; Averaged by: {average_by} domains")

Result for model: ResNet; loss: crossentropy; metric Error; calibration: in; Averaged by: Target domains


In [22]:
# Display the table with the columns ordered as in res_columns.
res_mean[res_columns]

Method,Calibrated on,clipart,infograph,painting,quickdraw,real,sketch,Average
Uncalibrated,in,65.0±6.23,90.22±2.53,76.9±8.6,92.48±3.22,68.99±8.51,73.96±4.97,77.93±11.93
TS Source,in,65.0±6.23,90.22±2.53,76.9±8.6,92.48±3.22,68.99±8.51,73.96±4.97,77.93±11.93
TS Oracle,in,65.0±6.23,90.22±2.53,76.9±8.6,92.48±3.22,68.99±8.51,73.96±4.97,77.93±11.93
HB,in,62.75±6.03,89.38±2.39,75.04±7.92,91.89±2.92,68.62±7.33,72.04±4.96,76.62±12.02
Isotonic,in,61.17±6.32,88.68±2.54,73.52±7.66,91.23±3.06,66.08±7.6,70.63±5.13,75.22±12.52
Beta abm,in,60.48±6.36,88.47±2.46,73.03±7.79,91.14±3.05,65.12±8.14,70.33±5.05,74.76±12.81
Beta am,in,60.32±6.28,88.5±2.52,73.07±7.78,91.05±3.15,65.28±7.89,70.17±5.09,74.73±12.79
Beta ab,in,63.89±6.11,89.62±2.41,75.84±8.27,92.11±3.35,68.11±8.2,73.09±4.7,77.11±12.02
CPCS,in,65.0±6.23,90.22±2.53,76.9±8.6,92.48±3.22,68.99±8.51,73.96±4.97,77.93±11.93
TransCal,in,65.0±6.23,90.22±2.53,76.9±8.6,92.48±3.22,68.99±8.51,73.96±4.97,77.93±11.93
