# Ablation Study Results for ECE

In [1]:
import pandas as pd
import os
from utils import *

In [2]:
# Column order used for the final ablation tables: the calibration setting
# first, then one column per compared method.
res_columns = [
    'Calibrated on',
    'Uncalibrated',
    'CaliGen Er',
    'Unmodified Loss',
    'Unmodified Head',
]

## Load the ECE Dataframe when source domain is included in calibration domains.

### KFold results of best Rho based on both Error and ECE

In [3]:
# Best-rho selection for the 'in' setting of the cross-entropy runs, queried
# once per criterion ('Error' first, then 'ECE').
best_rho_er_in, best_rho_ece_in = [
    best_rho_calib(criterion, 'in', 'crossentropy')
    for criterion in ('Error', 'ECE')
]

In [4]:
# Mean-ECE table of the cross-entropy runs for the 'in' setting.
results_dir_in = os.path.join('..', '..', 'results', 'OfficeHome', 'crossentropy')
ece_file_in = os.path.join(results_dir_in, 'in_ECE_mean.csv')
# Read the CSV and hand it to get_res_df together with both best-rho selections.
ece_in = get_res_df(pd.read_csv(ece_file_in), best_rho_er_in, best_rho_ece_in, True)

## Unmodified Loss is when Rho = 0

In [5]:
# Per the section heading, 'Unmodified Loss' is the CaliGen result at rho = 0,
# so it is a copy of the 'CaliGen 0' column.
ece_in = ece_in.assign(**{'Unmodified Loss': ece_in['CaliGen 0']})

### Average the domain wise results for target domains
Standard deviations were also stored with the results. Note that we are abusing the statistics here by taking the standard deviation over columns: we ran 20 iterations over 500 randomly selected samples, but we treat each of these columns as if it were the result of a single iteration.

In [6]:
# Columns kept for this part of the ablation: three bookkeeping columns,
# followed by the method columns that are compared.
ABLATION_COL = ['Trainer', 'Calib', 'Domain'] + ['Uncalibrated', 'CaliGen Er', 'Unmodified Loss']

In [7]:
# Keep only rows that are neither validation nor training rows; per the section
# heading these are the target domains.
target_rows_in = ece_in.loc[(ece_in['Valid'] == False) & (ece_in['Train'] == False), ABLATION_COL]
# Aggregate the method columns only. 'Trainer' and 'Calib' never appear in the
# aggregated table, i.e. they are not numeric: older pandas dropped them
# silently, while pandas >= 2.0 raises a TypeError when asked to average them.
method_cols_in = [name for name in ABLATION_COL if name not in ('Trainer', 'Calib', 'Domain')]
grouped_in = target_rows_in.groupby(['Domain'])[method_cols_in]
# Transpose so that rows are methods and columns are domains.
res_mean_in = grouped_in.mean().T
res_std_in = grouped_in.std().T

In [8]:
# The 'Average' column is the row-wise mean over the four domain columns.
res_mean_in['Average'] = res_mean_in.loc[:, ['Art', 'Clipart', 'Product', 'RealWorld']].mean(axis='columns')

Compute the standard deviation for the Average column. The function `get_std` is defined in `utils.py`.

In [9]:
# get_std (from utils) supplies the spread that is later printed next to 'Average'.
average_std_in = get_std(res_mean_in, res_std_in)
res_std_in['std'] = average_std_in

In [10]:
# Round both tables to two decimals before they are turned into strings.
res_mean_in, res_std_in = res_mean_in.round(decimals=2), res_std_in.round(decimals=2)

In [11]:
# Rewrite every mean cell as the text '<mean>+<std>'. The 'Average' column takes
# its spread from the 'std' column of the std table.
for mean_col, std_col in [('Art', 'Art'), ('Clipart', 'Clipart'), ('Product', 'Product'),
                          ('RealWorld', 'RealWorld'), ('Average', 'std')]:
    res_mean_in[mean_col] = res_mean_in[mean_col].astype(str) + '+' + res_std_in[std_col].astype(str)

#### Show the results

In [12]:
# Flip the table so that domains become rows and methods become columns.
result_in = res_mean_in.transpose()

## Unmodified Head

In [13]:
# Best-rho selection for the 'in' setting of the 'ablation' runs, queried once
# per criterion ('Error' first, then 'ECE').
best_rho_er_in_ab, best_rho_ece_in_ab = [
    best_rho_calib(criterion, 'in', 'crossentropy', 'ablation')
    for criterion in ('Error', 'ECE')
]

In [14]:
# Mean-ECE table of the ablation runs for the 'in' setting.
results_dir_in_ab = os.path.join('..', '..', 'results', 'OfficeHome', 'ablation')
ece_file_in_ab = os.path.join(results_dir_in_ab, 'in_ECE_mean.csv')
# Read the CSV and hand it to get_res_df together with both best-rho selections.
ece_in_ab = get_res_df(pd.read_csv(ece_file_in_ab), best_rho_er_in_ab, best_rho_ece_in_ab, True)

In [15]:
# In the ablation table the 'CaliGen Er' column is reported as 'Unmodified Head'.
ece_in_ab = ece_in_ab.assign(**{'Unmodified Head': ece_in_ab['CaliGen Er']})

In [16]:
# Columns kept for the 'Unmodified Head' part: three bookkeeping columns,
# followed by the method columns that are compared.
ABLATION_COL = ['Trainer', 'Calib', 'Domain'] + ['Uncalibrated', 'Unmodified Head']

In [17]:
# Keep only rows that are neither validation nor training rows.
target_rows_in_ab = ece_in_ab.loc[(ece_in_ab['Valid'] == False) & (ece_in_ab['Train'] == False), ABLATION_COL]
# Aggregate the method columns only. 'Trainer' and 'Calib' never appear in the
# aggregated table, i.e. they are not numeric: older pandas dropped them
# silently, while pandas >= 2.0 raises a TypeError when asked to average them.
method_cols_in_ab = [name for name in ABLATION_COL if name not in ('Trainer', 'Calib', 'Domain')]
grouped_in_ab = target_rows_in_ab.groupby(['Domain'])[method_cols_in_ab]
# Transpose so that rows are methods and columns are domains.
res_mean_in_ab = grouped_in_ab.mean().T
res_std_in_ab = grouped_in_ab.std().T

In [18]:
# The 'Average' column is the row-wise mean over the four domain columns.
res_mean_in_ab['Average'] = res_mean_in_ab.loc[:, ['Art', 'Clipart', 'Product', 'RealWorld']].mean(axis='columns')

In [19]:
# get_std (from utils) supplies the spread that is later printed next to 'Average'.
average_std_in_ab = get_std(res_mean_in_ab, res_std_in_ab)
res_std_in_ab['std'] = average_std_in_ab

In [20]:
# Round both tables to two decimals before they are turned into strings.
res_mean_in_ab, res_std_in_ab = res_mean_in_ab.round(decimals=2), res_std_in_ab.round(decimals=2)

In [21]:
# Rewrite every mean cell as the text '<mean>+<std>'. The 'Average' column takes
# its spread from the 'std' column of the std table.
for mean_col, std_col in [('Art', 'Art'), ('Clipart', 'Clipart'), ('Product', 'Product'),
                          ('RealWorld', 'RealWorld'), ('Average', 'std')]:
    res_mean_in_ab[mean_col] = res_mean_in_ab[mean_col].astype(str) + '+' + res_std_in_ab[std_col].astype(str)

Show the result

In [22]:
# Attach the 'Unmodified Head' row of the ablation table as a new column
# (aligned on the domain labels), tag every row with the 'in' setting, and
# display the table.
result_in['Unmodified Head'] = res_mean_in_ab.loc['Unmodified Head']
result_in['Calibrated on'] = 'in'
result_in

Unnamed: 0_level_0,Uncalibrated,CaliGen Er,Unmodified Loss,Unmodified Head,Calibrated on
Domain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Art,37.61+5.21,6.64+1.84,13.41+4.77,10.57+2.05,in
Clipart,40.32+0.28,14.61+4.87,21.23+7.37,19.14+4.67,in
Product,29.64+4.81,6.38+0.93,10.14+4.83,6.73+0.96,in
RealWorld,26.05+7.48,7.27+0.54,8.93+0.26,6.22+0.32,in
Average,33.41+7.75,8.72+4.32,13.43+6.93,10.66+5.79,in


## Load the ECE Dataframe when source domain is not included in calibration domains.

### KFold results of best Rho based on both Error and ECE

In [23]:
# Best-rho selection for the 'out' setting of the cross-entropy runs, queried
# once per criterion ('Error' first, then 'ECE').
best_rho_er_out, best_rho_ece_out = [
    best_rho_calib(criterion, 'out', 'crossentropy')
    for criterion in ('Error', 'ECE')
]

In [24]:
# Mean-ECE table of the cross-entropy runs for the 'out' setting.
results_dir_out = os.path.join('..', '..', 'results', 'OfficeHome', 'crossentropy')
ece_file_out = os.path.join(results_dir_out, 'out_ECE_mean.csv')
# Read the CSV and hand it to get_res_df together with both best-rho selections.
ece_out = get_res_df(pd.read_csv(ece_file_out), best_rho_er_out, best_rho_ece_out, True)

In [25]:
# 'Unmodified Loss' is reported from the 'CaliGen 0' column (CaliGen at rho = 0).
ece_out = ece_out.assign(**{'Unmodified Loss': ece_out['CaliGen 0']})

### Average the domain wise results for target domains when source domain not included in calibration domain
Standard deviations were also stored with the results. Note that we are abusing the statistics here by taking the standard deviation over columns: we ran 20 iterations over 500 randomly selected samples, but we treat each of these columns as if it were the result of a single iteration.

In [26]:
# Columns kept for this part of the ablation: three bookkeeping columns,
# followed by the method columns that are compared.
ABLATION_COL = ['Trainer', 'Calib', 'Domain'] + ['Uncalibrated', 'CaliGen Er', 'Unmodified Loss']

In [27]:
# Keep only rows that are neither validation nor training rows; per the section
# heading these are the target domains.
target_rows_out = ece_out.loc[(ece_out['Valid'] == False) & (ece_out['Train'] == False), ABLATION_COL]
# Aggregate the method columns only. 'Trainer' and 'Calib' never appear in the
# aggregated table, i.e. they are not numeric: older pandas dropped them
# silently, while pandas >= 2.0 raises a TypeError when asked to average them.
method_cols_out = [name for name in ABLATION_COL if name not in ('Trainer', 'Calib', 'Domain')]
grouped_out = target_rows_out.groupby(['Domain'])[method_cols_out]
# Transpose so that rows are methods and columns are domains.
res_mean_out = grouped_out.mean().T
res_std_out = grouped_out.std().T

Compute the Average column and its standard deviation. The function `get_std` is defined in `utils.py`.

In [28]:
# 'Average' is the row-wise mean over the four domain columns; its spread comes
# from get_std (utils) and is stored in the 'std' column of the std table.
domain_means_out = res_mean_out.loc[:, ['Art', 'Clipart', 'Product', 'RealWorld']]
res_mean_out['Average'] = domain_means_out.mean(axis='columns')
res_std_out['std'] = get_std(res_mean_out, res_std_out)

In [29]:
# Round both tables to two decimals before they are turned into strings.
res_mean_out, res_std_out = res_mean_out.round(decimals=2), res_std_out.round(decimals=2)

In [30]:
# Rewrite every mean cell as the text '<mean>+<std>'. The 'Average' column takes
# its spread from the 'std' column of the std table.
for mean_col, std_col in [('Art', 'Art'), ('Clipart', 'Clipart'), ('Product', 'Product'),
                          ('RealWorld', 'RealWorld'), ('Average', 'std')]:
    res_mean_out[mean_col] = res_mean_out[mean_col].astype(str) + '+' + res_std_out[std_col].astype(str)

#### Show the results

In [31]:
# Flip the table so that domains become rows and methods become columns.
result_out = res_mean_out.transpose()

In [32]:
# Best-rho selection for the 'out' setting of the 'ablation' runs, queried once
# per criterion ('Error' first, then 'ECE').
best_rho_er_out_ab, best_rho_ece_out_ab = [
    best_rho_calib(criterion, 'out', 'crossentropy', 'ablation')
    for criterion in ('Error', 'ECE')
]

In [33]:
# Mean-ECE table of the ablation runs for the 'out' setting.
results_dir_out_ab = os.path.join('..', '..', 'results', 'OfficeHome', 'ablation')
ece_file_out_ab = os.path.join(results_dir_out_ab, 'out_ECE_mean.csv')
# Read the CSV and hand it to get_res_df together with both best-rho selections.
ece_out_ab = get_res_df(pd.read_csv(ece_file_out_ab), best_rho_er_out_ab, best_rho_ece_out_ab, True)

In [34]:
# In the ablation table the 'CaliGen Er' column is reported as 'Unmodified Head'.
ece_out_ab = ece_out_ab.assign(**{'Unmodified Head': ece_out_ab['CaliGen Er']})

In [35]:
# Columns kept for the 'Unmodified Head' part: three bookkeeping columns,
# followed by the method columns that are compared.
ABLATION_COL = ['Trainer', 'Calib', 'Domain'] + ['Uncalibrated', 'Unmodified Head']

In [36]:
# Keep only rows that are neither validation nor training rows.
target_rows_out_ab = ece_out_ab.loc[(ece_out_ab['Valid'] == False) & (ece_out_ab['Train'] == False), ABLATION_COL]
# Aggregate the method columns only. 'Trainer' and 'Calib' never appear in the
# aggregated table, i.e. they are not numeric: older pandas dropped them
# silently, while pandas >= 2.0 raises a TypeError when asked to average them.
method_cols_out_ab = [name for name in ABLATION_COL if name not in ('Trainer', 'Calib', 'Domain')]
grouped_out_ab = target_rows_out_ab.groupby(['Domain'])[method_cols_out_ab]
# Transpose so that rows are methods and columns are domains.
res_mean_out_ab = grouped_out_ab.mean().T
res_std_out_ab = grouped_out_ab.std().T

In [37]:
# The 'Average' column is the row-wise mean over the four domain columns.
res_mean_out_ab['Average'] = res_mean_out_ab.loc[:, ['Art', 'Clipart', 'Product', 'RealWorld']].mean(axis='columns')

In [38]:
# Spread of the 'Average' column from get_std (utils); it is computed on the
# unrounded tables, which are then rounded to two decimals.
average_std_out_ab = get_std(res_mean_out_ab, res_std_out_ab)
res_std_out_ab['std'] = average_std_out_ab
res_mean_out_ab, res_std_out_ab = res_mean_out_ab.round(decimals=2), res_std_out_ab.round(decimals=2)

In [39]:
# Rewrite every mean cell as the text '<mean>+<std>'. The 'Average' column takes
# its spread from the 'std' column of the std table.
for mean_col, std_col in [('Art', 'Art'), ('Clipart', 'Clipart'), ('Product', 'Product'),
                          ('RealWorld', 'RealWorld'), ('Average', 'std')]:
    res_mean_out_ab[mean_col] = res_mean_out_ab[mean_col].astype(str) + '+' + res_std_out_ab[std_col].astype(str)

In [40]:
# Attach the 'Unmodified Head' row of the ablation table as a new column
# (aligned on the domain labels), tag every row with the 'out' setting, and
# display the table.
result_out['Unmodified Head'] = res_mean_out_ab.loc['Unmodified Head']
result_out['Calibrated on'] = 'out'
result_out

Unnamed: 0_level_0,Uncalibrated,CaliGen Er,Unmodified Loss,Unmodified Head,Calibrated on
Domain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Art,37.61+5.21,6.81+1.12,17.72+1.68,10.44+1.84,out
Clipart,40.32+0.28,11.48+3.06,27.51+2.4,17.56+1.85,out
Product,29.64+4.81,6.56+1.82,12.85+1.24,8.92+4.1,out
RealWorld,26.05+7.48,10.14+1.98,12.71+0.57,6.5+2.01,out
Average,33.41+7.75,8.75+2.99,17.7+6.22,10.85+4.88,out


## Combined results when source domain included (in) and not included (out) in calibration domains

In [41]:
# Stack the 'in' table on top of the 'out' table, both restricted to (and
# ordered by) res_columns.
res = pd.concat([table[res_columns] for table in (result_in, result_out)])

In [None]:
# Display the combined 'in' / 'out' ablation table built in the previous cell.
res

Unnamed: 0_level_0,Calibrated on,Uncalibrated,CaliGen Er,Unmodified Loss,Unmodified Head
Domain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Art,in,37.61+5.21,6.64+1.84,13.41+4.77,10.57+2.05
Clipart,in,40.32+0.28,14.61+4.87,21.23+7.37,19.14+4.67
Product,in,29.64+4.81,6.38+0.93,10.14+4.83,6.73+0.96
RealWorld,in,26.05+7.48,7.27+0.54,8.93+0.26,6.22+0.32
Average,in,33.41+7.75,8.72+4.32,13.43+6.93,10.66+5.79
Art,out,37.61+5.21,6.81+1.12,17.72+1.68,10.44+1.84
Clipart,out,40.32+0.28,11.48+3.06,27.51+2.4,17.56+1.85
Product,out,29.64+4.81,6.56+1.82,12.85+1.24,8.92+4.1
RealWorld,out,26.05+7.48,10.14+1.98,12.71+0.57,6.5+2.01
Average,out,33.41+7.75,8.75+2.99,17.7+6.22,10.85+4.88


### Average the domain wise results for source domains
Standard deviations were also stored with the results. Note that we are abusing the statistics here by taking the standard deviation over columns: we ran 20 iterations over 500 randomly selected samples, but we treat each of these columns as if it were the result of a single iteration.

In [None]:
# Restrict to the training (source-domain) rows.
# NOTE(review): MAIN_COLUMNS is not defined in this notebook; presumably it is
# provided by `from utils import *` - confirm.
source_rows_in = ece_in.loc[ece_in['Train'] == True, MAIN_COLUMNS + ['Domain']]
# Per-domain mean and standard deviation, transposed so that columns are domains.
res_mean_in = source_rows_in.groupby(['Domain']).mean().T
res_std_in = source_rows_in.groupby(['Domain']).std().T

In [None]:
# The 'Average' column is the row-wise mean over the four domain columns.
res_mean_in['Average'] = res_mean_in.loc[:, ['Art', 'Clipart', 'Product', 'RealWorld']].mean(axis='columns')

Compute the standard deviation for the Average column. The function `get_std` is defined in `utils.py`.

In [None]:
# get_std (from utils) supplies the spread that is later printed next to 'Average'.
average_std_in = get_std(res_mean_in, res_std_in)
res_std_in['std'] = average_std_in

In [None]:
# Round both tables to two decimals before they are turned into strings.
res_mean_in, res_std_in = res_mean_in.round(decimals=2), res_std_in.round(decimals=2)

In [None]:
# Rewrite every mean cell as the text '<mean>+<std>'. The 'Average' column takes
# its spread from the 'std' column of the std table.
for mean_col, std_col in [('Art', 'Art'), ('Clipart', 'Clipart'), ('Product', 'Product'),
                          ('RealWorld', 'RealWorld'), ('Average', 'std')]:
    res_mean_in[mean_col] = res_mean_in[mean_col].astype(str) + '+' + res_std_in[std_col].astype(str)

In [None]:
# Tag every row with the 'in' setting and label the column axis 'Method'.
res_mean_in['Calibrated on'] = 'in'
res_mean_in = res_mean_in.rename_axis('Method', axis='columns')

In [None]:
# In this table the columns are domains (rows are methods), so the
# method-oriented res_columns list does not apply and would raise a KeyError.
# Select the setting column followed by the domain columns instead, matching
# the recorded output.
res_mean_in[['Calibrated on', 'Art', 'Clipart', 'Product', 'RealWorld', 'Average']]

Method,Calibrated on,Art,Clipart,Product,RealWorld,Average
Uncalibrated,in,0.11+0.0,0.63+0.0,0.1+0.0,2.42+0.0,0.82+0.95
TS Source,in,0.23+0.0,2.74+0.0,0.18+0.0,13.55+0.0,4.17+5.51
TS Oracle,in,0.15+0.0,0.58+0.0,0.08+0.0,0.71+0.0,0.38+0.27
HB,in,21.98+1.44,15.72+1.08,10.98+2.41,18.29+1.8,16.74+4.37
Isotonic,in,1.55+0.65,3.15+0.19,0.58+0.15,9.7+1.21,3.74+3.63
Beta abm,in,13.14+3.49,6.22+0.25,2.65+0.67,16.48+1.7,9.62+5.81
Beta am,in,4.79+2.18,7.6+0.84,2.77+1.39,20.49+4.41,8.91+7.37
Beta ab,in,14.41+2.17,7.69+0.42,4.96+1.44,13.19+0.97,10.06+4.13
TS,in,2.21+1.21,7.45+0.85,2.11+1.15,23.48+5.16,8.81+9.16
Cluster NN,in,2.16+0.99,6.61+0.59,2.11+1.11,22.86+5.38,8.44+8.97


### Average the domain wise results for source domains when source domain not included in calibration domain
Standard deviations were also stored with the results. Note that we are abusing the statistics here by taking the standard deviation over columns: we ran 20 iterations over 500 randomly selected samples, but we treat each of these columns as if it were the result of a single iteration.

In [None]:
# Restrict to the training (source-domain) rows.
# NOTE(review): MAIN_COLUMNS is not defined in this notebook; presumably it is
# provided by `from utils import *` - confirm.
source_rows_out = ece_out.loc[ece_out['Train'] == True, MAIN_COLUMNS + ['Domain']]
# Per-domain mean and standard deviation, transposed so that columns are domains.
res_mean_out = source_rows_out.groupby(['Domain']).mean().T
res_std_out = source_rows_out.groupby(['Domain']).std().T

Compute the Average column and its standard deviation. The function `get_std` is defined in `utils.py`.

In [None]:
# 'Average' is the row-wise mean over the four domain columns; its spread comes
# from get_std (utils) and is stored in the 'std' column of the std table.
domain_means_out = res_mean_out.loc[:, ['Art', 'Clipart', 'Product', 'RealWorld']]
res_mean_out['Average'] = domain_means_out.mean(axis='columns')
res_std_out['std'] = get_std(res_mean_out, res_std_out)

In [None]:
# Round both tables to two decimals, then rewrite every mean cell as the text
# '<mean>+<std>'. The 'Average' column takes its spread from the 'std' column.
res_mean_out, res_std_out = res_mean_out.round(decimals=2), res_std_out.round(decimals=2)
for mean_col, std_col in [('Art', 'Art'), ('Clipart', 'Clipart'), ('Product', 'Product'),
                          ('RealWorld', 'RealWorld'), ('Average', 'std')]:
    res_mean_out[mean_col] = res_mean_out[mean_col].astype(str) + '+' + res_std_out[std_col].astype(str)

#### Show the results

In [None]:
# Tag every row with the 'out' setting and label the column axis 'Method'.
res_mean_out['Calibrated on'] = 'out'
res_mean_out = res_mean_out.rename_axis('Method', axis='columns')

In [None]:
# In this table the columns are domains (rows are methods), so the
# method-oriented res_columns list does not apply and would raise a KeyError.
# Select the setting column followed by the domain columns instead, matching
# the recorded output.
res_mean_out[['Calibrated on', 'Art', 'Clipart', 'Product', 'RealWorld', 'Average']]

Method,Calibrated on,Art,Clipart,Product,RealWorld,Average
Uncalibrated,out,0.11+0.0,0.63+0.0,0.1+0.0,2.42+0.0,0.82+0.95
TS Source,out,0.23+0.0,2.74+0.0,0.18+0.0,13.55+0.0,4.17+5.51
TS Oracle,out,0.15+0.0,0.58+0.0,0.08+0.0,0.71+0.0,0.38+0.27
HB,out,25.11+1.32,17.8+2.26,18.2+5.49,16.45+1.3,19.39+4.58
Isotonic,out,1.74+0.97,2.25+0.29,0.96+0.68,9.92+1.36,3.72+3.72
Beta abm,out,21.49+7.18,17.65+3.32,16.07+3.8,23.72+6.84,19.73+6.33
Beta am,out,7.69+3.87,13.92+3.46,10.69+7.25,27.44+8.28,14.93+9.69
Beta ab,out,22.44+5.36,17.55+1.24,14.2+1.33,15.37+1.72,17.39+4.32
TS,out,3.7+2.27,11.53+2.42,5.3+4.22,30.11+9.26,12.66+11.78
Cluster NN,out,3.05+1.61,9.66+1.49,4.54+3.24,28.47+8.25,11.43+11.12


## Combined source-domain results when the source domain is included (in) and not included (out) in the calibration domains

In [None]:
# Both source-domain tables have domains as columns, so the method-oriented
# res_columns list would raise a KeyError here. Use the setting column followed
# by the domain columns, then stack the 'in' table on top of the 'out' table.
source_table_columns = ['Calibrated on', 'Art', 'Clipart', 'Product', 'RealWorld', 'Average']
res = pd.concat([res_mean_in[source_table_columns], res_mean_out[source_table_columns]])

In [None]:
# Display the combined 'in' / 'out' source-domain table built in the previous cell.
res

Method,Calibrated on,Art,Clipart,Product,RealWorld,Average
Uncalibrated,in,0.11+0.0,0.63+0.0,0.1+0.0,2.42+0.0,0.82+0.95
TS Source,in,0.23+0.0,2.74+0.0,0.18+0.0,13.55+0.0,4.17+5.51
TS Oracle,in,0.15+0.0,0.58+0.0,0.08+0.0,0.71+0.0,0.38+0.27
HB,in,21.98+1.44,15.72+1.08,10.98+2.41,18.29+1.8,16.74+4.37
Isotonic,in,1.55+0.65,3.15+0.19,0.58+0.15,9.7+1.21,3.74+3.63
Beta abm,in,13.14+3.49,6.22+0.25,2.65+0.67,16.48+1.7,9.62+5.81
Beta am,in,4.79+2.18,7.6+0.84,2.77+1.39,20.49+4.41,8.91+7.37
Beta ab,in,14.41+2.17,7.69+0.42,4.96+1.44,13.19+0.97,10.06+4.13
TS,in,2.21+1.21,7.45+0.85,2.11+1.15,23.48+5.16,8.81+9.16
Cluster NN,in,2.16+0.99,6.61+0.59,2.11+1.11,22.86+5.38,8.44+8.97
