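# Experiment

This notebook trains five models (M1–M5) on the training split of the selected dataset, applies Platt scaling, isotonic regression and temperature scaling to the logistic-regression models (M3–M5) using the calibration split, and compares all models on the test split.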

In [1]:
import pandas as pd
import numpy as np
import torch
import os
from IPython.display import display

from models import get_global_threshold, get_individual_thresholds
from models import get_global_similarity_log_reg, get_similarity_log_reg
from models import get_embeddings_log_reg
from models import get_global_sim_X_y, get_concept_sim_X_y

from calibration_framework import apply_platt_scaling, apply_isotonic_regression, apply_temperature_scaling

from utils import compare_all_models_calibration_metric, compare_all_models_calibration_avg, compare_all_models_calibration_concept

In [2]:
# Name of the dataset to run on. It is interpolated into the input and output
# paths used by later cells and selects the dataset-specific branches
# ('CLEVR' column removal, 'CUB' summary display).
dataset_name = 'CUB'

## 1. Get pre-processed data

In [3]:
# Load the three pre-processed inputs for the selected dataset.
metadata_df = pd.read_csv(f'../Data/{dataset_name}/metadata.csv')
embeddings = torch.load(f'Embeddings/{dataset_name}/embeddings.pt')
cosine_similarity_df = pd.read_csv(f'Cosine_Similarities/{dataset_name}/cosine_similarities.csv')

# For CLEVR, the same two columns are removed from both frames.
if dataset_name == 'CLEVR':
    clevr_dropped_columns = ['size::large', 'material::rubber']
    metadata_df = metadata_df.drop(columns=clevr_dropped_columns)
    cosine_similarity_df = cosine_similarity_df.drop(columns=clevr_dropped_columns)

# The concept names are the columns of the cosine-similarity table.
concepts = cosine_similarity_df.columns.tolist()


def _take_split(split_name):
    """Select the rows whose 'split' column equals `split_name`.

    Returns the boolean mask, the masked embeddings, and the masked metadata
    and cosine-similarity frames (both with a fresh 0..n-1 index).
    """
    mask = metadata_df['split'] == split_name
    return (
        mask,
        embeddings[mask],
        metadata_df[mask].reset_index(drop=True),
        cosine_similarity_df[mask].reset_index(drop=True),
    )


(train_mask, train_embeddings,
 train_metadata_df, train_cosine_similarity_df) = _take_split('train')

(cal_mask, cal_embeddings,
 cal_metadata_df, cal_cosine_similarity_df) = _take_split('calibration')

(test_mask, test_embeddings,
 test_metadata_df, test_cosine_similarity_df) = _take_split('test')

## 2. Train models

In [4]:
# (M1) Fit on the training split; the call returns the models, a global
# training error and the training-error record.
(m1_models,
 m1_global_train_error,
 m1_train_errors) = get_global_threshold(
    train_metadata_df,
    train_cosine_similarity_df,
    verbose=False,
)
# Tag the error record with the model label; the comparison table is indexed by it.
m1_train_errors['Model'] = '(M1) Global Threshold'

In [5]:
# (M2) Fit on the training split; the call returns the models and the
# training-error record.
m2_models, m2_train_errors = get_individual_thresholds(
    train_metadata_df,
    train_cosine_similarity_df,
    verbose=False,
)
# Tag the error record with the model label; the comparison table is indexed by it.
m2_train_errors['Model'] = '(M2) Individual Threshold'

In [6]:
# (M3) Fit on the training split; the call returns the models, a global
# training error and the training-error record.
(m3_models,
 m3_global_train_error,
 m3_train_errors) = get_global_similarity_log_reg(
    train_metadata_df,
    train_cosine_similarity_df,
    verbose=False,
)
# Tag the error record with the model label; the comparison table is indexed by it.
m3_train_errors['Model'] = '(M3) Global Similarity LogReg'

In [7]:
# (M4) Fit on the training split; the call returns the models (keyed by
# concept, as later cells index them) and the training-error record.
m4_models, m4_train_errors = get_similarity_log_reg(
    train_metadata_df,
    train_cosine_similarity_df,
    verbose=False,
)
# Tag the error record with the model label; the comparison table is indexed by it.
m4_train_errors['Model'] = '(M4) Individual Similarity LogReg'

In [8]:
# (M5) The only model that is also given the training embeddings; the call
# returns the models and the training-error record.
m5_models, m5_train_errors = get_embeddings_log_reg(
    train_embeddings,
    train_metadata_df,
    train_cosine_similarity_df,
    verbose=False,
)
# Tag the error record with the model label; the comparison table is indexed by it.
m5_train_errors['Model'] = '(M5) Embeddings LogReg'

In [9]:
# Uncalibrated models, keyed by the same labels that tag the training-error records.
base_models = {
    '(M1) Global Threshold': m1_models,
    '(M2) Individual Threshold': m2_models,
    '(M3) Global Similarity LogReg': m3_models,
    '(M4) Individual Similarity LogReg': m4_models,
    '(M5) Embeddings LogReg': m5_models,
}

### 2.1 Evaluate training classification error

In [10]:
# One training-error record per model. Each record already carries its
# 'Model' label (set when the model was trained), which becomes the index.
train_error_records = [
    m1_train_errors,
    m2_train_errors,
    m3_train_errors,
    m4_train_errors,
    m5_train_errors,
]
# Use the DataFrame constructor for a list of records; DataFrame.from_dict is
# meant for dict input and only handled the list by passing it straight through.
error_comparison_df = pd.DataFrame(train_error_records).set_index('Model')

if dataset_name == 'CUB':
    # For CUB, show summary statistics per model instead of the full table:
    # transpose so that models are columns, then describe each column.
    error_comparison_df = error_comparison_df.transpose()
    display(error_comparison_df.describe())
else:
    display(error_comparison_df)

Model,(M1) Global Threshold,(M2) Individual Threshold,(M3) Global Similarity LogReg,(M4) Individual Similarity LogReg,(M5) Embeddings LogReg
count,312.0,312.0,312.0,312.0,312.0
mean,0.100325,0.084739,0.100344,0.085458,0.084219
std,0.123291,0.090527,0.123204,0.09113,0.086232
min,0.001535,0.001396,0.001535,0.001535,0.001535
25%,0.01619,0.016085,0.01612,0.015736,0.015876
50%,0.042149,0.037683,0.042219,0.038241,0.04187
75%,0.157432,0.151151,0.157188,0.15307,0.153559
max,0.836008,0.344173,0.836008,0.346546,0.321842


## 3. Calibrate models

In [11]:
# Calibration inputs for M3. get_global_sim_X_y returns four values; only the
# first two are needed here.
X_cal, y_cal, _, _ = get_global_sim_X_y(cal_metadata_df, cal_cosine_similarity_df)

# Every calibration method receives the same trained M3 model and the same
# calibration data; results are keyed by method name.
m3_models_cal = {
    'Platt': apply_platt_scaling(m3_models, X_cal, y_cal),
    'Isotonic': apply_isotonic_regression(m3_models, X_cal, y_cal),
    'Temperature': apply_temperature_scaling(m3_models, X_cal, y_cal, verbose=False),
}

In [12]:
# Calibrated M4 models, laid out as {method name: {concept: calibrated model}}.
m4_models_cal = {method: {} for method in ('Platt', 'Isotonic', 'Temperature')}

for concept, concept_model in m4_models.items():
    # Calibration inputs are rebuilt for each concept.
    X_cal, y_cal = get_concept_sim_X_y(cal_metadata_df, cal_cosine_similarity_df, concept)

    m4_models_cal['Platt'][concept] = apply_platt_scaling(concept_model, X_cal, y_cal)
    m4_models_cal['Isotonic'][concept] = apply_isotonic_regression(concept_model, X_cal, y_cal)
    m4_models_cal['Temperature'][concept] = apply_temperature_scaling(
        concept_model, X_cal, y_cal, verbose=False
    )

In [13]:
# Calibrated M5 models, laid out as {method name: {concept: calibrated model}}.
m5_models_cal = {'Platt': {}, 'Isotonic': {}, 'Temperature': {}}

# The M5 calibration inputs are the calibration-split embeddings, which are the
# same for every concept, so they are bound once outside the loop.
X_cal = cal_embeddings

# Iterate over the M5 models themselves. Looping over another model's keys
# would raise KeyError or silently skip concepts if the key sets ever differed.
for concept, concept_model in m5_models.items():
    # Binary target: 1 where the concept's metadata column equals 1, else 0.
    y_cal = (cal_metadata_df[concept] == 1).to_numpy().astype(int)

    m5_models_cal['Platt'][concept] = apply_platt_scaling(concept_model, X_cal, y_cal)
    m5_models_cal['Isotonic'][concept] = apply_isotonic_regression(concept_model, X_cal, y_cal)
    m5_models_cal['Temperature'][concept] = apply_temperature_scaling(
        concept_model, X_cal, y_cal, verbose=False
    )

In [14]:
# Calibrated variants, keyed by the same model labels as the uncalibrated models.
# Only M3, M4 and M5 have calibrated variants in this notebook.
calibrated_models = {
    '(M3) Global Similarity LogReg': m3_models_cal,
    '(M4) Individual Similarity LogReg': m4_models_cal,
    '(M5) Embeddings LogReg': m5_models_cal,
}

## 4. Evaluate models

In [15]:
# Per-dataset output directory. exist_ok=True makes this cell safe to re-run
# when the directory already exists.
results_path = f"Results/{dataset_name}/"
os.makedirs(results_path, exist_ok=True)

### 4.1 Get a single metric for all models, calibration methods, and concepts

**Use only when the number of concepts is small — the cell below runs only when there are at most 10 concepts.**

In [16]:
# Only build and show this table for small concept sets (at most 10 concepts).
few_enough_concepts = len(concepts) <= 10
if few_enough_concepts:
    acc_df = compare_all_models_calibration_metric(
        base_models, m3_models_cal, m4_models_cal, m5_models_cal,
        test_metadata_df, test_cosine_similarity_df, test_embeddings,
        metric='accuracy',
    )
    display(acc_df)

### 4.2 Get the average of all metrics over the concepts for all models and calibration methods

In [17]:
# Compare all base and calibrated models on the test split.
metrics_avg_df = compare_all_models_calibration_avg(
    base_models, m3_models_cal, m4_models_cal, m5_models_cal,
    test_metadata_df, test_cosine_similarity_df, test_embeddings,
)

# Persist the table under the per-dataset results directory, then display it.
metrics_avg_csv = os.path.join(results_path, 'metrics_average.csv')
metrics_avg_df.to_csv(metrics_avg_csv)
metrics_avg_df

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,f1,auc,K1,K2,Kmax
Model,Calibration,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
(M1) Global Threshold,,0.901 ± 0.12,0.096 ± 0.13,-,-,-,-
(M2) Individual Threshold,,0.916 ± 0.09,0.198 ± 0.23,-,-,-,-
(M3) Global Similarity LogReg,,0.901 ± 0.12,0.103 ± 0.14,0.802 ± 0.08,0.804 ± 0.12,0.688 ± 0.16,0.604 ± 0.09
(M3) Global Similarity LogReg,Platt,0.901 ± 0.12,0.106 ± 0.14,0.802 ± 0.08,0.804 ± 0.12,0.689 ± 0.16,0.606 ± 0.09
(M3) Global Similarity LogReg,Isotonic,0.901 ± 0.12,0.104 ± 0.14,0.802 ± 0.09,0.804 ± 0.12,0.690 ± 0.16,0.603 ± 0.09
(M3) Global Similarity LogReg,Temperature,0.901 ± 0.12,0.103 ± 0.14,0.802 ± 0.08,0.594 ± 0.11,0.379 ± 0.11,0.369 ± 0.06
(M4) Individual Similarity LogReg,,0.916 ± 0.09,0.167 ± 0.23,0.802 ± 0.08,0.817 ± 0.21,0.738 ± 0.26,0.709 ± 0.30
(M4) Individual Similarity LogReg,Platt,0.916 ± 0.09,0.177 ± 0.23,0.801 ± 0.09,0.818 ± 0.21,0.740 ± 0.26,0.711 ± 0.30
(M4) Individual Similarity LogReg,Isotonic,0.915 ± 0.09,0.198 ± 0.23,0.794 ± 0.08,0.687 ± 0.19,0.612 ± 0.22,0.582 ± 0.25
(M4) Individual Similarity LogReg,Temperature,0.916 ± 0.09,0.167 ± 0.23,0.802 ± 0.08,0.603 ± 0.14,0.397 ± 0.14,0.526 ± 0.19


### 4.3 Get all metrics for all models and calibration methods for a single concept

In [18]:
# Inspect one concept in detail; the first concept column is used here.
inspected_concept = concepts[0]
compare_all_models_calibration_concept(
    base_models, m3_models_cal, m4_models_cal, m5_models_cal,
    test_metadata_df, test_cosine_similarity_df, test_embeddings,
    concept=inspected_concept,
)

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,f1,auc,K1,K2,Kmax
Model,Calibration,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
(M1) Global Threshold,,0.719023,0.0,-,-,-,-
(M2) Individual Threshold,,0.784031,0.547118,-,-,-,-
(M3) Global Similarity LogReg,,0.719023,0.0,0.751843,0.630752,0.462183,0.529211
(M3) Global Similarity LogReg,Platt,0.719023,0.0,0.751843,0.631691,0.463128,0.530759
(M3) Global Similarity LogReg,Isotonic,0.719023,0.0,0.751385,0.630047,0.463178,0.527603
(M3) Global Similarity LogReg,Temperature,0.719023,0.0,0.751843,0.423065,0.202728,0.303628
(M4) Individual Similarity LogReg,,0.780541,0.509268,0.751843,0.490672,0.315575,0.184926
(M4) Individual Similarity LogReg,Platt,0.782723,0.535448,0.751843,0.489407,0.312994,0.178556
(M4) Individual Similarity LogReg,Isotonic,0.783159,0.544455,0.753369,0.494458,0.319514,0.290935
(M4) Individual Similarity LogReg,Temperature,0.780541,0.509268,0.751843,0.381517,0.174612,0.272591
