# Experiment

In [1]:
import pandas as pd
import numpy as np
import torch
import os
from IPython.display import display

from models import get_global_threshold, get_individual_thresholds
from models import get_global_similarity_log_reg, get_similarity_log_reg
from models import get_embeddings_log_reg
from models import get_global_sim_X_y, get_concept_sim_X_y

from calibration_framework import apply_platt_scaling, apply_isotonic_regression, apply_temperature_scaling

from utils import compare_all_models_calibration_metric, compare_all_models_calibration_avg, compare_all_models_calibration_concept

In [2]:
# Dataset identifier; it is interpolated into every input and output path of this notebook.
dataset_name = "true-false-dataset"

## 1. Get pre-processed data

In [3]:
# Load the pre-processed artefacts of the selected dataset.
metadata_df = pd.read_csv(f'../Data/{dataset_name}/metadata.csv')
# NOTE(review): torch.load unpickles the file, so only point it at trusted artefacts.
embeddings = torch.load(f'Embeddings/{dataset_name}/embeddings.pt')
cosine_similarity_df = pd.read_csv(f'Cosine_Similarities/{dataset_name}/cosine_similarities.csv')

# The same boolean masks are applied to all three objects below, so they must be row-aligned.
assert len(metadata_df) == len(cosine_similarity_df) == len(embeddings), (
    "metadata, cosine similarities and embeddings must have the same number of rows"
)

if dataset_name == 'CLEVR':
    # Concept columns excluded for CLEVR, removed from both tables so they stay consistent.
    clevr_excluded_concepts = ['size::large', 'material::rubber']
    metadata_df = metadata_df.drop(clevr_excluded_concepts, axis=1)
    cosine_similarity_df = cosine_similarity_df.drop(clevr_excluded_concepts, axis=1)

# Every remaining cosine-similarity column is treated as one concept.
concepts = list(cosine_similarity_df.columns)


def _select_split(split_name):
    """Return (mask, embeddings, metadata, cosine similarities) for rows whose 'split' equals split_name.

    The two data frames are re-indexed from 0 so that they line up positionally
    with the filtered embeddings.
    """
    mask = metadata_df['split'] == split_name
    return (
        mask,
        embeddings[mask],
        metadata_df[mask].reset_index(drop=True),
        cosine_similarity_df[mask].reset_index(drop=True),
    )


train_mask, train_embeddings, train_metadata_df, train_cosine_similarity_df = _select_split('train')
cal_mask, cal_embeddings, cal_metadata_df, cal_cosine_similarity_df = _select_split('calibration')
test_mask, test_embeddings, test_metadata_df, test_cosine_similarity_df = _select_split('test')

## 2. Train models

In [4]:
# (M1) Fit with get_global_threshold on the training split. The call yields the
# fitted models, a global training error and the training-error record.
m1_models, m1_global_train_error, m1_train_errors = get_global_threshold(
    train_metadata_df,
    train_cosine_similarity_df,
    verbose=False,
)
# Label the error record so it can be indexed by model name in the comparison table.
m1_train_errors['Model'] = '(M1) Global Threshold'

In [5]:
# (M2) Fit with get_individual_thresholds on the training split; returns the
# fitted models and the training-error record.
m2_models, m2_train_errors = get_individual_thresholds(
    train_metadata_df,
    train_cosine_similarity_df,
    verbose=False,
)
# Label the error record so it can be indexed by model name in the comparison table.
m2_train_errors['Model'] = '(M2) Individual Threshold'

In [6]:
# (M3) Fit with get_global_similarity_log_reg on the training split. The call yields
# the fitted model(s), a global training error and the training-error record.
m3_models, m3_global_train_error, m3_train_errors = get_global_similarity_log_reg(
    train_metadata_df,
    train_cosine_similarity_df,
    verbose=False,
)
# Label the error record so it can be indexed by model name in the comparison table.
m3_train_errors['Model'] = '(M3) Global Similarity LogReg'

In [7]:
# (M4) Fit with get_similarity_log_reg on the training split; returns a mapping of
# per-concept models (it is iterated by concept later) and the training-error record.
m4_models, m4_train_errors = get_similarity_log_reg(
    train_metadata_df,
    train_cosine_similarity_df,
    verbose=False,
)
# Label the error record so it can be indexed by model name in the comparison table.
m4_train_errors['Model'] = '(M4) Individual Similarity LogReg'

In [8]:
# (M5) Fit with get_embeddings_log_reg; unlike M1-M4 it also receives the raw
# training embeddings. Returns per-concept models and the training-error record.
m5_models, m5_train_errors = get_embeddings_log_reg(
    train_embeddings,
    train_metadata_df,
    train_cosine_similarity_df,
    verbose=False,
)
# Label the error record so it can be indexed by model name in the comparison table.
m5_train_errors['Model'] = '(M5) Embeddings LogReg'

In [9]:
# Registry of the uncalibrated models, keyed by the same labels used for the error records.
base_models = dict([
    ('(M1) Global Threshold', m1_models),
    ('(M2) Individual Threshold', m2_models),
    ('(M3) Global Similarity LogReg', m3_models),
    ('(M4) Individual Similarity LogReg', m4_models),
    ('(M5) Embeddings LogReg', m5_models),
])

### 2.1 Evaluate training classification error

In [10]:
# Build the training-error comparison table: one row per model, indexed by the
# 'Model' label each record carries. The input is a list of records, so the plain
# DataFrame constructor is used (DataFrame.from_dict is meant for dict input).
error_comparison_df = pd.DataFrame(
    [
        m1_train_errors,
        m2_train_errors,
        m3_train_errors,
        m4_train_errors,
        m5_train_errors,
    ]
).set_index('Model')

if dataset_name == 'CUB':
    # For CUB the table is transposed and only summary statistics per model are shown.
    error_comparison_df = error_comparison_df.transpose()
    display(error_comparison_df.describe())
else:
    display(error_comparison_df)

Unnamed: 0_level_0,true,animals,cities,companies,elements,facts,generated,inventions
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
(M1) Global Threshold,0.488323,0.017318,0.088166,0.069273,0.008659,0.125426,0.15534,0.070323
(M2) Individual Threshold,0.498819,0.017318,0.079769,0.016793,0.006822,0.092364,0.042771,0.063763
(M3) Global Similarity LogReg,0.498557,0.023616,0.125951,0.124377,0.014957,0.093151,0.125951,0.102335
(M4) Individual Similarity LogReg,0.521648,0.019155,0.08318,0.026502,0.009971,0.0963,0.042771,0.133823
(M5) Embeddings LogReg,0.00105,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 3. Calibrate models

In [11]:
# Calibration features/labels for the global similarity model (M3); the last two
# return values of get_global_sim_X_y are not needed here.
X_cal, y_cal, _, _ = get_global_sim_X_y(cal_metadata_df, cal_cosine_similarity_df)

# One calibrated variant of M3 per calibration method, all fitted on the calibration split.
m3_models_cal = {
    'Platt': apply_platt_scaling(m3_models, X_cal, y_cal),
    'Isotonic': apply_isotonic_regression(m3_models, X_cal, y_cal),
    'Temperature': apply_temperature_scaling(m3_models, X_cal, y_cal, verbose=False),
}

In [12]:
# Calibrated variants of the per-concept similarity models (M4):
# calibration method -> concept -> calibrated model.
m4_models_cal = {method: {} for method in ('Platt', 'Isotonic', 'Temperature')}

for concept, concept_model in m4_models.items():
    # Calibration features/labels restricted to the current concept.
    X_cal, y_cal = get_concept_sim_X_y(cal_metadata_df, cal_cosine_similarity_df, concept)

    m4_models_cal['Platt'][concept] = apply_platt_scaling(concept_model, X_cal, y_cal)
    m4_models_cal['Isotonic'][concept] = apply_isotonic_regression(concept_model, X_cal, y_cal)
    m4_models_cal['Temperature'][concept] = apply_temperature_scaling(
        concept_model, X_cal, y_cal, verbose=False
    )

In [13]:
# Calibrated variants of the per-concept embeddings models (M5):
# calibration method -> concept -> calibrated model.
m5_models_cal = {}
m5_models_cal['Platt'] = {}
m5_models_cal['Isotonic'] = {}
m5_models_cal['Temperature'] = {}

# The calibration inputs are the calibration-split embeddings for every concept,
# so they are bound once instead of on every iteration.
X_cal = cal_embeddings

# Iterate over m5_models' own keys so that exactly the M5 models are calibrated
# and no key missing from m5_models is ever looked up.
for concept in m5_models.keys():
    # Binary target: 1 where the metadata marks the concept as present, else 0.
    y_cal = (cal_metadata_df[concept]==1).to_numpy().astype(int)

    m5_models_cal['Platt'][concept] = apply_platt_scaling(m5_models[concept], X_cal, y_cal)
    m5_models_cal['Isotonic'][concept] = apply_isotonic_regression(m5_models[concept], X_cal, y_cal)
    m5_models_cal['Temperature'][concept] = apply_temperature_scaling(m5_models[concept], X_cal, y_cal, verbose=False)

In [14]:
# Registry of the calibrated model families, keyed by the same labels as base_models.
# Only M3-M5 appear here; no calibrated variants are built for M1 and M2 in this notebook.
calibrated_models = dict([
    ('(M3) Global Similarity LogReg', m3_models_cal),
    ('(M4) Individual Similarity LogReg', m4_models_cal),
    ('(M5) Embeddings LogReg', m5_models_cal),
])

## 4. Evaluate models

In [15]:
# Per-dataset output folder for the result tables written below.
results_path = f"Results/{dataset_name}/"
# Create the folder (and any missing parents); a pre-existing folder is not an error.
os.makedirs(name=results_path, exist_ok=True)

### 4.1 Get a single metric for all models, calibration methods, and concepts

**Use only if you don't have many concepts: the cell below builds this table only when there are at most 10 concepts.**

In [16]:
# Metric to tabulate for every model / calibration method / concept.
metric = 'K1'

# The table is only built when there are at most 10 concepts.
if len(concepts) <= 10:
    metric_df = compare_all_models_calibration_metric(
        base_models, m3_models_cal, m4_models_cal, m5_models_cal,
        test_metadata_df, test_cosine_similarity_df, test_embeddings,
        metric=metric,
    )
    if 'K' in metric:
        # Metrics whose name contains 'K' are shown without max-highlighting
        # (presumably because a larger value is not better for them; TODO confirm).
        display(metric_df)
    else:
        display(metric_df.style.highlight_max(color='grey'))

Unnamed: 0_level_0,Unnamed: 1_level_0,true,animals,cities,companies,elements,facts,generated,inventions
Model,Calibration,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
(M1) Global Threshold,,-,-,-,-,-,-,-,-
(M2) Individual Threshold,,-,-,-,-,-,-,-,-
(M3) Global Similarity LogReg,,0.239483,0.788014,0.756128,0.806532,0.831968,0.699597,0.778906,0.764369
(M3) Global Similarity LogReg,Platt,0.229174,0.778164,0.7511,0.803178,0.825294,0.692541,0.772694,0.757917
(M3) Global Similarity LogReg,Isotonic,0.264176,0.744466,0.737107,0.802928,0.789642,0.708412,0.829373,0.789713
(M3) Global Similarity LogReg,Temperature,0.114955,0.621486,0.574213,0.632625,0.651366,0.549564,0.61838,0.559009
(M4) Individual Similarity LogReg,,0.015215,0.821621,0.680771,0.785824,0.850731,0.813125,0.928421,0.770697
(M4) Individual Similarity LogReg,Platt,0.020356,0.828451,0.668661,0.781229,0.856088,0.803515,0.933127,0.775787
(M4) Individual Similarity LogReg,Isotonic,0.030656,0.151383,0.660792,0.438218,0.391637,0.584737,0.630365,0.739118
(M4) Individual Similarity LogReg,Temperature,0.014569,0.638471,0.537808,0.616615,0.659785,0.599914,0.685104,0.58892


### 4.2 Get the average of all metrics over the concepts for all models and calibration methods

In [17]:
# Aggregate every metric over the concepts for all models and calibration methods (test split).
metrics_avg_df = compare_all_models_calibration_avg(
    base_models, m3_models_cal, m4_models_cal, m5_models_cal,
    test_metadata_df, test_cosine_similarity_df, test_embeddings,
)

# Persist the table as CSV and as a LaTeX tabular.
metrics_avg_df.to_csv(os.path.join(results_path, 'metrics_average.csv'))
# The cells contain the non-ASCII '±' sign, so the encoding is pinned to UTF-8
# (pandas' CSV default) instead of relying on the platform's locale encoding.
with open(os.path.join(results_path, 'metrics_average.tex'), 'w', encoding='utf-8') as tex_file:
    tex_file.write(metrics_avg_df.to_latex())

# Last expression of the cell: render the table with the maximum highlighted for these three columns.
metrics_avg_df.style.highlight_max(color='grey', subset=['accuracy','f1','auc'])

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,f1,auc,K1,K2,Kmax
Model,Calibration,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
(M1) Global Threshold,,0.869 ± 0.15,0.607 ± 0.31,-,-,-,-
(M2) Individual Threshold,,0.898 ± 0.16,0.664 ± 0.34,-,-,-,-
(M3) Global Similarity LogReg,,0.859 ± 0.15,0.470 ± 0.34,0.814 ± 0.17,0.708 ± 0.19,0.571 ± 0.21,0.370 ± 0.16
(M3) Global Similarity LogReg,Platt,0.861 ± 0.15,0.493 ± 0.33,0.814 ± 0.17,0.701 ± 0.19,0.562 ± 0.20,0.352 ± 0.15
(M3) Global Similarity LogReg,Isotonic,0.869 ± 0.15,0.591 ± 0.32,0.824 ± 0.17,0.708 ± 0.18,0.587 ± 0.21,0.560 ± 0.20
(M3) Global Similarity LogReg,Temperature,0.859 ± 0.15,0.470 ± 0.34,0.814 ± 0.17,0.540 ± 0.18,0.332 ± 0.13,0.284 ± 0.10
(M4) Individual Similarity LogReg,,0.886 ± 0.17,0.514 ± 0.44,0.809 ± 0.18,0.708 ± 0.29,0.613 ± 0.27,0.507 ± 0.29
(M4) Individual Similarity LogReg,Platt,0.894 ± 0.17,0.607 ± 0.41,0.814 ± 0.17,0.708 ± 0.29,0.634 ± 0.29,0.552 ± 0.30
(M4) Individual Similarity LogReg,Isotonic,0.896 ± 0.17,0.652 ± 0.35,0.825 ± 0.17,0.453 ± 0.25,0.391 ± 0.22,0.383 ± 0.22
(M4) Individual Similarity LogReg,Temperature,0.886 ± 0.17,0.514 ± 0.44,0.809 ± 0.18,0.543 ± 0.22,0.347 ± 0.15,0.388 ± 0.18


### 4.3 Get all metrics for all models and calibration methods for a single concept

In [18]:
# Concept to inspect in detail; defaults to the first concept column.
concept = concepts[0]

# All metrics for every model and calibration method, restricted to that concept (test split).
metrics_concept_df = compare_all_models_calibration_concept(
    base_models, m3_models_cal, m4_models_cal, m5_models_cal,
    test_metadata_df, test_cosine_similarity_df, test_embeddings,
    concept=concept,
)

# Persist the table as CSV and as a LaTeX tabular, with the concept name in the file name.
concept_csv_path = os.path.join(results_path, f'metrics_concept_{concept}.csv')
concept_tex_path = os.path.join(results_path, f'metrics_concept_{concept}.tex')
metrics_concept_df.to_csv(concept_csv_path)
with open(concept_tex_path, 'w') as tex_file:
    tex_file.write(metrics_concept_df.to_latex())

# Last expression of the cell: render the table with the maximum highlighted for these three columns.
metrics_concept_df.style.highlight_max(color='grey', subset=['accuracy','f1','auc'])

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,f1,auc,K1,K2,Kmax
Model,Calibration,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
(M1) Global Threshold,,0.526,0.351,-,-,-,-
(M2) Individual Threshold,,0.5,0.666,-,-,-,-
(M3) Global Similarity LogReg,,0.507,0.151,0.520,0.239,0.084,0.089
(M3) Global Similarity LogReg,Platt,0.509,0.167,0.520,0.229,0.080,0.085
(M3) Global Similarity LogReg,Isotonic,0.528,0.33,0.523,0.264,0.097,0.150
(M3) Global Similarity LogReg,Temperature,0.507,0.151,0.520,0.115,0.020,0.068
(M4) Individual Similarity LogReg,,0.486,0.441,0.480,0.015,0.000,0.010
(M4) Individual Similarity LogReg,Platt,0.494,0.661,0.520,0.020,0.002,0.018
(M4) Individual Similarity LogReg,Isotonic,0.492,0.64,0.518,0.031,0.001,0.027
(M4) Individual Similarity LogReg,Temperature,0.486,0.441,0.480,0.015,0.000,0.008
