# Experiment

In [1]:
import pandas as pd
import numpy as np
import torch
import os
from IPython.display import display

from models import get_global_threshold, get_individual_thresholds
from models import get_global_similarity_log_reg, get_similarity_log_reg
from models import get_embeddings_log_reg
from models import get_global_sim_X_y, get_concept_sim_X_y

from calibration_framework import apply_platt_scaling, apply_isotonic_regression, apply_temperature_scaling

from utils import compare_all_models_calibration_metric, compare_all_models_calibration_avg, compare_all_models_calibration_concept

In [2]:
# Name of the dataset this run works on. It is interpolated into the input paths
# (metadata, embeddings, cosine similarities) and into the Results/ output folder,
# and the values 'CLEVR' and 'CUB' trigger dataset-specific handling further down.
dataset_name = 'true-false-dataset'

## 1. Get pre-processed data

In [3]:
# Load the three pre-processed artefacts of the selected dataset.
metadata_df = pd.read_csv(f'../Data/{dataset_name}/metadata.csv')
# NOTE(review): torch.load unpickles the file, so only load embeddings from a trusted source.
embeddings = torch.load(f'Embeddings/{dataset_name}/embeddings.pt')
cosine_similarity_df = pd.read_csv(f'Cosine_Similarities/{dataset_name}/cosine_similarities.csv')

# The split masks below are computed from `metadata_df` and then applied to the
# embeddings and to the cosine similarities, so all three must have one row per
# sample in the same order. Fail early with a clear message if they do not.
if not (len(metadata_df) == len(embeddings) == len(cosine_similarity_df)):
    raise ValueError(
        f"Row counts differ: metadata={len(metadata_df)}, "
        f"embeddings={len(embeddings)}, cosine similarities={len(cosine_similarity_df)}"
    )

if dataset_name == 'CLEVR':
    # These two concept columns are excluded for CLEVR.
    # NOTE(review): presumably redundant with their binary counterparts — confirm.
    clevr_dropped_concepts = ['size::large', 'material::rubber']
    metadata_df = metadata_df.drop(columns=clevr_dropped_concepts)
    cosine_similarity_df = cosine_similarity_df.drop(columns=clevr_dropped_concepts)

# Every column of the cosine-similarity table is treated as one concept.
concepts = list(cosine_similarity_df.columns)


def _select_rows(mask):
    """Return the rows selected by a boolean mask from the three loaded artefacts.

    Args:
        mask: boolean Series built from `metadata_df` (one entry per sample).

    Returns:
        Tuple `(embeddings, metadata, cosine_similarities)` restricted to the
        selected rows; the two DataFrames get a fresh 0..n-1 index.
    """
    return (
        embeddings[mask],
        metadata_df[mask].reset_index(drop=True),
        cosine_similarity_df[mask].reset_index(drop=True),
    )


# Partition the data according to the 'split' column of the metadata.
train_mask = metadata_df['split'] == 'train'
train_embeddings, train_metadata_df, train_cosine_similarity_df = _select_rows(train_mask)

cal_mask = metadata_df['split'] == 'calibration'
cal_embeddings, cal_metadata_df, cal_cosine_similarity_df = _select_rows(cal_mask)

test_mask = metadata_df['split'] == 'test'
test_embeddings, test_metadata_df, test_cosine_similarity_df = _select_rows(test_mask)

In [4]:
# Sanity check: shape of the full embeddings object (all splits together).
embeddings.shape

torch.Size([6330, 4096])

In [5]:
# Sanity check: shape of the full cosine-similarity table (all splits together);
# its columns are the concepts.
cosine_similarity_df.shape

(6330, 8)

## 2. Train models

### (GT) Global Threshold

In [6]:
# Fit the GT baseline on the training split.
m1_models, m1_global_train_error, m1_train_errors = get_global_threshold(
    train_metadata_df,
    train_cosine_similarity_df,
    verbose=False,
)
# Label the error record; 'Model' becomes the row index of the comparison table.
m1_train_errors['Model'] = 'GT'

### (CT) Concept Threshold

In [7]:
# Fit the CT baseline on the training split.
m2_models, m2_train_errors = get_individual_thresholds(
    train_metadata_df,
    train_cosine_similarity_df,
    verbose=False,
)
# Label the error record; 'Model' becomes the row index of the comparison table.
m2_train_errors['Model'] = 'CT'

### (GLR) Global Similarity LogReg

In [8]:
# Fit the GLR model on the training split.
m3_models, m3_global_train_error, m3_train_errors = get_global_similarity_log_reg(
    train_metadata_df,
    train_cosine_similarity_df,
    verbose=False,
)
# Label the error record; 'Model' becomes the row index of the comparison table.
m3_train_errors['Model'] = 'GLR'

### (CLR) Concept Similarity LogReg

In [9]:
# Fit the CLR models on the training split.
m4_models, m4_train_errors = get_similarity_log_reg(
    train_metadata_df,
    train_cosine_similarity_df,
    verbose=False,
)
# Label the error record; 'Model' becomes the row index of the comparison table.
m4_train_errors['Model'] = 'CLR'

### (EmbCLR) Embeddings Concept LogReg

In [10]:
# Fit the EmbCLR models on the training split; unlike the other models they
# also receive the raw training embeddings.
m5_models, m5_train_errors = get_embeddings_log_reg(
    train_embeddings,
    train_metadata_df,
    train_cosine_similarity_df,
    verbose=False,
)
# Label the error record; 'Model' becomes the row index of the comparison table.
m5_train_errors['Model'] = 'EmbCLR'

In [11]:
# Uncalibrated models keyed by the short names used in the result tables.
base_models = dict(
    GT=m1_models,
    CT=m2_models,
    GLR=m3_models,
    CLR=m4_models,
    EmbCLR=m5_models,
)

### 2.1 Evaluate training classification error

In [12]:
# Collect the training-error records of the five models, in the order they were
# trained; each record already carries its 'Model' label.
train_error_rows = [
    m1_train_errors,
    m2_train_errors,
    m3_train_errors,
    m4_train_errors,
    m5_train_errors,
]
error_comparison_df = pd.DataFrame.from_dict(train_error_rows).set_index('Model')

if dataset_name != 'CUB':
    # Show the table as is: one row per model.
    display(error_comparison_df)
else:
    # For CUB, flip the table so models become columns and show summary
    # statistics instead of the full table.
    error_comparison_df = error_comparison_df.transpose()
    display(error_comparison_df.describe())

Unnamed: 0_level_0,true,animals,cities,companies,elements,facts,generated,inventions
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
GT,0.489898,0.017318,0.087641,0.07006,0.008922,0.118342,0.155077,0.070585
CT,0.498819,0.017318,0.079507,0.016531,0.006822,0.092102,0.042771,0.0635
GLR,0.499082,0.023353,0.124114,0.126214,0.015219,0.092627,0.125689,0.101548
CLR,0.520073,0.019155,0.082131,0.025453,0.009709,0.0963,0.042771,0.134086
EmbCLR,0.00105,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 3. Calibrate models

In [13]:
# Build the calibration inputs for the GLR model from the calibration split;
# the last two return values are not needed here.
X_cal, y_cal, _, _ = get_global_sim_X_y(cal_metadata_df, cal_cosine_similarity_df)

# Fit one calibrated variant of the GLR model per calibration method.
m3_models_cal = {
    'Platt': apply_platt_scaling(m3_models, X_cal, y_cal),
    'Isotonic': apply_isotonic_regression(m3_models, X_cal, y_cal),
    'Temperature': apply_temperature_scaling(m3_models, X_cal, y_cal, verbose=False),
}

In [14]:
# Calibrated CLR models, stored as m4_models_cal[method][concept].
m4_models_cal = {method: {} for method in ('Platt', 'Isotonic', 'Temperature')}

for concept in m4_models:
    # Each concept has its own model and its own calibration inputs.
    X_cal, y_cal = get_concept_sim_X_y(cal_metadata_df, cal_cosine_similarity_df, concept)
    clr_model = m4_models[concept]

    m4_models_cal['Platt'][concept] = apply_platt_scaling(clr_model, X_cal, y_cal)
    m4_models_cal['Isotonic'][concept] = apply_isotonic_regression(clr_model, X_cal, y_cal)
    m4_models_cal['Temperature'][concept] = apply_temperature_scaling(
        clr_model, X_cal, y_cal, verbose=False
    )

In [15]:
# Calibrated EmbCLR models, stored as m5_models_cal[method][concept].
m5_models_cal = {method: {} for method in ('Platt', 'Isotonic', 'Temperature')}

# Every concept is calibrated on the same calibration embeddings; only the
# labels change, so the features are set once outside the loop.
X_cal = cal_embeddings

# Iterate over the EmbCLR models themselves (not over the CLR models), so this
# cell calibrates exactly the models it indexes.
# NOTE(review): assumes every key of m5_models is a concept column of
# cal_metadata_df — confirm against get_embeddings_log_reg.
for concept in m5_models:
    # Binary label: 1 where the concept column equals 1, else 0.
    y_cal = (cal_metadata_df[concept] == 1).to_numpy().astype(int)
    emb_model = m5_models[concept]

    m5_models_cal['Platt'][concept] = apply_platt_scaling(emb_model, X_cal, y_cal)
    m5_models_cal['Isotonic'][concept] = apply_isotonic_regression(emb_model, X_cal, y_cal)
    m5_models_cal['Temperature'][concept] = apply_temperature_scaling(
        emb_model, X_cal, y_cal, verbose=False
    )

In [16]:
# Calibrated variants keyed by the short name of the model they were derived from.
calibrated_models = dict(
    GLR=m3_models_cal,
    CLR=m4_models_cal,
    EmbCLR=m5_models_cal,
)

## 4. Evaluate models

In [17]:
# Folder that receives the CSV / LaTeX result tables written by the cells below.
# It is created if missing and left untouched if it already exists.
results_path = f"Results/{dataset_name}/"
os.makedirs(results_path, exist_ok=True)

### 4.1 Get a single metric for all models, calibration methods, and concepts

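**Use this only when the number of concepts is small** — the table has one column per concept, and the cell below skips it when there are more than 10 concepts.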

In [18]:
# Metric to tabulate for every model / calibration method / concept.
metric = 'K1'

# The table has one column per concept, so it is only built when there are at
# most 10 concepts.
if len(concepts) <= 10:
    metric_df = compare_all_models_calibration_metric(
        base_models, m3_models_cal, m4_models_cal, m5_models_cal,
        test_metadata_df, test_cosine_similarity_df, test_embeddings,
        metric=metric,
    )
    if 'K' not in metric:
        # Highlight the column-wise maximum for metrics without 'K' in their name.
        display(metric_df.style.highlight_max(color='grey'))
    else:
        # Metrics with 'K' in their name (K1, K2, Kmax) are shown without highlighting.
        # NOTE(review): presumably because a larger value is not better for them — confirm.
        display(metric_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,true,animals,cities,companies,elements,facts,generated,inventions
Model,Calibration,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
GT,,-,-,-,-,-,-,-,-
CT,,-,-,-,-,-,-,-,-
GLR,,0.237794,0.785883,0.755726,0.807074,0.832449,0.702956,0.778383,0.764517
GLR,Platt,0.230039,0.777465,0.750375,0.803493,0.825604,0.695806,0.77176,0.75705
GLR,Isotonic,0.263964,0.744815,0.734668,0.802374,0.789688,0.713821,0.829473,0.789135
GLR,Temperature,0.115744,0.621291,0.574037,0.63288,0.651581,0.55158,0.617512,0.559794
CLR,,0.016821,0.82073,0.68088,0.78673,0.850896,0.81318,0.928473,0.770483
CLR,Platt,0.017165,0.826729,0.668815,0.781487,0.856089,0.80478,0.933186,0.775585
CLR,Isotonic,0.029641,0.140014,0.661022,0.438478,0.390694,0.588378,0.644869,0.738289
CLR,Temperature,0.016167,0.638554,0.537911,0.616679,0.65985,0.599923,0.685126,0.588826


### 4.2 Get the average of all metrics over the concepts for all models and calibration methods

In [19]:
# Evaluate every model / calibration method on the test split, averaged over concepts.
metrics_avg_df = compare_all_models_calibration_avg(
    base_models, m3_models_cal, m4_models_cal, m5_models_cal,
    test_metadata_df, test_cosine_similarity_df, test_embeddings,
)

# Persist the table as CSV.
metrics_avg_df.to_csv(os.path.join(results_path, 'metrics_average.csv'))

# Persist it as LaTeX twice: the full table, then a short one with three columns.
# `None` means "all columns".
for tex_name, tex_columns in (
    ('metrics_average.tex', None),
    ('metrics_average_short.tex', ['Acc','K1','Kmax']),
):
    with open(os.path.join(results_path, tex_name), 'w') as tex_file:
        tex_table = metrics_avg_df if tex_columns is None else metrics_avg_df[tex_columns]
        tex_file.write(tex_table.to_latex())

# Display the full table as the cell output.
metrics_avg_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Acc,F1,AUC,K1,K2,Kmax
Model,Calibration,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GT,,0.87 ± 0.15,0.60 ± 0.32,-,-,-,-
CT,,0.90 ± 0.16,0.64 ± 0.38,-,-,-,-
GLR,,0.86 ± 0.15,0.47 ± 0.34,0.81 ± 0.17,0.71 ± 0.19,0.57 ± 0.21,0.37 ± 0.16
GLR,Platt,0.86 ± 0.15,0.49 ± 0.33,0.81 ± 0.17,0.70 ± 0.19,0.56 ± 0.20,0.35 ± 0.15
GLR,Isotonic,0.87 ± 0.15,0.59 ± 0.32,0.82 ± 0.17,0.71 ± 0.18,0.59 ± 0.21,0.58 ± 0.20
GLR,Temperature,0.86 ± 0.15,0.47 ± 0.34,0.81 ± 0.17,0.54 ± 0.18,0.33 ± 0.13,0.28 ± 0.10
CLR,,0.89 ± 0.17,0.51 ± 0.44,0.81 ± 0.18,0.71 ± 0.29,0.61 ± 0.27,0.51 ± 0.29
CLR,Platt,0.89 ± 0.16,0.60 ± 0.41,0.81 ± 0.17,0.71 ± 0.29,0.63 ± 0.29,0.55 ± 0.30
CLR,Isotonic,0.90 ± 0.17,0.65 ± 0.35,0.83 ± 0.17,0.45 ± 0.26,0.39 ± 0.22,0.37 ± 0.22
CLR,Temperature,0.89 ± 0.17,0.51 ± 0.44,0.81 ± 0.18,0.54 ± 0.22,0.35 ± 0.15,0.39 ± 0.18


### 4.3 Get all metrics for all models and calibration methods for a single concept

In [20]:
# Inspect a single concept in detail: here, the first one.
concept = concepts[0]

metrics_concept_df = compare_all_models_calibration_concept(
    base_models, m3_models_cal, m4_models_cal, m5_models_cal,
    test_metadata_df, test_cosine_similarity_df, test_embeddings,
    concept=concept,
)

# Persist the table for this concept as CSV and as LaTeX.
concept_csv_path = os.path.join(results_path, f'metrics_concept_{concept}.csv')
concept_tex_path = os.path.join(results_path, f'metrics_concept_{concept}.tex')
metrics_concept_df.to_csv(concept_csv_path)
with open(concept_tex_path, 'w') as tex_file:
    tex_file.write(metrics_concept_df.to_latex())

# Display the table, highlighting the best (maximum) Acc / F1 / AUC.
metrics_concept_df.style.highlight_max(color='grey', subset=['Acc','F1','AUC'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Acc,F1,AUC,K1,K2,Kmax
Model,Calibration,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GT,,0.526,0.346,-,-,-,-
CT,,0.5,0.666,-,-,-,-
GLR,,0.508,0.153,0.520,0.238,0.084,0.089
GLR,Platt,0.51,0.168,0.520,0.230,0.080,0.085
GLR,Isotonic,0.526,0.323,0.522,0.264,0.097,0.163
GLR,Temperature,0.508,0.153,0.520,0.116,0.020,0.068
CLR,,0.484,0.44,0.480,0.017,0.000,0.010
CLR,Platt,0.497,0.664,0.520,0.017,0.001,0.016
CLR,Isotonic,0.492,0.64,0.517,0.030,0.001,0.027
CLR,Temperature,0.484,0.44,0.480,0.016,0.000,0.009


In [21]:
# Export the per-concept metrics table (CSV + LaTeX) for every concept, but only
# for datasets with few concepts. The limit matches the `<= 10` guard used for the
# single-metric table in section 4.1, so a dataset with exactly 10 concepts is
# treated the same way in both places.
if len(concepts) <= 10:
    for concept in concepts:
        metrics_concept_df = compare_all_models_calibration_concept(
            base_models, m3_models_cal, m4_models_cal, m5_models_cal,
            test_metadata_df, test_cosine_similarity_df, test_embeddings,
            concept=concept,
        )
        metrics_concept_df.to_csv(os.path.join(results_path, f'metrics_concept_{concept}.csv'))
        with open(os.path.join(results_path, f'metrics_concept_{concept}.tex'), 'w') as tf:
            tf.write(metrics_concept_df.to_latex())