# Experiment

In [None]:
import pandas as pd
import numpy as np
import torch
import os
from IPython.display import display
import matplotlib.pyplot as plt

from models import get_global_threshold, get_individual_thresholds
from models import get_global_similarity_log_reg, get_similarity_log_reg
from models import get_embeddings_log_reg
from models import get_global_sim_X_y, get_concept_sim_X_y

from calibration_framework import apply_platt_scaling, apply_isotonic_regression, apply_temperature_scaling
from calibration_framework import apply_histogram_binning, apply_beta_calibration

from utils import compare_all_models_calibration_metric, compare_all_models_calibration_avg, compare_all_models_calibration_concept

In [None]:
# Dataset to run the experiment on. The name selects the input folders read in
# the next cell and the output folder under Results/; 'CLEVR' and 'CUB' also get
# dataset-specific handling in later cells.
dataset_name = 'CLEVR'

## 1. Get pre-processed data

In [None]:
# Load the per-sample metadata, the embeddings and the per-concept cosine
# similarities for the selected dataset.
metadata_df = pd.read_csv(f'../Data/{dataset_name}/metadata.csv')
# NOTE(review): torch.load unpickles the file; only load embeddings from a trusted source.
embeddings = torch.load(f'Embeddings/{dataset_name}/embeddings.pt')
cosine_similarity_df = pd.read_csv(f'Cosine_Similarities/{dataset_name}/train_cosine_similarities.csv')

if dataset_name == 'CLEVR':
    # These two concept columns are excluded for CLEVR
    # (presumably redundant with their binary complements - TODO confirm).
    clevr_dropped_concepts = ['size::large', 'material::rubber']
    metadata_df = metadata_df.drop(clevr_dropped_concepts, axis=1)
    cosine_similarity_df = cosine_similarity_df.drop(clevr_dropped_concepts, axis=1)

# Every remaining column of the similarity table is treated as one concept.
concepts = list(cosine_similarity_df.columns)

# The three inputs are sliced with the same row mask below, so they must have
# the same number of rows. Without this check a shorter similarity table would
# be silently reindexed by pandas and end up misaligned with the embeddings.
if not (len(metadata_df) == len(cosine_similarity_df) == len(embeddings)):
    raise ValueError(
        'Row count mismatch: metadata has {}, cosine similarities have {}, embeddings have {}'.format(
            len(metadata_df), len(cosine_similarity_df), len(embeddings)))


def _select_rows(mask):
    """Return (embeddings, metadata, cosine similarities) restricted to the rows where `mask` is True.

    `mask` is a boolean pandas Series over the rows of `metadata_df`. It is
    converted to a NumPy array once so that the embeddings and both DataFrames
    are all indexed positionally with exactly the same mask. The DataFrames
    get a fresh 0..n-1 index.
    """
    positional_mask = mask.to_numpy()
    return (embeddings[positional_mask],
            metadata_df[positional_mask].reset_index(drop=True),
            cosine_similarity_df[positional_mask].reset_index(drop=True))


train_mask = metadata_df['split'] == 'train'
train_embeddings, train_metadata_df, train_cosine_similarity_df = _select_rows(train_mask)

cal_mask = metadata_df['split'] == 'calibration'
cal_embeddings, cal_metadata_df, cal_cosine_similarity_df = _select_rows(cal_mask)

test_mask = metadata_df['split'] == 'test'
test_embeddings, test_metadata_df, test_cosine_similarity_df = _select_rows(test_mask)

In [None]:
# Shape of the full embeddings object (all splits, before any masking).
embeddings.shape

In [None]:
# Shape of the full cosine-similarity table; its columns are the entries of `concepts`.
cosine_similarity_df.shape

## 2. Train models

### (GT) Global Threshold

In [None]:
# Fit the global-threshold (GT) baseline on the training split.
m1_models, m1_global_train_error, m1_train_errors = get_global_threshold(
    train_metadata_df,
    train_cosine_similarity_df,
    verbose=False,
)
# Label the error record; 'Model' becomes the index of the comparison table in section 2.1.
m1_train_errors['Model'] = 'GT'

### (CT) Concept Threshold

In [None]:
# Fit the per-concept threshold (CT) models on the training split.
m2_models, m2_train_errors = get_individual_thresholds(
    train_metadata_df,
    train_cosine_similarity_df,
    verbose=False,
)
# Label the error record; 'Model' becomes the index of the comparison table in section 2.1.
m2_train_errors['Model'] = 'CT'

### (GLR) Global Similarity LogReg

In [None]:
# Fit the global similarity logistic regression (GLR) on the training split.
m3_models, m3_global_train_error, m3_train_errors = get_global_similarity_log_reg(
    train_metadata_df,
    train_cosine_similarity_df,
    verbose=False,
)
# Label the error record; 'Model' becomes the index of the comparison table in section 2.1.
m3_train_errors['Model'] = 'GLR'

### (CLR) Concept Similarity LogReg

In [None]:
# Fit the per-concept similarity logistic regressions (CLR) on the training split.
m4_models, m4_train_errors = get_similarity_log_reg(
    train_metadata_df,
    train_cosine_similarity_df,
    verbose=False,
)
# Label the error record; 'Model' becomes the index of the comparison table in section 2.1.
m4_train_errors['Model'] = 'CLR'

### (EmbCLR) Embeddings Concept LogReg

In [None]:
# Fit the per-concept logistic regressions on the raw embeddings (EmbCLR), training split only.
m5_models, m5_train_errors = get_embeddings_log_reg(
    train_embeddings,
    train_metadata_df,
    train_cosine_similarity_df,
    verbose=False,
)
# Label the error record; 'Model' becomes the index of the comparison table in section 2.1.
m5_train_errors['Model'] = 'EmbCLR'

In [None]:
# Fitted, uncalibrated models keyed by their short label; passed to the
# evaluation and plotting helpers in sections 4 and 5.
base_models = {
    'GT': m1_models,
    'CT': m2_models,
    'GLR': m3_models,
    'CLR': m4_models,
    'EmbCLR': m5_models,
}

### 2.1 Evaluate training classification error

In [None]:
# One training-error record per model, in the order GT, CT, GLR, CLR, EmbCLR.
train_error_records = [
    m1_train_errors,
    m2_train_errors,
    m3_train_errors,
    m4_train_errors,
    m5_train_errors,
]

# Build the table with the plain DataFrame constructor: DataFrame.from_dict is
# documented for dict input and only accepted this list incidentally.
# Each record's 'Model' label becomes the row index.
error_comparison_df = pd.DataFrame(train_error_records).set_index('Model')

if dataset_name == 'CUB':
    # For CUB the table is transposed (models become columns) and only summary
    # statistics are displayed instead of the full table.
    error_comparison_df = error_comparison_df.transpose()
    display(error_comparison_df.describe())
else:
    display(error_comparison_df)

## 3. Calibrate models

In [None]:
# Calibration data for the global similarity model, built from the calibration split.
# Only the first two return values are used here.
X_cal, y_cal, _, _ = get_global_sim_X_y(cal_metadata_df, cal_cosine_similarity_df)

# Calibrated variants of the GLR model, keyed by calibration method.
m3_models_cal = {
    'Platt': apply_platt_scaling(m3_models, X_cal, y_cal),
    'Isotonic': apply_isotonic_regression(m3_models, X_cal, y_cal),
    'Temperature': apply_temperature_scaling(m3_models, X_cal, y_cal, verbose=False),
}

In [None]:
# Two more calibrators for the GLR model. This cell reads the X_cal / y_cal
# built by get_global_sim_X_y in the preceding cell, so run that cell first.
m3_models_cal['Histogram'] = apply_histogram_binning(
    m3_models, X_cal, y_cal, nbins=10
)
m3_models_cal['Beta'] = apply_beta_calibration(
    m3_models, X_cal, y_cal
)

In [None]:
# Calibrate every per-concept similarity model (CLR) with each calibration method.
# Result layout: m4_models_cal[method][concept] -> calibrated model.

# Method label -> callable(model, X, y) returning the calibrated model.
# The lambdas pin the same keyword arguments used for the GLR model.
m4_calibrators = {
    'Platt': apply_platt_scaling,
    'Isotonic': apply_isotonic_regression,
    'Temperature': lambda model, X, y: apply_temperature_scaling(model, X, y, verbose=False),
    'Histogram': lambda model, X, y: apply_histogram_binning(model, X, y, nbins=10),
    'Beta': apply_beta_calibration,
}

m4_models_cal = {method_name: {} for method_name in m4_calibrators}

for concept in m4_models.keys():
    # Use concept-specific names here: X_cal / y_cal hold the calibration data
    # of the global (GLR) model and must not be overwritten, otherwise
    # re-running the GLR calibration cells would use the wrong data.
    X_cal_concept, y_cal_concept = get_concept_sim_X_y(cal_metadata_df, cal_cosine_similarity_df, concept)

    for method_name, calibrate in m4_calibrators.items():
        m4_models_cal[method_name][concept] = calibrate(m4_models[concept], X_cal_concept, y_cal_concept)

In [None]:
# Calibrate every per-concept embeddings model (EmbCLR) with each calibration method.
# Result layout: m5_models_cal[method][concept] -> calibrated model.

# Method label -> callable(model, X, y) returning the calibrated model.
# The lambdas pin the same keyword arguments used for the GLR model.
m5_calibrators = {
    'Platt': apply_platt_scaling,
    'Isotonic': apply_isotonic_regression,
    'Temperature': lambda model, X, y: apply_temperature_scaling(model, X, y, verbose=False),
    'Histogram': lambda model, X, y: apply_histogram_binning(model, X, y, nbins=10),
    'Beta': apply_beta_calibration,
}

m5_models_cal = {method_name: {} for method_name in m5_calibrators}

for concept in m5_models.keys():
    # The features are the calibration-split embeddings for every concept, so
    # they are passed directly instead of being re-bound on each iteration.
    # Only the binary target changes: 1 where the concept column equals 1, else 0.
    # A concept-specific name is used so the X_cal / y_cal of the global (GLR)
    # model are not overwritten.
    y_cal_concept = (cal_metadata_df[concept] == 1).to_numpy().astype(int)

    for method_name, calibrate in m5_calibrators.items():
        m5_models_cal[method_name][concept] = calibrate(m5_models[concept], cal_embeddings, y_cal_concept)

In [None]:
# Calibrated model collections keyed by base-model label; only the three
# logistic-regression models were calibrated above.
calibrated_models = {
    'GLR': m3_models_cal,
    'CLR': m4_models_cal,
    'EmbCLR': m5_models_cal,
}

## 4. Evaluate models

In [None]:
# Output folder for this dataset's result tables; created if it does not exist yet.
# The trailing slash is kept because results_path is also handed to the plotting
# helpers in section 5 (NOTE(review): they may build file names by string
# concatenation - confirm before changing the format).
results_path = f"Results/{dataset_name}/"
os.makedirs(results_path, exist_ok=True)

### 4.1 Get a single metric for all models, calibration methods, and concepts

**Use only when the number of concepts is small: the cell below runs the comparison only when there are at most 10 concepts.**

In [None]:
# Metric to tabulate for every model / calibration method / concept.
metric = 'K1'

# The table has one entry per concept, so it is only built for small concept sets.
if len(concepts) <= 10:
    metric_df = compare_all_models_calibration_metric(
        base_models, m3_models_cal, m4_models_cal, m5_models_cal,
        test_metadata_df, test_cosine_similarity_df, test_embeddings,
        metric=metric,
    )
    if 'K' in metric:
        # Metrics with a 'K' in their name are shown without max-highlighting
        # (presumably error-type metrics where larger is not better - TODO confirm).
        display(metric_df)
    else:
        display(metric_df.style.highlight_max(color='grey'))

### 4.2 Get the average of all metrics over the concepts for all models and calibration methods

In [None]:
# Average every metric over the concepts for each model / calibration method (test split).
metrics_avg_df = compare_all_models_calibration_avg(
    base_models, m3_models_cal, m4_models_cal, m5_models_cal,
    test_metadata_df, test_cosine_similarity_df, test_embeddings,
)

# Persist the full table as CSV and LaTeX.
metrics_avg_df.to_csv(os.path.join(results_path, 'metrics_average.csv'))

avg_tex_path = os.path.join(results_path, 'metrics_average.tex')
with open(avg_tex_path, 'w') as tf:
    tf.write(metrics_avg_df.to_latex())

# Short LaTeX version with only the accuracy and the two K metrics.
avg_short_tex_path = os.path.join(results_path, 'metrics_average_short.tex')
with open(avg_short_tex_path, 'w') as tf:
    tf.write(metrics_avg_df[['Acc','K1','Kmax']].to_latex())

# Displayed unstyled; optional highlighting:
#   metrics_avg_df.style.highlight_max(color='grey', subset=['Acc','F1','AUC'])
metrics_avg_df

### 4.3 Get all metrics for all models and calibration methods for a single concept

In [None]:
# Concept to inspect in detail (the first concept column by default).
concept = concepts[0]

metrics_concept_df = compare_all_models_calibration_concept(
    base_models, m3_models_cal, m4_models_cal, m5_models_cal,
    test_metadata_df, test_cosine_similarity_df, test_embeddings,
    concept=concept,
)

# Persist the table as CSV and LaTeX, named after the concept.
concept_csv_path = os.path.join(results_path, f'metrics_concept_{concept}.csv')
concept_tex_path = os.path.join(results_path, f'metrics_concept_{concept}.tex')
metrics_concept_df.to_csv(concept_csv_path)
with open(concept_tex_path, 'w') as tf:
    tf.write(metrics_concept_df.to_latex())

# Show the table with the best value highlighted for the three listed metrics.
metrics_concept_df.style.highlight_max(color='grey', subset=['Acc','F1','AUC'])

In [None]:
# Export the per-concept metric tables for every concept, but only when the
# concept set is small (fewer than 10 concepts).
if len(concepts) < 10:
    for concept in concepts:
        metrics_concept_df = compare_all_models_calibration_concept(
            base_models, m3_models_cal, m4_models_cal, m5_models_cal,
            test_metadata_df, test_cosine_similarity_df, test_embeddings,
            concept=concept,
        )

        concept_csv_path = os.path.join(results_path, f'metrics_concept_{concept}.csv')
        concept_tex_path = os.path.join(results_path, f'metrics_concept_{concept}.tex')
        metrics_concept_df.to_csv(concept_csv_path)
        with open(concept_tex_path, 'w') as tf:
            tf.write(metrics_concept_df.to_latex())

## 5. Calibration curves

In [None]:
from calibration import plot_calibration_curves_concept, plot_calibration_curves_avg

In [None]:
# Calibration curves of the base and calibrated models on the test split.
# results_path and dataset_name are passed through to the helper.
fig = plot_calibration_curves_avg(
    test_metadata_df, test_cosine_similarity_df, test_embeddings,
    base_models, calibrated_models,
    results_path, dataset_name,
)

In [None]:
# Plot per-concept calibration curves for at most the first 10 concepts.
# concepts[:10] is the whole list when there are fewer than 10 concepts, so a
# single loop replaces the two identical branches.
for concept in concepts[:10]:
    fig = plot_calibration_curves_concept(
        test_metadata_df, test_cosine_similarity_df, test_embeddings,
        base_models, calibrated_models,
        concept, results_path,
    )
    fig.show()

## Old stuff

In [None]:
# Plot, on one shared axis, the input -> output map of every calibrator that was
# fitted for the GLR model (m3_models_cal): x is the base-model probability,
# y the calibrated probability. All plt.* calls draw on the axes created by
# plt.subplots() below (pyplot's current axes).
# NOTE(review): this cell reads calibrator internals (get_params() keys,
# calibrated_classifiers_, a_/b_, temperature, softmax); these depend on the
# calibration_framework implementation and its library versions.
from scipy.special import logit, expit
from matplotlib import pyplot as plt
fig, ax = plt.subplots()

n = 501 # number of grid points on [0, 1] used to sample each calibration map
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

# No calibration: identity line for reference
plt.plot([0, 1], [0, 1], label = 'None', color = 'grey', linestyle = 'dashed', alpha = 0.5)

# Histogram binning: x comes from the calibrator's '_bin_bounds' entry and y from
# its '_bin_map' entry (presumably bin edges and per-bin calibrated values - TODO confirm).
# The last y value is repeated so the 'post' step plot has a value at the final x.
x_vals = m3_models_cal['Histogram'].calibrator.get_params()['_bin_bounds'][0]
y_vals = m3_models_cal['Histogram'].calibrator.get_params()['_bin_map']
y_vals = np.append(y_vals, y_vals[-1])
nbins = m3_models_cal['Histogram'].calibrator.get_params()['bins']
plt.step(x_vals, y_vals, where = 'post', label = 'Histogram ({} bins)'.format(nbins), color = colors[0])

# Isotonic regression, sampled on the grid rather than drawn from its true
# breakpoints (would be nice to find the true bins, but this will do).
# The grid drops the endpoints 0 and 1, where logit is infinite. The calibrator
# is fed logit(x) (assumes it operates on the base model's log-odds - TODO confirm).
x_vals = np.linspace(0, 1, num=n, endpoint=True)[1:-1]
y_vals = m3_models_cal['Isotonic'].calibrated_classifiers_[0].calibrators[0].predict(logit(x_vals))
plt.plot(x_vals, y_vals, label = 'Isotonic', color = colors[1])

# Platt scaling: same sampling as isotonic; the fitted a_ and b_ attributes are
# shown in the legend label.
x_vals = np.linspace(0, 1, num=n, endpoint=True)[1:-1]
y_vals = m3_models_cal['Platt'].calibrated_classifiers_[0].calibrators[0].predict(logit(x_vals))
tmp = m3_models_cal['Platt'].calibrated_classifiers_[0].calibrators[0]
a, b = tmp.a_, tmp.b_
plt.plot(x_vals, y_vals, label = 'Platt (A={:.2f}, B={:.2f})'.format(a, b), color = colors[2])

# Temperature scaling, drawn in two variants for comparison.

# Variant 1: divide the logit of the probability by T, then map back with expit.
# Emma thinks this should be the correct version (it looks way more believable).
x_vals = np.linspace(0, 1, num=n, endpoint=True)[1:-1]
y_vals = expit(logit(x_vals) / m3_models_cal['Temperature'].temperature)
T = m3_models_cal['Temperature'].temperature
plt.plot(x_vals, y_vals, label = 'Temperature w/ logit (T={:.2f})'.format(T), color = colors[5])

# Variant 2: matches what the code currently does on data (softmax and no logit).
# The probability pairs [1 - x, x] are divided by T and passed through the
# calibrator's own softmax; column 1 is plotted. Reuses x_vals from variant 1.
tmp = m3_models_cal['Temperature']
T = tmp.temperature
x_vals_vec = np.array([1 - x_vals, x_vals]).T
y_vals = tmp.softmax(x_vals_vec / tmp.temperature)[:, 1]
plt.plot(x_vals, y_vals, label = 'Temperature (T={:.2f})'.format(T), color = colors[3])

# Beta calibration: evaluated on the full grid (endpoints included) through the
# calibrator's transform(); the fitted weights (a, b) and bias (c) are read from
# its parameters for the legend label.
x_vals = np.linspace(0, 1, num=n, endpoint=True)
y_vals = m3_models_cal['Beta'].calibrator.transform(x_vals)
tmp = m3_models_cal['Beta'].calibrator.get_params()
a, b = tmp['_sites']['weights']['values']
c = tmp['_sites']['bias']['values'][0]
plt.plot(x_vals, y_vals, label = 'Beta (a={:.2f}, b={:.2f}, c={:.2f})'.format(a, b, c), color = colors[4])

plt.legend()
ax.set_xlabel('Base model probability estimate')
ax.set_ylabel('Calibrated model probability estimate')
ax.set_title('Calibrators of {} Model'.format('(M3) Global Similarity LogReg'))
plt.show()

In [None]:
# x_vals = m3_models.predict_proba(X_cal)
# y_vals = m3_models_cal['Platt'].predict_proba(X_cal)
# plt.scatter(x_vals, y_vals, label = 'Platt from data', s = 6, color = 'purple')

# x_vals = m3_models.predict_proba(X_cal)
# y_vals = m3_models_cal['Isotonic'].predict_proba(X_cal)
# plt.scatter(x_vals, y_vals, label = 'Isotonic from data', s = 6, color = 'red')

# x_vals = m3_models.predict_proba(X_cal)
# y_vals = m3_models_cal['Temperature'].predict_proba(X_cal)
# plt.scatter(x_vals, y_vals, label = 'Temperature from data', s = 6, color = 'violet')

# x_vals = m3_models.predict_proba(X_cal)
# y_vals = m3_models_cal['Beta'].predict_proba(X_cal)
# plt.scatter(x_vals, y_vals, label = 'Beta from data', s = 6, color = 'violet') 
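# NOTE: several of these scatter plots appear doubled - possibly because predict_proba returns one column per class (negative and positive) and both get plotted; if so, select a single column (e.g. [:, 1]) before plotting. TODO confirm.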

# tmp = m3_models_cal['Beta'].calibrator.get_params()
# a, b = tmp['_sites']['weights']['values']
# c = tmp['_sites']['bias']['values'][0]
# y_vals = np.exp(c) * x_vals**a / (1 - x_vals)**b
# y_vals = 1 / (1 + (1/y_vals))
# plt.plot(x_vals, y_vals, label = 'Beta guess')