In [None]:
# Note - this cell should be executed only once per session

import sys, os

# The config is not part of the library, so the parent directory of
# notebooks/ is put on sys.path to make it importable.

cwd_name = os.path.basename(os.getcwd())
if cwd_name != "notebooks":
    raise Exception("Wrong directory. Did you execute this cell twice?")
# Step one level up and register the new working directory for imports.
os.chdir(os.pardir)
sys.path.append(os.path.abspath(os.curdir))

%load_ext autoreload
%autoreload 2

In [None]:
from copy import copy
from kyle.calibration.calibration_methods import TemperatureScaling, ClassWiseCalibration, \
    ConfidenceReducedCalibration, BetaCalibration, BaseCalibrationMethod, IsotonicRegression
from kyle.evaluation import EvalStats

from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score
from sklearn.datasets import load_iris, load_breast_cancer, make_classification
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

import numpy as np
import matplotlib.cm as cm
import matplotlib.pyplot as plt


## Loading Models and Data

In [None]:
n_classes = 5

# Synthetic multi-class dataset. The fixed random_state makes the data (and
# every number derived from it further down) reproducible across kernel restarts.
dataset = make_classification(
    n_samples=60000, n_classes=n_classes, n_informative=15, random_state=42
)

X, y = dataset
# Alternative unpacking for dict-like datasets:
# X, y = dataset["data"], dataset["target"]

y.shape

In [None]:
test_size = 0.5
# Single stratified split; seeded so the same rows land in train/test on every run.
sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=42)

# n_splits=1, so the first (and only) pair of index arrays is all we need.
train_index, test_index = next(sss.split(X, y))
X_train, y_train = X[train_index], y[train_index]
X_test, y_test = X[test_index], y[test_index]

In [None]:
# Seeded so the fitted forest, and therefore the confidences, are reproducible.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Class probabilities on the held-out data; these are the "confidences" that
# the rest of the notebook calibrates.
confidences = model.predict_proba(X_test)
# Predicted class = column index of the highest probability.
y_pred = confidences.argmax(1)
# accuracy_score expects (y_true, y_pred); accuracy is symmetric, so the value
# is unchanged, but the call now follows the documented argument order.
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

In [None]:
# Inputs for the calibration sections below: `confidences` already holds the
# predicted class probabilities from the previous cell (the former
# `confidences = confidences` self-assignment was a no-op); the test labels
# serve as ground truth.

gt_labels = y_test


## Visualizing Distribution of Confidences

In [None]:
# `matplotlib.cm.get_cmap` was deprecated in Matplotlib 3.7 and removed in 3.9;
# `plt.get_cmap` is available in old and new releases alike.
cmap = plt.get_cmap("tab10")
bins = 50

# Loop-invariant: predicted class and top confidence of every sample.
predicted_classes = confidences.argmax(1)
top_confidences = confidences.max(1)

# squeeze=False always yields a 2-D array of axes, so the loop below also
# works when n_classes == 1.
fig, axes = plt.subplots(n_classes, figsize=(5, 5), squeeze=False)
fig.suptitle("Distribution of confidences in predicted classes", fontsize=14)
for count, row in enumerate(axes.ravel()):
    row.set_title(f"Predicted Class {count}")
    # Histogram of the top confidence over the samples predicted as class `count`.
    max_confs = top_confidences[predicted_classes == count]
    row.hist(max_confs, density=True, color=cmap(count), bins=bins)

plt.show()


# Temperature Scaling in Normal, Reduced and Class-wise Variants

## Simple Evaluation with Train/Validation Split

In [None]:
test_size = 0.5
bins = 20 # for ECE

# Split the model outputs into one half for fitting the calibrators and one
# half for evaluating them. Seeded so the reported numbers are reproducible.
sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=42)
# n_splits=1, so the first (and only) pair of index arrays is all we need.
train_index, test_index = next(sss.split(confidences, gt_labels))
confidences_train, gt_labels_train = confidences[train_index], gt_labels[train_index]
confidences_test, gt_labels_test = confidences[test_index], gt_labels[test_index]

Here are the initial reliability curve and ECE of the uncalibrated random forest.

In [None]:
# Evaluation helper built from the held-out labels and the raw (not yet
# calibrated) confidences, using `bins` bins.
uncalibrated_eval_stats = EvalStats(gt_labels_test, confidences_test, bins=bins)

In [None]:
# Baseline numbers before any calibration is applied.
print(f"ECE uncalibrated: {uncalibrated_eval_stats.expected_calibration_error()}")
# NOTE(review): the argument 1 presumably selects the class whose marginal error is reported - confirm.
print(f"Marginal uncalibrated: {uncalibrated_eval_stats.expected_marginal_calibration_error(1)}")


In [None]:
# Reliability curve for the top-class label of the uncalibrated confidences.
uncalibrated_eval_stats.plot_reliability_curves([EvalStats.TOP_CLASS_LABEL], display_weights=True)
plt.title("Uncalibrated reliabilities")
plt.show()

## Reduced Temp Scaling

In [None]:
# Two default-constructed calibrators: plain temperature scaling and the
# confidence-reduced wrapper.
# NOTE(review): presumably ConfidenceReducedCalibration wraps temperature scaling by default - confirm.
t_scaling_full = TemperatureScaling()
t_scaling_binary = ConfidenceReducedCalibration()

In [None]:
# Fit both calibrators on the calibration-training half only.
t_scaling_full.fit(confidences_train, gt_labels_train)
t_scaling_binary.fit(confidences_train, gt_labels_train)

In [None]:
# Apply the fitted calibrators to the held-out confidences.
recalibrated_full_confs = t_scaling_full.get_calibrated_confidences(confidences_test)
recalibrated_reduced_confs = t_scaling_binary.get_calibrated_confidences(confidences_test)

In [None]:
# Evaluate the recalibrated confidences against the held-out labels with the
# same number of bins as the uncalibrated baseline.
recalibrated_full_eval_stats = EvalStats(gt_labels_test, recalibrated_full_confs, bins=bins)
recalibrated_binary_eval_stats = EvalStats(gt_labels_test, recalibrated_reduced_confs, bins=bins)

print(f"Temp Scaling ECE: {recalibrated_full_eval_stats.expected_calibration_error()}")
print(f"Reduced Temp Scaling ECE: {recalibrated_binary_eval_stats.expected_calibration_error()}")

In [None]:
# One top-class reliability plot per calibrated variant, each shown as its own figure.
for eval_stats_to_plot, plot_title in (
    (recalibrated_full_eval_stats, "Temp scaling"),
    (recalibrated_binary_eval_stats, "Reduced temp scaling"),
):
    eval_stats_to_plot.plot_reliability_curves([EvalStats.TOP_CLASS_LABEL], display_weights=True)
    plt.title(plot_title)
    plt.show()


## Class-wise Temp Scaling

In [None]:
%%capture
# %%capture swallows whatever this cell prints while fitting.

# Default-constructed class-wise calibrator, fitted on the calibration-training half.
classwise_scaler = ClassWiseCalibration()
classwise_scaler.fit(confidences_train, gt_labels_train)

In [None]:
# Calibrate the held-out confidences class-wise and wrap them for evaluation.
classwise_recalibrated_confs = classwise_scaler.get_calibrated_confidences(confidences_test)
classwise_eval_stats = EvalStats(gt_labels_test, classwise_recalibrated_confs, bins=bins)

In [None]:
# Reliability curve for the top-class label after class-wise calibration.
classwise_eval_stats.plot_reliability_curves([EvalStats.TOP_CLASS_LABEL], display_weights=True)
plt.title("Class-wise Calibrated")
plt.show()

In [None]:
# Side-by-side comparison of class-wise vs. plain temperature scaling:
# first the ECE of each ...
print(f"Class-wise Temp Scaling ECE: {classwise_eval_stats.expected_calibration_error()}")
print(f"Temp Scaling ECE: {recalibrated_full_eval_stats.expected_calibration_error()}")

# ... then the class-wise ECE of each.
print(f"Class-wise Temp Scaling cwECE: {classwise_eval_stats.class_wise_expected_calibration_error()}")
print(f"Temp Scaling cwECE: {recalibrated_full_eval_stats.class_wise_expected_calibration_error()}")

## Cross Validation

In [None]:
# Calibrators that are compared in the cross-validation cells.
temp = TemperatureScaling()
beta = BetaCalibration()
# NOTE(review): "temt" looks like a typo for "temp"; the name is kept because
# other cells of this notebook reference it as-is.
classwise_temt = ClassWiseCalibration()
# ClassWiseCalibration is handed a zero-argument callable (here the class itself) ...
classwise_beta = ClassWiseCalibration(BetaCalibration)
# ... whereas ConfidenceReducedCalibration is handed ready-made instances.
classwise_reduced_temp = ConfidenceReducedCalibration(ClassWiseCalibration())
reduced_temp = ConfidenceReducedCalibration()
reduced_beta = ConfidenceReducedCalibration(BetaCalibration())
isotonic = IsotonicRegression()
reduced_isotonic = ConfidenceReducedCalibration(IsotonicRegression())

# The lambda builds a fresh reduced isotonic calibrator on each call.
classwise_reduced_isotonic = ClassWiseCalibration(lambda: ConfidenceReducedCalibration(IsotonicRegression()))

def compute_score(scaler, confs: np.ndarray, labels: np.ndarray, bins, metric="ECE"):
    """
    Calibrate ``confs`` with ``scaler`` and score the result against ``labels``.

    :param scaler: calibrator exposing ``get_calibrated_confidences``
    :param confs: confidences passed to the calibrator
    :param labels: ground-truth labels handed to ``EvalStats``
    :param bins: forwarded to ``EvalStats`` as ``bins``
    :param metric: ``"ECE"``, ``"cwECE"`` or an integer (Python or NumPy), which is
        forwarded to ``expected_marginal_calibration_error``
    :return: the value returned by the selected ``EvalStats`` method
    :raises ValueError: if ``metric`` is none of the above
    """
    # NumPy integers (e.g. taken from an array of class indices) are not
    # instances of ``int``, so they are accepted explicitly.
    is_marginal = isinstance(metric, (int, np.integer))
    # Validate up front so a typo in ``metric`` fails before any calibration
    # or evaluation work is done.
    if not is_marginal and metric not in ("ECE", "cwECE"):
        raise ValueError(f"Unknown metric {metric}")

    calibrated_confs = scaler.get_calibrated_confidences(confs)
    eval_stats = EvalStats(labels, calibrated_confs, bins=bins)
    if is_marginal:
        return eval_stats.expected_marginal_calibration_error(metric)
    if metric == "ECE":
        return eval_stats.expected_calibration_error()
    return eval_stats.class_wise_expected_calibration_error()


class Identity(BaseCalibrationMethod):
    """No-op calibrator: fitting does nothing and confidences pass through unchanged."""

    def fit(self, *args):
        """Accept and ignore any positional arguments; nothing is learned."""
        return None

    def get_calibrated_confidences(self, confidences: np.ndarray):
        """Return ``confidences`` itself (the same object, not a copy)."""
        unchanged = confidences
        return unchanged

def get_scores(scaler, metric, cv, bins):
    """
    Cross-validate ``scaler`` on the notebook-level ``confidences`` and ``gt_labels``.

    :param scaler: estimator handed to ``cross_val_score``
    :param metric: metric selector forwarded to ``compute_score``
    :param cv: forwarded to ``cross_val_score`` as ``cv``
    :param bins: forwarded to ``compute_score`` as ``bins``
    :return: whatever ``cross_val_score`` returns for the given arguments
    """
    def scoring(*args):
        # cross_val_score supplies the positional arguments; bins and metric are fixed here.
        return compute_score(*args, bins=bins, metric=metric)

    return cross_val_score(scaler, confidences, gt_labels, scoring=scoring, cv=cv)

In [None]:
%%capture
# get rid of the output produced by netcal. They use print instead of logging, it seems

cv = 6
bins = 20

# bring in some randomness to the evaluations
permutation = np.random.permutation(len(confidences))
confidences = confidences[permutation]
gt_labels = gt_labels[permutation]


uncalibrated_ECE = get_scores(Identity(), "ECE", cv, bins)
temp_ECE = get_scores(temp,  "ECE", cv, bins)
beta_ECE = get_scores(beta, "ECE", cv, bins)

cw_temp_ECE = get_scores(classwise_temt, "ECE", cv, bins)
cw_beta_ECE = get_scores(classwise_beta, "ECE", cv, bins)
reduced_temp_ECE = get_scores(reduced_temp, "ECE", cv, bins)
# reduced_beta = get_scores(reduced_beta, "ECE", cv, bins)

isotonic_ECE = get_scores(isotonic, "ECE", cv, bins)
reduced_isotonic_ECE = get_scores(reduced_isotonic, "ECE", cv, bins)
cw_reduced_isotonic_ECE = get_scores(classwise_reduced_isotonic, "ECE", cv, bins)

In [None]:
# Parallel lists: scores[i] is drawn under labels[i]. Commented-out entries
# must be toggled in both lists together.
scores = [
    # uncalibrated_ECE,
    temp_ECE,
    cw_temp_ECE,
    # reduced_temp_ECE,
    beta_ECE,
    cw_beta_ECE,
    # reduced_beta,
    isotonic_ECE,
    reduced_isotonic_ECE,
    cw_reduced_isotonic_ECE,
]
labels = [
    # "Baseline 0 - ECE, uncalibrated",
    "Baseline 1 - ECE, temperature",
    "ECE, Class-wise temperature",
    # "ECE, Reduced temperature",
    "Baseline 2 - ECE, beta",
    "ECE, Class-wise beta",
    # "ECE, Reduced beta",
    "Baseline 3 - ECE, Isotonic Regression",
    "ECE, Reduced isotonic regression",
    "ECE, CW-Reduced isotonic regression"
]
assert len(scores) == len(labels), "every score series needs exactly one label"

plt.figure(figsize=(14,7))
plt.title(f"CV with {cv} folds on {len(confidences)} data points. \n"
          f"ECE Scores computed with {bins} bins")
# The `labels` keyword of boxplot is deprecated since Matplotlib 3.9 (renamed
# to `tick_labels`). Setting the tick labels explicitly works on every version;
# boxes are drawn at positions 1..N by default.
plt.boxplot(scores)
plt.xticks(range(1, len(labels) + 1), labels, rotation=70)
plt.show()

In [None]:
%%capture
# %%capture swallows whatever the calibrators print while being fitted.

cv_class_wise = 6
bins_class_wise = 20

# NOTE(review): compute_cwECE is not referenced by any other visible cell;
# get_scores builds its own scoring callable.
compute_cwECE = lambda *args: compute_score(*args, metric="cwECE", bins=bins_class_wise)

# Class-wise ECE per fold for every calibrator; Identity() is the uncalibrated baseline.
uncalibrated_cwECE = get_scores(Identity(), "cwECE", cv_class_wise, bins_class_wise)
temp_cwECE = get_scores(temp, "cwECE", cv_class_wise, bins_class_wise)
beta_cwECE = get_scores(beta, "cwECE", cv_class_wise, bins_class_wise)

cw_temp_cwECE = get_scores(classwise_temt, "cwECE", cv_class_wise, bins_class_wise)
cw_beta_cwECE = get_scores(classwise_beta, "cwECE", cv_class_wise, bins_class_wise)

isotonic_cwECE = get_scores(isotonic, "cwECE", cv_class_wise, bins_class_wise)
reduced_isotonic_cwECE = get_scores(reduced_isotonic, "cwECE", cv_class_wise, bins_class_wise)
cw_reduced_isotonic_cwECE = get_scores(classwise_reduced_isotonic, "cwECE", cv_class_wise, bins_class_wise)

In [None]:
# Parallel lists: scores_cwECE[i] is drawn under labels_cwECE[i]. Commented-out
# entries must be toggled in both lists together.
scores_cwECE = [
    # uncalibrated_cwECE,
    temp_cwECE,
    cw_temp_cwECE,
    beta_cwECE,
    cw_beta_cwECE,
    isotonic_cwECE,
    reduced_isotonic_cwECE,
    cw_reduced_isotonic_cwECE,
]
labels_cwECE = [
    # "Baseline 0 - cwECE, uncalibrated",
    "Baseline 1 - cwECE, temperature",
    "cwECE, Class-wise temperature",
    "Baseline 2 - cwECE, beta",
    "cwECE, Class-wise beta",
    "Baseline 3 - cwECE, Isotonic Regression",
    "cwECE, Reduced Isotonic Regression",
    "cwECE, Class-wise reduced Isotonic Regression",

]
assert len(scores_cwECE) == len(labels_cwECE), "every score series needs exactly one label"

plt.figure(figsize=(14,7))
plt.title(f"CV with {cv_class_wise} folds on {len(confidences)} data points. \n"
          f"Class-wise ECE scores computed with {bins_class_wise} bins")
# The `labels` keyword of boxplot is deprecated since Matplotlib 3.9 (renamed
# to `tick_labels`). Setting the tick labels explicitly works on every version;
# boxes are drawn at positions 1..N by default.
plt.boxplot(scores_cwECE)
plt.xticks(range(1, len(labels_cwECE) + 1), labels_cwECE, rotation=70)
plt.show()


In [None]:
%%capture
# %%capture swallows whatever the calibrators print while being fitted.

cv_marginal = 6
# An integer metric makes compute_score call expected_marginal_calibration_error with it.
class_for_marginal_error = 2
marginal_bins = 20
# NOTE(review): compute_score_marginal is not referenced by any other visible
# cell; get_scores builds its own scoring callable.
compute_score_marginal = lambda *args: compute_score(*args, metric=class_for_marginal_error, bins=marginal_bins)


# Marginal calibration error per fold for every calibrator; Identity() is the uncalibrated baseline.
uncalibrated_cwECE_marginal = get_scores(Identity(), class_for_marginal_error, cv_marginal, marginal_bins)
temp_cwECE_marginal = get_scores(temp, class_for_marginal_error, cv_marginal, marginal_bins)
beta_cwECE_marginal = get_scores(beta, class_for_marginal_error, cv_marginal, marginal_bins)
cw_temp_cwECE_marginal = get_scores(classwise_temt, class_for_marginal_error, cv_marginal, marginal_bins)
cw_beta_cwECE_marginal = get_scores(classwise_beta, class_for_marginal_error, cv_marginal, marginal_bins)

In [None]:
# Parallel lists: marginal_scores[i] is drawn under marginal_labels[i].
# Commented-out entries must be toggled in both lists together.
marginal_labels = [
    # "Baseline 0 - Marginal, uncalibrated",
    "Baseline 1 - Marginal, temperature",
    "Marginal, Class-wise temperature",
    "Baseline 2 - Marginal, beta",
    "Marginal, Class-wise beta",
]

marginal_scores = [
    # uncalibrated_cwECE_marginal,
    temp_cwECE_marginal,
    cw_temp_cwECE_marginal,
    beta_cwECE_marginal,
    cw_beta_cwECE_marginal
]
assert len(marginal_scores) == len(marginal_labels), "every score series needs exactly one label"

plt.figure(figsize=(14,7))
plt.title(f"CV with {cv_marginal} folds on {len(confidences)} data points. \n"
          f"Marginal scores for class {class_for_marginal_error}. Computed with {marginal_bins} bins")
# The `labels` keyword of boxplot is deprecated since Matplotlib 3.9 (renamed
# to `tick_labels`). Setting the tick labels explicitly works on every version;
# boxes are drawn at positions 1..N by default.
plt.boxplot(marginal_scores)
plt.xticks(range(1, len(marginal_labels) + 1), marginal_labels, rotation=70)
plt.show()