In [8]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

from pathlib import Path
from tqdm import tqdm
import sklearn.model_selection
import torch
from torch.utils.data.sampler import SubsetRandomSampler
from torch import nn
from torchvision.models import (
    resnet18, ResNet18_Weights,
    resnet34, ResNet34_Weights,
    resnet50, ResNet50_Weights,
    resnet101, ResNet101_Weights,
    vit_b_16, ViT_B_16_Weights,
    vit_b_32, ViT_B_32_Weights,
    vgg16, VGG16_Weights, 
    vgg16_bn, VGG16_BN_Weights,
    convnext_tiny, ConvNeXt_Tiny_Weights,
    convnext_base, ConvNeXt_Base_Weights,
    efficientnet_v2_s, EfficientNet_V2_S_Weights,
    efficientnet_v2_m, EfficientNet_V2_M_Weights
)
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader, Subset
from torchvision.transforms import Compose, Resize, Lambda, ToTensor, Grayscale, ToPILImage
import timm
from timm.data import resolve_data_config, create_transform
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# The bare 'seaborn' style name is deprecated (see the warning in this cell's
# output) and absent from newer matplotlib; prefer the renamed sheet when present.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
import PIL

import sys
sys.path.append('./benchmarking-uncertainty-estimation-performance-main/utils')
from temperature_scaling import _ECELoss
from uncertainty_metrics import AUROC, ECE_calc
from general_calibration_error import gce
from df_posthoc_calibration.calibration import HB_binary, HB_toplabel

from calibrators import ModelWithVectorScaling, ModelWithTemperatureOriginal, ModelWithTemperature

# Seed every RNG used in this notebook so a fresh "Run All" is repeatable.
torch.manual_seed(0)
np.random.seed(0)
rng = np.random.default_rng(0)

path_results = os.path.dirname(os.getcwd()) + '/results'

device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

# Known ImageNet locations, one per cluster. When several exist the last one
# in the list wins. Reset first so a stale value from a previous kernel run
# cannot leak through, and fail here with a clear message rather than with a
# NameError in a later cell.
path_dataset = None
for p in [
    Path('/scratchf/IMAGENET'), # DeepLab
    Path(os.path.expandvars('$DSDIR/imagenet'))]: # Jean Zay
    if os.path.exists(p):
        path_dataset = p
if path_dataset is None:
    raise FileNotFoundError(
        "No ImageNet directory found: tried /scratchf/IMAGENET and $DSDIR/imagenet"
    )

cuda


  plt.style.use('seaborn')


In [2]:
# Batch size used when building the DataLoaders in this notebook.
BATCH_SIZE = 64
# Name of the pretrained model to calibrate; must be a key of
# `models_and_weights_torchvision` or an entry of `models_timm` below.
MODEL = 'ResNet50_V2' 

# TORCHVISION
# Maps a model name to (constructor, pretrained-weights enum member).
# NOTE(review): the trailing "better" / "same/worse" remarks are the author's
# annotations; what they are compared against is not stated in this notebook.
models_and_weights_torchvision = {
    'ResNet18': (resnet18, ResNet18_Weights.IMAGENET1K_V1), # same/worse
    'ResNet34': (resnet34, ResNet34_Weights.IMAGENET1K_V1), # same/worse
    'ResNet50': (resnet50, ResNet50_Weights.IMAGENET1K_V1), # same/worse
    'ResNet50_V2': (resnet50, ResNet50_Weights.IMAGENET1K_V2), # better
    'ResNet101': (resnet101, ResNet101_Weights.IMAGENET1K_V1), # same/worse
    'ResNet101_V2': (resnet101, ResNet101_Weights.IMAGENET1K_V2), # better
    'ViT_B_16': (vit_b_16, ViT_B_16_Weights.IMAGENET1K_V1), # better
    'ViT_B_16_SWAG_E2E': (vit_b_16, ViT_B_16_Weights.IMAGENET1K_SWAG_E2E_V1), # same/worse
    'ViT_B_16_SWAG_LINEAR': (vit_b_16, ViT_B_16_Weights.IMAGENET1K_SWAG_LINEAR_V1), # same/worse
    'ViT_B_32': (vit_b_32, ViT_B_32_Weights.IMAGENET1K_V1), # better
    'VGG16': (vgg16, VGG16_Weights.IMAGENET1K_V1), # same/worse
    'VGG16_BN': (vgg16_bn, VGG16_BN_Weights.IMAGENET1K_V1), # same/worse
    'ConvNeXt_Tiny': (convnext_tiny, ConvNeXt_Tiny_Weights.IMAGENET1K_V1), # better
    'ConvNeXt_Base': (convnext_base, ConvNeXt_Base_Weights.IMAGENET1K_V1), # better
    'EfficientNet_V2_S': (efficientnet_v2_s, EfficientNet_V2_S_Weights.IMAGENET1K_V1), # better
    'EfficientNet_V2_M': (efficientnet_v2_m, EfficientNet_V2_M_Weights.IMAGENET1K_V1) # better
}

# TIMM
# Model names handed to `timm.create_model`. Entries flagged "IN21K" are marked
# by the author as needing a 21k -> 1k prediction conversion before use.
models_timm = [
    'vit_base_patch16_224', # same/worse
    'vit_base_patch16_224_in21k', # NEED TO CONVERT IN21K PREDICTIONS TO IN1K
    'vit_base_patch16_224_miil', # same/worse
    'vit_base_patch16_224_miil_in21k', # NEED TO CONVERT IN21K PREDICTIONS TO IN1K
    'vit_base_patch16_384', # same/worse
    'vit_base_patch32_224', # same/worse
    'vit_base_patch32_224_in21k', # NEED TO CONVERT IN21K PREDICTIONS TO IN1K
    'vit_base_patch32_384', # same/worse
    ]

# Decide which library provides MODEL; an unknown name fails with a message
# that lists the valid choices instead of a bare ValueError.
if MODEL in models_and_weights_torchvision:
    TORCHVISION_OR_TIMM = 'torchvision'
elif MODEL in models_timm:
    TORCHVISION_OR_TIMM = 'timm'
else:
    raise ValueError(
        f"Unknown MODEL {MODEL!r}: expected one of "
        f"{sorted(models_and_weights_torchvision)} or {models_timm}"
    )

In [3]:
# LOAD CLASSIFIER
# Either branch produces `classifier` (eval mode, moved to `device`) together
# with the preprocessing `transforms` that belong to the chosen weights.
if TORCHVISION_OR_TIMM == 'torchvision':
    model, weights = models_and_weights_torchvision[MODEL]
    classifier = model(weights=weights)
    classifier = classifier.eval().to(device)
    classifier = torch.compile(classifier)
    transforms = weights.transforms()

elif TORCHVISION_OR_TIMM == 'timm':
    classifier = timm.create_model(MODEL, pretrained=True).eval().to(device)
    # timm derives the input pipeline from the model's own data configuration.
    data_config = timm.data.resolve_data_config({}, model=classifier)
    transforms = timm.data.create_transform(**data_config)


# LOAD DATA
# dataset_train = ImageFolder(path_dataset+'/train', transform=transforms)
dataset_val = ImageFolder(path_dataset / 'val', transform=transforms)

# dataloader_train = DataLoader(dataset_train, batch_size=BATCH_SIZE, num_workers=4, pin_memory=True, shuffle=True) 
dataloader_val = DataLoader(
    dataset_val,
    shuffle=True,
    batch_size=BATCH_SIZE,
    num_workers=4,
    pin_memory=True,
)
# id_to_idx = {}
# idx_to_label = {}
# with open(path_imagenet_labels) as f:
#     for i, line in enumerate(f):
#         id_to_idx[line[:9]] = i
#         idx_to_label[i] = line[10:-1]



In [4]:
valid_size = 5000
num_classes = 1000

# Carve `valid_size` images out of the ImageNet val set for fitting the
# calibrators; the remainder is the test set. The split is stratified on the
# class labels. `random_state` pins the partition so that re-running this cell
# (or running cells out of order) cannot silently change which images are in
# which subset.
test_indices, valid_indices = sklearn.model_selection.train_test_split(
    np.arange(len(dataset_val)),
    train_size=len(dataset_val) - valid_size,
    stratify=dataset_val.targets,
    random_state=0,
)
valid_loader = torch.utils.data.DataLoader(dataset_val, pin_memory=True, batch_size=BATCH_SIZE,
                                            sampler=SubsetRandomSampler(valid_indices), num_workers=4)
test_loader = torch.utils.data.DataLoader(dataset_val, pin_memory=True, batch_size=BATCH_SIZE,
                                            sampler=SubsetRandomSampler(test_indices), num_workers=4)


# Baseline: temperature fitted by the `calibrators.ModelWithTemperatureOriginal` wrapper.
model = ModelWithTemperatureOriginal(classifier)
print('Performing temperature scaling')
model.set_temperature(valid_loader)
baseline_temp = model.temperature.item()

Performing temperature scaling


In [5]:
# Temperature fitted with the binary-loss option of the `ModelWithTemperature` wrapper.
model = ModelWithTemperature(classifier, binary_loss=True)
model.set_temperature(valid_loader)
# The comparison cell further down reads `optimal_temp_base`, and `optimal_temp`
# is reassigned by a later cell, so keep this fit under its own name too.
optimal_temp_base = model.temperature.item()
optimal_temp = optimal_temp_base

In [6]:
def collect_logits(loader, n_samples):
    """Run `classifier` over `loader` and gather its outputs on the CPU.

    Args:
        loader: iterable of (images, labels) batches.
        n_samples: total number of samples the loader yields.

    Returns:
        (logits, labels): a float tensor of shape (n_samples, num_classes) and
        an integer tensor of shape (n_samples,), both in loader order.
    """
    all_logits = torch.zeros((n_samples, num_classes))
    all_labels = torch.zeros(n_samples, dtype=int)
    start = 0
    with torch.no_grad():
        for x, y in loader:
            logits = classifier(x.to(device)).cpu()
            stop = start + logits.shape[0]
            all_logits[start:stop, :] = logits
            all_labels[start:stop] = y.cpu()
            start = stop
    return all_logits, all_labels


# Uncalibrated logits for both subsets; every later evaluation cell rescales these.
all_logits_test, all_labels_test = collect_logits(test_loader, len(test_indices))
all_logits_valid, all_labels_valid = collect_logits(valid_loader, len(valid_indices))

# Optimize temperature scaling (TS)

In [None]:
from focal_calibration.Losses.brier_score import BrierScore

class ModelWithTemperature(nn.Module):
    """
    A thin decorator, which wraps a model with temperature scaling.
    The temperature is fitted by minimising the Brier score (not NLL).

    NOTE(review): this definition shadows the `ModelWithTemperature` imported
    from `calibrators` at the top of the notebook and does not accept the
    `binary_loss` argument an earlier cell passes, so that earlier cell fails
    if it is re-run after this one.

    model (nn.Module):
        A classification neural network
        NB: Output of the neural network should be the classification logits,
            NOT the softmax (or log softmax)!
    """
    def __init__(self, model):
        super(ModelWithTemperature, self).__init__()
        self.model = model.eval()
        # Single learnable scalar, initialised at 1.5.
        self.temperature = nn.Parameter(torch.ones(1) * 1.5)

    def forward(self, input):
        """Return the wrapped model's logits divided by the temperature."""
        logits = self.model(input)
        return self.temperature_scale(logits)

    def temperature_scale(self, logits):
        """
        Perform temperature scaling on logits
        """
        # The (1,)-shaped temperature broadcasts over the whole logits matrix.
        return logits / self.temperature

    # This function probably should live outside of this class, but whatever
    def set_temperature(self, valid_loader):
        """
        Tune the temperature of the model (using the validation set).
        The objective is the Brier score of the temperature-scaled logits.
        valid_loader (DataLoader): validation set loader

        Returns self, with `self.temperature` updated in place.
        """
        # Same device choice as the rest of the notebook: CUDA when available.
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.to(device)
        brier_criterion = BrierScore().to(device)

        # First: collect all the logits and labels for the validation set
        logits_list = []
        labels_list = []
        with torch.no_grad():
            for input, label in valid_loader:
                logits_list.append(self.model(input.to(device)))
                labels_list.append(label.to(device))
            logits = torch.cat(logits_list)
            labels = torch.cat(labels_list)

        # Next: optimize the temperature w.r.t. the Brier score
        optimizer = torch.optim.LBFGS([self.temperature], lr=0.01, max_iter=5000)

        def closure():
            # LBFGS re-evaluates the objective through this callback.
            optimizer.zero_grad()
            loss = brier_criterion(self.temperature_scale(logits), labels)
            loss.backward()
            return loss

        optimizer.step(closure)

        return self

# Fit the Brier-score temperature defined in this cell and keep its scalar value.
print('Performing temperature scaling')
model = ModelWithTemperature(classifier).set_temperature(valid_loader)
baseline_temp2 = model.temperature.item()

In [None]:
from focal_calibration.Losses.brier_score import BrierScore


class ModelWithTemperatureCustom(nn.Module):
    """
    A thin decorator, which wraps a model with temperature scaling.
    The temperature is fitted so that the top-class probability matches the
    0/1 correctness of the prediction (mean squared error), not NLL.

    model (nn.Module):
        A classification neural network
        NB: Output of the neural network should be the classification logits,
            NOT the softmax (or log softmax)!
    """
    def __init__(self, model):
        super(ModelWithTemperatureCustom, self).__init__()
        self.model = model.eval()
        # Single learnable scalar, initialised at 1 (i.e. no rescaling).
        self.temperature = nn.Parameter(torch.ones(1))

    def forward(self, input):
        """Return the wrapped model's logits divided by the temperature."""
        logits = self.model(input)
        return self.temperature_scale(logits)

    def temperature_scale(self, logits):
        """
        Perform temperature scaling on logits
        """
        # The (1,)-shaped temperature broadcasts over the whole logits matrix.
        return logits / self.temperature

    # This function probably should live outside of this class, but whatever
    def set_temperature(self, valid_loader):
        """
        Tune the temperature of the model (using the validation set).
        Minimises mean((max softmax probability - correct) ** 2), where
        `correct` is 1 when the unscaled prediction equals the label.
        valid_loader (DataLoader): validation set loader

        Returns self, with `self.temperature` updated in place.
        """
        # Same device choice as the rest of the notebook: CUDA when available.
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.to(device)

        # First: collect the logits and the per-sample correctness targets.
        # Only these two are needed by the objective, so the full probability
        # matrices are not kept around.
        logits_list = []
        correct_list = []
        with torch.no_grad():
            for input, label in valid_loader:
                label = label.to(device)
                logits = self.model(input.to(device))
                y_pred = torch.softmax(logits, dim=1).argmax(dim=1)
                correct_list.append((y_pred == label).float())
                logits_list.append(logits)
            logits = torch.cat(logits_list)
            train_labels = torch.cat(correct_list).unsqueeze(1)

        # Next: optimize the temperature w.r.t. the squared error above
        optimizer = torch.optim.LBFGS([self.temperature], lr=0.01, max_iter=5000)

        def closure():
            # LBFGS re-evaluates the objective through this callback.
            optimizer.zero_grad()
            probas = torch.softmax(self.temperature_scale(logits), dim=1)
            max_probas = probas.max(dim=1, keepdim=True).values
            loss = ((max_probas - train_labels) ** 2).mean()
            loss.backward()
            return loss

        optimizer.step(closure)

        return self

# Fit the confidence-vs-correctness temperature and keep its scalar value.
model = ModelWithTemperatureCustom(classifier).set_temperature(valid_loader)
optimal_temp = model.temperature.item()

In [None]:
# Report ECE (via `gce`, 15 even bins, max-prob, L1) and AUROC on the test
# logits for each candidate temperature. One loop replaces five copy-pasted
# stanzas; the printed headers are unchanged.
# NOTE(review): `optimal_temp_base` must already exist in the kernel when this
# cell runs -- confirm which cell assigns it.
temperatures = [
    ('original', 1),
    ('\nbaseline', baseline_temp),
    ('\noptimal', optimal_temp_base),
    ('\noptimal', optimal_temp),
    ('\nbaseline2', baseline_temp2),
]
for label, t in temperatures:
    probs = torch.softmax(all_logits_test/t, axis=1)
    certainties, y_pred = probs.max(axis=1)
    correct = y_pred == all_labels_test
    # Column 0: confidence, column 1: whether the prediction was correct.
    samples_certainties = torch.stack((certainties.cpu(), correct.cpu()), dim=1)
    auroc = AUROC(samples_certainties)
    # ece, _ = ECE_calc(samples_certainties)
    ece = gce(all_labels_test, probs, binning_scheme='even', class_conditional=False, max_prob=True, norm='l1', num_bins=15)
    print(label)
    print(ece)
    print(auroc)

# Clusters for vector scaling

In [None]:
loader = valid_loader

# Gather uncalibrated logits and labels for every sample the loader yields.
n_samples = len(loader.sampler)
all_logits = torch.zeros((n_samples, num_classes))
all_labels = torch.zeros(n_samples, dtype=int)
offset = 0
for x, y in loader:
    with torch.no_grad():
        logits = classifier(x.to(device))
    batch_len = logits.shape[0]
    all_logits[offset:offset + batch_len, :] = logits.cpu()
    all_labels[offset:offset + batch_len] = y.cpu()
    offset += batch_len
probs = torch.softmax(all_logits, axis=1)
certainties, y_pred = probs.max(axis=1)
correct = y_pred == all_labels


# Per-class accuracy and mean top-class confidence, indexed by class id.
class_masks = [all_labels == l for l in range(num_classes)]
acc_per_class = np.array([correct[mask].float().mean().item() for mask in class_masks])
avg_confid_per_class = np.array([certainties[mask].mean().item() for mask in class_masks])

# Side-by-side histograms of the two per-class statistics.
fig, axs = plt.subplots(1, 2)
acc_ax, confid_ax = axs
acc_ax.hist(acc_per_class)
acc_ax.set_xlabel('accuracy per class')
confid_ax.hist(avg_confid_per_class)
confid_ax.set_xlabel('average confid per class')

In [None]:
def get_adaptive_bins(predictions, num_bins):
    """Return bin edges that put roughly the same number of points in each bin.

    For non-empty input the result holds `num_bins - 1` values taken from the
    sorted predictions (the leftmost edge is dropped, matching the
    numpy.digitize convention). For empty input it returns `num_bins` evenly
    spaced values starting at 0.
    """
    if np.size(predictions) == 0:
        return np.linspace(0, 1, num_bins+1)[:-1]

    n_points = len(predictions)

    # Evenly spaced positions into the sorted data, rounded to whole indices;
    # when num_bins does not divide n_points the bin sizes alternate between
    # SIZE and SIZE+1.
    positions = np.round(np.linspace(0, n_points, num_bins, endpoint=False)).astype(int)

    # With many more bins than points an index can overshoot by one; pull it
    # back onto the last element.
    positions = np.minimum(positions, n_points - 1)

    ordered = np.sort(predictions)
    return ordered[positions][1:]

In [None]:
# `n_clusters` is otherwise only assigned in a later cell, so a fresh
# "Run All" would raise a NameError here; define it (same value) before use.
n_clusters = 5
# Evenly spaced edges spanning the observed per-class confidences.
boundaries = torch.linspace(avg_confid_per_class.min()-1e-6, avg_confid_per_class.max(), n_clusters+1) # -1e-6 to include min value in first bucket
boundaries

In [None]:
# Assign every class to a bucket, then print how many classes fall in each one.
buckets = torch.bucketize(torch.tensor(avg_confid_per_class), boundaries)
bucket_ids = buckets.unique()
for i, b in enumerate(bucket_ids):
    print(i, (buckets == b).sum())

In [None]:
# Recompute the bucket edges with `get_adaptive_bins` (5 bins); this replaces
# the value `boundaries` held from the earlier linspace cell.
boundaries = get_adaptive_bins(avg_confid_per_class, 5)
boundaries

In [None]:
# Bucket the classes with the adaptive edges and print the size of each bucket.
buckets = torch.bucketize(torch.tensor(avg_confid_per_class), torch.tensor(boundaries))
bucket_ids = buckets.unique()
for i, b in enumerate(bucket_ids):
    print(i, (buckets == b).sum())

In [None]:
from focal_calibration.Losses.brier_score import BrierScore

class ModelWithVectorScaling(nn.Module):
    """
    A thin decorator, which wraps a model with vector scaling: every logit is
    multiplied by a learnable per-class factor (or a per-cluster factor shared
    by all classes of a cluster when `n_clusters` is given).

    NOTE(review): this definition shadows the `ModelWithVectorScaling` imported
    from `calibrators` at the top of the notebook.

    model (nn.Module):
        A classification neural network
        NB: Output of the neural network should be the classification logits,
            NOT the softmax (or log softmax)!
    n_classes (int): number of output classes of `model`.
    binary_loss (bool): if True, fit the top-class probability against the 0/1
        correctness of the prediction; otherwise fit the Brier score.
    n_clusters (int or None): number of class clusters sharing one factor;
        None means one factor per class.
    """
    def __init__(self, model, n_classes, binary_loss=False, n_clusters=None):
        super(ModelWithVectorScaling, self).__init__()
        self.model = model.eval()
        self.n_classes = n_classes
        self.binary_loss = binary_loss
        self.n_clusters = n_clusters
        # Cluster id of every class; filled in by `set_vector` when clustering.
        self.buckets = None
        n_params = self.n_classes if self.n_clusters is None else self.n_clusters
        self.vector_params = nn.Parameter(torch.ones(n_params) * 1.5)

    def forward(self, input):
        """Return the wrapped model's logits after scaling."""
        logits = self.model(input)
        return self.scale(logits)

    def get_adaptive_bins(self, predictions, num_bins):
        """Returns upper edges for binning an equal number of datapoints per bin."""
        if np.size(predictions) == 0:
            return np.linspace(0, 1, num_bins+1)[:-1]

        edge_indices = np.linspace(0, len(predictions), num_bins, endpoint=False)

        # Round into integers for indexing. If num_bins does not evenly divide
        # len(predictions), this means that bin sizes will alternate between SIZE and
        # SIZE+1.
        edge_indices = np.round(edge_indices).astype(int)

        # If there are many more bins than data points, some indices will be
        # out-of-bounds by one. Set them to be within bounds:
        edge_indices = np.minimum(edge_indices, len(predictions) - 1)

        # Obtain the edge values:
        edges = np.sort(predictions)[edge_indices]

        # Following the convention of numpy.digitize, we do not include the leftmost
        # edge (i.e. return the upper bin edges):
        return edges[1:]

    def get_vector(self):
        """Return the per-class scaling vector of length `n_classes`.

        Without clustering this is the parameter itself. With clustering, each
        class receives the parameter of its cluster; clusters are numbered by
        the sorted order of the distinct ids in `self.buckets`.

        Raises:
            RuntimeError: clustering is enabled but `set_vector` has not yet
                assigned `self.buckets`.
        """
        if self.n_clusters is None:
            return self.vector_params
        if self.buckets is None:
            raise RuntimeError('call set_vector() before get_vector() when n_clusters is set')
        # Follow the parameter's device instead of assuming CUDA.
        buckets = self.buckets.to(self.vector_params.device)
        # inverse[c] is the rank of class c's bucket among the sorted unique ids.
        _, cluster_of_class = torch.unique(buckets, return_inverse=True)
        return self.vector_params[cluster_of_class]

    def scale(self, logits):
        """
        Perform scaling on logits
        """
        vector = self.get_vector()
        return logits * vector

    def set_vector(self, valid_loader):
        """
        Tune the vector (using the validation set).
        The objective is the Brier score of the scaled logits, or, when
        `binary_loss` is set, mean((max softmax probability - correct) ** 2).
        When `n_clusters` is set, classes are first grouped into equal-mass
        bins of their average top-class confidence on the validation set.
        valid_loader (DataLoader): validation set loader

        Returns self, with `self.vector_params` updated in place.
        """
        # Same device choice as the rest of the notebook: CUDA when available.
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.to(device)

        # First: collect all the logits and labels for the validation set
        logits_list = []
        labels_list = []
        with torch.no_grad():
            for input, label in valid_loader:
                logits_list.append(self.model(input.to(device)))
                labels_list.append(label.to(device))
            logits = torch.cat(logits_list)
            labels = torch.cat(labels_list)

            certainties, y_pred = torch.softmax(logits, dim=1).max(dim=1)
            if self.binary_loss:
                # 1.0 where the unscaled prediction is correct, else 0.0
                binary_labels = (y_pred == labels).float().unsqueeze(1)

            if self.n_clusters is not None:
                avg_confid_per_class = torch.tensor([certainties[labels == l].mean().item() for l in range(self.n_classes)])
                boundaries = torch.tensor(self.get_adaptive_bins(avg_confid_per_class.numpy(), self.n_clusters))
                self.buckets = torch.bucketize(avg_confid_per_class, boundaries)

        # Next: optimize the scaling parameters
        optimizer = torch.optim.LBFGS([self.vector_params], lr=0.01, max_iter=5000)

        def closure():
            # LBFGS re-evaluates the objective through this callback.
            optimizer.zero_grad()
            if self.binary_loss:
                probas = torch.softmax(self.scale(logits), dim=1)
                max_probas = probas.max(dim=1, keepdim=True).values
                loss = ((max_probas - binary_labels) ** 2).mean()
            else:
                loss = BrierScore()(self.scale(logits), labels)
            loss.backward()
            return loss

        optimizer.step(closure)

        return self


In [None]:
# Number of class clusters passed to the clustered vector-scaling fits below.
n_clusters = 5

In [None]:
# Per-class vector scaling fitted on the Brier score; `num_classes` replaces
# the duplicated literal 1000.
model = ModelWithVectorScaling(classifier, num_classes)
model.set_vector(valid_loader)
baseline_VS = model.get_vector().detach().cpu()

In [None]:
# Per-class vector scaling fitted with the binary (confidence-vs-correctness)
# loss; the flag is passed by keyword so the call is self-explanatory.
model = ModelWithVectorScaling(classifier, num_classes, binary_loss=True)
model.set_vector(valid_loader)
BVS = model.get_vector().detach().cpu()

In [None]:
# Clustered vector scaling (one factor per cluster of classes), Brier-score fit.
model = ModelWithVectorScaling(classifier, num_classes, n_clusters=n_clusters)
model.set_vector(valid_loader)
VS_clustered = model.get_vector().detach().cpu()

In [None]:
# Clustered vector scaling fitted with the binary (confidence-vs-correctness) loss.
model = ModelWithVectorScaling(classifier, num_classes, binary_loss=True, n_clusters=n_clusters)
model.set_vector(valid_loader)
BVS_clustered = model.get_vector().detach().cpu()

In [None]:
# Report ECE and AUROC on the test logits for every calibrator. Each entry
# pairs the printed header with the rescaling applied to the raw logits
# (division by a temperature or multiplication by a scaling vector); one loop
# replaces seven copy-pasted stanzas.
# NOTE(review): this cell appears three times verbatim in the notebook;
# keeping a single copy would be enough.
calibrations = [
    ('original', lambda z: z / 1),
    ('\nbaseline', lambda z: z / baseline_temp),
    ('\noptimal', lambda z: z / optimal_temp),
    ('\nbaseline VS', lambda z: z * baseline_VS),
    ('\nBVS', lambda z: z * BVS),
    ('\nVS clustered', lambda z: z * VS_clustered),
    ('\nBVS clustered', lambda z: z * BVS_clustered),
]
for label, rescale in calibrations:
    probs = torch.softmax(rescale(all_logits_test), axis=1)
    certainties, y_pred = probs.max(axis=1)
    correct = y_pred == all_labels_test
    # Column 0: confidence, column 1: whether the prediction was correct.
    samples_certainties = torch.stack((certainties.cpu(), correct.cpu()), dim=1)
    ece, _ = ECE_calc(samples_certainties)
    auroc = AUROC(samples_certainties)
    print(label)
    print(ece)
    print(auroc)


# probs = torch.softmax(all_logits_test*optimal_VS, axis=1)
# certainties, y_pred = probs.max(axis=1)
# correct = y_pred == all_labels_test
# samples_certainties = torch.stack((certainties.cpu(), correct.cpu()), dim=1)
# ece, _ = ECE_calc(samples_certainties)
# auroc = AUROC(samples_certainties)
# print('\noptimal VS')
# print(ece)
# print(auroc)


In [None]:
# Report ECE and AUROC on the test logits for every calibrator. Each entry
# pairs the printed header with the rescaling applied to the raw logits
# (division by a temperature or multiplication by a scaling vector); one loop
# replaces seven copy-pasted stanzas.
# NOTE(review): this cell appears three times verbatim in the notebook;
# keeping a single copy would be enough.
calibrations = [
    ('original', lambda z: z / 1),
    ('\nbaseline', lambda z: z / baseline_temp),
    ('\noptimal', lambda z: z / optimal_temp),
    ('\nbaseline VS', lambda z: z * baseline_VS),
    ('\nBVS', lambda z: z * BVS),
    ('\nVS clustered', lambda z: z * VS_clustered),
    ('\nBVS clustered', lambda z: z * BVS_clustered),
]
for label, rescale in calibrations:
    probs = torch.softmax(rescale(all_logits_test), axis=1)
    certainties, y_pred = probs.max(axis=1)
    correct = y_pred == all_labels_test
    # Column 0: confidence, column 1: whether the prediction was correct.
    samples_certainties = torch.stack((certainties.cpu(), correct.cpu()), dim=1)
    ece, _ = ECE_calc(samples_certainties)
    auroc = AUROC(samples_certainties)
    print(label)
    print(ece)
    print(auroc)


# probs = torch.softmax(all_logits_test*optimal_VS, axis=1)
# certainties, y_pred = probs.max(axis=1)
# correct = y_pred == all_labels_test
# samples_certainties = torch.stack((certainties.cpu(), correct.cpu()), dim=1)
# ece, _ = ECE_calc(samples_certainties)
# auroc = AUROC(samples_certainties)
# print('\noptimal VS')
# print(ece)
# print(auroc)


In [None]:
# Report ECE and AUROC on the test logits for every calibrator. Each entry
# pairs the printed header with the rescaling applied to the raw logits
# (division by a temperature or multiplication by a scaling vector); one loop
# replaces seven copy-pasted stanzas.
# NOTE(review): this cell appears three times verbatim in the notebook;
# keeping a single copy would be enough.
calibrations = [
    ('original', lambda z: z / 1),
    ('\nbaseline', lambda z: z / baseline_temp),
    ('\noptimal', lambda z: z / optimal_temp),
    ('\nbaseline VS', lambda z: z * baseline_VS),
    ('\nBVS', lambda z: z * BVS),
    ('\nVS clustered', lambda z: z * VS_clustered),
    ('\nBVS clustered', lambda z: z * BVS_clustered),
]
for label, rescale in calibrations:
    probs = torch.softmax(rescale(all_logits_test), axis=1)
    certainties, y_pred = probs.max(axis=1)
    correct = y_pred == all_labels_test
    # Column 0: confidence, column 1: whether the prediction was correct.
    samples_certainties = torch.stack((certainties.cpu(), correct.cpu()), dim=1)
    ece, _ = ECE_calc(samples_certainties)
    auroc = AUROC(samples_certainties)
    print(label)
    print(ece)
    print(auroc)


# probs = torch.softmax(all_logits_test*optimal_VS, axis=1)
# certainties, y_pred = probs.max(axis=1)
# correct = y_pred == all_labels_test
# samples_certainties = torch.stack((certainties.cpu(), correct.cpu()), dim=1)
# ece, _ = ECE_calc(samples_certainties)
# auroc = AUROC(samples_certainties)
# print('\noptimal VS')
# print(ece)
# print(auroc)


# Temperature scaling (TS) to optimize selective classification

In [None]:
# Re-fit the custom temperature on the validation loader; this rebinds both
# `model` and `optimal_temp`.
model = ModelWithTemperatureCustom(classifier)
model.set_temperature(valid_loader)
optimal_temp = model.temperature.item()

In [None]:
class ModelWithVectorScalingCustom(nn.Module):
    """
    A thin decorator, which wraps a model with vector scaling: the logits are
    multiplied element-wise by a learnable per-class vector (initialised to 1.5).

    model (nn.Module):
        A classification neural network
        NB: Output of the neural network should be the classification logits,
            NOT the softmax (or log softmax)!
    num_classes (int):
        Number of logits produced by `model` (length of the scaling vector).
        Defaults to 1000.
    """
    def __init__(self, model, num_classes=1000):
        super(ModelWithVectorScalingCustom, self).__init__()
        self.model = model.eval()
        self.vector = nn.Parameter(torch.ones(num_classes) * 1.5)

    def forward(self, input):
        logits = self.model(input)
        return self.temperature_scale(logits)

    def temperature_scale(self, logits):
        """
        Perform vector scaling on logits (the name is kept for compatibility
        with the temperature-scaling wrappers).
        """
        # self.vector has one entry per class and broadcasts over the batch dimension
        return logits * self.vector

    # This function probably should live outside of this class, but whatever
    def set_temperature(self, valid_loader):
        """
        Tune the scaling vector of the model (using the validation set).

        The objective is the binary cross-entropy between the top-1 softmax
        probability of the scaled logits and the 0/1 correctness of the
        prediction made from the *unscaled* logits. The printed "NLL" is this
        BCE value; the printed ECE comes from `_ECELoss`.

        valid_loader (DataLoader): validation set loader
        Returns self, with `self.vector` optimized in place.
        """
        # Use the GPU when there is one, otherwise stay on CPU.
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.to(device)
        nll_criterion = nn.BCELoss().to(device)
        ece_criterion = _ECELoss().to(device)

        # First: collect all the logits and labels for the validation set
        logits_list = []
        max_probs_list = []
        labels_list = []
        train_labels_list = []
        with torch.no_grad():
            for input, label in valid_loader:
                input = input.to(device)
                label = label.to(device)
                logits = self.model(input)
                probs = torch.softmax(logits, axis=1)
                y_pred = probs.argmax(axis=1)
                # Targets are fixed here, from the unscaled prediction; they are
                # not recomputed if scaling later changes the argmax.
                correct = (y_pred == label).float()
                logits_list.append(logits)
                max_probs_list.append(torch.max(probs, 1, keepdim=True).values)
                train_labels_list.append(correct)
                labels_list.append(label)
            logits = torch.cat(logits_list)
            train_probas = torch.cat(max_probs_list)
            labels = torch.cat(labels_list)
            train_labels = torch.cat(train_labels_list).unsqueeze(1)

        # Calculate BCE ("NLL") and ECE before scaling
        before_temperature_nll = nll_criterion(train_probas, train_labels).item()
        before_temperature_ece = ece_criterion(logits, labels).item()
        print('Before temperature - NLL: %.3f, ECE: %.3f' % (before_temperature_nll, before_temperature_ece))

        # Next: optimize the vector w.r.t. the BCE objective
        optimizer = torch.optim.LBFGS([self.vector], lr=0.01, max_iter=5000)

        def closure():
            # LBFGS re-evaluates the loss several times per step, hence the closure.
            optimizer.zero_grad()
            scaled_logits = self.temperature_scale(logits)
            probas = torch.softmax(scaled_logits, axis=1)
            max_probas = torch.max(probas, 1, keepdim=True).values
            loss = nll_criterion(max_probas, train_labels)
            loss.backward()
            return loss

        optimizer.step(closure)

        # Calculate BCE ("NLL") and ECE after scaling
        scaled_logits = self.temperature_scale(logits)
        probas = torch.softmax(scaled_logits, axis=1)
        max_probas = torch.max(probas, 1, keepdim=True).values
        after_temperature_nll = nll_criterion(max_probas, train_labels).item()
        after_temperature_ece = ece_criterion(scaled_logits, labels).item()
        print('After temperature - NLL: %.3f, ECE: %.3f' % (after_temperature_nll, after_temperature_ece))

        return self

# Fit the per-class scaling vector on the validation loader and keep a
# detached CPU copy of it as optimal_VS.
# Note: this rebinds the notebook-level name `model`.
model = ModelWithVectorScalingCustom(classifier)
model.set_temperature(valid_loader)
optimal_VS = model.vector.detach().cpu()

In [None]:
class ModelWithProbaScaling(nn.Module):
    """
    Wrap a classifier so that its softmax probabilities are multiplied by a
    single coefficient and clamped to [0, 1].

    The coefficient is accuracy / average top-1 confidence, measured once on
    `validation_dataloader` when the wrapper is built.

    model (nn.Module): classification network returning logits.
    validation_dataloader: iterable of (input, label) batches.
    """
    def __init__(self, model, validation_dataloader):
        super(ModelWithProbaScaling, self).__init__()
        # Use the GPU when there is one, otherwise stay on CPU.
        self._device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = model.eval().to(self._device)
        self.coefficient = self.compute_coefficient(validation_dataloader)

    def forward(self, input):
        """Return the rescaled probabilities; `input` must already be on the model's device."""
        logits = self.model(input)
        probas = torch.softmax(logits, axis=1)
        # Rescaled rows no longer sum to 1; the clamp only keeps each entry a valid probability.
        probas = torch.clamp(probas * self.coefficient, 0, 1)
        return probas

    def compute_coefficient(self, dataloader):
        """
        Run the wrapped model over `dataloader` and return
        accuracy / average top-1 confidence as a Python float.

        Raises ValueError if the loader yields no batch.
        """
        certainties_list = []
        correct_list = []
        with torch.no_grad():
            for input, label in dataloader:
                input = input.to(self._device)
                label = label.to(self._device)
                probs = torch.softmax(self.model(input), axis=1)
                # Only the top-1 confidence and its correctness are needed,
                # so the full probability matrix is not kept.
                batch_certainties, batch_pred = probs.max(axis=1)
                certainties_list.append(batch_certainties)
                correct_list.append(batch_pred == label)
        if not certainties_list:
            raise ValueError('dataloader yielded no batch; cannot compute the scaling coefficient')
        certainties = torch.cat(certainties_list)
        correct = torch.cat(correct_list)
        accuracy = correct.float().mean().item()
        avg_confidence = certainties.mean().item()
        coefficient = accuracy / avg_confidence
        print(f'Accuracy: {accuracy:.3f}, Avg confidence: {avg_confidence:.3f}')
        return coefficient
    
# Building the wrapper runs the classifier over valid_loader once to compute
# the accuracy / average-confidence coefficient.
model_scaled_proba = ModelWithProbaScaling(classifier, valid_loader)

## Eval AUROC

In [None]:
# Run the classifier over the test loader and store every logit row and label
# in preallocated CPU tensors.
# NOTE(review): the 1000 columns assume a 1000-class model, and the row count
# assumes test_loader yields exactly len(test_indices) samples — confirm.
all_logits = torch.zeros((len(test_indices), 1000))
all_labels = torch.zeros(len(test_indices), dtype=int)
i = 0  # write offset of the next batch
for x, y in test_loader:
    with torch.no_grad():
        logits = classifier(x.to(device))
    all_logits[i:i+logits.shape[0], :] = logits.cpu()
    all_labels[i:i+logits.shape[0]] = y.cpu()
    i += logits.shape[0]

In [None]:
# Temperature 1 leaves the logits untouched: metrics of the uncalibrated model.
t = 1
probs = torch.softmax(all_logits/t, axis=1)
certainties, y_pred = probs.max(axis=1)
correct = y_pred == all_labels
# One row per sample: (top-1 confidence, correctness), as passed to ECE_calc and AUROC
samples_certainties = torch.stack((certainties.cpu(), correct.cpu()), dim=1)
ece, _ = ECE_calc(samples_certainties)
auroc = AUROC(samples_certainties)
print('original')
print(ece)
print(auroc)


# Same evaluation with the logits divided by baseline_temp.
# NOTE(review): baseline_temp comes from an earlier cell — confirm it was fit
# for the current classifier.
t = baseline_temp
probs = torch.softmax(all_logits/t, axis=1)
certainties, y_pred = probs.max(axis=1)
correct = y_pred == all_labels
samples_certainties = torch.stack((certainties.cpu(), correct.cpu()), dim=1)
ece, _ = ECE_calc(samples_certainties)
auroc = AUROC(samples_certainties)
print('\nbaseline')
print(ece)
print(auroc)


# Disabled: evaluation with optimal_temp.
# t = optimal_temp
# probs = torch.softmax(all_logits/t, axis=1)
# certainties, y_pred = probs.max(axis=1)
# correct = y_pred == all_labels
# samples_certainties = torch.stack((certainties.cpu(), correct.cpu()), dim=1)
# ece, _ = ECE_calc(samples_certainties)
# auroc = AUROC(samples_certainties)
# print('\noptimal')
# print(ece)
# print(auroc)


# Vector scaling: the logits are multiplied class-wise by baseline_VS
# (defined in an earlier cell) instead of being divided by a scalar.
probs = torch.softmax(all_logits*baseline_VS, axis=1)
certainties, y_pred = probs.max(axis=1)
correct = y_pred == all_labels
samples_certainties = torch.stack((certainties.cpu(), correct.cpu()), dim=1)
ece, _ = ECE_calc(samples_certainties)
auroc = AUROC(samples_certainties)
print('\nbaseline VS')
print(ece)
print(auroc)

# probs = torch.softmax(all_logits*optimal_VS, axis=1)
# certainties, y_pred = probs.max(axis=1)
# correct = y_pred == all_labels
# samples_certainties = torch.stack((certainties.cpu(), correct.cpu()), dim=1)
# ece, _ = ECE_calc(samples_certainties)
# auroc = AUROC(samples_certainties)
# print('\noptimal VS')
# print(ece)
# print(auroc)


In [None]:
# Collect the rescaled probabilities returned by model_scaled_proba over the
# test loader, then report ECE / AUROC of its top-1 confidence.
# Note: all_labels is re-created here, overwriting the tensor from the logits cell.
# NOTE(review): the 1000 columns assume a 1000-class model — confirm.
all_probas = torch.zeros((len(test_indices), 1000))
all_labels = torch.zeros(len(test_indices), dtype=int)
i = 0  # write offset of the next batch
for x, y in test_loader:
    with torch.no_grad():
        probas = model_scaled_proba(x.to(device))
    all_probas[i:i+probas.shape[0], :] = probas.cpu()
    all_labels[i:i+probas.shape[0]] = y.cpu()
    i += probas.shape[0]
    
certainties, y_pred = all_probas.max(axis=1)
correct = y_pred == all_labels
# One row per sample: (top-1 confidence, correctness)
samples_certainties = torch.stack((certainties.cpu(), correct.cpu()), dim=1)

ece, _ = ECE_calc(samples_certainties)
auroc = AUROC(samples_certainties)
print('\nscaled proba')
print(ece)
print(auroc)

# netcal library

In [None]:
from netcal.scaling import TemperatureScaling, LogisticCalibration
from netcal.metrics import ECE, MMCE
from netcal.presentation import ReliabilityDiagram

# Number of confidence bins given to the netcal ECE metric and reliability diagram
n_bins = 15

# Note: `ece` is rebound here to a netcal ECE object; earlier cells use the
# same name for the value returned by ECE_calc.
ece = ECE(n_bins)
diagram = ReliabilityDiagram(n_bins)
temperature = TemperatureScaling()

# Collect the validation logits and labels into preallocated CPU tensors.
# NOTE(review): the 1000 columns assume a 1000-class model — confirm.
all_logits_valid = torch.zeros((len(valid_indices), 1000))
all_labels_valid = torch.zeros(len(valid_indices), dtype=int)
i = 0  # write offset of the next batch
for x, y in valid_loader:
    with torch.no_grad():
        logits = classifier(x.to(device))
    all_logits_valid[i:i+logits.shape[0], :] = logits.cpu()
    all_labels_valid[i:i+logits.shape[0]] = y.cpu()
    i += logits.shape[0]
# netcal is given numpy arrays: softmax probabilities and integer labels.
confidences_valid = torch.softmax(all_logits_valid, axis=1).numpy()
ground_truth_valid = all_labels_valid.numpy()
# The test arrays reuse all_logits / all_labels filled by the evaluation cells.
confidences_test = torch.softmax(all_logits, axis=1).numpy()
ground_truth_test = all_labels.numpy()

In [None]:
# netcal ECE of the uncalibrated softmax probabilities on the test set
ECE(n_bins).measure(confidences_test, ground_truth_test)

In [None]:
# netcal MMCE of the uncalibrated softmax probabilities on the test set
# (the import repeats the one in the netcal setup cell).
from netcal.metrics import ECE, MMCE
MMCE().measure(confidences_test, ground_truth_test)

In [None]:
# Reliability diagram of the uncalibrated test probabilities; the trailing `;`
# suppresses the text repr of the returned object.
diagram.plot(confidences_test, ground_truth_test);

In [None]:
# Fit netcal temperature scaling on the validation probabilities, apply it to
# the test probabilities and display the resulting ECE.
temperature = TemperatureScaling(method='mle')
temperature.fit(confidences_valid, ground_truth_valid)
calibrated = temperature.transform(confidences_test)
ece.measure(calibrated, ground_truth_test)

In [None]:
# Same fit as the previous cell, with use_cuda=True.
temperature = TemperatureScaling(method='mle', use_cuda=True)
temperature.fit(confidences_valid, ground_truth_valid)
calibrated = temperature.transform(confidences_test)
# NOTE(review): this ECE value is computed but not displayed — only the cell's
# last expression (`temperature`) is shown.
ece.measure(calibrated, ground_truth_test)
temperature

In [None]:
# Fit netcal LogisticCalibration on the validation probabilities, apply it to
# the test probabilities and display the resulting ECE.
lc = LogisticCalibration()
lc.fit(confidences_valid, ground_truth_valid)
calibrated = lc.transform(confidences_test)
ece.measure(calibrated, ground_truth_test)

In [None]:
# Display the fitted LogisticCalibration object
lc

In [None]:
# Display the `temperature` attribute of the most recently fitted TemperatureScaling object
temperature.temperature

In [None]:
# Toy check: fit on two 2-class samples, each predicted correctly with
# confidence 0.8, and display the fitted temperature.
temperature = TemperatureScaling()
temperature.fit(np.array([[0.8, 0.2], [0.2, 0.8]]), np.array([0, 1])).temperature

In [None]:
# Experiment: fit temperature scaling against all-ones labels and display the
# fitted temperature. The label vector is sized from the validation
# probabilities instead of a hard-coded 5000, so it always matches them.
temperature = TemperatureScaling()
temperature.fit(confidences_valid, np.ones(len(confidences_valid))).temperature

In [None]:
# Toy check: fit LogisticCalibration on two 2-class samples; the evaluation on
# the test set is left disabled below.
lc = LogisticCalibration()
lc.fit(np.array([[0.8, 0.2], [0.2, 0.8]]), np.array([0, 1]))
# calibrated = lc.transform(confidences_test)
# ece.measure(calibrated, ground_truth_test)

## CIFAR

In [None]:
from focal_calibration.Net.resnet import resnet50 as cifar_resnet50, resnet110 as cifar_resnet110
from focal_calibration.Data import cifar100 as cifar100_loader, cifar10 as cifar10_loader
from df_posthoc_calibration.assessment import toplabel_ece



# Maps a model name to (architecture constructor, checkpoint path).
# The paths are joined to './focal_calibration_models/' when the weights are loaded.
models_and_weights_path_cifar = {
    'CIFAR10_resnet50': (cifar_resnet50, 'CIFAR10/resnet50_brier_score_350.model'),
    'CIFAR10_resnet110': (cifar_resnet110, 'CIFAR10/resnet110_brier_score_350.model'),

    'CIFAR100_resnet50': (cifar_resnet50, 'CIFAR100/resnet50_brier_score_350.model'),
    'CIFAR100_resnet110': (cifar_resnet110, 'CIFAR100/resnet110_brier_score_430.model')
}

def convert_state_dict(state_dict):
    """
    Return a new OrderedDict with the leading `module.` removed from the keys.

    Keys that do not start with `module.` are copied unchanged, so a
    checkpoint without the prefix is not corrupted. Values and key order are
    preserved.
    """
    # https://discuss.pytorch.org/t/solved-keyerror-unexpected-key-module-encoder-embedding-weight-in-state-dict/1686/4
    from collections import OrderedDict
    prefix = 'module.'
    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        # Strip the prefix only when it is actually there
        name = k[len(prefix):] if k.startswith(prefix) else k
        new_state_dict[name] = v
    return new_state_dict
    
# Select the CIFAR-10 ResNet-50 entry, build the architecture and load its weights.
# Note: this rebinds `classifier`, replacing the model used by the cells above.
model_name = 'CIFAR10_resnet50'
num_classes = 10
architecture, weights_path = models_and_weights_path_cifar[model_name]
classifier = architecture(num_classes=num_classes, temp=1.0).eval().cuda()
# The checkpoint keys go through convert_state_dict before being loaded.
classifier.load_state_dict(convert_state_dict(torch.load('./focal_calibration_models/' + weights_path)))

In [None]:
# CIFAR-100 variant (disabled); switch together with model_name / num_classes.
# train_loader, valid_loader = cifar100_loader.get_train_valid_loader(batch_size=256, augment=False, random_seed=1, data_dir=Path(os.path.expandvars('$DSDIR/'))) # seed=1 in original github
# test_loader = cifar100_loader.get_test_loader(batch_size=256, data_dir=Path(os.path.expandvars('$DSDIR/')))

# CIFAR-10 loaders read from $DSDIR.
# Note: valid_loader and test_loader are rebound, replacing the loaders used by the cells above.
train_loader, valid_loader = cifar10_loader.get_train_valid_loader(batch_size=256, augment=False, random_seed=1, data_dir=Path(os.path.expandvars('$DSDIR/'))) # seed=1 in original github
test_loader = cifar10_loader.get_test_loader(batch_size=256, data_dir=Path(os.path.expandvars('$DSDIR/')))


In [None]:
loader = test_loader

# Run the CIFAR classifier over the loader and store every logit row and label
# in preallocated CPU tensors.
all_logits = torch.zeros((len(loader.sampler), num_classes))
all_labels = torch.zeros(len(loader.sampler), dtype=int)
i = 0  # write offset of the next batch
for x, y in loader:
    with torch.no_grad():
        logits = classifier(x.to(device))
    all_logits[i:i+logits.shape[0], :] = logits.cpu()
    all_labels[i:i+logits.shape[0]] = y.cpu()
    i += logits.shape[0]

# Same evaluation for each temperature: top-1 softmax confidence of the scaled
# logits, then ECE and AUROC of the (confidence, correctness) pairs.
# NOTE(review): optimal_temp and baseline_temp are not fit anywhere in this
# CIFAR section; presumably they still hold values fit for the previous
# classifier — refit them on the CIFAR valid_loader before trusting these numbers.
for title, t in [('original', 1), ('optimal', optimal_temp), ('baseline', baseline_temp)]:
    probs = torch.softmax(all_logits/t, axis=1)
    certainties, y_pred = probs.max(axis=1)
    correct = y_pred == all_labels
    samples_certainties = torch.stack((certainties.cpu(), correct.cpu()), dim=1)
    ece, _ = ECE_calc(samples_certainties)
    auroc = AUROC(samples_certainties)
    print(title)
    print(ece)
    print(auroc)

# Test histogram binning

In [14]:
class Multi_HB_binary():
    """
    Histogram binning with one HB_binary calibrator per cluster of classes.

    Classes are grouped into `n_clusters` buckets by their average top-1
    confidence on the calibration set (equal-width boundaries). Each sample is
    routed to the bucket of its *predicted* class, and one HB_binary is fit
    per bucket on (top-1 confidence, correctness).

    n_clusters (int): number of class buckets.
    n_bins (int): number of bins of each HB_binary.
    n_classes (int): number of classes of the classifier; defaults to 1000.
    """
    def __init__(self, n_clusters, n_bins=15, n_classes=1000):
        self.n_bins = n_bins
        self.n_clusters = n_clusters
        self.n_classes = n_classes

    def get_adaptive_bins(self, predictions, num_bins):
        """Returns upper edges for binning an equal number of datapoints per bin."""
        if np.size(predictions) == 0:
            return np.linspace(0, 1, num_bins+1)[:-1]

        edge_indices = np.linspace(0, len(predictions), num_bins, endpoint=False)

        # Round into integers for indexing. If num_bins does not evenly divide
        # len(predictions), this means that bin sizes will alternate between SIZE and
        # SIZE+1.
        edge_indices = np.round(edge_indices).astype(int)

        # If there are many more bins than data points, some indices will be
        # out-of-bounds by one. Set them to be within bounds:
        edge_indices = np.minimum(edge_indices, len(predictions) - 1)

        # Obtain the edge values:
        edges = np.sort(predictions)[edge_indices]

        # Following the convention of numpy.digitize, we do not include the leftmost
        # edge (i.e. return the upper bin edges):
        return edges[1:]

    def fit(self, logits, labels):
        """
        Cluster the classes and fit one HB_binary per bucket.

        logits: (n_samples, n_classes) tensor of calibration logits.
        labels: (n_samples,) tensor of true class indices.
        Sets `self.buckets` (bucket id of every class) and `self.hb_dict`
        (bucket id -> fitted HB_binary). Prints the sample count of each bucket.
        """
        probs = torch.softmax(logits, axis=1)
        certainties, y_pred = probs.max(axis=1)
        correct = (y_pred == labels)
        # Average confidence over the samples whose TRUE label is l.
        # A class with no sample gets NaN here (mean of an empty tensor).
        avg_confid_per_class = torch.tensor([certainties[labels == l].mean().item() for l in range(self.n_classes)])
        # Equal-width boundaries; -1e-6 to include min value in first bucket,
        # so bucket ids produced below start at 1.
        boundaries = torch.linspace(avg_confid_per_class.min()-1e-6, avg_confid_per_class.max(), self.n_clusters+1)
        # Alternatives tried by the author (currently unused): equal-mass
        # boundaries from get_adaptive_bins, or clustering on per-class accuracy.
        self.buckets = torch.bucketize(avg_confid_per_class, boundaries)

        # Bucket of each sample = bucket of its predicted class (vectorized lookup)
        samples_buckets = self.buckets[y_pred.to(self.buckets.device)]
        self.hb_dict = {}
        for b in self.buckets.unique().tolist():
            in_bucket = (samples_buckets == b).to(certainties.device)
            hb = HB_binary(n_bins=self.n_bins)
            print(len(certainties[in_bucket]))
            hb.fit(certainties[in_bucket].numpy(), correct[in_bucket].numpy())
            self.hb_dict[b] = hb
    
    def predict_proba(self, logits):
        """
        Return the calibrated top-1 confidence of every row of `logits`.

        Each sample is calibrated by the HB_binary of its predicted class's
        bucket. The calibrator is called once per bucket on the whole array of
        that bucket's confidences rather than once per sample.
        Returns a 1-D tensor with the dtype and device of the softmax confidences.
        """
        probs = torch.softmax(logits, axis=1)
        certainties, y_pred = probs.max(axis=1)

        samples_buckets = self.buckets[y_pred.to(self.buckets.device)]

        all_probs_calib = torch.zeros_like(certainties)
        for b in samples_buckets.unique().tolist():
            in_bucket = (samples_buckets == b).to(certainties.device)
            calibrated = self.hb_dict[b].predict_proba(certainties[in_bucket].cpu().numpy())
            # atleast_1d: a one-sample bucket may come back as a scalar.
            # Explicit dtype/device: masked assignment requires them to match.
            all_probs_calib[in_bucket] = torch.as_tensor(
                np.atleast_1d(calibrated),
                dtype=all_probs_calib.dtype,
                device=all_probs_calib.device,
            )
        
        return all_probs_calib

# With a single cluster all classes share one bucket, so one HB_binary is fit
# on the validation samples; fit prints the number of samples in each bucket.
mhb = Multi_HB_binary(n_clusters=1)
mhb.fit(all_logits_valid, all_labels_valid)

5000


In [None]:
# fit HB on full val set
probs = torch.softmax(all_logits_valid, axis=1)
certainties, y_pred = probs.max(axis=1)
correct = y_pred == all_labels_valid
hb = HB_binary()
hb.fit(certainties.numpy(), correct.numpy())

# fit HB on partial val set
probs = torch.softmax(all_logits_valid, axis=1)
certainties, y_pred = probs.max(axis=1)
correct = y_pred == all_labels_valid
labels_selected = torch.randperm(1000)[:980]
idx = torch.isin(y_pred, labels_selected)
hb_p = HB_binary()
hb_p.fit(certainties[idx].numpy(), correct[idx].numpy())

# EVAL
probs = torch.softmax(all_logits_test, axis=1)
certainties, y_pred = probs.max(axis=1)
idx = torch.isin(y_pred, labels_selected)

probs = torch.softmax(all_logits_test[idx], axis=1)
certainties, y_pred = probs.max(axis=1)
certainties = torch.tensor(hb.predict_proba(certainties.cpu().numpy()))
correct = y_pred == all_labels_test[idx]
samples_certainties = torch.stack((certainties, correct), dim=1)
auroc = AUROC(samples_certainties)
ece, _ = ECE_calc(samples_certainties)
# ece = gce(all_labels_test, probs, binning_scheme='even', class_conditional=False, max_prob=True, norm='l1', num_bins=15)
print('\nHB')
print(ece)
print(auroc)

probs = torch.softmax(all_logits_test[idx], axis=1)
certainties, y_pred = probs.max(axis=1)
certainties = torch.tensor(hb_p.predict_proba(certainties.cpu().numpy()))
correct = y_pred == all_labels_test[idx]
samples_certainties = torch.stack((certainties, correct), dim=1)
auroc = AUROC(samples_certainties)
ece, _ = ECE_calc(samples_certainties)
# ece = gce(all_labels_test, probs, binning_scheme='even', class_conditional=False, max_prob=True, norm='l1', num_bins=15)
print('\nHBP')
print(ece)
print(auroc)

In [11]:
# fit HB on full val set
probs = torch.softmax(all_logits_valid, axis=1)
certainties, y_pred = probs.max(axis=1)
correct = y_pred == all_labels_valid
hb = HB_binary()
hb.fit(certainties.numpy(), correct.numpy())

# fit HB on partial val set
probs = torch.softmax(all_logits_valid, axis=1)
certainties, y_pred = probs.max(axis=1)
correct = y_pred == all_labels_valid
labels_selected = torch.randperm(1000)[:900]
idx = torch.isin(y_pred, labels_selected)
hb_p = HB_binary()
hb_p.fit(certainties[idx].numpy(), correct[idx].numpy())

# EVAL
probs = torch.softmax(all_logits_test, axis=1)
certainties, y_pred = probs.max(axis=1)
certainties = torch.tensor(hb.predict_proba(certainties.cpu().numpy()))
correct = y_pred == all_labels_test
samples_certainties = torch.stack((certainties, correct), dim=1)
auroc = AUROC(samples_certainties)
ece, _ = ECE_calc(samples_certainties)
# ece = gce(all_labels_test, probs, binning_scheme='even', class_conditional=False, max_prob=True, norm='l1', num_bins=15)
print('\nHB on full')
print(ece)
print(auroc)

probs = torch.softmax(all_logits_test, axis=1)
certainties, y_pred = probs.max(axis=1)
certainties = torch.tensor(hb_p.predict_proba(certainties.cpu().numpy()))
correct = y_pred == all_labels_test
samples_certainties = torch.stack((certainties, correct), dim=1)
auroc = AUROC(samples_certainties)
ece, _ = ECE_calc(samples_certainties)
# ece = gce(all_labels_test, probs, binning_scheme='even', class_conditional=False, max_prob=True, norm='l1', num_bins=15)
print('\nHBP on full')
print(ece)
print(auroc)

probs = torch.softmax(all_logits_test, axis=1)
certainties, y_pred = probs.max(axis=1)
idx = torch.isin(y_pred, labels_selected)

probs = torch.softmax(all_logits_test[idx], axis=1)
certainties, y_pred = probs.max(axis=1)
certainties = torch.tensor(hb.predict_proba(certainties.cpu().numpy()))
correct = y_pred == all_labels_test[idx]
samples_certainties = torch.stack((certainties, correct), dim=1)
auroc = AUROC(samples_certainties)
ece, _ = ECE_calc(samples_certainties)
# ece = gce(all_labels_test, probs, binning_scheme='even', class_conditional=False, max_prob=True, norm='l1', num_bins=15)
print('\nHB on partial')
print(ece)
print(auroc)

probs = torch.softmax(all_logits_test[idx], axis=1)
certainties, y_pred = probs.max(axis=1)
certainties = torch.tensor(hb_p.predict_proba(certainties.cpu().numpy()))
correct = y_pred == all_labels_test[idx]
samples_certainties = torch.stack((certainties, correct), dim=1)
auroc = AUROC(samples_certainties)
ece, _ = ECE_calc(samples_certainties)
# ece = gce(all_labels_test, probs, binning_scheme='even', class_conditional=False, max_prob=True, norm='l1', num_bins=15)
print('\nHBP on partial')
print(ece)
print(auroc)


HB on full
tensor(0.0096, dtype=torch.float64)
0.799363718461624

HBP on full
tensor(0.0093, dtype=torch.float64)
0.796778330984703

HB on partial
tensor(0.0093, dtype=torch.float64)
0.7985101709779999

HBP on partial
tensor(0.0096, dtype=torch.float64)
0.7956449808694515


In [17]:
# fit HB on full val set
probs = torch.softmax(all_logits_valid, axis=1)
certainties, y_pred = probs.max(axis=1)
correct = y_pred == all_labels_valid
hb = HB_binary()
hb.fit(certainties.numpy(), correct.numpy())

# fit HB on partial val set
probs = torch.softmax(all_logits_valid, axis=1)
certainties, y_pred = probs.max(axis=1)
correct = y_pred == all_labels_valid
idx = torch.randperm(certainties.size(0))[:3000]
hb_p = HB_binary()
hb_p.fit(certainties[idx].numpy(), correct[idx].numpy())

# compute clusters and fit MHB on full val set
mhb = Multi_HB_binary(n_clusters=1)
mhb.fit(all_logits_valid, all_labels_valid)
samples_buckets = torch.tensor([mhb.buckets[l] for l in y_pred])

# fit HB on one bucket of val set
b = 1
hb_b = HB_binary()
hb_b.fit(certainties[samples_buckets==b].numpy(), correct[samples_buckets==b].numpy())


# EVAL
probs = torch.softmax(all_logits_test, axis=1)
certainties, y_pred = probs.max(axis=1)
samples_buckets = torch.tensor([mhb.buckets[l] for l in y_pred])


probs = torch.softmax(all_logits_test[samples_buckets==b], axis=1)
certainties, y_pred = probs.max(axis=1)
certainties = torch.tensor(hb.predict_proba(certainties.cpu().numpy()))
correct = y_pred == all_labels_test[samples_buckets==b]
samples_certainties = torch.stack((certainties, correct), dim=1)
auroc = AUROC(samples_certainties)
ece, _ = ECE_calc(samples_certainties)
# ece = gce(all_labels_test, probs, binning_scheme='even', class_conditional=False, max_prob=True, norm='l1', num_bins=15)
print('\nHB')
print(ece)
print(auroc)

probs = torch.softmax(all_logits_test[samples_buckets==b], axis=1)
certainties, y_pred = probs.max(axis=1)
certainties = torch.tensor(hb_p.predict_proba(certainties.cpu().numpy()))
correct = y_pred == all_labels_test[samples_buckets==b]
samples_certainties = torch.stack((certainties, correct), dim=1)
auroc = AUROC(samples_certainties)
ece, _ = ECE_calc(samples_certainties)
# ece = gce(all_labels_test, probs, binning_scheme='even', class_conditional=False, max_prob=True, norm='l1', num_bins=15)
print('\nHBP')
print(ece)
print(auroc)

probs = torch.softmax(all_logits_test[samples_buckets==b], axis=1)
certainties, y_pred = probs.max(axis=1)
certainties = torch.tensor(hb_b.predict_proba(certainties.cpu().numpy()))
correct = y_pred == all_labels_test[samples_buckets==b]
samples_certainties = torch.stack((certainties, correct), dim=1)
auroc = AUROC(samples_certainties)
ece, _ = ECE_calc(samples_certainties)
# ece = gce(all_labels_test, probs, binning_scheme='even', class_conditional=False, max_prob=True, norm='l1', num_bins=15)
print('\nHBB')
print(ece)
print(auroc)

probs = torch.softmax(all_logits_test[samples_buckets==b], axis=1)
certainties, y_pred = probs.max(axis=1)
certainties = mhb.predict_proba(all_logits_test[samples_buckets==b])
correct = y_pred == all_labels_test[samples_buckets==b]
samples_certainties = torch.stack((certainties, correct), dim=1)
auroc = AUROC(samples_certainties)
ece, _ = ECE_calc(samples_certainties)
# ece = gce(all_labels_test, probs, binning_scheme='even', class_conditional=False, max_prob=True, norm='l1', num_bins=15)
print('\nMHB')
print(ece)
print(auroc)

5000

HB
tensor(0.0096, dtype=torch.float64)
0.799363718461624

HBP
tensor(0.0152, dtype=torch.float64)
0.7999666069537394

HBB
tensor(0.0096, dtype=torch.float64)
0.7991581363340243

MHB
tensor(0.0096)
0.7988653676384246


In [None]:
# Compare three calibrators on the test samples routed to bucket b:
# the full-set HB, an HB fit on that bucket only, and the multi-HB.
probs = torch.softmax(all_logits_test, axis=1)
certainties, y_pred = probs.max(axis=1)
correct = y_pred == all_labels_test
# Bucket of each test sample = bucket of its predicted class
samples_buckets = torch.tensor([mhb.buckets[l] for l in y_pred])
# NOTE(review): Multi_HB_binary.fit as written puts the first boundary just
# below the smallest class average, so torch.bucketize yields bucket ids
# starting at 1 and b = 0 would select no sample. The previous cell uses
# b = 1 — confirm the intended bucket.
b = 0

# Full-set HB applied to the bucket's test samples
probs = torch.softmax(all_logits_test, axis=1)
certainties, y_pred = probs.max(axis=1)
certainties = torch.tensor(hb.predict_proba(certainties[samples_buckets==b].cpu().numpy()))
correct = y_pred == all_labels_test
samples_certainties = torch.stack((certainties, correct[samples_buckets==b]), dim=1)
auroc = AUROC(samples_certainties)
ece, _ = ECE_calc(samples_certainties)
# ece = gce(all_labels_test, probs, binning_scheme='even', class_conditional=False, max_prob=True, norm='l1', num_bins=15)
print('\nHB')
print(ece)
print(auroc)



# HB fit only on the validation samples routed to bucket b
probs = torch.softmax(all_logits_valid, axis=1)
certainties, y_pred = probs.max(axis=1)
correct = y_pred == all_labels_valid
samples_buckets_valid = torch.tensor([mhb.buckets[l] for l in y_pred])
hb_b = HB_binary()
hb_b.fit(certainties[samples_buckets_valid==b].numpy(), correct[samples_buckets_valid==b].numpy())

probs = torch.softmax(all_logits_test, axis=1)
certainties, y_pred = probs.max(axis=1)
certainties = torch.tensor(hb_b.predict_proba(certainties[samples_buckets==b].cpu().numpy()))
correct = y_pred == all_labels_test
samples_certainties = torch.stack((certainties, correct[samples_buckets==b]), dim=1)
auroc = AUROC(samples_certainties)
ece, _ = ECE_calc(samples_certainties)
# ece = gce(all_labels_test, probs, binning_scheme='even', class_conditional=False, max_prob=True, norm='l1', num_bins=15)
print('\nHB on bucket')
print(ece)
print(auroc)

# Multi-HB takes the bucket's logits directly and returns calibrated confidences
probs = torch.softmax(all_logits_test, axis=1)
certainties, y_pred = probs.max(axis=1)
certainties = mhb.predict_proba(all_logits_test[samples_buckets==b])
correct = y_pred == all_labels_test
samples_certainties = torch.stack((certainties, correct[samples_buckets==b]), dim=1)
auroc = AUROC(samples_certainties)
ece, _ = ECE_calc(samples_certainties)
# ece = gce(all_labels_test, probs, binning_scheme='even', class_conditional=False, max_prob=True, norm='l1', num_bins=15)
print('\nMHB')
print(ece)
print(auroc)


In [7]:
from df_posthoc_calibration.calibration import HB_binary, HB_toplabel

# Buffers filled batch by batch: one row of 1000 logits and one integer label
# per validation index.
all_logits_valid = torch.zeros((len(valid_indices), 1000))
all_labels_valid = torch.zeros(len(valid_indices), dtype=int)
i = 0
with torch.no_grad():  # inference only, no autograd graph needed
    for x, y in valid_loader:
        logits = classifier(x.to(device))
        batch_end = i + logits.shape[0]
        # Write this batch into rows [i, batch_end) of the CPU buffers.
        all_logits_valid[i:batch_end, :] = logits.cpu()
        all_labels_valid[i:batch_end] = y.cpu()
        i = batch_end

# Top-1 softmax confidence, predicted class, and whether the prediction is right.
probs = all_logits_valid.softmax(dim=1)
certainties, y_pred = torch.max(probs, dim=1)
correct = y_pred.eq(all_labels_valid)

# Fit HB_binary on (confidence, correctness) pairs from the validation split.
hb = HB_binary()
hb.fit(certainties.numpy(), correct.numpy())

# hb_tl = HB_toplabel()
# hb_tl.fit(probs.numpy(), all_labels_valid.numpy()+1)

In [None]:
def _report_calibration(section_title, confidences, hits):
    """Print a section title followed by the ECE and AUROC of one confidence score.

    Args:
        section_title: text printed before the two metrics.
        confidences: per-sample confidence tensor.
        hits: per-sample boolean tensor, True where the prediction equals the label.

    Returns:
        Tuple ``(samples, ece, auroc)``: the stacked two-column tensor handed to
        the metric functions, and the values returned by ``ECE_calc`` / ``AUROC``.
    """
    # Both columns are moved to the CPU before stacking, so every section is
    # handled the same way and a device mismatch cannot occur.
    samples = torch.stack((confidences.cpu(), hits.cpu()), dim=1)
    auroc_value = AUROC(samples)
    ece_value, _ = ECE_calc(samples)
    print(section_title)
    print(ece_value)
    print(auroc_value)
    return samples, ece_value, auroc_value


# ECE_calc provides the calibration error; the `gce(...)` variant from
# general_calibration_error that used to sit next to each section stays disabled.

# Max-softmax confidence at temperature 1, `baseline_temp` and `optimal_temp`.
for section_title, t in (
    ('original', 1),
    ('\nbaseline', baseline_temp),
    ('\noptimal', optimal_temp),
):
    probs = torch.softmax(all_logits_test/t, axis=1)
    certainties, y_pred = probs.max(axis=1)
    correct = y_pred == all_labels_test
    samples_certainties, ece, auroc = _report_calibration(section_title, certainties, correct)

# HB and MHB are both scored against the predictions of the unscaled logits.
probs = torch.softmax(all_logits_test, axis=1)
certainties, y_pred = probs.max(axis=1)
correct = y_pred == all_labels_test

# HB: `hb` maps the max-softmax confidences to calibrated values.
certainties = torch.tensor(hb.predict_proba(certainties.cpu().numpy()))
samples_certainties, ece, auroc = _report_calibration('\nHB', certainties, correct)

# MHB: `mhb` receives the raw logits.
certainties = mhb.predict_proba(all_logits_test)
samples_certainties, ece, auroc = _report_calibration('\nMHB', certainties, correct)


In [None]:
# Top-1 softmax confidence and predicted class for every test sample;
# the confidences are displayed as the cell output.
probs = all_logits_test.softmax(dim=1)
certainties, y_pred = torch.max(probs, dim=1)
certainties

In [None]:
# Recompute the raw test confidences here instead of re-reading `certainties`:
# feeding the variable back into itself made the cell non-idempotent (a second
# run would pass already-calibrated values through `hb` again) and dependent on
# whichever cell last assigned `certainties`.
certainties = torch.softmax(all_logits_test, dim=1).max(dim=1).values
certainties = torch.tensor(hb.predict_proba(certainties.cpu().numpy()))
certainties

In [None]:
# Top-1 softmax confidence and predicted class on the validation split.
probs = torch.softmax(all_logits_valid, axis=1)
certainties, y_pred = probs.max(axis=1)

# HB_binary is fitted on (confidence, was-the-prediction-correct) pairs, as in
# the other cells that fit it. The class labels themselves are not a binary
# target, so the correctness mask is passed instead of `all_labels_valid`.
hb = HB_binary()
hb.fit(certainties.numpy(), (y_pred == all_labels_valid).numpy())

In [None]:
# Display the `bin_upper_edges` attribute of the current `hb` calibrator
# (requires a cell that fits `hb` to have been run first).
hb.bin_upper_edges

# Test temperature scaling (TS) + histogram binning (HB)

In [10]:
# Fit one HB_binary per softmax temperature, in this order:
#   1             -> hb      (dividing by 1 leaves the logits unchanged)
#   baseline_temp -> hb_ts
#   optimal_temp  -> hb_bts
_binners = []
for t in (1, baseline_temp, optimal_temp):
    probs = torch.softmax(all_logits_valid / t, dim=1)
    certainties, y_pred = torch.max(probs, dim=1)
    correct = y_pred.eq(all_labels_valid)
    # Each calibrator sees (max-softmax confidence, prediction correct?) pairs.
    _binner = HB_binary()
    _binner.fit(certainties.numpy(), correct.numpy())
    _binners.append(_binner)

hb, hb_ts, hb_bts = _binners

In [12]:
# One row per reported section: (printed title, softmax temperature, optional
# HB_binary calibrator applied on top of the max-softmax confidence).
# The plain 'HB' row uses temperature 1, i.e. the unscaled logits.
# ECE_calc provides the calibration error; the `gce(...)` variant from
# general_calibration_error that used to sit next to each section stays disabled.
for section_title, t, binner in (
    ('original', 1, None),
    ('\nbaseline', baseline_temp, None),
    ('\noptimal', optimal_temp, None),
    ('\nHB', 1, hb),
    ('\nHB_TS', baseline_temp, hb_ts),
    ('\nHB_BTS', optimal_temp, hb_bts),
):
    probs = torch.softmax(all_logits_test/t, axis=1)
    certainties, y_pred = probs.max(axis=1)
    correct = y_pred == all_labels_test
    if binner is not None:
        # Replace the raw confidence with the calibrator's output.
        certainties = torch.tensor(binner.predict_proba(certainties.cpu().numpy()))
    # Both columns are moved to the CPU before stacking, so every section is
    # handled the same way and a device mismatch cannot occur.
    samples_certainties = torch.stack((certainties.cpu(), correct.cpu()), dim=1)
    auroc = AUROC(samples_certainties)
    ece, _ = ECE_calc(samples_certainties)
    print(section_title)
    print(ece)
    print(auroc)



original
tensor(0.4117)
0.8063290870659501

baseline
tensor(0.0297)
0.8604652882588941

optimal
tensor(0.0137)
0.8581556791927044

HB
tensor(0.0096, dtype=torch.float64)
0.7991581363340243

HB_TS
tensor(0.0059, dtype=torch.float64)
0.8574424065344745

HB_BTS
tensor(0.0069, dtype=torch.float64)
0.8553329513898562
