# Post-hoc Uncertainty Calibration by Test-Time Augmentation 

#### Import libraries

In [None]:
import torch
import torchvision.datasets as dset
import torchvision.transforms as trn
import torch.utils.data as data


from sklearn.isotonic import IsotonicRegression
from sklearn.metrics import balanced_accuracy_score
from scipy import optimize
from fastai.vision.all import *

import pandas as pd
import argparse
import ssl
import numpy as np

#### Set Seed for reproducibility

In [None]:
def set_seed(dls,x=42):
    """Seed all random number generators used by the pipeline with `x`.

    Seeds, in this order, Python's `random`, the `rng` attribute of `dls`,
    NumPy and PyTorch; puts cuDNN into deterministic mode with auto-tuning
    switched off; and seeds every CUDA device when CUDA is available.
    """
    for seed_fn in (random.seed, dls.rng.seed, np.random.seed, torch.manual_seed):
        seed_fn(x)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(x)

#### Specification of parser for experimental parameters

In [None]:
if __name__ == "__main__":
    # Command-line interface for the experimental parameters.
    # NOTE(review): the parsed `args` are not read anywhere in the visible
    # code; names such as `bs`, `n_bins`, `mult_val` and `mult_test` are used
    # later without being assigned from them - confirm where they are set.
    parser = argparse.ArgumentParser()
    for flag, options in (
        ("--model", {}),
        ("--dataset", {}),
        ("--batchsize", {"type": int}),
        ("--n_bins", {"type": int}),
        ("--mult_val", {"type": int}),
        ("--mult_test", {"type": int}),
    ):
        parser.add_argument(flag, **options)
    args = parser.parse_args()

#### Load model and identify metadata from filename

In [None]:
# Load the exported learner and read the experiment metadata from its file
# name: the "_"-separated fields give the dataset first, the model second and
# the image size last (just before the file extension).
learn = load_learner("models/" + trained_model)
metadata = trained_model.split("_")

dataset = metadata[0]
model = metadata[1]
imgSize = int(metadata[-1].split(".")[0])

# number of classes (left unset when the dataset name is not one of these)
for known_dataset, class_count in (("cifar10", 10), ("cifar100", 100), ("skin", 2)):
    if dataset == known_dataset:
        n_classes = class_count

result_fname = "".join((dataset, model, ".csv"))

#### Load Validation Data

In [None]:
if dataset == "cifar10":
    df_train = pd.read_csv(path1 + "data/cifar10/cifar10_TrainValidSplit.csv")
    df_train["fn"] = ["data/" + x for x in df_train["fn"].values]
if dataset == "cifar100":
    df_train = pd.read_csv(+ "data/cifar100/cifar100_TrainValidSplit.csv")
    df_train["fn"] = ["data/" + x for x in df_train["fn"].values]
if dataset == "skin":
    df_train = pd.read_csv(path1 + "data/skin_new/skin_TrainValidSplit.csv")

#### Definition of hook for storing the logits of the neural network

In [None]:
class AggregatingHook(Hook):
    '''Hook that appends every `hook_func` result to the list `stored`, so the outputs of a whole inference loop can be collected'''
    def __init__(self, m, hook_func, is_forward=True, detach=True, cpu=False, gather=False):
        super().__init__(m, hook_func, is_forward, detach, cpu, gather)
        # one entry is appended per hook invocation
        self.stored = []

    def hook_fn(self, module, input, output):
        "Applies `hook_func` to `module`, `input`, `output` and appends the result to `stored`."
        if self.detach:
            detach_kwargs = dict(cpu=self.cpu, gather=self.gather)
            input = to_detach(input, **detach_kwargs)
            output = to_detach(output, **detach_kwargs)
        result = self.hook_func(module, input, output)
        self.stored.append(result)

### Methods for Calibration

#### Calculation of ECE and MCE

In [None]:
def calibration(preds, targs, num_bins):
    """ Calculation of calibration

    References:
    https://arxiv.org/abs/1706.04599

    Args:
      targs: true classes (type: TensorCategory, size [number of samples])
      preds: confidence scores (type: Tensor, size [number of samples, number of classes])
      num_bins: number of equal-width confidence bins on [0, 1]

    Returns:
      cal: a dictionary
        {reliability_diag: reliability diagram, a tuple
             (mean confidence per non-empty bin, accuracy per non-empty bin),
         ece: Expected Calibration Error,
         mce: Maximum Calibration Error}
    """
    preds = torch.as_tensor(preds)
    class_preds = preds.argmax(dim=-1)
    # confidence (probability) associated with the predicted class
    conf = torch.max(preds, dim=1).values
    # Keep the confidence strictly inside (0, 1) so that a confidence of
    # exactly 1.0 still falls into the last half-open bin. torch.clamp is used
    # because np.clip on a tensor only works through a numpy fallback path.
    conf = torch.clamp(conf, 1e-6, 1 - 1e-6)

    # per-sample correctness, compared once in numpy
    hits = class_preds.detach().cpu().numpy() == np.asarray(targs)

    #Storage
    acc_tab = np.full(num_bins, np.nan) #empirical (true) confidence
    mean_conf = np.full(num_bins, np.nan) #predicted confidence
    nb_items_bin = np.zeros(num_bins) #number of items in the bins
    tau_tab = np.linspace(0, 1, num_bins + 1) #limits of the bins

    for i in range(num_bins): #iterate over the bins
        lower, upper = float(tau_tab[i]), float(tau_tab[i + 1])
        sec = (conf < upper) & (conf >= lower)
        count = int(sec.sum()) #number of samples in the bin
        nb_items_bin[i] = count
        if count > 0:
            # average of the predicted max probabilities
            mean_conf[i] = float(conf[sec].mean())
            # empirical accuracy of the samples in the bin
            acc_tab[i] = hits[sec.detach().cpu().numpy()].mean()

    # check that every sample is included into the calculation of the ECE
    # (explicit check instead of `assert`, which is stripped under -O)
    if nb_items_bin.sum() != len(targs):
        print("Error: Not all samples are included into the calculation of ECE")

    #Cleaning: drop empty bins
    non_empty = nb_items_bin > 0
    mean_conf = mean_conf[non_empty]
    acc_tab = acc_tab[non_empty]
    nb_items_bin = nb_items_bin[non_empty]

    #Reliability diagram
    reliability_diag = (mean_conf, acc_tab)
    gap = np.absolute(mean_conf - acc_tab)
    #Expected Calibration Error: bin-size weighted mean gap
    ece = np.average(
        gap,
        weights=nb_items_bin.astype(float) / np.sum(nb_items_bin))
    #Maximum Calibration Error
    mce = np.max(gap)
    #Saving
    cal = {'reliability_diag': reliability_diag,
           'ece': ece,
           'mce': mce}
    return cal

#### Calculation of Brier score and negative log-likelihood

In [None]:
def get_other_scores(probs, targets, nbin=30, fn=abs):
    """
    Calculate negative log-likelihood and Brier score
    :param probs: (numpy.array) predictions of dimension N x C where N is number of example, C is classes
    :param targets: (numpy.array) targets of dimension N
    :param nbin: (int) unused; kept so existing callers keep working
    :param fn: (function) unused; kept so existing callers keep working
    :return: tuple (NLL, Brier), both averaged over the N examples
    """
    probs = np.asarray(probs)
    # A platform-sized integer index avoids the wrap-around a uint8 cast
    # causes for class indices above 255.
    class_idx = np.asarray(targets).astype(np.intp)
    # probability assigned to the true class of each example
    class_probs = np.take_along_axis(probs, class_idx[:, None], axis=1)
    nll = np.mean(-np.log(class_probs))
    one_hot = np.eye(probs.shape[1])[class_idx]
    brier_score = np.mean(np.sum((probs - one_hot) ** 2, axis=1))
    return nll, brier_score

#### Implementation of baseline methods (TS, IR, IRM, ETS)

In [None]:
def mse_t(t, *args):
    """MSE objective for finding the optimal temperature.

    :param t: temperature the logits are divided by
    :param args: tuple (logit, label); `logit` is an N x C array and `label`
                 must broadcast against the N x C softmax output
    :return: mean squared error between softmax(logit / t) and label
    """
    logit, label = args
    scaled = logit / t
    # Softmax is invariant to a per-row shift; subtracting the row maximum
    # keeps np.exp from overflowing to inf (and the result from becoming NaN).
    scaled = scaled - np.max(scaled, axis=1, keepdims=True)
    exp_scaled = np.exp(scaled)
    p = exp_scaled / np.sum(exp_scaled, axis=1, keepdims=True)
    mse = np.mean((p - label) ** 2)
    return mse


def ll_t(t, *args):
    """Cross-entropy objective for finding the optimal temperature.

    :param t: temperature the logits are divided by
    :param args: tuple (logit, label); `logit` is an N x C array and `label`
                 must broadcast against the N x C softmax output
    :return: cross-entropy between label and softmax(logit / t), divided by N
    """
    logit, label = args
    scaled = logit / t
    # Softmax is invariant to a per-row shift; subtracting the row maximum
    # keeps np.exp from overflowing to inf (and the result from becoming NaN).
    scaled = scaled - np.max(scaled, axis=1, keepdims=True)
    exp_scaled = np.exp(scaled)
    # clip so that log() never sees an exact zero
    p = np.clip(exp_scaled / np.sum(exp_scaled, axis=1, keepdims=True), 1e-20, 1 - 1e-20)
    N = p.shape[0]
    ce = -np.sum(label * np.log(p)) / N
    return ce



def mse_w(w, *args):
    """MSE objective for finding the optimal ensemble weight coefficients.

    `args` is the tuple (p0, p1, p2, label). The three probability arrays are
    mixed with the weights w[0], w[1], w[2], each row of the mixture is
    renormalised to sum to one, and the mean squared error against `label`
    is returned.
    """
    p0, p1, p2, label = args
    mixture = w[0]*p0+w[1]*p1+w[2]*p2
    row_totals = np.sum(mixture,1)
    normalised = mixture/row_totals[:,None]
    squared_error = (normalised-label)**2
    return np.mean(squared_error)


def ll_w(w, *args):
    """Cross-entropy objective for finding the optimal ensemble weight coefficients.

    :param w: sequence of three mixture weights
    :param args: tuple (p0, p1, p2, label) of N x C arrays
    :return: cross-entropy between label and the weighted mixture, divided by N
    """
    p0, p1, p2, label = args
    p = (w[0]*p0+w[1]*p1+w[2]*p2)
    # Clip as ll_t does: an exact zero would give log(0) = -inf and a NaN
    # loss (0 * -inf) for every class with a zero label.
    p = np.clip(p, 1e-20, 1 - 1e-20)
    N = p.shape[0]
    ce = -np.sum(label*np.log(p))/N
    return ce


In [None]:
def temperature_scaling(logit,label,loss):
    """Fit the temperature that minimises the chosen loss.

    :param logit: logits passed through to the objective
    :param label: labels passed through to the objective
    :param loss: 'ce' (cross-entropy, `ll_t`) or 'mse' (`mse_t`)
    :return: the optimiser's solution array `x`, holding the temperature
    :raises ValueError: if `loss` is neither 'ce' nor 'mse'
    """
    # Validate first: an unknown loss used to surface later as an
    # UnboundLocalError on `t`.
    if loss not in ('ce', 'mse'):
        raise ValueError("loss must be 'ce' or 'mse', got " + repr(loss))
    objective = ll_t if loss == 'ce' else mse_t
    bnds = ((0.05, 5.0),)
    result = optimize.minimize(objective, 1.0 , args = (logit,label), method='L-BFGS-B', bounds=bnds, tol=1e-12)
    return result.x

def ensemble_scaling(logit,label,loss,t,n_class):
    """Fit the three ensemble weights that minimise the chosen loss.

    The ensemble mixes the temperature-scaled softmax (p0), the plain softmax
    (p1) and the uniform distribution over `n_class` classes (p2). The weights
    are bounded to [0, 1] and constrained to sum to one.

    :param logit: N x C array of logits
    :param label: labels passed through to the objective
    :param loss: 'ce' (`ll_w`) or 'mse' (`mse_w`)
    :param t: temperature used for p0
    :param n_class: number of classes of the uniform component
    :return: the optimiser's solution array `x`, holding the three weights
    :raises ValueError: if `loss` is neither 'ce' nor 'mse'
    """
    # Validate first: an unknown loss used to surface later as an
    # UnboundLocalError on `w`.
    if loss not in ('ce', 'mse'):
        raise ValueError("loss must be 'ce' or 'mse', got " + repr(loss))

    def _softmax(z):
        # shift by the row maximum so np.exp cannot overflow
        z = z - np.max(z, axis=1, keepdims=True)
        exp_z = np.exp(z)
        return exp_z / np.sum(exp_z, axis=1, keepdims=True)

    p1 = _softmax(logit)
    p0 = _softmax(logit/t)
    p2 = np.ones_like(p0)/n_class

    bnds_w = ((0.0, 1.0),(0.0, 1.0),(0.0, 1.0),)
    def my_constraint_fun(x): return np.sum(x)-1
    constraints = { "type":"eq", "fun":my_constraint_fun,}
    objective = ll_w if loss == 'ce' else mse_w
    result = optimize.minimize(objective, (1.0, 0.0, 0.0) , args = (p0,p1,p2,label), method='SLSQP', constraints = constraints, bounds=bnds_w, tol=1e-12, options={'disp': True})
    return result.x

def ts_calibrate(logit, label,loss):
    """Fit the temperature with `temperature_scaling`, print it and return it."""
    fitted_temperature = temperature_scaling(logit, label, loss)
    print(f"temperature = {fitted_temperature!s}")
    return fitted_temperature

def ts_predict(logit, t):
    """Apply temperature scaling.

    :param logit: N x C numpy array of logits
    :param t: temperature the logits are divided by
    :return: torch tensor holding softmax(logit / t)
    """
    scaled = logit / t
    # shift by the row maximum so np.exp cannot overflow (softmax is
    # invariant to a per-row shift)
    scaled = scaled - np.max(scaled, axis=1, keepdims=True)
    exp_scaled = np.exp(scaled)
    preds_transformed = exp_scaled / np.sum(exp_scaled, axis=1, keepdims=True)
    return torch.from_numpy(preds_transformed)

def ets_calibrate(logit, label, n_class, loss):
    """Fit the temperature and the ensemble weights; returns the tuple (t, w).

    NOTE(review): the `loss` argument is not used - both fits are run with
    'mse' - confirm whether callers passing 'ce' expect that.
    """
    fitted_t = temperature_scaling(logit, label, 'mse') # loss can change to 'ce'
    fitted_w = ensemble_scaling(logit, label, loss='mse', t=fitted_t, n_class=n_class)
    return (fitted_t, fitted_w)

def ets_predict(logit, t, w, n_class):
    """Apply ensemble temperature scaling.

    :param logit: N x C numpy array of logits
    :param t: temperature of the temperature-scaled component
    :param w: three weights for (scaled softmax, plain softmax, uniform)
    :param n_class: number of classes of the uniform component
    :return: torch tensor with the weighted mixture of the three components
    """
    def _softmax(z):
        # shift by the row maximum so np.exp cannot overflow
        z = z - np.max(z, axis=1, keepdims=True)
        exp_z = np.exp(z)
        return exp_z / np.sum(exp_z, axis=1, keepdims=True)

    p1 = _softmax(logit)
    p0 = _softmax(logit/t)
    p2 = np.ones_like(p0)/n_class
    preds_transformed = w[0]*p0 + w[1]*p1 +w[2]*p2
    return torch.from_numpy(preds_transformed)

def mir_calibrate(logit,label):
    """Fit one isotonic regression on all flattened softmax probabilities.

    :param logit: N x C numpy array of logits
    :param label: array whose flattened form lines up with the flattened
                  N x C probabilities
    :return: the fitted IsotonicRegression (out-of-range inputs are clipped)
    """
    # shift by the row maximum so np.exp cannot overflow (softmax is
    # invariant to a per-row shift)
    shifted = logit - np.max(logit, axis=1, keepdims=True)
    exp_shifted = np.exp(shifted)
    p = exp_shifted / np.sum(exp_shifted, axis=1, keepdims=True)
    ir = IsotonicRegression(out_of_bounds='clip')
    # only the fitted model is needed, so fit() replaces fit_transform()
    ir.fit(p.flatten(), label.flatten())
    return ir

def mir_predict(logit, ir):
    """Apply a fitted multi-class isotonic regression.

    :param logit: N x C numpy array of logits
    :param ir: fitted regressor exposing `predict` on a flat array
    :return: torch tensor of shape N x C with the calibrated scores
    """
    # shift by the row maximum so np.exp cannot overflow (softmax is
    # invariant to a per-row shift)
    shifted = logit - np.max(logit, axis=1, keepdims=True)
    exp_shifted = np.exp(shifted)
    p = exp_shifted / np.sum(exp_shifted, axis=1, keepdims=True)
    yt_ = ir.predict(p.flatten())
    # a tiny multiple of the original probabilities is added on top of the
    # regression output
    preds_transformed = yt_.reshape(logit.shape)+1e-9*p
    return torch.from_numpy(preds_transformed)

def irova_calibrate(logit,label):
    """Fit one isotonic regression per class (one-vs-all).

    :param logit: N x C numpy array of logits
    :param label: N x C array; column ii is the regression target of class ii
    :return: list with one fitted IsotonicRegression per class column
    """
    # shift by the row maximum so np.exp cannot overflow (softmax is
    # invariant to a per-row shift)
    shifted = logit - np.max(logit, axis=1, keepdims=True)
    exp_shifted = np.exp(shifted)
    p = exp_shifted / np.sum(exp_shifted, axis=1, keepdims=True)
    list_ir = []
    for ii in range(p.shape[1]):
        ir = IsotonicRegression(out_of_bounds='clip')
        # only the fitted model is needed, so fit() replaces fit_transform()
        ir.fit(p[:, ii].astype('double'), label[:, ii].astype('double'))
        list_ir.append(ir)
    return list_ir

def irova_predict(logit, list_ir):
    """Apply the per-class (one-vs-all) isotonic regressions.

    :param logit: N x C numpy array of logits
    :param list_ir: one fitted regressor per class column, each exposing `predict`
    :return: torch tensor of shape N x C with the calibrated scores
    """
    # shift by the row maximum so np.exp cannot overflow (softmax is
    # invariant to a per-row shift)
    shifted = logit - np.max(logit, axis=1, keepdims=True)
    exp_shifted = np.exp(shifted)
    p = exp_shifted / np.sum(exp_shifted, axis=1, keepdims=True)
    for ii in range(p.shape[1]):
        ir = list_ir[ii]
        # a tiny multiple of the original probability is added on top of the
        # regression output
        p[:, ii] = ir.predict(p[:, ii]) + 1e-9 * p[:, ii]
    return torch.from_numpy(p)

def irovats_calibrate(logit, label, loss='mse'):
    """Fit a temperature with `ts_calibrate`, then fit `irova_calibrate` on the
    temperature-scaled logits; returns the tuple (temperature, regressors)."""
    fitted_t = ts_calibrate(logit, label, loss=loss)
    regressors = irova_calibrate(logit / fitted_t, label)
    return (fitted_t, regressors)

def irovats_predict(logit,t,list_ir):
    """Divide the logits by `t` and pass them through `irova_predict`."""
    return irova_predict(logit / t, list_ir)

#### Implementation of approach from Tomani et al.

In [None]:
class GaussianNoise(Transform):
    """Transform that adds Gaussian noise to image batches.

    The input is divided by 255, noise with the configured `mean` and `std`
    is added, and the result is scaled back by 255 and cast to uint8.
    """

    def __init__(self, mean=0., std=1., **kwargs):
        self.std = std
        self.mean = mean
        super().__init__(**kwargs)

    def encodes(self, x:TensorImage):
        x = x / 255.0
        # Draw the noise on the CPU (as before, so the CPU random stream is
        # unchanged) and move it to wherever `x` lives instead of forcing CUDA.
        noise = torch.randn(x.size()).to(x.device)
        x = x + noise * self.std + self.mean
        x = x * 255.0
        # Clamp BEFORE the cast: clamping after the uint8 conversion cannot
        # undo the wrap-around of values outside [0, 255].
        x = torch.clip(x, 0, 255)
        x = x.to(torch.uint8)

        return x

In [None]:
def get_accuracy(learn, df, imgSize, epsilon = 0.0,bs = 300):
    """Validate `learn` on `df` with noise level `epsilon`.

    Builds dataloaders from `createDatablock(imgSize, epsilon)`, seeds them,
    attaches them to the learner and returns the second entry of
    `learn.validate()`.
    """
    loaders = createDatablock(imgSize, epsilon).dataloaders(df, bs=bs, num_workers=3)
    set_seed(loaders)
    learn.dls = loaders

    validation_results = learn.validate()
    return validation_results[1]

def estimateEpsilon(learn, df, imgSize, n_classes, bs = 300):
    """ Calculation of epsilons for the given target accuracies

    Six target accuracies are spaced evenly between the accuracy without
    noise and chance level (1 / n_classes). The accuracy is then measured on a
    grid of noise levels, and for each target the grid value whose accuracy is
    closest to (target + 0.03) is returned.

    Args:
      learn: learner to evaluate
      df: dataframe handed to `get_accuracy`
      imgSize: image size handed to `get_accuracy`
      n_classes: number of classes (defines chance level)
      bs: batch size used for every evaluation

    Returns:
      list with one epsilon per target accuracy
    """
    no_perturbations = 6
    epsilon_max = 0.3
    epsilon_diff = 0.004
    epsilons = np.arange(0,epsilon_max, epsilon_diff)

    min_acc = 1 / float(n_classes)

    # the baseline now honours `bs` instead of a hard-coded 300
    max_acc = get_accuracy(learn, df, imgSize, 0.0, bs)

    target_accuracy_list = [
        max_acc - (max_acc - min_acc) * perturbation_level / (no_perturbations - 1)
        for perturbation_level in range(no_perturbations)
    ]

    acc = []
    for i in epsilons:
        # Once the accuracy has dropped to within 0.05 of chance level, larger
        # epsilons are not evaluated any more; the floor value is recorded.
        if  i != 0.0 and acc[-1] <= (min_acc + 0.05):
            acc.append(min_acc + 0.05)
        else:
            acc.append(get_accuracy(learn, df, imgSize, i, bs))

        print("For epsilon = " + str(i) +  "the achieved accuracy is " + str(acc[-1]))

    acc = np.asarray(acc)
    result = []
    for target_accuracy in target_accuracy_list:
        idx = np.argmin(np.abs(acc - (target_accuracy + 0.03)))
        result.append(epsilons[idx])

    return result

def getLogitsCVPR(learn, df, imgSize, epsilons, bs = 300):
    """ Calculation of logits for the given epsilons

    For every epsilon the learner predicts on dataloaders built with that
    noise level, while a hook on `learn.model[1][8]` records the module
    output (presumably the logits of the final layer - TODO confirm).

    Returns:
      tuple (logits, targets), concatenated over all epsilons
    """
    targets = []

    hook = AggregatingHook(
                m = learn.model[1][8],
                hook_func = lambda m,i,o: o,
                cpu = True
    )

    try:
        for epsilon in epsilons:

            dblock = createDatablock(imgSize, epsilon)
            dls = dblock.dataloaders(df, bs=bs, num_workers=3)
            set_seed(dls)
            learn.dls = dls

            preds, targs = learn.get_preds()

            targets.append(targs)
    finally:
        # Detach the hook; otherwise it stays registered on the model and
        # keeps storing the output of every later forward pass.
        hook.remove()

    logits = torch.cat(hook.stored)
    targets = torch.cat(targets,0)

    return (logits,targets)

#### Create Datablock

In [None]:
def createDatablock(imgSize, epsilon = None):
    """Build the DataBlock used for validation data.

    Images are read from column "fn", labels from column "label" and the
    split from column "valid". Items are resized to `imgSize`; batches get
    `aug_transforms(mult = mult_val)` and ImageNet normalisation.

    Args:
      imgSize: size passed to `Resize`
      epsilon: if not None, a `GaussianNoise` transform with this standard
               deviation is added to the batch transforms

    Returns:
      the configured DataBlock
    """
    batch_tfms = [*aug_transforms(mult = mult_val), Normalize.from_stats(*imagenet_stats)]
    if epsilon is not None:
        # Datablock with Gaussian Noise
        batch_tfms.append(GaussianNoise(mean = 0.0, std = epsilon))

    dblock = DataBlock(blocks=(ImageBlock, CategoryBlock),
          get_x=ColReader("fn"),
          get_y=ColReader("label"),
          splitter=ColSplitter(col="valid"),
          item_tfms = [Resize(imgSize)],
          batch_tfms = batch_tfms)

    return dblock

### Determination of parameters of post-hoc calibration baseline methods

#### Calculate logits on validation set

In [None]:
# Predict on the validation split and record the output of
# `learn.model[1][8]` for every batch (presumably the logits - TODO confirm).
dblock = createDatablock(imgSize)

dls = dblock.dataloaders(df_train, bs=bs, num_workers=3)
learn.dls = dls

hook = AggregatingHook(
    m = learn.model[1][8],
    hook_func = lambda m,i,o: o,
    cpu = True
)

try:
    preds, targs = learn.get_preds()
finally:
    # Detach the hook; otherwise it stays registered on the model and keeps
    # storing the output of every later forward pass.
    hook.remove()
logits = torch.cat(hook.stored)

#### Calculate parameters of baseline methods on the original validation set

In [None]:
# One-hot encode the validation targets and fit every baseline calibrator on
# the logits of the original validation set.
targs_onehot = np.eye(n_classes)[targs.numpy()].astype(int)
temperature = ts_calibrate(logit=logits.numpy(), label=targs_onehot, loss='ce')
temperature_ensemble, weights_ensemble = ets_calibrate(
    logit=logits.numpy(), label=targs_onehot, n_class=n_classes, loss='ce')
isotonic_regression = mir_calibrate(logit=logits.numpy(), label=targs_onehot)
list_ir = irova_calibrate(logit=logits.numpy(), label=targs_onehot)
t_irovats, list_irovats = irovats_calibrate(logit=logits.numpy(), label=targs_onehot)

#### Calculate parameters of baseline methods on the perturbed validation set

In [None]:
# Noise levels for the target accuracies, then the logits and targets of the
# validation set under each of those noise levels.
epsilons = estimateEpsilon(learn, df=df_train, imgSize=imgSize, n_classes=n_classes)
logits_p, targs_p = getLogitsCVPR(learn, df=df_train, imgSize=imgSize, epsilons=epsilons)

In [None]:
# One-hot encode the targets of the perturbed validation set and fit every
# baseline calibrator on the corresponding logits.
targs_onehot_p = np.eye(n_classes)[targs_p.numpy()].astype(int)
temperature_p = ts_calibrate(logit=logits_p.numpy(), label=targs_onehot_p, loss='ce')
temperature_ensemble_p, weights_ensemble_p = ets_calibrate(
    logit=logits_p.numpy(), label=targs_onehot_p, n_class=n_classes, loss='ce')
isotonic_regression_p = mir_calibrate(
    logit=logits_p.numpy().astype('float32'), label=targs_onehot_p)
list_ir_p = irova_calibrate(logit=logits_p.numpy(), label=targs_onehot_p)
t_irovats_p, list_irovats_p = irovats_calibrate(logit=logits_p.numpy(), label=targs_onehot_p)

#### Store experimental results in dataframe

In [None]:
# Resume from an existing results file for this configuration if there is
# one; otherwise start with an empty frame.
fname = "".join([
    path2, "csv/", dataset, model,
    "_", str(n_bins), "_", str(mult_val), "_", str(mult_test), ".csv",
])
df = pd.read_csv(fname) if os.path.exists(fname) else pd.DataFrame()

### Evaluation on CIFAR-C

#### Set experimental parameters

In [None]:
# Corruption types and severity levels that are evaluated.
corruptions = [
    'Gaussian Noise', 'Shot Noise', 'Impulse Noise',
    'Defocus Blur', 'Glass Blur', 'Motion Blur', 'Zoom Blur',
    'Snow', 'Fog', 'Frost',
    'Brightness', 'Contrast', 'Elastic', 'Pixelate', 'JPEG',
    'Speckle Noise', 'Gaussian Blur', 'Spatter', 'Saturate',
]

severity = [1, 2, 3, 4, 5]

# class names, indexed by the numeric class id
vocabular = [
    'airplane', 'automobile', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
]

# every (corruption, severity) pair, corruption-major
comb_cor_sev = [(corruption, level) for corruption in corruptions for level in severity]

calibration_methods = ["None", "ts", "ets", "mir", "irova", "irovats"]

#### Calculate results on original CIFAR data set

In [None]:
# Load the test split of the original (uncorrupted) CIFAR data set.
if dataset == "cifar10":
    df_test = pd.read_csv(path1 + "data/cifar10/cifar10_TrainTestSplit.csv")
    df_test["fn"] = [path1 + "data/cifar10/" + x for x in df_test["fn"].values]
if dataset == "cifar100":
    df_test = pd.read_csv(path1 + "data/cifar100/cifar100_TrainTestSplit.csv")
    df_test["fn"] = [path1 + "data/" + x for x in df_test["fn"].values]

# Test-time datablock; its augmentation strength is controlled by `mult_test`.
block = DataBlock(blocks=(ImageBlock, CategoryBlock),
          get_x=ColReader("fn"),
          get_y=ColReader("label"), 
          splitter=ColSplitter(col="valid"),
          item_tfms = [Resize(imgSize)],
          batch_tfms = [*aug_transforms(mult = mult_test), Normalize.from_stats(*imagenet_stats)])

# Build the loaders from `block`. The previous code called
# `dblock.dataloaders`, which silently reused the validation datablock, so
# `mult_test` never took effect.
dls = block.dataloaders(df_test, bs=bs, num_workers=3)
learn.dls = dls
set_seed(dls)

for ttaugmented in [False, True]:
    hook = AggregatingHook(
        m = learn.model[1][8],
        hook_func = lambda m,i,o: o,
        cpu = True
    )
    try:
        if ttaugmented:
            preds, targs = learn.tta()
        else:
            preds, targs = learn.get_preds()
    finally:
        # Detach the hook so it stops recording once this pass is done.
        hook.remove()

    logits = torch.cat(hook.stored)
    if ttaugmented:
        # The reshape expects the hook to have recorded 5 full passes over the
        # validation items (presumably the passes run by `learn.tta()` with
        # its defaults - TODO confirm); the logits are averaged over them.
        logits = torch.reshape(logits,(5,df_test.valid.sum(),n_classes))
        logits = sum(logits,0)/5

    for perturbed in [False, True]:
        # Select the calibrators fitted on the perturbed or on the original
        # validation set.
        if perturbed:
            fitted = {
                "ts": temperature_p,
                "ets_t": temperature_ensemble_p,
                "ets_w": weights_ensemble_p,
                "mir": isotonic_regression_p,
                "irova": list_ir_p,
                "irovats_t": t_irovats_p,
                "irovats_ir": list_irovats_p,
            }
        else:
            fitted = {
                "ts": temperature,
                "ets_t": temperature_ensemble,
                "ets_w": weights_ensemble,
                "mir": isotonic_regression,
                "irova": list_ir,
                "irovats_t": t_irovats,
                "irovats_ir": list_irovats,
            }

        for calibration_method in calibration_methods:
            if calibration_method == "None":
                preds_transformed = preds
            elif calibration_method == "ts":
                preds_transformed = ts_predict(logits.numpy(), fitted["ts"])
            elif calibration_method == "ets":
                preds_transformed = ets_predict(logits.numpy(), fitted["ets_t"], fitted["ets_w"], n_classes)
            elif calibration_method == "mir":
                preds_transformed = mir_predict(logits.numpy(), fitted["mir"])
            elif calibration_method == "irova":
                preds_transformed = irova_predict(logits.numpy(), fitted["irova"])
            elif calibration_method == "irovats":
                # Use the temperature and regressors fitted jointly by
                # `irovats_calibrate`, not the separate TS / IROvA fits.
                preds_transformed = irovats_predict(logits.numpy(), fitted["irovats_t"], fitted["irovats_ir"])

            cal = calibration(preds_transformed, targs, n_bins)
            ece = cal["ece"]
            mce = cal["mce"]
            nloss, brier = get_other_scores(preds_transformed.numpy(), targs.numpy())
            # accuracy is computed on the uncalibrated predictions
            acc = float(accuracy(preds,targs))

            to_append = pd.DataFrame({
                "model": model,
                "dataset": dataset,
                "TestSet": "Base",
                "calib": calibration_method,
                "pertubed": perturbed,
                "ttaugmented": ttaugmented,
                "n_bins": n_bins,
                "mult_val": mult_val,
                "mult_test": mult_test,
                "accuracy": acc,
                "ece": ece,
                "mce": mce,
                "brier": brier,
                "nll": nloss
            }, index = [5] )
            df = pd.concat([df, to_append],ignore_index=True)
df.to_csv(path2 + "csv/" + dataset + model + "_" + str(n_bins) + "_" + str(mult_val) + "_" + str(mult_test) + ".csv")

#### Calculate results on CIFAR with perturbations

In [None]:
for i in comb_cor_sev:

    # Skip (corruption, severity) pairs that are already in the results.
    if len(df) != 0 and (i[0] + "_severity_" + str(i[1])) in df.TestSet.values:
        print(i[0] + "_severity_" + str(i[1]))
        continue

    path = "data/" + dataset + "-c/"
    path = path1 + path + i[0] + "_severity_" + str(i[1])
    print(path)
    labels = []
    fname = []

    # Collect the jpg files and derive each label from the file name.
    for root, dirs, files in os.walk(path): 
        for j in files:
            name_parts = j.split(".")
            # names without a "." used to raise an IndexError here
            if len(name_parts) < 2 or name_parts[1] != "jpg":
                continue
            fields = j.split("_")
            if dataset == "cifar100":
                # the label is the first field, or the first two fields when
                # the name has more than two "_"-separated fields
                if len(fields) == 2:
                    labels.append(fields[0])
                else:
                    labels.append(fields[0] + "_" + fields[1])
                fname.append(str(path) +"/" + j)
            if dataset == "cifar10":
                # the first field is the numeric class id
                labels.append(vocabular[int(fields[0])])
                fname.append(str(path) +"/" + j)

    test_dict = {"fn": fname, "label": labels}

    df_test = pd.DataFrame(test_dict)
    df_test["valid"] = 1
    # Work on a copy so `df_train` itself keeps its original "valid" column
    # (the previous alias overwrote it in place).
    df_temp = df_train.assign(valid=0)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
    df_test = pd.concat([df_test, df_temp])

    # Test-time datablock; its augmentation strength is controlled by `mult_test`.
    block = DataBlock(blocks=(ImageBlock, CategoryBlock),
              get_x=ColReader("fn"),
              get_y=ColReader("label"), 
              splitter=ColSplitter(col="valid"),
              item_tfms = [Resize(imgSize)],
              batch_tfms = [*aug_transforms(mult = mult_test), Normalize.from_stats(*imagenet_stats)])

    # Build the loaders from `block`. The previous code called
    # `dblock.dataloaders`, which silently reused another datablock, so
    # `mult_test` never took effect.
    dls = block.dataloaders(df_test, bs=bs, num_workers=3)
    learn.dls = dls
    set_seed(dls)

    for ttaugmented in [True, False]:
        hook = AggregatingHook(
            m = learn.model[1][8],
            hook_func = lambda m,i,o: o,
            cpu = True
        )
        try:
            if ttaugmented:
                preds, targs = learn.tta()
            else:
                preds, targs = learn.get_preds()
        finally:
            # Detach the hook so it stops recording once this pass is done.
            hook.remove()

        logits = torch.cat(hook.stored)
        if ttaugmented:
            # The reshape expects the hook to have recorded 5 full passes over
            # the validation items (presumably the passes run by `learn.tta()`
            # with its defaults - TODO confirm); the logits are averaged.
            logits = torch.reshape(logits,(5,df_test.valid.sum(),n_classes))
            logits = sum(logits,0)/5

        for perturbed in [True, False]:
            # Select the calibrators fitted on the perturbed or on the
            # original validation set.
            if perturbed:
                fitted = {
                    "ts": temperature_p,
                    "ets_t": temperature_ensemble_p,
                    "ets_w": weights_ensemble_p,
                    "mir": isotonic_regression_p,
                    "irova": list_ir_p,
                    "irovats_t": t_irovats_p,
                    "irovats_ir": list_irovats_p,
                }
            else:
                fitted = {
                    "ts": temperature,
                    "ets_t": temperature_ensemble,
                    "ets_w": weights_ensemble,
                    "mir": isotonic_regression,
                    "irova": list_ir,
                    "irovats_t": t_irovats,
                    "irovats_ir": list_irovats,
                }

            for calibration_method in calibration_methods:
                if calibration_method == "None":
                    preds_transformed = preds
                elif calibration_method == "ts":
                    preds_transformed = ts_predict(logits.numpy(), fitted["ts"])
                elif calibration_method == "ets":
                    preds_transformed = ets_predict(logits.numpy(), fitted["ets_t"], fitted["ets_w"], n_classes)
                elif calibration_method == "mir":
                    preds_transformed = mir_predict(logits.numpy(), fitted["mir"])
                elif calibration_method == "irova":
                    preds_transformed = irova_predict(logits.numpy(), fitted["irova"])
                elif calibration_method == "irovats":
                    # Use the temperature and regressors fitted jointly by
                    # `irovats_calibrate`, not the separate TS / IROvA fits.
                    preds_transformed = irovats_predict(logits.numpy(), fitted["irovats_t"], fitted["irovats_ir"])

                cal = calibration(preds_transformed, targs, n_bins)
                ece = cal["ece"]
                mce = cal["mce"]
                # NLL / Brier are computed on the calibrated predictions in
                # both branches (the perturbed branch used raw `preds` before)
                nloss, brier = get_other_scores(preds_transformed.numpy(), targs.numpy())
                # accuracy is computed on the uncalibrated predictions
                acc = float(accuracy(preds,targs))
                to_append = pd.DataFrame({
                    "model": model,
                    "dataset": dataset,
                    "TestSet": i[0] + "_severity_" + str(i[1]),
                    "calib": calibration_method,
                    "pertubed": perturbed,
                    "ttaugmented": ttaugmented,
                    "n_bins": n_bins,
                    "mult_val": mult_val,
                    "mult_test": mult_test,
                    "accuracy": acc,
                    "ece": ece,
                    "mce": mce,
                    "brier": brier,
                    "nll": nloss
                }, index = [5] )
                df = pd.concat([df, to_append],ignore_index=True)

        # checkpoint the results after each test-time-augmentation setting
        df.to_csv(path2 + "csv/" + dataset + model + "_" + str(n_bins) + "_" + str(mult_val) + "_" + str(mult_test) + ".csv")
                

### Evaluation on Skin Images

In [None]:
testsets = ["ID", "Sidney","MSK2020"]

# split file of each skin test set
testset_files = {
    "ID": "data/skin_new/skin_TrainIDTestSplit.csv",
    "Sidney": "data/skin_new/skin_TrainISIC2020_SIDNEYSplit.csv",
    "MSK2020": "data/skin_new/skin_TrainISIC2020_MSKCCSplit.csv",
}

for testset in testsets:
    df_test = pd.read_csv(path1 + testset_files[testset])
    df_test["fn"] = [path1 + x for x in df_test["fn"].values]
    df_test = df_test[["fn","label","valid"]]

    # NOTE(review): `aug_transforms()` is called with its defaults here, while
    # `mult_test` is still written to the results - confirm this is intended.
    dblock = DataBlock(blocks=(ImageBlock, CategoryBlock),
              get_x=ColReader("fn"),
              get_y=ColReader("label"), 
              splitter=ColSplitter(col="valid"),
              item_tfms = [Resize(imgSize)],
              batch_tfms = [*aug_transforms(), Normalize.from_stats(*imagenet_stats)])

    dls = dblock.dataloaders(df_test, bs=bs, num_workers=3)
    learn.dls = dls
    set_seed(dls)

    for ttaugmented in [False, True]:
        hook = AggregatingHook(
            m = learn.model[1][8],
            hook_func = lambda m,i,o: o,
            cpu = True
        )
        try:
            if ttaugmented:
                preds, targs = learn.tta()
            else:
                preds, targs = learn.get_preds()
        finally:
            # Detach the hook so it stops recording once this pass is done.
            hook.remove()

        logits = torch.cat(hook.stored)
        if ttaugmented:
            # The reshape expects the hook to have recorded 5 full passes over
            # the validation items (presumably the passes run by `learn.tta()`
            # with its defaults - TODO confirm); the logits are averaged.
            logits = torch.reshape(logits,(5,df_test.valid.sum(),n_classes))
            logits = sum(logits,0)/5

        for perturbed in [False, True]:
            # Select the calibrators fitted on the perturbed or on the
            # original validation set.
            if perturbed:
                fitted = {
                    "ts": temperature_p,
                    "ets_t": temperature_ensemble_p,
                    "ets_w": weights_ensemble_p,
                    "mir": isotonic_regression_p,
                    "irova": list_ir_p,
                    "irovats_t": t_irovats_p,
                    "irovats_ir": list_irovats_p,
                }
            else:
                fitted = {
                    "ts": temperature,
                    "ets_t": temperature_ensemble,
                    "ets_w": weights_ensemble,
                    "mir": isotonic_regression,
                    "irova": list_ir,
                    "irovats_t": t_irovats,
                    "irovats_ir": list_irovats,
                }

            for calibration_method in calibration_methods:
                if calibration_method == "None":
                    preds_transformed = preds
                elif calibration_method == "ts":
                    preds_transformed = ts_predict(logits.numpy(), fitted["ts"])
                elif calibration_method == "ets":
                    preds_transformed = ets_predict(logits.numpy(), fitted["ets_t"], fitted["ets_w"], n_classes)
                elif calibration_method == "mir":
                    preds_transformed = mir_predict(logits.numpy(), fitted["mir"])
                elif calibration_method == "irova":
                    preds_transformed = irova_predict(logits.numpy(), fitted["irova"])
                elif calibration_method == "irovats":
                    # Use the temperature and regressors fitted jointly by
                    # `irovats_calibrate`, not the separate TS / IROvA fits.
                    preds_transformed = irovats_predict(logits.numpy(), fitted["irovats_t"], fitted["irovats_ir"])

                cal = calibration(preds_transformed, targs, n_bins)
                ece = cal["ece"]
                nloss, brier = get_other_scores(preds_transformed.numpy(), targs.numpy())
                mce = cal["mce"]
                # balanced accuracy of the uncalibrated predictions
                acc = float(balanced_accuracy_score(targs,torch.argmax(preds,dim=1)))

                to_append = pd.DataFrame({
                    "model": model,
                    "dataset": dataset,
                    "TestSet": testset,
                    "calib": calibration_method,
                    "pertubed": perturbed,
                    "ttaugmented": ttaugmented,
                    "n_bins": n_bins,
                    "mult_val": mult_val,
                    "mult_test": mult_test,
                    "accuracy": acc,
                    "ece": ece,
                    "brier": brier,
                    "nll": nloss,
                    "mce": mce
                }, index = [5] )
                df = pd.concat([df, to_append],ignore_index=True)
df.to_csv(path2 + "csv/" + dataset + model + "_" + str(n_bins) + "_" + str(mult_val) + "_" + str(mult_test) + "_test.csv")