VIDEO LINK: https://drive.google.com/file/d/1SdMKerxA15Y0_rF0M_GnQyvGVnELDwZx/view?usp=sharing

In [325]:
# Please make sure you are logged in to Git on your system before running the command below.
# If some Linux commands do not work, you may need to install them first.

In [1]:
!git clone https://github.com/GavinKerrigan/conf_matrix_and_calibration

Cloning into 'conf_matrix_and_calibration'...
remote: Enumerating objects: 44, done.[K
remote: Counting objects: 100% (44/44), done.[K
remote: Compressing objects: 100% (40/40), done.[K
remote: Total 44 (delta 12), reused 22 (delta 3), pack-reused 0[K
Unpacking objects: 100% (44/44), done.


In [2]:
torch_info = !pip3 show torch

In [3]:
if torch_info[1][9:-6] != '1.11.0':
    !pip3 uninstall torch -y
    !pip3 install torch==1.11.0

In [4]:
!pip3 install attrdict
!pip3 install deepdish
!pip3 install pyro-ppl
!pip3 install uncertainty-calibration

Collecting attrdict
  Downloading attrdict-2.0.1-py2.py3-none-any.whl (9.9 kB)
Installing collected packages: attrdict
Successfully installed attrdict-2.0.1
Collecting deepdish
  Downloading deepdish-0.3.7-py2.py3-none-any.whl (37 kB)
Installing collected packages: deepdish
Successfully installed deepdish-0.3.7
Collecting pyro-ppl
  Downloading pyro_ppl-1.8.1-py3-none-any.whl (718 kB)
[K     |████████████████████████████████| 718 kB 5.3 MB/s 
Collecting pyro-api>=0.1.1
  Downloading pyro_api-0.1.2-py3-none-any.whl (11 kB)
Installing collected packages: pyro-api, pyro-ppl
Successfully installed pyro-api-0.1.2 pyro-ppl-1.8.1
Collecting uncertainty-calibration
  Downloading uncertainty-calibration-0.0.9.tar.gz (11 kB)
Collecting parameterized
  Downloading parameterized-0.8.1-py2.py3-none-any.whl (26 kB)
Building wheels for collected packages: uncertainty-calibration
  Building wheel for uncertainty-calibration (setup.py) ... [?25l[?25hdone
  Created wheel for uncertainty-calibration: 

In [5]:
!ls

conf_matrix_and_calibration  sample_data


In [6]:
cd conf_matrix_and_calibration

/content/conf_matrix_and_calibration


In [7]:
!ls

calibrators.py	combination_methods.py	ensemble_ts.py	imax_calib  README.md
cifar10h	data_utils.py		experiments	metrics.py  utils.py


In [8]:
!mkdir data

In [9]:
!ls

calibrators.py		data		experiments  README.md
cifar10h		data_utils.py	imax_calib   utils.py
combination_methods.py	ensemble_ts.py	metrics.py


In [10]:
mv cifar10h data

In [11]:
!ls

calibrators.py		data	       ensemble_ts.py  imax_calib  README.md
combination_methods.py	data_utils.py  experiments     metrics.py  utils.py


In [12]:
cd experiments

/content/conf_matrix_and_calibration/experiments


In [13]:
mkdir output

In [14]:
cd output

/content/conf_matrix_and_calibration/experiments/output


In [15]:
mkdir cifar10h

In [16]:
cd cifar10h

/content/conf_matrix_and_calibration/experiments/output/cifar10h


In [17]:
!mkdir final

In [18]:
cd final

/content/conf_matrix_and_calibration/experiments/output/cifar10h/final


In [19]:
!mkdir fully_sup_CI

In [20]:
!mkdir calibrate_comb_MAP

In [21]:
cd ../../../

/content/conf_matrix_and_calibration/experiments


## Question1): Reproducing the Results in the Paper

* I have used **ResNet-110** and **DenseNet-BC** as the two dissimilar models whose outputs are combined with the human labels
* To achieve this, the `model_names` list was modified
* **ResNet-110**: a deep residual network with 110 layers
* **DenseNet-BC**: a convolutional neural network with 190 layers and a growth rate of 40, using bottleneck layers

* There were 2 experiments in the paper and the github repo.
* The first file was calibrate_combo_experiment.py and second was calibrate_experiment.py


* The calibrate_combo_experiment.py file produces results (**calibration metrics**) that let us compare the **calibration error** of the model alone with that of the combined model (model plus human inferences), on different metrics, for 10 observations

In [24]:
# Running calibrate_combo_experiment.py on 15 datapoints in total, i.e. about 10 training datapoints
# Generates the data for Table 2 (and Appendix D) in the paper.
import sys
sys.path.insert(0, '../')
from data_utils import *
from utils import *
from combination_methods import *
from tqdm.auto import tqdm
import torch
from sklearn.model_selection import train_test_split
from metrics import *
import csv
import numpy as np
import os
from calibrators import *
# Redefining load_CIFAR10H, so that custom number of points can be given as input
def load_CIFAR10H(model_name, num_points = 10000):
    """Read CIFAR-10H human counts, model probabilities and true labels from CSV.

    The file is looked up under ``../data/cifar10h/`` relative to the current
    working directory, and only the first ``num_points`` rows are returned.

    Args:
        model_name: ``'r_low_acc'`` selects ``human_model_truth_cifar10h.csv``
            (columns 0-9 human counts, columns 10-19 model probabilities, last
            column the label, which is shifted down by one). Any other value
            selects ``{model_name}.csv`` (column 0 label, columns 1-10 human
            counts, remaining columns model probabilities).
        num_points: number of leading rows to keep.

    Returns:
        Tuple ``(human_counts, model_probs, true_labels)``; labels are cast to int.
    """
    # The notebook runs from the `experiments` folder, so data lives one level up.
    base_dir = '..'
    legacy_layout = model_name == 'r_low_acc'

    if legacy_layout:
        csv_path = os.path.join(base_dir, 'data/cifar10h/human_model_truth_cifar10h.csv')
    else:
        csv_path = os.path.join(base_dir, f'data/cifar10h/{model_name}.csv')
    table = np.genfromtxt(csv_path, delimiter=',')

    if legacy_layout:
        human_counts = table[:num_points, :10]
        model_probs = table[:num_points, 10:20]
        # This file stores labels as 1-10; shift them so they start at zero.
        labels = table[:num_points, -1] - 1
    else:
        labels = table[:num_points, 0]
        human_counts = table[:num_points, 1:11]
        model_probs = table[:num_points, 11:]

    return human_counts, model_probs, labels.astype(int)


def _run_experiment(y_h=None, model_probs=None, y_true=None, **kwargs):
    """Evaluate the MAP combiners over repeated train/test splits and write the results to CSV.

    For every run the data is split, both combiners ('MAP_CI' and 'uncal_MAP_CI') are
    fitted on (at most 5000) training points, and accuracy plus every calibration metric
    is computed on the test part. A ``TSCalibratorMAP`` is additionally fitted on the
    training combination so the combined probabilities can be scored after calibration.

    Args:
        y_h: human predictions, one per datapoint.
        model_probs: model outputs, one row per datapoint.
        y_true: ground-truth labels.
        **kwargs: optional settings -- ``seed`` (0), ``n_runs`` (25), ``test_size`` (0.3),
            ``calibration_methods`` (['none']; only used to label the accuracy columns),
            ``calibration_metrics`` ({'ECE': get_ECE}; maps metric name -> fn(probs, y)),
            ``output_file_acc`` ('./acc.csv'), ``output_file_calibration`` ('./cal.csv').

    Side effects:
        Writes ``output_file_acc`` (one row per run) and ``output_file_calibration``
        (one row per run, combiner and metric).
    """
    seed = kwargs.pop('seed', 0)
    n_runs = kwargs.pop('n_runs', 25)
    test_size = kwargs.pop('test_size', 0.3)
    calibration_methods = kwargs.pop('calibration_methods', ['none'])
    calibration_metrics = kwargs.pop('calibration_metrics', {'ECE': get_ECE})
    output_file_acc = kwargs.pop('output_file_acc', './acc.csv')
    output_file_calibration = kwargs.pop('output_file_calibration', './cal.csv')

    acc_data = []
    cal_data = []
    for i in tqdm(range(n_runs), leave=False, desc='Runs'):
        # Train/test split.
        # NOTE(review): random_state is i * seed, so run 0 always uses state 0 and a
        # seed of 0 makes every run use the same split.
        y_h_tr, y_h_te, model_probs_tr, model_probs_te, y_true_tr, y_true_te = train_test_split(
            y_h, model_probs, y_true, test_size=test_size, random_state=i * seed)

        # Limit to 5k datapoints
        y_h_tr = y_h_tr[:5000]
        model_probs_tr = model_probs_tr[:5000, :]
        y_true_tr = y_true_tr[:5000]

        acc_h = get_acc(y_h_te, y_true_te)
        acc_m = get_acc(np.argmax(model_probs_te, axis=1), y_true_te)

        _acc_data = [acc_h, acc_m]
        _cal_data = []
        DIAG_ACC = 0.75
        MU_BETA = 0.5
        SIGMA_BETA = 0.5
        combiners = {'MAP_CI': MAPOracleCombiner(diag_acc=DIAG_ACC, mu_beta=MU_BETA, sigma_beta=SIGMA_BETA),
                     'uncal_MAP_CI': MAPOracleCombiner(diag_acc=DIAG_ACC, mu_beta=MU_BETA, sigma_beta=SIGMA_BETA)}
        for combiner_name, combiner in combiners.items():
            combiner.fit(model_probs_tr, y_h_tr, y_true_tr)
            if combiner_name == 'uncal_MAP_CI':
                # Overwrite the fitted temperature with 1 to get the uncalibrated variant.
                combiner.calibrator.temperature = 1

            y_comb_te = combiner.combine(model_probs_te, y_h_te)
            acc_comb = get_acc(y_comb_te, y_true_te)
            _acc_data.append(acc_comb)

            model_probs_calibrated_te = combiner.calibrate(model_probs_te)
            y_comb_prob_te = combiner.combine_proba(model_probs_te, y_h_te)

            # ----- Calibrate combination
            ts_calibrator = TSCalibratorMAP()
            comb_probs_tr = combiner.combine_proba(model_probs_tr, y_h_tr)
            # Clip before the log so zero probabilities do not produce -inf.
            comb_logits_tr = np.log(np.clip(comb_probs_tr, 1e-50, 1))
            ts_calibrator.fit(comb_logits_tr, y_true_tr)
            y_comb_prob_te_calibrated = ts_calibrator.calibrate(y_comb_prob_te)

            for metric, fxn in calibration_metrics.items():
                cal_m = fxn(model_probs_calibrated_te, y_true_te)
                cal_comb = fxn(y_comb_prob_te, y_true_te)
                cal_comb_calibrated = fxn(y_comb_prob_te_calibrated, y_true_te)
                _cal_data.append([combiner_name, metric, cal_m, cal_comb, cal_comb_calibrated])

        # Record the run once, after ALL combiners have been evaluated. Doing this inside
        # the combiner loop wrote the accuracy row once per combiner and re-appended the
        # first combiner's calibration rows together with the second's.
        acc_data += [_acc_data]
        cal_data += _cal_data

    # Save data to CSV
    # NOTE(review): the accuracy columns are labelled from calibration_methods, while the
    # values are one per combiner ('MAP_CI', 'uncal_MAP_CI') -- confirm the labels.
    header_acc = ['human', 'model'] + [f'comb {cal_m}' for cal_m in calibration_methods]
    with open(output_file_acc, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header_acc)
        writer.writerows(acc_data)
    header_cal = ['calibration method', 'metric', 'model', 'comb', 'comb (post cal)']
    with open(output_file_calibration, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header_cal)
        writer.writerows(cal_data)


def run_experiment_cifar10(out_fpath=None, experiment_args=None, seed=0):
    """Run the combination experiment on the first 15 CIFAR-10H points for each model.

    Args:
        out_fpath: path prefix for the result files; file names are appended to it by
            plain string concatenation, so it should end with a path separator.
        experiment_args: settings forwarded to ``_run_experiment`` as keyword arguments.
            The dict is not modified; ``None`` is treated as an empty dict.
        seed: seed passed to ``simulate_single_human``.

    Raises:
        AssertionError: if either output file for a model already exists.
    """
    model_names = [ 'resnet-110', 'densenet-bc-L190-k40']
    for model_name in tqdm(model_names, desc='Models', leave=True):
        # Specify output files
        output_file_acc = out_fpath + f'{model_name}_accuracy_10.csv'
        output_file_calibration = out_fpath + f'{model_name}_calibration_10.csv'
        # Refuse to overwrite results from an earlier run.
        assert not os.path.exists(output_file_acc), 'Output filepath already exists'
        assert not os.path.exists(output_file_calibration), 'Output filepath already exists'

        # Work on a per-model copy so the caller's dict is left untouched, and so the
        # default experiment_args=None does not fail on item assignment.
        run_args = dict(experiment_args or {})
        run_args['output_file_acc'] = output_file_acc
        run_args['output_file_calibration'] = output_file_calibration

        # Load data
        human_counts, model_probs, y_true = load_CIFAR10H(model_name, num_points = 15)
        y_h = simulate_single_human(human_counts, seed=seed)

        _run_experiment(y_h=y_h, model_probs=model_probs, y_true=y_true, **run_args)

if __name__ == '__main__':
    # Seed the torch and NumPy global RNGs.
    seed = 9658
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Forwarded to the experiment through `args` below.
    calibration_methods = ['none', 'confusion']
    # The bare string literal below holds an alternative metric dictionary; because it is
    # only a string expression, none of it is executed.
    """
    calibration_metrics = {'ECE width': lambda probs, y: get_ECE(probs, y, mode='width'),
                           'ECE mass': lambda probs, y: get_ECE(probs, y, mode='mass'),
                           'cwECE thresh width': lambda probs, y: get_cw_ECE(probs, y, mode='width'),
                           'cwECE thresh mass': lambda probs, y: get_cw_ECE(probs, y, mode='mass'),
                           'cwECE nothresh width': lambda probs, y: get_cw_ECE(probs, y, mode='width',
                                                                               threshold_mode=None),
                           'cwECE nothresh mass': lambda probs, y: get_cw_ECE(probs, y, mode='mass',
                                                                              threshold_mode=None),
                           'kumar MCE': get_MCE,
                           'kumar MCE (bin)': lambda probs, y: cal.get_binning_ce(probs, y,
                                                                                  p=1, debias=False, mode='marginal'),
                           'kumar MCE (scale)': lambda probs, y: cal.lower_bound_scaling_ce(probs, y,
                                                                                            p=1, debias=False,
                                                                                            mode='marginal'),
                           'kumar ECE': cal.get_ece}
    """
    # Metrics actually used. Key suffixes mirror the arguments passed below:
    # W / M = mode 'width' / 'mass'; NT = threshold_mode=None; T = threshold_mode left at its default.
    calibration_metrics = {'ECE (W)': lambda probs, y: get_ECE(probs, y, mode='width'),
                           'ECE (M)': lambda probs, y: get_ECE(probs, y, mode='mass'),
                           'cwECE (WT)': lambda probs, y: get_cw_ECE(probs, y, mode='width'),
                           'cwECE (MT)': lambda probs, y: get_cw_ECE(probs, y, mode='mass'),
                           'cwECE (WNT)': lambda probs, y: get_cw_ECE(probs, y, mode='width', threshold_mode=None),
                           'cwECE (MNT)': lambda probs, y: get_cw_ECE(probs, y, mode='mass', threshold_mode=None),
                           'NLL': get_NLL}

    # Settings handed to run_experiment_cifar10 as `experiment_args`.
    args = {'n_runs': 25,
            'test_size': 0.3,
            'calibration_methods': calibration_methods,
            'calibration_metrics': calibration_metrics,
            'seed': seed
            }

    # Output path prefix; result file names are appended to it by string concatenation,
    # so the trailing slash is required.
    out_fpath = './output/cifar10h/final/calibrate_comb_MAP/'
    run_experiment_cifar10(out_fpath=out_fpath, experiment_args=args, seed=seed)    


Models:   0%|          | 0/2 [00:00<?, ?it/s]

Runs:   0%|          | 0/25 [00:00<?, ?it/s]

Runs:   0%|          | 0/25 [00:00<?, ?it/s]

In [25]:
# Running Calibration Combo Experiment for 7143 data points, meaning 5000 training data points
# import sys
# sys.path.insert(0, '../')
from data_utils import *
from utils import *
from combination_methods import *
from tqdm.auto import tqdm
import torch
from sklearn.model_selection import train_test_split
from metrics import *
import csv
import numpy as np
import os
from calibrators import *

# Generates the data for Table 2 (and Appendix D) in our paper.

def load_CIFAR10H(model_name, num_points = 10000):
    """Read CIFAR-10H human counts, model probabilities and true labels from CSV.

    The file is looked up under ``../data/cifar10h/`` relative to the current
    working directory, and only the first ``num_points`` rows are returned.

    Args:
        model_name: ``'r_low_acc'`` selects ``human_model_truth_cifar10h.csv``
            (columns 0-9 human counts, columns 10-19 model probabilities, last
            column the label, which is shifted down by one). Any other value
            selects ``{model_name}.csv`` (column 0 label, columns 1-10 human
            counts, remaining columns model probabilities).
        num_points: number of leading rows to keep.

    Returns:
        Tuple ``(human_counts, model_probs, true_labels)``; labels are cast to int.
    """
    # The notebook runs from the `experiments` folder, so data lives one level up.
    base_dir = '..'
    legacy_layout = model_name == 'r_low_acc'

    if legacy_layout:
        csv_path = os.path.join(base_dir, 'data/cifar10h/human_model_truth_cifar10h.csv')
    else:
        csv_path = os.path.join(base_dir, f'data/cifar10h/{model_name}.csv')
    table = np.genfromtxt(csv_path, delimiter=',')

    if legacy_layout:
        human_counts = table[:num_points, :10]
        model_probs = table[:num_points, 10:20]
        # This file stores labels as 1-10; shift them so they start at zero.
        labels = table[:num_points, -1] - 1
    else:
        labels = table[:num_points, 0]
        human_counts = table[:num_points, 1:11]
        model_probs = table[:num_points, 11:]

    return human_counts, model_probs, labels.astype(int)


def _run_experiment(y_h=None, model_probs=None, y_true=None, **kwargs):
    """Evaluate the MAP combiners over repeated train/test splits and write the results to CSV.

    For every run the data is split, both combiners ('MAP_CI' and 'uncal_MAP_CI') are
    fitted on the full training part, and accuracy plus every calibration metric is
    computed on the test part. A ``TSCalibratorMAP`` is additionally fitted on the
    training combination so the combined probabilities can be scored after calibration.

    Args:
        y_h: human predictions, one per datapoint.
        model_probs: model outputs, one row per datapoint.
        y_true: ground-truth labels.
        **kwargs: optional settings -- ``seed`` (0), ``n_runs`` (25), ``test_size`` (0.3),
            ``calibration_methods`` (['none']; only used to label the accuracy columns),
            ``calibration_metrics`` ({'ECE': get_ECE}; maps metric name -> fn(probs, y)),
            ``output_file_acc`` ('./acc.csv'), ``output_file_calibration`` ('./cal.csv').

    Side effects:
        Writes ``output_file_acc`` (one row per run) and ``output_file_calibration``
        (one row per run, combiner and metric).
    """
    seed = kwargs.pop('seed', 0)
    n_runs = kwargs.pop('n_runs', 25)
    test_size = kwargs.pop('test_size', 0.3)
    calibration_methods = kwargs.pop('calibration_methods', ['none'])
    calibration_metrics = kwargs.pop('calibration_metrics', {'ECE': get_ECE})
    output_file_acc = kwargs.pop('output_file_acc', './acc.csv')
    output_file_calibration = kwargs.pop('output_file_calibration', './cal.csv')

    acc_data = []
    cal_data = []
    for i in tqdm(range(n_runs), leave=False, desc='Runs'):
        # Train/test split. No cap is applied to the training part in this variant.
        # NOTE(review): random_state is i * seed, so run 0 always uses state 0 and a
        # seed of 0 makes every run use the same split.
        y_h_tr, y_h_te, model_probs_tr, model_probs_te, y_true_tr, y_true_te = train_test_split(
            y_h, model_probs, y_true, test_size=test_size, random_state=i * seed)

        acc_h = get_acc(y_h_te, y_true_te)
        acc_m = get_acc(np.argmax(model_probs_te, axis=1), y_true_te)

        _acc_data = [acc_h, acc_m]
        _cal_data = []
        DIAG_ACC = 0.75
        MU_BETA = 0.5
        SIGMA_BETA = 0.5
        combiners = {'MAP_CI': MAPOracleCombiner(diag_acc=DIAG_ACC, mu_beta=MU_BETA, sigma_beta=SIGMA_BETA),
                     'uncal_MAP_CI': MAPOracleCombiner(diag_acc=DIAG_ACC, mu_beta=MU_BETA, sigma_beta=SIGMA_BETA)}
        for combiner_name, combiner in combiners.items():
            combiner.fit(model_probs_tr, y_h_tr, y_true_tr)
            if combiner_name == 'uncal_MAP_CI':
                # Overwrite the fitted temperature with 1 to get the uncalibrated variant.
                combiner.calibrator.temperature = 1

            y_comb_te = combiner.combine(model_probs_te, y_h_te)
            acc_comb = get_acc(y_comb_te, y_true_te)
            _acc_data.append(acc_comb)

            model_probs_calibrated_te = combiner.calibrate(model_probs_te)
            y_comb_prob_te = combiner.combine_proba(model_probs_te, y_h_te)

            # ----- Calibrate combination
            ts_calibrator = TSCalibratorMAP()
            comb_probs_tr = combiner.combine_proba(model_probs_tr, y_h_tr)
            # Clip before the log so zero probabilities do not produce -inf.
            comb_logits_tr = np.log(np.clip(comb_probs_tr, 1e-50, 1))
            ts_calibrator.fit(comb_logits_tr, y_true_tr)
            y_comb_prob_te_calibrated = ts_calibrator.calibrate(y_comb_prob_te)

            for metric, fxn in calibration_metrics.items():
                cal_m = fxn(model_probs_calibrated_te, y_true_te)
                cal_comb = fxn(y_comb_prob_te, y_true_te)
                cal_comb_calibrated = fxn(y_comb_prob_te_calibrated, y_true_te)
                _cal_data.append([combiner_name, metric, cal_m, cal_comb, cal_comb_calibrated])

        # Record the run once, after ALL combiners have been evaluated. Doing this inside
        # the combiner loop wrote the accuracy row once per combiner and re-appended the
        # first combiner's calibration rows together with the second's.
        acc_data += [_acc_data]
        cal_data += _cal_data

    # Save data to CSV
    # NOTE(review): the accuracy columns are labelled from calibration_methods, while the
    # values are one per combiner ('MAP_CI', 'uncal_MAP_CI') -- confirm the labels.
    header_acc = ['human', 'model'] + [f'comb {cal_m}' for cal_m in calibration_methods]
    with open(output_file_acc, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header_acc)
        writer.writerows(acc_data)
    header_cal = ['calibration method', 'metric', 'model', 'comb', 'comb (post cal)']
    with open(output_file_calibration, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header_cal)
        writer.writerows(cal_data)


def run_experiment_cifar10(out_fpath=None, experiment_args=None, seed=0):
    """Run the combination experiment on the first 7143 CIFAR-10H points for each model.

    Args:
        out_fpath: path prefix for the result files; file names are appended to it by
            plain string concatenation, so it should end with a path separator.
        experiment_args: settings forwarded to ``_run_experiment`` as keyword arguments.
            The dict is not modified; ``None`` is treated as an empty dict.
        seed: seed passed to ``simulate_single_human``.

    Raises:
        AssertionError: if either output file for a model already exists.
    """
    model_names = [ 'resnet-110', 'densenet-bc-L190-k40']
    for model_name in tqdm(model_names, desc='Models', leave=True):
        # Specify output files
        output_file_acc = out_fpath + f'{model_name}_accuracy_5000.csv'
        output_file_calibration = out_fpath + f'{model_name}_calibration_5000.csv'
        # Refuse to overwrite results from an earlier run.
        assert not os.path.exists(output_file_acc), 'Output filepath already exists'
        assert not os.path.exists(output_file_calibration), 'Output filepath already exists'

        # Work on a per-model copy so the caller's dict is left untouched, and so the
        # default experiment_args=None does not fail on item assignment.
        run_args = dict(experiment_args or {})
        run_args['output_file_acc'] = output_file_acc
        run_args['output_file_calibration'] = output_file_calibration

        # Load data
        human_counts, model_probs, y_true = load_CIFAR10H(model_name, num_points = 7143)
        y_h = simulate_single_human(human_counts, seed=seed)

        _run_experiment(y_h=y_h, model_probs=model_probs, y_true=y_true, **run_args)

if __name__ == '__main__':
    # Seed the torch and NumPy global RNGs.
    seed = 9658
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Forwarded to the experiment through `args` below.
    calibration_methods = ['none', 'confusion']
    # The bare string literal below holds an alternative metric dictionary; because it is
    # only a string expression, none of it is executed.
    """
    calibration_metrics = {'ECE width': lambda probs, y: get_ECE(probs, y, mode='width'),
                           'ECE mass': lambda probs, y: get_ECE(probs, y, mode='mass'),
                           'cwECE thresh width': lambda probs, y: get_cw_ECE(probs, y, mode='width'),
                           'cwECE thresh mass': lambda probs, y: get_cw_ECE(probs, y, mode='mass'),
                           'cwECE nothresh width': lambda probs, y: get_cw_ECE(probs, y, mode='width',
                                                                               threshold_mode=None),
                           'cwECE nothresh mass': lambda probs, y: get_cw_ECE(probs, y, mode='mass',
                                                                              threshold_mode=None),
                           'kumar MCE': get_MCE,
                           'kumar MCE (bin)': lambda probs, y: cal.get_binning_ce(probs, y,
                                                                                  p=1, debias=False, mode='marginal'),
                           'kumar MCE (scale)': lambda probs, y: cal.lower_bound_scaling_ce(probs, y,
                                                                                            p=1, debias=False,
                                                                                            mode='marginal'),
                           'kumar ECE': cal.get_ece}
    """
    # Metrics actually used. Key suffixes mirror the arguments passed below:
    # W / M = mode 'width' / 'mass'; NT = threshold_mode=None; T = threshold_mode left at its default.
    calibration_metrics = {'ECE (W)': lambda probs, y: get_ECE(probs, y, mode='width'),
                           'ECE (M)': lambda probs, y: get_ECE(probs, y, mode='mass'),
                           'cwECE (WT)': lambda probs, y: get_cw_ECE(probs, y, mode='width'),
                           'cwECE (MT)': lambda probs, y: get_cw_ECE(probs, y, mode='mass'),
                           'cwECE (WNT)': lambda probs, y: get_cw_ECE(probs, y, mode='width', threshold_mode=None),
                           'cwECE (MNT)': lambda probs, y: get_cw_ECE(probs, y, mode='mass', threshold_mode=None),
                           'NLL': get_NLL}

    # Settings handed to run_experiment_cifar10 as `experiment_args`.
    args = {'n_runs': 25,
            'test_size': 0.3,
            'calibration_methods': calibration_methods,
            'calibration_metrics': calibration_metrics,
            'seed': seed
            }

    # Output path prefix; result file names are appended to it by string concatenation,
    # so the trailing slash is required.
    out_fpath = './output/cifar10h/final/calibrate_comb_MAP/'
    run_experiment_cifar10(out_fpath=out_fpath, experiment_args=args, seed=seed)    


Models:   0%|          | 0/2 [00:00<?, ?it/s]

Runs:   0%|          | 0/25 [00:00<?, ?it/s]

Runs:   0%|          | 0/25 [00:00<?, ?it/s]

In [26]:
import pandas as pd

In [27]:
# Load the accuracy and calibration CSVs written by the 15-point run (files suffixed `_10`).
# NOTE(review): `densener_calib_10` is spelled with an 'r' instead of 'densenet'; the name is
# left as-is because other cells may refer to it.
resnet_110_acc_10 = pd.read_csv("output/cifar10h/final/calibrate_comb_MAP/resnet-110_accuracy_10.csv")
resnet_110_calib_10 = pd.read_csv("output/cifar10h/final/calibrate_comb_MAP/resnet-110_calibration_10.csv")
densenet_acc_10 = pd.read_csv("output/cifar10h/final/calibrate_comb_MAP/densenet-bc-L190-k40_accuracy_10.csv")
densener_calib_10 = pd.read_csv("output/cifar10h/final/calibrate_comb_MAP/densenet-bc-L190-k40_calibration_10.csv")

In [28]:
# Load the accuracy and calibration CSVs written by the 7143-point run (files suffixed `_5000`).
# NOTE(review): `densener_calib_5000` is spelled with an 'r' instead of 'densenet'; the name is
# left as-is because other cells may refer to it.
resnet_110_acc_5000 = pd.read_csv("output/cifar10h/final/calibrate_comb_MAP/resnet-110_accuracy_5000.csv")
resnet_110_calib_5000 = pd.read_csv("output/cifar10h/final/calibrate_comb_MAP/resnet-110_calibration_5000.csv")
densenet_acc_5000 = pd.read_csv("output/cifar10h/final/calibrate_comb_MAP/densenet-bc-L190-k40_accuracy_5000.csv")
densener_calib_5000 = pd.read_csv("output/cifar10h/final/calibrate_comb_MAP/densenet-bc-L190-k40_calibration_5000.csv")

In [265]:
resnet_110_calib_5000.head()

Unnamed: 0,calibration method,metric,model,comb,comb (post cal)
0,MAP_CI,ECE (W),0.027257,0.016001,0.010947
1,MAP_CI,ECE (M),0.023254,0.015391,0.009863
2,MAP_CI,cwECE (WT),0.054972,0.032658,0.032106
3,MAP_CI,cwECE (MT),0.020239,0.015805,0.013735
4,MAP_CI,cwECE (WNT),0.008371,0.004828,0.004454


In [277]:
metric_list = resnet_110_calib_5000['metric'].unique()
metric_list

array(['ECE (W)', 'ECE (M)', 'cwECE (WT)', 'cwECE (MT)', 'cwECE (WNT)',
       'cwECE (MNT)', 'NLL'], dtype=object)

In [294]:
# For each metric, print mean and standard deviation over the MAP_CI rows:
# model alone, combination, and combination after calibration (one line each).
is_map_ci = resnet_110_calib_5000['calibration method'] == 'MAP_CI'
for metric in metric_list:
    temp = resnet_110_calib_5000[(resnet_110_calib_5000['metric'] == metric) & is_map_ci]
    print(f'for metric: {metric}:')
    for column in ('model', 'comb', 'comb (post cal)'):
        print(np.mean(temp[column]), np.std(temp[column]))

for metric: ECE (W):
0.023479307405035135 0.003323274150384892
0.014402669163155824 0.0022186070298601352
0.010253647237662922 0.001892225814813155
for metric: ECE (M):
0.019278404403619304 0.00385864418198119
0.01311660098231819 0.002022602978562105
0.00801715734647348 0.0020518758168307292
for metric: cwECE (WT):
0.0490507852613085 0.0031367310566982553
0.029718415010940948 0.0025818373139331685
0.02925931548597067 0.0021728506404883894
for metric: cwECE (MT):
0.020149754355828407 0.0025845763166015786
0.012246509015368867 0.0021685907509438854
0.011057281720049167 0.002621046919650429
for metric: cwECE (WNT):
0.007505674287959638 0.0005426299378158303
0.004209919080123847 0.0003736351575321343
0.003882420858063414 0.0003119711376672349
for metric: cwECE (MNT):
0.0033861665572550255 0.0005645050719641251
0.001518527817454947 0.00035871435434804233
0.0015214601931633881 0.0003702068567895458
for metric: NLL:
0.21033667698589775 0.011996516712305796
0.09965567736440867 0.01048364209770

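### Combining the model with human labels and then calibrating generally lowers the mean calibration error; the spread (standard deviation) also shrinks for most metrics, though not all (e.g. cwECE (MNT) is essentially unchanged by the final calibration step)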

### Running calibrate_experiment.py
* Various calibration methods were used and the model was combined with human inferences
* The resultant models from combination of calibrated models had higher accuracy
* This case corresponds to Calibrated Probabilities and Human Label Combinations, out of the three listed in the paper.

In [30]:
# running calibration_experiment.py
# Generates the data for Appendix C in the paper.
import sys
sys.path.insert(0, '../')
from data_utils import *
from utils import *
from combination_methods import *
from tqdm.auto import tqdm
import torch
from sklearn.model_selection import train_test_split
from metrics import *
import csv
import numpy as np
import os
def load_CIFAR10H(model_name, num_points = 10000):
    """Read CIFAR-10H human counts, model probabilities and true labels from CSV.

    The file is looked up under ``../data/cifar10h/`` relative to the current
    working directory, and only the first ``num_points`` rows are returned.

    Args:
        model_name: ``'r_low_acc'`` selects ``human_model_truth_cifar10h.csv``
            (columns 0-9 human counts, columns 10-19 model probabilities, last
            column the label, which is shifted down by one). Any other value
            selects ``{model_name}.csv`` (column 0 label, columns 1-10 human
            counts, remaining columns model probabilities).
        num_points: number of leading rows to keep.

    Returns:
        Tuple ``(human_counts, model_probs, true_labels)``; labels are cast to int.
    """
    # The notebook runs from the `experiments` folder, so data lives one level up.
    base_dir = '..'
    legacy_layout = model_name == 'r_low_acc'

    if legacy_layout:
        csv_path = os.path.join(base_dir, 'data/cifar10h/human_model_truth_cifar10h.csv')
    else:
        csv_path = os.path.join(base_dir, f'data/cifar10h/{model_name}.csv')
    table = np.genfromtxt(csv_path, delimiter=',')

    if legacy_layout:
        human_counts = table[:num_points, :10]
        model_probs = table[:num_points, 10:20]
        # This file stores labels as 1-10; shift them so they start at zero.
        labels = table[:num_points, -1] - 1
    else:
        labels = table[:num_points, 0]
        human_counts = table[:num_points, 1:11]
        model_probs = table[:num_points, 11:]

    return human_counts, model_probs, labels.astype(int)



def _run_experiment(y_h=None, model_probs=None, y_true=None, **kwargs):
    """Compare calibration methods over repeated train/test splits and write the results to CSV.

    For every run and every calibration method a combiner is fitted on the training
    part (``DoubleConfusionCombiner`` for 'confusion', otherwise ``OracleCombiner``
    with that calibration method); accuracy and each calibration metric are then
    computed on the test part.

    Args:
        y_h: human predictions, one per datapoint.
        model_probs: model outputs, one row per datapoint.
        y_true: ground-truth labels.
        **kwargs: optional settings -- ``seed`` (0), ``n_runs`` (25), ``test_size`` (0.375),
            ``calibration_methods`` (['none']),
            ``calibration_metrics`` ({'ECE': get_ECE}; maps metric name -> fn(probs, y)),
            ``output_file_acc`` ('./acc.csv'), ``output_file_calibration`` ('./cal.csv').

    Side effects:
        Writes ``output_file_acc`` (one row per run) and ``output_file_calibration``
        (one row per run, calibration method and metric).
    """
    seed = kwargs.get('seed', 0)
    n_runs = kwargs.get('n_runs', 25)
    test_size = kwargs.get('test_size', 0.375)
    calibration_methods = kwargs.get('calibration_methods', ['none'])
    calibration_metrics = kwargs.get('calibration_metrics', {'ECE': get_ECE})
    output_file_acc = kwargs.get('output_file_acc', './acc.csv')
    output_file_calibration = kwargs.get('output_file_calibration', './cal.csv')

    def _write_csv(path, header, rows):
        # Header line first, then one CSV line per collected row.
        with open(path, 'w', newline='') as handle:
            out = csv.writer(handle)
            out.writerow(header)
            out.writerows(rows)

    acc_data, cal_data = [], []
    for run_idx in tqdm(range(n_runs), leave=False, desc='Runs'):
        # Fresh split per run; the random state is derived from the run index and the seed.
        split = train_test_split(
            y_h, model_probs, y_true, test_size=test_size, random_state=run_idx * seed)
        y_h_tr, y_h_te, model_probs_tr, model_probs_te, y_true_tr, y_true_te = split

        human_acc = get_acc(y_h_te, y_true_te)
        model_acc = get_acc(np.argmax(model_probs_te, axis=1), y_true_te)
        run_acc = [human_acc, model_acc]
        run_cal = []

        for calibration_method in calibration_methods:
            if calibration_method != 'confusion':
                combiner = OracleCombiner(calibration_method=calibration_method)
            else:
                combiner = DoubleConfusionCombiner()
            combiner.fit(model_probs_tr, y_h_tr, y_true_tr)

            # Accuracy of the combined hard predictions on the test part.
            run_acc.append(get_acc(combiner.combine(model_probs_te, y_h_te), y_true_te))

            calibrated_te = combiner.calibrate(model_probs_te)
            combined_proba_te = combiner.combine_proba(model_probs_te, y_h_te)
            for metric, fxn in calibration_metrics.items():
                model_score = fxn(calibrated_te, y_true_te)
                comb_score = fxn(combined_proba_te, y_true_te)
                run_cal.append([calibration_method, metric, model_score, comb_score])

        acc_data.append(run_acc)
        cal_data.extend(run_cal)

    # Save data to CSV
    _write_csv(output_file_acc,
               ['human', 'model'] + [f'comb {cal_m}' for cal_m in calibration_methods],
               acc_data)
    _write_csv(output_file_calibration,
               ['calibration method', 'metric', 'model', 'comb'],
               cal_data)


def run_experiment_cifar10(out_fpath=None, experiment_args=None, seed=0):
    """Run the combination experiment once per CIFAR-10H model (single simulated human).

    For each model name this loads 8000 points with `load_CIFAR10H`, draws one
    human label per point with `simulate_single_human`, and forwards everything
    to `_run_experiment`, which writes `<model>_accuracy.csv` and
    `<model>_calibration.csv` into `out_fpath`.

    Args:
        out_fpath: Output directory. Created if missing; a trailing slash is optional.
        experiment_args: Keyword arguments forwarded to `_run_experiment`.
            The mapping is copied, so the caller's dict is left untouched.
        seed: Seed handed to `simulate_single_human`.

    Raises:
        AssertionError: If one of the output files already exists (guards
            against silently overwriting earlier results).
    """
    # model_names = ['r_low_acc', 'resnet-110', 'preresnet-110', 'densenet-bc-L190-k40']
    model_names = ['resnet-110', 'densenet-bc-L190-k40']
    if out_fpath:
        # The CSV writer cannot create missing parent directories itself.
        os.makedirs(out_fpath, exist_ok=True)
    for model_name in tqdm(model_names, desc='Models', leave=True):
        # Specify output files
        output_file_acc = os.path.join(out_fpath, f'{model_name}_accuracy.csv')
        output_file_calibration = os.path.join(out_fpath, f'{model_name}_calibration.csv')
        assert not os.path.exists(output_file_acc), 'Output filepath already exists'
        assert not os.path.exists(output_file_calibration), 'Output filepath already exists'

        # Work on a per-model copy so the caller's arguments are never mutated.
        run_args = dict(experiment_args or {})
        run_args['output_file_acc'] = output_file_acc
        run_args['output_file_calibration'] = output_file_calibration

        # Load data
        human_counts, model_probs, y_true = load_CIFAR10H(model_name, num_points = 8000)
        y_h = simulate_single_human(human_counts, seed=seed)

        _run_experiment(y_h=y_h, model_probs=model_probs, y_true=y_true, **run_args)


if __name__ == '__main__':
    # Seed torch and NumPy's global RNG before anything stochastic runs.
    seed = 9658
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Names of the calibration/combination methods, forwarded to `_run_experiment` through `args`.
    calibration_methods = ['none', 'confusion', 'temperature scaling', 'ensemble temperature scaling', 'imax binning']
    # NOTE: the triple-quoted block below is a bare string expression holding an
    # earlier, longer metric dictionary; it is never assigned and has no effect.
    """
    calibration_metrics = {'ECE width': lambda probs, y: get_ECE(probs, y, mode='width'),
                           'ECE mass': lambda probs, y: get_ECE(probs, y, mode='mass'),
                           'cwECE thresh width': lambda probs, y: get_cw_ECE(probs, y, mode='width'),
                           'cwECE thresh mass': lambda probs, y: get_cw_ECE(probs, y, mode='mass'),
                           'cwECE nothresh width': lambda probs, y: get_cw_ECE(probs, y, mode='width',
                                                                               threshold_mode=None),
                           'cwECE nothresh mass': lambda probs, y: get_cw_ECE(probs, y, mode='mass',
                                                                              threshold_mode=None),
                           'kumar MCE': get_MCE,
                           'kumar MCE (bin)': lambda probs, y: cal.get_binning_ce(probs, y,
                                                                                  p=1, debias=False, mode='marginal'),
                           'kumar MCE (scale)': lambda probs, y: cal.lower_bound_scaling_ce(probs, y,
                                                                                            p=1, debias=False,
                                                                                            mode='marginal'),
                           'kumar ECE': cal.get_ece}
    """
    # Metric label -> callable(probs, y). In the labels, W/M correspond to
    # mode='width'/'mass' and NT to threshold_mode=None (T keeps the default).
    calibration_metrics = {'ECE (W)': lambda probs, y: get_ECE(probs, y, mode='width'),
                           'ECE (M)': lambda probs, y: get_ECE(probs, y, mode='mass'),
                           'cwECE (WT)': lambda probs, y: get_cw_ECE(probs, y, mode='width'),
                           'cwECE (MT)': lambda probs, y: get_cw_ECE(probs, y, mode='mass'),
                           'cwECE (WNT)': lambda probs, y: get_cw_ECE(probs, y, mode='width', threshold_mode=None),
                           'cwECE (MNT)': lambda probs, y: get_cw_ECE(probs, y, mode='mass', threshold_mode=None),
                           'NLL': get_NLL}

    # Keyword arguments that `run_experiment_cifar10` forwards to `_run_experiment`.
    args = {'n_runs': 25,
            'test_size': 0.3,
            'calibration_methods': calibration_methods,
            'calibration_metrics': calibration_metrics,
            'seed': seed
            }

    # Directory prefix for the per-model CSV files (note the trailing slash).
    out_fpath = './output/cifar10h/final/fully_sup_CI/'
    run_experiment_cifar10(out_fpath=out_fpath, experiment_args=args, seed=seed)


Models:   0%|          | 0/2 [00:00<?, ?it/s]

Runs:   0%|          | 0/25 [00:00<?, ?it/s]

Runs:   0%|          | 0/25 [00:00<?, ?it/s]

In [298]:
resnet_acc1 = pd.read_csv("output/cifar10h/final/fully_sup_CI/resnet-110_accuracy.csv")
resnet_calib1 = pd.read_csv("output/cifar10h/final/fully_sup_CI/resnet-110_calibration.csv")

In [299]:
resnet_acc1.head()

Unnamed: 0,human,model,comb none,comb confusion,comb temperature scaling,comb ensemble temperature scaling,comb imax binning
0,0.955,0.94,0.968333,0.9525,0.972917,0.973333,0.973333
1,0.955417,0.9375,0.967083,0.954583,0.972083,0.972083,0.971667
2,0.948333,0.933333,0.965,0.944583,0.967917,0.9675,0.9675
3,0.95,0.944167,0.97,0.95,0.971667,0.971667,0.971667
4,0.953333,0.942917,0.972917,0.956667,0.974167,0.974167,0.9725


In [300]:
resnet_a1 = resnet_acc1.copy()

In [301]:
# Convert accuracies (fractions) into error rates in percent.
# Derived from the untouched `resnet_acc1` so that re-running this cell always
# yields the same values instead of transforming already-transformed data.
resnet_a1 = 100 - 100 * resnet_acc1

In [302]:
resnet_a1.head()

Unnamed: 0,human,model,comb none,comb confusion,comb temperature scaling,comb ensemble temperature scaling,comb imax binning
0,4.5,6.0,3.166667,4.75,2.708333,2.666667,2.666667
1,4.458333,6.25,3.291667,4.541667,2.791667,2.791667,2.833333
2,5.166667,6.666667,3.5,5.541667,3.208333,3.25,3.25
3,5.0,5.583333,3.0,5.0,2.833333,2.833333,2.833333
4,4.666667,5.708333,2.708333,4.333333,2.583333,2.583333,2.75


In [303]:
densenet_acc1 = pd.read_csv("output/cifar10h/final/fully_sup_CI/densenet-bc-L190-k40_accuracy.csv")
densenet_calib1 = pd.read_csv("output/cifar10h/final/fully_sup_CI/densenet-bc-L190-k40_calibration.csv")

In [304]:
densenet_acc1.head()

Unnamed: 0,human,model,comb none,comb confusion,comb temperature scaling,comb ensemble temperature scaling,comb imax binning
0,0.955,0.95875,0.972917,0.959167,0.97875,0.97875,0.979167
1,0.955417,0.969167,0.975833,0.965417,0.979167,0.9775,0.9775
2,0.948333,0.96875,0.974167,0.954167,0.974167,0.972917,0.972917
3,0.95,0.969167,0.977917,0.957917,0.978333,0.97625,0.97875
4,0.953333,0.96875,0.980833,0.962917,0.981667,0.982917,0.982083


In [305]:
densenet_a1 = densenet_acc1.copy()

In [306]:
# Convert accuracies (fractions) into error rates in percent.
# Derived from the untouched `densenet_acc1` so that re-running this cell always
# yields the same values instead of transforming already-transformed data.
densenet_a1 = 100 - 100 * densenet_acc1

In [307]:
densenet_a1.head()

Unnamed: 0,human,model,comb none,comb confusion,comb temperature scaling,comb ensemble temperature scaling,comb imax binning
0,4.5,4.125,2.708333,4.083333,2.125,2.125,2.083333
1,4.458333,3.083333,2.416667,3.458333,2.083333,2.25,2.25
2,5.166667,3.125,2.583333,4.583333,2.583333,2.708333,2.708333
3,5.0,3.083333,2.208333,4.208333,2.166667,2.375,2.125
4,4.666667,3.125,1.916667,3.708333,1.833333,1.708333,1.791667


In [308]:
col_names = ['model_name'] + list(resnet_a1.columns)

In [309]:
results = pd.DataFrame(columns = col_names)

In [310]:
results

Unnamed: 0,model_name,human,model,comb none,comb confusion,comb temperature scaling,comb ensemble temperature scaling,comb imax binning


In [311]:
# `DataFrame.append` was removed in pandas 2.0, so the summary row is built
# explicitly and attached with `pd.concat`. Each cell is "mean (+-) std".
resnet_row = ['ResNet-110'] + [
    str(np.mean(resnet_a1[col])) + ' (+-) ' + str(np.std(resnet_a1[col]))
    for col in resnet_a1.columns
]
results = pd.concat([results, pd.DataFrame([resnet_row], columns=col_names)], ignore_index=True)

In [312]:
# `DataFrame.append` was removed in pandas 2.0, so the summary row is built
# explicitly and attached with `pd.concat`. Each cell is "mean (+-) std".
densenet_row = ['DenseNet-BC'] + [
    str(np.mean(densenet_a1[col])) + ' (+-) ' + str(np.std(densenet_a1[col]))
    for col in densenet_a1.columns
]
results = pd.concat([results, pd.DataFrame([densenet_row], columns=col_names)], ignore_index=True)

In [313]:
results

Unnamed: 0,model_name,human,model,comb none,comb confusion,comb temperature scaling,comb ensemble temperature scaling,comb imax binning
0,ResNet-110,4.454999999999996 (+-) 0.3233762858068306,6.1833333333333345 (+-) 0.31950482521134765,2.963333333333334 (+-) 0.28487814158961794,4.646666666666671 (+-) 0.3415894221625338,2.6533333333333333 (+-) 0.25487469688282394,2.6433333333333326 (+-) 0.2645961282995499,2.6916666666666607 (+-) 0.24692553983381887
1,DenseNet-BC,4.454999999999996 (+-) 0.3233762858068306,3.361666666666666 (+-) 0.3248332905763582,2.341666666666671 (+-) 0.23363076281460357,3.6466666666666674 (+-) 0.3706450833044697,2.0833333333333344 (+-) 0.21245914639970206,2.1549999999999985 (+-) 0.21779194965226267,2.0716666666666668 (+-) 0.22746184051151735


## Yes, I was able to reproduce the results; the ordering of the methods is consistent with the paper.
* The **Hyper Parameters** used were:
    * the number of data points taken from the CIFAR-10H set for the combination
    * DIAG_ACC
    * MU-BETA
    * SIGMA_BETA

## Challenges Faced: Mainly that the files and folders were not arranged in a consistent order/directory structure.

## Question 2: Modelling Multiple Humans

### Approach Followed :    
* Each image in CIFAR10H has about 50 human labels (the count varies slightly, e.g. 49, 51 or 52 in some cases).
* Earlier, the final human label was selected after shuffling the labels.
* After shuffling, the class with more votes is more likely to appear at index 0.
* In the modified **approach**, I take the first n_humans labels (say 3) and choose the final class by majority vote.
* If there is no majority, the class is selected randomly among the tied classes.
* The simulate_multi_humans function contains the above logic; it takes the argument n_humans.

In [47]:
# running calibration_experiment.py
# Generates the data for Appendix C in the paper.

# This code is run for n_humans = 3
from data_utils import *

import sys
sys.path.insert(0, '../')

from utils import *
from combination_methods import *
from tqdm.auto import tqdm
import torch
from sklearn.model_selection import train_test_split
from metrics import *
import csv
import numpy as np
import os
def load_CIFAR10H(model_name, num_points = 10000):
    """Read human vote counts, model probabilities and true labels from a CIFAR-10H CSV.

    Only the first `num_points` rows of the file are returned. Two column
    layouts are handled:

    * 'r_low_acc': columns 0-9 are human counts, 10-19 model probabilities and
      the last column the (1-based) true label, which is shifted to 0-based.
    * any other model: column 0 is the true label, 1-10 human counts and the
      remaining columns model probabilities.

    Returns:
        Tuple `(human_counts, model_probs, true_labels)`; labels are cast to int.
    """
    # base_dir = os.path.dirname(__file__)
    base_dir = '..'
    low_acc_layout = model_name == 'r_low_acc'
    if low_acc_layout:
        csv_name = 'data/cifar10h/human_model_truth_cifar10h.csv'
    else:
        csv_name = f'data/cifar10h/{model_name}.csv'

    table = np.genfromtxt(os.path.join(base_dir, csv_name), delimiter=',')
    rows = table[:num_points]

    if low_acc_layout:
        human_counts, model_probs = rows[:, :10], rows[:, 10:20]
        true_labels = rows[:, -1]
        # Labels in this file run 1-10; shift them so they are zero-indexed.
        true_labels -= 1
    else:
        true_labels = rows[:, 0]
        human_counts, model_probs = rows[:, 1:11], rows[:, 11:]

    return human_counts, model_probs, true_labels.astype(int)

def simulate_multi_humans(human_counts, seed = 0, n_humans = 1):
    """Simulate the majority decision of a panel of `n_humans` labelers per row.

    Every row of `human_counts` holds per-class vote counts. The row is expanded
    into one label per vote and shuffled; the first `n_humans` labels form the
    panel. The panel's most frequent label is returned, ties being broken
    uniformly at random.

    Args:
        human_counts: 2-D array (rows = inputs, columns = classes) of vote counts.
        seed: Seed for `np.random.default_rng`.
        n_humans: Panel size; capped at the smallest total vote count of any row.

    Returns:
        1-D int array with one label per row (empty if there are no rows).

    Raises:
        ValueError: If the effective panel size is below 1, i.e. `n_humans < 1`
            or some row has no votes at all.
    """
    rng = np.random.default_rng(seed)
    n_rows, n_classes = human_counts.shape[0], human_counts.shape[1]
    if n_rows == 0:
        return np.zeros(0, dtype=int)

    min_human_labels = int(min(np.sum(human_counts, axis=1)))
    panel_size = min(n_humans, min_human_labels)
    if panel_size < 1:
        # Previously surfaced as an opaque "max() arg is an empty sequence".
        raise ValueError(
            f'need at least one human label per input (n_humans={n_humans}, '
            f'smallest number of labels in a row={min_human_labels})')

    final_decision = np.zeros(n_rows, dtype=int)
    for row in range(n_rows):
        # One entry per recorded vote, then shuffled to draw a random panel.
        votes = []
        for label in range(n_classes):
            votes += [label] * int(human_counts[row, label])
        rng.shuffle(votes)

        # Tally the panel; dict insertion order keeps tie candidates in draw order.
        tally = {}
        for label in votes[:panel_size]:
            tally[label] = tally.get(label, 0) + 1
        top_count = max(tally.values())
        ties = [label for label, count in tally.items() if count == top_count]
        rng.shuffle(ties)
        final_decision[row] = ties[0]
    return final_decision

def _run_experiment(y_h=None, model_probs=None, y_true=None, **kwargs):
    """Evaluate human/model combination over repeated train/test splits and save CSVs.

    For every run the data is split, each calibration method's combiner is fit
    on the training part, and accuracy plus calibration metrics are computed on
    the test part.

    Args:
        y_h: Human labels, one per data point.
        model_probs: Model class probabilities, one row per data point.
        y_true: Ground-truth labels.
        **kwargs: Optional settings, each with a default:
            seed (0), n_runs (25), test_size (0.375),
            calibration_methods (['none']),
            calibration_metrics ({'ECE': get_ECE}; name -> callable(probs, y)),
            output_file_acc ('./acc.csv'),
            output_file_calibration ('./cal.csv').

    Side effects:
        Writes the accuracy CSV (columns: human, model, one 'comb <method>' per
        method; one row per run) and the calibration CSV (columns: calibration
        method, metric, model, comb; one row per run/method/metric).
    """
    seed = kwargs.pop('seed', 0)
    n_runs = kwargs.pop('n_runs', 25)
    test_size = kwargs.pop('test_size', 0.375)
    calibration_methods = kwargs.pop('calibration_methods', ['none'])
    calibration_metrics = kwargs.pop('calibration_metrics', {'ECE': get_ECE})
    output_file_acc = kwargs.pop('output_file_acc', './acc.csv')
    output_file_calibration = kwargs.pop('output_file_calibration', './cal.csv')

    acc_data = []
    cal_data = []
    for i in tqdm(range(n_runs), leave=False, desc='Runs'):
        # `i * seed` is 0 for every run when seed == 0, which would repeat one
        # identical split n_runs times; fall back to the run index in that case.
        split_seed = i * seed if seed else i

        # Train/test split
        y_h_tr, y_h_te, model_probs_tr, model_probs_te, y_true_tr, y_true_te = train_test_split(
            y_h, model_probs, y_true, test_size=test_size, random_state=split_seed)

        acc_h = get_acc(y_h_te, y_true_te)
        acc_m = get_acc(np.argmax(model_probs_te, axis=1), y_true_te)

        _acc_data = [acc_h, acc_m]
        _cal_data = []
        for calibration_method in calibration_methods:
            # 'confusion' has its own combiner; every other name is handled by
            # OracleCombiner as a calibration method.
            if calibration_method == 'confusion':
                combiner = DoubleConfusionCombiner()
            else:
                combiner = OracleCombiner(calibration_method=calibration_method)
            combiner.fit(model_probs_tr, y_h_tr, y_true_tr)

            y_comb_te = combiner.combine(model_probs_te, y_h_te)
            _acc_data.append(get_acc(y_comb_te, y_true_te))

            model_probs_calibrated_te = combiner.calibrate(model_probs_te)
            y_comb_prob_te = combiner.combine_proba(model_probs_te, y_h_te)
            for metric, fxn in calibration_metrics.items():
                cal_m = fxn(model_probs_calibrated_te, y_true_te)
                cal_comb = fxn(y_comb_prob_te, y_true_te)
                _cal_data.append([calibration_method, metric, cal_m, cal_comb])

        acc_data.append(_acc_data)
        cal_data.extend(_cal_data)

    # Save data to CSV
    header_acc = ['human', 'model'] + [f'comb {cal_m}' for cal_m in calibration_methods]
    with open(output_file_acc, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header_acc)
        writer.writerows(acc_data)
    header_cal = ['calibration method', 'metric', 'model', 'comb']
    with open(output_file_calibration, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header_cal)
        writer.writerows(cal_data)


def run_experiment_cifar10(out_fpath=None, experiment_args=None, seed=0):
    """Run the combination experiment once per CIFAR-10H model with a 3-human panel.

    For each model name this loads 8000 points with `load_CIFAR10H`, derives one
    human label per point via `simulate_multi_humans(..., n_humans=3)`, and
    forwards everything to `_run_experiment`, which writes
    `<model>_accuracy_3.csv` and `<model>_calibration_3.csv` into `out_fpath`.

    Args:
        out_fpath: Output directory. Created if missing; a trailing slash is optional.
        experiment_args: Keyword arguments forwarded to `_run_experiment`.
            The mapping is copied, so the caller's dict is left untouched.
        seed: Seed handed to `simulate_multi_humans`.

    Raises:
        AssertionError: If one of the output files already exists (guards
            against silently overwriting earlier results).
    """
    # model_names = ['r_low_acc', 'resnet-110', 'preresnet-110', 'densenet-bc-L190-k40']
    model_names = ['resnet-110', 'densenet-bc-L190-k40']
    if out_fpath:
        # The CSV writer cannot create missing parent directories itself.
        os.makedirs(out_fpath, exist_ok=True)
    for model_name in tqdm(model_names, desc='Models', leave=True):
        # Specify output files
        output_file_acc = os.path.join(out_fpath, f'{model_name}_accuracy_3.csv')
        output_file_calibration = os.path.join(out_fpath, f'{model_name}_calibration_3.csv')
        assert not os.path.exists(output_file_acc), 'Output filepath already exists'
        assert not os.path.exists(output_file_calibration), 'Output filepath already exists'

        # Work on a per-model copy so the caller's arguments are never mutated.
        run_args = dict(experiment_args or {})
        run_args['output_file_acc'] = output_file_acc
        run_args['output_file_calibration'] = output_file_calibration

        # Load data
        human_counts, model_probs, y_true = load_CIFAR10H(model_name, num_points = 8000)
        y_h = simulate_multi_humans(human_counts, seed=seed, n_humans = 3)

        _run_experiment(y_h=y_h, model_probs=model_probs, y_true=y_true, **run_args)


if __name__ == '__main__':
    # Seed torch and NumPy's global RNG before anything stochastic runs.
    seed = 9658
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Names of the calibration/combination methods, forwarded to `_run_experiment` through `args`.
    calibration_methods = ['none', 'confusion', 'temperature scaling', 'ensemble temperature scaling', 'imax binning']
    # NOTE: the triple-quoted block below is a bare string expression holding an
    # earlier, longer metric dictionary; it is never assigned and has no effect.
    """
    calibration_metrics = {'ECE width': lambda probs, y: get_ECE(probs, y, mode='width'),
                           'ECE mass': lambda probs, y: get_ECE(probs, y, mode='mass'),
                           'cwECE thresh width': lambda probs, y: get_cw_ECE(probs, y, mode='width'),
                           'cwECE thresh mass': lambda probs, y: get_cw_ECE(probs, y, mode='mass'),
                           'cwECE nothresh width': lambda probs, y: get_cw_ECE(probs, y, mode='width',
                                                                               threshold_mode=None),
                           'cwECE nothresh mass': lambda probs, y: get_cw_ECE(probs, y, mode='mass',
                                                                              threshold_mode=None),
                           'kumar MCE': get_MCE,
                           'kumar MCE (bin)': lambda probs, y: cal.get_binning_ce(probs, y,
                                                                                  p=1, debias=False, mode='marginal'),
                           'kumar MCE (scale)': lambda probs, y: cal.lower_bound_scaling_ce(probs, y,
                                                                                            p=1, debias=False,
                                                                                            mode='marginal'),
                           'kumar ECE': cal.get_ece}
    """
    # Metric label -> callable(probs, y). In the labels, W/M correspond to
    # mode='width'/'mass' and NT to threshold_mode=None (T keeps the default).
    calibration_metrics = {'ECE (W)': lambda probs, y: get_ECE(probs, y, mode='width'),
                           'ECE (M)': lambda probs, y: get_ECE(probs, y, mode='mass'),
                           'cwECE (WT)': lambda probs, y: get_cw_ECE(probs, y, mode='width'),
                           'cwECE (MT)': lambda probs, y: get_cw_ECE(probs, y, mode='mass'),
                           'cwECE (WNT)': lambda probs, y: get_cw_ECE(probs, y, mode='width', threshold_mode=None),
                           'cwECE (MNT)': lambda probs, y: get_cw_ECE(probs, y, mode='mass', threshold_mode=None),
                           'NLL': get_NLL}

    # Keyword arguments that `run_experiment_cifar10` forwards to `_run_experiment`.
    args = {'n_runs': 25,
            'test_size': 0.3,
            'calibration_methods': calibration_methods,
            'calibration_metrics': calibration_metrics,
            'seed': seed
            }

    # Directory prefix for the per-model CSV files (note the trailing slash).
    out_fpath = './output/cifar10h/final/fully_sup_CI/'
    run_experiment_cifar10(out_fpath=out_fpath, experiment_args=args, seed=seed)


Models:   0%|          | 0/2 [00:00<?, ?it/s]

Runs:   0%|          | 0/25 [00:00<?, ?it/s]

Runs:   0%|          | 0/25 [00:00<?, ?it/s]

In [266]:
resnet_acc3 = pd.read_csv("output/cifar10h/final/fully_sup_CI/resnet-110_accuracy_3.csv")
resnet_calib3 = pd.read_csv("output/cifar10h/final/fully_sup_CI/resnet-110_calibration_3.csv")

In [267]:
resnet_a3 = resnet_acc3.copy()

In [268]:
# Convert accuracies (fractions) into error rates in percent.
# Derived from the untouched `resnet_acc3` so that re-running this cell always
# yields the same values instead of transforming already-transformed data.
resnet_a3 = 100 - 100 * resnet_acc3

In [269]:
col_names = ['model_name'] + ['n_humans'] + list(resnet_a1.columns)
results_new = pd.DataFrame(columns = col_names)

In [270]:
results_new

Unnamed: 0,model_name,n_humans,human,model,comb none,comb confusion,comb temperature scaling,comb ensemble temperature scaling,comb imax binning


In [271]:
densenet_acc3 = pd.read_csv("output/cifar10h/final/fully_sup_CI/densenet-bc-L190-k40_accuracy_3.csv")
densenet_calib3 = pd.read_csv("output/cifar10h/final/fully_sup_CI/densenet-bc-L190-k40_calibration_3.csv")

In [272]:
densenet_a3 = densenet_acc3.copy()

In [273]:
# Convert accuracies (fractions) into error rates in percent.
# Derived from the untouched `densenet_acc3` so that re-running this cell always
# yields the same values instead of transforming already-transformed data.
densenet_a3 = 100 - 100 * densenet_acc3

In [274]:
# `DataFrame.append` was removed in pandas 2.0, so all summary rows are built
# first and attached with a single `pd.concat`.
def _mean_std_cells(error_frame):
    """Return one 'mean (+-) std' string per column of `error_frame`."""
    return [str(np.mean(error_frame[col])) + ' (+-) ' + str(np.std(error_frame[col]))
            for col in error_frame.columns]

# One row per (model, number of simulated humans).
summary_rows = [
    ['ResNet-110', 1] + _mean_std_cells(resnet_a1),
    ['DenseNet-BC', 1] + _mean_std_cells(densenet_a1),
    ['ResNet-110', 3] + _mean_std_cells(resnet_a3),
    ['DenseNet-BC', 3] + _mean_std_cells(densenet_a3),
]
results_new = pd.concat([results_new, pd.DataFrame(summary_rows, columns=col_names)], ignore_index=True)

In [275]:
results_new

Unnamed: 0,model_name,n_humans,human,model,comb none,comb confusion,comb temperature scaling,comb ensemble temperature scaling,comb imax binning
0,ResNet-110,1,4.454999999999996 (+-) 0.3233762858068306,6.1833333333333345 (+-) 0.31950482521134765,2.963333333333334 (+-) 0.28487814158961794,4.646666666666671 (+-) 0.3415894221625338,2.6533333333333333 (+-) 0.25487469688282394,2.6433333333333326 (+-) 0.2645961282995499,2.6916666666666607 (+-) 0.24692553983381887
1,DenseNet-BC,1,4.454999999999996 (+-) 0.3233762858068306,3.361666666666666 (+-) 0.3248332905763582,2.341666666666671 (+-) 0.23363076281460357,3.6466666666666674 (+-) 0.3706450833044697,2.0833333333333344 (+-) 0.21245914639970206,2.1549999999999985 (+-) 0.21779194965226267,2.0716666666666668 (+-) 0.22746184051151735
2,ResNet-110,3,1.8949999999999967 (+-) 0.21019831905449168,6.1833333333333345 (+-) 0.31950482521134765,2.008333333333328 (+-) 0.19685019685029156,2.029999999999997 (+-) 0.17261067303166386,1.4099999999999988 (+-) 0.1778263822446595,1.4133333333333313 (+-) 0.17115619896587098,1.4750000000000016 (+-) 0.1802775637732009
3,DenseNet-BC,3,1.8949999999999967 (+-) 0.21019831905449168,3.361666666666666 (+-) 0.3248332905763582,1.7366666666666664 (+-) 0.21496769783181768,2.0883333333333325 (+-) 0.2949387883763159,1.4666666666666612 (+-) 0.16329931618554894,1.433333333333335 (+-) 0.17834112132527305,1.398333333333332 (+-) 0.1826046123306921


### Observations 2.1:
* It is clear from the above dataframe that, on **increasing the number of humans, the mean error and standard deviation of the combination model decrease** for both models.
* **However**, note that in the current setup this improvement is upper-bounded by the case where we take the majority vote over all approximately 50 human labels, instead of voting over a subset of them.

In [316]:
# Running Calibration Combo Experiment for 7143 data points, meaning 5000 training data points
# import sys
# sys.path.insert(0, '../')
from data_utils import *
from utils import *
from combination_methods import *
from tqdm.auto import tqdm
import torch
from sklearn.model_selection import train_test_split
from metrics import *
import csv
import numpy as np
import os
from calibrators import *

# Generates the data for Table 2 (and Appendix D) in our paper.

def load_CIFAR10H(model_name, num_points = 10000):
    """Read human vote counts, model probabilities and true labels from a CIFAR-10H CSV.

    Only the first `num_points` rows of the file are returned. Two column
    layouts are handled:

    * 'r_low_acc': columns 0-9 are human counts, 10-19 model probabilities and
      the last column the (1-based) true label, which is shifted to 0-based.
    * any other model: column 0 is the true label, 1-10 human counts and the
      remaining columns model probabilities.

    Returns:
        Tuple `(human_counts, model_probs, true_labels)`; labels are cast to int.
    """
    # base_dir = os.path.dirname(__file__)
    base_dir = '..'
    low_acc_layout = model_name == 'r_low_acc'
    if low_acc_layout:
        csv_name = 'data/cifar10h/human_model_truth_cifar10h.csv'
    else:
        csv_name = f'data/cifar10h/{model_name}.csv'

    table = np.genfromtxt(os.path.join(base_dir, csv_name), delimiter=',')
    rows = table[:num_points]

    if low_acc_layout:
        human_counts, model_probs = rows[:, :10], rows[:, 10:20]
        true_labels = rows[:, -1]
        # Labels in this file run 1-10; shift them so they are zero-indexed.
        true_labels -= 1
    else:
        true_labels = rows[:, 0]
        human_counts, model_probs = rows[:, 1:11], rows[:, 11:]

    return human_counts, model_probs, true_labels.astype(int)

def simulate_multi_humans(human_counts, seed = 0, n_humans = 1):
    """Simulate the majority decision of a panel of `n_humans` labelers per row.

    Every row of `human_counts` holds per-class vote counts. The row is expanded
    into one label per vote and shuffled; the first `n_humans` labels form the
    panel. The panel's most frequent label is returned, ties being broken
    uniformly at random.

    Args:
        human_counts: 2-D array (rows = inputs, columns = classes) of vote counts.
        seed: Seed for `np.random.default_rng`.
        n_humans: Panel size; capped at the smallest total vote count of any row.

    Returns:
        1-D int array with one label per row (empty if there are no rows).

    Raises:
        ValueError: If the effective panel size is below 1, i.e. `n_humans < 1`
            or some row has no votes at all.
    """
    rng = np.random.default_rng(seed)
    n_rows, n_classes = human_counts.shape[0], human_counts.shape[1]
    if n_rows == 0:
        return np.zeros(0, dtype=int)

    min_human_labels = int(min(np.sum(human_counts, axis=1)))
    panel_size = min(n_humans, min_human_labels)
    if panel_size < 1:
        # Previously surfaced as an opaque "max() arg is an empty sequence".
        raise ValueError(
            f'need at least one human label per input (n_humans={n_humans}, '
            f'smallest number of labels in a row={min_human_labels})')

    final_decision = np.zeros(n_rows, dtype=int)
    for row in range(n_rows):
        # One entry per recorded vote, then shuffled to draw a random panel.
        votes = []
        for label in range(n_classes):
            votes += [label] * int(human_counts[row, label])
        rng.shuffle(votes)

        # Tally the panel; dict insertion order keeps tie candidates in draw order.
        tally = {}
        for label in votes[:panel_size]:
            tally[label] = tally.get(label, 0) + 1
        top_count = max(tally.values())
        ties = [label for label, count in tally.items() if count == top_count]
        rng.shuffle(ties)
        final_decision[row] = ties[0]
    return final_decision

def _run_experiment(y_h=None, model_probs=None, y_true=None, **kwargs):
    """Evaluate MAP combiners and a post-hoc calibrated combination; save CSVs.

    For every train/test split two `MAPOracleCombiner`s are fit: 'MAP_CI' as
    fitted, and 'uncal_MAP_CI' whose calibrator temperature is reset to 1 after
    fitting. The combined probabilities are additionally passed through a
    `TSCalibratorMAP` fit on the training-set combination.

    Args:
        y_h: Human labels, one per data point.
        model_probs: Model class probabilities, one row per data point.
        y_true: Ground-truth labels.
        **kwargs: Optional settings, each with a default:
            seed (0), n_runs (25), test_size (0.3),
            calibration_metrics ({'ECE': get_ECE}; name -> callable(probs, y)),
            output_file_acc ('./acc.csv'),
            output_file_calibration ('./cal.csv').
            `calibration_methods` is accepted for compatibility but ignored:
            the combiners evaluated here are fixed.

    Side effects:
        Writes the accuracy CSV (columns: human, model, 'comb MAP_CI',
        'comb uncal_MAP_CI'; one row per run) and the calibration CSV (columns:
        calibration method, metric, model, comb, comb (post cal); one row per
        run/combiner/metric).
    """
    seed = kwargs.pop('seed', 0)
    n_runs = kwargs.pop('n_runs', 25)
    test_size = kwargs.pop('test_size', 0.3)
    kwargs.pop('calibration_methods', None)  # unused here, see docstring
    calibration_metrics = kwargs.pop('calibration_metrics', {'ECE': get_ECE})
    output_file_acc = kwargs.pop('output_file_acc', './acc.csv')
    output_file_calibration = kwargs.pop('output_file_calibration', './cal.csv')

    # Prior hyperparameters shared by both combiners.
    DIAG_ACC = 0.75
    MU_BETA = 0.5
    SIGMA_BETA = 0.5
    combiner_names = ('MAP_CI', 'uncal_MAP_CI')

    acc_data = []
    cal_data = []
    for i in tqdm(range(n_runs), leave=False, desc='Runs'):
        # `i * seed` is 0 for every run when seed == 0, which would repeat one
        # identical split n_runs times; fall back to the run index in that case.
        split_seed = i * seed if seed else i

        # Train/test split
        y_h_tr, y_h_te, model_probs_tr, model_probs_te, y_true_tr, y_true_te = train_test_split(
            y_h, model_probs, y_true, test_size=test_size, random_state=split_seed)

        # # Limit to 5k datapoints
        # y_h_tr = y_h_tr[:5000]
        # model_probs_tr = model_probs_tr[:5000, :]
        # y_true_tr = y_true_tr[:5000]

        acc_h = get_acc(y_h_te, y_true_te)
        acc_m = get_acc(np.argmax(model_probs_te, axis=1), y_true_te)

        _acc_data = [acc_h, acc_m]
        _cal_data = []
        # Fresh, unfitted combiners for every run.
        combiners = {name: MAPOracleCombiner(diag_acc=DIAG_ACC, mu_beta=MU_BETA, sigma_beta=SIGMA_BETA)
                     for name in combiner_names}
        for combiner_name, combiner in combiners.items():
            combiner.fit(model_probs_tr, y_h_tr, y_true_tr)
            if combiner_name == 'uncal_MAP_CI':
                combiner.calibrator.temperature = 1  # pretty hacky way to get uncalibrated temps.. but w/e

            y_comb_te = combiner.combine(model_probs_te, y_h_te)
            _acc_data.append(get_acc(y_comb_te, y_true_te))

            model_probs_calibrated_te = combiner.calibrate(model_probs_te)
            y_comb_prob_te = combiner.combine_proba(model_probs_te, y_h_te)

            # ----- Calibrate combination
            ts_calibrator = TSCalibratorMAP()
            comb_probs_tr = combiner.combine_proba(model_probs_tr, y_h_tr)
            comb_logits_tr = np.log(np.clip(comb_probs_tr, 1e-50, 1))
            ts_calibrator.fit(comb_logits_tr, y_true_tr)
            # NOTE(review): fit receives log-probabilities while calibrate receives
            # probabilities - confirm this matches TSCalibratorMAP's expectations.
            y_comb_prob_te_calibrated = ts_calibrator.calibrate(y_comb_prob_te)

            for metric, fxn in calibration_metrics.items():
                cal_m = fxn(model_probs_calibrated_te, y_true_te)
                cal_comb = fxn(y_comb_prob_te, y_true_te)
                cal_comb_calibrated = fxn(y_comb_prob_te_calibrated, y_true_te)
                _cal_data.append([combiner_name, metric, cal_m, cal_comb, cal_comb_calibrated])

        # Accumulate once per run, after all combiners are done. Doing this
        # inside the combiner loop wrote every run's rows more than once.
        acc_data.append(_acc_data)
        cal_data.extend(_cal_data)

    # Save data to CSV
    # Columns are labelled by the combiners actually evaluated.
    header_acc = ['human', 'model'] + [f'comb {name}' for name in combiner_names]
    with open(output_file_acc, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header_acc)
        writer.writerows(acc_data)
    header_cal = ['calibration method', 'metric', 'model', 'comb', 'comb (post cal)']
    with open(output_file_calibration, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header_cal)
        writer.writerows(cal_data)


def run_experiment_cifar10(out_fpath=None, experiment_args=None, seed=0):
    """Run the calibrated-combination experiment once per CIFAR-10H model (3-human panel).

    For each model name this loads 7143 points with `load_CIFAR10H`, derives one
    human label per point via `simulate_multi_humans(..., n_humans=3)`, and
    forwards everything to `_run_experiment`, which writes
    `<model>_accuracy_5000_3.csv` and `<model>_calibration_5000_3.csv` into
    `out_fpath`.

    Args:
        out_fpath: Output directory. Created if missing; a trailing slash is optional.
        experiment_args: Keyword arguments forwarded to `_run_experiment`.
            The mapping is copied, so the caller's dict is left untouched.
        seed: Seed handed to `simulate_multi_humans`.

    Raises:
        AssertionError: If one of the output files already exists (guards
            against silently overwriting earlier results).
    """
    model_names = [ 'resnet-110', 'densenet-bc-L190-k40']
    if out_fpath:
        # The CSV writer cannot create missing parent directories itself.
        os.makedirs(out_fpath, exist_ok=True)
    for model_name in tqdm(model_names, desc='Models', leave=True):
        # Specify output files
        output_file_acc = os.path.join(out_fpath, f'{model_name}_accuracy_5000_3.csv')
        output_file_calibration = os.path.join(out_fpath, f'{model_name}_calibration_5000_3.csv')
        assert not os.path.exists(output_file_acc), 'Output filepath already exists'
        assert not os.path.exists(output_file_calibration), 'Output filepath already exists'

        # Work on a per-model copy so the caller's arguments are never mutated.
        run_args = dict(experiment_args or {})
        run_args['output_file_acc'] = output_file_acc
        run_args['output_file_calibration'] = output_file_calibration

        # Load data
        human_counts, model_probs, y_true = load_CIFAR10H(model_name, num_points = 7143)
        y_h = simulate_multi_humans(human_counts, seed=seed, n_humans = 3)

        _run_experiment(y_h=y_h, model_probs=model_probs, y_true=y_true, **run_args)

if __name__ == '__main__':
    # Seed torch and NumPy's global RNG before anything stochastic runs.
    seed = 9658
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Method names forwarded to `_run_experiment` through `args`.
    calibration_methods = ['none', 'confusion']
    # NOTE: the triple-quoted block below is a bare string expression holding an
    # earlier, longer metric dictionary; it is never assigned and has no effect.
    """
    calibration_metrics = {'ECE width': lambda probs, y: get_ECE(probs, y, mode='width'),
                           'ECE mass': lambda probs, y: get_ECE(probs, y, mode='mass'),
                           'cwECE thresh width': lambda probs, y: get_cw_ECE(probs, y, mode='width'),
                           'cwECE thresh mass': lambda probs, y: get_cw_ECE(probs, y, mode='mass'),
                           'cwECE nothresh width': lambda probs, y: get_cw_ECE(probs, y, mode='width',
                                                                               threshold_mode=None),
                           'cwECE nothresh mass': lambda probs, y: get_cw_ECE(probs, y, mode='mass',
                                                                              threshold_mode=None),
                           'kumar MCE': get_MCE,
                           'kumar MCE (bin)': lambda probs, y: cal.get_binning_ce(probs, y,
                                                                                  p=1, debias=False, mode='marginal'),
                           'kumar MCE (scale)': lambda probs, y: cal.lower_bound_scaling_ce(probs, y,
                                                                                            p=1, debias=False,
                                                                                            mode='marginal'),
                           'kumar ECE': cal.get_ece}
    """
    # Metric label -> callable(probs, y). In the labels, W/M correspond to
    # mode='width'/'mass' and NT to threshold_mode=None (T keeps the default).
    calibration_metrics = {'ECE (W)': lambda probs, y: get_ECE(probs, y, mode='width'),
                           'ECE (M)': lambda probs, y: get_ECE(probs, y, mode='mass'),
                           'cwECE (WT)': lambda probs, y: get_cw_ECE(probs, y, mode='width'),
                           'cwECE (MT)': lambda probs, y: get_cw_ECE(probs, y, mode='mass'),
                           'cwECE (WNT)': lambda probs, y: get_cw_ECE(probs, y, mode='width', threshold_mode=None),
                           'cwECE (MNT)': lambda probs, y: get_cw_ECE(probs, y, mode='mass', threshold_mode=None),
                           'NLL': get_NLL}

    # Keyword arguments that `run_experiment_cifar10` forwards to `_run_experiment`.
    args = {'n_runs': 25,
            'test_size': 0.3,
            'calibration_methods': calibration_methods,
            'calibration_metrics': calibration_metrics,
            'seed': seed
            }

    # Directory prefix for the per-model CSV files (note the trailing slash).
    out_fpath = './output/cifar10h/final/calibrate_comb_MAP/'
    run_experiment_cifar10(out_fpath=out_fpath, experiment_args=args, seed=seed)


Models:   0%|          | 0/2 [00:00<?, ?it/s]

Runs:   0%|          | 0/25 [00:00<?, ?it/s]

Runs:   0%|          | 0/25 [00:00<?, ?it/s]

In [319]:
!pwd

/content/conf_matrix_and_calibration/experiments


In [320]:
resnet_combo_50003 = pd.read_csv('./output/cifar10h/final/calibrate_comb_MAP/resnet-110_calibration_5000_3.csv')

In [322]:
# Mean and std over runs of each calibration metric for the 'MAP_CI' combiner:
# calibrated model, combination, and post-hoc calibrated combination.
for metric in metric_list:
    metric_rows = resnet_combo_50003['metric'] == metric
    map_ci_rows = resnet_combo_50003['calibration method'] == 'MAP_CI'
    temp = resnet_combo_50003[metric_rows & map_ci_rows]
    print(f'for metric: {metric}:')
    for value_column in ('model', 'comb', 'comb (post cal)'):
        print(np.mean(temp[value_column]), np.std(temp[value_column]))

for metric: ECE (W):
0.023479307405035135 0.003323274150384892
0.011244238751059179 0.0014449168824146458
0.0085845104181011 0.0015438839585518377
for metric: ECE (M):
0.019278404403619304 0.00385864418198119
0.009533213785761056 0.002009045996287333
0.006489207126065225 0.0016175013853173136
for metric: cwECE (WT):
0.0490507852613085 0.0031367310566982553
0.021354670454105332 0.0027131222583648964
0.021391060084175177 0.0027337308152254208
for metric: cwECE (MT):
0.020149754355828407 0.0025845763166015786
0.009420907849394538 0.0021232882166128423
0.008452570859240682 0.002036792565502771
for metric: cwECE (WNT):
0.007505674287959638 0.0005426299378158303
0.003035141647504404 0.00036974781489346214
0.0028979910279063647 0.0003554410190067213
for metric: cwECE (MNT):
0.0033861665572550255 0.0005645050719641251
0.00107953043335808 0.0002823698005617442
0.0010936252282279719 0.0002924223045673331
for metric: NLL:
0.21033667698589775 0.011996516712305796
0.07787300124799726 0.012147483988

In [317]:
# Mean and std over runs of each calibration metric for the 'MAP_CI' combiner:
# calibrated model, combination, and post-hoc calibrated combination.
for metric in metric_list:
    metric_rows = resnet_110_calib_5000['metric'] == metric
    map_ci_rows = resnet_110_calib_5000['calibration method'] == 'MAP_CI'
    temp = resnet_110_calib_5000[metric_rows & map_ci_rows]
    print(f'for metric: {metric}:')
    for value_column in ('model', 'comb', 'comb (post cal)'):
        print(np.mean(temp[value_column]), np.std(temp[value_column]))

for metric: ECE (W):
0.023479307405035135 0.003323274150384892
0.014402669163155824 0.0022186070298601352
0.010253647237662922 0.001892225814813155
for metric: ECE (M):
0.019278404403619304 0.00385864418198119
0.01311660098231819 0.002022602978562105
0.00801715734647348 0.0020518758168307292
for metric: cwECE (WT):
0.0490507852613085 0.0031367310566982553
0.029718415010940948 0.0025818373139331685
0.02925931548597067 0.0021728506404883894
for metric: cwECE (MT):
0.020149754355828407 0.0025845763166015786
0.012246509015368867 0.0021685907509438854
0.011057281720049167 0.002621046919650429
for metric: cwECE (WNT):
0.007505674287959638 0.0005426299378158303
0.004209919080123847 0.0003736351575321343
0.003882420858063414 0.0003119711376672349
for metric: cwECE (MNT):
0.0033861665572550255 0.0005645050719641251
0.001518527817454947 0.00035871435434804233
0.0015214601931633881 0.0003702068567895458
for metric: NLL:
0.21033667698589775 0.011996516712305796
0.09965567736440867 0.01048364209770

## Some more Observations 2.2:
* Regarding calibration: as n_humans increases, the calibration of the combination improves (the calibration errors decrease).
* The variance decreases as well.

## Use cases:
* Such multiple-human + AI teams may be useful in criminal courts when there are many witnesses, and in predictive policing.

## Question 3): Neural Network for Calibrated Probabilities:
* 

In [58]:
import tensorflow as tf
import keras

In [255]:
# Reset Keras' global state so that re-running this cell starts from a fresh model.
tf.keras.backend.clear_session()

# Small fully connected classifier: 11 input features -> 8 (sigmoid) -> 16 (relu)
# -> 10-way softmax. The 11 inputs are presumably the human's predicted label
# plus the 10 model class probabilities (see the fit cell) -- TODO confirm.
model = tf.keras.Sequential([
    # `(11,)` is a real one-element tuple; `(11)` is just the int 11.
    tf.keras.layers.InputLayer(input_shape=(11,)),
    tf.keras.layers.Dense(8, activation='sigmoid'),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(10, activation='softmax'),
])

In [256]:
# Print the layer-by-layer architecture and parameter counts of `model`.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 8)                 96        
                                                                 
 dense_1 (Dense)             (None, 16)                144       
                                                                 
 dense_2 (Dense)             (None, 10)                170       
                                                                 
Total params: 410
Trainable params: 410
Non-trainable params: 0
_________________________________________________________________


In [257]:
# Load 10000 CIFAR-10H points for ResNet-110: human labels/votes, model
# probabilities and ground-truth labels (exact layout is defined by
# load_CIFAR10H, which lives outside this section).
y_h, model_probs, y_true = load_CIFAR10H(
    'resnet-110',
    num_points=10000,
)

In [258]:
# One predicted human label per data point; sized from the loaded data
# instead of repeating the dataset size as a literal.
human_pred = np.zeros(len(y_h))

In [259]:
# Largest entry in each row of y_h (the winning vote value per data point).
max_vote = y_h.max(axis=1)

In [260]:
# Human prediction = index of the largest entry in each row of y_h.
# np.argmax returns the FIRST index of the maximum, which is exactly what
# np.where(row == row.max())[0][0] selected, so ties are broken identically.
# Vectorised: no Python-level loop and no hard-coded row count.
# Cast to float to keep the dtype the array had when created with np.zeros.
human_pred = np.argmax(y_h, axis=1).astype(float)

In [261]:
   # Hold out 30% of the points for testing; random_state=0 makes the split
   # repeatable. All three arrays are split with the same row assignment.
   (y_h_tr, y_h_te,
    model_probs_tr, model_probs_te,
    y_true_tr, y_true_te) = train_test_split(
       human_pred, model_probs, y_true,
       test_size=0.3, random_state=0,
   )

In [262]:
# Integer class labels -> sparse categorical cross-entropy.
# No optimizer is given, so Keras falls back to its compile() default ('rmsprop').
model.compile(loss='sparse_categorical_crossentropy', metrics=['accuracy'])
EPOCHS = 25

# Training features: the human's predicted label as a single column, followed
# by the model's class probabilities. reshape(-1, 1) infers the row count, so
# the cell no longer breaks if the split ratio or dataset size changes.
history = model.fit(
    np.concatenate((np.reshape(y_h_tr, (-1, 1)), model_probs_tr), axis=1),
    y_true_tr,
    batch_size=8,
    epochs=EPOCHS,
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [263]:
# Evaluate on the held-out split in a single batch. Features are built the same
# way as for training (human label column + model probabilities); reshape(-1, 1)
# infers the row count instead of hard-coding the test-set size.
loss, accuracy = model.evaluate(
    x=np.concatenate((np.reshape(y_h_te, (-1, 1)), model_probs_te), axis=1),
    y=y_true_te,
    batch_size=len(y_true_te),
)



In [295]:
# Test error in percent, taken from the accuracy returned by model.evaluate(...)
# rather than a hand-copied, rounded literal (previously 0.9687, presumably the
# value printed by that evaluation run -- verify).
error = (1 - accuracy) * 100

In [315]:
# Display the neural network's test error (in percent).
error

3.1299999999999994

In [314]:
# Display the `results` table (defined outside this section) so its error
# rates can be compared with `error`.
results

Unnamed: 0,model_name,human,model,comb none,comb confusion,comb temperature scaling,comb ensemble temperature scaling,comb imax binning
0,ResNet-110,4.454999999999996 (+-) 0.3233762858068306,6.1833333333333345 (+-) 0.31950482521134765,2.963333333333334 (+-) 0.28487814158961794,4.646666666666671 (+-) 0.3415894221625338,2.6533333333333333 (+-) 0.25487469688282394,2.6433333333333326 (+-) 0.2645961282995499,2.6916666666666607 (+-) 0.24692553983381887
1,DenseNet-BC,4.454999999999996 (+-) 0.3233762858068306,3.361666666666666 (+-) 0.3248332905763582,2.341666666666671 (+-) 0.23363076281460357,3.6466666666666674 (+-) 0.3706450833044697,2.0833333333333344 (+-) 0.21245914639970206,2.1549999999999985 (+-) 0.21779194965226267,2.0716666666666668 (+-) 0.22746184051151735


## Comparing the neural-network calibration with the results of Part 1, we observe:
* The neural-network calibration is better than the model alone.
* It is better than the combination using confusion matrices.
* Its accuracy is inferior to that of the other combination methods.