# Intro
This notebook extracts and saves binned data derived from the raw data outputs. Because the output formats of the MNIST and CIFAR-10 code were different, a separate implementation has been made for each.

In [7]:
import numpy as np
import pandas as pd
from scipy.special import softmax
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import os
import six

In [8]:
# Helper function created by Bo: 
# https://github.com/lyn1874/region_based_active_learning/blob/main/eval_calibration/calibration_lib.py
def bin_predictions_and_accuracies(probabilities, ground_truth, bins=10):
    """
    Author: Bo Li, 2020
    A helper function which histograms a vector of probabilities into bins.

    Bins are right-closed, i.e. bin ``i`` holds the probabilities ``p`` with
    ``bin_edges[i] < p <= bin_edges[i + 1]``. Both ``accuracies`` and
    ``counts`` are derived from the same bin assignment, so ``counts[i]`` is
    always the number of samples averaged into ``accuracies[i]``.

    Args:
      probabilities: A numpy vector of N probabilities assigned to each prediction
      ground_truth: A numpy vector of N ground truth labels in {0,1}
      bins: Number of equal width bins to bin predictions into in [0, 1], or an
        array representing bin edges.

    Returns:
      bin_edges: Numpy vector of floats containing the edges of the bins
        (including leftmost and rightmost).
      accuracies: Numpy vector of floats for the average accuracy of the
        predictions in each bin (NaN for bins that received no samples).
      counts: Numpy vector of ints containing the number of examples per bin.

    Raises:
      ValueError: If the inputs fail validation (probabilities outside [0, 1],
        wrong rank or too short, mismatching lengths, non-binary labels).
    """
    _validate_probabilities(probabilities)
    _check_rank_nonempty(rank=1,
                         probabilities=probabilities,
                         ground_truth=ground_truth)

    if len(probabilities) != len(ground_truth):
        raise ValueError(
            'Probabilies and ground truth must have the same number of elements.')

    # Vectorised label check; boolean labels compare equal to 0/1 as well.
    if not np.all((ground_truth == 0) | (ground_truth == 1)):
        raise ValueError(
            'Ground truth must contain binary labels {0,1} or {False, True}.')

    # Ensure probabilities are never 0, since the bins in np.digitize are open on
    # one side.
    probabilities = np.where(probabilities == 0, 1e-8, probabilities)

    # Derive the bin count from the edges so that Python ints, numpy ints and
    # explicit edge sequences are all handled the same way.
    bin_edges = np.histogram_bin_edges(probabilities, bins=bins, range=(0., 1.))
    num_bins = len(bin_edges) - 1

    # indices == i  <=>  bin_edges[i - 1] < p <= bin_edges[i]; index 0 and
    # index num_bins + 1 collect values outside the edges and are dropped.
    indices = np.digitize(probabilities, bin_edges, right=True)
    slots = num_bins + 2
    counts = np.bincount(indices, minlength=slots)[1:num_bins + 1]
    hits = np.bincount(indices,
                       weights=np.asarray(ground_truth, dtype=float),
                       minlength=slots)[1:num_bins + 1]

    # Mean accuracy per bin; empty bins stay NaN instead of dividing by zero.
    accuracies = np.full(num_bins, np.nan)
    np.divide(hits, counts, out=accuracies, where=counts > 0)
    return bin_edges, accuracies, counts

def _validate_probabilities(probabilities, multiclass=False):
    """Check that `probabilities` holds valid probability values.

    Args:
      probabilities: Array-like of probabilities.
      multiclass: If True, additionally require that the values sum to 1
        (within an absolute tolerance of 1e-5) along the last dimension.

    Raises:
      ValueError: If the input is empty, if any value lies outside [0, 1]
        (NaN counts as outside), or if `multiclass` is set and the last
        dimension does not sum to 1.
    """
    probs = np.asarray(probabilities)
    if probs.size == 0:
        raise ValueError('Probabilities must not be empty.')
    # Element-wise test instead of max/min: comparisons against NaN are always
    # False, so a max/min based check would let NaN slip through unnoticed.
    if not np.all((probs >= 0.) & (probs <= 1.)):
        raise ValueError('All probabilities must be in [0,1].')
    if multiclass and not np.allclose(1, np.sum(probs, axis=-1),
                                      atol=1e-5):
        raise ValueError(
            'Multiclass probabilities must sum to 1 along the last dimension.')

def _check_rank_nonempty(rank, **kwargs):
    """Check that every keyword argument is an array of the given rank with more than one entry.

    Args:
      rank: Required number of dimensions.
      **kwargs: Arrays to check; the keyword is used as the name in the
        error message.

    Raises:
      ValueError: If an array does not have `rank` dimensions or its first
        dimension has length <= 1.
    """
    for key, array in kwargs.items():
        arr = np.asarray(array)
        # Test the rank first: a 0-d array has no length, so asking for it
        # would fail with a TypeError instead of the intended ValueError.
        if arr.ndim != rank or arr.ndim == 0 or arr.shape[0] <= 1:
            raise ValueError(
                '%s must be a rank-%d array of length > 1; actual shape is %s.' %
                (key, rank, arr.shape))

# MNIST

In [5]:
# --- Dataset-specific settings ----------------------------------------------
directory = "./Data/MNIST-iid-5epochs/Version_00_iid/"
clients = 10
classes = 10
rounds = 91
n_val = 10240
nametag = "iid-MNIST_"

# --- Method settings ----------------------------------------------------------
bins = 40


# Result buffers: one entry (or one row of `bins` values) per round.
overall_accuracy = np.zeros(rounds)
bin_accuracies = np.zeros((rounds, bins))
bin_counts = np.zeros((rounds, bins))

for r in range(rounds):
    # Round folders carry a two-digit, zero-padded index: Round_00, Round_01, ...
    rdir = "%sRound_%02d/" % (directory, r)

    ground_truth = pd.read_csv(rdir + '_Ground_Truth_Indices.csv', header=0).to_numpy().squeeze()
    prediction = pd.read_csv(rdir + '_Index_Matrix.csv', header=0).to_numpy().squeeze()

    # Per-sample correctness, reused for the overall accuracy and the binning.
    is_correct = ground_truth == prediction
    overall_accuracy[r] = np.mean(is_correct)

    # Row-wise maximum of the server output is used as the confidence.
    # NOTE(review): presumably one row per sample holding class probabilities;
    # the binning helper rejects values outside [0, 1] -- confirm the file format.
    prop = pd.read_csv(rdir + '_server_.csv', header=0).to_numpy().max(axis=1)

    # Accuracy and sample count per confidence bin for this round.
    edges, bin_accuracies[r, :], bin_counts[r, :] = bin_predictions_and_accuracies(
        prop, is_correct, bins=bins)

# Persist the results; np.save appends ".npy" when the name lacks it.
for suffix, values in (('overall_accuracy', overall_accuracy),
                       ('accVunc.npy', bin_accuracies),
                       ('binCounts.npy', bin_counts)):
    np.save(nametag + suffix, values)

# CIFAR-10
Training accuracy was only written out for some of the CIFAR-10 experiments, so an option (`train_available`) is provided to extract the training accuracy in those cases.

In [13]:
# --- Dataset-specific settings ----------------------------------------------
directory = "./Data/Cifar_VGG11/"
clients = 10
classes = 10
rounds = 81
n_val = 10000
nametag = "VGG11_Cifar_"

# --- Method settings ----------------------------------------------------------
bins = 40
train_available = True

# Result buffers: one entry (or one row of `bins` values) per round.
overall_accuracy = np.zeros(rounds)
train_accuracy = np.zeros(rounds)
bin_accuracies = np.zeros((rounds, bins))
bin_counts = np.zeros((rounds, bins))

for r in range(rounds):
    # Folder names are "communication_round_0" followed by the round index
    # zero-padded to two digits (communication_round_000, ..._001, ...).
    rdir = directory + "communication_round_0" + str(r).zfill(2) + '/'

    # taf.csv: column 0 is read as the ground truth, column 1 as the prediction.
    taf = pd.read_csv(rdir + 'taf.csv', header=None).to_numpy()
    ground_truth = taf[:, 0].astype(int)
    prediction = taf[:, 1].astype(int)

    # Per-sample correctness on the test data, reused for the binning below.
    is_correct = ground_truth == prediction
    overall_accuracy[r] = np.mean(is_correct)

    if train_available:
        # taf_train.csv is read with the same column layout as taf.csv.
        taf_train = pd.read_csv(rdir + 'taf_train.csv', header=None).to_numpy()
        ttrue = taf_train[:, 0].astype(int)
        tpred = taf_train[:, 1].astype(int)
        train_accuracy[r] = np.mean(ttrue == tpred)

    # Softmax over each row of the server output, then the row-wise maximum
    # serves as the confidence of the prediction.
    prop = softmax(pd.read_csv(rdir + 'server_pred.csv', header=None).to_numpy(), axis=1).max(axis=1)

    # Accuracy and sample count per confidence bin for this round.
    edges, bin_accuracies[r, :], bin_counts[r, :] = bin_predictions_and_accuracies(
        prop, is_correct, bins=bins)

# Persist the results; np.save appends ".npy" when the name lacks it.
for suffix, values in (('overall_accuracy', overall_accuracy),
                       ('accVunc.npy', bin_accuracies),
                       ('binCounts.npy', bin_counts)):
    np.save(nametag + suffix, values)

if train_available:
    np.save(nametag + 'train_accuracy', train_accuracy)