# Analysis for Table C for ImageNet

In [1]:
import numpy as np
import pandas as pd

from tqdm import tqdm
import pickle
import random
import copy

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [3]:
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

### Orders for Plotting

In [4]:
ORDER_DICT = {'ResNet-50': 1,
              'kNN': 2,
              'EMD-NN':  3,
              'EMD-Corr': 4,
              'CHM-NN': 5,
              'CHM-Corr': 6}

##  Load summary fo AI Performance

In [5]:
random.seed(42)

with open('../data/Classification-Summary-ImageNet.pickle', 'rb') as f:
  classification_summary = pickle.load(f)

In [6]:
classification_summary.keys()

dict_keys(['all_gts', 'resnet_wnid', 'knn_wnid', 'emd_wnid', 'chm_wnid', 'resnet_conf', 'knn_conf', 'emd_conf', 'chm_conf', 'resnet_wnid_real', 'knn_wnid_real', 'emd_wnid_real', 'chm_wnid_real', 'resnet_real_conf', 'knn__real_conf', 'emd__real_conf', 'chm__real_conf'])

### Unpacking

In [7]:
real_gt_real = classification_summary['all_gts']

resnet_wnid_real = classification_summary['resnet_wnid_real']
knn_wnid_real    = classification_summary['knn_wnid_real']   
emd_wnid_real    = classification_summary['emd_wnid_real']   
chm_wnid_real    = classification_summary['chm_wnid_real']   

IsM1Correct_real = np.asarray(classification_summary['resnet_wnid_real'])
IsM2Correct_real = np.asarray(classification_summary['knn_wnid_real'])
IsM3Correct_real = np.asarray(classification_summary['emd_wnid_real'])
IsM4Correct_real = np.asarray(classification_summary['chm_wnid_real'])

M1Conf_real = np.asarray(classification_summary['resnet_real_conf'])
M2Conf_real = np.asarray(classification_summary['knn__real_conf'])
M3Conf_real = np.asarray(classification_summary['emd__real_conf'])
M4Conf_real = np.asarray(classification_summary['chm__real_conf'])

## AI Performance Analysis

In [8]:
MethodName = ['ResNet', 'KNN', 'EMD', 'CHM']
Methods    = [IsM1Correct_real, IsM2Correct_real, IsM3Correct_real, IsM4Correct_real]
Confs      = [M1Conf_real, M2Conf_real, M3Conf_real, M4Conf_real]

TAnalysis = {}
TCount = {}

for N, M, C in zip(MethodName, Methods, Confs):
  TAnalysis[N] = {}
  TCount[N] = {}
  
  for T in tqdm(np.arange(0., 1.0, 0.05)):
    TAnalysis[N][round(T, 2)] = np.mean(M[C>T])
    TCount[N][round(T, 2)]    = len(M[C>T])

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 2339.27it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 1989.19it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 1991.83it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 2035.33it

## AI PERFORMANCE 

In [9]:
df = pd.DataFrame(TAnalysis).T
df.head()

Unnamed: 0,0.00,0.05,0.10,0.15,0.20,0.25,0.30,0.35,0.40,0.45,0.50,0.55,0.60,0.65,0.70,0.75,0.80,0.85,0.90,0.95
ResNet,0.831442,0.831772,0.834234,0.839614,0.846505,0.85441,0.863547,0.873226,0.882908,0.893759,0.904679,0.915649,0.925908,0.935507,0.94503,0.953841,0.962131,0.969919,0.978228,0.986707
KNN,0.821448,0.821559,0.825217,0.83336,0.84363,0.856655,0.868481,0.881031,0.894353,0.906684,0.919062,0.930628,0.942384,0.952483,0.961545,0.969122,0.975746,0.980968,0.987206,0.991919
EMD,0.823901,0.823976,0.827845,0.836306,0.847205,0.858682,0.871195,0.883656,0.895869,0.908023,0.92126,0.934005,0.945312,0.954274,0.961638,0.969449,0.976223,0.981759,0.987267,0.991824
CHM,0.820502,0.820558,0.823378,0.830329,0.839564,0.850043,0.861659,0.874038,0.886519,0.899047,0.912743,0.92607,0.937167,0.947746,0.956517,0.96341,0.97125,0.977706,0.984702,0.989741


## Number of Images 

In [10]:
# Number of Images 
df2 = pd.DataFrame(TCount).T
df2.head()

Unnamed: 0,0.00,0.05,0.10,0.15,0.20,0.25,0.30,0.35,0.40,0.45,0.50,0.55,0.60,0.65,0.70,0.75,0.80,0.85,0.90,0.95
ResNet,44424,44404,44243,43863,43350,42709,41934,41081,40165,39081,37851,36455,35078,33740,32363,30893,29338,27492,25032,21064
KNN,44424,44418,44192,43651,42879,41857,40823,39649,38354,36982,35422,33760,32213,30684,29073,27463,25687,23697,20948,16334
EMD,44424,44420,44181,43618,42809,41863,40806,39598,38317,36955,35344,33654,32036,30464,28883,27233,25361,23244,20341,15655
CHM,44424,44421,44247,43773,43070,42232,41253,40139,38914,37562,35917,34235,32642,31079,29575,27986,26226,24132,21375,16864


## Ratios (out of 44424 Images)

In [11]:
# RATIOs (out of 44424 Images)
df3 = pd.DataFrame(TCount).T
df3/len(real_gt_real)

Unnamed: 0,0.00,0.05,0.10,0.15,0.20,0.25,0.30,0.35,0.40,0.45,0.50,0.55,0.60,0.65,0.70,0.75,0.80,0.85,0.90,0.95
ResNet,1.0,0.99955,0.995926,0.987372,0.975824,0.961395,0.943949,0.924748,0.904128,0.879727,0.852039,0.820615,0.789618,0.759499,0.728503,0.695412,0.660409,0.618855,0.563479,0.474158
KNN,1.0,0.999865,0.994778,0.982599,0.965222,0.942216,0.91894,0.892513,0.863362,0.832478,0.797362,0.75995,0.725126,0.690708,0.654444,0.618202,0.578223,0.533428,0.471547,0.367684
EMD,1.0,0.99991,0.99453,0.981857,0.963646,0.942351,0.918558,0.891365,0.862529,0.83187,0.795606,0.757563,0.721142,0.685755,0.650167,0.613024,0.570885,0.523231,0.457883,0.3524
CHM,1.0,0.999932,0.996016,0.985346,0.969521,0.950657,0.92862,0.903543,0.875968,0.845534,0.808504,0.770642,0.734783,0.699599,0.665744,0.629975,0.590357,0.54322,0.481159,0.379615


## Human Performance Analysis

In [12]:
df = pd.read_csv('../data/ImageNet-Human-Study-Summary.csv')

## Removing Bad Users 

We manually marked two users as **bad** based on their average performance on `1:1` ratio samples

1. `6395881` 
1. `6396044`


And after resampling we excluded these `4` users:

1. `6386000`
1. `6382894`
1. `6385998`
1. `6382878`

In [13]:
df = df[~df['UID'].isin([6395881, 6396044] + [6386000, 6382894, 6385998, 6382878] )]

In [14]:
print(f"total {len(df)} entries")

total 10620 etnries


In [15]:
df_treshold = copy.deepcopy(df[['Method', 'Correctness', 'ConfidenceScore']]) 

## Human Performance

In [16]:
human_accuracy = {}
for T in np.arange(0., 1.05, 0.05):
  human_accuracy[f'{T:0.2f}'] = df_treshold[df_treshold['ConfidenceScore']<T].groupby('Method')['Correctness'].mean()

In [17]:
pd.DataFrame.from_dict(human_accuracy).loc[::-1].reset_index().sort_values(by='Method', key=lambda x: x.map(ORDER_DICT))

Unnamed: 0,Method,0.00,0.05,0.10,0.15,0.20,0.25,0.30,0.35,0.40,0.45,0.50,0.55,0.60,0.65,0.70,0.75,0.80,0.85,0.90,0.95,1.00
1,ResNet-50,,1.0,1.0,0.890909,0.859813,0.898204,0.924107,0.891374,0.867347,0.846154,0.837937,0.815166,0.807956,0.805,0.778286,0.760593,0.761044,0.75653,0.756332,0.760836,0.815248
0,kNN,,,1.0,0.971429,0.971429,0.902256,0.800643,0.778626,0.778626,0.734021,0.707826,0.679939,0.678705,0.678705,0.674317,0.674317,0.665,0.669251,0.669251,0.704176,0.704176
2,EMD-NN,,,1.0,0.961039,0.961039,0.952381,0.883636,0.831135,0.831135,0.814898,0.774545,0.735878,0.704057,0.704057,0.686808,0.686808,0.684564,0.69697,0.69697,0.731529,0.731529
3,EMD-Corr,,,1.0,0.952941,0.952941,0.955696,0.892734,0.851948,0.851948,0.825893,0.791667,0.746667,0.724018,0.724018,0.70439,0.70439,0.707136,0.71741,0.71741,0.746269,0.746269
4,CHM-NN,,,,0.947368,0.947368,0.889655,0.813725,0.805774,0.805774,0.792208,0.77881,0.73717,0.69483,0.69483,0.689873,0.689873,0.692308,0.687943,0.687943,0.714922,0.714922
5,CHM-Corr,,,,0.918919,0.918919,0.869863,0.819048,0.783505,0.783505,0.773913,0.768519,0.733542,0.703046,0.703046,0.702128,0.702128,0.705645,0.693202,0.693202,0.712984,0.712984


## Number of Samples

In [18]:
human_count = {}
for T in np.arange(0., 1.05, 0.05):
  human_count[f'{T:0.2f}'] = df_treshold[df_treshold['ConfidenceScore']<T].groupby('Method')['Correctness'].count()

In [19]:
pd.DataFrame.from_dict(human_count).loc[::-1].reset_index().sort_values(by='Method', key=lambda x: x.map(ORDER_DICT))

Unnamed: 0,Method,0.00,0.05,0.10,0.15,0.20,0.25,0.30,0.35,0.40,0.45,0.50,0.55,0.60,0.65,0.70,0.75,0.80,0.85,0.90,0.95,1.00
1,ResNet-50,,3.0,16.0,55,107,167,224,313,392,455,543,633,729,800,875,944,996,1072,1145,1292,1797
0,kNN,,,3.0,70,70,133,311,393,393,485,575,653,803,803,915,915,1000,1161,1161,1413,1413
2,EMD-NN,,,2.0,77,77,147,275,379,379,443,550,655,838,838,993,993,1043,1155,1155,1367,1367
3,EMD-Corr,,,3.0,85,85,158,289,385,385,448,576,675,866,866,1025,1025,1079,1189,1189,1407,1407
4,CHM-NN,,,,76,76,145,306,381,381,462,538,643,793,793,948,948,1001,1128,1128,1347,1347
5,CHM-Corr,,,,74,74,146,315,388,388,460,540,638,788,788,940,940,992,1118,1118,1317,1317


# Sample we ask Human

In [20]:
optimal_T_imagenet = {'ResNet-50':  0.65,
                      'kNN':      0.4,
                      'EMD-NN':    0.45,
                      'EMD-Corr':  0.5,
                      'CHM-NN':    0.5,
                      'CHM-Corr':  0.5}

In [21]:
df_for_human = df[df['ConfidenceScore'] <optimal_T_imagenet['kNN']]

In [22]:
df_for_human.groupby(['Method', 'ClassifierCorrectness'])['UID'].count().reset_index().sort_values(by='Method', key=lambda x: x.map(ORDER_DICT))

Unnamed: 0,Method,ClassifierCorrectness,UID
8,ResNet-50,0,358
9,ResNet-50,1,34
10,kNN,0,341
11,kNN,1,52
6,EMD-NN,0,359
7,EMD-NN,1,20
4,EMD-Corr,0,368
5,EMD-Corr,1,17
2,CHM-NN,0,346
3,CHM-NN,1,35


In [23]:
dfs = []

for k, v in optimal_T_imagenet.items():
  new_df = df[(df['ConfidenceScore'] < v) & (df['Method'] ==  k)]
  dfs.append(new_df)

In [24]:
df_offloads = pd.concat(dfs, axis=0)

In [25]:
df_difflevel = df_offloads.groupby(['UID', 'Method', 'DifficultyLevel', 'ClassifierCorrectness'])['Correctness'].agg(["count", "sum", "mean"]).unstack().stack().reset_index().sort_values(by='Method', key=lambda x: x.map(ORDER_DICT))
df_difflevel['mean'] = df_difflevel['mean']*100

In [None]:
with sns.plotting_context('paper', font_scale = 1.4):
  g = sns.FacetGrid(df_difflevel, col="DifficultyLevel", row="ClassifierCorrectness", height=6, col_order=['Easy', 'Medium', 'Hard'])
  g.map(sns.barplot, "Method", "mean", order=['ResNet-50', 'kNN', 'EMD-NN', 'EMD-Corr', 'CHM-NN', 'CHM-Corr'], palette="tab10", capsize=.1)

  [plt.setp(ax.get_xticklabels(), rotation=0) for ax in g.axes.flat]

  for ax in g.axes.flat:
    for container in ax.containers:
      ax.bar_label(container, fontsize=12, fmt='%.2f%%', label_type='center')

  plt.ylim(0, 105)
  
  axes = g.axes.flatten()
  for ci, C in enumerate(['Wrong', 'Correct']):
    for di, D in enumerate(['Easy', 'Medium', 'Hard']):
      axes[(ci*3)+di].set_title(f"AI:{C} | Difficulty Category:{D}")

  axes[0].set_ylabel('Average human accuracy')
  axes[3].set_ylabel('Average human accuracy')

  g.fig.subplots_adjust(top=0.9)
  g.fig.suptitle('MEAN ACCURACY - ImageNet - Samples offloaded to Human @ Optimal Threshold (for each classifier) \n\n')
  
  plt.show()

In [None]:
df_correctness = df_offloads.groupby(['UID', 'Method', 'ClassifierCorrectness'])['Correctness'].agg(["count", "sum", "mean"]).unstack().stack().reset_index()
df_correctness['mean'] *=100

In [None]:
with sns.plotting_context('paper', font_scale = 1.4):
  g = sns.FacetGrid(df_correctness, col="ClassifierCorrectness", height=6)
  g.map(sns.barplot, "Method", "mean", order=['ResNet-50', 'kNN', 'EMD-NN', 'EMD-Corr', 'CHM-NN', 'CHM-Corr'], palette="tab10", capsize=.1)
  
  [plt.setp(ax.get_xticklabels(), rotation=0) for ax in g.axes.flat]

  for ax in g.axes.flat:
    for container in ax.containers:
      ax.bar_label(container, fontsize=12, label_type='center', fmt='%.2f%%')

  axes = g.axes.flatten()
  axes[0].set_ylabel('Average human Accuracy')
  g.fig.subplots_adjust(top=0.9)
  g.fig.suptitle('MEAN ACCURACY - ImageNet - Samples offloaded to Human @ Optimal Threshold (for each classifier) \n\n')
  
  plt.show()

In [None]:
df_correctness_count = df_offloads.groupby(['Method', 'ClassifierCorrectness'])['UID'].count().reset_index()

In [None]:
with sns.plotting_context('paper', font_scale = 1.4):
  g = sns.FacetGrid(df_correctness_count, col="ClassifierCorrectness", height=6)
  g.map(sns.barplot, "Method", "UID", order=['ResNet-50', 'kNN', 'EMD-NN', 'EMD-Corr', 'CHM-NN', 'CHM-Corr'], palette="tab10", capsize=.1)
  
  [plt.setp(ax.get_xticklabels(), rotation=0) for ax in g.axes.flat]

  for ax in g.axes.flat:
    for container in ax.containers:
      ax.bar_label(container, fontsize=12, label_type='center')

  axes = g.axes.flatten()
  axes[0].set_ylabel('Average human Accuracy')
  
  g.fig.subplots_adjust(top=0.9)
  g.fig.suptitle('# sample - ImageNet - Samples offloaded to Human @ Optimal Threshold (for each classifier) \n\n')
  
  plt.show()