# Visualizing classification probabilities

_Alex Malz (GCCL@RUB)_

Note: A lot of the cells here are very slow to run but only need to be run once, ever.
Follow the `##` instructions in the commented-out cells: uncomment the slow code to run it the first time, then comment it out again when fiddling with plots.

In [None]:
import bisect
import multiprocessing as mp
import numpy as np
np.random.seed(42)
import os
import pandas as pd
import pickle
# from sklearn.neighbors.kde import KernelDensity
import scipy.stats as sps
# import seaborn as sns
# import sys

# epsilon = sys.float_info.epsilon

In [None]:
# Plot styling for the whole notebook: serif fonts, label sizes, and the
# resolution/format defaults used by savefig.
import matplotlib as mpl
# print(mpl.rcParams.items)
# Backend is selected before pyplot is imported further down.
# NOTE(review): the `%matplotlib inline` magic below presumably replaces this
# backend inside a notebook session -- confirm whether 'PS' is still needed.
mpl.use('PS')
mpl.rcParams['text.usetex'] = False
mpl.rcParams['mathtext.rm'] = 'serif'
mpl.rcParams['font.family'] = 'serif'
# NOTE(review): the next line repeats the assignment directly above.
mpl.rcParams["font.family"] = "serif"
mpl.rcParams["mathtext.fontset"] = "dejavuserif"
mpl.rcParams['font.serif'] = 'DejaVu Serif'
# Earlier font experiments, kept for reference:
# mpl.rcParams['text.usetex'] = False
# mpl.rcParams['mathtext.rm'] = 'serif'
# mpl.rcParams['font.weight'] = 'light'
# mpl.rcParams['font.family'] = 'serif'
# mpl.rcParams['font.serif'] = ['Times New Roman']
# # mpl.rcParams['font.family'] = ['Times New Roman']
# Text sizes for titles, axis labels and tick labels.
mpl.rcParams['axes.titlesize'] = 20
mpl.rcParams['axes.labelsize'] = 16
mpl.rcParams['xtick.labelsize'] = 12
mpl.rcParams['ytick.labelsize'] = 12
# Resolution and defaults for displayed and saved figures.
mpl.rcParams['savefig.dpi'] = 250
mpl.rcParams['figure.dpi'] = 250
# NOTE(review): the plotting functions in this notebook save to explicit
# '.png' paths, so this default format presumably never applies -- confirm.
mpl.rcParams['savefig.format'] = 'svg'
mpl.rcParams['savefig.bbox'] = 'tight'
import matplotlib.pyplot as plt
%matplotlib inline
# print(mpl.rcParams.items)
from matplotlib.colors import LogNorm

import pylab
from mpl_toolkits.axes_grid1 import ImageGrid
from mpl_toolkits.axes_grid1.inset_locator import inset_axes

# def truncate_colormap(cmap, minval=0.0, maxval=1.0, n=100):
#     new_cmap = mpl.colors.LinearSegmentedColormap.from_list(
#         'trunc({n},{a:.2f},{b:.2f})'.format(n=cmap.name, a=minval, b=maxval),
#         cmap(np.linspace(minval, maxval, n)))
#     return new_cmap

# cmap = plt.get_cmap('hot_r')
# fave_cmap = truncate_colormap(cmap, 0.35, 1.0)

In [None]:
# import proclam
# from proclam import *

# # epsilon = 1.e-1

## Preprocessing

First, establish informative names.

In [None]:
# Integer class code -> display name for the classes that are shared between
# the true and the predicted labels.
label_dict = {90: 'SNIa',
              67: 'SNIa-91bg',
              52: 'SNIax',
              42: 'SNII',
              62: 'SNIbc',
              95: 'SLSN-I',
              15: 'TDE',
              64: 'KN',
              88: 'AGN',
              92: 'RRL',
              65: 'M-dwarf',
              16: 'EB',
              53: 'Mira',
              6: r'$\mu$Lens-Single'}

# True labels: the shared classes plus the three-digit 99x codes.
true_labels = label_dict.copy()
true_labels[991] = r'$\mu$Lens-Binary'
true_labels[992] = 'ILOT'
true_labels[993] = 'CaRT'
true_labels[994] = 'PISN'
# The backslash is required for mathtext to render the Greek letter; without
# it the label shows the letters "mu" (the other muLens entries use \mu too).
true_labels[995] = r'$\mu$Lens-String'

# Predicted labels: the shared classes plus a single catch-all class 99.
sub_labels = label_dict.copy()
sub_labels[99] = 'Other'

### Truth table

The true labels subdivide class 99 but the predicted labels don't (which breaks some `proclam.metrics.util` functions, among other things).
Warning: this fix is slow, so only run the cell below once and then read the result in from a file next time.

In [None]:
## Use me if you've never run the notebook before.
# truth = pd.read_csv('plasticc_test_metadata.csv')
# truth['ideal_label'] = truth['true_target']
# for i in [991, 992, 993, 994]:
#     truth.loc[truth['true_target'] == i, 'ideal_label'] = 99
# header = ['object_id', 'true_target', 'ideal_label']
# truth.to_csv('truth.csv', columns=header, index=False)

## Use me if you've run the notebook before.
# Load the cached truth table, using object_id as the row index.
truth = pd.read_csv('truth.csv', index_col='object_id')

# Distinct class ids and the number of objects carrying each one, for the
# 'true_target' column and for the 'ideal_label' column respectively.
true_class_ids, true_class_counts = np.unique(truth['true_target'], return_counts=True)
ideal_class_ids, ideal_class_counts = np.unique(truth['ideal_label'], return_counts=True)

Do the same for the validation classifier (data found [here](https://www.dropbox.com/s/m4l79ire87behmm/validation_tables.tar.gz?dl=0)), but first massage the data format (pretty slow!).

In [None]:
# # Run me only if you've never run the notebook before.
# val_label_dict = {'SNIa-Normal': 90,#'SNIa',
#               'Ia-91bg': 67,#'SNIa-91bg',
#               'SNIa-x': 52,#'SNIax',
#               'SNCC-II': 42,#'SNII',
#               'SNCC-Ibc': 62,#'SNIbc',
#               'SLSN-I': 95,#'SLSN-I',
#               'TDE': 15,#'TDE',
#               'Kilonova': 64,#'KN',
#               'AGN': 88,#'AGN',
#               'RRLyrae': 92,#'RRL',
#               'Mdwarf': 65,#'M-dwarf',
#               'EBE': 16,#'EB',
#               'Mira': 53,#'Mira',
#               'uLens-point': 6}#r'$\mu$Lens-Single'}

# comparisons = []
# for field in ['DDF', 'WFD']:
#     contestant = 'validation_'+field
#     comparisons.append(contestant)
#     if not os.path.exists('submissions'):
#         os.makedirs('submissions')
#     if not os.path.exists(os.path.join('submissions', contestant)):
#         os.makedirs(os.path.join('submissions', contestant))
#     for status in ['truth_table', 'predicted_prob']:
#         fn = status+'_'+field+'.csv'
#         validation = pd.read_csv(fn)#, index_col='objids')
#         validation.rename(columns=val_label_dict, inplace=True)
#         validation['objids'] = validation['objids'].apply(lambda x: x.rpartition('_')[-1])
#         validation.rename(columns={'objids': 'object_id'}, inplace=True)
#         validation.set_index('object_id', inplace=True)
#         if status == 'truth_table':
#             truecat = validation.copy()
#             truecat['true_target'] = truecat.idxmax(axis=1)
#             truecat['true_target'].to_csv(field+'truth.csv', header=True)
#         elif status == 'predicted_prob':
#             probcat = validation.copy()
#             probcat.to_csv('submissions/validation/'+field+'probs.csv', header=True)
# #     print('saved per-true-class data for '+contestant)

In [None]:
# # wfd = pd.read_csv('WFDtruth.csv')
# # wfd.set_index('object_id', inplace=True)
# # ddf = pd.read_csv('DDFtruth.csv')
# # ddf.set_index('object_id', inplace=True)
# # in_val = wfd.append(ddf)
# # in_val.to_csv('submissions/validation/truth.csv', header=True)

# # wfd_probs = pd.read_csv('submissions/validation/WFDprobs.csv').set_index('object_id')
# # ddf_probs = pd.read_csv('submissions/validation/DDFprobs.csv').set_index('object_id')
# # val_probs = wfd_probs.append(ddf_probs)
# val_probs = pd.read_csv('0_validation.csv').set_index('object_id')
# # print(val_probs.isnull().values.sum())
# # print(np.any(np.isnan(val_probs)))
# val_probs.to_csv('submissions/validation/validation_probs.csv', header=True)

In [None]:
# # # Run me only if you've never run the notebook before. (slow)
# for i in label_dict.keys():
#     one_class = val_probs.loc[in_val[in_val['true_target'] == i].index, :]
#     if ((np.all(one_class == epsilon) or np.any(np.isnan(one_class)))):
#         print('arrr, thar be NaNs or all 0s!')
#         one_class.fillna(epsilon)
#         one_class.replace(to_replace=0, value=epsilon)
#         one_class.replace(to_replace=0., value=epsilon)
# #     assert ~((np.all(one_class == epsilon) or np.any(np.isnan(one_class))))
#     if ~((np.all(one_class == epsilon) or np.any(np.isnan(one_class)))):
#         normalized = one_class.sum(axis=1)
#     else:
#         print('normalization failed for '+str(i)+' with all '+str(len(one_class))+' zeros='+str(np.all(one_class == epsilon))+' and NaNs='+str(np.any(np.isnan(one_class))))
#         normalized = 1.
#     to_save = one_class.divide(normalized, axis=0)
#     pred_class_inds = {i: int(i[6:]) for i in to_save.columns.values}
#     to_save.rename(columns=pred_class_inds, inplace=True)
#     to_save.to_csv(os.path.join('submissions/validation', 'probvecs'+str(i)+'true.csv'), header=True)
# #     print('saved validation class '+str(i))

### Classification probabilities

This is just for the winning submission for now.
You have to get the files with the submission results first and put them in the same directory as this notebook.
_(I don't know if it's okay to post the link to where those live or if it's private and will be published with the paper through Zenodo or something.)_

In [None]:
# Submission names; each is also the stem of that contestant's CSV file and
# the name of its folder under submissions/.
contestants = [
    '1_Kyle',
    '2_MikeSilogram',
    '3_MajorTom',
    '4_AhmetErdem',
    '5_SKZLostInTranslation',
    '6_StefanStefanov',
    '7_hklee',
    '8_rapidsai',
    '9_ThreeMusketeers',
    '10_JJ',
    '11_SimonChen',
    '12_Go_Spartans',
]
# The validation classifier is listed first, followed by the submissions.
all_contestants = ['validation', *contestants]
# Show the positional index of every contestant.
print(list(enumerate(contestants)))

The following cell chops up the data into bite-sized pieces.
It's slow but only needs to be done once.

In [None]:
# # Run me only if you've never run the notebook before. (SLOW!)
# for contestant in contestants:
#     submission = pd.read_csv(contestant+'.csv', index_col='object_id')
#     pred_class_inds = {i: int(i[6:]) for i in submission.columns.values}
#     submission.rename(columns=pred_class_inds, inplace=True)
#     if not os.path.exists('submissions'):
#         os.makedirs('submissions')
#     if not os.path.exists(os.path.join('submissions', contestant)):
#         os.makedirs(os.path.join('submissions', contestant))
#     for i in truth.true_target.unique():
#         to_save = submission[truth['true_target'] == i]
#         to_save.to_csv(os.path.join('submissions/'+contestant, 'probvecs'+str(i)+'true.csv'))
#     print('saved per-true-class data for '+contestant)

In [None]:
def break_down_submission(ind):
    """Split one contestant's submission into one CSV per true class.

    Reads ``<contestant>.csv`` from the working directory, relabels the
    probability columns with integer class codes, and writes the rows of each
    true class to ``submissions/<contestant>/probvecs<class>true.csv``.

    Parameters
    ----------
    ind : int
        Position of the contestant in the module-level ``contestants`` list.

    Returns
    -------
    str
        The contestant name, so a ``pool.map`` caller can see what finished.
    """
    contestant = contestants[ind]
    submission = pd.read_csv(contestant+'.csv', index_col='object_id')
    # Drop the first six characters of every column name and keep the rest as
    # an integer (presumably a 'class_' prefix -- confirm against the files).
    pred_class_inds = {i: int(i[6:]) for i in submission.columns.values}
    submission.rename(columns=pred_class_inds, inplace=True)
    # exist_ok avoids the check-then-create race when several pool workers
    # reach this point at the same time; it also creates 'submissions' itself.
    out_dir = os.path.join('submissions', contestant)
    os.makedirs(out_dir, exist_ok=True)
    for i in truth.true_target.unique():
        # Boolean mask taken from the truth table selects this class's rows.
        to_save = submission[truth['true_target'] == i]
        to_save.to_csv(os.path.join(out_dir, 'probvecs'+str(i)+'true.csv'))
    print('saved per-true-class data for '+contestant)
    return contestant

In [None]:
# nps = mp.cpu_count() - 4
# pool = mp.Pool(nps)
# printout = pool.map(break_down_submission, range(len(contestants)))

We need to account for the mismatch in true vs. predicted labels for class 99.

In [None]:
# # Run me only if you've never run the notebook before.
# for contestant in contestants:
#     umbrella = []
#     for i in truth.true_target.unique():
#         if str(i)[:2] == '99':
#             umbrella.append(pd.read_csv(os.path.join('submissions/'+contestant, 'probvecs'+str(i)+'true.csv'), index_col='object_id'))
#     submission = pd.concat(umbrella)
#     submission.to_csv(os.path.join('submissions/'+contestant, 'probvecs99true.csv'))
#     print('saved 99 aggregate data for '+contestant)

In [None]:
def aggregate_99(ind):
    """Merge a contestant's per-class files for every true class starting with '99'.

    Parameters
    ----------
    ind : int
        Position of the contestant in the module-level ``contestants`` list.

    Returns
    -------
    str
        The contestant name.
    """
    contestant = contestants[ind]
    folder = 'submissions/'+contestant
    # One frame per true class whose code begins with '99', in truth order.
    pieces = [
        pd.read_csv(os.path.join(folder, 'probvecs'+str(code)+'true.csv'), index_col='object_id')
        for code in truth.true_target.unique()
        if str(code).startswith('99')
    ]
    combined = pd.concat(pieces)
    combined.to_csv(os.path.join(folder, 'probvecs99true.csv'))
    print('saved 99 aggregate data for '+contestant)
    return contestant

In [None]:
# nps = mp.cpu_count() - 4
# pool = mp.Pool(nps)
# printout = pool.map(aggregate_99, range(len(contestants)))

We have to further split up the data into small pieces and calculate the KDE of each, which is also slow, though only has to be done once.
UPDATE: This isn't actually that slow anymore.

In [None]:
# # Run me only if you've never run the notebook before.
# for contestant in contestants:
#     for one_target in truth.true_target.unique():
#         all_prob = pd.read_csv(os.path.join('submissions/'+contestant, 'probvecs'+str(one_target)+'true.csv'), index_col='object_id')
#         n_true = len(all_prob)
#         print('calculating '+contestant+'\'s KDE for true class '+str(one_target)+' with '+str(n_true)+' objects')
# #         pred_class_inds = {i: int(i[6:]) for i in all_prob.columns.values}
# #         all_prob.rename(columns=pred_class_inds, inplace=True)
#         for cl_est in all_prob.columns:
#             to_plot = all_prob[cl_est].values
#             if not np.all(to_plot == 0):
#                 kernel = sps.gaussian_kde(to_plot)
#             else:
#                 kernel = None
#             with open(os.path.join('submissions/'+contestant, 'kernel'+str(one_target)+'true'+str(cl_est)+'pred.pkl'), 'wb') as fn:
#                 pickle.dump(kernel, fn)
#             print('completed '+contestant+'\'s KDE for predicted class '+str(cl_est))

In [None]:
def split_and_kde(ind):
    """Fit and pickle a KDE for every (true class, predicted class) pair.

    For each true class in ``truth`` the contestant's per-class probability
    table is read, and each of its columns is fitted with
    ``scipy.stats.gaussian_kde``.  A column that is zero everywhere (within
    ``np.isclose`` tolerance) is stored as ``None`` instead of a kernel.

    Parameters
    ----------
    ind : int
        Position of the contestant in the module-level ``contestants`` list.

    Returns
    -------
    str
        The contestant name.
    """
    contestant = contestants[ind]
    folder = 'submissions/'+contestant
    for one_target in truth.true_target.unique():
        target_tag = str(one_target)
        all_prob = pd.read_csv(os.path.join(folder, 'probvecs'+target_tag+'true.csv'), index_col='object_id')
        print('calculating '+contestant+'\'s KDE for true class '+target_tag+' with '+str(len(all_prob))+' objects')
        for cl_est in all_prob.columns:
            samples = all_prob[cl_est].values
            # No kernel is fitted when every probability is (close to) zero.
            all_zero = np.all(np.isclose(samples, 0.))
            kernel = None if all_zero else sps.gaussian_kde(samples)
            pkl_path = os.path.join(folder, 'kernel'+target_tag+'true'+str(cl_est)+'pred.pkl')
            with open(pkl_path, 'wb') as fn:
                pickle.dump(kernel, fn)
            print('completed '+contestant+'\'s KDE for predicted class '+str(cl_est))
    return contestant

In [None]:
# nps = mp.cpu_count() - 4
# pool = mp.Pool(nps)
# printout = pool.map(split_and_kde, range(len(contestants)))

Now remove all 99 and only run on objects in the validation set, for comparison.
THIS IS WHAT WAS BROKEN -- THE OBJECT IDS DO NOT CORRESPOND BETWEEN VALIDATION AND PLASTICC!

In [None]:
# # Run me only if you've never run the notebook before.
# for contestant in contestants:
#     if not os.path.exists('submissions/validation/'+contestant):
#         os.makedirs('submissions/validation/'+contestant)
# #     'validation_truth.csv'
#     for one_target in label_dict.keys():
#         all_prob = pd.read_csv(os.path.join('submissions/'+contestant, 'probvecs'+str(one_target)+'true.csv'), index_col='object_id')
#         val_prob = pd.read_csv(os.path.join('submissions/validation', 'probvecs'+str(one_target)+'true.csv'), index_col='object_id')
#         in_val = all_prob.loc[val_prob.index, :]
#         in_val.fillna(epsilon)
#         in_val.replace(to_replace=0, value=epsilon)
#         in_val.replace(to_replace=0., value=epsilon)
#         normalize = in_val.sum(axis=1) - in_val['99']
#         in_val = in_val.divide(normalize, axis=0)
#         in_val.to_csv(os.path.join('submissions/validation/'+contestant, 'probvecs'+str(one_target)+'true.csv'), header=True)
#         n_true = len(in_val)
#         print('calculating '+contestant+'\'s KDE for true class '+str(one_target)+' with '+str(n_true)+' objects')
#         for cl_est in label_dict.keys():
#             to_plot = in_val[str(cl_est)].values
#             if not np.any(np.isnan(to_plot)):
#                 kernel = sps.gaussian_kde(to_plot)
#             else:
#                 print(str(sum(np.isnan(to_plot)))+' NaNs preventing KDE for '+cl_est)
#                 kernel = None
#             with open(os.path.join('submissions/validation/'+contestant, 'kernel'+str(one_target)+'true'+str(cl_est)+'pred.pkl'), 'wb') as fn:
#                 pickle.dump(kernel, fn)

Do the same for class 99 to account for the mismatch in true vs. predicted labels.

In [None]:
## Run me only if you've never run the notebook before, or if you changed the probabilities at which to evaluate the KDEs.
# for contestant in contestants:
#     for one_target in [99]:
#         all_prob = pd.read_csv(os.path.join('submissions/'+contestant, 'probvecs'+str(one_target)+'true.csv'), index_col='object_id')
#         n_true = len(all_prob)
# #         print('calculating '+contestant+'\'s KDE for true class '+str(one_target)+' with '+str(n_true)+' objects')
# #         pred_class_inds = {i: int(i[6:]) for i in all_prob.columns.values}
# #         all_prob.rename(columns=pred_class_inds, inplace=True)
#         for cl_est in all_prob.columns:
#             to_plot = all_prob[cl_est].values
#             if not np.all(to_plot == 0):
#                 kernel = sps.gaussian_kde(to_plot)
#             else:
#                 kernel = None
#             with open(os.path.join('submissions/'+contestant, 'kernel'+str(one_target)+'true'+str(cl_est)+'pred.pkl'), 'wb') as fn:
#                 pickle.dump(kernel, fn)
# #             print('completed '+contestant+'\'s KDE for predicted class '+str(cl_est))

In [None]:
def fit_kde_99(ind):
    """Fit and pickle a KDE per predicted class for the aggregated true class 99.

    Reads ``submissions/<contestant>/probvecs99true.csv`` and, for every
    column, writes ``kernel99true<column>pred.pkl`` holding either a
    ``scipy.stats.gaussian_kde`` or ``None`` when the column is all zeros.

    Parameters
    ----------
    ind : int
        Position of the contestant in the module-level ``contestants`` list.

    Returns
    -------
    str
        The contestant name.
    """
    contestant = contestants[ind]
    for one_target in [99]:
        all_prob = pd.read_csv(os.path.join('submissions/'+contestant, 'probvecs'+str(one_target)+'true.csv'), index_col='object_id')
        for cl_est in all_prob.columns:
            to_plot = all_prob[cl_est].values
            # Same tolerance-based all-zero test as split_and_kde, so both
            # functions agree on which columns get no kernel.
            if not np.all(np.isclose(to_plot, 0.)):
                kernel = sps.gaussian_kde(to_plot)
            else:
                kernel = None
            with open(os.path.join('submissions/'+contestant, 'kernel'+str(one_target)+'true'+str(cl_est)+'pred.pkl'), 'wb') as fn:
                pickle.dump(kernel, fn)
    return contestant

In [None]:
# nps = mp.cpu_count() - 4
# pool = mp.Pool(nps)
# printout = pool.map(fit_kde_99, range(len(contestants)))
# printout

Evaluate the KDEs at grid points, unfortunately also slow (though I'm not sure why).
It has to be re-run for each updated set of grid points but saves the output so the plot can be tweaked without running it again.
UPDATE: This is now only kinda slow, major improvement!

In [None]:
# Number of equal-width intervals between probability 0 and 1.
nbins = 100
# The nbins+1 evenly spaced grid points (both endpoints included) at which the
# KDEs are evaluated and against which the violins are drawn.
positions = np.linspace(0., 1., nbins+1)

In [None]:
# # Run me only if you've never run the notebook before, or if you changed the probabilities at which to evaluate the KDEs.
# for contestant in contestants:
#     for one_target in truth.true_target.unique():
#         print('evaluating '+contestant+'\'s KDEs for true class '+str(one_target)+' with '+str(n_true)+' objects')
#         for cl_est in truth.ideal_label.unique():
#             with open(os.path.join('submissions/'+contestant, 'kernel'+str(one_target)+'true'+str(cl_est)+'pred.pkl'), 'rb') as fn:
#                 kernel = pickle.load(fn)
#             if kernel is not None:
#                 data = kernel(positions)#[:, np.newaxis])
#             else:
#                 data = np.zeros_like(positions)
#             print('evaluated '+contestant+'\'s KDE for predicted class '+str(cl_est))
#             np.savetxt(os.path.join('submissions/'+contestant, 'violin'+str(one_target)+'true'+str(cl_est)+'pred.txt'), data)

In [None]:
def evaluate_kde(ind):
    """Evaluate every pickled KDE of one contestant on the ``positions`` grid.

    For each true class in ``truth.true_target`` and each predicted class in
    ``truth.ideal_label`` the kernel is loaded and its values on ``positions``
    are written to ``violin<true>true<pred>pred.txt``.  A pickled ``None`` is
    written out as zeros.

    Parameters
    ----------
    ind : int
        Position of the contestant in the module-level ``contestants`` list.

    Returns
    -------
    str
        The contestant name.
    """
    contestant = contestants[ind]
    folder = 'submissions/'+contestant
    predicted_classes = truth.ideal_label.unique()
    for one_target in truth.true_target.unique():
        print('evaluating '+contestant+'\'s KDEs for true class '+str(one_target))
        for cl_est in predicted_classes:
            stem = str(one_target)+'true'+str(cl_est)+'pred'
            with open(os.path.join(folder, 'kernel'+stem+'.pkl'), 'rb') as fn:
                kernel = pickle.load(fn)
            data = np.zeros_like(positions) if kernel is None else kernel(positions)
            # Sanity check: no evaluated value may be negative.
            assert not np.any(data < 0.)
            print('evaluated '+contestant+'\'s KDE for predicted class '+str(cl_est))
            np.savetxt(os.path.join(folder, 'violin'+stem+'.txt'), data)
    return contestant

In [None]:
# nps = mp.cpu_count() - 4
# pool = mp.Pool(nps)
# printout = pool.map(evaluate_kde, range(len(contestants)))
# printout

Prepare for violin plots specific to class 99 true vs. predicted asymmetry.

In [None]:
# # Run me only if you've never run the notebook before, or if you changed the probabilities at which to evaluate the KDEs.
# for contestant in contestants:
#     for one_target in [99]:
# #         print('evaluating '+contestant+'\'s KDEs for true class '+str(one_target)+' with '+str(n_true)+' objects')
#         for cl_est in truth.ideal_label.unique():
#             with open(os.path.join('submissions/'+contestant, 'kernel'+str(one_target)+'true'+str(cl_est)+'pred.pkl'), 'rb') as fn:
#                 kernel = pickle.load(fn)
#             if kernel is not None:
#                 data = kernel(positions)#[:, np.newaxis])
#             else:
#                 data = np.zeros_like(positions)
# #             print('evaluated '+contestant+'\'s KDE for predicted class '+str(cl_est))
#             np.savetxt(os.path.join('submissions/'+contestant, 'violin'+str(one_target)+'true'+str(cl_est)+'pred.txt'), data)

In [None]:
def evaluate_kde_99(ind):
    """Evaluate the class-99 KDEs of one contestant on the ``positions`` grid.

    Loads ``kernel99true<pred>pred.pkl`` for every predicted class in
    ``truth.ideal_label`` and writes the gridded values to
    ``violin99true<pred>pred.txt``; a pickled ``None`` becomes zeros.

    Parameters
    ----------
    ind : int
        Position of the contestant in the module-level ``contestants`` list.

    Returns
    -------
    str
        The contestant name.
    """
    contestant = contestants[ind]
    folder = 'submissions/'+contestant
    for one_target in [99]:
        for cl_est in truth.ideal_label.unique():
            stem = str(one_target)+'true'+str(cl_est)+'pred'
            with open(os.path.join(folder, 'kernel'+stem+'.pkl'), 'rb') as fn:
                kernel = pickle.load(fn)
            data = np.zeros_like(positions) if kernel is None else kernel(positions)
            np.savetxt(os.path.join(folder, 'violin'+stem+'.txt'), data)
    return contestant

In [None]:
# nps = mp.cpu_count() - 4
# pool = mp.Pool(nps)
# printout = pool.map(evaluate_kde_99, range(len(contestants)))
# printout

Now do the same for the validation classifier.

In [None]:
# # Run me only if you've never run the notebook before.
# for contestant in ['validation']:
#     for one_target in label_dict.keys():
#         all_prob = pd.read_csv(os.path.join('submissions/'+contestant, 'probvecs'+str(one_target)+'true.csv'), index_col='object_id')
#         n_true = len(all_prob)
#         print('calculating '+contestant+'\'s KDE for true class '+str(one_target)+' with '+str(n_true)+' objects')
# #         pred_class_inds = {i: int(i[6:]) for i in all_prob.columns.values}
# #         all_prob.rename(columns=pred_class_inds, inplace=True)
#         for cl_col in all_prob.columns:
#             to_plot = all_prob[cl_col].values
#             if not (np.all(to_plot == epsilon) or np.any(np.isnan(to_plot))):
#                 try:
#                     kernel = sps.gaussian_kde(to_plot)
#                     print('completed '+contestant+'\'s KDE for predicted class '+str(cl_col))
#                 except:
#                     print('KDE failed with '+str(to_plot))
#                     kernel = None
#             else:
#                 print('KDE failed with all_zeros='+str(np.all(to_plot == epsilon))+' and NaNs='+str(np.any(np.isnan(to_plot))))
#                 kernel = None
#             with open(os.path.join('submissions/'+contestant, 'kernel'+str(one_target)+'true'+str(cl_col)+'pred.pkl'), 'wb') as fn:
#                 pickle.dump(kernel, fn)

In [None]:
# # Run me only if you've never run the notebook before, or if you changed the probabilities at which to evaluate the KDEs.
# for contestant in ['validation']:
#     for one_target in label_dict.keys():
#         print('evaluating '+contestant+'\'s KDEs for true class '+str(one_target))
#         for cl_est in label_dict:
#             with open(os.path.join('submissions/'+contestant, 'kernel'+str(one_target)+'true'+str(cl_est)+'pred.pkl'), 'rb') as fn:
#                 kernel = pickle.load(fn)
#             if kernel is not None:
#                 data = kernel(positions)#[:, np.newaxis])
#             else:
#                 print('no kernel for true '+str(one_target)+' pred '+str(cl_est))
#                 data = np.zeros_like(positions)
#             print('evaluated '+contestant+'\'s KDE for predicted class '+str(cl_est))
#             np.savetxt(os.path.join('submissions/'+contestant, 'violin'+str(one_target)+'true'+str(cl_est)+'pred.txt'), data)
            
# # for contestant in contestants:
# #     for one_target in label_dict:
# #         print('evaluating '+contestant+'\'s KDEs for true class '+str(one_target)+' with '+str(n_true)+' objects')
# #         for cl_est in label_dict:
# #             with open(os.path.join('submissions/validation/'+contestant, 'kernel'+str(one_target)+'true'+str(cl_est)+'pred.pkl'), 'rb') as fn:
# #                 kernel = pickle.load(fn)
# #             if kernel is not None:
# #                 data = kernel(positions)#[:, np.newaxis])
# #             else:
# #                 data = np.zeros_like(positions)
# #             print('evaluated '+contestant+'\'s KDE for predicted class '+str(cl_est))
# #             np.savetxt(os.path.join('submissions/validation/'+contestant, 'violin'+str(one_target)+'true'+str(cl_est)+'pred.txt'), data)

## Visualize the probabilities

Voila, snazzy violin plots!

TODO: rewrite these plotting functions with helper functions to avoid repeating code

In [None]:
# color_dict = {'1_Kyle': '#6344EE', '2_MikeSilogram': '#E873AB', '3_MajorTom': '#FFB81F', 
#               '4_AhmetErdem': '#30362F', '5_SKZLostInTranslation': '#30362F', '6_StefanStefanov': '#30362F', 
#               '7_hklee': '#30362F', '8_rapidsai': '#30362F', '9_ThreeMusketeers': '#30362F',
#               '10_JJ': '#30362F', '11_SimonChen': '#30362F', '12_Go_Spartans': '#30362F',
#               'validation': '#30362F'}

# Colour per contestant: entries 6 and 9 of `contestants` are drawn in black,
# every other entry takes the next colour of the tab10 palette.
tab10 = plt.get_cmap('tab10')
color_dict = {}
j = 0
for i, contestant in enumerate(contestants):
    if i == 6 or i == 9:
        color_dict[contestant] = 'k'
    else:
        color_dict[contestant] = mpl.colors.to_hex(tab10(j))
        j += 1
# The plotting cells also pass 'validation' as a contestant name; give it an
# entry too (same colour as in the commented-out dict above), otherwise
# looking it up raises KeyError.
color_dict['validation'] = '#30362F'
color_dict

### class 99 performance across classifiers

In [None]:
def violins99(contestants, colors):
    """Overlay violins of the probabilities given to true class-99 objects.

    For every contestant, ``probvecs99true.csv`` and the gridded KDE files
    ``violin99true<class>pred.txt`` are read from ``submissions/<contestant>/``.
    One violin is drawn per predicted class in ``sub_labels``, annotated with
    the percentage of objects whose highest probability falls in that class.
    The figure is saved to ``submissions/violin_99.png``.

    Parameters
    ----------
    contestants : sequence of str
        Contestant names; an empty sequence produces an empty plot.
    colors : mapping
        Contestant name -> matplotlib colour.

    Returns
    -------
    None
    """
    one_target = 99
    stretchfact = 2.5
    fig, ax = plt.subplots(1, 1, figsize=(10, 5))
    # Defined up front so the tick setup below also works with no contestants.
    loc = 0
    ticklabels = []
    for j, contestant in enumerate(contestants):
        all_prob = pd.read_csv(os.path.join('submissions/'+contestant, 'probvecs99true.csv'), index_col='object_id')
        n_true = len(all_prob)
        ax.set_title(str(n_true)+' true '+str(sub_labels[one_target]))
        highlight = colors[contestant]
        # How many objects have each column as their highest probability.
        counts = all_prob.idxmax(axis=1).value_counts()
        # Annotation rows are stacked vertically, one row per contestant.
        yval = j/3.+0.05
        loc = 0
        ticklabels = []
        for cl_est in sub_labels.keys():
            if str(cl_est) in counts.keys():
                nmax = format(float(counts[str(cl_est)]) / float(n_true) * 100., '.0f')+'%'
            else:
                nmax = '0%'
            # The violin files hold what evaluate_kde / evaluate_kde_99 wrote:
            # gaussian_kde values on a linear scale, or all zeros when there
            # was no kernel.  They are used as-is (no exponentiation), the
            # same way val_violins reads them.
            data = np.genfromtxt(os.path.join('submissions/'+contestant, 'violin99true'+str(cl_est)+'pred.txt'))
            if not np.all(np.isclose(data, 0.)):
                # Scale to unit half-width so neighbouring violins do not overlap.
                data = data / np.max(data)
            else:
                data = np.zeros_like(data)
            if cl_est == one_target:
                # Only one violin per contestant carries the legend label.
                ax.fill_betweenx(positions, stretchfact*loc - data, stretchfact*loc + data, alpha=0.5, color=highlight, linewidth=0.1, label=contestant)
            else:
                ax.fill_betweenx(positions, stretchfact*loc - data, stretchfact*loc + data, alpha=0.5, color=highlight, linewidth=0.1)
            ax.text(stretchfact*(loc+0.4), yval, nmax, fontsize=12, color=highlight, ha='center', rotation=0, va='bottom')
            loc += 1
            ticklabels.append(sub_labels[cl_est])
    ax.legend(loc='upper left')
    ax.set_xticks(stretchfact * np.arange(loc))
    ax.set_xticklabels(ticklabels, rotation=45, ha="right")
    ax.set_xlabel('predicted class')
    ax.set_ylabel('probability')
    fig.savefig(os.path.join('submissions', 'violin_99.png'))

In [None]:
# Class-99 violins for the first three entries of `contestants`.
violins99(contestants[:3], color_dict)#['#00A878', '#D8F1A0', '#B4D6D3'])
# Alternative palette kept for reference: #00A878, D8F1A0, B4D6D3

### performance without true class 99 objects

just between real submissions and validation classifier

In [None]:
def val_violins(contestants, colors):
    """Draw one comparison violin figure per true class in ``label_dict``.

    For every true class, each contestant's ``probvecs<class>true.csv`` and
    ``violin<class>true<pred>pred.txt`` files are read from
    ``submissions/<contestant>/``.  The contestants' violins are overlaid, one
    position per predicted class, each annotated with the percentage of
    objects whose highest probability falls in that class.  Figures are saved
    as ``submissions/validation/compare_violin_<label>.png``.

    Parameters
    ----------
    contestants : sequence of str
        Contestant names (the validation classifier may be included).
    colors : mapping
        Contestant name -> matplotlib colour.  Names missing from the mapping
        are drawn in a neutral dark colour instead of raising ``KeyError``.

    Returns
    -------
    None
    """
    fallback_color = '#30362F'
    stretchfact = 2.5
    class_ids = list(label_dict.keys())
    # Same value the original bisect over range(n) at n/2 produced.
    midpoint = len(class_ids) // 2 + 1
    for i in class_ids:
        fig, ax = plt.subplots(1, 1, figsize=(10, 5))
        loc = 0
        for j, contestant in enumerate(contestants):
            all_prob = pd.read_csv(os.path.join('submissions/'+contestant, 'probvecs'+str(i)+'true.csv'), index_col='object_id')
            n_true = len(all_prob)
            # The title shows the object count of the last contestant read.
            ax.set_title(str(n_true)+' true '+str(label_dict[i]))
            highlight = colors.get(contestant, fallback_color)
            # How many objects have each column as their highest probability.
            counts = all_prob.idxmax(axis=1).value_counts()
            # Annotation rows are stacked vertically, one row per contestant.
            yval = j/5.+0.05
            loc = 0
            for cl_est in class_ids:
                if str(cl_est) in counts.keys():
                    nmax = format(float(counts[str(cl_est)]) / float(n_true) * 100., '.0f')+'%'
                else:
                    nmax = '0%'
                data = np.genfromtxt(os.path.join('submissions/'+contestant, 'violin'+str(i)+'true'+str(cl_est)+'pred.txt'))
                if not np.all(np.isclose(data, 0.)):
                    # Scale to unit half-width so neighbouring violins do not overlap.
                    data = data / np.max(data)
                else:
                    data = np.zeros_like(data)
                if cl_est == i:
                    # Only the true-class violin carries the legend label.
                    ax.fill_betweenx(positions, stretchfact*loc - data, stretchfact*loc + data, alpha=0.4, color=highlight, linewidth=0.1, label=contestant)
                else:
                    ax.fill_betweenx(positions, stretchfact*loc - data, stretchfact*loc + data, alpha=0.4, color=highlight, linewidth=0.1)
                ax.text(stretchfact*(loc+0.4), yval, nmax, fontsize=12, color=highlight, ha='center', rotation=0, va='bottom')
                loc += 1
        ticklabels = [label_dict[cl_est] for cl_est in class_ids]
        # Put the legend on the opposite side from the true-class violin.
        if class_ids.index(i) < midpoint:
            legloc = 'upper right'
        else:
            legloc = 'upper left'

        ax.legend(loc=legloc)
        ax.set_xticks(stretchfact * np.arange(loc))
        ax.set_xticklabels(ticklabels, rotation=45, ha="right")
        ax.set_xlabel('predicted class')
        ax.set_ylabel('estimated probability')
        fig.savefig(os.path.join('submissions/validation', 'compare_violin_'+true_labels[i]+'.png'))

In [None]:
# Alternative: compare every contestant at once.
# val_violins(all_contestants, color_dict)
# Compare the first three entries of `contestants` with the validation classifier.
val_violins(contestants[:3]+['validation'], color_dict)

### overall performance

This is a violin plot of the diagonal of the confusion matrix, i.e. the distribution of each object's probability of being labeled as its true class.
This is closely related to what our final metric probed.

In [None]:
def diag_violin(contestant):
    """Plot, per true class, the curve of the probability given to the true label.

    For every class that appears as a key of both `true_labels` and `sub_labels`,
    the precomputed curve in
    'submissions/<contestant>/violin<true>true<pred>pred.txt' with pred == true is
    exponentiated, scaled to a peak of 1 and drawn as a violin.  The curves of all
    three-digit true classes predicted as class 99 are summed into one final
    violin labelled `sub_labels[99]`.  The figure is written to
    'submissions/<contestant>/diagonal_violin.png'.

    Relies on the notebook globals `positions`, `true_labels` and `sub_labels`.

    Parameters
    ----------
    contestant : str
        Name of the subdirectory of 'submissions/' holding the violin files.
    """
    stretchfact = 2.5
    loc = 0
    fig, ax = plt.subplots(1, 1, figsize=(10, 5))
    ax.set_title(contestant+' diagonal')
    ticklabels = []
    others = np.zeros_like(positions)
    for one_target in true_labels.keys():
        for cl_est in sub_labels.keys():
            # Compare string with string: the earlier `str(cl_est) == 99` was always
            # False, so this accumulation branch could never run.
            if ((len(str(one_target)) == 3) and (str(cl_est) == '99')):
                # NOTE(review): np.exp assumes the files hold log-densities, whereas
                # `violins` reads the same files without exponentiating -- confirm.
                other = np.exp(np.genfromtxt(os.path.join('submissions/'+contestant, 'violin'+str(one_target)+'true'+str(cl_est)+'pred.txt')))
                others = others + other
            elif cl_est == one_target:
                data = np.exp(np.genfromtxt(os.path.join('submissions/'+contestant, 'violin'+str(one_target)+'true'+str(cl_est)+'pred.txt')))
                data = data / np.max(data)
                ax.fill_betweenx(positions, stretchfact*loc - data, stretchfact*loc + data, alpha=0.75, linewidth=0.1, color='k')
                ticklabels.append(sub_labels[cl_est])
                loc += 1
    # Scale the summed curve to a peak of 1, but avoid 0/0 (a NaN violin) when
    # nothing was accumulated.
    if np.all(np.isclose(others, 0.)):
        data = np.zeros_like(others)
    else:
        data = others / np.max(others)
    ax.fill_betweenx(positions, stretchfact*loc - data, stretchfact*loc + data, alpha=0.75, linewidth=0.1, color='k')
    ticklabels.append(sub_labels[99])
    # One tick per diagonal violin plus one for the combined class-99 violin.
    ax.set_xticks(stretchfact * np.arange(loc+1))
    ax.set_xticklabels(ticklabels, rotation=45, ha="right")
    ax.set_xlabel('true class')
    ax.set_ylabel('probability of true label')
    fig.savefig(os.path.join('submissions/'+contestant, 'diagonal_violin.png'))
    

In [None]:
# Draw and save the diagonal violin figure for each entry of all_contestants in turn.
for contestant in all_contestants:#['validation']:# + contestants:
    diag_violin(contestant)

TODO: interpret this

Looks like they're all gaming the leaderboard rather than really classifying

### per-class performance

Warning, these are slow!

TODO: The violins are normalized so they don't overlap each other in the plots, but it would be more accurate for the area of each to be the same regardless of how far out they extend.
Is there a reasonable way to do this?

TODO: Consider combining these into subplots of a multipanel plot, one panel per true class and multiple classifiers in each panel.

In [None]:
def violins(contestant, one_target, index, class_dict, color='r'):
    """Draw one violin per predicted class for the objects of a single true class.

    Reads 'submissions/<contestant>/probvecs<one_target>true.csv' to count, for
    each column, how many rows have their largest value there, and reads one
    precomputed curve per predicted class from
    'submissions/<contestant>/violin<one_target>true<cl_est>pred.txt'.  Each curve
    is scaled to a peak of 1 (or zeroed if it is numerically all zero), drawn as a
    violin, and annotated with the count at the height of its peak.  The figure is
    saved as an SVG under '../../plasticc-explorer/images/all_violins'.

    Relies on the notebook globals `positions`, `true_labels` and `sub_labels`;
    tick labels are always taken from `sub_labels`, whatever `class_dict` is.

    Parameters
    ----------
    contestant : str
        Subdirectory of 'submissions/' to read from; also used in the title and
        output filename.
    one_target :
        True-class key; used in filenames and to look up `true_labels`.
    index : str
        Passed to `pd.read_csv` as `index_col`.
    class_dict : mapping
        Its keys are the predicted classes to draw, in iteration order.
    color : str
        Fill colour of the violin for the true class, and text colour of the
        counts on every other violin.
    """
    print('plotting '+contestant+' true '+str(one_target)+' color '+color)
    subdir = 'submissions/'+contestant
    target_tag = str(one_target)
    all_prob = pd.read_csv(os.path.join(subdir, 'probvecs'+target_tag+'true.csv'), index_col=index)
    n_true = len(all_prob)
    stretchfact = 2.5
    fig, ax = plt.subplots(1, 1, figsize=(10, 5))
    ax.set_title(str(n_true)+' true '+str(true_labels[one_target])+' by '+contestant)
    # Number of rows whose largest entry falls in each column.
    counts = all_prob.idxmax(axis=1).value_counts()
    ticklabels = []
    for loc, cl_est in enumerate(class_dict.keys()):
        key = str(cl_est)
        nmax = str(counts[key]) if key in counts.keys() else '0'

        # The true class (or any '99...' class when the target is 99) is emphasised.
        is_true_class = cl_est == one_target or (one_target == 99 and key[:2] == '99')
        if is_true_class:
            highlight, textcol, alp = color, 'k', 0.75
        else:
            highlight, textcol, alp = 'k', color, 0.25

        data = np.genfromtxt(os.path.join(subdir, 'violin'+target_tag+'true'+key+'pred.txt'))
        if np.all(np.isclose(data, 0.)):
            data = np.zeros_like(data)
        else:
            data = data / np.max(data)
        wheremax = np.argmax(data)

        centre = stretchfact*loc
        ax.fill_betweenx(positions, centre - data, centre + data, alpha=alp, color=highlight, linewidth=0.1)

        # Anchor the count text so it stays inside the axes near either end.
        if wheremax < len(positions)/4:
            whereprint = 'bottom'
        elif wheremax > 3 * len(positions) / 4:
            whereprint = 'top'
        else:
            whereprint = 'center'
        ax.text(centre, positions[wheremax], nmax, fontsize=12, color=textcol, ha='center', va=whereprint, rotation=90)
        ticklabels.append(sub_labels[cl_est])

    ax.set_xticks(stretchfact * np.arange(len(ticklabels)))
    ax.set_xticklabels(ticklabels, rotation=45, ha="right")
    ax.set_xlabel('predicted class')
    ax.set_ylabel('probability')
    savepath = os.path.join('../../plasticc-explorer/images/all_violins', 'violin'+contestant+true_labels[one_target]+'.svg')
    print(savepath)
    fig.savefig(savepath)

In [None]:
# Leave a few cores free, but never request fewer than one worker:
# mp.Pool raises ValueError when asked for less than one process.
nps = max(1, mp.cpu_count() - 4)
# Compute the unique true classes once instead of once per task.
true_targets = truth.true_target.unique()
for contestant in contestants:
    def help_violins(ind):
        """Plot the violins for the `ind`-th unique true class of the current contestant."""
        one_target = true_targets[ind]
        violins(contestant, one_target, 'object_id', sub_labels, color=color_dict[contestant])
    # The context manager shuts the pool down once map() has returned, so worker
    # processes do not pile up across contestants.
    with mp.Pool(nps) as pool:
        pool.map(help_violins, range(len(true_targets)))

In [None]:
# Per-true-class violin figures for the 'validation' entry only, one call per key of label_dict.
for contestant in ['validation']:#contestants:
    for one_target in label_dict.keys():#truth.true_target.unique():#label_dict:
#         violins(contestant, one_target, 'objids', label_dict, color='#00A878')
        violins(contestant, one_target, 'object_id', label_dict, color=color_dict[contestant])

At least for Kyle's submission, classes 6, 15, 16, 53, 64, 65, and 88 look pretty darn good, like what we'd expect from the "perfect" classifier archetype; class 90 looks more like the "almost perfect" or "noisy" classifier archetypes.
Classes 42 and 62 look like the "mutually subsuming" classifier archetype, relative to class 99; class 67 looks like a weaker form of the "mutually subsuming" classifier archetype relative to classes 62, 90, and 99, and class 95 is also like that, relative to only class 99.
Meanwhile, class 99 looks like the "mutually subsuming" classifier archetype relative to classes 42 and 62.
Class 52 looks like the "uncertain" classifier archetype, with respect to classes 42, 62, 67, 90, and 99.


# Next steps

TODO: Consider making one plot per predicted class, which sort of conveys a probabilistic notion of false positives, whereas making one plot per true class sort of conveys a probabilistic notion of false negatives.
~~This would require splitting up the data files quite differently.~~

Besides all the "TODO" items, I also want to try some unsupervised clustering on the probability vectors (per true class) to get an idea of the covariances, at least for the weirdos like 42, 52, 62, 67.
Actually, we know why this is happening!
It's due to the way Kyle generated the probability of being in class 99, a formula that actually uses the probabilities of these classes.
In a sense it's a bug that it draws probability away from true 42 (SNII) to a label of 99 (other).

# SCRATCH

still don't know why this cell is so slow

In [None]:
# # Run me only if you've never run the notebook before, or if you changed the probabilities at which to evaluate the KDEs.
# for contestant in ['validation_DDF', 'validation_WFD']:
#     for one_target in label_dict:
#         print('evaluating '+contestant+'\'s KDEs for true class '+str(one_target))#+' with '+str(n_true)+' objects')
#         for cl_est in label_dict:
#             with open(os.path.join('submissions/'+contestant, 'kernel'+str(one_target)+'true'+str(cl_est)+'pred.pkl'), 'rb') as fn:
#                 kernel = pickle.load(fn)
#             if kernel is not None:
#                 data = kernel(positions)#[:, np.newaxis])
#             else:
#                 data = np.zeros_like(positions)
#             print('evaluated '+contestant+'\'s KDE for predicted class '+str(cl_est))
#             np.savetxt(os.path.join('submissions/'+contestant, 'violin'+str(one_target)+'true'+str(cl_est)+'pred.txt'), data)

If data is unavailable, run the following cell to make trivial mock data.

In [None]:
# M_classes = 13
# flat_factor = 1. / M_classes
# class_ids = range(0, M_classes)

# oom = 4
# generator = proclam.simulators.LogUnbalanced()
# N_objects = int(10 ** oom)
# minitruth = generator.simulate(M_classes, N_objects, base=oom)

# mask_tru = det_to_prob(minitruth).astype(int)

# starter = 0.5 * np.ones((M_classes, M_classes)) + 1.5 * np.eye(M_classes)
# starter = starter / np.sum(starter, axis=1)[:, np.newaxis]
# cm = starter

# # afflicted = np.random.choice(range(0, M_classes), size=10, replace=False)
# cruise = [-1, -2]#[0, 1]#afflicted[2:4]
# subsumed = [-3, -4, -6, -7]#[2, 3, 5, 6]#afflicted[4:8]
# swapped = [3, 4]#[-4, -5]
# tunnel = [-1, -8]#[0, 7]#afflicted[8:]
# noisy_cls = [0, 1]#[-2, -1]#afflicted[:2]
# uncertain = [2]#[-3]
# afflicted = cruise + subsumed + tunnel + noisy_cls

# systematic_types = list(reversed([
#     'perfect',
#     'almost perfect',
#     'cruise control by 11',
#     'cruise control by 10',
#     'almost perfect',
#     'subsumed by 10',
#     'subsumed by 10',
#     'tunnel vision',
#     'mutually subsuming',
#     'mutually subsuming',
#     'uncertain',
#     'noisy',
#     'noisy'
# ]))
# plot_systematic_types = list(reversed(systematic_types))

# almost = 0.5 * np.ones((M_classes, M_classes)) + 1.5 * np.eye(M_classes)
# almost = almost / np.sum(almost, axis=1)[:, np.newaxis]
# cm = almost
# perfect = np.eye(M_classes) + 1.e-8
# cm[tunnel] = perfect[tunnel]
# noisy = 0.5 * np.ones((M_classes, M_classes)) + 0.5 * np.eye(M_classes)
# noisy = noisy / np.sum(starter, axis=1)[:, np.newaxis]
# cm[noisy_cls] = noisy[noisy_cls]
# cm[subsumed[-3:]] = cm[cruise[-1]]
# cm[subsumed[:-3]] = cm[cruise[-2]]
# cm[uncertain] = 1./float(M_classes) * np.ones(M_classes)
# cm[swapped[-2]][swapped[-1]] = cm[swapped[-1]][swapped[-1]]
# cm[swapped[-1]][swapped[-2]] = cm[swapped[-2]][swapped[-2]]
# cm[:, -8] = perfect[:, -8]

# cm = cm / np.sum(cm, axis=1)[:, np.newaxis]

# fig = plt.figure(figsize=(5,5))
# grid = ImageGrid(fig, 111,          # as in plt.subplot(111)
#                  nrows_ncols=(1,1),
#                  axes_pad=0.05,
#                  share_all=True,
#                  )
# fig.subplots_adjust(wspace=0.5)
# ax = grid[0]
# im = ax.imshow(cm, vmin=0., vmax=1., cmap=fave_cmap)
# ax.set_xticks(range(M_classes))
# ax.set_xticklabels(range(1, M_classes+1))
# ax.set_yticks(range(M_classes))
# ax.set_yticklabels(range(1, M_classes+1))
# ax.set_ylabel('true class')
# ax.set_xlabel('predicted class')
# cbar_ax = fig.add_axes([0.1, 0.89, 0.8, 0.04])
# cbar = fig.colorbar(im, cax=cbar_ax, orientation='horizontal', pad=0.05)
# cbar_ax.xaxis.set_ticks_position("top")
# ax.cax.toggle_label(True)
# axp = ax.twinx()
# axp.set_ylim(-0.5, M_classes-0.5)
# axp.set_yticks(range(0, M_classes))
# axp.set_yticklabels(plot_systematic_types)
# axp.set_ylabel('systematic effect', rotation=270, labelpad=20)
# ax.set_title('realistically complex \n conditional probability matrix', pad=50)
# # plt.savefig('fig/combined.png')
# plt.show()
# plt.close()

# # delfact = -1
# # minidelta = 10**delfact
# # altdelfact = delfact * 2
# # altminidelta = 10**altdelfact

# classifier = FromCMDM()
# # delta = altminidelta

# temp_cm = cm
# temp_prob = sanitize_predictions(classifier.classify(temp_cm, minitruth, delta=0.001, other=False))

# dets = prob_to_det(temp_prob)
# cm = det_to_cm(dets, minitruth)
# norm_cm = cm.astype(float).T
# norm_cm[norm_cm == 0] = epsilon

# fig = plt.figure(figsize=(5,5))
# grid = ImageGrid(fig, 111,          # as in plt.subplot(111)
#                  nrows_ncols=(1,1),
#                  axes_pad=0.05,
#                  share_all=True,
#                  )
# fig.subplots_adjust(wspace=0.5)
# ax = grid[0]
# im = ax.imshow(norm_cm, vmin=0.01, vmax=len(temp_prob), cmap=fave_cmap, norm=LogNorm())
# ax.set_xticks(range(M_classes))
# ax.set_xticklabels(pred_class_inds)
# ax.set_yticks(range(M_classes))
# ax.set_yticklabels(pred_class_inds)
# ax.set_ylabel('true class')
# ax.set_xlabel('predicted class')
# cbar_ax = fig.add_axes([0.1, 0.89, 0.8, 0.04])
# cbar = fig.colorbar(im, cax=cbar_ax, orientation='horizontal', pad=0.05)
# cbar_ax.xaxis.set_ticks_position("top")
# ax.cax.toggle_label(True)
# # axp = ax.twinx()
# # axp.set_ylim(-0.5, M_classes-0.5)
# # axp.set_yticks(range(0, M_classes))
# # axp.set_yticklabels(plot_systematic_types)
# # axp.set_ylabel('systematic effect', rotation=270, labelpad=20)
# ax.set_title('confusion matrix for true '+str(one_target), pad=50)
# # plt.savefig('fig/combined.png')
# plt.show()
# plt.close()

Sadly Seaborn can't handle multiple datasets on one set of axes.

In [None]:
# seaborn.violinplot(probs.loc['true class' == 4].iloc[:])#data=per_class["true class" == 3])

One target at a time

In [None]:
# one_target = 52

In [None]:
# all_prob = pd.read_csv('1_Kyle_'+str(one_target)+'.csv', index_col='object_id')
# minitruth = [one_target] * len(all_prob)
# # true_ind = list(all_prob.columns.values).index('class_'+str(one_target))
# pred_class_inds = {i: int(i[6:]) for i in all_prob.columns.values}
# M_classes = len(pred_class_inds)
# all_prob.rename(columns=pred_class_inds, inplace=True)

A stacked histogram really doesn't cut it.

In [None]:
# plt.title('probability vectors for true '+str(one_target))
# probbins = np.linspace(-2., 0., 20)
# plt.hist([all_prob[i] for i in all_prob.columns], bins=10.**probbins, density=True, stacked=True, 
#          label=all_prob.columns, color=[fave_cmap(j/M_classes) for j in range(M_classes)])
# plt.legend(loc='upper right')

In [None]:
# probs = all_prob.copy()
# print((np.min(probs), np.max(probs)))
# probs['true class'] = minitruth
# # probs = pd.melt(probs, value_vars=[str(i) for i in range(M_classes)], id_vars='predicted class')