# Visualizing classification probabilities

_Alex Malz (GCCL@RUB)_

Note: A lot of the cells here are very slow to run but only need to be run once, ever.
The notebook will not run end-to-end unless you uncomment them and run them once.

In [None]:
import numpy as np
np.random.seed(42)
import pandas as pd
import pickle
# Import KernelDensity from the public package path: the private
# `sklearn.neighbors.kde` module is not available in newer scikit-learn releases,
# whereas `sklearn.neighbors.KernelDensity` works on old and new versions alike.
from sklearn.neighbors import KernelDensity
import scipy.stats as sps
# import seaborn as sns

In [None]:
import matplotlib as mpl
# print(mpl.rcParams.items)
mpl.use('PS')
mpl.rcParams['text.usetex'] = False
mpl.rcParams['mathtext.rm'] = 'serif'
mpl.rcParams['font.family'] = 'serif'
mpl.rcParams["font.family"] = "serif"
mpl.rcParams["mathtext.fontset"] = "dejavuserif"
mpl.rcParams['font.serif'] = 'DejaVu Serif'
# mpl.rcParams['text.usetex'] = False
# mpl.rcParams['mathtext.rm'] = 'serif'
# mpl.rcParams['font.weight'] = 'light'
# mpl.rcParams['font.family'] = 'serif'
# mpl.rcParams['font.serif'] = ['Times New Roman']
# # mpl.rcParams['font.family'] = ['Times New Roman']
mpl.rcParams['axes.titlesize'] = 20
mpl.rcParams['axes.labelsize'] = 16
mpl.rcParams['xtick.labelsize'] = 12
mpl.rcParams['ytick.labelsize'] = 12
mpl.rcParams['savefig.dpi'] = 250
mpl.rcParams['figure.dpi'] = 250
mpl.rcParams['savefig.format'] = 'pdf'
mpl.rcParams['savefig.bbox'] = 'tight'
import matplotlib.pyplot as plt
%matplotlib inline
# print(mpl.rcParams.items)
from matplotlib.colors import LogNorm

import pylab
from mpl_toolkits.axes_grid1 import ImageGrid
from mpl_toolkits.axes_grid1.inset_locator import inset_axes

def truncate_colormap(cmap, minval=0.0, maxval=1.0, n=100):
    """Build a colormap from a sub-range of an existing one.

    `cmap` is sampled at `n` evenly spaced points between `minval` and `maxval`,
    and the sampled colors are turned into a new LinearSegmentedColormap whose
    name records the source colormap's name and the two bounds.
    """
    sample_points = np.linspace(minval, maxval, n)
    label = 'trunc({n},{a:.2f},{b:.2f})'.format(n=cmap.name, a=minval, b=maxval)
    return mpl.colors.LinearSegmentedColormap.from_list(label, cmap(sample_points))

# Notebook-wide colormap: the 0.35-1.0 portion of matplotlib's 'hot_r'.
cmap = plt.get_cmap('hot_r')
fave_cmap = truncate_colormap(cmap, minval=0.35, maxval=1.0)

In [None]:
import proclam
# NOTE(review): the wildcard import presumably supplies the bare helper names used by the
# commented-out scratch cell (e.g. det_to_prob, FromCMDM) -- confirm before narrowing it.
from proclam import *

# Replacement value for zero entries of the confusion matrix in the commented-out
# scratch cell (`norm_cm[norm_cm == 0] = epsilon`).
epsilon = 1.e-1

## Preprocessing

### Truth table

The true labels subdivide class 99 but the predicted labels don't (which breaks some `proclam.metrics.util` functions, among other things).
Warning, this fix is slow, so only run the cell below once and then read it in from a file next time.

In [None]:
import os

# Build the cached truth table only when it is missing, so the notebook runs on a
# fresh checkout without editing this cell and later runs just read the cache.
if not os.path.exists('truth.csv'):
    metadata = pd.read_csv('plasticc_test_metadata.csv')
    header = ['object_id', 'true_target']
    # The true labels subdivide class 99 into 991-994; fold them back into 99.
    is_subclass_of_99 = metadata['true_target'].isin([991, 992, 993, 994])
    metadata.loc[is_subclass_of_99, 'true_target'] = 99
    metadata.to_csv('truth.csv', columns=header, index=False)

truth = pd.read_csv('truth.csv', index_col='object_id')

In [None]:
# Distinct true-class labels and the number of objects carrying each one.
true_class_ids, true_class_counts = np.unique(truth['true_target'].values, return_counts=True)

TODO: replace the class numbers with informative names, and give them a logical ordering (SN together, etc.).
This should also fix the problem with some old workarounds for class number mismatch that are still in the code below.

### Classification probabilities

The following cell chops up the data into bite-sized pieces.
It's slow but only needs to be done once.

TODO: Also do this for the other three classifiers that made it into the paper, and use directories FFS.

In [None]:
## Run me only if you've never run the notebook before.

# submission = pd.read_csv('1_Kyle.csv', index_col='object_id')
# for i in truth.true_target.unique():
#     to_save = submission[truth['true_target'] == i]
#     to_save.to_csv('1_Kyle_'+str(i)+'.csv')

We have to further split up the data into small pieces and calculate the KDE of each, which is also slow, though only has to be done once.

TODO: smarter way to choose the kernel bandwidth -- the default is huge to the point of not being useful.  
Probably the easiest fix is to use `scipy.stats.gaussian_kde` rather than `sklearn.neighbors.KernelDensity` (assuming it's picklable), which lets you automatically use Scott's rule to choose the bandwidth.

In [None]:
# Kernel bandwidth passed to KernelDensity in the (commented-out) KDE-fitting cell below.
bw = 0.05

In [None]:
## Run me only if you've never run the notebook before, or if you changed the bandwidth.

# # for cl_est in all_prob.columns:
# #     to_plot = all_prob[cl_est].values
# #     to_plot = to_plot[:, np.newaxis]
# #     kernel = KernelDensity(bandwidth=0.1).fit(to_plot)
# #     data = np.exp(kernel.score_samples(positions[:, np.newaxis]))
# #     print(data)
# #     print('completed KDE for predicted class '+str(cl_est))
# #     np.savetxt('violin'+str(one_target)+'true'+str(cl_est)+'pred.txt', data)
# for one_target in true_class_ids:
#     all_prob = pd.read_csv('1_Kyle_'+str(one_target)+'.csv', index_col='object_id')
#     n_true = len(all_prob)
#     print('calculating KDE for true class '+str(one_target)+' with '+str(n_true)+' objects')
# #     minitruth = [one_target] * len(all_prob)
#     # true_ind = list(all_prob.columns.values).index('class_'+str(one_target))
#     pred_class_inds = {i: int(i[6:]) for i in all_prob.columns.values}
# #     M_classes = len(pred_class_inds)
#     all_prob.rename(columns=pred_class_inds, inplace=True)
#     for cl_est in all_prob.columns:
#         to_plot = all_prob[cl_est].values
#         to_plot = to_plot[:, np.newaxis]
#         kernel = KernelDensity(bandwidth=bw).fit(to_plot)
#         with open('kernel'+str(one_target)+'true'+str(cl_est)+'pred.pkl', 'wb') as fn:
#             pickle.dump(kernel, fn)
# #         with open('kernel'+str(one_target)+'true'+str(cl_est)+'pred.pkl', 'rb') as fn:
# #             kernel = pickle.load('kernel'+str(one_target)+'true'+str(cl_est)+'pred.pkl')
# #         data = kernel.score_samples(positions[:, np.newaxis])
#         print('completed KDE for predicted class '+str(cl_est))
# #         np.savetxt('violin'+str(one_target)+'true'+str(cl_est)+'pred.txt', data)
# #     violins(one_target)

Evaluate the KDEs at grid points, unfortunately also slow (though I'm not sure why).
It has to be re-run for each updated set of grid points but saves the output so the plot can be tweaked without running it again.

In [None]:
# Evenly spaced probability values on [0, 1] at which the KDEs are evaluated and drawn.
nbins = 50
positions = np.linspace(0., 1., num=nbins)

In [None]:
## Run me only if you've never run the notebook before, or if you changed the probabilities at which to evaluate the KDEs.
## NOTE(review): the column rename below is commented out, so `cl_est` iterates over the raw
## column names here, whereas the KDE-fitting cell renames the columns (via int(i[6:])) before
## building its 'kernel...pred.pkl' file names. The file names opened and written in this cell
## would then differ from the ones the fitting cell wrote -- confirm, and re-enable the rename if so.
# for one_target in true_class_ids:
#     all_prob = pd.read_csv('1_Kyle_'+str(one_target)+'.csv', index_col='object_id')
#     n_true = len(all_prob)
#     print('evaluating KDEs for true class '+str(one_target)+' with '+str(n_true)+' objects')
# #     minitruth = [one_target] * len(all_prob)
#     # true_ind = list(all_prob.columns.values).index('class_'+str(one_target))
#     pred_class_inds = {i: int(i[6:]) for i in all_prob.columns.values}
# #     M_classes = len(pred_class_inds)
# #     all_prob.rename(columns=pred_class_inds, inplace=True)
#     for cl_est in all_prob.columns:
# #         to_plot = all_prob[cl_est].values
# #         to_plot = to_plot[:, np.newaxis]
# #         kernel = KernelDensity(bandwidth=0.05).fit(to_plot)
# #         with open('kernel'+str(one_target)+'true'+str(cl_est)+'pred.pkl', 'wb') as fn:
# #             pickle.dump(kernel, fn)
#         with open('kernel'+str(one_target)+'true'+str(cl_est)+'pred.pkl', 'rb') as fn:
#             kernel = pickle.load(fn)
#         data = kernel.score_samples(positions[:, np.newaxis])
#         print('evaluated KDE for predicted class '+str(cl_est))
#         np.savetxt('violin'+str(one_target)+'true'+str(cl_est)+'pred.txt', data)
# #     violins(one_target)

## Visualize the probabilities

Voila, snazzy violin plots!

TODO: Include the number of true objects in the title.

TODO: Include the number with max probability in each predicted class within each violin.

TODO: Look into a better way to normalize them for plotting purposes so the area is preserved and they don't run into each other.

TODO: Consider combining these into subplots of a multipanel plot, one panel per true class and multiple classifiers per panel.

In [None]:
def violins(one_target, pred_classes=None):
    """Draw and save one violin per predicted class for a single true class.

    Parameters
    ----------
    one_target : int
        True class label; used in the title and in every file name.
    pred_classes : sequence, optional
        Predicted-class labels in plotting order. When omitted, they are read from
        the header of '1_Kyle_<one_target>.csv' (columns named 'class_<N>' become the
        integer N), so the function no longer depends on the `all_prob` and
        `M_classes` globals that are only assigned in later scratch cells.

    For each predicted class the file 'violin<one_target>true<cl_est>pred.txt' is
    read, exponentiated and scaled to a maximum of 1, then drawn against the
    module-level `positions` grid. The figure is saved as 'violin<one_target>.png'.
    """
    if pred_classes is None:
        # nrows=0 parses only the header line, so this stays cheap for large files.
        header = pd.read_csv('1_Kyle_'+str(one_target)+'.csv', nrows=0)
        pred_classes = [int(name[6:]) for name in header.columns if name != 'object_id']
    pred_classes = list(pred_classes)

    spacing = 2.5  # horizontal distance between neighbouring violin centres
    fig, ax = plt.subplots(1, 1)
    ax.set_title('true class '+str(one_target))
    for loc, cl_est in enumerate(pred_classes):
        # The violin of the true class itself is drawn in red, all others in black.
        highlight = 'r' if cl_est == one_target else 'k'
        data = np.exp(np.genfromtxt('violin'+str(one_target)+'true'+str(cl_est)+'pred.txt'))
        data = data / np.max(data)
        centre = spacing * loc
        ax.fill_betweenx(positions, centre - data, centre + data, alpha=0.75, color=highlight)
    ax.set_xticks(spacing * np.arange(len(pred_classes)))
    ax.set_xticklabels(pred_classes)
    ax.set_xlabel('predicted class')
    ax.set_ylabel('probability')
    fig.savefig('violin'+str(one_target)+'.png')

In [None]:
# One violin figure per true class found in the truth table.
for one_target in true_class_ids:
    violins(one_target=one_target)

At least for Kyle's submission, classes 6, 15, 16, 53, 64, 65, and 88 look pretty darn good, like what we'd expect from the "perfect" classifier archetype; class 90 looks more like the "almost perfect" or "noisy" classifier archetypes.
Classes 42 and 62 look like the "mutually subsuming" classifier archetype, relative to class 99; class 67 looks like a weaker form of the "mutually subsuming" classifier archetype relative to classes 62, 90, and 99, and class 95 is also like that, relative to only class 99.
Meanwhile, class 99 looks like the "mutually subsuming" classifier archetype relative to classes 42 and 62.
Class 52 looks like the "uncertain" classifier archetype, with respect to classes 42, 62, 67, 90, and 99.


## Next steps

TODO: Make another violin plot for the diagonal entry of the confusion matrix, i.e. the thing that our metric actually probes.

TODO: Consider making one plot per predicted class, which sort of conveys a probabilistic notion of false positives, whereas making one plot per true class sort of conveys a probabilistic notion of false negatives.
This would require splitting up the data files quite differently.

Besides all the "TODO" items, I also want to try classifying the probability vectors (per true class) to get an idea of the covariances, at least for the weirdos like 42, 52, 62, 67.

# scratch

If data is unavailable, run the following cell to make trivial mock data.

In [None]:
# M_classes = 13
# flat_factor = 1. / M_classes
# class_ids = range(0, M_classes)

# oom = 4
# generator = proclam.simulators.LogUnbalanced()
# N_objects = int(10 ** oom)
# minitruth = generator.simulate(M_classes, N_objects, base=oom)

# mask_tru = det_to_prob(minitruth).astype(int)

# starter = 0.5 * np.ones((M_classes, M_classes)) + 1.5 * np.eye(M_classes)
# starter = starter / np.sum(starter, axis=1)[:, np.newaxis]
# cm = starter

# # afflicted = np.random.choice(range(0, M_classes), size=10, replace=False)
# cruise = [-1, -2]#[0, 1]#afflicted[2:4]
# subsumed = [-3, -4, -6, -7]#[2, 3, 5, 6]#afflicted[4:8]
# swapped = [3, 4]#[-4, -5]
# tunnel = [-1, -8]#[0, 7]#afflicted[8:]
# noisy_cls = [0, 1]#[-2, -1]#afflicted[:2]
# uncertain = [2]#[-3]
# afflicted = cruise + subsumed + tunnel + noisy_cls

# systematic_types = list(reversed([
#     'perfect',
#     'almost perfect',
#     'cruise control by 11',
#     'cruise control by 10',
#     'almost perfect',
#     'subsumed by 10',
#     'subsumed by 10',
#     'tunnel vision',
#     'mutually subsuming',
#     'mutually subsuming',
#     'uncertain',
#     'noisy',
#     'noisy'
# ]))
# plot_systematic_types = list(reversed(systematic_types))

# almost = 0.5 * np.ones((M_classes, M_classes)) + 1.5 * np.eye(M_classes)
# almost = almost / np.sum(almost, axis=1)[:, np.newaxis]
# cm = almost
# perfect = np.eye(M_classes) + 1.e-8
# cm[tunnel] = perfect[tunnel]
# noisy = 0.5 * np.ones((M_classes, M_classes)) + 0.5 * np.eye(M_classes)
# noisy = noisy / np.sum(starter, axis=1)[:, np.newaxis]
# cm[noisy_cls] = noisy[noisy_cls]
# cm[subsumed[-3:]] = cm[cruise[-1]]
# cm[subsumed[:-3]] = cm[cruise[-2]]
# cm[uncertain] = 1./float(M_classes) * np.ones(M_classes)
# cm[swapped[-2]][swapped[-1]] = cm[swapped[-1]][swapped[-1]]
# cm[swapped[-1]][swapped[-2]] = cm[swapped[-2]][swapped[-2]]
# cm[:, -8] = perfect[:, -8]

# cm = cm / np.sum(cm, axis=1)[:, np.newaxis]

# fig = plt.figure(figsize=(5,5))
# grid = ImageGrid(fig, 111,          # as in plt.subplot(111)
#                  nrows_ncols=(1,1),
#                  axes_pad=0.05,
#                  share_all=True,
#                  )
# fig.subplots_adjust(wspace=0.5)
# ax = grid[0]
# im = ax.imshow(cm, vmin=0., vmax=1., cmap=fave_cmap)
# ax.set_xticks(range(M_classes))
# ax.set_xticklabels(range(1, M_classes+1))
# ax.set_yticks(range(M_classes))
# ax.set_yticklabels(range(1, M_classes+1))
# ax.set_ylabel('true class')
# ax.set_xlabel('predicted class')
# cbar_ax = fig.add_axes([0.1, 0.89, 0.8, 0.04])
# cbar = fig.colorbar(im, cax=cbar_ax, orientation='horizontal', pad=0.05)
# cbar_ax.xaxis.set_ticks_position("top")
# ax.cax.toggle_label(True)
# axp = ax.twinx()
# axp.set_ylim(-0.5, M_classes-0.5)
# axp.set_yticks(range(0, M_classes))
# axp.set_yticklabels(plot_systematic_types)
# axp.set_ylabel('systematic effect', rotation=270, labelpad=20)
# ax.set_title('realistically complex \n conditional probability matrix', pad=50)
# # plt.savefig('fig/combined.png')
# plt.show()
# plt.close()

# # delfact = -1
# # minidelta = 10**delfact
# # altdelfact = delfact * 2
# # altminidelta = 10**altdelfact

# classifier = FromCMDM()
# # delta = altminidelta

# temp_cm = cm
# temp_prob = sanitize_predictions(classifier.classify(temp_cm, minitruth, delta=0.001, other=False))

# dets = prob_to_det(temp_prob)
# cm = det_to_cm(dets, minitruth)
# norm_cm = cm.astype(float).T
# norm_cm[norm_cm == 0] = epsilon

# fig = plt.figure(figsize=(5,5))
# grid = ImageGrid(fig, 111,          # as in plt.subplot(111)
#                  nrows_ncols=(1,1),
#                  axes_pad=0.05,
#                  share_all=True,
#                  )
# fig.subplots_adjust(wspace=0.5)
# ax = grid[0]
# im = ax.imshow(norm_cm, vmin=0.01, vmax=len(temp_prob), cmap=fave_cmap, norm=LogNorm())
# ax.set_xticks(range(M_classes))
# ax.set_xticklabels(pred_class_inds)
# ax.set_yticks(range(M_classes))
# ax.set_yticklabels(pred_class_inds)
# ax.set_ylabel('true class')
# ax.set_xlabel('predicted class')
# cbar_ax = fig.add_axes([0.1, 0.89, 0.8, 0.04])
# cbar = fig.colorbar(im, cax=cbar_ax, orientation='horizontal', pad=0.05)
# cbar_ax.xaxis.set_ticks_position("top")
# ax.cax.toggle_label(True)
# # axp = ax.twinx()
# # axp.set_ylim(-0.5, M_classes-0.5)
# # axp.set_yticks(range(0, M_classes))
# # axp.set_yticklabels(plot_systematic_types)
# # axp.set_ylabel('systematic effect', rotation=270, labelpad=20)
# ax.set_title('confusion matrix for true '+str(one_target), pad=50)
# # plt.savefig('fig/combined.png')
# plt.show()
# plt.close()

Sadly Seaborn can't handle multiple datasets on one set of axes.

In [None]:
# NOTE(review): this cell cannot run as written on a fresh kernel:
#   * `seaborn` is never imported in the visible cells (the only seaborn import is commented out);
#   * `probs` is only assigned in commented-out code;
#   * `'true class' == 4` compares a string literal with an int, so the `.loc` key is always
#     False -- presumably `probs['true class'] == 4` was intended; confirm before fixing.
seaborn.violinplot(probs.loc['true class' == 4].iloc[:])#data=per_class["true class" == 3])

One target at a time

In [None]:
# True class inspected by the scratch cells below; this rebinds the name that the
# violin-plotting loop earlier in the notebook uses as its loop variable.
one_target = 52

In [None]:
# Load the probabilities for objects of the chosen true class and relabel the
# 'class_<N>' columns with the integer N.
all_prob = pd.read_csv('1_Kyle_'+str(one_target)+'.csv', index_col='object_id')
minitruth = [one_target] * len(all_prob)
pred_class_inds = {name: int(name[6:]) for name in all_prob.columns.values}
M_classes = len(pred_class_inds)
all_prob = all_prob.rename(columns=pred_class_inds)

A stacked histogram really doesn't cut it.

In [None]:
# Stacked, density-normalised histogram of every predicted-class probability column,
# using bin edges 10**probbins (log-spaced between 1e-2 and 1).
fig, ax = plt.subplots()
ax.set_title('probability vectors for true '+str(one_target))
probbins = np.linspace(-2., 0., 20)
per_class_probs = [all_prob[col] for col in all_prob.columns]
class_colors = [fave_cmap(j/M_classes) for j in range(M_classes)]
ax.hist(per_class_probs, bins=10.**probbins, density=True, stacked=True,
        label=all_prob.columns, color=class_colors)
ax.legend(loc='upper right')

In [None]:
# probs = all_prob.copy()
# print((np.min(probs), np.max(probs)))
# probs['true class'] = minitruth
# # probs = pd.melt(probs, value_vars=[str(i) for i in range(M_classes)], id_vars='predicted class')