Alex Malz (NYU), Gautham Narayan (STSci), Renee Hlozek (U. Toronto)

This notebook is for making mock data for PLAsTiCC, in order to test the metrics.  I'm going to model it off the [variability tree](https://obswww.unige.ch/~mowlavi/Images/variability_tree.pdf).

In [None]:
import numpy as np
import string
import random
from collections import namedtuple
import itertools
import numpy.random as npr
import bisect
import scipy.stats as sps
import sklearn as skl
from sklearn.metrics import confusion_matrix
from sklearn.metrics import log_loss

import matplotlib.pyplot as plt
%matplotlib inline

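Let's consider a small set of classes. The original sketch used four, `A`, `B`, `C`, and `D`; the cell below instead generates `n_classes` classes with random two-letter names.  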

In [None]:
# Number of mock classes; each one gets a random two-letter lowercase name.
n_classes = 10
# Draw the names without replacement from all ordered pairs of distinct letters
# so that no two classes can end up with the same name (the names are used as
# axis tick labels, where duplicates would be ambiguous).
letter_pairs = [''.join(pair) for pair in itertools.permutations(string.ascii_lowercase, 2)]
class_names = random.sample(letter_pairs, n_classes)
print(class_names)

We'll assign them probabilities that are random draws from a half-Cauchy distribution (plus a small class-dependent offset), normalized to sum to one.

In [None]:
# Evaluate the standard half-Cauchy density on a 100-point grid over [0, 1],
# draw the curve, and write it to disk.
x = np.linspace(0., 1., num=100)
halfcauchy_density = sps.halfcauchy.pdf(x)
plt.plot(x, halfcauchy_density)
plt.xlabel(r'$x$')
plt.ylabel(r'$p(x)$')
plt.savefig('halfcauchy.png')

In [None]:
# Unnormalized class weights: one half-Cauchy draw per class plus an offset
# that decreases linearly from 1 (first class) to 1/n_classes (last class).
halfcauchy_draws = sps.halfcauchy.rvs(size=n_classes)
descending_offsets = (1. + np.arange(n_classes)[::-1]) / n_classes
class_probs = halfcauchy_draws + descending_offsets
# Rescale so the weights form a discrete probability distribution.
class_probs = class_probs / class_probs.sum()

We draw true classes from this non-uniform discrete distribution and express them as a binary matrix.

In [None]:
n_obj = 100  # number of mock objects, i.e. rows of the truth and submission matrices

In [None]:
# Inverse-CDF sampling of one true class per object, stored one-hot in `truth`
# (shape: n_obj x n_classes).
cdf = np.cumsum(class_probs)
uniform_draws = np.random.uniform(size=n_obj)
# side='right' matches bisect.bisect.  The clamp guards against cdf[-1] landing
# a hair below 1 after floating-point normalization, in which case a draw at
# the very top of [0, 1) would otherwise index one past the last class.
true_class = np.minimum(np.searchsorted(cdf, uniform_draws, side='right'), n_classes - 1)
truth = np.zeros((n_obj, n_classes))
truth[np.arange(n_obj), true_class] = 1.

Let's check that the drawn classes match the underlying probabilities.

In [None]:
# Compare the empirical class frequencies of the draw with the probabilities
# that generated it.  Printing the label and the array as separate arguments
# avoids the tuple repr that the doubled parentheses used to produce.
print('drawn class probabilities: ', np.mean(truth, axis=0), sep='')
print('true class probabilities: ', class_probs, sep='')

The best classifier is perfect.

In [None]:
# The "perfect" submission is the truth matrix itself.
# NOTE: this binds a second name to the same array; no copy is made.
sub1 = truth

In [None]:
fig = plt.figure(figsize=(n_obj, n_classes))

# Top panel: the submitted probabilities, classes as rows and objects as columns.
ax1 = fig.add_subplot(211)
ax1.set_title('classification probabilites')
ax1.matshow(sub1.T, vmin=0., vmax=1.)
ax1.set_yticks(np.arange(n_classes))
ax1.set_yticklabels(class_names)
# Fix one tick per object before labelling; otherwise the labels 0..n_obj-1 are
# attached to whichever ticks the automatic locator chose and read wrong.
ax1.set_xticks(np.arange(n_obj))
ax1.set_xticklabels(range(n_obj))
ax1.set_xlabel('objects')
ax1.set_ylabel('classes')

# Bottom panel: signed residual of the submission with respect to the truth.
ax2 = fig.add_subplot(212)
ax2.set_title('difference from truth')
ax2.matshow(sub1.T - truth.T, vmin=-1., vmax=1.)
ax2.set_yticks(np.arange(n_classes))
ax2.set_yticklabels(class_names)
ax2.set_xticks(np.arange(n_obj))
ax2.set_xticklabels(range(n_obj))
ax2.set_xlabel('objects')
ax2.set_ylabel('classes')

fig.savefig('perfect.png')

example: classifier that's right half the time and wrong evenly (no covariance between classes)

In [None]:
# Blend half of the perfect submission with a flat 1/n_classes floor on every
# class, then rescale each object's row to sum to one.
uniform_floor = np.ones((n_obj, n_classes)) / n_classes
sub2 = 0.5 * sub1 + uniform_floor
sub2 = sub2 / sub2.sum(axis=1, keepdims=True)

In [None]:
fig = plt.figure(figsize=(n_obj, n_classes))

# Top panel: the submitted probabilities, classes as rows and objects as columns.
ax1 = fig.add_subplot(211)
ax1.set_title('classification probabilites')
ax1.matshow(sub2.T, vmin=0., vmax=1.)
ax1.set_yticks(np.arange(n_classes))
ax1.set_yticklabels(class_names)
# Fix one tick per object before labelling; otherwise the labels 0..n_obj-1 are
# attached to whichever ticks the automatic locator chose and read wrong.
ax1.set_xticks(np.arange(n_obj))
ax1.set_xticklabels(range(n_obj))
ax1.set_xlabel('objects')
ax1.set_ylabel('classes')

# Bottom panel: signed residual of the submission with respect to the truth.
ax2 = fig.add_subplot(212)
ax2.set_title('difference from truth')
ax2.matshow(sub2.T - truth.T, vmin=-1., vmax=1.)
ax2.set_yticks(np.arange(n_classes))
ax2.set_yticklabels(class_names)
ax2.set_xticks(np.arange(n_obj))
ax2.set_xticklabels(range(n_obj))
ax2.set_xlabel('objects')
ax2.set_ylabel('classes')

fig.savefig('noiseless_random.png')

too uniform! add some jitter to the tune of 90% accuracy

In [None]:
# Perturb every entry of sub2 with non-negative half-Cauchy jitter scaled by
# 0.1, then rescale each object's row to sum to one.
jitter = sps.halfcauchy.rvs(size=(n_obj, n_classes))
sub3 = sub2 + 0.1 * jitter
sub3 = sub3 / sub3.sum(axis=1, keepdims=True)

In [None]:
fig = plt.figure(figsize=(n_obj, n_classes))

# Top panel: the submitted probabilities, classes as rows and objects as columns.
ax1 = fig.add_subplot(211)
ax1.set_title('classification probabilites')
ax1.matshow(sub3.T, vmin=0., vmax=1.)
ax1.set_yticks(np.arange(n_classes))
ax1.set_yticklabels(class_names)
# Fix one tick per object before labelling; otherwise the labels 0..n_obj-1 are
# attached to whichever ticks the automatic locator chose and read wrong.
ax1.set_xticks(np.arange(n_obj))
ax1.set_xticklabels(range(n_obj))
ax1.set_xlabel('objects')
ax1.set_ylabel('classes')

# Bottom panel: signed residual of the submission with respect to the truth.
ax2 = fig.add_subplot(212)
ax2.set_title('difference from truth')
ax2.matshow(sub3.T - truth.T, vmin=-1., vmax=1.)
ax2.set_yticks(np.arange(n_classes))
ax2.set_yticklabels(class_names)
ax2.set_xticks(np.arange(n_obj))
ax2.set_xticklabels(range(n_obj))
ax2.set_xlabel('objects')
ax2.set_ylabel('classes')

fig.savefig('noisy_random.png')

In [None]:
# Histogram of the absolute per-entry deviation of the noisy submission from
# the truth, pooled over all objects and classes.
abs_residuals = np.abs(sub3 - truth).flatten()
plt.hist(abs_residuals)
plt.savefig('check_noise.png')

In [None]:
# Spot check: noisy probabilities for object 2, followed by its one-hot truth row.
print(sub3[2], truth[2], sep='\n')

In [None]:
# Spot check: noisy probabilities for object 4, followed by its one-hot truth row.
print(sub3[4], truth[4], sep='\n')

want a covariance between classes

In [None]:
# Start from the identity (every class maps to itself) and add small
# non-negative half-Cauchy leakage to every entry.
leakage = sps.halfcauchy.rvs(size=(n_classes, n_classes))
confmat = np.eye(n_classes) + 0.1 * leakage
# Inject a strong mutual confusion between the first two classes.
confmat[0, 1] += 1.
confmat[1, 0] += 1.
# Rescale so every row (one per true class) sums to one.
confmat = confmat / confmat.sum(axis=1, keepdims=True)

In [None]:
# Display the confusion weights on a fixed [0, 1] color scale and save the figure.
plt.matshow(confmat, vmin=0., vmax=1.)
plt.savefig('small_confmat.png')

In [None]:
# Correlated submission: weight each object's noisy probabilities (sub3) by the
# confusion-matrix row that belongs to the object's true class.
true_class_index = np.argmax(truth, axis=1)
sub4 = confmat[true_class_index] * sub3
# The elementwise product of two probability vectors no longer sums to one, so
# rescale every row to keep sub4 a valid set of class probabilities, as is
# done for the other submissions.
sub4 /= np.sum(sub4, axis=1)[:, np.newaxis]

In [None]:
fig = plt.figure(figsize=(n_obj, n_classes))

# Top panel: the submitted probabilities, classes as rows and objects as columns.
ax1 = fig.add_subplot(211)
ax1.set_title('classification probabilites')
ax1.matshow(sub4.T, vmin=0., vmax=1.)
ax1.set_yticks(np.arange(n_classes))
ax1.set_yticklabels(class_names)
# Fix one tick per object before labelling; otherwise the labels 0..n_obj-1 are
# attached to whichever ticks the automatic locator chose and read wrong.
ax1.set_xticks(np.arange(n_obj))
ax1.set_xticklabels(range(n_obj))
ax1.set_xlabel('objects')
ax1.set_ylabel('classes')

# Bottom panel: signed residual of the submission with respect to the truth.
ax2 = fig.add_subplot(212)
ax2.set_title('difference from truth')
ax2.matshow(sub4.T - truth.T, vmin=-1., vmax=1.)
ax2.set_yticks(np.arange(n_classes))
ax2.set_yticklabels(class_names)
ax2.set_xticks(np.arange(n_obj))
ax2.set_xticklabels(range(n_obj))
ax2.set_xlabel('objects')
ax2.set_ylabel('classes')

fig.savefig('confmat_based.png')

try a metric!  _Note that the confusion matrix, however, reduces probabilities to point estimates!_

In [None]:
# Reduce the one-hot truth and the probabilistic submission to point estimates
# (index of the largest entry per object) and tabulate them against each other.
classids = np.arange(n_classes)
y_truth = np.argmax(truth, axis=1)
y_pred = np.argmax(sub4, axis=1)
# Passing labels explicitly keeps the matrix n_classes x n_classes even when
# some class happens to be absent from both the truth and the predictions;
# otherwise the matrix shrinks and no longer lines up with the class labels.
cnf_matrix = confusion_matrix(y_truth, y_pred, labels=classids)

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it on the current matplotlib figure.

    Parameters
    ----------
    cm : 2-D array
        Confusion matrix; rows are labelled 'True label' and columns
        'Predicted label'.
    classes : sequence
        Tick labels, one per row/column of `cm`.
    normalize : bool, optional
        If True, divide every row by its sum before printing and plotting.
        Rows that sum to zero are shown as all zeros instead of NaN.
    title : str, optional
        Title of the plot.
    cmap : matplotlib colormap, optional
        Colormap passed to `imshow`.
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Divide only where the row total is non-zero so that an empty row
        # does not turn the whole plot (and the text threshold) into NaN.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Annotate every cell, switching to white text on the dark half of the scale.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Lay out last so that the axis labels are taken into account.
    plt.tight_layout()

# Print matrices with two decimals (this changes numpy's global print options).
np.set_printoptions(precision=2)

# Raw counts.  Both figures use the class names as tick labels so that they
# can be compared directly.
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      title='Confusion matrix, without normalization')

# Row-normalized version.
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                      title='Normalized confusion matrix')

# savefig writes the current figure, i.e. the normalized matrix drawn last.
plt.savefig('big_confmat.png')

In [None]:
# Log-loss of each mock submission against the truth, printed in this order:
# perfect (sub1), guesses (sub2), noisy (sub3), correlated (sub4).
for submission in (sub1, sub2, sub3, sub4):
    print(log_loss(truth, submission))

every deterministic classifier corresponds to a confusion matrix

In [None]:
# TODO: build the confusion matrix of a deterministic classifier here.
# This cell previously held only a dangling `confmat = `, which is a
# SyntaxError and stops Restart & Run All; it stays a comment until written.

what about an actual probabilistic classifier?

* knn: make distances and create tree (sklearn.KDTree)
* rf: 

Now let's permit hierarchical classes, so `C` and `D` may be subclasses of `B`.