This notebook is for making mock data for PLAsTiCC, in order to test the metrics.  I'm going to model it off the [variability tree](https://obswww.unige.ch/~mowlavi/Images/variability_tree.pdf).

In [None]:
import numpy as np
from collections import namedtuple
import numpy.random as npr
import bisect
import scipy.stats as sps
import sklearn as skl
import itertools

import matplotlib.pyplot as plt
%matplotlib inline

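Let's consider a small set of classes. The original sketch used four (`A`, `B`, `C`, and `D`); the code below currently uses `n_classes = 10` unnamed classes indexed `0` to `9`.  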

In [None]:
# Number of mock classes. The commented-out lines are an earlier variant that
# derived the count from an explicit list of class names.
# classes = ['A', 'B', 'C', 'D']
# n_classes = len(classes)
n_classes = 10

Define the prior probability of each class, $p(A), p(B), p(C), p(D), \dots$, normalized so that the probabilities sum to one.

In [None]:
# class_probs = np.array([1., 2., 3., 4.])
# Unnormalised class weights: a descending ramp (n_classes, ..., 1) plus a
# positive half-Cauchy perturbation per class.
ramp = 1. + np.arange(n_classes)[::-1]
class_probs = sps.halfcauchy.rvs(size=n_classes) + ramp
# Normalise so the weights form a probability vector.
class_probs = class_probs / class_probs.sum()

In [None]:
# Number of mock objects to generate (one row of `truth` per object).
n_obj = 20

Make weighted draws of each object's true class by bisecting the cumulative distribution of the class probabilities.

In [None]:
# Cumulative distribution over classes; its last entry is ~1 up to rounding.
cdf = class_probs.cumsum()


In [None]:
# Seed NumPy's global generator so the draws below are reproducible.
# (`npr.seed = 42` would merely rebind the attribute: nothing is seeded and
# `np.random.seed` stops being callable for the rest of the session.)
npr.seed(42)

# One-hot truth table: one row per object, one column per class.
truth = np.zeros((n_obj, n_classes))
for row in truth:
    r = npr.uniform()
    # Inverse-CDF draw. Clamp the index because floating-point rounding can
    # leave cdf[-1] slightly below 1, in which case bisect would return
    # n_classes for r >= cdf[-1] and index past the end of the row.
    row[min(bisect.bisect(cdf, r), n_classes - 1)] = 1.

In [None]:
# Show the one-hot truth table (rows are objects, columns are classes).
print(truth)

In [None]:
# Column sums of the one-hot table, i.e. how many objects landed in each class.
print(np.sum(truth, axis=0))

the best classifier is perfect

In [None]:
# The perfect submission is the truth table itself.
# NOTE: this binds `sub1` to the same array object as `truth` (no copy is made).
sub1 = truth

In [None]:
# Show the perfect submission transposed (classes along rows, objects along
# columns) and save the figure to disk.
plt.matshow(sub1.T)
plt.savefig('perfect.png')

Example: a classifier that puts half of its probability on the true class and spreads the other half evenly over all classes (no covariance between classes).

In [None]:
# Equal mixture of the perfect submission and a uniform guess over classes.
uniform_guess = np.ones((n_obj, n_classes)) / n_classes
sub2 = 0.5 * (sub1 + uniform_guess)
# Normalise each row to a probability vector.
sub2 = sub2 / sub2.sum(axis=1, keepdims=True)

In [None]:
# Inspect the half-right, uniformly-wrong submission.
print(sub2)

In [None]:
# Show sub2 transposed (classes along rows) and save the figure to disk.
plt.matshow(sub2.T)
plt.savefig('noiseless_uncorrelated.png')

too uniform! add some jitter

In [None]:
# Positive, heavy-tailed jitter drawn independently for every object/class cell.
jitter = 0.1 * sps.halfcauchy.rvs(size=(n_obj, n_classes))
sub3 = sub2 + jitter
# Normalise each row back to a probability vector.
sub3 = sub3 / sub3.sum(axis=1, keepdims=True)

In [None]:
# Show the jittered submission transposed (classes along rows) and save it.
plt.matshow(sub3.T)
plt.savefig('noisy_uncorrelated.png')

In [None]:
# Residual of the jittered submission against the truth, shown per class
# (rows) and object (columns), then saved to disk.
plt.matshow(sub3.T - truth.T)
plt.savefig('check_noise.png')

In [None]:
# Histogram of the absolute residuals between the jittered submission and truth.
plt.hist(np.abs(sub3 - truth).flatten())
# Saved under its own file name: the residual-matrix figure is already written
# to 'check_noise.png', and reusing that name would overwrite it.
plt.savefig('check_noise_hist.png')

In [None]:
# Spot check: jittered submission versus truth for the object at index 2.
print(sub3[2])
print(truth[2])

In [None]:
# Spot check: jittered submission versus truth for the object at index 4.
print(sub3[4])
print(truth[4])

want a covariance between classes

In [None]:
# Identity (perfect classification) plus positive half-Cauchy leakage
# between every pair of classes.
leakage = 0.1 * sps.halfcauchy.rvs(size=(n_classes, n_classes))
confmat = np.eye(n_classes) + leakage
# Add extra symmetric confusion between classes 0 and 1.
confmat[0, 1] += 1.
confmat[1, 0] += 1.
# Normalise every row to sum to one.
confmat = confmat / confmat.sum(axis=1, keepdims=True)

In [None]:
# Show the row-normalised mock confusion matrix and save the figure to disk.
plt.matshow(confmat)
plt.savefig('small_confmat.png')

In [None]:
# Inspect the numerical values of the mock confusion matrix.
print(confmat)

In [None]:
# For every object, pick the confusion-matrix row belonging to its true class
# and use it to reweight the jittered submission elementwise.
true_class = np.argmax(truth, axis=1)
sub4 = confmat[true_class] * sub3
# The elementwise product of two probability vectors no longer sums to one, so
# renormalise each row (as is done for sub2 and sub3) to keep sub4 a valid
# probabilistic submission. Every entry is strictly positive, so the row sums
# are never zero.
sub4 /= np.sum(sub4, axis=1)[:, np.newaxis]

In [None]:
# Residual of the class-correlated submission against the truth, shown per
# class (rows) and object (columns).
plt.matshow(sub4.T - truth.T)

try a metric!

In [None]:
from sklearn.metrics import confusion_matrix

# Integer label of every class, 0 .. n_classes - 1.
classids = np.arange(n_classes)

# Hard labels: the true class and the most probable predicted class per object.
y_truth = classids[np.argmax(truth, axis=1)]
y_pred = classids[np.argmax(sub4, axis=1)]

# Pass the full label list explicitly. Without it the matrix only contains the
# labels that happen to occur in y_truth or y_pred, so with few objects it can
# be smaller than n_classes x n_classes and no longer line up with `classids`
# when the tick labels are drawn.
cnf_matrix = confusion_matrix(y_truth, y_pred, labels=classids)

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map with pyplot.

    Parameters
    ----------
    cm : array-like, shape (n, n)
        Confusion matrix; rows are true labels, columns are predicted labels.
    classes : sequence
        Tick labels, one per row/column of `cm`.
    normalize : bool, optional
        If True, divide every row by its sum before printing and plotting.
        Rows that sum to zero (classes with no true samples) are left as
        zeros instead of producing NaN.
    title : str, optional
        Title of the plot.
    cmap : matplotlib colormap, optional
        Colormap passed to `plt.imshow`.
    """
    cm = np.asarray(cm)

    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Divide only where the row total is non-zero; empty rows stay at 0.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # The integer format only works for integer counts; any float matrix
    # (normalised or not) is annotated with two decimals.
    fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'
    # Cells above half the maximum get white text so they stay readable on
    # the dark end of the colormap. Guard against an empty matrix.
    thresh = cm.max() / 2. if cm.size else 0.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# The confusion matrix itself is computed in an earlier cell:
# cnf_matrix = confusion_matrix(y_true, y_pred)
np.set_printoptions(precision=2)

# Draw the raw-count matrix first, then the row-normalised one, each in its
# own figure.
panels = [
    (False, 'Confusion matrix, without normalization'),
    (True, 'Normalized confusion matrix'),
]
for use_normalization, panel_title in panels:
    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=classids,
                          normalize=use_normalization,
                          title=panel_title)

# Saves the current (last created, i.e. normalised) figure.
plt.savefig('big_confmat.png')

In [None]:
from sklearn.metrics import log_loss

# Report both scores explicitly: a notebook cell only displays the value of
# its last expression, so a bare first call would be computed and discarded.
print('log-loss, sub3:', log_loss(truth, sub3))
print('log-loss, sub4:', log_loss(truth, sub4))

the dumbest classifier just guesses

In [None]:
# Seed NumPy's global generator with 0. Assigning to `npr.seed` only rebinds
# the attribute and seeds nothing, so load the state of a freshly seeded
# RandomState instead; this has the same effect as `npr.seed(0)` and still
# works if `npr.seed` has been rebound to a non-callable earlier in the session.
npr.set_state(npr.RandomState(0).get_state())
# sub2 = npr.randint(0, n_classes, n_obj)

every deterministic classifier corresponds to a confusion matrix

In [None]:
# TODO: build the confusion matrix of a deterministic classifier here.
# The unfinished assignment is kept as a comment because a bare `confmat =`
# is a SyntaxError that stops the notebook from running end to end.
# confmat = ...

what about an actual probabilistic classifier?

* knn: make distances and create tree (sklearn.KDTree)
* rf: 

Now let's permit hierarchical classes, so `C` and `D` may be subclasses of `B`.