## Chapter 3 -  Classification
### Multilabel Classification

In [1]:
import pickle

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import (precision_score, 
                             recall_score, 
                             classification_report, 
                             confusion_matrix, f1_score, 
                             precision_recall_curve, roc_curve, roc_auc_score)

def load(fname):
    """Return the MNIST dataset, using ``fname`` as a local pickle cache.

    If ``fname`` exists, its pickled contents are returned as-is. Otherwise
    the 'mnist_784' (version 1) dataset is fetched with
    ``sklearn.datasets.fetch_openml``, pickled to ``fname`` for later runs,
    and returned.

    Parameters
    ----------
    fname : str or path-like
        Location of the pickle cache file.

    Returns
    -------
    object
        Whatever was unpickled from ``fname``, or the object returned by
        ``fetch_openml`` on a cache miss.
    """
    try:
        with open(fname, 'rb') as f:
            # NOTE: pickle.load can execute arbitrary code. Only point `fname`
            # at a cache file written by this function, never an untrusted file.
            return pickle.load(f)
    except FileNotFoundError:
        # Imported lazily so the download machinery is only needed on a cache miss.
        from sklearn.datasets import fetch_openml
        mnist = fetch_openml('mnist_784', version=1, cache=True)
        with open(fname, 'wb') as f:
            # pickle.dump returns None; do not rebind `mnist` to its result,
            # otherwise the first call hands None back to the caller.
            pickle.dump(mnist, f)
        return mnist

In some cases, we want the classifier to output multiple response classes for each instance.

In the MNIST example, suppose we want to attach two labels to each digit: whether its value is large, and whether it is odd.

In [2]:
# Ingest and label
mnist_data = load('mnist.data.pkl')
# Depending on the scikit-learn version, fetch_openml may hand back pandas
# objects instead of NumPy arrays. Normalise to plain arrays so positional
# indexing such as X_train[i] / y_train[i] always refers to the same row.
X = np.asarray(mnist_data['data'])
y = np.asarray(mnist_data['target']).astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=0)

# Large: digits 7, 8 and 9
y_train_large = (y_train >= 7)
# Odd: digits 1, 3, 5, 7 and 9
y_train_odd = (y_train % 2 == 1)
# One row per training instance, columns = [is_large, is_odd]
y_multilabel = np.c_[y_train_large, y_train_odd]
assert y_multilabel.shape == (len(y_train), 2)

This now creates a multilabel array containing 2 responses for each observation. The first is whether the digit is large (7-9) and the second is whether the digit is odd.

Training for such a problem follows the same `fit` workflow as before in scikit-learn; the only difference is that the target is now the two-column label array.

In [3]:
# Fit a single k-nearest-neighbours model against the two-column target,
# so every prediction carries both the "large" and the "odd" label.
knn_clf = KNeighborsClassifier()
knn_clf.fit(X=X_train, y=y_multilabel)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

Note: Training took about 30 seconds

In [4]:
# Spot-check one training instance: print its true digit, then the
# predicted [is_large, is_odd] pair for the same row.
i = 3
print(y_train[i], knn_clf.predict([X_train[i]]), sep='\n')
# The digit 8 is large (>= 7) and not odd, which matches the prediction.

8
[[ True False]]
