## Chapter 3 -  Classification
## Multilabel & Multioutput Classification

In [1]:
import pickle

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import (precision_score, 
                             recall_score, 
                             classification_report, 
                             confusion_matrix, f1_score, 
                             precision_recall_curve, roc_curve, roc_auc_score)

def load(fname):
    """Return the MNIST dataset, using ``fname`` as a local pickle cache.

    If ``fname`` exists, the dataset is unpickled from it. Otherwise the
    'mnist_784' dataset is fetched through ``sklearn.datasets.fetch_openml``,
    pickled to ``fname`` for later runs, and returned.

    Parameters
    ----------
    fname : str or path-like
        Location of the pickle cache file.

    Returns
    -------
    object
        Whatever was pickled in ``fname``, or the object returned by
        ``fetch_openml`` when the cache had to be created.
    """
    try:
        # NOTE(security): pickle.load can execute arbitrary code, so only
        # point this at a cache file that this notebook wrote itself.
        with open(fname, 'rb') as f:
            return pickle.load(f)
    except FileNotFoundError:
        # Imported lazily so the download machinery is only needed on a cache miss.
        from sklearn.datasets import fetch_openml
        mnist = fetch_openml('mnist_784', version=1, cache=True)
        with open(fname, 'wb') as f:
            # pickle.dump returns None, so its result must not be assigned
            # back to `mnist`; doing so made the first run return None.
            pickle.dump(mnist, f)
        return mnist

### Ingest

In [2]:
# Load MNIST (from the local pickle cache when available) and pull out
# the feature matrix and the label vector.
mnist_data = load('mnist.data.pkl')
X = mnist_data['data']
y = mnist_data['target']

# Hold out 15% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=0)

# Integer copies of the labels for the numeric comparisons below.
y_train_int = y_train.astype(int)
y_test_int = y_test.astype(int)

# Two binary labels per training digit: "large" (7, 8 or 9) and "odd".
# Stacked column-wise they form the multilabel target.
y_train_large = y_train_int >= 7
y_train_odd = y_train_int % 2 == 1
y_multilabel = np.c_[y_train_large, y_train_odd]

In [3]:
# Train a single kNN classifier on the two-column boolean target
# (column 0: digit >= 7, column 1: digit is odd), so each prediction
# is a pair of labels rather than a single class.
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_multilabel)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [4]:
i = 3
print(y_train[i])
print(knn_clf.predict([X_train[i]]))
# Sanity check on one training sample: the first line printed is the true digit,
# the second is the predicted pair [large (>= 7), odd].
# For the digit 8 the expected pair is [True, False]: large, and not odd.
# NOTE(review): X_train[i] / y_train[i] assume positionally indexable arrays —
# confirm this still holds if the dataset is loaded as pandas objects.

8
[[ True False]]


In [5]:
# Predict
y_predict = cross_val_predict(knn_clf, X_train, y_train, cv=3)

In [6]:
# Calculate F1 Score
f1_score(y_train, y_predict, average='macro')

0.9670888500500547

In some contexts you care mostly about precision, while in others you care more about recall. Unfortunately, you cannot maximize both at once: raising the classifier's decision threshold generally increases precision but lowers recall, and lowering the threshold does the opposite. This is called the precision/recall trade-off.