## Chapter 3 -  Classification
## Accuracy, Precision, Recall

In [1]:
import pickle

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import (train_test_split, 
                                     cross_val_score, 
                                     cross_val_predict)
from sklearn.metrics import (confusion_matrix, 
                             precision_score, recall_score, f1_score,
                             classification_report,                              
                             precision_recall_curve, roc_auc_score)

def load(fname):
    """Return the MNIST dataset, using `fname` as a local pickle cache.

    If `fname` exists, its pickled content is loaded and returned. Otherwise
    the 'mnist_784' dataset (version 1) is fetched with scikit-learn's
    `fetch_openml`, pickled to `fname` for later runs, and returned.

    Parameters
    ----------
    fname : str or path-like
        Location of the pickle cache file.

    Returns
    -------
    The object stored in the cache file, or the freshly fetched dataset.
    """
    try:
        # NOTE: pickle.load can execute arbitrary code; only point `fname`
        # at a cache file written by this function (or another trusted source).
        with open(fname, 'rb') as f:
            return pickle.load(f)
    except FileNotFoundError:
        # Cache miss: fall through and download the dataset below.
        pass

    from sklearn.datasets import fetch_openml
    mnist = fetch_openml('mnist_784', version=1, cache=True)
    with open(fname, 'wb') as f:
        # pickle.dump returns None, so its result must not be assigned back
        # to `mnist` -- otherwise the first (uncached) call would return None.
        pickle.dump(mnist, f)
    return mnist

### Ingest, Labelling, Train-Test Split

In [2]:
# Load the (locally cached) MNIST dataset and separate features from labels.
mnist_data = load('mnist.data.pkl')
X = mnist_data['data']
y = mnist_data['target']

# Hold out 15% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=0
)

# Binary classification - classify "5" or "not 5".
# The labels are compared against the string '5': True marks a "5" sample.
y_train_5 = (y_train == '5')
y_test_5 = (y_test == '5')

In [3]:
# # For testing. Note that 28*28 = 784
# print(X.shape)
# print(y.shape)

In [4]:
# # For testing
# sample_idk = 4
# d_single_sample = X[sample_idk]
# print(d_single_sample)
# d_single_sample_img = d_single_sample.reshape(28, 28)
# print(d_single_sample)
# plt.imshow(d_single_sample_img, cmap=matplotlib.cm.binary, interpolation='nearest')
# plt.axis('off')
# plt.show()
# print(y[sample_idk])

In [5]:
# Number of cross-validation folds, passed as `cv=` to the cross-validation calls.
kfold = 3

In [6]:
# Create the model and train it on the whole training set.
sgd_clf = SGDClassifier(random_state=0)
sgd_clf.fit(X_train, y_train_5)

# cross_val_score fits a fresh clone of the estimator on each split and
# returns one score per fold, measured on the held-out fold. The metric is
# named explicitly so it is the same one used for the dummy-classifier baseline.
cvs1 = cross_val_score(estimator=sgd_clf, X=X_train, y=y_train_5, cv=kfold, scoring='accuracy')
print(cvs1, cvs1.mean(), cvs1.std())

[0.9325905  0.96470529 0.96435234] 0.9538827107934393 0.015056555299559625


It shows that the estimator reaches about 95% accuracy on average, which looks very good! Now compare this with the dummy classifier.

In [7]:
class Dummy5Classifier(BaseEstimator):
    """Baseline classifier that predicts False ("not 5") for every sample.

    It learns nothing from the data; it exists only to show what accuracy a
    trivial rule achieves on this task.
    """
    
    def fit(self, X, y=None):
        """Do nothing (there is nothing to learn) and return the estimator.

        Returning `self` follows the scikit-learn estimator convention and
        allows chained calls such as `clf.fit(X, y).predict(X)`.
        """
        return self
    
    def predict(self, X):
        """Return an all-False boolean array with one row per sample in `X`."""
        # NOTE(review): this is a column vector of shape (len(X), 1);
        # scikit-learn's convention for predict is a 1-D (n_samples,) array.
        # The shape is kept unchanged so existing callers see the same output.
        return np.zeros((len(X), 1), dtype=bool)
    
# Baseline: a classifier that never predicts "5".
dummy_clf = Dummy5Classifier()
dummy_clf.fit(X_train, y_train_5)

# Per-fold accuracy of the baseline, followed by its mean and standard deviation.
cvs2 = cross_val_score(
    estimator=dummy_clf,
    X=X_train,
    y=y_train_5,
    cv=kfold,
    scoring='accuracy',
)
print(cvs2, cvs2.mean(), cvs2.std())

[0.90813754 0.91105733 0.90909091] 0.9094285931269165 0.001215678701391952


The dummy classifier reaches 91% accuracy, which makes sense: roughly 91% of the images are not a 5, so a rule that always predicts "not 5" is already right about 91% of the time. Hence, accuracy is generally not a preferred performance measure when the dataset is skewed (i.e. when some classes are much more frequent than others).

In [8]:
# cross_val_predict returns one prediction per training sample, made by the
# model of the fold in which that sample was held out (i.e. by a model that
# did not see the sample during fitting). Use the shared `kfold` setting so
# these predictions come from the same number of folds as the scores above.
sgd_ypred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=kfold)

Precision is $\frac{TP}{TP+FP}$: of all the samples you predicted as class 1, how many really are class 1?

Recall is $\frac{TP}{TP+FN}$: of all the samples that really are class 1, how many were correctly predicted as class 1?

F1 score is the harmonic mean of precision and recall: $$F_1 = \frac{2}{\frac{1}{precision} + \frac{1}{recall}}$$ The harmonic mean gives more weight to low values. Hence, the classifier will only get a high F1 score if both precision and recall are high.

In [9]:
# Confusion matrix: rows are the actual classes, columns are the predicted classes.
print(confusion_matrix(y_train_5, sgd_ypred))

# Precision, recall and F1 of the cross-validated predictions, in that order.
for score_fn in (precision_score, recall_score, f1_score):
    print(score_fn(y_train_5, sgd_ypred))

[[52467  1644]
 [ 1100  4289]]
0.7229057812236642
0.7958804973093339
0.7576399929341107
