# Setup

In [59]:
import os

from sklearn.datasets import load_digits
from sklearn.datasets import fetch_openml

import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# MNIST

In [60]:
# Pull the "mnist_784" dataset (version 1) from OpenML; as_frame=False asks for
# arrays rather than a pandas DataFrame.
# Disabled alternative loader: mnist = load_digits()
mnist = fetch_openml(name='mnist_784', version=1, as_frame=False)

In [72]:
# Feature matrix lives under the "data" key of the fetched dataset.
X = mnist["data"]
# Labels live under "target"; cast them to unsigned 8-bit integers so later
# numeric comparisons (e.g. against the digit 5) operate on integers.
y = mnist["target"].astype(np.uint8)

In [65]:
# Plot a single digit image
def plot_digit(data):
    """Draw one digit image on the current matplotlib axes.

    Parameters
    ----------
    data : numpy.ndarray
        Pixel values of a single digit; must contain exactly 28 * 28 = 784
        elements so it can be reshaped to a 28x28 image.
    """
    pixels = data.reshape(28, 28)
    imshow_kwargs = {"cmap": mpl.cm.binary, "interpolation": "nearest"}
    plt.imshow(pixels, **imshow_kwargs)
    # Hide ticks and frame: only the digit itself is of interest.
    plt.axis("off")

# Example usage: plot_digit(X[0])
    
# Plot several digit images tiled into one grid
def plot_digits(instances, images_per_row=10, **options):
    """Plot a batch of 28x28 digit images tiled into a single grid figure.

    Parameters
    ----------
    instances : sequence of array-like
        Digit images; each one must hold 28 * 28 = 784 values. Must support
        ``len()``.
    images_per_row : int, default 10
        Maximum number of digits per grid row (capped at ``len(instances)``).
    **options
        Extra keyword arguments forwarded to ``plt.imshow``. ``cmap`` defaults
        to ``mpl.cm.binary`` but can be overridden here.

    Raises
    ------
    ValueError
        If ``instances`` is empty or ``images_per_row`` is smaller than 1.
    """
    size = 28
    n_instances = len(instances)
    if n_instances == 0:
        raise ValueError("plot_digits() needs at least one instance to plot")
    if images_per_row < 1:
        raise ValueError("images_per_row must be at least 1")

    images_per_row = min(n_instances, images_per_row)
    n_rows = (n_instances - 1) // images_per_row + 1  # ceiling division

    # Stack every digit as a (size, size) picture: shape (n_instances, size, size).
    images = np.stack([np.asarray(instance).reshape(size, size)
                       for instance in instances])

    # Pad with blank (all-zero) pictures so that the last grid row is complete.
    n_empty = n_rows * images_per_row - n_instances
    if n_empty:
        blanks = np.zeros((n_empty, size, size), dtype=images.dtype)
        images = np.concatenate([images, blanks], axis=0)

    # (row, col, y, x) -> (row, y, col, x) -> one big 2-D picture.
    image = (images.reshape(n_rows, images_per_row, size, size)
                   .transpose(0, 2, 1, 3)
                   .reshape(n_rows * size, images_per_row * size))

    # Use the binary colormap unless the caller supplied one; passing cmap both
    # explicitly and through **options would raise a duplicate-keyword TypeError.
    options.setdefault("cmap", mpl.cm.binary)
    plt.figure(figsize=(9, 9))
    plt.imshow(image, **options)
    plt.axis("off")
    
# Example usage: plot_digits(X[:100])

# Split

In [73]:
# The first 60,000 samples form the training set; everything after is the test set.
X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]

# Binary classifier

In [74]:
# Boolean targets for the binary task: True where the label is the digit 5.
y_train_5, y_test_5 = (y_train == 5), (y_test == 5)

In [75]:
from sklearn.linear_model import SGDClassifier

In [79]:
# Linear classifier trained with stochastic gradient descent on the
# "5 vs. not-5" targets; random_state is pinned so the fit is reproducible.
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train_5)

SGDClassifier(random_state=42)

In [80]:
from sklearn.model_selection import cross_val_predict

# Cross-validated predictions over 3 folds: every training sample receives a
# prediction from a model that was fitted without that sample's fold.
y_train_pred = cross_val_predict(estimator=sgd_clf, X=X_train, y=y_train_5, cv=3)

In [81]:
from sklearn.metrics import confusion_matrix
# Rows are the actual classes and columns the predicted ones, so for this
# binary task the layout is [[TN, FP], [FN, TP]].
confusion_matrix(y_train_5, y_train_pred)

array([[53892,   687],
       [ 1891,  3530]], dtype=int64)

In [82]:
from sklearn.metrics import precision_score, recall_score

# Confusion matrix of the cross-validated predictions, laid out as
# [[TN, FP], [FN, TP]] (rows: actual class, columns: predicted class).
cm = confusion_matrix(y_train_5, y_train_pred)

# Precision computed by hand, TP / (FP + TP). This is the value the cell
# displays; it should agree with precision_score(y_train_5, y_train_pred).
cm[1, 1] / (cm[0, 1] + cm[1, 1])

0.8370879772350012

In [84]:
# Recall = TP / (TP + FN): the share of actual 5s that the classifier detected.
# Manual equivalent from the confusion matrix: cm[1, 1] / (cm[1, 0] + cm[1, 1])
recall_score(y_train_5, y_train_pred)

0.6511713705958311

In [None]:
# Precision = TP / (TP + FP): the share of predicted 5s that really are 5s.
# Manual equivalent from the confusion matrix: cm[1, 1] / (cm[0, 1] + cm[1, 1])
precision_score(y_train_5, y_train_pred)