In [1]:
# Import Modules
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt 
from sklearn.datasets import fetch_openml

In [4]:
# Download data
# Fetch the 'mnist_784' dataset (version 1) from OpenML.
# as_frame=True pins the return types to pandas objects: later cells index
# with `.iloc`, which fails if the library default hands back NumPy arrays
# (the default for this argument has differed between scikit-learn releases).
mnist = fetch_openml('mnist_784', version = 1, as_frame = True)
# Extract data and target
# x: one row of pixel values per image; y: the digit label for each row
x, y = mnist["data"], mnist["target"]

In [None]:

# Preview one sample: view the first row's pixel values as a 28x28 grid
some_digit_image = x.iloc[0].to_numpy().reshape(28, 28)
plt.imshow(
    some_digit_image,
    cmap = "binary",
    interpolation = "nearest",
)
plt.axis("off")
plt.show()
# Label stored for that same row, shown as the cell output for comparison
y.iloc[0]

In [46]:
# Split data into train and test
# Rows before N_TRAIN form the training set; every remaining row is the test set.
N_TRAIN = 60000
# .iloc makes the positional row slicing explicit for both the frame and the series
x_train, x_test = x.iloc[:N_TRAIN], x.iloc[N_TRAIN:]
y_train, y_test = y.iloc[:N_TRAIN], y.iloc[N_TRAIN:]
# For the sake of simplicity, reduce the task to two classes: 5 or not-5.
# Labels are compared against the string '5', producing boolean targets.
y_train_5 = (y_train == '5')
y_test_5 = (y_test == '5')

In [None]:
# Linear classifier trained with Stochastic Gradient Descent (SGD)
# Docs: https://scikit-learn.org/stable/modules/sgd.html#classification
from sklearn.linear_model import SGDClassifier
# "clf" abbreviates "classifier"; random_state is fixed so reruns are repeatable
sgd_clf = SGDClassifier(random_state = 42)
# Train the binary 5 / not-5 detector on the training split
sgd_clf.fit(X = x_train, y = y_train_5)

In [56]:
# Performance measures
# K-fold cross-validation with 3 folds: each fold is scored by a model fitted on the others
from sklearn.model_selection import cross_val_score
# Accuracy can look strong yet mislead on imbalanced classes (the "accuracy paradox"):
# https://www.evidentlyai.com/classification-metrics/accuracy-precision-recall
cross_val_score(
    estimator = sgd_clf,
    X = x_train,
    y = y_train_5,
    scoring = "accuracy",
    cv = 3,
)

array([0.95035, 0.96035, 0.9604 ])

In [None]:
# Accuracy is only a trustworthy metric when the classes are balanced.
# Alternative: a confusion matrix, which needs one prediction per training sample.
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix, precision_score, recall_score
# 3-fold cross-validated predictions: each sample is predicted while it sits in the held-out fold
y_train_pred = cross_val_predict(estimator = sgd_clf, X = x_train, y = y_train_5, cv = 3)

In [62]:
# Concise metrics read off the confusion matrix:
#   1. Precision = TP / (TP + FP) -- of the samples predicted positive, the share that are actually positive
#   2. Recall    = TP / (TP + FN) -- of the actually positive samples, the share that were predicted positive
# Layout: rows are the actual classes, columns are the predicted classes
confusion_matrix(y_train_5, y_train_pred)

array([[53892,   687],
       [ 1891,  3530]], dtype=int64)

In [66]:
# Precision of the cross-validated predictions: TP / (TP + FP)
precision_score(y_train_5, y_train_pred)

0.8370879772350012

In [67]:
# Recall of the cross-validated predictions: TP / (TP + FN)
recall_score(y_train_5, y_train_pred)

0.6511713705958311