First, let's import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures. We also check that Python 3.5 or later is installed as well as Scikit-Learn ≥0.20.

from sklearn.model_selection import cross_val_score

In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score


# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "classification"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

# Timing code

(borrowed from https://stackoverflow.com/questions/7370801/measure-time-elapsed-in-python)

In [2]:
from contextlib import contextmanager
from timeit import default_timer

@contextmanager
def elapsed_timer():
    start = default_timer()
    elapser = lambda: default_timer() - start
    yield lambda: elapser()
    end = default_timer()
    elapser = lambda: end-start
    


In [3]:
with elapsed_timer() as my_timer:
    sum = 0
    for x in range(1000000):
        sum += x
    print(my_timer())
    for x in range(1000000):
        sum += x
     
print(my_timer())

0.06217270001070574
0.12535200000274926


# MNIST

In [4]:
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1)
mnist.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [5]:
X, y = mnist["data"], mnist["target"]
X.shape

(70000, 784)

In [6]:
y = y.astype(np.uint8)

In [7]:
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]
some_digit = X.iloc[0]
X_train.shape

(60000, 784)

In [8]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))
X_test_scaled = scaler.transform(X_test.astype(np.float64))

# Multiclass classification

In [9]:
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(max_iter=1000, tol=1e-3, random_state=42)

In [10]:
from sklearn.svm import SVC

svm_clf = SVC(gamma="auto", random_state=42)
svm_clf.fit(X_train_scaled[:1000], y_train[:1000]) 
svm_clf.predict([some_digit])


array([7], dtype=uint8)

In [11]:
from sklearn.multiclass import OneVsRestClassifier
ovr_clf = OneVsRestClassifier(SVC(gamma="auto", random_state=42))
ovr_clf.fit(X_train_scaled[:1000], y_train[:1000])
ovr_clf.predict([some_digit])

array([7], dtype=uint8)

In [12]:
len(ovr_clf.estimators_)

10

In [13]:
with elapsed_timer() as sgd_timer:
    sgd_clf.fit(X_train_scaled, y_train)
print(f"sgd_clf.fit took {sgd_timer():.3f} secs")  # 300 seconds on my laptop
sgd_clf.predict([some_digit])

sgd_clf.fit took 207.081 secs


array([3], dtype=uint8)

213.051 secs (just in case)

In [14]:
sgd_clf.decision_function([some_digit])

array([[ -52074.00622961,  -90932.76755577,  -20253.71519718,
          13700.40693397, -153621.49226805,   -4632.87432725,
        -107360.91391427,  -90145.62086593,    -980.42911341,
         -44100.09135652]])

In [15]:
# This involves training 3 models - it's going to take a while
cross_val_score(sgd_clf, X_train_scaled, y_train, cv=3, scoring="accuracy")

array([0.8983, 0.891 , 0.9018])

took 7 mins 22 secs

In [16]:
sgd_clf.score(X_test_scaled, y_test)

0.8933

In [17]:
from sklearn.metrics import confusion_matrix
y_test_pred = sgd_clf.predict(X_test_scaled)
conf_mx_test = confusion_matrix(y_test, y_test_pred)
conf_mx_test

array([[ 937,    0,    0,    0,    0,    3,    4,    1,   35,    0],
       [   0, 1083,    5,    1,    0,    3,    4,    0,   39,    0],
       [   4,    3,  890,   14,    7,    2,   13,    6,   89,    4],
       [   4,    0,   13,  874,    0,   20,    2,    7,   82,    8],
       [   1,    0,    6,    0,  877,    1,    6,    4,   66,   21],
       [   5,    2,    1,   31,    8,  716,   17,    7,   98,    7],
       [  10,    2,   10,    0,    8,   13,  881,    1,   33,    0],
       [   2,    2,   16,    5,    5,    1,    0,  924,   49,   24],
       [   6,    4,    4,   15,    3,   24,    8,    1,  905,    4],
       [   5,    5,    0,    6,   24,    4,    0,   21,   98,  846]])

In [18]:
for size in [1000, 2000, 4000, 8000, 10000]: #20000]:
    with elapsed_timer() as svm_timer:
        svm_clf.fit(X_train_scaled[:size], y_train[:size])
    print(f"svm_clf.fit took {svm_timer():.3f} secs")
    with elapsed_timer() as ovr_timer:  # 300 seconds on my laptop
        ovr_clf.fit(X_train_scaled[:size], y_train[:size])
    print(f"ovr_clf.fit took {ovr_timer():.3f} secs")
    #sgd_clf.predict([some_digit])

svm_clf.fit took 0.117 secs
ovr_clf.fit took 0.294 secs
svm_clf.fit took 0.314 secs
ovr_clf.fit took 1.067 secs
svm_clf.fit took 0.902 secs
ovr_clf.fit took 6.032 secs
svm_clf.fit took 4.133 secs
ovr_clf.fit took 20.703 secs
svm_clf.fit took 7.222 secs
ovr_clf.fit took 31.144 secs


In [20]:
cross_val_score(svm_clf, X_train_scaled, y_train, cv=3, scoring="accuracy") 

array([0.95965, 0.96005, 0.9608 ])

In [21]:
cross_val_score(ovr_clf, X_train_scaled, y_train, cv=3, scoring="accuracy") # oh boy

array([0.9611, 0.9579, 0.9617])

svm: 10:03 (96%), ovr: 42:54 (96%)

In [22]:
train_reshaped = X_train_scaled.reshape(60000, 28, 28)
cropped_reshape = train_reshaped[:, 5:25, 3:26] # based on the pixel importance plot in book
cropped_reshape[0]
train_cropped = cropped_reshape.reshape(60000, 460)
train_cropped[0]

array([-2.20501560e-02, -5.18380750e-02, -9.06597035e-02, -1.39667106e-01,
       -1.97082121e-01, -2.64384728e-01, -3.40730442e-01, -4.22535749e-01,
       -5.15408649e-01, -5.84660366e-01, -5.41363044e-01, -6.26156785e-01,
       -6.69145031e-01,  3.43637162e-01,  5.31056017e-01,  1.10236573e+00,
       -2.65802578e-01,  1.71565971e+00,  3.77344822e+00,  4.93900230e+00,
        3.45006811e+00, -1.31080130e-01, -7.48460374e-02, -3.63656606e-02,
       -7.90243079e-02, -1.33289086e-01, -1.99154411e-01, -2.75429691e-01,
        8.82421166e-02, -1.60018965e-02,  4.42476487e-01,  8.18775057e-01,
        7.49339511e-01,  1.32200212e+00,  1.19900231e+00,  1.14748071e+00,
        1.16876765e+00,  1.25693737e+00,  1.17313576e+00,  9.33441548e-01,
        2.13777179e+00,  2.64653072e+00,  2.81275280e+00,  1.14339095e+00,
       -1.72939702e-01, -1.04104240e-01, -5.91585908e-02, -1.13004853e-01,
       -1.78441525e-01, -2.58004810e-01,  4.14936751e-01,  2.55139991e+00,
        2.15784271e+00,  

In [23]:
for size in [1000, 2000, 4000, 8000, 10000]: #20000]:
    with elapsed_timer() as svm_timer:
        svm_clf.fit(train_cropped[:size], y_train[:size])
    print(f"svm_clf.fit took {svm_timer():.3f} secs")
    with elapsed_timer() as ovr_timer:  # 300 seconds on my laptop
        ovr_clf.fit(train_cropped[:size], y_train[:size])
    print(f"ovr_clf.fit took {ovr_timer():.3f} secs")
    #sgd_clf.predict([some_digit])

svm_clf.fit took 0.094 secs
ovr_clf.fit took 0.205 secs
svm_clf.fit took 0.238 secs
ovr_clf.fit took 0.653 secs
svm_clf.fit took 0.860 secs
ovr_clf.fit took 2.690 secs
svm_clf.fit took 2.131 secs
ovr_clf.fit took 12.863 secs
svm_clf.fit took 3.784 secs
ovr_clf.fit took 19.249 secs


In [24]:
cross_val_score(sgd_clf, train_cropped, y_train, cv=3, scoring="accuracy") 

array([0.8686 , 0.86705, 0.8751 ])

In [25]:
cross_val_score(svm_clf, train_cropped, y_train, cv=3, scoring="accuracy")

array([0.96995, 0.968  , 0.9685 ])

In [26]:
cross_val_score(ovr_clf, train_cropped, y_train, cv=3, scoring="accuracy")

array([0.9721 , 0.96855, 0.9698 ])

sgd: 4:27 (87%), svm: 6:28 (96%) & ovr: 26:40 (96%)

Pixel Importance Code From Book:

In [27]:
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestClassifier

mnist = fetch_openml('mnist_784', version=1, as_frame=False)
mnist.target = mnist.target.astype(np.uint8)

In [28]:
rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rnd_clf.fit(mnist["data"], mnist["target"])

In [29]:
from sklearn.preprocessing import MinMaxScaler

In [30]:
minmaxscale = MinMaxScaler(feature_range=(0,1))
features = minmaxscale.fit_transform(rnd_clf.feature_importances_.reshape(-1,1))
features

array([[0.00000000e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [3.20347737e-05],
       [6.42126009e-05],
       [1.54511570e-04],
       [2.35168622e-04],
       [4.20730869e-04],
       [3.59091700e-04],


In [31]:
#features.sort()
#features.max() # max
#len(features)
#features[348] # top 30 pixels -> was about 0.49...

In [32]:
import numpy as np
import numpy.ma as ma

In [33]:
mask_array = ma.masked_less(features, 0.55)
mask = ~mask_array.mask
np.count_nonzero(mask == [True])

31

In [34]:
filtered_train = [ma.masked_array(data, mask=mask_array.mask).compressed() for data in X_train_scaled]
len(filtered_train[0])

31

In [35]:
for size in [1000, 2000, 4000, 8000, 10000]: #20000]:
    with elapsed_timer() as svm_timer:
        svm_clf.fit(filtered_train[:size], y_train[:size])
    print(f"svm_clf.fit took {svm_timer():.3f} secs")
    with elapsed_timer() as ovr_timer:  # 300 seconds on my laptop
        ovr_clf.fit(filtered_train[:size], y_train[:size])
    print(f"ovr_clf.fit took {ovr_timer():.3f} secs")
    #sgd_clf.predict([some_digit])

svm_clf.fit took 0.028 secs
ovr_clf.fit took 0.080 secs
svm_clf.fit took 0.082 secs
ovr_clf.fit took 0.256 secs
svm_clf.fit took 0.254 secs
ovr_clf.fit took 0.811 secs
svm_clf.fit took 0.791 secs
ovr_clf.fit took 2.718 secs
svm_clf.fit took 1.152 secs
ovr_clf.fit took 4.028 secs


In [36]:
cross_val_score(sgd_clf, filtered_train, y_train, cv=3, scoring="accuracy")

array([0.74225, 0.72775, 0.73425])

In [37]:
cross_val_score(svm_clf, filtered_train, y_train, cv=3, scoring="accuracy")

array([0.8872 , 0.88685, 0.8938 ])

In [38]:
cross_val_score(ovr_clf, filtered_train, y_train, cv=3, scoring="accuracy")

array([0.8889 , 0.88505, 0.89345])

sgd: 00:02 (73%), svm: 1:32 (89%), ovr: 4:41 (89%)

PCA Reduction

In [39]:
from sklearn.decomposition import PCA

In [40]:
pca = PCA()
pca_80 = PCA(n_components=0.8)

In [56]:
reduced_pca = pca.fit_transform(X_train_scaled)
reduced_pca80 = pca_80.fit_transform(X_train_scaled)

In [57]:
pca.explained_variance_ratio_

array([5.64671692e-02, 4.07827199e-02, 3.73938042e-02, 2.88511485e-02,
       2.52110863e-02, 2.19426996e-02, 1.92334439e-02, 1.74579923e-02,
       1.53509230e-02, 1.40171960e-02, 1.34174302e-02, 1.20374194e-02,
       1.11456955e-02, 1.08992356e-02, 1.02864922e-02, 9.94486564e-03,
       9.36383280e-03, 9.21045666e-03, 8.93436778e-03, 8.69912619e-03,
       8.27363019e-03, 8.03417369e-03, 7.64845500e-03, 7.41772464e-03,
       7.15292868e-03, 6.91846831e-03, 6.84135964e-03, 6.56674546e-03,
       6.31676724e-03, 6.12919839e-03, 5.96255295e-03, 5.87716416e-03,
       5.71591699e-03, 5.62307416e-03, 5.54682002e-03, 5.38418374e-03,
       5.31182250e-03, 5.19605602e-03, 5.08211255e-03, 4.80005571e-03,
       4.76455820e-03, 4.69139360e-03, 4.54348956e-03, 4.51345787e-03,
       4.46963401e-03, 4.43383155e-03, 4.38215469e-03, 4.30381751e-03,
       4.26877901e-03, 4.23647017e-03, 4.04696121e-03, 3.99447403e-03,
       3.97456119e-03, 3.93820800e-03, 3.85813590e-03, 3.79042674e-03,
      

In [59]:
pca_80.explained_variance_ratio_

array([0.05646717, 0.04078272, 0.0373938 , 0.02885115, 0.02521109,
       0.0219427 , 0.01923344, 0.01745799, 0.01535092, 0.0140172 ,
       0.01341743, 0.01203742, 0.0111457 , 0.01089924, 0.01028649,
       0.00994487, 0.00936383, 0.00921046, 0.00893437, 0.00869913,
       0.00827363, 0.00803417, 0.00764846, 0.00741772, 0.00715293,
       0.00691847, 0.00684136, 0.00656675, 0.00631677, 0.0061292 ,
       0.00596255, 0.00587716, 0.00571592, 0.00562307, 0.00554682,
       0.00538418, 0.00531182, 0.00519606, 0.00508211, 0.00480006,
       0.00476456, 0.00469139, 0.00454349, 0.00451346, 0.00446963,
       0.00443383, 0.00438215, 0.00430382, 0.00426878, 0.00423647,
       0.00404696, 0.00399447, 0.00397456, 0.00393821, 0.00385814,
       0.00379043, 0.00375403, 0.00370776, 0.00364944, 0.00359301,
       0.00352382, 0.00347794, 0.00344411, 0.00339868, 0.00335955,
       0.00334886, 0.00331864, 0.00323026, 0.00316277, 0.00313244,
       0.00310731, 0.00307243, 0.00304914, 0.00302717, 0.00299

Using base PCA model

In [60]:
for size in [1000, 2000, 4000, 8000, 10000]: #20000]:
    with elapsed_timer() as svm_timer:
        svm_clf.fit(reduced_pca[:size], y_train[:size])
    print(f"svm_clf.fit took {svm_timer():.3f} secs")
    with elapsed_timer() as ovr_timer:  # 300 seconds on my laptop
        ovr_clf.fit(reduced_pca[:size], y_train[:size])
    print(f"ovr_clf.fit took {ovr_timer():.3f} secs")
    #sgd_clf.predict([some_digit])

svm_clf.fit took 0.121 secs
ovr_clf.fit took 0.423 secs
svm_clf.fit took 0.355 secs
ovr_clf.fit took 1.463 secs
svm_clf.fit took 1.349 secs
ovr_clf.fit took 6.273 secs
svm_clf.fit took 5.262 secs
ovr_clf.fit took 21.706 secs
svm_clf.fit took 9.050 secs
ovr_clf.fit took 34.770 secs


Using variance of 80%

In [61]:
for size in [1000, 2000, 4000, 8000, 10000]: #20000]:
    with elapsed_timer() as svm_timer:
        svm_clf.fit(reduced_pca80[:size], y_train[:size])
    print(f"svm_clf.fit took {svm_timer():.3f} secs")
    with elapsed_timer() as ovr_timer:  # 300 seconds on my laptop
        ovr_clf.fit(reduced_pca80[:size], y_train[:size])
    print(f"ovr_clf.fit took {ovr_timer():.3f} secs")

svm_clf.fit took 0.077 secs
ovr_clf.fit took 0.277 secs
svm_clf.fit took 0.317 secs
ovr_clf.fit took 0.860 secs
svm_clf.fit took 1.168 secs
ovr_clf.fit took 4.355 secs
svm_clf.fit took 3.811 secs
ovr_clf.fit took 15.265 secs
svm_clf.fit took 5.384 secs
ovr_clf.fit took 24.902 secs


Final Comparisons for small datasubset (ovr)

In [62]:
cross_val_score(ovr_clf, X_train_scaled[:5000], y_train[:5000], cv=3, scoring="accuracy") 

array([0.89562088, 0.92861428, 0.90816327])

In [63]:
cross_val_score(ovr_clf, train_cropped[:5000], y_train[:5000], cv=3, scoring="accuracy") 

array([0.91481704, 0.93941212, 0.92557023])

In [64]:
cross_val_score(ovr_clf, filtered_train[:5000], y_train[:5000], cv=3, scoring="accuracy") 

array([0.82543491, 0.85362927, 0.83133253])

In [66]:
cross_val_score(ovr_clf, reduced_pca[:5000], y_train[:5000], cv=3, scoring="accuracy") 

array([0.89562088, 0.92861428, 0.90816327])

In [65]:
cross_val_score(ovr_clf, reduced_pca80[:5000], y_train[:5000], cv=3, scoring="accuracy") 

array([0.8830234 , 0.93041392, 0.8877551 ])

### Results:
1. x_train_scaled: 20.4s, ~90%
2. train_cropped: 12.3s, ~92%
3. filtered_train: 3.2s, ~83%
4. reduced_pca (default): ~20.6s, ~90%
5. reduced_pca (80%): ~12.1s, ~89%