In [54]:
import time
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml

%matplotlib inline
plt.style.use('seaborn')

In [5]:
# Use RELOAD=True mode to import pickles and avoid retraining models 
from joblib import dump, load
model_fpath = 'saved_models/3_'
RELOAD = True

# Exercise 1)
Build a classifier for the MNIST dataset tha achieves over 97% accuracy on the test set

In [16]:
if RELOAD:
    mnist = load(model_fpath+'mnist.joblib')
else:
    mnist = fetch_openml('mnist_784', version=1)
    dump(mnist, model_fpath+'mnist.joblib')

X, y = mnist['data'], mnist['target'].astype(np.uint8)

In [17]:
# recall data is already shuffled into training and test set
X_train, y_train, X_test, y_test = X[:60000], y[:60000], X[60000:], y[60000:]

We'll start with a small set and to write the code and then run on the full training set

In [19]:
X_train_small, y_train_small = X_train[:5000], y_train[:5000]

In [24]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_predict

knn_clf = KNeighborsClassifier()
knn_preds = cross_val_predict(knn_clf, X_train_small, y_train_small, cv=3, verbose=2)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    8.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   26.0s finished


In [49]:
from sklearn.metrics import precision_score, recall_score, f1_score

def print_model_scores(y_actual, y_preds, average=None):
    precision = precision_score(y_actual, y_preds, average=average)
    recall = recall_score(y_actual, y_preds, average=average)
    f1 = f1_score(y_actual, y_preds, average=average)

    print('Model results')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1: {f1}')

In [29]:
print_model_scores(y_train_small, knn_preds, 'macro')

Model results
Precision: 0.9252946891621784
Recall: 0.9202217478956463
F1: 0.9215018037617095


In [51]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'weights': ['uniform', 'distance'], 'n_neighbors': np.r_[1:10]}
]
grid_search = GridSearchCV(knn_clf, param_grid, cv=3)
grid_search.fit(X_train_small, y_train_small)

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=None,
             param_grid=[{'n_neighbors': array([1, 2, 3, 4, 5, 6, 7, 8, 9]),
                          'weights': ['uniform', 'distance']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [52]:
grid_clf = grid_search.best_estimator_
grid_search.best_params_

{'n_neighbors': 4, 'weights': 'distance'}

In [53]:
grid_preds = cross_val_predict(grid_clf, X_train_small, y_train_small, cv=3)
print_model_scores(y_train_small, grid_preds, 'macro')

Model results
Precision: 0.9316239364223113
Recall: 0.9279879556944662
F1: 0.9289948152946156


Grid search on this model trained with a small sample slightly improved results and yields n_neighbours = 4. With this, I can reduce the search space for n_neighbors to say range(3, 6) for the full dataset to save some time during training as the full dataset is very large

In [55]:
# Grid Search for full dataset
if RELOAD:
    grid_search = load(model_fpath + 'ex1_knn.joblib')
else:
    start_time = time.time()
    param_grid = [{'weights':['uniform', 'distance'], 'n_neighbors':np.r_[3:7]}]
    print("Initiating GridSearch")
    grid_search = GridSearchCV(knn_clf, param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    print(f'GridSearch complete in {(time.time()-start_time)/60.0:.2f} minutes')
    dump(grid_search, model_fpath+'ex1_knn.joblib')

Initiating GridSearch
GridSearch complete in 568.43 minutes


AttributeError: 'GridSearchCV' object has no attribute 'best_estimator'

In [57]:
start_time = time.time()
grid_clf = grid_search.best_estimator_
print(f'Found best params {grid_search.best_params_}')

print(f'Initiating CrossValPredict on training set')
grid_preds = cross_val_predict(grid_clf, X_train, y_train, cv=3)
print(f'Validation completed in {(time.time()-start_time)/60.0:.2f} minutes')
print_model_scores(y_train, grid_preds, average='macro')

Found best params {'n_neighbors': 4, 'weights': 'distance'}
Initiating CrossValPredict on training set
Validation completed in 800.63 minutes
Model results
Precision: [0.97985012 0.96184739 0.98721934 0.96971193 0.97932233 0.96290829
 0.97712855 0.96174433 0.97943123 0.94711618]
Recall: [0.9934155  0.99466034 0.95938234 0.96085467 0.9647381  0.96255303
 0.98901656 0.97509976 0.93590839 0.96335519]
F1: [0.98658618 0.97797871 0.9731018  0.96526299 0.97197551 0.96273063
 0.98303661 0.968376   0.95717532 0.95516667]


In [75]:
test_set_preds = cross_val_predict(grid_clf, X_test, y_test, cv=3)

In [76]:
print_model_scores(y_test, test_set_preds, average='macro')

Model results
Precision: 0.9421027040033261
Recall: 0.9402735042228934
F1: 0.9407514123508827


Model is slightly overfitting the train set. Perhaps scaling can help

In [None]:
from sklearn.preprocessing import StandardScaler