1. Build a classifier that achieves 97% accuracy on the test set.
Do not use a neural network.

In [1]:
# Import MNIST dataset
from sklearn.datasets import fetch_openml

# as_frame=False makes fetch_openml return NumPy arrays. Newer scikit-learn
# releases default to as_frame="auto", which yields a pandas DataFrame for
# this dataset and breaks the positional indexing (X_test[0]) and the
# .reshape calls used further down in this notebook.
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
mnist.keys()

dict_keys(['data', 'target', 'frame', 'feature_names', 'target_names', 'DESCR', 'details', 'categories', 'url'])

In [2]:
# Pull the feature matrix (X) and the labels (y) out of the fetched bunch
X = mnist["data"]
y = mnist["target"]
X.shape

(70000, 784)

In [3]:
# Function to plot image from the dataset
import matplotlib as mpl
import matplotlib.pyplot as plt

def plot_digit(digit):
  """Display a single flattened digit as a 28x28 image without axes.

  Args:
    digit: array-like exposing .reshape and holding 784 values (one image).
  """
  pixel_grid = digit.reshape(28, 28)
  # "nearest" avoids smoothing between pixels of the small image.
  plt.imshow(pixel_grid, interpolation="nearest", cmap=mpl.cm.binary)
  plt.axis("off")
  plt.show()

In [4]:
# cast y to integers
import numpy as np

# Labels are not integer-typed as loaded (presumably strings from
# fetch_openml -- verify); uint8 is wide enough for the digit classes.
y = y.astype(np.uint8)

In [5]:
# Separate train from test: the first 60000 samples are used for training,
# everything after that is held out for testing.
X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]

In [6]:
# Train with KNN classifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV

# Grid search takes too long to re-run, so the grid holds only the single
# best combination found in previous experiments.
n_neighbors, weights = [4], ['distance']
param_grid = {'n_neighbors': n_neighbors, 'weights': weights}
param_grid

{'n_neighbors': [4], 'weights': ['distance']}

In [7]:
# Fit directly with the previously selected hyperparameters; the disabled
# grid search below is left for reference.
knn_clf = KNeighborsClassifier(
    weights='distance',
    n_neighbors=4,
)
#grid_search = GridSearchCV(knn_clf, param_grid, cv=5,verbose=3)
#grid_search.fit(X_train, y_train)
knn_clf.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=4, p=2,
                     weights='distance')

In [9]:
from sklearn.metrics import accuracy_score

# Evaluate the baseline KNN classifier on the held-out test set.
y_pred = knn_clf.predict(X_test)
# accuracy_score expects (y_true, y_pred); accuracy is symmetric, so the
# score is unchanged, but this matches the documented argument order.
accuracy_score(y_test, y_pred)

0.9714

2. Data Augmentation on MNIST

In [10]:
# scipy.ndimage.interpolation is a deprecated namespace; import from scipy.ndimage
from scipy.ndimage import shift

# Data Augmentation method
def augment(image, dx, dy):
  """Return a copy of a flattened 28x28 image shifted by (dx, dy) pixels.

  Args:
    image: array of 784 values that can be reshaped to (28, 28).
    dx: horizontal shift in pixels (positive moves content to the right).
    dy: vertical shift in pixels (positive moves content down).

  Returns:
    A 1-D array of 784 values; pixels shifted in from outside the image are 0.
  """
  image = image.reshape((28, 28))
  # shift takes offsets in axis order (rows, columns), hence [dy, dx].
  shifted_image = shift(image, [dy, dx], cval=0, mode="constant")
  return shifted_image.reshape([-1])

In [11]:
# Create augmented dataset: every original image plus four one-pixel shifts
X_train_augmented = list(X_train)
y_train_augmented = list(y_train)

for dx, dy in ((1, 0), (-1, 0), (0, 1), (0, -1)):
  X_train_augmented.extend(augment(image, dx, dy) for image in X_train)
  # Each shifted image keeps the label of the image it was derived from.
  y_train_augmented.extend(y_train)

X_train_augmented = np.array(X_train_augmented)
y_train_augmented = np.array(y_train_augmented)

# Shuffle dataset with a fixed seed so a fresh run reproduces the same order
shuffle_idx = np.random.default_rng(42).permutation(len(X_train_augmented))
X_train_prepared = X_train_augmented[shuffle_idx]
y_train_prepared = y_train_augmented[shuffle_idx]

# Images and labels must stay aligned: the originals plus four shifted copies.
assert len(X_train_prepared) == len(y_train_prepared) == 5 * len(X_train)

In [17]:
# plot_digit(X_train_prepared[0])
# Sanity check: inspect the shape of one augmented training sample.
X_train_prepared[0].shape

(784,)

In [18]:
# Sanity check: a test sample should have the same shape as the augmented
# training samples the classifier is fitted on.
X_test[0].shape

(784,)

In [19]:
# Fit a second KNN, with the same hyperparameters as before, on the shuffled
# augmented training set.
knn_v2 = KNeighborsClassifier(
    weights='distance',
    n_neighbors=4,
)
knn_v2.fit(X_train_prepared, y_train_prepared)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=4, p=2,
                     weights='distance')

In [None]:
from sklearn.metrics import accuracy_score

# Score the augmented-data model on the untouched test set.
y_pred = knn_v2.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9763