In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# The MNIST dataset: Classification of handwritten digits

In [None]:
# read the MNIST sample straight from the course's GitHub repository
url = 'https://raw.githubusercontent.com/um-perez-alvaro/Data-Science-Practice/master/Data/digits.csv'
digits = pd.read_csv(filepath_or_buffer=url)
# last expression of the cell: preview the first five rows
digits.head(n=5)

In [None]:
# pixel intensities: the first 784 columns of the frame
X = digits.iloc[:, :784]
# digit labels: the 'class' column
y = digits.loc[:, 'class']

There are 5,000 images, and each image has 784 features. This is because each image is $28\times 28$ pixels, and each feature simply represents one pixel's intensity, from 0 (white) to 255 (black).

Let's take a peek at one digit from the dataset. 
All we need to do is grab an instance's feature vector (that is, a row of `X`), reshape it to a $28\times 28$ array, and display it using `plt.imshow()`.

In [None]:
# first image as a flat NumPy vector; .to_numpy() is the accessor pandas
# recommends over .values, and it matches the gallery loop further down
some_digit = X.loc[0, :].to_numpy()
# the flat vector is folded back into its 28x28 pixel grid
some_digit_image = some_digit.reshape(28, 28)
plt.imshow(some_digit_image, cmap='binary')
plt.axis('off')  # hide ticks and frame; only the image is of interest
plt.show()

This looks like a 5, and indeed that's what the label tells us

In [None]:
# label stored in y for index 0, i.e. for the image displayed above
y[0]

The following figure shows a few more images from the dataset

In [None]:
# 10x10 gallery: one panel per image, filled row by row with rows 0..99 of X
fig, axes = plt.subplots(10, 10, figsize=(10, 10))
for i, ax in enumerate(axes.flat):
    # flat feature vector -> 28x28 pixel grid
    digit_image = X.loc[i, :].to_numpy().reshape(28, 28)
    ax.imshow(digit_image, cmap='binary')
    ax.axis('off')

**Part 1:** We are going to train a k-nearest neighbor model

Import the tools we need, split the data into training and test sets, and instantiate a k-nearest neighbors model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# untuned k-NN classifier: every hyperparameter is left at its scikit-learn default
knn = KNeighborsClassifier()

# hold out part of the data for testing; random_state=0 fixes the shuffle so
# the same split is produced on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

Use a grid search to tune the classifier's hyperparameters `n_neighbors` and `weights`

In [None]:
from sklearn.model_selection import GridSearchCV

# candidate values for each hyperparameter: k = 1..30 and both weighting schemes
k_range = [*range(1, 31)]
weight_options = ['uniform', 'distance']

# search space handed to the grid search: hyperparameter name -> values to try
param_grid = {
    'n_neighbors': k_range,
    'weights': weight_options,
}
param_grid

In [None]:
# exhaustive search over param_grid; each candidate is scored by 5-fold
# cross-validated accuracy on the training split
grid = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid.fit(X_train, y_train)

In [None]:
# hyperparameter combination (n_neighbors, weights) that the grid search
# selected as best under the cross-validated accuracy score
grid.best_params_

In [None]:
# classifier built with the best hyperparameters found by the search
# (GridSearchCV refits it on the full training data when `refit` is left at
# its default of True, as it is here)
grid.best_estimator_

**Part 2:** We'll use accuracy and a confusion matrix to evaluate the performance of our model on the test set.

In [None]:
# load the separate hold-out test set
url = 'https://raw.githubusercontent.com/um-perez-alvaro/Data-Science-Practice/master/Data/digits_test.csv'
digits_test = pd.read_csv(url)

# feature matrix and target vector
# NOTE: these assignments replace the X_test / y_test produced earlier by
# train_test_split; from here on "test set" means the rows of digits_test.csv
X_test = digits_test.iloc[:,0:784]
y_test = digits_test['class']

# preview the test data; this must be the cell's last expression, because
# Jupyter only renders the final expression of a cell (a mid-cell .head()
# is evaluated and silently discarded)
digits_test.head()

In [None]:
# import accuracy_score and confusion_matrix from sklearn.metrics


In [None]:
# accuracy


In [None]:
# confusion matrix
