# Train K-Nearest Neighbor (KNN) Classifier

## Fetch MNIST Dataset 

In [1]:
import sklearn
from sklearn.datasets import fetch_openml
import numpy as np

In [2]:
mnist = fetch_openml('mnist_784', version=1, as_frame=False)

  warn(


## Split Data in Train and Test

In [3]:
mnist.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [4]:
#X= data, y= target
X, y = mnist["data"], mnist["target"]
y = y.astype(np.uint8)
y.shape

(70000,)

In [5]:
#training set = 60000, test set = 10000 (total 70000)
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

## Train KNN

In [6]:
from sklearn.neighbors import KNeighborsClassifier
knn_classifier = KNeighborsClassifier(weights='distance', n_neighbors=7)
knn_classifier.fit(X_train, y_train)

In [7]:
y_knn_pred = knn_classifier.predict(X_test)

In [8]:
from sklearn.metrics import accuracy_score
accuracy_score_knn = accuracy_score(y_test, y_knn_pred)
accuracy_score_knn

0.97

# Fine-tune Hyperparameters with Grid Search

In [9]:
from sklearn.model_selection import GridSearchCV
## GridSearchCV for KNN runs long, so I commented it out

# param_grid_knn_classifier = [{'weights': ["uniform", "distance"], 
#                             'n_neighbors': [3, 4, 5, 6]}]

# knn_classifier_grid = KNeighborsClassifier()
# grid_search_knn = GridSearchCV(knn_classifier_grid, param_grid_knn_classifier, cv=5, verbose=3)
# grid_search_knn.fit(X_train, y_train)

In [10]:
# grid_knn_best_params = grid_search_knn.best_params_
# grid_knn_best_params
# #output: {'n_neighbors': 4, 'weights': 'distance'}

In [11]:
# grid_knn_best_score = grid_search_knn.best_score_
# grid_knn_best_score
# #output: 0.9716166666666666

# Evaluate the best KNN model on the test set

In [12]:
from sklearn.metrics import accuracy_score
KNeighborsClassifier(weights='distance', n_neighbors=4)
best_knn_model = knn_classifier.fit(X_train, y_train)

y_knn_best_model_pred = best_knn_model.predict(X_test)

In [13]:
accuracy_score_knn_best_model = accuracy_score(y_test, y_knn_best_model_pred)
accuracy_score_knn_best_model

0.97

# Fine-tune, train and evaluate SGD and Random Forest Classifier

## SGD

### Fine-Tune the Hyperparameters

In [14]:
from sklearn.linear_model import SGDClassifier

##Commented out GridSearchCV for SGDClassifier because it runs long
# param_grid_sgd_classifier =  {
#     'loss': ['hinge', 'log_loss'],
#     'penalty': ['l2', 'elasticnet'],  
#     'alpha': [1e-4, 1e-3],  
#     'max_iter': [1000, 2000]
# }


# sgd_classifier_grid = SGDClassifier()
# grid_search_sgd = GridSearchCV(sgd_classifier_grid, param_grid_sgd_classifier, cv=3, verbose=3)
# grid_search_sgd.fit(X_train, y_train)

In [15]:
# grid_sgd_best_params = grid_search_sgd.best_params_
# grid_sgd_best_params
# #output: {'alpha': 0.0001, 'loss': 'hinge', 'max_iter': 2000, 'penalty': 'l2'}

In [16]:
# grid_sgd_best_score = grid_search_sgd.best_score_
# grid_sgd_best_score
# #output: 0.8845000000000001

In [17]:
# grid_search_sgd_results = grid_search_sgd.cv_results_
# grid_search_sgd_results

In [18]:
import pandas as pd

# results_grid_sgd = [
#     {'Score': mean_score, **params} 
#     for mean_score, params in zip(grid_search_sgd_results["mean_test_score"], grid_search_sgd_results["params"])
# ]
# grid_search_sgd_results_df = pd.DataFrame(results_grid_sgd)
# grid_search_sgd_results_df_sorted = grid_search_sgd_results_df.sort_values(by='Score', ascending=False)
# grid_search_sgd_results_df_sorted.to_csv('output/grid_sgd_sorted.csv', index=False)
# grid_search_sgd_results_df_sorted

## Random Forest Classifier

### Fine-Tune the Hyperparameters

In [19]:
from sklearn.ensemble import RandomForestClassifier
## Commented out GridSearchCV for RandomForestClassifier because it runs long
# param_grid_forest_classifier =  {
#     'n_estimators': [300, 1000],
#     'min_samples_leaf': [10, 15, 20],
#     'min_samples_split': [50, 75]
#     }

# forest_classifier_grid = RandomForestClassifier()
# grid_search_forest = GridSearchCV(forest_classifier_grid, param_grid_forest_classifier, cv=3, verbose=3)
# grid_search_forest.fit(X_train, y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV 1/3] END min_samples_leaf=10, min_samples_split=50, n_estimators=300;, score=0.948 total time= 1.2min
[CV 2/3] END min_samples_leaf=10, min_samples_split=50, n_estimators=300;, score=0.945 total time= 1.2min
[CV 3/3] END min_samples_leaf=10, min_samples_split=50, n_estimators=300;, score=0.950 total time= 1.2min
[CV 1/3] END min_samples_leaf=10, min_samples_split=50, n_estimators=1000;, score=0.948 total time= 4.0min
[CV 2/3] END min_samples_leaf=10, min_samples_split=50, n_estimators=1000;, score=0.946 total time= 4.0min
[CV 3/3] END min_samples_leaf=10, min_samples_split=50, n_estimators=1000;, score=0.950 total time= 4.0min
[CV 1/3] END min_samples_leaf=10, min_samples_split=75, n_estimators=300;, score=0.944 total time= 1.2min
[CV 2/3] END min_samples_leaf=10, min_samples_split=75, n_estimators=300;, score=0.941 total time= 1.2min
[CV 3/3] END min_samples_leaf=10, min_samples_split=75, n_estimators=300;, score=0.946 t

In [24]:
# grid_forest_best_params = grid_search_forest.best_params_
# grid_forest_best_params
# #Output: {'min_samples_leaf': 10, 'min_samples_split': 50, 'n_estimators': 1000}

{'min_samples_leaf': 10, 'min_samples_split': 50, 'n_estimators': 1000}

In [25]:
# grid_forest_best_score = grid_search_forest.best_score_
# grid_forest_best_score
# #Output: 0.9478833333333334

0.9478833333333334

In [26]:
# grid_search_forest_results = grid_search_forest.cv_results_

In [28]:
# results_grid_forest = [
#     {'Score': mean_score, **params} 
#     for mean_score, params in zip(grid_search_forest_results["mean_test_score"], grid_search_forest_results["params"])
# ]
# grid_search_forest_results_df = pd.DataFrame(results_grid_forest)
# grid_search_forest_results_df_sorted = grid_search_forest_results_df.sort_values(by='Score', ascending=False)
# grid_search_forest_results_df_sorted.to_csv('output/grid_forest_class_sorted.csv', index=False)
# grid_search_forest_results_df_sorted

Unnamed: 0,Score,min_samples_leaf,min_samples_split,n_estimators
1,0.947883,10,50,1000
0,0.947433,10,50,300
5,0.946483,15,50,1000
4,0.945817,15,50,300
9,0.944383,20,50,1000
3,0.944,10,75,1000
8,0.943933,20,50,300
2,0.9435,10,75,300
7,0.9423,15,75,1000
6,0.941933,15,75,300


### 