# Machine Learning Project: MNIST

---

## Preliminary

Import the necessary modules:

In [0]:
# essentials
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('darkgrid')
import pickle

# preprocessing and decomposition
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA, KernelPCA, NMF

# mnist dataset from keras
from keras.datasets import mnist

# machine learning algorihtms
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

Load the dataset: (training set and test set)

In [2]:
(X_train, y_train), (X_test, y_test) = mnist.load_data()

Downloading data from https://s3.amazonaws.com/img-datasets/mnist.npz


Let's check the shape of training set and test set

In [3]:
print("X_train.shape: ", X_train.shape)
print("X_test.shape: ", X_test.shape)
print("y_train.shape: ", y_train.shape)
print("y_test.shape: ", y_test.shape)

X_train.shape:  (60000, 28, 28)
X_test.shape:  (10000, 28, 28)
y_train.shape:  (60000,)
y_test.shape:  (10000,)


Reshape (flatten) the feature tensor.

In [0]:
X_train = X_train.reshape(60000, 28*28).astype('float32')
X_test = X_test.reshape(10000, 28*28).astype('float32')

In [5]:
print("X_train.shape: ", X_train.shape)
print("X_test.shape: ", X_test.shape)

X_train.shape:  (60000, 784)
X_test.shape:  (10000, 784)


Normalize

In [0]:
X_train /= 255
X_test /= 255

Split the training set into smaller training set and validation set.

In [0]:
X_train_tr, X_train_val, y_train_tr, y_train_val = train_test_split(X_train, y_train, random_state=42, test_size=0.1)

In [8]:
print("X training  shape: ", X_train_tr.shape)
print("X validation shape: ", X_train_val.shape)
print("y training  shape: ", y_train_tr.shape)
print("y validation shape: ", y_train_val.shape)

X training  shape:  (54000, 784)
X validation shape:  (6000, 784)
y training  shape:  (54000,)
y validation shape:  (6000,)


---

## Machine Learning

### With Default Hyperparameters:

#### SGDClassifier

In [0]:
sgd = SGDClassifier().fit(X_train_tr, y_train_tr)

In [0]:
print(sgd.score(X_train_val, y_train_val))

0.915


#### LogisticRegression

In [0]:
logreg = LogisticRegression().fit(X_train_tr, y_train_tr)
print(logreg.score(X_train_val, y_train_val))

0.927


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


#### KNeighborsClassifier

In [0]:
knn = KNeighborsClassifier().fit(X_train_tr, y_train_tr)
print(knn.score(X_train_val, y_train_val))

#### RandomForestClassifier

In [0]:
rf = RandomForestClassifier().fit(X_train_tr, y_train_tr)
print(rf.score(X_train_val, y_train_val))

#### GradientBoostingClassifier

In [0]:
gbrt = GradientBoostingClassifier().fit(X_train_tr, y_train_tr)
print(gbrt.score(X_train_val, y_train_val))

### Tuned Models

#### SGDClassifier

In [0]:
sgd = SGDClassifier(penalty='elasticnet', l1_ratio=0.1, alpha=0.0001, epsilon=0.05).fit(X_train_tr, y_train_tr)
print(sgd.score(X_train_val, y_train_val))

#### LogisticRegression

In [0]:
logreg = LogisticRegression(penalty='l2', C=10).fit(X_train_tr, y_train_tr)
print(logreg.score(X_train_val, y_train_val))

#### KNeighborsClassifier

In [0]:
knn = KNeighborsClassifier(n_neighbors=5, weights='distance').fit(X_train_tr, y_train_tr)
print(knn.score(X_train_val, y_train_val))

#### RandomForestClassifier

In [0]:
rf = RandomForestClassifier(n_estimators=100, max_depth=5).fit(X_train_tr, y_train_tr)
print(rf.score(X_train_val, y_train_val))

### With Dimensionality Decomposition

In [0]:
N_COMPONENTS = 500

pca = PCA(n_components=N_COMPONENTS, whiten=True)
nmf = NMF(n_components=N_COMPONENTS)

In [0]:
X_train_tr_pca = pca.fit_transform(X_train_tr)
X_train_val_pca = pca.transform(X_train_val)
X_train_tr_nmf = nmf.fit_transform(X_train_tr)
X_train_val_nmf = nmf.transform(X_train_val)

#### PCA

In [0]:
sgd = SGDClassifier(penalty='elasticnet', l1_ratio=0.1, alpha=0.0001, epsilon=0.05).fit(X_train_tr_pca, y_train_tr)
logreg = LogisticRegression().fit(X_train_tr_pca, y_train_tr)
knn = KNeighborsClassifier(n_neighbors=5, weights='distance').fit(X_train_tr_pca, y_train_tr)
rf = RandomForestClassifier().fit(X_train_tr_pca, y_train_tr)

In [0]:
print("SGD accuracy score with PCA decomposition: ", sgd.score(X_train_val_pca, y_train_val))
print("LogisticRegression accuracy score with PCA decomposition: ", logreg.score(X_train_val_pca, y_train_val))
print("KNeighborsClassifier accuracy score with PCA decomposition: ", knn.score(X_train_val_pca, y_train_val))
print("RandomForestClassifier accuracy score with PCA decomposition: ", rf.score(X_train_val_pca, y_train_val))

#### NMF 

In [0]:
sgd = SGDClassifier(penalty='elasticnet', l1_ratio=0.1, alpha=0.0001, epsilon=0.05).fit(X_train_tr_nmf, y_train_tr)
logreg = LogisticRegression().fit(X_train_tr_nmf, y_train_tr)
knn = KNeighborsClassifier(n_neighbors=5, weights='distance').fit(X_train_tr_nmf, y_train_tr)
rf = RandomForestClassifier().fit(X_train_tr_nmf, y_train_tr)

In [0]:
print("SGD accuracy score with NMF decomposition: ", sgd.score(X_train_val_nmf, y_train_val))
print("LogisticRegression accuracy score with NMF decomposition: ", logreg.score(X_train_val_nmf, y_train_val))
print("KNeighborsClassifier accuracy score with NMF decomposition: ", knn.score(X_train_val_nmf, y_train_val))
print("RandomForestClassifier accuracy score with NMF decomposition: ", rf.score(X_train_val_nmf, y_train_val))

---

## Saving The Chosen Model

In [0]:
model = KNeighborsClassifier(n_neighbors=5, weights='distance').fit(X_train_tr, y_train_tr)
filename = 'finalized_model.sav'
pickle.dump(model, open(filename, 'wb'))

In [13]:
import sklearn
sklearn.__version__

'0.22.1'