## Objective
In this exercise, your goal is to correctly identify digits from a dataset of handwritten images.

The dataset contains gray-scale images of hand-drawn digits, from zero through nine. It contains 42,000 images.
Each image is 28 pixels in height and 28 pixels in width, for a total of 784 pixels. Each pixel has a single pixel-value associated with it, indicating the lightness or darkness of that pixel, with higher numbers meaning darker. This pixel-value is an integer between 0 and 255, inclusive.
The dataset has 785 columns. The first column, called "label", is the digit that was drawn by the user. The rest of the columns contain the pixel-values of the associated image.
Each pixel column in the training set has a name like pixelx, where x is an integer between 0 and 783, inclusive. To locate this pixel on the image, suppose that we have decomposed x as x = i * 28 + j, where i and j are integers between 0 and 27, inclusive. Then pixelx is located on row i and column j of a 28 x 28 matrix (indexing from zero).

You are expected to:
- Experiment with different models and settings and decide on the best model for this dataset
- Write a brief report (2-4 pages maximum) describing the choices you made and the evaluation you performed

### Import Libraries


In [None]:
# Basic packages
import pandas as pd
import numpy as np
import random
import itertools
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

# Machine learning packages
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from xgboost import XGBClassifier
from sklearn.svm import SVC

# Utilities
import os 
import gc

### Load the Dataset

In [None]:
# Read the full labelled digit dataset (one row per image) into a DataFrame.
all_digits = pd.read_csv('digit_recognizer_dataset.csv')

### Data Overview

In [None]:
# Peek at the first rows to confirm the label/pixel column layout.
all_digits.head()

In [None]:
def print_digit():
    """Plot one randomly chosen sample image for each digit class 0-9.

    Reads the module-level ``all_digits`` DataFrame, whose first column is the
    label and whose remaining 784 columns are the pixel values, and draws the
    samples on a 2 x 5 grid of grayscale images.
    """
    fig, axs = plt.subplots(2, 5, sharex='col', sharey='row',
                            gridspec_kw={'hspace': 0, 'wspace': 0}, figsize=(19, 8))
    fig.suptitle('MNIST sample digits')
    for i in range(10):
        candidates = all_digits.loc[all_digits['label'] == i]
        # randrange excludes the upper bound; random.randint(0, n) could return
        # n itself and index one past the last row (IndexError).
        row = candidates.iloc[random.randrange(len(candidates.index))]
        # Drop the leading label entry, keeping only the 784 pixel values.
        pixels = np.array(row[1:], dtype='float').reshape((28, 28))
        # Digits 0-4 go on the first row of the grid, 5-9 on the second.
        axs[i // 5, i % 5].imshow(pixels, cmap='gray')


# Show one random example of every digit as a quick sanity check of the data.
print_digit()

### Train / Test Split

In [None]:
# Every column after the leading "label" column is a pixel feature.
features = all_digits.columns[1:]
X = all_digits[features]
y = all_digits['label']

# Pixel values lie in 0..255, so dividing by 255 rescales the features to [0, 1].
# 10% of the rows are held out as the test set; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X / 255.,
    y,
    test_size=0.1,
    random_state=0,
)

### Models Declaration


In [None]:

# Candidate classifiers as (short name, estimator) pairs, each built with its
# default settings. The order here is the order used in the comparison below.
models = [
    ('XGB', XGBClassifier()),
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('kNN', KNeighborsClassifier()),
    ('DT', DecisionTreeClassifier()),
    ('MLP', MLPClassifier()),
    ('RF', RandomForestClassifier()),
    ('SVM', SVC()),
]

### Models Comparison

In [None]:
# Cross-validate every candidate on the training split and collect the
# accuracy scores (one array per model) for the comparison plot.
results = []
names = []
for name, model in models:
    cv_results = cross_val_score(
        model,
        X_train,
        y_train,
        scoring='accuracy',
        n_jobs=-1,
        verbose=2,
    )
    results.append(cv_results)
    names.append(name)
    # Report the mean accuracy and its spread across the folds.
    print("%03s: %f (+/- %f)" % (name, cv_results.mean(), cv_results.std()))

In [None]:
# Box plot of the cross-validation accuracies, one box per model.
# Boxes are drawn at positions 1..len(names), so label those ticks with the model names.
tick_positions = list(range(1, len(names) + 1))
plt.boxplot(results)
plt.xticks(tick_positions, names)
plt.show()

### Hyperparameter Tuning

In [None]:
# Candidate values for each XGBoost hyperparameter explored by the grid search.
max_depth      = [5   , 6   , 7   ]
subsample      = [0.7 , 0.8 , 0.9 ]
reg_alpha      = [0   , 0.1 , 0.2 ]
reg_lamda      = [1   , 1.1 , 1.2 ]
min_split_loss = [0   , 5   , 10  ]

# Order matters: the search loop reads each combination by position
# (0=max_depth, 1=subsample, 2=reg_alpha, 3=reg_lambda, 4=min_split_loss).
# The former `learning_rate` entry was never defined (NameError) and would also
# have shifted min_split_loss away from index 4, so it is not part of the grid.
iterables = [max_depth, subsample, reg_alpha, reg_lamda, min_split_loss]

# Full Cartesian product of the candidate values, one tuple per combination.
combinations = list(itertools.product(*iterables))


In [None]:
# Grid search: fit a small XGBoost model for every hyperparameter combination
# and keep the combination with the best held-out accuracy.
params_accuracy = []

# Select hyperparameters on a validation split carved out of the training data.
# Scoring the candidates on X_test would leak the test set into model selection
# and make the final test accuracy optimistically biased.
X_fit, X_val, y_fit, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=0)
eval_set = [(X_fit, y_fit), (X_val, y_val)]

for params in combinations:
    # params is read by position:
    # (max_depth, subsample, reg_alpha, reg_lambda, min_split_loss).
    # `verbosity` is the XGBClassifier constructor option for log level;
    # `verbose` is only a fit() argument.
    model = XGBClassifier(n_estimators=5, n_jobs=-1, verbosity=0,
                          max_depth=params[0], subsample=params[1],
                          reg_alpha=params[2], reg_lambda=params[3],
                          min_split_loss=params[4])
    # verbose=False stops fit() from printing the evaluation metric every round
    # for each of the many fits.
    # NOTE(review): newer XGBoost releases expect early_stopping_rounds and
    # eval_metric on the constructor instead of fit() - confirm against the
    # installed version.
    model.fit(X_fit, y_fit, early_stopping_rounds=10, eval_metric=["merror"],
              eval_set=eval_set, verbose=False)
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    params_accuracy.append(accuracy)

# Combination with the highest validation accuracy (first one wins on ties).
best_params = combinations[np.argmax(params_accuracy)]


### Model Train

In [None]:
# Fit the final model on the training data.
# Only `XGBClassifier` is imported from xgboost (there is no `xgb` alias in
# scope), so the class is referenced directly.
model = XGBClassifier(n_estimators=500,n_jobs=-1,max_depth=7,subsample=0.9)
# The first entry is reported as 'validation_0' (train), the second as
# 'validation_1' (test) in evals_result().
eval_set = [(X_train,y_train), (X_test, y_test)]
model.fit(X_train, y_train, early_stopping_rounds=10, eval_metric=["merror"], eval_set=eval_set)
y_pred = model.predict(X_test)
# evaluate predictions
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
# retrieve performance metrics
results = model.evals_result()
epochs = len(results['validation_0']['merror'])
x_axis = range(0, epochs)

# plot the multiclass error (merror) per boosting round for both evaluation sets
fig, ax = plt.subplots()
ax.plot(x_axis, results['validation_0']['merror'], label='Train')
ax.plot(x_axis, results['validation_1']['merror'], label='Test')
ax.legend()
plt.ylabel('merror')
plt.title('XGBoost merror')
plt.show()

