**Digit Recognizer**

In [12]:
#import libraries
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold 
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import Pipeline
from scipy.stats import uniform

Read files

In [13]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

In [15]:
#print(train)
Y = train["label"]
X = train.drop(["label"],axis = 1)

if X.shape[0] > 1:
    print("Files have been read")

X_test = test

nr_samples = 30000
y_train=Y[:nr_samples]
X_train=X[:nr_samples]
start_ix_val = nr_samples 
end_ix_val = nr_samples + int(nr_samples/3)
y_val=Y[start_ix_val:end_ix_val]
X_val=X[start_ix_val:end_ix_val]

m = X.shape[0]

Files have been read


EDA

In [None]:
print("X -> :")
print(X.info())

print("X_test -> :")
print(X_test.info())

print("Y -> :")
print(Y.shape)

In [None]:
#See the dist of all the labels in Y
Y.value_counts().plot(kind = "bar") # Looks equally distributed

In [None]:
#check images for the first for each label
li_idx = []
for i in range(5):
    for nr in range(10):
        ix = Y[Y==nr].index[i]
        li_idx.append(ix)
print(li_idx)

#plot the images
fig,axis = plt.subplots(5,10,sharex=True,sharey=True,figsize = (10,6))
axis = axis.flatten()
for n,i in enumerate(li_idx):
    #print(n,i)
    im = X.iloc[i]
    #print(im)
    im = im.values.reshape(-1,28,28,1)
    #print(im)
    axis[n].imshow(im[0,:,:,0])
    axis[n].set_title(Y[i])
plt.tight_layout()
    

**MODELS**

In [16]:
#logistic regression
from sklearn.linear_model import LogisticRegression
clf_LR = LogisticRegression()
param_grid = {'C':[0.014,0.012],'multi_class':['multinomial'],'penalty':['l1'],'solver':['saga'],'tol':[0.1]}
gridCV_LR = GridSearchCV(clf_LR,param_grid=param_grid,verbose=1,cv = 5)
gridCV_LR.fit(X_train,y_train)


Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  3.1min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.014, 0.012], 'penalty': ['l1'], 'multi_class': ['multinomial'], 'solver': ['saga'], 'tol': [0.1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [50]:
print(gridCV_LR.best_score_) 
#print(gridCV_LR.best_params_)
#gridCV_LR.grid_scores_

0.9165333333333333


In [27]:
#random forest
from sklearn.ensemble import RandomForestClassifier
clf_RF = RandomForestClassifier()
param_grid_RF = {'max_depth':[15],'max_features':[100],'min_samples_split':[5],'n_estimators':[50]}
gridCV_RF = GridSearchCV(clf_RF,param_grid=param_grid_RF,verbose=1,cv = 5)
gridCV_RF.fit(X_train,y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.0min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [50], 'max_features': [100], 'min_samples_split': [5], 'max_depth': [15]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [48]:
print(gridCV_RF.best_score_)

0.9525666666666667


In [40]:
#Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier
clf_GB = GradientBoostingClassifier()
param_grid_GB = {'learning_rate':[0.01,0.05,0.1],'n_estimators': [50], 'max_features': [100], 'min_samples_split': [5]}
gridCV_GB = GridSearchCV(clf_GB,param_grid=param_grid_GB,cv=5,verbose=1)
gridCV_GB.fit(X_train,y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed: 16.3min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'learning_rate': [0.01, 0.05, 0.1], 'max_features': [100], 'min_samples_split': [5], 'n_estimators': [50]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [42]:
gridCV_GB.best_params_

{'learning_rate': 0.1,
 'max_features': 100,
 'min_samples_split': 5,
 'n_estimators': 50}

In [47]:
y_pred_LR = gridCV_LR.predict(X_val)
y_pred_GB = gridCV_GB.predict(X_val)
y_pred_RF = gridCV_LR.predict(X_val)

accuracy_LR = accuracy_score(y_pred=y_pred_LR,y_true=y_val)*100
accuracy_RF = accuracy_score(y_pred=y_pred_RF,y_true=y_val)*100
accuracy_GB = accuracy_score(y_pred=y_pred_GB,y_true=y_val)*100

print(" Logistic Regression: %1.2f" %accuracy_LR + '%')
print(" Random Forest : %1.2f" %accuracy_RF + '%')
print(" Gradient Boosting: %1.2f" %accuracy_GB + '%')

 Logistic Regression: 91.84%
 Random Forest : 91.84%
 Gradient Boosting: 92.46%


In [69]:
#fig,axis = plt.subplots(1,3,sharex=True,sharey=True)
print("Logistic regression: Confusion Matrix")
confusion_matrix(y_pred_LR,y_val)

Logistic regression: Confusion Matrix


array([[ 953,    0,    4,    2,    2,   11,    9,    3,    6,    5],
       [   0, 1104,    6,    5,    7,    6,    2,    3,   28,    8],
       [   6,    3,  875,   33,    4,    8,   10,   16,    7,    7],
       [   3,    3,   15,  956,    1,   29,    1,    7,   24,   14],
       [   1,    2,    9,    0,  850,   13,    9,    7,    2,   28],
       [  12,    1,    4,   25,    1,  778,   11,    0,   28,    8],
       [  14,    0,   14,    2,   12,   19,  904,    1,    2,    1],
       [   3,    2,   11,   13,    1,    3,    0,  979,    3,   32],
       [   6,   12,   25,   14,    7,   36,    5,    1,  885,    5],
       [   0,    0,    4,   10,   29,   11,    0,   49,   10,  900]],
      dtype=int64)

In [70]:
print("Gradient Boost: Confusion Matrix")
confusion_matrix(y_pred_GB,y_val)

Gradient Boost: Confusion Matrix


array([[ 964,    0,    8,    3,    3,   11,   11,    7,    3,    3],
       [   0, 1109,    3,    9,    4,   10,    3,    6,   24,    8],
       [   2,    4,  886,   36,    2,    1,    3,   12,    6,    4],
       [   3,    5,   14,  935,    1,   35,    0,    4,   15,   10],
       [   2,    0,   11,    0,  856,    6,   13,    9,    4,   32],
       [   5,    4,    5,   22,    3,  813,   19,    3,   17,    8],
       [   6,    2,   10,    4,    2,   14,  890,    0,    1,    0],
       [   1,    0,    9,   12,    1,    1,    1,  987,    3,   29],
       [  15,    3,   19,   25,    7,   11,   11,    4,  900,    8],
       [   0,    0,    2,   14,   35,   12,    0,   34,   22,  906]],
      dtype=int64)

In [72]:
print("Random Forest: Confusion Matrix")
confusion_matrix(y_pred_RF,y_val)

Random Forest: Confusion Matrix


array([[ 953,    0,    4,    2,    2,   11,    9,    3,    6,    5],
       [   0, 1104,    6,    5,    7,    6,    2,    3,   28,    8],
       [   6,    3,  875,   33,    4,    8,   10,   16,    7,    7],
       [   3,    3,   15,  956,    1,   29,    1,    7,   24,   14],
       [   1,    2,    9,    0,  850,   13,    9,    7,    2,   28],
       [  12,    1,    4,   25,    1,  778,   11,    0,   28,    8],
       [  14,    0,   14,    2,   12,   19,  904,    1,    2,    1],
       [   3,    2,   11,   13,    1,    3,    0,  979,    3,   32],
       [   6,   12,   25,   14,    7,   36,    5,    1,  885,    5],
       [   0,    0,    4,   10,   29,   11,    0,   49,   10,  900]],
      dtype=int64)

**TRY NEURAL NETWORK AND SEE IF THIS IMPROVES ACCURACY**

In [74]:
from sklearn.neural_network import MLPClassifier
clf_NN = MLPClassifier(activation = "logistic",hidden_layer_sizes = (200,20))
param_grid_NN = {'batch_size':[32], 'solver': ['sgd'], 'learning_rate':['constant'], 
                 'learning_rate_init':[0.01,0.001], 'max_iter':[300]}
gridCV_NN = GridSearchCV(clf_NN,cv=5,verbose=1,param_grid=param_grid_NN)
gridCV_NN.fit(X_train,y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 51.1min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(200, 20), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'learning_rate_init': [0.01, 0.001], 'learning_rate': ['constant'], 'max_iter': [300], 'solver': ['sgd'], 'batch_size': [32]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

**CHECK IF NN DOES WELL ON TEST AND VALIDATION**

In [77]:
print("Training Accuracy: %1.2f"%gridCV_NN.best_score_)
y_pred_NN = gridCV_NN.predict(X_val)
print("Test Accuracy: %1.2f"%accuracy_score(y_pred_NN,y_val))
print("NN Confusion")
confusion_matrix(y_pred_NN,y_val)

Training Accuracy: 0.95
Test Accuracy: 0.95
NN Confusion


array([[ 970,    0,    9,    1,    1,    6,   11,    2,    3,    5],
       [   0, 1103,    1,    3,    5,    4,    3,    3,    8,    4],
       [   2,    3,  919,   19,    1,    2,    2,    7,    3,    2],
       [   0,    3,    3,  993,    0,   21,    0,    4,   14,   13],
       [   1,    2,    4,    0,  857,    7,    1,    2,    2,   25],
       [   4,    0,    1,   21,    0,  853,    7,    3,    9,    7],
       [   9,    1,    9,    2,   10,   13,  923,    0,    3,    1],
       [   2,    3,   11,    4,    1,    2,    0, 1027,    0,   26],
       [   8,   12,    9,   13,    4,    3,    4,    2,  942,    5],
       [   2,    0,    1,    4,   35,    3,    0,   16,   11,  920]],
      dtype=int64)

In [78]:
#Write NN predictions into a csv
submission = gridCV_NN.predict(X_test)

In [84]:
csv = pd.DataFrame(submission).reset_index()
csv.columns=['ImageID','Label']
csv["ImageID"]+=1

In [86]:
csv.to_csv("../submission.csv")

In [88]:
#write GBM predictions into a csv
sub2 = gridCV_GB.predict(X_test)
csv2 = pd.DataFrame(sub2).reset_index()
csv2.columns = ['ImageID','Label']
csv2['ImageID'] += 1
csv2.to_csv("../submission.csv")