In [1]:
# set up
import pandas as pd
import numpy as np
np.set_printoptions(formatter={'float_kind':"{:3.2f}".format})
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

# get data
X = pd.read_csv('xTrain.csv')
X = X.loc[:, ~X.columns.str.contains('^Unnamed')]
print(X.head())

# format training data for sklean
Y = pd.read_csv('yTrain.csv')
Y = Y.loc[:, ~Y.columns.str.contains('^Unnamed')]
Y = Y['Category']
print(Y.head())

   pix_1  pix_2  pix_3  pix_4  pix_5  pix_6  pix_7  pix_8  pix_9  pix_10  ...  \
0      0      0      0      0      0      0      0      0      0       0  ...   
1      0      0      0      0      0      0      0      0      0       0  ...   
2      0      0      0      0      0      0      0      0      0       0  ...   
3      0      0      0      0      0      0      0      0      0       0  ...   
4      0      0      0      0      0      0      0      0      0       0  ...   

   pix_775  pix_776  pix_777  pix_778  pix_779  pix_780  pix_781  pix_782  \
0        0        0        0        0        0        0        0        0   
1        0        0        0        0        0        0        0        0   
2        0        0        0        0        0        0        0        0   
3        0        0        0        0        0        0        0        0   
4        0        0        0        0        0        0        0        0   

   pix_783  pix_784  
0        0        0  
1     

In [8]:
# decision trees
print("Searching for the best decision tree.")
model = DecisionTreeClassifier(random_state=1)

# grid search
param_grid = {'max_depth': list(range(10,21)), 'criterion': ['entropy','gini'] }
grid = GridSearchCV(model, param_grid, cv=5)
grid.fit(X, Y)
print("Grid Search: best parameters: {}".format(grid.best_params_))

# accuracy of best model with confidence interval
best_model_tree = grid.best_estimator_
predict_Y = best_model_tree.predict(X)
acc = accuracy_score(Y, predict_Y)
print("Accuracy: {:3.6f}".format(acc))

# build the confusion matrix
labels = [0,1,2,3,4,5,6,7,8,9]
cm = confusion_matrix(Y, predict_Y, labels=labels)
cm_df = pd.DataFrame(cm, index=labels, columns=labels)
print("Confusion Matrix:\n{}".format(cm_df))

Searching for the best decision tree.
Grid Search: best parameters: {'criterion': 'entropy', 'max_depth': 15}
Accuracy: 0.996500
Confusion Matrix:
      0     1     2     3     4     5     6     7     8     9
0  3960     0     0     0     0     0     0     0     0     0
1     2  4536     1     1     0     0     0     4     6     0
2     1     5  3954     1     0     0     0     8     1     0
3     0     2     4  4042     0     2     0     9     3     2
4     0     0     0     1  3863     0     0     1     1    25
5     0     3     2     2     1  3588     0     0     0     2
6     0     0     0     0     0     0  3964     0     0     0
7     0     3     8     0     2     0     0  4150     1     2
8     1     3     2     0     0     1     0     0  3813     4
9     1     0     0     1     3     3     1    10     4  3990


In [5]:
# KNN
print("Searching for the best KNN.")
model = KNeighborsClassifier()

# grid search
param_grid = {'n_neighbors': list(range(2,11))}
grid = GridSearchCV(model, param_grid, cv=5)
grid.fit(X, Y)
print("Grid Search: best parameters: {}".format(grid.best_params_))

# accuracy of best model with confidence interval
best_model_knn = grid.best_estimator_
predict_Y = best_model_knn.predict(X)
acc = accuracy_score(Y, predict_Y)
print("Accuracy: {:3.6f}".format(acc))

# build the confusion matrix
labels = [0,1,2,3,4,5,6,7,8,9]
cm = confusion_matrix(Y, predict_Y, labels=labels)
cm_df = pd.DataFrame(cm, index=labels, columns=labels)
print("Confusion Matrix:\n{}".format(cm_df))

Searching for the best KNN.
Grid Search: best parameters: {'n_neighbors': 3}
Accuracy: 0.982875
Confusion Matrix:
      0     1     2     3     4     5     6     7     8     9
0  3943     2     1     0     0     2     8     1     1     2
1     0  4538     5     1     0     0     0     4     1     1
2    17    21  3878     6     3     0     1    37     6     1
3     3     2    17  3987     0    19     1    18     8     9
4     1    20     2     1  3824     0     6     3     0    34
5    10     1     4    27     3  3520    25     1     2     5
6     9     4     1     0     4     5  3941     0     0     0
7     1    30     7     1     4     0     0  4096     0    27
8    15    30    16    22    16    28     6     4  3667    20
9     7     9     4    12    21     8     2    27     2  3921


In [9]:
# predict test data with the best tree model
# get data
XT = pd.read_csv('xTest.csv')
XT = XT.loc[:, ~X.columns.str.contains('^Unnamed')]

# predict
YT = best_model_tree.predict(XT)
YT = pd.DataFrame(data = YT, index = range(len(YT)), columns = ['Category'])
YT.to_csv('yTest_tree.csv', index_label = 'ID')

In [7]:
# predict test data with the best tree model
# get data
XT = pd.read_csv('xTest.csv')
XT = XT.loc[:, ~X.columns.str.contains('^Unnamed')]

# predict
YT = best_model_knn.predict(XT)
YT = pd.DataFrame(data = YT, index = range(len(YT)), columns = ['Category'])
YT.to_csv('yTest_knn.csv', index_label = 'ID')