In [130]:
import numpy as np
import pandas as pd
from sklearn import tree
import sklearn.metrics as metrics # accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import cross_val_score

import sys
sys.path.append('plot_confusion_matrix.ipynb')

In [131]:
# loading train and test data and cleaning missing values
raw_df_train = pd.read_csv('../pendigits_data/pendigits.tra', delimiter=',', header=None)
raw_df_test = pd.read_csv('../pendigits_data/pendigits.tes', delimiter=',', header=None)
# print(raw_df_train.head()) # prints sample of the dataset
train_df_clean = raw_df_train.dropna() # drop any rows with missing values
print("Dropped rows in train set: %d" %(raw_df_train.shape[0] - train_df_clean.shape[0])) # number of rows dropped for some missing values
# print(train_df_clean.describe()) # prints statistics column wise for train data

test_df_clean = raw_df_test.dropna()
print("Dropped rows in train set: %d" %(raw_df_test.shape[0] - test_df_clean.shape[0])) # number of rows dropped for some missing values
# print(test_df_clean.describe()) # prints statistics column wise for test data

Dropped rows in train set: 0
Dropped rows in train set: 0


In [132]:
# seperating the label column for train and test set
x_train = train_df_clean.drop(axis=1, columns=[16])
y_train = train_df_clean.iloc[:, 16]

x_test = test_df_clean.drop(axis=1, columns=[16])
y_test = test_df_clean.iloc[:, 16]

print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

(7494, 16) (7494,) (3498, 16) (3498,)


In [133]:
def print_accuray_precision_recall(y_true, y_predict):
    """Prints accuray, presion for each classes and recall for each classes."""
    print(metrics.accuracy_score(y_true, y_predict)) # accuracy score
    print(metrics.precision_score(y_true, y_predict, average=None)) # precision scores for each class
    print(metrics.recall_score(y_true, y_predict, average=None)) # recall score for each class

In [134]:
# gini decision tree model. accracy, precision and recall for training instances. And confusion matrix.
gini_model = tree.DecisionTreeClassifier(criterion='gini')
gini_model.fit(x_train, y_train)
y_predict = gini_model.predict(x_train)
print_accuray_precision_recall(y_train, y_predict)
#plot_confusion_matrix(y_train, y_predict, classes=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'], figsize=(10, 10))

1.0
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [135]:
# gini decision tree model. accracy, precision and recall for test instances. And confusion matrix.
gini_model = tree.DecisionTreeClassifier(criterion='gini')
gini_model.fit(x_train, y_train)
y_predict = gini_model.predict(x_test)
print_accuray_precision_recall(y_test, y_predict)
#plot_confusion_matrix(y_test, y_predict, classes=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'], figsize=(10, 10))

0.9210977701543739
[0.95 0.86 0.88 0.88 0.98 0.95 0.95 0.91 0.92 0.93]
[0.96 0.88 0.96 0.93 0.96 0.85 0.95 0.9  0.9  0.93]


In [136]:
# gini decision tree model, with 5-fold cross validation
gini_model = tree.DecisionTreeClassifier(criterion='gini')
cross_val_score(gini_model, x_train, y_train, cv=5)

array([0.96, 0.97, 0.95, 0.96, 0.97])

In [137]:
# entropy decision tree model. accracy, precision and recall for train instances. And confusion matrix.
entropy_model = tree.DecisionTreeClassifier(criterion='entropy')
entropy_model.fit(x_train, y_train)
y_predict = entropy_model.predict(x_train)
print_accuray_precision_recall(y_train, y_predict)
#plot_confusion_matrix(y_train, y_predict, classes=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'], figsize=(10, 10))

1.0
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [138]:
# entropy decision tree model. accracy, precision and recall for test instances. And confusion matrix.
entropy_model = tree.DecisionTreeClassifier(criterion='entropy')
entropy_model.fit(x_train, y_train)
y_predict = entropy_model.predict(x_test)
print_accuray_precision_recall(y_test, y_predict)
#plot_confusion_matrix(y_test, y_predict, classes=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'], figsize=(10, 10))

0.9210977701543739
[0.98 0.87 0.88 0.87 0.95 0.94 0.96 0.97 0.91 0.9 ]
[0.97 0.9  0.96 0.94 0.92 0.83 0.9  0.9  0.98 0.91]


In [139]:
# entropy decision tree model, with 5-fold cross validation
entropy_model = tree.DecisionTreeClassifier(criterion='entropy')
cross_val_score(entropy_model, x_train, y_train, cv=5)

array([0.96, 0.97, 0.97, 0.97, 0.97])