# Supervised Learning Model
in the following notebook three models will be trained: DecisionTreeClassifier(), RandomForestClassifier() and 

In [1]:
# Utils
import pickle

# Data Science Utils
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Data Science
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import *

In [3]:
train_df = pd.read_csv("../data/fashion-mnist_train.csv", sep=",")
test_df = pd.read_csv("../data/fashion-mnist_test.csv", sep=",")

In [9]:
# setup train and test again
train_data = np.array(train_df, dtype="float32")
test_data = np.array(test_df, dtype="float32")

In [10]:
x_train = train_data[:, 1:]
y_train = train_data[:, 0]

x_test = test_data[:, 1:]
y_test = test_data[:, 0]

In [11]:
labels_names = ["T-shirt/top", "Trouser", "Pullover", "Dress", "Coat",
               "Sandal", "Shirt", "Sneaker", "Bag", "Ankle boot"]

In [8]:
clf_decisionTree = DecisionTreeClassifier()
clf_decisionTree.fit(x_train, y_train)

In [9]:
y_pred = clf_decisionTree.predict(x_test)
print("accuracy %s" % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

accuracy 0.8004
              precision    recall  f1-score   support

         0.0       0.72      0.74      0.73      1000
         1.0       0.95      0.96      0.96      1000
         2.0       0.69      0.70      0.69      1000
         3.0       0.81      0.81      0.81      1000
         4.0       0.68      0.69      0.69      1000
         5.0       0.90      0.89      0.89      1000
         6.0       0.56      0.55      0.55      1000
         7.0       0.86      0.86      0.86      1000
         8.0       0.93      0.90      0.92      1000
         9.0       0.89      0.91      0.90      1000

    accuracy                           0.80     10000
   macro avg       0.80      0.80      0.80     10000
weighted avg       0.80      0.80      0.80     10000



Average accuracy is 0.8. Trouser(1) and Bag(8) seems easy to recognise. Let do hyperparameter tuning

In [13]:
param_grid = {"max_depth": [2, 4, 6, 8, 10],
              "min_samples_leaf": [1, 2, 4, 6, 8, 10],
              "min_samples_split": [2, 4, 6, 8, 10],
              "criterion": ["gini", "entropy"]}

grid_search = RandomizedSearchCV(clf_decisionTree, param_grid, cv=5)
grid_search.fit(x_train, y_train)

In [15]:
y_pred = grid_search.predict(x_test)
print("accuracy %s" % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

accuracy 0.8104
              precision    recall  f1-score   support

         0.0       0.76      0.79      0.77      1000
         1.0       0.97      0.93      0.95      1000
         2.0       0.70      0.74      0.72      1000
         3.0       0.80      0.86      0.83      1000
         4.0       0.69      0.72      0.71      1000
         5.0       0.91      0.87      0.89      1000
         6.0       0.62      0.51      0.56      1000
         7.0       0.82      0.89      0.85      1000
         8.0       0.93      0.92      0.93      1000
         9.0       0.89      0.88      0.88      1000

    accuracy                           0.81     10000
   macro avg       0.81      0.81      0.81     10000
weighted avg       0.81      0.81      0.81     10000



An increase of 1% which is not much at all

In [18]:
# Run the code below to export the model
# with open("DecisionTree.pkl", "wb") as f:
#     pickle.dump(grid_search.best_estimator_, f)

In [12]:
clf_rf = RandomForestClassifier()
clf_rf.fit(x_train, y_train)

In [13]:
y_pred = clf_rf.predict(x_test)
print("accuracy %s" % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

accuracy 0.8839
              precision    recall  f1-score   support

         0.0       0.80      0.87      0.84      1000
         1.0       0.99      0.97      0.98      1000
         2.0       0.81      0.81      0.81      1000
         3.0       0.89      0.92      0.91      1000
         4.0       0.81      0.86      0.83      1000
         5.0       0.97      0.95      0.96      1000
         6.0       0.74      0.60      0.66      1000
         7.0       0.92      0.93      0.93      1000
         8.0       0.95      0.97      0.96      1000
         9.0       0.94      0.95      0.94      1000

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000



Average accuracy of 0.88 is pretty high

In [14]:
# Define the parameter grid
param_grid = {'n_estimators': np.arange(50,500,50),
              'max_depth': np.arange(5,30,5),
              'min_samples_split': np.arange(2,10,2),
              'min_samples_leaf': np.arange(1,10,2)}

In [16]:
rs_rfc = RandomizedSearchCV(clf_rf, param_distributions=param_grid, n_iter=10, cv=5, n_jobs=-1)

In [18]:
rs_rfc.fit(x_train, y_train)

In [19]:
y_pred = rs_rfc.predict(x_test)
print("accuracy %s" % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

accuracy 0.8802
              precision    recall  f1-score   support

         0.0       0.81      0.86      0.83      1000
         1.0       0.99      0.97      0.98      1000
         2.0       0.80      0.80      0.80      1000
         3.0       0.88      0.92      0.90      1000
         4.0       0.79      0.87      0.83      1000
         5.0       0.97      0.94      0.95      1000
         6.0       0.76      0.60      0.67      1000
         7.0       0.91      0.92      0.92      1000
         8.0       0.95      0.97      0.96      1000
         9.0       0.93      0.95      0.94      1000

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000



In [21]:
# model without hyperparameter tuning
# with open("randomForest.pkl", "wb") as f:
#     pickle.dump(clf_rf, f)