## Execute Classification Analysis for Type

In [1]:
import pandas as pd
from pandas import DataFrame

from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier as DTree
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix, classification_report

from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle

from sklearn import preprocessing
# Label encoder instance created alongside the imports.
# NOTE(review): `le` is not referenced anywhere else in the visible notebook — confirm it is still needed.
le = preprocessing.LabelEncoder()

#### Define functions

In [2]:
def run_classification_for_model(_model, _x_train, _x_test, _y_train, _y_test) -> None:
    """Fit ``_model`` on the training split and print its metrics on the test split.

    Prints macro-averaged precision, recall and F1-score, followed by the
    confusion matrix and the per-class classification report.

    Args:
        _model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. Callers in
            this notebook pass naive-Bayes and decision-tree classifiers.
        _x_train: Training features.
        _x_test: Test features.
        _y_train: Training labels. A single-column 2-D input (for example
            ``frame[['Type']]``) is flattened to 1-D before fitting.
        _y_test: Test labels, flattened the same way.
    """
    import numpy as np  # local import: numpy is not imported at notebook level

    def _flatten_single_column(labels):
        # scikit-learn expects 1-D labels; an (n, 1) column vector triggers the
        # DataConversionWarning "y = column_or_1d(y, warn=True)" on every fit.
        as_array = np.asarray(labels)
        if as_array.ndim == 2 and as_array.shape[1] == 1:
            return as_array.ravel()
        # Anything else (already 1-D, or genuinely multi-output) is left untouched.
        return labels

    _y_train = _flatten_single_column(_y_train)
    _y_test = _flatten_single_column(_y_test)

    _model.fit(_x_train, _y_train)
    y_pred = _model.predict(_x_test)

    # Compare predictions with the held-out targets using macro averaging,
    # i.e. every class contributes equally regardless of its support.
    print("Precision: %0.2f" % precision_score(_y_test, y_pred, average="macro"))
    print("Recall:  %0.2f" % recall_score(_y_test, y_pred, average="macro"))
    print("F1-score:  %0.2f" % f1_score(_y_test, y_pred, average="macro"))

    print(confusion_matrix(_y_test, y_pred))
    print(classification_report(_y_test, y_pred))

def run_cross_validation_for_model(_model, _x, _y) -> None:
    """Print 10-fold cross-validated precision, recall and F1-score for ``_model``.

    The data is shuffled once (with a fixed seed so repeated runs agree) and
    each metric is reported as ``mean (+/- 2 * std)`` over the folds.

    Args:
        _model: Unfitted estimator accepted by ``cross_val_score``.
        _x: Feature matrix.
        _y: Labels. A single-column 2-D input is flattened to 1-D.
    """
    import numpy as np  # local import: numpy is not imported at notebook level

    labels = np.asarray(_y)
    if labels.ndim == 2 and labels.shape[1] == 1:
        # Avoid scikit-learn's column-vector DataConversionWarning on every fold.
        labels = labels.ravel()

    # Fixed seed, matching the random_state used by the train/test splits in this notebook.
    x, y = shuffle(_x, labels, random_state=1)

    # The plain 'precision' / 'recall' / 'f1' scorers use binary averaging and
    # cannot score a target with more than two classes; switch to the
    # macro-averaged scorers in that case. Binary targets keep the plain scorers.
    suffix = "_macro" if np.unique(labels).size > 2 else ""

    precision = cross_val_score(_model, x, y, cv=10, scoring='precision' + suffix)
    recall = cross_val_score(_model, x, y, cv=10, scoring='recall' + suffix)
    f1 = cross_val_score(_model, x, y, cv=10, scoring='f1' + suffix)

    print("Precision: %0.2f (+/- %0.2f)" % (precision.mean(), precision.std() * 2))
    print("Recall: %0.2f (+/- %0.2f)" % (recall.mean(), recall.std() * 2))
    print("F1-score: %0.2f (+/- %0.2f)" % (f1.mean(), f1.std() * 2))

def run_model_multinomial(x_train, x_test, y_train, y_test) -> None:
    """Evaluate a default ``MultinomialNB`` classifier on the given split.

    Prints a heading, then hands fitting and metric reporting over to
    ``run_classification_for_model``.
    """
    print("Multinomial Naive-Bayes Classifier")
    run_classification_for_model(MultinomialNB(), x_train, x_test, y_train, y_test)
    # Cross-validation (run_cross_validation_for_model) is not run for this model.

def run_model_gaussian(x_train, x_test, y_train, y_test) -> None:
    """Evaluate a default ``GaussianNB`` classifier on the given split.

    Prints a heading, then hands fitting and metric reporting over to
    ``run_classification_for_model``.
    """
    print("Gaussian Naive-Bayes Classifier")
    run_classification_for_model(GaussianNB(), x_train, x_test, y_train, y_test)
    # Cross-validation (run_cross_validation_for_model) is not run for this model.

def run_model_bernoulli(x_train, x_test, y_train, y_test) -> None:
    """Evaluate a default ``BernoulliNB`` classifier on the given split.

    Prints a heading, then hands fitting and metric reporting over to
    ``run_classification_for_model``.
    """
    print("Bernoulli Naive-Bayes Classifier")
    run_classification_for_model(BernoulliNB(), x_train, x_test, y_train, y_test)
    # Cross-validation (run_cross_validation_for_model) is not run for this model.

def run_model_decision_tree(x_train, x_test, y_train, y_test) -> None:
    """Evaluate an entropy-criterion decision tree on the given split.

    Prints a heading, then hands fitting and metric reporting over to
    ``run_classification_for_model``.
    """
    print("Decision Tree Classifier")
    # random_state is pinned because tree construction breaks ties between
    # equally good splits at random; without it reruns can produce different trees.
    d_tree = DTree(criterion='entropy', random_state=1)
    run_classification_for_model(d_tree, x_train, x_test, y_train, y_test)
    # Cross-validation (run_cross_validation_for_model) is not run for this model.

def run_all_classifications(_x: DataFrame, _y: DataFrame) -> None:
    """Split the data 80/20 and report metrics for each enabled classifier.

    Args:
        _x: Feature columns.
        _y: Target column(s).
    """
    # train_test_split yields (x_train, x_test, y_train, y_test), which is exactly
    # the positional order every run_model_* helper expects.
    split = train_test_split(_x, _y, test_size=0.2, random_state=1)

    # run_model_multinomial is currently left out of the comparison.
    for evaluate in (run_model_gaussian, run_model_bernoulli, run_model_decision_tree):
        evaluate(*split)

def get_final_model(_x: DataFrame, _y: DataFrame):
    """Fit and return the final entropy-criterion decision tree.

    The tree is trained on the 80% training portion of the same seeded split
    used by ``run_all_classifications``; the remaining 20% is not used here.

    Args:
        _x: Feature columns.
        _y: Target column(s).

    Returns:
        The fitted decision-tree classifier.
    """
    # Only the training portion is needed; the held-out portion is discarded.
    # NOTE(review): the final model never sees 20% of `_x` — confirm that is intended.
    x_train, _, y_train, _ = train_test_split(_x, _y, test_size=0.2, random_state=1)

    # random_state is pinned so the returned model is reproducible across runs.
    return DTree(criterion='entropy', random_state=1).fit(x_train, y_train)

def run_final_model(_model, _x_test: DataFrame, _y_test: DataFrame) -> None:
    """Print evaluation metrics of an already fitted model on a test set.

    Reports macro-averaged precision, recall and F1-score, then the confusion
    matrix and the per-class classification report.

    Args:
        _model: Fitted estimator exposing ``predict(X)``.
        _x_test: Test features.
        _y_test: True labels for ``_x_test``.
    """
    predicted = _model.predict(_x_test)

    # Each entry pairs the exact output template with the scorer that fills it.
    macro_metrics = (
        ("Precision: %0.2f", precision_score),
        ("Recall:  %0.2f", recall_score),
        ("F1-score:  %0.2f", f1_score),
    )
    for template, scorer in macro_metrics:
        print(template % scorer(_y_test, predicted, average="macro"))

    for summarise in (confusion_matrix, classification_report):
        print(summarise(_y_test, predicted))

#### Read training dataset from pickle file

In [3]:
# Load the training dataset; the path is relative to the notebook's working directory.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files from a trusted source.
master: DataFrame = pd.read_pickle('./data/master.pickle')
# Quick sanity check of (rows, columns).
print(master.shape)

(344667, 119)


#### Select features for the classification analysis

In [40]:
# Candidate feature sets, smallest to largest; each one extends the previous set.
features_0 = ['Size']
features_1 = ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment'] + features_0
features_2 = features_1 + ['Weekly_Sales', 'Dept', 'IsHoliday', 'HasMarkDown']

# Label column, kept as a one-element list so `frame[target]` selects a single-column DataFrame.
target = ['Type']

#### Running classification models

In [41]:
# Baseline run: predict `Type` from the single `Size` feature (features_0).
run_all_classifications(master[features_0], master[target])

Gaussian Naive-Bayes Classifier
Precision: 0.87
Recall:  0.94
F1-score:  0.89
[[33201     0  2086]
 [ 1603 23644  1537]
 [    0     0  6863]]
              precision    recall  f1-score   support

           A       0.95      0.94      0.95     35287
           B       1.00      0.88      0.94     26784
           C       0.65      1.00      0.79      6863

    accuracy                           0.92     68934
   macro avg       0.87      0.94      0.89     68934
weighted avg       0.94      0.92      0.93     68934

Bernoulli Naive-Bayes Classifier
Precision: 0.17
Recall:  0.33
F1-score:  0.23
[[35287     0     0]
 [26784     0     0]
 [ 6863     0     0]]
              precision    recall  f1-score   support

           A       0.51      1.00      0.68     35287
           B       0.00      0.00      0.00     26784
           C       0.00      0.00      0.00      6863

    accuracy                           0.51     68934
   macro avg       0.17      0.33      0.23     68934
weighted

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [38]:
# Second run: predict `Type` from the five columns in features_1 (features_0 plus four more).
run_all_classifications(master[features_1], master[target])

Gaussian Naive-Bayes Classifier
Precision: 0.87
Recall:  0.94
F1-score:  0.89
[[33201     0  2086]
 [ 1603 23644  1537]
 [    0     0  6863]]
              precision    recall  f1-score   support

           A       0.95      0.94      0.95     35287
           B       1.00      0.88      0.94     26784
           C       0.65      1.00      0.79      6863

    accuracy                           0.92     68934
   macro avg       0.87      0.94      0.89     68934
weighted avg       0.94      0.92      0.93     68934

Bernoulli Naive-Bayes Classifier
Precision: 0.50
Recall:  0.33
F1-score:  0.23
[[35287     0     0]
 [26776     8     0]
 [ 6863     0     0]]
              precision    recall  f1-score   support

           A       0.51      1.00      0.68     35287
           B       1.00      0.00      0.00     26784
           C       0.00      0.00      0.00      6863

    accuracy                           0.51     68934
   macro avg       0.50      0.33      0.23     68934
weighted

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [39]:
# Third run: predict `Type` from the widest feature set (features_2, nine columns).
run_all_classifications(master[features_2], master[target])

Gaussian Naive-Bayes Classifier
Precision: 0.86
Recall:  0.93
F1-score:  0.88
[[33101   100  2086]
 [ 2516 22758  1510]
 [   20     7  6836]]
              precision    recall  f1-score   support

           A       0.93      0.94      0.93     35287
           B       1.00      0.85      0.92     26784
           C       0.66      1.00      0.79      6863

    accuracy                           0.91     68934
   macro avg       0.86      0.93      0.88     68934
weighted avg       0.93      0.91      0.91     68934

Bernoulli Naive-Bayes Classifier
Precision: 0.36
Recall:  0.33
F1-score:  0.23
[[35211    76     0]
 [26664   120     0]
 [ 6843    20     0]]
              precision    recall  f1-score   support

           A       0.51      1.00      0.68     35287
           B       0.56      0.00      0.01     26784
           C       0.00      0.00      0.00      6863

    accuracy                           0.51     68934
   macro avg       0.36      0.33      0.23     68934
weighted

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


#### Running finalized model on test set

In [43]:
# Load the separate test dataset; the path is relative to the notebook's working directory.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files from a trusted source.
test: DataFrame = pd.read_pickle('./data/test.pickle')
# Quick sanity check of (rows, columns).
print(test.shape)

(76903, 113)


In [44]:
# Fit the final decision tree on the features_1 columns of the training data.
# Note that get_final_model trains on an 80% split of what it is given.
model = get_final_model(master[features_1], master[target])

In [45]:
# Evaluate the fitted model on the separate test set, using the same feature columns it was trained on.
run_final_model(model, test[features_1], test[target])


Precision: 0.94
Recall:  0.98
F1-score:  0.95
[[37402     0  1797]
 [    0 29743     0]
 [  218     0  7743]]
              precision    recall  f1-score   support

           A       0.99      0.95      0.97     39199
           B       1.00      1.00      1.00     29743
           C       0.81      0.97      0.88      7961

    accuracy                           0.97     76903
   macro avg       0.94      0.98      0.95     76903
weighted avg       0.98      0.97      0.97     76903

