In [24]:
import pandas as pd
import numpy as np
from pandas import DataFrame

from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier as DTree
from sklearn.naive_bayes import BernoulliNB,GaussianNB
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix, classification_report

from sklearn.model_selection import cross_val_score

from sklearn import preprocessing
le = preprocessing.LabelEncoder()

#### Define functions

In [25]:
def read_file() -> DataFrame:
    file_data: DataFrame = pd.read_csv('../data/house-votes-84.csv')
    print(file_data.shape)
    print(file_data.isna().sum())
    return file_data

def split_data(x: DataFrame, y: DataFrame) -> tuple:
    z = train_test_split(x, y, test_size=0.2, random_state=0)
    return z

def build_model_dt(_x_train: DataFrame, _y_train: DataFrame) -> DTree:
    _model = DTree(criterion='entropy')
    _model.fit(_x_train, _y_train)
    return _model

def build_model_nb(_x_train: DataFrame, _y_train: DataFrame) -> BernoulliNB:
    bayes_clf = BernoulliNB()
    bayes_clf.fit(_x_train, _y_train)
    return bayes_clf

# TODO fix this.
def eval_model(_model: DTree, _x_test, _y_test) -> None:
    y_pred = _model.predict(_x_test)

    #compare y_pred with actual targets for your test set(y_test) and calculate precision, recall, f1-score
    print("Precision: %0.2f" %precision_score(_y_test, y_pred , average="macro"))
    print("Recall:  %0.2f" %recall_score(_y_test, y_pred , average="macro"))
    print("F1-score:  %0.2f" %f1_score(_y_test, y_pred , average="macro"))

    print(confusion_matrix(_y_test, y_pred))
    print(classification_report(_y_test, y_pred))

def run_all_models(_x_train, _y_train, _x_test, _y_test):
    # Decision Tree model
    dt_model = build_model_dt(_x_train, _y_train)
    eval_model(dt_model, _x_test, _y_test)

    # Naive Bayesian model
    nb_model = build_model_nb(_x_train, _y_train)
    eval_model(nb_model, _x_test, _y_test)

def split_to_x_y(df: DataFrame) -> tuple:
    x = df.drop(columns=['Class Name'])
    y = df[['Class Name']]
    return x, y

def run_cross_validation(model, x, y) -> None:
    #scores = cross_val_score(nb, X, y, cv=5, scoring=scoring)
    print("Precision: %0.2f (+/- %0.2f)" % (cross_val_score(model, x, y, cv=5, scoring='precision').mean(), cross_val_score(model, x, y, cv=5, scoring='precision').std() * 2))
    print("Recall: %0.2f (+/- %0.2f)" % (cross_val_score(model, x, y, cv=5, scoring='recall').mean(), cross_val_score(model, x, y, cv=5, scoring='recall').std() * 2))
    print("F1-score: %0.2f (+/- %0.2f)" % (cross_val_score(model, x, y, cv=5, scoring='f1').mean(), cross_val_score(model, x, y, cv=5, scoring='f1').std() * 2))

def run_cross_validation_all(x, y) -> None:
    print("Naive-Bayes Classifier")
    # use Gaussian's NB if the inputs are a mix of categorical & continuous.
    # nb = GaussianNB()
    # use Bernoulli's NB if all the inputs are categorical.
    nb = BernoulliNB()
    run_cross_validation(nb, x, y)

    print("Decision Tree Classifier")
    clf = tree.DecisionTreeClassifier()
    run_cross_validation(clf, x, y)

def split_and_run_models(_data: DataFrame) -> None:
    x_values, y_values = split_to_x_y(_data)
    x_train, x_test, y_train, y_test = split_data(x_values, y_values)
    
    # run all models
    run_all_models(x_train, y_train, x_test, y_test)
    # run cross validation
    run_cross_validation_all(x_values, y_values)

def impute_1_drop(_data: DataFrame) -> DataFrame:
    _data_2 = _data.dropna(inplace=False)
    return _data_2

def impute_2_new_val(_data: DataFrame) -> DataFrame:
    df_fill = _data.fillna('x',inplace=False)
    return df_fill

def impute_3_mode(_data: DataFrame) -> DataFrame:
    df_fill = pd.DataFrame()
    for col in _data.columns:
        df_fill[col] = _data[col].astype('category')
        df_fill[col].fillna(_data[col].mode()[0],inplace=True)
    return df_fill

#### Option #1 - ignore missing values
- About 50% (203 / 435) of the records dropped due to missing values.

In [26]:
data: DataFrame = read_file()

data_1 = impute_1_drop(data)
data_1_transformed = data_1.apply(le.fit_transform)

split_and_run_models(data_1_transformed)

(435, 17)
Class Name                                  0
handicapped-infants                        12
water-project-cost-sharing                 48
adoption-of-the-budget-resolution          11
physician-fee-freeze                       11
el-salvador-aid                            15
religious-groups-in-schools                11
anti-satellite-test-ban                    14
aid-to-nicaraguan-contras                  15
mx-missile                                 22
immigration                                 7
synfuels-corporation-cutback               21
education-spending                         31
superfund-right-to-sue                     25
crime                                      17
duty-free-exports                          28
export-administration-act-south-africa    104
dtype: int64
Precision: 0.91
Recall:  0.91
F1-score:  0.91
[[21  2]
 [ 2 22]]
              precision    recall  f1-score   support

           0       0.91      0.91      0.91        23
           1       0.

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

#### Option #2 - treat missing values as values

In [20]:
data_2 = impute_2_new_val(data)
data_2_transformed = data_2.apply(le.fit_transform)

split_and_run_models(data_2_transformed)

Precision: 0.93
Recall:  0.93
F1-score:  0.93
[[48  4]
 [ 2 33]]
              precision    recall  f1-score   support

           0       0.96      0.92      0.94        52
           1       0.89      0.94      0.92        35

    accuracy                           0.93        87
   macro avg       0.93      0.93      0.93        87
weighted avg       0.93      0.93      0.93        87

Precision: 0.87
Recall:  0.88
F1-score:  0.87
[[43  9]
 [ 2 33]]
              precision    recall  f1-score   support

           0       0.96      0.83      0.89        52
           1       0.79      0.94      0.86        35

    accuracy                           0.87        87
   macro avg       0.87      0.88      0.87        87
weighted avg       0.89      0.87      0.87        87



  y = column_or_1d(y, warn=True)


#### Option #3 - impute missing values

In [21]:
data_3 = impute_3_mode(data)
data_3_transformed = data_3.apply(le.fit_transform)

split_and_run_models(data_3_transformed)

Precision: 0.93
Recall:  0.93
F1-score:  0.93
[[49  3]
 [ 3 32]]
              precision    recall  f1-score   support

           0       0.94      0.94      0.94        52
           1       0.91      0.91      0.91        35

    accuracy                           0.93        87
   macro avg       0.93      0.93      0.93        87
weighted avg       0.93      0.93      0.93        87

Precision: 0.88
Recall:  0.89
F1-score:  0.88
[[44  8]
 [ 2 33]]
              precision    recall  f1-score   support

           0       0.96      0.85      0.90        52
           1       0.80      0.94      0.87        35

    accuracy                           0.89        87
   macro avg       0.88      0.89      0.88        87
weighted avg       0.90      0.89      0.89        87



  y = column_or_1d(y, warn=True)


### CrossValidation
A great read on this : https://towardsdatascience.com/cross-validation-70289113a072
