## Model building

In [84]:
import pandas as pd
from pandas import DataFrame
from sklearn import tree
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report

import warnings
# Blanket filter: suppresses every warning category for the rest of the session,
# including scikit-learn's data-conversion and undefined-metric warnings.
warnings.filterwarnings(action="ignore")

from sklearn import preprocessing
# Module-level label encoder instance.
# NOTE(review): no cell visible in this section uses `le` — confirm it is still needed.
le = preprocessing.LabelEncoder()

#### Define Functions

In [85]:
def run_classification_for_model(_model, _x_train, _x_test, _y_train, _y_test) -> None:
    """Fit a classifier on the training split and print its test-split metrics.

    Args:
        _model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        _x_train: Training features.
        _x_test: Test features.
        _y_train: Training labels.
        _y_test: Test labels that the predictions are scored against.

    Prints, in order: accuracy, macro-averaged precision, recall and F1
    (two decimals each), the confusion matrix, and the per-class
    classification report. Nothing is returned; ``_model.fit`` is called on
    the object passed in, so the caller's model is the one that gets trained.
    """
    _model.fit(_x_train, _y_train)
    predicted = _model.predict(_x_test)

    print("Accuracy: %0.2f" % accuracy_score(_y_test, predicted))

    # Format strings are emitted verbatim, including the spacing after each label.
    macro_reports = (
        ("Precision: %0.2f", precision_score),
        ("Recall:  %0.2f", recall_score),
        ("F1-score:  %0.2f", f1_score),
    )
    for template, scorer in macro_reports:
        print(template % scorer(_y_test, predicted, average="macro"))

    # Confusion matrix first, then the per-class report.
    for summarise in (confusion_matrix, classification_report):
        print(summarise(_y_test, predicted))

#### Read training dataset from pickle file

In [86]:
# NOTE(review): unpickling can execute arbitrary code — only load this file if its origin is trusted.
master: DataFrame = pd.read_pickle(filepath_or_buffer='./data/master-coded.pickle')
# Quick sanity check of (rows, columns) after loading.
print(master.shape)

#### Split dataset into features (X) and target (y), then into train/test sets

In [87]:
# Features: every column except the target.
X = master.drop(columns='JobSatisfaction')
# Target as a 1-D Series (single brackets). Double brackets would give a one-column
# DataFrame, i.e. a column-vector y, which scikit-learn estimators expecting 1-D labels
# have to convert (raising a DataConversionWarning).
y = master['JobSatisfaction']

# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

#### Decision Tree classifier

In [88]:
# Seeded for reproducibility: scikit-learn's decision tree permutes features at every
# split, so an unseeded tree (and its metrics) can differ from one run to the next.
model_dt = tree.DecisionTreeClassifier(random_state=42)
run_classification_for_model(model_dt, X_train, X_test, y_train, y_test)

#### Naive Bayes Classification

In [89]:
# Bernoulli naive Bayes evaluated on the same train/test split as the decision tree.
# NOTE(review): BernoulliNB thresholds each feature at 0 by default (binarize=0.0) —
# confirm the coded features are meant to be treated as binary.
model_nb = BernoulliNB()
run_classification_for_model(
    _model=model_nb,
    _x_train=X_train,
    _x_test=X_test,
    _y_train=y_train,
    _y_test=y_test,
)

#### Read training dataset - #2 from pickle file

In [94]:
# NOTE(review): unpickling can execute arbitrary code — only load this file if its origin is trusted.
master_2: DataFrame = pd.read_pickle(filepath_or_buffer='./data/master-coded-2.pickle')
# Quick sanity check of (rows, columns) after loading.
print(master_2.shape)

(98855, 95)


#### Split dataset - #2 into features (X) and target (y), then into train/test sets

In [95]:
# Features: every column except the target.
X2 = master_2.drop(columns='AIFuture')
# Target as a 1-D Series (single brackets). Double brackets would give a one-column
# DataFrame, i.e. a column-vector y, which scikit-learn estimators expecting 1-D labels
# have to convert (raising a DataConversionWarning).
y2 = master_2['AIFuture']

# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.33, random_state=42)

#### Decision Tree classifier - #2

In [96]:
# Seeded for reproducibility: scikit-learn's decision tree permutes features at every
# split, so an unseeded tree (and its metrics) can differ from one run to the next.
model_dt_2 = tree.DecisionTreeClassifier(random_state=42)
run_classification_for_model(model_dt_2, X2_train, X2_test, y2_train, y2_test)

Accuracy: 0.69
Precision: 0.50
Recall:  0.50
F1-score:  0.50
[[11991  1422  3132   204]
 [ 1255   195   395    69]
 [ 2888   350  1047   109]
 [  220    67   103  9176]]
              precision    recall  f1-score   support

     EXCITED       0.73      0.72      0.72     16749
 NO-COMMENTS       0.10      0.10      0.10      1914
     WORRIED       0.22      0.24      0.23      4394
           x       0.96      0.96      0.96      9566

    accuracy                           0.69     32623
   macro avg       0.50      0.50      0.50     32623
weighted avg       0.69      0.69      0.69     32623



#### Naive Bayes Classification - #2

In [97]:
# Bernoulli naive Bayes evaluated on the dataset #2 train/test split.
# NOTE(review): BernoulliNB thresholds each feature at 0 by default (binarize=0.0) —
# confirm the coded features are meant to be treated as binary.
model_nb_2 = BernoulliNB()
run_classification_for_model(
    _model=model_nb_2,
    _x_train=X2_train,
    _x_test=X2_test,
    _y_train=y2_train,
    _y_test=y2_test,
)


Accuracy: 0.78
Precision: 0.59
Recall:  0.50
F1-score:  0.49
[[16115   114   213   307]
 [ 1678   106    62    68]
 [ 4002    67   211   114]
 [  301   134    21  9110]]
              precision    recall  f1-score   support

     EXCITED       0.73      0.96      0.83     16749
 NO-COMMENTS       0.25      0.06      0.09      1914
     WORRIED       0.42      0.05      0.09      4394
           x       0.95      0.95      0.95      9566

    accuracy                           0.78     32623
   macro avg       0.59      0.50      0.49     32623
weighted avg       0.72      0.78      0.72     32623

