## Model building

In [7]:
import pandas as pd
from pandas import DataFrame
from sklearn import tree
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report

import warnings
# NOTE(review): this installs an "ignore" filter for every warning category,
# so any diagnostics emitted by later cells (e.g. by pandas or scikit-learn)
# are hidden as well. Consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

from sklearn import preprocessing
# Label encoder instance. It is not referenced by any of the cells visible in
# this section of the notebook -- TODO confirm whether a later cell uses it.
le = preprocessing.LabelEncoder()

#### Define Functions

In [8]:
def _as_1d_target(_y):
    """Return ``_y`` as a 1-D target when it is a single-column 2-D object.

    Objects without a ``shape`` attribute, 1-D objects, and 2-D objects with
    more than one column are returned unchanged.
    """
    _shape = getattr(_y, "shape", None)
    if _shape is None or len(_shape) != 2 or _shape[1] != 1:
        return _y
    # pandas objects expose positional column selection through ``iloc``;
    # array-like objects support the same selection via ``[:, 0]``.
    if hasattr(_y, "iloc"):
        return _y.iloc[:, 0]
    return _y[:, 0]


def run_classification_for_model(_model, _x_train, _x_test, _y_train, _y_test) -> None:
    """Fit ``_model`` on the training split and print its test-split metrics.

    Prints accuracy, macro-averaged precision / recall / F1, the confusion
    matrix and the per-class classification report, in that order.

    Parameters
    ----------
    _model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place.
    _x_train, _x_test
        Feature sets used for fitting and for prediction respectively.
    _y_train, _y_test
        Targets for the two splits. A single-column 2-D target (for example
        ``frame[['col']]``) is flattened to 1-D before use, so the estimator
        and the metric functions receive the 1-D shape they expect instead of
        relying on implicit column-vector conversion.

    Returns
    -------
    None
        Results are only printed.
    """
    _y_train = _as_1d_target(_y_train)
    _y_test = _as_1d_target(_y_test)

    _model.fit(_x_train, _y_train)
    _y_pred = _model.predict(_x_test)

    # Headline metrics; "macro" gives every class equal weight regardless of
    # how many samples it has.
    print("Accuracy: %0.2f" % accuracy_score(_y_test, _y_pred))
    print("Precision: %0.2f" % precision_score(_y_test, _y_pred, average="macro"))
    print("Recall:  %0.2f" % recall_score(_y_test, _y_pred, average="macro"))
    print("F1-score:  %0.2f" % f1_score(_y_test, _y_pred, average="macro"))

    print(confusion_matrix(_y_test, _y_pred))
    print(classification_report(_y_test, _y_pred))

#### Read training dataset from pickle file

In [9]:
# Load the first pickled training frame and report its (rows, columns) shape.
# NOTE(review): unpickling can execute arbitrary code -- only load files that
# were produced by a trusted step of this project.
master: DataFrame
master = pd.read_pickle('./data/master-pca.pickle')
print(master.shape)

(98855, 12)


#### Split dataset into x & y

In [10]:
# Predictors are every column except the target; the target is kept as a
# single-column DataFrame.
X = master.drop(labels='JobSatisfaction', axis='columns')
y = master.loc[:, ['JobSatisfaction']]

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

#### Decision Tree classifier

In [11]:
# Seed the tree so a fresh "Restart & Run All" reproduces the same fit: the
# estimator randomly permutes features at each split, so an unseeded tree can
# differ between runs. The seed matches the one used for the train/test split.
# Metrics captured from an earlier unseeded run may differ slightly.
model_dt = tree.DecisionTreeClassifier(random_state=42)
run_classification_for_model(model_dt, X_train, X_test, y_train, y_test)

Accuracy: 0.43
Precision: 0.26
Recall:  0.26
F1-score:  0.26
[[  68  162   88   62  121  191   77   50]
 [ 142  358  277  151  279  508  217  102]
 [ 112  286  291  170  346  703  268  151]
 [  56  182  198  144  245  513  190  137]
 [ 103  299  353  243  510 1150  452  192]
 [ 197  574  738  540 1127 3424 1596  510]
 [  68  221  276  216  468 1528 1003  298]
 [  65  119  147  146  190  508  354 8163]]
              precision    recall  f1-score   support

           1       0.08      0.08      0.08       819
           2       0.16      0.18      0.17      2034
           3       0.12      0.13      0.12      2327
           4       0.09      0.09      0.09      1665
           5       0.16      0.15      0.15      3302
           6       0.40      0.39      0.40      8706
           7       0.24      0.25      0.24      4078
           x       0.85      0.84      0.85      9692

    accuracy                           0.43     32623
   macro avg       0.26      0.26      0.26     32623

#### Naive Bayes

In [12]:
# Bernoulli naive Bayes baseline on the first dataset.
# NOTE(review): BernoulliNB binarises each feature at its `binarize` threshold
# (default 0.0). If these columns are continuous components, GaussianNB
# (already imported) may be a better fit -- confirm against the data.
model_nb = BernoulliNB()
run_classification_for_model(
    _model=model_nb,
    _x_train=X_train,
    _x_test=X_test,
    _y_train=y_train,
    _y_test=y_test,
)

Accuracy: 0.45
Precision: 0.15
Recall:  0.20
F1-score:  0.15
[[   0    5    0    0    0  677    0  137]
 [   0    9    0    0    0 1701    0  324]
 [   0    5    0    0    0 1924    0  398]
 [   0    3    0    0    0 1304    0  358]
 [   0    9    0    0    0 2676    0  617]
 [   0    7    0    0    0 6936    0 1763]
 [   0    1    0    0    0 3164    0  913]
 [   0    4    0    0    0 1877    0 7811]]
              precision    recall  f1-score   support

           1       0.00      0.00      0.00       819
           2       0.21      0.00      0.01      2034
           3       0.00      0.00      0.00      2327
           4       0.00      0.00      0.00      1665
           5       0.00      0.00      0.00      3302
           6       0.34      0.80      0.48      8706
           7       0.00      0.00      0.00      4078
           x       0.63      0.81      0.71      9692

    accuracy                           0.45     32623
   macro avg       0.15      0.20      0.15     32623

#### Read training dataset - #2 from pickle file

In [13]:
# Load the second pickled training frame and report its (rows, columns) shape.
# NOTE(review): unpickling can execute arbitrary code -- only load files that
# were produced by a trusted step of this project.
master_2: DataFrame
master_2 = pd.read_pickle('./data/master-pca-2.pickle')
print(master_2.shape)

(98855, 12)


#### Split dataset - #2 into x & y

In [14]:
# Predictors are every column except the target; the target is kept as a
# single-column DataFrame.
X2 = master_2.drop(labels='AIFuture', axis='columns')
y2 = master_2.loc[:, ['AIFuture']]

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split stable.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2,
    y2,
    test_size=0.33,
    random_state=42,
)

#### Decision Tree classifier - #2

In [15]:
# Seed the tree so a fresh "Restart & Run All" reproduces the same fit: the
# estimator randomly permutes features at each split, so an unseeded tree can
# differ between runs. The seed matches the one used for the train/test split.
# Metrics captured from an earlier unseeded run may differ slightly.
model_dt_2 = tree.DecisionTreeClassifier(random_state=42)
run_classification_for_model(model_dt_2, X2_train, X2_test, y2_train, y2_test)

Accuracy: 0.68
Precision: 0.49
Recall:  0.49
F1-score:  0.49
[[11803  1384  3260   302]
 [ 1238   176   404    96]
 [ 2921   401   969   103]
 [  271    95   107  9093]]
              precision    recall  f1-score   support

     EXCITED       0.73      0.70      0.72     16749
 NO-COMMENTS       0.09      0.09      0.09      1914
     WORRIED       0.20      0.22      0.21      4394
           x       0.95      0.95      0.95      9566

    accuracy                           0.68     32623
   macro avg       0.49      0.49      0.49     32623
weighted avg       0.68      0.68      0.68     32623



#### Naive Bayes Classification - #2

In [16]:
# Bernoulli naive Bayes baseline on the second dataset.
# NOTE(review): BernoulliNB binarises each feature at its `binarize` threshold
# (default 0.0). If these columns are continuous components, GaussianNB
# (already imported) may be a better fit -- confirm against the data.
model_nb_2 = BernoulliNB()
run_classification_for_model(
    _model=model_nb_2,
    _x_train=X2_train,
    _x_test=X2_test,
    _y_train=y2_train,
    _y_test=y2_test,
)


Accuracy: 0.75
Precision: 0.38
Recall:  0.47
F1-score:  0.42
[[15189     0     0  1560]
 [ 1606     0     0   308]
 [ 3889     0     0   505]
 [  213     0     0  9353]]
              precision    recall  f1-score   support

     EXCITED       0.73      0.91      0.81     16749
 NO-COMMENTS       0.00      0.00      0.00      1914
     WORRIED       0.00      0.00      0.00      4394
           x       0.80      0.98      0.88      9566

    accuracy                           0.75     32623
   macro avg       0.38      0.47      0.42     32623
weighted avg       0.61      0.75      0.67     32623

