## Model building

In [17]:
import pandas as pd
from pandas import DataFrame
from sklearn import tree
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

import warnings
# Suppresses every warning for the rest of the session (no category or module filter is given).
# NOTE(review): this also hides library diagnostics raised while fitting/scoring the models
# below; consider narrowing the filter to specific warning categories.
warnings.filterwarnings("ignore")

from sklearn import preprocessing
# Module-level LabelEncoder instance; it is not referenced by any cell visible in this section.
le = preprocessing.LabelEncoder()

#### Define Functions

In [18]:
def _as_1d_target(_y):
    """Return a single-column 2-D target as 1-D; pass anything else through untouched."""
    _shape = getattr(_y, "shape", None)
    if _shape is None or len(_shape) != 2 or _shape[1] != 1:
        return _y
    # DataFrame -> its only column as a Series; ndarray -> flattened array.
    return _y.iloc[:, 0] if hasattr(_y, "iloc") else _y.ravel()


def run_classification_for_model(_model, _x_train, _x_test, _y_train, _y_test) -> None:
    """Fit ``_model`` on the training split and print its scores on the test split.

    Prints, in order: accuracy, macro-averaged precision, recall and F1, the
    confusion matrix, and the per-class classification report.

    Parameters
    ----------
    _model : estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    _x_train, _x_test : feature tables for the training and test splits.
    _y_train, _y_test : target labels for the two splits. A single-column
        DataFrame / (n, 1) array is accepted and flattened to 1-D first, so the
        estimator does not have to convert a column vector itself.

    Returns
    -------
    None. All results are written to stdout.
    """
    # Estimators document y as shape (n_samples,); normalise column vectors up front
    # instead of relying on the library's implicit conversion.
    _y_train = _as_1d_target(_y_train)
    _y_test = _as_1d_target(_y_test)

    _model.fit(_x_train, _y_train)
    _y_pred = _model.predict(_x_test)

    # Macro averaging weights every class equally, regardless of its support.
    print("Accuracy: %0.2f" %accuracy_score(_y_test, _y_pred))
    print("Precision: %0.2f" %precision_score(_y_test, _y_pred, average="macro"))
    print("Recall:  %0.2f" %recall_score(_y_test, _y_pred, average="macro"))
    print("F1-score:  %0.2f" %f1_score(_y_test, _y_pred, average="macro"))

    print(confusion_matrix(_y_test, _y_pred))
    print(classification_report(_y_test, _y_pred))

#### Read training dataset from pickle file

In [19]:
# Load the first modelling table from disk.
# NOTE(review): unpickling can execute code embedded in the file; only load pickles
# produced by this project.
master: DataFrame = pd.read_pickle("./data/master-pca.pickle")
print(master.shape)  # (rows, columns) sanity check

(98855, 12)


#### Split dataset into x & y

In [20]:
# Features: every column except the target.
X = master.drop('JobSatisfaction', axis=1)
# Target as a 1-D Series (single brackets). Double brackets would give an (n, 1) DataFrame,
# which estimators only accept after an implicit column-vector conversion.
y = master['JobSatisfaction']

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

#### Decision Tree classifier

In [21]:
# Seeded so repeated runs build the same tree: an unseeded DecisionTreeClassifier permutes
# the candidate features at every split, so its scores can change from run to run.
model_dt = tree.DecisionTreeClassifier(random_state=0)
run_classification_for_model(model_dt, X_train, X_test, y_train, y_test)

Accuracy: 0.43
Precision: 0.27
Recall:  0.27
F1-score:  0.27
[[  81  150   94   61  119  180   84   50]
 [ 118  357  275  187  270  507  207  113]
 [ 106  308  301  149  349  715  259  140]
 [  75  187  201  162  232  484  184  140]
 [ 111  299  356  253  537 1090  474  182]
 [ 182  567  755  544 1141 3425 1583  509]
 [  70  215  285  207  477 1557  982  285]
 [  59  119  137  161  208  513  322 8173]]
              precision    recall  f1-score   support

           1       0.10      0.10      0.10       819
           2       0.16      0.18      0.17      2034
           3       0.13      0.13      0.13      2327
           4       0.09      0.10      0.10      1665
           5       0.16      0.16      0.16      3302
           6       0.40      0.39      0.40      8706
           7       0.24      0.24      0.24      4078
           x       0.85      0.84      0.85      9692

    accuracy                           0.43     32623
   macro avg       0.27      0.27      0.27     32623

#### Naive Bayes Classification

In [22]:
# NOTE(review): BernoulliNB thresholds every feature at its default binarize=0.0; if the
# features are continuous (the pickle name hints at PCA components — confirm), GaussianNB,
# which is already imported, may be the better fit.
model_nb = BernoulliNB()
run_classification_for_model(
    _model=model_nb,
    _x_train=X_train,
    _x_test=X_test,
    _y_train=y_train,
    _y_test=y_test,
)

Accuracy: 0.45
Precision: 0.15
Recall:  0.20
F1-score:  0.15
[[   0    5    0    0    0  677    0  137]
 [   0    9    0    0    0 1701    0  324]
 [   0    5    0    0    0 1924    0  398]
 [   0    3    0    0    0 1304    0  358]
 [   0    9    0    0    0 2676    0  617]
 [   0    7    0    0    0 6936    0 1763]
 [   0    1    0    0    0 3164    0  913]
 [   0    4    0    0    0 1877    0 7811]]
              precision    recall  f1-score   support

           1       0.00      0.00      0.00       819
           2       0.21      0.00      0.01      2034
           3       0.00      0.00      0.00      2327
           4       0.00      0.00      0.00      1665
           5       0.00      0.00      0.00      3302
           6       0.34      0.80      0.48      8706
           7       0.00      0.00      0.00      4078
           x       0.63      0.81      0.71      9692

    accuracy                           0.45     32623
   macro avg       0.15      0.20      0.15     32623

#### Random Forest Classification

In [23]:
# random_state pins the forest's randomness; n_jobs asks for 10 parallel workers.
model_rf = RandomForestClassifier(
    random_state=0,
    n_jobs=10,
)
run_classification_for_model(
    _model=model_rf,
    _x_train=X_train,
    _x_test=X_test,
    _y_train=y_train,
    _y_test=y_test,
)

Accuracy: 0.48
Precision: 0.28
Recall:  0.28
F1-score:  0.27
[[  69  197   89   29   79  244   46   66]
 [ 121  408  237   94  251  723   97  103]
 [  79  319  221   89  296 1008  177  138]
 [  42  192  161   77  202  739  110  142]
 [  59  269  255  122  425 1721  268  183]
 [ 107  429  476  227  951 5030 1012  474]
 [  27  126  147   56  318 2398  712  294]
 [  28   77   76   54  114  507  166 8670]]
              precision    recall  f1-score   support

           1       0.13      0.08      0.10       819
           2       0.20      0.20      0.20      2034
           3       0.13      0.09      0.11      2327
           4       0.10      0.05      0.06      1665
           5       0.16      0.13      0.14      3302
           6       0.41      0.58      0.48      8706
           7       0.28      0.17      0.21      4078
           x       0.86      0.89      0.88      9692

    accuracy                           0.48     32623
   macro avg       0.28      0.28      0.27     32623

#### Logistic Regression Classification

In [24]:
# NOTE(review): default settings (max_iter=100). Any ConvergenceWarning would be hidden by
# the notebook-wide warnings filter — confirm the solver actually converges on this data.
model_lr = LogisticRegression()
run_classification_for_model(
    _model=model_lr,
    _x_train=X_train,
    _x_test=X_test,
    _y_train=y_train,
    _y_test=y_test,
)

Accuracy: 0.47
Precision: 0.16
Recall:  0.22
F1-score:  0.17
[[   0  133    0    0    0  613    0   73]
 [   0  204    0    0    0 1663    0  167]
 [   0  109    0    0    0 1901    0  317]
 [   0   52    0    0    0 1306    0  307]
 [   0   59    0    0    0 2716    0  527]
 [   0   88    0    0    0 6820    0 1798]
 [   0   17    0    0    0 2941    0 1120]
 [   0  110    0    0    0 1135    0 8447]]
              precision    recall  f1-score   support

           1       0.00      0.00      0.00       819
           2       0.26      0.10      0.15      2034
           3       0.00      0.00      0.00      2327
           4       0.00      0.00      0.00      1665
           5       0.00      0.00      0.00      3302
           6       0.36      0.78      0.49      8706
           7       0.00      0.00      0.00      4078
           x       0.66      0.87      0.75      9692

    accuracy                           0.47     32623
   macro avg       0.16      0.22      0.17     32623

#### Read training dataset - #2 from pickle file

In [25]:
# Load the second modelling table from disk.
# NOTE(review): unpickling can execute code embedded in the file; only load pickles
# produced by this project.
master_2: DataFrame = pd.read_pickle("./data/master-pca-2.pickle")
print(master_2.shape)  # (rows, columns) sanity check

(98855, 12)


#### Split dataset - #2 into x & y

In [26]:
# Features: every column except the target.
X2 = master_2.drop('AIFuture', axis=1)
# Target as a 1-D Series (single brackets). Double brackets would give an (n, 1) DataFrame,
# which estimators only accept after an implicit column-vector conversion.
y2 = master_2['AIFuture']

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split reproducible.
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.33, random_state=42)

#### Decision Tree classifier - #2

In [27]:
# Seeded so repeated runs build the same tree: an unseeded DecisionTreeClassifier permutes
# the candidate features at every split, so its scores can change from run to run.
model_dt_2 = tree.DecisionTreeClassifier(random_state=0)
run_classification_for_model(model_dt_2, X2_train, X2_test, y2_train, y2_test)

Accuracy: 0.67
Precision: 0.49
Recall:  0.49
F1-score:  0.49
[[11743  1464  3257   285]
 [ 1227   201   394    92]
 [ 2937   417   934   106]
 [  268    94   101  9103]]
              precision    recall  f1-score   support

     EXCITED       0.73      0.70      0.71     16749
 NO-COMMENTS       0.09      0.11      0.10      1914
     WORRIED       0.20      0.21      0.21      4394
           x       0.95      0.95      0.95      9566

    accuracy                           0.67     32623
   macro avg       0.49      0.49      0.49     32623
weighted avg       0.68      0.67      0.68     32623



#### Naive Bayes Classification - #2

In [28]:
# NOTE(review): BernoulliNB thresholds every feature at its default binarize=0.0; if the
# features are continuous (the pickle name hints at PCA components — confirm), GaussianNB,
# which is already imported, may be the better fit.
model_nb_2 = BernoulliNB()
run_classification_for_model(
    _model=model_nb_2,
    _x_train=X2_train,
    _x_test=X2_test,
    _y_train=y2_train,
    _y_test=y2_test,
)

Accuracy: 0.75
Precision: 0.38
Recall:  0.47
F1-score:  0.42
[[15189     0     0  1560]
 [ 1606     0     0   308]
 [ 3889     0     0   505]
 [  213     0     0  9353]]
              precision    recall  f1-score   support

     EXCITED       0.73      0.91      0.81     16749
 NO-COMMENTS       0.00      0.00      0.00      1914
     WORRIED       0.00      0.00      0.00      4394
           x       0.80      0.98      0.88      9566

    accuracy                           0.75     32623
   macro avg       0.38      0.47      0.42     32623
weighted avg       0.61      0.75      0.67     32623



#### Naive Bayes Classification - #2 (repeated run)

In [29]:
# NOTE(review): this cell repeats the preceding BernoulliNB cell (same estimator, same
# split) and rebinds model_nb_2; it looks like a leftover duplicate — confirm and remove,
# or swap in a different Naive Bayes variant if a comparison was intended.
model_nb_2 = BernoulliNB()
run_classification_for_model(
    _model=model_nb_2,
    _x_train=X2_train,
    _x_test=X2_test,
    _y_train=y2_train,
    _y_test=y2_test,
)

Accuracy: 0.75
Precision: 0.38
Recall:  0.47
F1-score:  0.42
[[15189     0     0  1560]
 [ 1606     0     0   308]
 [ 3889     0     0   505]
 [  213     0     0  9353]]
              precision    recall  f1-score   support

     EXCITED       0.73      0.91      0.81     16749
 NO-COMMENTS       0.00      0.00      0.00      1914
     WORRIED       0.00      0.00      0.00      4394
           x       0.80      0.98      0.88      9566

    accuracy                           0.75     32623
   macro avg       0.38      0.47      0.42     32623
weighted avg       0.61      0.75      0.67     32623



#### Random Forest Classification - #2

In [30]:
# random_state pins the forest's randomness; n_jobs asks for 10 parallel workers.
model_rf_2 = RandomForestClassifier(
    random_state=0,
    n_jobs=10,
)
run_classification_for_model(
    _model=model_rf_2,
    _x_train=X2_train,
    _x_test=X2_test,
    _y_train=y2_train,
    _y_test=y2_test,
)

Accuracy: 0.78
Precision: 0.52
Recall:  0.49
F1-score:  0.47
[[15986    73   545   145]
 [ 1749    20    79    66]
 [ 4091    24   215    64]
 [  307    22    29  9208]]
              precision    recall  f1-score   support

     EXCITED       0.72      0.95      0.82     16749
 NO-COMMENTS       0.14      0.01      0.02      1914
     WORRIED       0.25      0.05      0.08      4394
           x       0.97      0.96      0.97      9566

    accuracy                           0.78     32623
   macro avg       0.52      0.49      0.47     32623
weighted avg       0.70      0.78      0.72     32623



#### Logistic Regression Classification - #2

In [31]:
# NOTE(review): default settings (max_iter=100). Any ConvergenceWarning would be hidden by
# the notebook-wide warnings filter — confirm the solver actually converges on this data.
model_lr_2 = LogisticRegression()
run_classification_for_model(
    _model=model_lr_2,
    _x_train=X2_train,
    _x_test=X2_test,
    _y_train=y2_train,
    _y_test=y2_test,
)

Accuracy: 0.79
Precision: 0.42
Recall:  0.49
F1-score:  0.45
[[16387     0     0   362]
 [ 1780     0     0   134]
 [ 4253     0     0   141]
 [  218     0     0  9348]]
              precision    recall  f1-score   support

     EXCITED       0.72      0.98      0.83     16749
 NO-COMMENTS       0.00      0.00      0.00      1914
     WORRIED       0.00      0.00      0.00      4394
           x       0.94      0.98      0.96      9566

    accuracy                           0.79     32623
   macro avg       0.42      0.49      0.45     32623
weighted avg       0.65      0.79      0.71     32623

