## Model building

In [102]:
import pandas as pd
from pandas import DataFrame
from sklearn import tree
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings("ignore")

from sklearn import preprocessing
le = preprocessing.LabelEncoder()

#### Define Functions

In [103]:
def run_classification_for_model(_model, _x_train, _x_test, _y_train, _y_test) -> None:
    _model.fit(_x_train, _y_train)
    _y_pred = _model.predict(_x_test)

    print("Accuracy: %0.2f" %accuracy_score(_y_test, _y_pred))
    print("Precision: %0.2f" %precision_score(_y_test, _y_pred, average="macro"))
    print("Recall:  %0.2f" %recall_score(_y_test, _y_pred, average="macro"))
    print("F1-score:  %0.2f" %f1_score(_y_test, _y_pred, average="macro"))

    print(confusion_matrix(_y_test, _y_pred))
    print(classification_report(_y_test, _y_pred))

#### Read training dataset from pickle file

In [104]:
master: DataFrame = pd.read_pickle('./data/master-coded.pickle')
print(master.shape)

(98855, 91)


#### Split dataset into x & y

In [105]:
X = master.drop('JobSatisfaction', axis=1)
y = master[['JobSatisfaction']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

#### Decision Tree classifier

In [106]:
model_dt = tree.DecisionTreeClassifier()
run_classification_for_model(model_dt, X_train, X_test, y_train, y_test)

Accuracy: 0.50
Precision: 0.33
Recall:  0.33
F1-score:  0.33
[[ 144  183   98   74   91  148   67   14]
 [ 169  468  331  193  263  446  147   17]
 [ 103  303  380  238  383  658  231   31]
 [  59  176  255  243  317  469  123   23]
 [  79  297  415  308  612 1188  370   33]
 [ 180  523  722  445 1307 3725 1700  104]
 [  66  169  209  152  366 1647 1429   40]
 [  13   23   30   23   32   86   38 9447]]
              precision    recall  f1-score   support

           1       0.18      0.18      0.18       819
           2       0.22      0.23      0.22      2034
           3       0.16      0.16      0.16      2327
           4       0.14      0.15      0.15      1665
           5       0.18      0.19      0.18      3302
           6       0.45      0.43      0.44      8706
           7       0.35      0.35      0.35      4078
           x       0.97      0.97      0.97      9692

    accuracy                           0.50     32623
   macro avg       0.33      0.33      0.33     3262

#### Naive Bayes Classification

In [107]:
model_nb = BernoulliNB()
run_classification_for_model(model_nb, X_train, X_test, y_train, y_test)

Accuracy: 0.54
Precision: 0.40
Recall:  0.37
F1-score:  0.38
[[ 157  231   72   43   45  132   36  103]
 [  99  601  333  107  147  423   93  231]
 [  40  218  510  159  244  748  118  290]
 [  15   90  275  251  234  477   66  257]
 [  19   99  356  165  564 1513  201  385]
 [  31  138  304  201  721 4866 1464  981]
 [  23   46   30   39  108 1475 1949  408]
 [  92   90   70  322  105  207  147 8659]]
              precision    recall  f1-score   support

           1       0.33      0.19      0.24       819
           2       0.40      0.30      0.34      2034
           3       0.26      0.22      0.24      2327
           4       0.20      0.15      0.17      1665
           5       0.26      0.17      0.21      3302
           6       0.49      0.56      0.52      8706
           7       0.48      0.48      0.48      4078
           x       0.77      0.89      0.82      9692

    accuracy                           0.54     32623
   macro avg       0.40      0.37      0.38     3262

#### Random Forest Classification

In [108]:
model_rf = RandomForestClassifier(n_jobs=10, random_state=0)
run_classification_for_model(model_rf, X_train, X_test, y_train, y_test)

Accuracy: 0.56
Precision: 0.39
Recall:  0.37
F1-score:  0.38
[[ 180  240   75   31   74  176   32   11]
 [ 178  647  285  116  197  520   75   16]
 [  83  374  377  166  361  811  132   23]
 [  34  186  246  218  282  610   65   24]
 [  38  254  315  217  609 1640  213   16]
 [  66  324  404  274  926 5377 1277   58]
 [  31   71   75   61  169 2190 1455   26]
 [  11   18   15   20   32  100   31 9465]]
              precision    recall  f1-score   support

           1       0.29      0.22      0.25       819
           2       0.31      0.32      0.31      2034
           3       0.21      0.16      0.18      2327
           4       0.20      0.13      0.16      1665
           5       0.23      0.18      0.20      3302
           6       0.47      0.62      0.53      8706
           7       0.44      0.36      0.40      4078
           x       0.98      0.98      0.98      9692

    accuracy                           0.56     32623
   macro avg       0.39      0.37      0.38     3262

#### Logistic Regression Classification

In [109]:
model_lr = LogisticRegression()
run_classification_for_model(model_lr, X_train, X_test, y_train, y_test)

Accuracy: 0.30
Precision: 0.06
Recall:  0.12
F1-score:  0.06
[[   0    0    0    0    0    1    0  818]
 [   0    0    0    0    0    2    0 2032]
 [   0    0    0    0    0    1    0 2326]
 [   0    0    0    0    0    6    0 1659]
 [   0    0    0    0    0    3    0 3299]
 [   0    0    0    0    0   16    0 8690]
 [   0    0    0    0    0    9    0 4069]
 [   0    0    0    0    0   54    0 9638]]
              precision    recall  f1-score   support

           1       0.00      0.00      0.00       819
           2       0.00      0.00      0.00      2034
           3       0.00      0.00      0.00      2327
           4       0.00      0.00      0.00      1665
           5       0.00      0.00      0.00      3302
           6       0.17      0.00      0.00      8706
           7       0.00      0.00      0.00      4078
           x       0.30      0.99      0.46      9692

    accuracy                           0.30     32623
   macro avg       0.06      0.12      0.06     3262

#### Read training dataset - #2 from pickle file

In [110]:
master_2: DataFrame = pd.read_pickle('./data/master-coded-2.pickle')
print(master_2.shape)

(98855, 95)


#### Split dataset - #2 into x & y

In [111]:
X2 = master_2.drop('AIFuture', axis=1)
y2 = master_2[['AIFuture']]

X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.33, random_state=42)

#### Decision Tree classifier - #2

In [112]:
model_dt_2 = tree.DecisionTreeClassifier()
run_classification_for_model(model_dt_2, X2_train, X2_test, y2_train, y2_test)

Accuracy: 0.69
Precision: 0.50
Recall:  0.50
F1-score:  0.50
[[12012  1373  3166   198]
 [ 1265   197   384    68]
 [ 2875   364  1059    96]
 [  236    70   101  9159]]
              precision    recall  f1-score   support

     EXCITED       0.73      0.72      0.72     16749
 NO-COMMENTS       0.10      0.10      0.10      1914
     WORRIED       0.22      0.24      0.23      4394
           x       0.96      0.96      0.96      9566

    accuracy                           0.69     32623
   macro avg       0.50      0.50      0.50     32623
weighted avg       0.69      0.69      0.69     32623



#### Naive Bayes Classification - #2

In [113]:
model_nb_2 = BernoulliNB()
run_classification_for_model(model_nb_2, X2_train, X2_test, y2_train, y2_test)

Accuracy: 0.78
Precision: 0.59
Recall:  0.50
F1-score:  0.49
[[16115   114   213   307]
 [ 1678   106    62    68]
 [ 4002    67   211   114]
 [  301   134    21  9110]]
              precision    recall  f1-score   support

     EXCITED       0.73      0.96      0.83     16749
 NO-COMMENTS       0.25      0.06      0.09      1914
     WORRIED       0.42      0.05      0.09      4394
           x       0.95      0.95      0.95      9566

    accuracy                           0.78     32623
   macro avg       0.59      0.50      0.49     32623
weighted avg       0.72      0.78      0.72     32623



#### Random Forest Classification

In [114]:
model_rf_2 = RandomForestClassifier(n_jobs=10, random_state=0)
run_classification_for_model(model_rf_2, X2_train, X2_test, y2_train, y2_test)

Accuracy: 0.79
Precision: 0.56
Recall:  0.50
F1-score:  0.49
[[16056    71   502   120]
 [ 1713    38   102    61]
 [ 4017    39   279    59]
 [  267     8    19  9272]]
              precision    recall  f1-score   support

     EXCITED       0.73      0.96      0.83     16749
 NO-COMMENTS       0.24      0.02      0.04      1914
     WORRIED       0.31      0.06      0.11      4394
           x       0.97      0.97      0.97      9566

    accuracy                           0.79     32623
   macro avg       0.56      0.50      0.49     32623
weighted avg       0.72      0.79      0.73     32623



#### Logistic Regression Classification

In [115]:
model_lr_2 = LogisticRegression()
run_classification_for_model(model_lr_2, X2_train, X2_test, y2_train, y2_test)

Accuracy: 0.78
Precision: 0.41
Recall:  0.49
F1-score:  0.44
[[16245     0     0   504]
 [ 1716     0     0   198]
 [ 4101     0     0   293]
 [  218     0     0  9348]]
              precision    recall  f1-score   support

     EXCITED       0.73      0.97      0.83     16749
 NO-COMMENTS       0.00      0.00      0.00      1914
     WORRIED       0.00      0.00      0.00      4394
           x       0.90      0.98      0.94      9566

    accuracy                           0.78     32623
   macro avg       0.41      0.49      0.44     32623
weighted avg       0.64      0.78      0.70     32623

