## Model building

In [42]:
import pandas as pd
from pandas import DataFrame
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier as DTree
from sklearn.tree import DecisionTreeRegressor  
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

import warnings

# Blanket-suppress every warning category for the rest of the kernel session.
# NOTE(review): this also hides library deprecation and data-conversion
# warnings; consider narrowing the filter by category once the notebook is stable.
warnings.simplefilter("ignore")

from sklearn import preprocessing

# Unfitted LabelEncoder instance, exposed as a notebook-level name for later cells.
le = preprocessing.LabelEncoder()

#### Read the training dataset from the pickle file and the raw survey CSV

In [43]:
# Training frame, deserialised from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
master_path = './data/master.pickle'
master: DataFrame = pd.read_pickle(master_path)
print(master.shape)

# Survey results CSV; the variable name suggests the 2017 edition (TODO confirm).
survey_csv_path = './data/survey_results_public.csv'
df_2017: DataFrame = pd.read_csv(survey_csv_path)
print(df_2017.shape)

(98855, 91)
(51392, 154)


#### Split dataset into features (X) and target (y)

In [44]:
# Feature matrix: every column of `master` except the target column.
X = master.drop(labels='JobSatisfaction', axis='columns')
X2 = X[:]  # full-range slice of X, consumed by the Naive Bayes split

# Target, kept as a single-column DataFrame (note the double brackets).
y = master[['JobSatisfaction']]
y2 = y[:]  # full-range slice of y, consumed by the Naive Bayes split

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

#### Decision Tree classifier

In [45]:
# Fit a decision tree on the training split and predict labels for the held-out rows.
# random_state is pinned because the tree permutes candidate features at each
# split; without a seed, the fitted tree (and every metric derived from it) can
# change from one run to the next. 42 matches the seed used for the data split.
model_dt = tree.DecisionTreeClassifier(random_state=42)
model_dt.fit(X_train, y_train)
y_pred = model_dt.predict(X_test)

In [47]:
# Score the decision-tree predictions against the held-out labels.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Macro averaging: unweighted mean of the per-class F1 scores.
f1 = f1_score(y_true=y_test, y_pred=y_pred, average='macro')

# Confusion matrix computed from the raw label array.
cf = confusion_matrix(y_test.values, y_pred)

print("Accuracy: %0.2f" % accuracy)
print("F1 score: %0.2f" % f1)
print("Confusion Matrix -", cf)

Accuracy: 0.51
F1 score: 0.33
Confusion Matrix - [[ 147  170   99   62   97  149   79   16]
 [ 154  467  340  192  272  448  145   16]
 [ 101  299  385  247  378  651  232   34]
 [  44  181  260  232  312  476  131   29]
 [  85  272  411  301  614 1234  357   28]
 [ 175  517  718  448 1313 3757 1681   97]
 [  71  156  200  158  367 1645 1443   38]
 [  12   17   29   27   31   91   43 9442]]


#### Naive Bayes

In [48]:
# Train/test partition for the Naive Bayes model. It uses the same test_size
# and random_state as the decision-tree split.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2,
    y2,
    test_size=0.33,
    random_state=42,
)

In [49]:
# Bernoulli Naive Bayes with library defaults (features are binarised at the
# default threshold, binarize=0.0).
model_nb = BernoulliNB()

# fit() expects a 1-D target of shape (n_samples,). y2_train is a single-column
# frame, so flatten it explicitly instead of relying on scikit-learn's implicit
# conversion, which raises a DataConversionWarning that the global warnings
# filter would otherwise hide.
model_nb.fit(X2_train, y2_train.values.ravel())
y2_pred = model_nb.predict(X2_test)

In [50]:
# Score the Naive Bayes predictions against the held-out labels.
# Note: this rebinds `accuracy`, `f1` and `cf` from the decision-tree cell.
accuracy = accuracy_score(y_true=y2_test, y_pred=y2_pred)

# Macro averaging: unweighted mean of the per-class F1 scores.
f1 = f1_score(y_true=y2_test, y_pred=y2_pred, average='macro')

cf = confusion_matrix(y_true=y2_test, y_pred=y2_pred)

print("Accuracy: %0.2f" % accuracy)
print("F1 score: %0.2f" % f1)
print("Confusion Matrix -", cf)

Accuracy: 0.54
F1 score: 0.38
Confusion Matrix - [[ 157  231   72   43   45  132   36  103]
 [  99  601  333  107  147  423   93  231]
 [  40  218  510  159  244  748  118  290]
 [  15   90  275  251  234  477   66  257]
 [  19   99  356  165  564 1513  201  385]
 [  31  138  304  201  721 4866 1464  981]
 [  23   46   30   39  108 1475 1949  408]
 [  92   90   70  322  105  207  147 8659]]
