In [1]:
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plot
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, roc_auc_score, precision_recall_fscore_support

In [8]:
# load data
with open('tfidfvec.pickle', 'rb') as handle:
    X_train, X_test, y_train, y_test = pickle.load(handle)
    
# print shapes
print('X_train shape: ', X_train.shape)
print('y_train shape: ', y_train.shape)
print('X_test shape: ', X_test.shape)
print('y_test shape: ', y_test.shape)

X_train shape:  (77697, 6000)
y_train shape:  (77697,)
X_test shape:  (19425, 6000)
y_test shape:  (19425,)


### Naive Bayes

The multinomial distribution normally requires integer feature counts. However, in practice, fractional counts such as tf-idf may also work.

Naive bayes classifier does not allow for negative values in the document vectors, therefore, we can't use word embeddings.

In [9]:
# train
model = MultinomialNB();
model.fit(X_train, y_train)

# predict
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# ------------------------
# TRAINING metrics
# ------------------------

print('---------TRAINING METRICS---------')

# accuracy
score = model.score(X_train, y_train)
print('accuracy: ', score); print()

# precision, recall
precision, recall, f1, support = precision_recall_fscore_support(y_train, y_pred_train)
print('precision: ', precision)
print('recall: ', recall)
print('f1 score: ', f1)

# auc
auc = roc_auc_score(y_train, y_pred_train)
print('auc: ', auc)

# confusion matrix
cm = confusion_matrix(y_train, y_pred_train)
print(cm); print()

# -----------------------
# TEST metrics
# -----------------------

print('---------TEST METRICS---------')

# accuracy
score = model.score(X_test, y_test)
print('accuracy: ', score); print()

# precision, recall
precision, recall, f1, support = precision_recall_fscore_support(y_test, y_pred_test)
print('precision: ', precision)
print('recall: ', recall)
print('f1 score: ', f1)

# auc
auc = roc_auc_score(y_test, y_pred_test)
print('auc: ', auc)

# confusion matrix
cm = confusion_matrix(y_test, y_pred_test)
print(cm)

---------TRAINING METRICS---------
accuracy:  0.6943382627385871

precision:  [0.68057543 0.70989144]
recall:  [0.72611036 0.66291186]
f1 score:  [0.70260591 0.68559779]
auc:  0.6945111096034594
[[28054 10582]
 [13167 25894]]

---------TEST METRICS---------
accuracy:  0.6721750321750322

precision:  [0.65658622 0.68983311]
recall:  [0.70569851 0.63942601]
f1 score:  [0.68025708 0.66367381]
auc:  0.6725622614405332
[[6774 2825]
 [3543 6283]]


**Count Vectorization**

training score:  0.6964

precision:  [0.68699839 0.70672149]  
recall:  [0.72001923 0.67284493]  
f1 score:  [0.70312133 0.68936728]  

confusion matrix:  
([[17976,  6990],  
  [ 8190, 16844]])
  
  
**TFIDF Vectorization**

training score:  0.70352

precision:  [0.69444018 0.71341973]  
recall:  [0.72542658 0.68167292]  
f1 score:  [0.70959527 0.69718511]  

confusion matrix:  
([[18111,  6855]  
  [ 7969, 17065]])