### Gaussian and Bernoulli Naive Bayes

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
# Open the .csv files
X_train = pd.read_csv('/Users/arnav/Desktop/MachineLearning/ML_CSE343 Project/train_tfidf.csv')
X_test = pd.read_csv('/Users/arnav/Desktop/MachineLearning/ML_CSE343 Project/test_tfidf.csv')

X_train = X_train.drop(['Unnamed: 0'], axis=1)
X_test = X_test.drop(['Unnamed: 0'], axis=1)

# Check the shape of the data
print(X_train.shape)
print(X_test.shape)

(22500, 1000)
(7500, 1000)


In [3]:
# Extract the features into a numpy array
X_train = X_train.values
X_test = X_test.values

# Open the labels
y_train_df = pd.read_csv('/Users/arnav/Desktop/MachineLearning/ML_CSE343 Project/CSE343-ML-Project/Data/Preprocessed-Data/train_labels.csv')
y_test_df = pd.read_csv('/Users/arnav/Desktop/MachineLearning/ML_CSE343 Project/CSE343-ML-Project/Data/Preprocessed-Data/test_labels.csv')

# Check the shape of the labels
print(y_train_df.shape)
print(y_test_df.shape)

(22500, 1)
(7500, 1)


In [4]:
# Convert class to non_suicide = 0 and suicide = 1
y_train_temp = y_train_df['class'].values
y_test_temp = y_test_df['class'].values

# Check the shape of the labels
print(y_train_temp.shape)
print(y_test_temp.shape)

y_train = np.array([0 if label == "non-suicide" else 1 for label in y_train_temp])
y_test = np.array([0 if label == "non-suicide" else 1 for label in y_test_temp])

# Check the shape of the labels
print(y_train.shape)
print(y_test.shape)

(22500,)
(7500,)
(22500,)
(7500,)


- Running both without Hyperparameter Tuning

In [5]:
# Gaussian Naive Bayes
gnb = GaussianNB()

# Fit
gnb.fit(X_train, y_train)

# Predict
y_pred_gnb = gnb.predict(X_test)

In [6]:
# Print the accuracy score
print("Testing Data: Gaussian Naive Bayes")
print("Accuracy Score:", accuracy_score(y_test, y_pred_gnb))

# Print the confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_gnb))

# Print AUC score
print("AUC Score:", roc_auc_score(y_test, y_pred_gnb))

# Print the classification report
print("Classification Report:\n", classification_report(y_test, y_pred_gnb))

# Print weighted F1-score
print("Weighted F1-score:", f1_score(y_test, y_pred_gnb, average='weighted'))

Testing Data: Gaussian Naive Bayes
Accuracy Score: 0.8542666666666666
Confusion Matrix:
 [[2981  727]
 [ 366 3426]]
AUC Score: 0.8537092226182186
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.80      0.85      3708
           1       0.82      0.90      0.86      3792

    accuracy                           0.85      7500
   macro avg       0.86      0.85      0.85      7500
weighted avg       0.86      0.85      0.85      7500

Weighted F1-score: 0.8538489948302447


In [7]:
# Predict on training data
y_pred_gnb_train = gnb.predict(X_train)

# Print the accuracy score
print("Training Data: Gaussian Naive Bayes")
print("Accuracy Score:", accuracy_score(y_train, y_pred_gnb_train))

# Print the confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_train, y_pred_gnb_train))

# Print AUC score
print("AUC Score:", roc_auc_score(y_train, y_pred_gnb_train))

# Print the classification report
print("Classification Report:\n", classification_report(y_train, y_pred_gnb_train))

# Print weighted F1-score
print("Weighted F1-score:", f1_score(y_train, y_pred_gnb_train, average='weighted'))

Training Data: Gaussian Naive Bayes
Accuracy Score: 0.8561777777777778
Confusion Matrix:
 [[ 9202  2090]
 [ 1146 10062]]
AUC Score: 0.8563324094449009
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.81      0.85     11292
           1       0.83      0.90      0.86     11208

    accuracy                           0.86     22500
   macro avg       0.86      0.86      0.86     22500
weighted avg       0.86      0.86      0.86     22500

Weighted F1-score: 0.8559468020493084


In [8]:
# Bernoulli Naive Bayes
bnb = BernoulliNB()

# Fit
bnb.fit(X_train, y_train)

# Predict
y_pred_bnb = bnb.predict(X_test)

In [9]:
# Print the accuracy score
print("Testing Data: Bernoulli Naive Bayes")
print("Accuracy Score:", accuracy_score(y_test, y_pred_bnb))

# Print the confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_bnb))

# Print AUC score
print("AUC Score:", roc_auc_score(y_test, y_pred_bnb))

# Print the classification report
print("Classification Report:\n", classification_report(y_test, y_pred_bnb))

# Print weighted F1-score
print("Weighted F1-score:", f1_score(y_test, y_pred_bnb, average='weighted'))

Testing Data: Bernoulli Naive Bayes
Accuracy Score: 0.7753333333333333
Confusion Matrix:
 [[3330  378]
 [1307 2485]]
AUC Score: 0.7766926283232969
Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.90      0.80      3708
           1       0.87      0.66      0.75      3792

    accuracy                           0.78      7500
   macro avg       0.79      0.78      0.77      7500
weighted avg       0.79      0.78      0.77      7500

Weighted F1-score: 0.7721576538450977


In [10]:
# Predict on Training Data
y_pred_bnb_train = bnb.predict(X_train)

# Print the accuracy score
print("Training Data: Bernoulli Naive Bayes")
print("Accuracy Score:", accuracy_score(y_train, y_pred_bnb_train))

# Print the confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_train, y_pred_bnb_train))

# Print AUC score
print("AUC Score:", roc_auc_score(y_train, y_pred_bnb_train))

# Print the classification report
print("Classification Report:\n", classification_report(y_train, y_pred_bnb_train))

# Print weighted F1-score
print("Weighted F1-score:", f1_score(y_train, y_pred_bnb_train, average='weighted'))

Training Data: Bernoulli Naive Bayes
Accuracy Score: 0.7825333333333333
Confusion Matrix:
 [[10199  1093]
 [ 3800  7408]]
AUC Score: 0.7820811345471316
Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.90      0.81     11292
           1       0.87      0.66      0.75     11208

    accuracy                           0.78     22500
   macro avg       0.80      0.78      0.78     22500
weighted avg       0.80      0.78      0.78     22500

Weighted F1-score: 0.7792371588062159


### Hyperparameter Tuning
- No hyperparameters for Gaussian Naive Bayes

- For Bernoulli Naive Bayes, we apply Grid Search to tune the following hyperparameters:
1. 'alpha': Additive (Laplace/Lidstone) smoothing parameter (0 by default)
2. 'binarize': Threshold for binarizing the input features. If None, input is presumed to already consist of binary vectors.

In [12]:
# For BernoulliNB
param_grid_bnb = {
    'alpha': [0.001, 0.01, 0.1, 1, 10, 100],
    'binarize': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, None]
}

# For BernoulliNB
clf_bnb = BernoulliNB()
grid_search_bnb = GridSearchCV(clf_bnb, param_grid_bnb, cv=5, verbose=2, n_jobs=-1)

# Fit
grid_search_bnb.fit(X_train, y_train)

Fitting 5 folds for each of 42 candidates, totalling 210 fits
[CV] END ..........................alpha=0.001, binarize=0.1; total time=   2.0s
[CV] END ..........................alpha=0.001, binarize=0.1; total time=   2.0s
[CV] END ..........................alpha=0.001, binarize=0.1; total time=   2.1s
[CV] END ..........................alpha=0.001, binarize=0.0; total time=   2.1s
[CV] END ..........................alpha=0.001, binarize=0.0; total time=   2.1s
[CV] END ..........................alpha=0.001, binarize=0.0; total time=   2.2s
[CV] END ..........................alpha=0.001, binarize=0.0; total time=   2.2s
[CV] END ..........................alpha=0.001, binarize=0.0; total time=   2.2s
[CV] END ..........................alpha=0.001, binarize=0.1; total time=   2.5s
[CV] END ..........................alpha=0.001, binarize=0.1; total time=   2.5s
[CV] END ..........................alpha=0.001, binarize=0.3; total time=   2.3s
[CV] END ..........................alpha=0.001,

In [28]:
# Best Hyperparameters
best_parameters_bnb = grid_search_bnb.best_params_
print("Best Parameters for BernoulliNB:", best_parameters_bnb)


Best Parameters for BernoulliNB: {'alpha': 1, 'binarize': None}


In [12]:
# Best Estimator
best_estimator_bnb = BernoulliNB(alpha=1, binarize=None)

# Fit
best_estimator_bnb.fit(X_train, y_train)

# Predict
y_pred_bnb = best_estimator_bnb.predict(X_test)

# Print the accuracy score
print("Bernoulli Naive Bayes")
print("Accuracy Score:", accuracy_score(y_test, y_pred_bnb))

# Print the confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_bnb))

# Print AUC score
print("AUC Score:", roc_auc_score(y_test, y_pred_bnb))

# Print the classification report
print("Classification Report:\n", classification_report(y_test, y_pred_bnb))

# Print weighted F1-score
print("Weighted F1-score:", f1_score(y_test, y_pred_bnb, average='weighted'))

Bernoulli Naive Bayes
Accuracy Score: 0.8858666666666667
Confusion Matrix:
 [[3319  389]
 [ 467 3325]]
AUC Score: 0.8859688425982822
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.90      0.89      3708
           1       0.90      0.88      0.89      3792

    accuracy                           0.89      7500
   macro avg       0.89      0.89      0.89      7500
weighted avg       0.89      0.89      0.89      7500

Weighted F1-score: 0.8858676162566078


In [13]:
# Predict on Training Data
y_pred_bbnb_train = best_estimator_bnb.predict(X_train)

# Print the accuracy score
print("Training Data: Bernoulli Naive Bayes")
print("Accuracy Score:", accuracy_score(y_train, y_pred_bbnb_train))

# Print the confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_train, y_pred_bbnb_train))

# Print AUC score
print("AUC Score:", roc_auc_score(y_train, y_pred_bbnb_train))

# Print the classification report
print("Classification Report:\n", classification_report(y_train, y_pred_bbnb_train))

# Print weighted F1-score
print("Weighted F1-score:", f1_score(y_train, y_pred_bbnb_train, average='weighted'))

Training Data: Bernoulli Naive Bayes
Accuracy Score: 0.8861777777777777
Confusion Matrix:
 [[10166  1126]
 [ 1435  9773]]
AUC Score: 0.8861249195011003
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.90      0.89     11292
           1       0.90      0.87      0.88     11208

    accuracy                           0.89     22500
   macro avg       0.89      0.89      0.89     22500
weighted avg       0.89      0.89      0.89     22500

Weighted F1-score: 0.8861504662766736


### Principal Component Analysis

In [14]:
pca = PCA(n_components=0.75)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# For BernoulliNB
param_grid_bnb = {
    'alpha': [0.001, 0.01, 0.1, 1, 10, 100],
    'binarize': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, None]
}

# For BernoulliNB
clf_bnb = BernoulliNB()
grid_search_bnb = GridSearchCV(clf_bnb, param_grid_bnb, cv=5, verbose=2, n_jobs=-1)

# Fit
grid_search_bnb.fit(X_train_pca, y_train)

Fitting 5 folds for each of 42 candidates, totalling 210 fits
[CV] END ..........................alpha=0.001, binarize=0.1; total time=   0.4s
[CV] END ..........................alpha=0.001, binarize=0.1; total time=   0.5s
[CV] END ..........................alpha=0.001, binarize=0.1; total time=   0.8s
[CV] END ..........................alpha=0.001, binarize=0.0; total time=   1.3s
[CV] END ..........................alpha=0.001, binarize=0.1; total time=   0.5s
[CV] END ..........................alpha=0.001, binarize=0.0; total time=   1.3s
[CV] END ..........................alpha=0.001, binarize=0.0; total time=   1.3s
[CV] END ..........................alpha=0.001, binarize=0.0; total time=   1.3s
[CV] END ..........................alpha=0.001, binarize=0.1; total time=   0.3s
[CV] END ..........................alpha=0.001, binarize=0.2; total time=   0.3s
[CV] END ..........................alpha=0.001, binarize=0.0; total time=   1.5s
[CV] END ..........................alpha=0.001,

  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(
  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(
  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(
  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(
  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(


[CV] END ..........................alpha=0.001, binarize=0.5; total time=   0.4s
[CV] END .........................alpha=0.001, binarize=None; total time=   0.1s
[CV] END .........................alpha=0.001, binarize=None; total time=   0.1s
[CV] END .........................alpha=0.001, binarize=None; total time=   0.2s
[CV] END ...........................alpha=0.01, binarize=0.1; total time=   0.3s
[CV] END ...........................alpha=0.01, binarize=0.1; total time=   0.3s
[CV] END ...........................alpha=0.01, binarize=0.1; total time=   0.3s
[CV] END ...........................alpha=0.01, binarize=0.0; total time=   0.5s
[CV] END ...........................alpha=0.01, binarize=0.0; total time=   0.6s
[CV] END ...........................alpha=0.01, binarize=0.0; total time=   0.6s
[CV] END ...........................alpha=0.01, binarize=0.1; total time=   0.2s
[CV] END ...........................alpha=0.01, binarize=0.1; total time=   0.3s
[CV] END ...................

  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(
  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(
  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(
  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(
  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(


[CV] END ............................alpha=0.1, binarize=0.1; total time=   0.3s
[CV] END ............................alpha=0.1, binarize=0.1; total time=   0.3s
[CV] END ............................alpha=0.1, binarize=0.1; total time=   0.3s
[CV] END ............................alpha=0.1, binarize=0.0; total time=   0.5s
[CV] END ............................alpha=0.1, binarize=0.0; total time=   0.5s
[CV] END ............................alpha=0.1, binarize=0.0; total time=   0.5s
[CV] END ............................alpha=0.1, binarize=0.0; total time=   0.5s
[CV] END ............................alpha=0.1, binarize=0.0; total time=   0.5s
[CV] END ............................alpha=0.1, binarize=0.1; total time=   0.2s
[CV] END ............................alpha=0.1, binarize=0.1; total time=   0.3s
[CV] END ............................alpha=0.1, binarize=0.2; total time=   0.3s
[CV] END ............................alpha=0.1, binarize=0.2; total time=   0.2s
[CV] END ...................

  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(
  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(
  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(
  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(
  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(


[CV] END ..............................alpha=1, binarize=0.1; total time=   0.4s
[CV] END ..............................alpha=1, binarize=0.1; total time=   0.4s
[CV] END ..............................alpha=1, binarize=0.1; total time=   0.4s
[CV] END ..............................alpha=1, binarize=0.0; total time=   0.7s
[CV] END ..............................alpha=1, binarize=0.0; total time=   0.7s
[CV] END ..............................alpha=1, binarize=0.0; total time=   0.7s
[CV] END ..............................alpha=1, binarize=0.0; total time=   0.7s
[CV] END ..............................alpha=1, binarize=0.0; total time=   0.7s
[CV] END ..............................alpha=1, binarize=0.1; total time=   0.2s
[CV] END ..............................alpha=1, binarize=0.1; total time=   0.2s
[CV] END ..............................alpha=1, binarize=0.2; total time=   0.3s
[CV] END ..............................alpha=1, binarize=0.2; total time=   0.3s
[CV] END ...................

  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(
  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(
  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(
  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(
  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(


[CV] END ..............................alpha=1, binarize=0.5; total time=   0.3s
[CV] END ..............................alpha=1, binarize=0.5; total time=   0.3s
[CV] END ..............................alpha=1, binarize=0.5; total time=   0.3s
[CV] END .............................alpha=1, binarize=None; total time=   0.1s
[CV] END .............................alpha=10, binarize=0.1; total time=   0.3s
[CV] END .............................alpha=10, binarize=0.1; total time=   0.3s
[CV] END .............................alpha=10, binarize=0.1; total time=   0.4s
[CV] END .............................alpha=10, binarize=0.0; total time=   0.6s
[CV] END .............................alpha=10, binarize=0.0; total time=   0.6s
[CV] END .............................alpha=10, binarize=0.0; total time=   0.6s
[CV] END .............................alpha=10, binarize=0.0; total time=   0.7s
[CV] END .............................alpha=10, binarize=0.1; total time=   0.3s
[CV] END ...................

  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(
  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(
  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(
  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(
  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(


[CV] END .............................alpha=10, binarize=0.5; total time=   0.4s
[CV] END ............................alpha=10, binarize=None; total time=   0.1s
[CV] END .............................alpha=10, binarize=0.5; total time=   0.3s
[CV] END ............................alpha=10, binarize=None; total time=   0.1s
[CV] END ............................alpha=10, binarize=None; total time=   0.1s
[CV] END .............................alpha=10, binarize=0.5; total time=   0.3s
[CV] END ............................alpha=10, binarize=None; total time=   0.1s
[CV] END ............................alpha=10, binarize=None; total time=   0.1s
[CV] END ............................alpha=100, binarize=0.1; total time=   0.4s
[CV] END ............................alpha=100, binarize=0.1; total time=   0.4s
[CV] END ............................alpha=100, binarize=0.1; total time=   0.4s
[CV] END ............................alpha=100, binarize=0.0; total time=   0.6s
[CV] END ...................

  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(
  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(
  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(
  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(
  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(


In [15]:
# Best Hyperparameters
best_parameters_bnb = grid_search_bnb.best_params_

# Best Estimator
best_estimator_bnb = grid_search_bnb.best_estimator_

# Predict
y_pred_bnb_pca = best_estimator_bnb.predict(X_test_pca)

# Print the accuracy score
print("PCA : Test Data Bernoulli Naive Bayes")
print("Accuracy Score:", accuracy_score(y_test, y_pred_bnb_pca))

# Print the confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_bnb_pca))

# Print AUC score
print("AUC Score:", roc_auc_score(y_test, y_pred_bnb_pca))

# Print the classification report
print("Classification Report:\n", classification_report(y_test, y_pred_bnb_pca))

# Print weighted F1-score
print("Weighted F1-score:", f1_score(y_test, y_pred_bnb_pca, average='weighted'))

PCA : Test Data Bernoulli Naive Bayes
Accuracy Score: 0.8425333333333334
Confusion Matrix:
 [[3266  442]
 [ 739 3053]]
AUC Score: 0.8429571538787158
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.88      0.85      3708
           1       0.87      0.81      0.84      3792

    accuracy                           0.84      7500
   macro avg       0.84      0.84      0.84      7500
weighted avg       0.84      0.84      0.84      7500

Weighted F1-score: 0.8423560970696526


In [16]:
# Predict on Training Data
y_pred_bnb_pca_train = best_estimator_bnb.predict(X_train_pca)

# Print the accuracy score
print("PCA : Training Data Bernoulli Naive Bayes")
print("Accuracy Score:", accuracy_score(y_train, y_pred_bnb_pca_train))

# Print the confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_train, y_pred_bnb_pca_train))

# Print AUC score
print("AUC Score:", roc_auc_score(y_train, y_pred_bnb_pca_train))

# Print the classification report
print("Classification Report:\n", classification_report(y_train, y_pred_bnb_pca_train))

# Print weighted F1-score
print("Weighted F1-score:", f1_score(y_train, y_pred_bnb_pca_train, average='weighted'))

PCA : Training Data Bernoulli Naive Bayes
Accuracy Score: 0.8387555555555556
Confusion Matrix:
 [[9936 1356]
 [2272 8936]]
AUC Score: 0.8386013178684422
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.88      0.85     11292
           1       0.87      0.80      0.83     11208

    accuracy                           0.84     22500
   macro avg       0.84      0.84      0.84     22500
weighted avg       0.84      0.84      0.84     22500

Weighted F1-score: 0.8384632252020451
