# Using Bernoulli Naive Bayes Classifier for Spambase Dataset

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Read the data file. It has no header row, so column 57 is given the
# explicit name 'class' before being separated from the other columns.
data = pd.read_csv("spambase.data", header=None).rename(columns={57: 'class'})

# Detach the 'class' column as the target vector; the columns left in
# `data` become the feature matrix.
y = data.pop('class').to_numpy()
X = data.to_numpy()

It is important to shuffle the data. We then split the data into 5 folds for cross-validation.

In [3]:
# Cross validation: 5 shuffled folds.
# random_state pins the shuffle so the fold assignment -- and therefore every
# metric reported below -- is identical on each Restart & Run All.
SEED = 42
kf = KFold(n_splits=5, shuffle=True, random_state=SEED)

Naive Bayes classifiers are a popular statistical technique for e-mail filtering.

In [4]:
# Choose one of the three naive Bayes classifiers by leaving exactly one
# of the following assignments uncommented.
# clf = GaussianNB()
# clf = MultinomialNB()
clf = BernoulliNB()

# Accumulators for the cross-validation results: one dict of metrics per
# fold in Scores, one accuracy value per fold in acc.
Scores, acc = [], []
total = 0

In [5]:
# Reset the per-fold accumulators in this cell so that re-running it starts
# from a clean slate instead of appending duplicate folds to earlier results.
Scores = []
acc = []

# Pin the confusion matrix to the dataset's full label set so that the
# four-way unpacking below still works if a fold's test split happens to
# contain only one class. For two classes this is the same (sorted) order
# that confusion_matrix uses by default.
class_labels = np.unique(y)

for train_index, test_index in kf.split(X):
    # split data into this fold's training and held-out parts
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # fit model on the training part and predict the held-out part
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Accuracy from the predictions already made; clf.score(X_test, y_test)
    # would compute the same value but run predict() a second time.
    score = np.mean(y_pred == y_test)

    # compute false positive / false negative shares, accuracy, error
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=class_labels).ravel()
    total = tp + tn + fn + fp
    # NOTE(review): fpr and fnr are fractions of ALL test samples in the fold,
    # not the conventional rates fp / (fp + tn) and fn / (fn + tp). Kept as-is
    # to match the existing report columns -- confirm which is intended.
    fpr = fp / total
    fnr = fn / total

    acc.append(score)
    Scores.append({"fp":fp,"fn":fn,"total":total,"false positive(%)":fpr * 100,
                   "false negative(%)":fnr * 100,"accuracy(%)": score * 100, "error(%)": 100 - score*100})


Print a table of the percentages of false positives, false negatives, accuracy, and error for each fold.

In [6]:
# Tabulate the per-fold metrics, with the columns in a fixed reading order.
report_columns = ['fp', 'fn', 'total', 'false positive(%)', 'false negative(%)',
                  'accuracy(%)', 'error(%)']
df = pd.DataFrame(Scores, columns=report_columns)
print(df)

# Averages over all folds, expressed in percent.
mean_accuracy_pct = np.mean(acc) * 100
print("Average accuracy: ", mean_accuracy_pct, "%")
print("Average Error:", 100 - mean_accuracy_pct, "%")

   fp  fn  total  false positive(%)  false negative(%)  accuracy(%)   error(%)
0  35  76    921           3.800217           8.251900    87.947883  12.052117
1  27  62    920           2.934783           6.739130    90.326087   9.673913
2  45  63    920           4.891304           6.847826    88.260870  11.739130
3  40  64    920           4.347826           6.956522    88.695652  11.304348
4  42  69    920           4.565217           7.500000    87.934783  12.065217
Average accuracy:  88.63305480810084 %
Average Error: 11.366945191899163 %


# For this binary problem, the Bernoulli classifier is the most suitable after comparing the results of the three types of naive Bayes classifiers.

The result of Gaussian Naive Bayes:
Average accuracy:  82.19997167540008 %
Average Error: 17.80002832459992 %

The result of Multinomial Naive Bayes:
Average accuracy:  79.17837416796488 %
Average Error: 20.821625832035124 %

Comparing the three results, Bernoulli Naive Bayes is the most suitable classifier for this binary problem.