# Using Bernoulli Naive Bayes Classifier for Spambase Dataset

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Load the Spambase table. The file is read without a header row, so pandas
# assigns integer column labels; column 57 is renamed to 'class' (the target).
data = pd.read_csv("spambase.data", header=None).rename(columns={57: 'class'})

# Detach the target column; whatever remains in `data` is the feature matrix.
y = np.array(data.pop('class'))
X = np.array(data)

It is important to shuffle the data. The data is split into 5 folds for cross-validation.

In [3]:
# 5-fold cross validation.
# The shuffle is seeded so that the folds -- and therefore every number reported
# below -- are identical on each "Restart Kernel -> Run All".
kf = KFold(n_splits=5, shuffle=True, random_state=42)

Naive Bayes classifiers are a popular statistical technique for e-mail filtering. 
They typically use bag-of-words features to identify spam e-mail, an approach commonly used in text classification.
After comparing the results of the three types of naive Bayes classifier (Gaussian, Multinomial and Bernoulli), the Bernoulli classifier is the one used below for this binary problem. 

In [4]:
# Estimator under evaluation. GaussianNB() and MultinomialNB() are imported as
# well; swap the constructor on the next line to evaluate one of them instead.
clf = BernoulliNB()

# Containers filled by the cross-validation loop.
Scores, acc = [], []  # per-fold metric records / per-fold accuracy values
total = 0             # overwritten in the loop with the fold's test-sample count

In [5]:
# Start from empty result containers so that re-running this cell does not
# append a second set of folds to the ones already recorded.
Scores = []
acc = []

# Pin the label order once so every fold yields a full 2x2 confusion matrix,
# even if a test fold happened to contain samples of a single class only.
class_labels = np.unique(y)

for train_index, test_index in kf.split(X):
    # split data into the training folds and the held-out fold
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # fit model on the training folds, predict the held-out fold
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # confusion-matrix counts for the held-out fold (the larger label is "positive")
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=class_labels).ravel()
    total = tp + tn + fn + fp

    # accuracy derived from the counts; this is the value clf.score(X_test, y_test)
    # reports, obtained without running a second prediction pass
    score = (tp + tn) / total

    # NOTE: these are shares of ALL test samples in the fold, not the classical
    # rates FPR = fp / (fp + tn) and FNR = fn / (fn + tp).
    fp_share = fp / total
    fn_share = fn / total

    acc.append(score)
    Scores.append({"fp":fp,"fn":fn,"total":total,"false positive(%)":fp_share * 100,
                   "false negative(%)":fn_share * 100,"accuracy(%)": score * 100, "error(%)": 100 - score*100})


It is important to shuffle the data. The data is split into 5 folds for cross-validation.

Print a table of the percentages of false positives, false negatives, accuracy and error for each fold.

In [6]:
# Tabulate the per-fold records in a fixed column order, then report the
# accuracy and error averaged over all folds.
report_columns = [
    'fp', 'fn', 'total',
    'false positive(%)', 'false negative(%)', 'accuracy(%)', 'error(%)',
]
df = pd.DataFrame(Scores, columns=report_columns)
print(df)

mean_accuracy_pct = np.mean(acc) * 100
print("Average accuracy: ", mean_accuracy_pct, "%")
print("Average Error:", 100 - mean_accuracy_pct, "%")

   fp  fn  total  false positive(%)  false negative(%)  accuracy(%)   error(%)
0  32  71    921           3.474484           7.709012    88.816504  11.183496
1  28  67    920           3.043478           7.282609    89.673913  10.326087
2  44  52    920           4.782609           5.652174    89.565217  10.434783
3  45  61    920           4.891304           6.630435    88.478261  11.521739
4  43  81    920           4.673913           8.804348    86.521739  13.478261
Average accuracy:  88.61112684699994 %
Average Error: 11.388873153000063 %
