## Naive Bayes' Classifier - Classifivation Algorithms

In [1]:
import numpy as np
import pandas as pd
import urllib

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data'

In [3]:
df = pd.read_csv(url)

In [7]:
df.head(10).transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.21,0.06,0.0,0.0,0.0,0.0,0.0,0.15,0.06,0.0
0.64,0.28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.12,0.0
0.64.1,0.5,0.71,0.0,0.0,0.0,0.0,0.0,0.46,0.77,0.0
0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0.32,0.14,1.23,0.63,0.63,1.85,1.92,1.88,0.61,0.19,0.0
0.2,0.28,0.19,0.0,0.0,0.0,0.0,0.0,0.0,0.32,0.0
0.3,0.21,0.19,0.31,0.31,0.0,0.0,0.0,0.3,0.38,0.96
0.4,0.07,0.12,0.63,0.63,1.85,0.0,1.88,0.0,0.0,0.0
0.5,0.0,0.64,0.31,0.31,0.0,0.0,0.0,0.92,0.06,0.0
0.6,0.94,0.25,0.63,0.63,0.0,0.64,0.0,0.76,0.0,1.92


In [8]:
df.shape

(4600, 58)

In [19]:
X = df.iloc[:,0:48]
y = df.iloc[:,-1]

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

***Naive Bayes classifier for multivariate Bernoulli models***

In [23]:
brn_model = BernoulliNB(binarize=True)
brn_model.fit(X_train,y_train)

BernoulliNB(alpha=1.0, binarize=True, class_prior=None, fit_prior=True)

In [24]:
print(accuracy_score(y_test,brn_model.predict(X_test)))

0.8471014492753624


***Naive Bayes classifier for multinomial models***

In [32]:
mn_model = MultinomialNB()

In [34]:
mn_model.fit(X_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [36]:
print(accuracy_score(y_test,mn_model.predict(X_test)))

0.8659420289855072


***Gaussian Naive Bayes***

In [41]:
nb_model = GaussianNB()

In [42]:
nb_model.fit(X_train,y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [43]:
print(accuracy_score(y_test,nb_model.predict(X_test)))

0.8050724637681159


In [29]:
def model_compare(x_train_df,y_train_df,x_test_df,y_test_df,model_list):
  return pd.DataFrame({'Model Name':[ i for i in model_list.keys()],
              'Training Score': [accuracy_score(y_train_df,i.fit(x_train_df,y_train_df).predict(x_train_df)) for i in model_list.values()],
              'Test Score': [accuracy_score(y_test_df,i.fit(x_train_df,y_train_df).predict(x_test_df)) for i in model_list.values()]})

In [39]:
model_name = {'Bernoulli models': BernoulliNB(binarize=True),
              'Multinomial models':MultinomialNB(),
              'Gaussian Naive Bayes':GaussianNB()}

model_compare(X_train,y_train,X_test,y_test,model_name)

Unnamed: 0,Model Name,Training Score,Test Score
0,Bernoulli models,0.862733,0.847101
1,Multinomial models,0.873602,0.865942
2,Gaussian Naive Bayes,0.809627,0.805072
