In [40]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import numpy as np
from collections import defaultdict
from sklearn.model_selection import train_test_split
import csv
import math
import re

In [41]:
# Load the CSV and discard the 'longest', 'total' and 'average' columns in one
# chained expression; every other column is kept exactly as read from disk.
mydata = pd.read_csv("spambase.csv").drop(columns=['longest', 'total', 'average'])

# Preview the first rows of the resulting frame.
mydata.head()

Unnamed: 0,make,address,all,3d,our,over,remove,internet,order,mail,...,edu,table,conference,;,(,[,!,$,#,Spam
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.778,0.0,0.0,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.0,0.0,0.0,0.132,0.0,0.372,0.18,0.048,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.06,0.0,0.0,0.01,0.143,0.0,0.276,0.184,0.01,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.0,0.0,0.0,0.137,0.0,0.137,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.0,0.0,0.0,0.135,0.0,0.135,0.0,0.0,1


In [52]:
class NaiveBayesClassifier:
    """Two-class (spam vs. ham) scorer in the style of naive Bayes.

    Label ``1`` is treated as spam; every other label is pooled into the ham
    class, which is reported as ``0`` by ``predict``.

    For each class the model stores one weight per feature: the smoothed mean
    of that feature over the class's training rows, i.e.
    ``(column_sum + smoothing) / n_rows_in_class``.  A sample's score for a
    class is the class prior multiplied by the weights of every feature that
    is non-zero in the sample; the class with the larger score wins.

    Scores are accumulated as sums of logarithms rather than raw products so
    that samples with many active features (or very small weights) do not
    underflow to ``0.0`` for both classes.

    Attributes set by ``fit``:
        spam_attributes: ndarray of per-feature weights for label ``1``.
        ham_attributes: ndarray of per-feature weights for all other labels.
        _classes: sorted unique labels seen in ``y``.
        _priors: ndarray ``[ham_prior, spam_prior]`` (index 0 = ham, 1 = spam).
    """

    def __init__(self, smoothing=0.1):
        """Create an unfitted classifier.

        Args:
            smoothing: Constant added to every per-class feature sum before it
                is divided by the class size.  Keeps weights strictly positive
                for features never seen in a class.
        """
        self.smoothing = smoothing

    def fit(self, X, y):
        """Estimate class priors and per-feature weights from training data.

        Args:
            X: 2-D array-like of shape (n_samples, n_features) holding
                non-negative feature values.
            y: 1-D array-like of n_samples labels; ``1`` means spam.

        Raises:
            ValueError: If ``X`` is not 2-D, is empty, contains negative
                values, or its row count differs from ``len(y)``.
        """
        X = np.asarray(X, dtype=np.float64)
        y = np.asarray(y).ravel()

        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if X.shape[0] == 0:
            raise ValueError("cannot fit on an empty training set")
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y must contain the same number of samples")
        # Negative values would make the weights (and their logs) meaningless.
        if np.any(X < 0):
            raise ValueError("feature values must be non-negative")

        self._classes = np.unique(y)

        is_spam = y == 1
        spam_rows = X[is_spam]
        ham_rows = X[~is_spam]
        n_samples = float(X.shape[0])

        # Fixed layout (0 = ham, 1 = spam) instead of indexing by label value,
        # so a missing class or unusual label values cannot cause an IndexError.
        self._priors = np.array(
            [ham_rows.shape[0] / n_samples, spam_rows.shape[0] / n_samples],
            dtype=np.float64,
        )
        self.spam_attributes = self._attribute_weights(spam_rows)
        self.ham_attributes = self._attribute_weights(ham_rows)

        # log(0) = -inf is intentional here: a class that was never observed
        # gets a score of -inf and therefore can never be predicted.
        with np.errstate(divide="ignore"):
            self._log_priors = np.log(self._priors)
            self._log_spam = np.log(self.spam_attributes)
            self._log_ham = np.log(self.ham_attributes)

    def _attribute_weights(self, rows):
        """Return the smoothed per-feature mean of ``rows`` (zeros if empty)."""
        n_rows = rows.shape[0]
        if n_rows == 0:
            return np.zeros(rows.shape[1], dtype=np.float64)
        return (rows.sum(axis=0) + self.smoothing) / float(n_rows)

    def predict(self, X):
        """Classify every row of ``X``.

        Args:
            X: 2-D array-like of shape (n_samples, n_features); an empty
                input yields an empty list.

        Returns:
            A list of Python ints, ``1`` (spam) or ``0`` (ham), one per row.

        Raises:
            ValueError: If ``X`` is not 2-D or its feature count differs from
                the training data.
        """
        X = np.asarray(X, dtype=np.float64)
        if X.size == 0:
            return []
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if X.shape[1] != self.spam_attributes.shape[0]:
            raise ValueError("X has a different number of features than the training data")
        return [self._predict(row) for row in X]

    def _predict(self, x):
        """Classify a single sample; returns ``1`` for spam, ``0`` otherwise.

        Only features that are non-zero in ``x`` contribute.  The feature
        value itself is deliberately not part of the score: multiplying both
        class scores by the same positive ``x[i]`` cannot change which one is
        larger, it only pushes the products toward under/overflow.  Ties go
        to ham.
        """
        x = np.asarray(x, dtype=np.float64)
        active = x != 0

        spam_score = self._log_priors[1] + self._log_spam[active].sum()
        ham_score = self._log_priors[0] + self._log_ham[active].sum()

        return 1 if spam_score > ham_score else 0



In [53]:
def accuracy(y_true, y_pred):
    """Return the percentage (0-100) of predictions that equal the true label.

    Args:
        y_true: Sequence of ground-truth labels.
        y_pred: Sequence of predicted labels, aligned position-by-position
            with ``y_true``.

    Returns:
        ``matches * 100 / len(y_true)`` as a float.

    Raises:
        ValueError: If the two sequences differ in length (extra or missing
            predictions would otherwise be silently ignored or crash midway),
            or if they are empty (the percentage is undefined).
    """
    total = len(y_true)
    if total != len(y_pred):
        raise ValueError(
            "y_true and y_pred must have the same length "
            "(got %d and %d)" % (total, len(y_pred))
        )
    if total == 0:
        raise ValueError("accuracy is undefined for empty inputs")

    matches = sum(1 for truth, guess in zip(y_true, y_pred) if truth == guess)
    return matches * 100 / total


# Split the frame positionally: the final column is the target, all columns
# before it are the features.
X, y = mydata.iloc[:, :-1].values, mydata.iloc[:, -1].values

# Hold out half of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=123,
)

# Fit on the training half, then score the held-out half.
nb = NaiveBayesClassifier()
nb.fit(X_train, y_train)
predictions = nb.predict(X_test)
print("The accuracy of our Naive Bayes is: ", accuracy(y_test, predictions))


The accuracy of our Naive Bayes is:  88.09213385484571
