In [1]:
import numpy as np
import pandas as pd

df = pd.read_csv("SMSSpamCollection", sep="\t", header=None)
df

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [2]:
train = df.sample(frac=0.8)
test = df.drop(train.index)

In [3]:
train[0].value_counts()

0
ham     3862
spam     596
Name: count, dtype: int64

In [4]:
test[0].value_counts()

0
ham     963
spam    151
Name: count, dtype: int64

In [5]:
from string import punctuation, digits
import re

class BernoulliNB:
    def __init__(self):
        pass
        
    def get_words(self, X):
        X = re.sub(f"[{punctuation + digits}]", "", X.lower())
        words = set(X.split())
        return words
        
    def learn_vocabulary(self, X_list):
        sets = [self.get_words(X) for X in X_list]
        all_words = set.union(*sets)
        self.all_words = all_words
        booleans = np.array([[int(word in s) for word in all_words] for s in sets])
        return booleans
                
    def train(self, X_list, y_list):
        booleans = self.learn_vocabulary(X_list)
        classes_words = [np.array([boolean for boolean, y in zip(booleans, y_list) if y == class_]) for class_ in np.unique(y_list)]
        self.num_docs_class = np.array([len(class_) for class_ in classes_words])
        classes_words = [class_.sum(axis=0) for class_ in classes_words]
        self.prob_words = [class_ / self.num_docs_class[i] + 0.001 for i, class_ in enumerate(classes_words)]

    def predict(self, X):
        X = self.get_words(X)
        X = np.array([int(word in X) for word in self.all_words])
        
        log_prior = np.log(self.num_docs_class / np.sum(self.num_docs_class))
        log_prob = [np.sum([np.log(prob) * x + np.log(1 - prob) * (1 - x) for x, prob in zip(X, class_probs)])
                        for class_probs in self.prob_words]
        class_prob = log_prior + log_prob

        if class_prob[0] > class_prob[1]:
            return "ham"
        else:
            return "spam"
        
        

In [6]:
model = BernoulliNB()
model.train(train[1], train[0])


In [7]:
score = 0
for i, row in test.iterrows():
    class_ = row[0]
    if class_ == model.predict(row[1]):
        score += 1

In [8]:
accuracy = score / len(test)
print(accuracy)

0.9847396768402155
