In [96]:
import numpy as np
import pandas as pd
from typing import List
from tqdm.notebook import tqdm

In [139]:
class MultinomialNB:
    """Multinomial naive Bayes classifier over integer feature ids.

    Each sample is a list of feature ids (e.g. word indices); repeated ids
    count as repeated occurrences. Labels may be any values accepted by
    ``np.unique`` (ints, strings, ...): they are mapped internally to row
    indices, so they do not have to be the integers ``0..num_classes-1``.

    Attributes set by ``fit``:
        classes: sorted array of the distinct labels seen in training.
        log_prior: log P(class), one entry per row of ``classes``.
        log_feature_probs: log P(feature | class), shape
            ``(len(classes), num_features)``.
    """

    def __init__(self, num_classes: int, num_features: int, alpha: float = 1.0):
        """Store the model dimensions and the additive smoothing constant.

        Args:
            num_classes: maximum number of distinct labels expected in ``fit``.
            num_features: size of the feature-id space; valid ids are
                ``0..num_features-1``.
            alpha: pseudo-count added to every (class, feature) cell.
        """
        self.num_classes = num_classes
        self.num_features = num_features
        self.alpha = alpha
        self.log_prior = None
        self.log_feature_probs = None
        self.classes = None

    def fit(self, X: List[List[int]], y: List[int]):
        """Estimate class priors and smoothed per-class feature distributions.

        Args:
            X: one list of feature ids per training sample (may be empty).
            y: one label per training sample, aligned with ``X``.

        Raises:
            ValueError: if ``X`` and ``y`` differ in length, if the training
                set is empty, or if more distinct labels are present than
                ``num_classes`` allows.
        """
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )
        if len(y) == 0:
            raise ValueError("cannot fit on an empty training set")

        # `class_rows[i]` is the position of y[i] inside the sorted `classes`
        # array, i.e. the row of the count matrix that sample i contributes to.
        self.classes, class_rows, class_counts = np.unique(
            y, return_inverse=True, return_counts=True
        )
        if len(self.classes) > self.num_classes:
            raise ValueError(
                f"found {len(self.classes)} distinct labels but num_classes="
                f"{self.num_classes}"
            )
        self.log_prior = np.log(class_counts / len(y))

        # Start every cell at `alpha` (additive smoothing) so that features
        # never seen with a class still get a non-zero probability.
        feature_counts = np.full(
            (len(self.classes), self.num_features), self.alpha, dtype=float
        )
        for x, row in tqdm(zip(X, class_rows), total=len(X)):
            features, counts = np.unique(x, return_counts=True)
            # np.unique of an empty sample yields an empty float array, which
            # is not a valid index; such samples contribute nothing anyway.
            if len(features):
                feature_counts[row, features] += counts

        self.log_feature_probs = np.log(
            feature_counts / feature_counts.sum(axis=1, keepdims=True)
        )

    def predict(self, x: List[int]) -> int:
        """Return the most probable label for one sample of feature ids.

        The score of a class is its log prior plus the sum of the log
        feature probabilities of every id in ``x`` (repeats included). An
        empty ``x`` is scored by the prior alone.

        Raises:
            RuntimeError: if called before ``fit``.
        """
        if self.log_prior is None or self.log_feature_probs is None:
            raise RuntimeError("MultinomialNB.predict() called before fit()")
        scores = self.log_prior + self.log_feature_probs[:, x].sum(axis=1)
        return self.classes[scores.argmax()].item()

In [140]:
# Fixed seed so the shuffle, and therefore the train/validation split and the
# accuracy computed from it, is the same after every Restart & Run All.
SEED = 42
# Share of the shuffled rows held out for validation.
VAL_FRACTION = 0.1

# Shuffle all rows, then drop rows with missing values.
df = pd.read_csv("spam_or_not_spam.csv").sample(frac=1, random_state=SEED).dropna()
val_size = int(len(df) * VAL_FRACTION)
# First `val_size` shuffled rows are validation, the remainder is training.
val_df = df.iloc[:val_size]
train_df = df.iloc[val_size:]

In [141]:
# English stop words (lower-case). In the visible cells this set is only
# referenced from commented-out filtering code.
# NOTE(review): looks like the classic NLTK English stop-word list — confirm.
stop_words = {
    # pronouns
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves",
    "you", "your", "yours", "yourself", "yourselves",
    "he", "him", "his", "himself", "she", "her", "hers", "herself",
    "it", "its", "itself", "they", "them", "their", "theirs", "themselves",
    "what", "which", "who", "whom", "this", "that", "these", "those",
    # auxiliary verbs
    "am", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "having", "do", "does", "did", "doing",
    # articles and conjunctions
    "a", "an", "the", "and", "but", "if", "or", "because", "as",
    "until", "while",
    # prepositions
    "of", "at", "by", "for", "with", "about", "against", "between",
    "into", "through", "during", "before", "after", "above", "below",
    "to", "from", "up", "down", "in", "out", "on", "off", "over", "under",
    # adverbs and quantifiers
    "again", "further", "then", "once", "here", "there",
    "when", "where", "why", "how",
    "all", "any", "both", "each", "few", "more", "most", "other",
    "some", "such", "no", "nor", "not", "only", "own", "same",
    "so", "than", "too", "very",
    # contraction fragments and remaining words
    "s", "t", "can", "will", "just", "don", "should", "now",
}

In [142]:
# Whitespace-tokenise every training email; labels stay aligned row by row.
train_tokens = train_df.email.str.split()
y = train_df.label.tolist()
# Optional stop-word filtering (currently disabled):
# train_tokens = train_tokens.apply(lambda words: [w for w in words if w not in stop_words])

# Id 0 is reserved for unknown words. The literal token "UNK" is excluded from
# the enumerated vocabulary so that ids stay contiguous (0..len(word2idx)-1)
# even when "UNK" itself occurs in the corpus; otherwise the largest id could
# equal len(word2idx) and fall outside the model's feature matrix.
# `set().union(...)` also copes with an empty corpus, and sorting makes the
# word -> id mapping identical across kernel restarts.
vocabulary = sorted(set().union(*train_tokens.apply(set)) - {"UNK"})
word2idx = {"UNK": 0}
word2idx.update((w, i) for i, w in enumerate(vocabulary, start=1))

# Encode each email as the list of its word ids (repeats preserved).
X = train_tokens.apply(lambda words: [word2idx[w] for w in words]).tolist()

In [143]:
# Size the model from the training data: one class per distinct label and one
# feature per entry of the word -> id mapping, then train it.
classifier = MultinomialNB(
    num_classes=len(np.unique(y)),
    num_features=len(word2idx),
)
classifier.fit(X, y)

  0%|          | 0/2700 [00:00<?, ?it/s]

In [144]:
# Validation labels, aligned row by row with the encoded emails below.
vy = val_df.label.tolist()

# Tokenise the validation emails on whitespace and encode them with the
# training vocabulary; words missing from `word2idx` fall back to id 0.
vX = (
    val_df.email.str.split()
    .apply(lambda tokens: [word2idx.get(token, 0) for token in tokens])
    .tolist()
)

In [147]:
# Predict a label for every encoded validation email, collected as an array
# so it can be compared element-wise with the true labels.
preds = np.array([classifier.predict(sample) for sample in vX])

In [152]:
# Validation accuracy: fraction of predictions equal to the true label.
np.mean(preds == np.asarray(vy)).item()

0.9899665551839465