# Naive Bayes from Scratch

This notebook implements a **Multinomial Naive Bayes** classifier with **Laplace smoothing** for **Spam Email Classification** from scratch,
following the probabilistic model taught in **Stanford's CS229: Machine Learning (Autumn 2018, Lectures 5 & 6)** by Andrew Ng.

It includes:
- Tokenization and preprocessing of raw email text
- Vocabulary construction with a frequency threshold
- Transformation of text data into bag-of-words feature vectors
- Estimation of conditional probabilities with Laplace smoothing
- Model training and inference
- Accuracy evaluation on labeled data


> 📘 This notebook is part of the broader `ml-from-scratch` project, which reimplements foundational machine learning algorithms using only **NumPy** and **pandas**, for educational clarity and hands-on understanding.



In [92]:
import pickle
import collections
import numpy as np
import pandas as pd
from numpy.typing import NDArray
from typing import Optional, Dict


In [93]:
class NaiveBayes:
    """Multinomial Naive Bayes spam classifier with Laplace smoothing.

    Messages are lower-cased and split on whitespace, then represented as
    word-count vectors over a vocabulary built during :meth:`fit`.
    All model attributes are ``None`` until the model is fitted or restored
    through :class:`NaiveBayes.Loader`.
    """
    def __init__(self):
        # Maps each vocabulary word to its column index in the count matrix.
        self.word_dictionary: Optional[Dict[str, int]] = None
        # Mean of the training labels, i.e. the estimate of P(y = 1) for 0/1 labels.
        self.phi_y: Optional[float] = None
        # Laplace-smoothed per-word probabilities given y = 1 / y = 0.
        self.phi_j_y1: Optional[NDArray[np.float64]] = None
        self.phi_j_y0: Optional[NDArray[np.float64]] = None

    @staticmethod
    def get_words(message: str) -> list[str]:
        """Tokenize a message: lower-case it and split on whitespace."""
        return message.lower().split()

    @staticmethod
    def create_dictionary(messages, size: int) -> dict[str, int]:
        """Build the vocabulary from an iterable of messages.

        Args:
            messages: Iterable of raw message strings.
            size: Minimum number of total occurrences (summed over all
                messages) a word needs in order to enter the vocabulary.

        Returns:
            Dict mapping each kept word to a unique index in
            ``range(len(vocabulary))``, in order of first appearance.
        """
        word_counts = collections.Counter(
            word for message in messages for word in NaiveBayes.get_words(message)
        )
        freq_words = (word for word, count in word_counts.items() if count >= size)
        return {word: index for index, word in enumerate(freq_words)}

    @staticmethod
    def transform_text(messages: list[str], word_dictionary: dict[str, int]) -> NDArray:
        """Convert messages to a bag-of-words count matrix.

        Args:
            messages: Raw message strings.
            word_dictionary: Word -> column index mapping.

        Returns:
            Integer array of shape ``(len(messages), len(word_dictionary))``
            where entry ``(i, j)`` is how often word ``j`` occurs in message
            ``i``. Words missing from the dictionary are ignored.
        """
        matrix = np.zeros((len(messages), len(word_dictionary)), dtype=int)

        for row, message in enumerate(messages):
            for word, count in collections.Counter(NaiveBayes.get_words(message)).items():
                column = word_dictionary.get(word)
                if column is not None:
                    matrix[row, column] += count
        return matrix

    def predict(self, x: list[str]):
        """Predict a 0/1 label for each message in ``x``.

        Returns:
            Integer array with one entry per message; 1 where the log-odds
            of class 1 versus class 0 are non-negative, else 0.

        Raises:
            RuntimeError: If the model has not been fitted or loaded.
        """
        if (
            self.word_dictionary is None
            or self.phi_y is None
            or self.phi_j_y1 is None
            or self.phi_j_y0 is None
        ):
            raise RuntimeError("NaiveBayes model is not fitted; call fit() or load a saved model first.")

        matrix = self.transform_text(x, self.word_dictionary)

        log_phi_j_y1 = np.log(self.phi_j_y1)
        log_phi_j_y0 = np.log(self.phi_j_y0)

        # Log-odds of y = 1 vs y = 0: the per-word log-likelihood ratios
        # weighted by word counts, plus the log prior ratio. Working in log
        # space avoids underflow from multiplying many small probabilities.
        scores = matrix @ (log_phi_j_y1 - log_phi_j_y0) + np.log(self.phi_y / (1 - self.phi_y))
        return (scores >= 0).astype(int)

    def fit(self, x_train: list[str], y_train: NDArray, size: int):
        """Estimate the model parameters and print the training accuracy.

        Args:
            x_train: Training messages.
            y_train: 0/1 labels aligned with ``x_train`` (any array-like).
            size: Minimum total occurrence count for a word to be kept in
                the vocabulary (see :meth:`create_dictionary`).
        """
        # Normalise labels to a plain ndarray so the boolean masks below index
        # the count matrix positionally regardless of the input container.
        labels = np.asarray(y_train)

        self.word_dictionary = NaiveBayes.create_dictionary(x_train, size)
        matrix = NaiveBayes.transform_text(x_train, self.word_dictionary)

        n = matrix.shape[1]

        self.phi_y = np.mean(labels)

        # Laplace smoothing: add 1 to every word count and the vocabulary
        # size n to the class's total word count.
        counts_y1 = matrix[labels == 1].sum(axis=0)
        counts_y0 = matrix[labels == 0].sum(axis=0)
        self.phi_j_y1 = (1 + counts_y1) / (n + counts_y1.sum())
        self.phi_j_y0 = (1 + counts_y0) / (n + counts_y0.sum())

        preds = self.predict(x_train)
        accuracy = np.mean(preds == labels)
        print(f"Model fitted with training accuracy: {accuracy * 100:.2f}%")

    def save(self, filepath):
        """Pickle the vocabulary and learned parameters to ``filepath``."""
        data = {
            "word_dictionary" : self.word_dictionary,
            "phi_y" : self.phi_y,
            "phi_j_y1" : self.phi_j_y1,
            "phi_j_y0" : self.phi_j_y0
        }

        with open(filepath, 'wb') as f:
            pickle.dump(data, f)

    class Loader:
        """Restore a model written by :meth:`NaiveBayes.save` and expose ``predict``."""
        def __init__(self, path: str):
            # SECURITY: pickle.load can execute arbitrary code; only load
            # model files that come from a trusted source.
            with open(path, 'rb') as f:
                data = pickle.load(f)

            self.model = NaiveBayes()
            self.model.word_dictionary = data.get("word_dictionary")
            self.model.phi_y = data.get("phi_y")
            # Each parameter is restored from its own key; swapping them
            # would invert every prediction.
            self.model.phi_j_y1 = data.get("phi_j_y1")
            self.model.phi_j_y0 = data.get("phi_j_y0")

        def predict(self, x: list[str]):
            """Predict 0/1 labels for ``x`` using the restored model."""
            return self.model.predict(x)


In [94]:
# The three splits are header-less, tab-separated files with the same
# two-column layout, so the reader options are defined once and reused.
tsv_options = dict(sep='\t', names=['label', 'message'])

data = pd.read_csv('./data/ds6_train.tsv', **tsv_options)
val = pd.read_csv('./data/ds6_val.tsv', **tsv_options)
test = pd.read_csv('./data/ds6_test.tsv', **tsv_options)

# Preview the training split.
data.head()

Unnamed: 0,label,message
0,ham,THANX 4 PUTTIN DA FONE DOWN ON ME!!
1,ham,So how are you really. What are you up to. How...
2,ham,Joy's father is John. Then John is the NAME of...
3,ham,"Almost there, see u in a sec"
4,ham,Yes baby! We can study all the positions of th...


In [95]:
# Encode the string labels as integers: ham -> 0, spam -> 1.
# Guarded so that re-running this cell is a no-op: mapping the already
# encoded 0/1 column a second time would turn every label into NaN.
if not pd.api.types.is_numeric_dtype(data['label']):
    data['label'] = data['label'].map({'ham' : 0, 'spam' : 1})

In [96]:
# Preview the first rows to confirm the label column now holds integers.
data.head()

Unnamed: 0,label,message
0,0,THANX 4 PUTTIN DA FONE DOWN ON ME!!
1,0,So how are you really. What are you up to. How...
2,0,Joy's father is John. Then John is the NAME of...
3,0,"Almost there, see u in a sec"
4,0,Yes baby! We can study all the positions of th...


In [97]:
# Same ham/spam -> 0/1 encoding for the validation and test labels.
label_to_int = {'ham' : 0, 'spam' : 1}

# Training split: labels in `data` are already integer-encoded; the
# messages stay a Series here.
x_train, y_train = data['message'], data['label']

# Validation and test splits: messages as plain lists, labels encoded.
x_val, y_val = val['message'].to_list(), val['label'].map(label_to_int)
x_test, y_test = test['message'].to_list(), test['label'].map(label_to_int)

In [98]:
# Sanity check: converting the message column with to_list() yields a plain
# Python list, the type NaiveBayes.transform_text is annotated to accept.
type(x_train.to_list())

list

In [99]:
# Build the list from the DataFrame column rather than from `x_train`
# itself, so re-running this cell still works once `x_train` is a list
# (a list has no `.to_list()` method).
x_train = data['message'].to_list()

In [100]:
# Train with a vocabulary threshold of 5: only words occurring at least
# five times in the training messages are kept as features.
model = NaiveBayes()
model.fit(x_train, y_train, size=5)

# Accuracy (in percent) on the held-out validation and test splits.
val_accuracy = np.mean(model.predict(x_val) == y_val) * 100
print(f"Val accuracy {val_accuracy}")

test_accuracy = np.mean(model.predict(x_test) == y_test) * 100
print(f"Test accuracy {test_accuracy}")

Model fitted with training accuracy: 98.45%
Val accuracy 98.38420107719928
Test accuracy 97.84946236559139
