In [221]:
# import packages
import pandas as pd 
import numpy as np
# from sklearn.model_selection import train_test_split
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.naive_bayes import MultinomialNB 

In [222]:
#import data
spam_df = pd.read_csv('spam.csv')

In [223]:
# inspect data
spam_df.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [224]:
# turn spam/ham into numerical values, create new column
spam_df['spam'] = spam_df['Category'].apply(lambda x: 1 if x=='spam' else 0)

In [225]:

def train_test_split(X, y, test_size=0.25, random_state=None):
    if random_state is not None:
        np.random.seed(random_state)
    
    # Shuffle the data
    indices = np.arange(X.shape[0])
    np.random.shuffle(indices)
    
    # Split the data
    test_size = int(len(X) * test_size)
    train_indices = indices[:-test_size]
    test_indices = indices[-test_size:]
    
    X_train = X.iloc[train_indices]
    X_test = X.iloc[test_indices]
    y_train = y.iloc[train_indices]
    y_test = y.iloc[test_indices]
    
    return X_train, X_test, y_train, y_test

# Example usage:
X_train, X_test, y_train, y_test = train_test_split(spam_df.Message, spam_df.spam, test_size=0.25)

In [226]:
# create a CountVectorizer object

from collections import defaultdict

class CountVectorizer:
    def __init__(self):
        self.vocabulary_ = {}
        self.inverse_vocabulary_ = []

    def fit(self, documents):
        # Create a vocabulary dictionary
        for doc in documents:
            for word in doc.split():
                if word not in self.vocabulary_:
                    self.vocabulary_[word] = len(self.vocabulary_)
                    self.inverse_vocabulary_.append(word)
        return self

    def transform(self, documents):
        # Create a matrix of token counts
        rows = []
        for doc in documents:
            row = [0] * len(self.vocabulary_)
            for word in doc.split():
                if word in self.vocabulary_:
                    row[self.vocabulary_[word]] += 1
            rows.append(row)
        return np.array(rows)

    def fit_transform(self, documents):
        self.fit(documents)
        return self.transform(documents)

cv = CountVectorizer()
x_train_count = cv.fit_transform(X_train.values)

In [227]:
# train model with Naive Bayes

def r2_score(y_true, y_pred):
    # Calculate the total sum of squares
    ss_total = np.sum((y_true - np.mean(y_true)) ** 2)
    
    # Calculate the residual sum of squares
    ss_residual = np.sum((y_true - y_pred) ** 2)
    
    # Calculate R^2 score
    r2 = 1 - (ss_residual / ss_total)
    return r2



class MultinomialNB:
    def __init__(self):
        self.class_log_prior_ = None
        self.feature_log_prob_ = None
        self.classes_ = None

    def fit(self, X, y):
        # Calculate class prior probabilities
        self.classes_, class_count = np.unique(y, return_counts=True)
        self.class_log_prior_ = np.log(class_count / y.shape[0])

        # Calculate feature probabilities
        feature_count = np.zeros((len(self.classes_), X.shape[1]))
        for i, c in enumerate(self.classes_):
            feature_count[i, :] = X[y == c].sum(axis=0)
        
        smoothed_fc = feature_count + 1  # Apply Laplace smoothing
        smoothed_cc = smoothed_fc.sum(axis=1)
        self.feature_log_prob_ = np.log(smoothed_fc / smoothed_cc[:, np.newaxis])

        return self

    def predict_log_proba(self, X):
        return (X @ self.feature_log_prob_.T) + self.class_log_prior_

    def predict(self, X):
        return self.classes_[np.argmax(self.predict_log_proba(X), axis=1)]

    def score(self, X, y):
        predictions = self.predict(X)
        return r2_score(y, predictions)

# Example usage:
model = MultinomialNB()
model.fit(x_train_count, y_train)

<__main__.MultinomialNB at 0x252813a1790>

In [228]:
# pre-test ham
email_ham=["Hey, wanna go for a movie tonight?"]
email_ham_count=cv.transform(email_ham)
model.predict(email_ham_count)

array([0], dtype=int64)

In [229]:
# pre-test spam
email_spam=["Free money reward !"]
email_spam_count=cv.transform(email_spam)
model.predict(email_spam_count)

array([1], dtype=int64)

In [None]:
# test model
# score is r2 score
x_test_count=cv.transform(X_test)
model.score(x_test_count, y_test)

0.862260646758897