In [27]:
import numpy as np
'''
Recall, the naive bayes classifier is defined as: P(y|x) = P(x|y)P(y)
Note, P(y) and P(x|y) are calculated in the training phase
We return the log of it to prevent floating point underflow (reference above)
'''

def train(X,y, alpha=1.0):
    #number of samples
    sampleCount = X.shape[0]

    #group samples by class label
    separated = [[x for x, t in zip(X, y) if t == c] for c in np.unique(y)]

    #calculate prior probability for each class, i.e P(y) = N_y/N_total
    classLogPrior = [np.log(len(i) / sampleCount) for i in separated] 

    #calculate P(x_i|y) with smoothing (we default to laplace smoothing, alpha = 1)
    #first calculate frequency
    count = np.array([np.array(i).sum(axis=0) for i in separated]) + alpha 
    #then calculate log probability 
    #[np.newaxis].T is simpy to transpore the array to allow for broadcasting.
    featureLogProb = np.log(count / count.sum(axis=1)[np.newaxis].T) 
    
    return classLogPrior, featureLogProb

def predict(X, classLogPrior, featureLogProb):
    #calculate P(x|y)P(y)
    combinedLikelihood = [(featureLogProb * x).sum(axis=1) + classLogPrior for x in X]
    #return the class with the highest probability
    return np.argmax(combinedLikelihood, axis=1)

## A fkign quick speedrun test to make sure im not stupid af
Stolen from: https://geoffruddock.com/naive-bayes-from-scratch-with-numpy/


In [6]:
%matplotlib inline
import matplotlib.pyplot as plt
from qbstyles import mpl_style
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from typing import Callable

def make_spam_dataset(show_X=True) -> (pd.DataFrame, np.ndarray, Callable):
    """ Create a small toy dataset for MultinomialNB implementation
    
    Returns:
        X: word count matrix
        y: indicator of whether or not message is spam
        msg_tx_func: a function to transform new test data into word count matrix
    """

    vocab = [
        'secret', 'offer', 'low', 'price', 'valued', 'customer', 'today',
        'dollar', 'million', 'sports', 'is', 'for', 'play', 'healthy', 'pizza'
    ]

    spam = [
        'million dollar offer',
        'secret offer today',
        'secret is secret'
    ]
    
    not_spam = [
        'low price for valued customer',
        'play secret sports today',
        'sports is healthy',
        'low price pizza'
    ]

    all_messages = spam + not_spam
    
    vectorizer = CountVectorizer(vocabulary=vocab)
    word_counts = vectorizer.fit_transform(all_messages).toarray()
    df = pd.DataFrame(word_counts, columns=vocab)
    is_spam = [1] * len(spam) + [0] * len(not_spam)
    msg_tx_func = lambda x: vectorizer.transform(x).toarray()
    
    if show_X:
        display(df)
        
    return df.to_numpy(), np.array(is_spam), msg_tx_func

X, y, tx_func = make_spam_dataset()

Unnamed: 0,secret,offer,low,price,valued,customer,today,dollar,million,sports,is,for,play,healthy,pizza
0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0
1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,1,1,1,1,0,0,0,0,0,1,0,0,0
4,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0
5,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0
6,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1


In [24]:
from sklearn.model_selection import train_test_split
X, y, _ = make_spam_dataset()
#split into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

classLogPrior, featureLogProb = train(X_train,y_train)
print('classLogPrior: ', classLogPrior)
print('featureLogProb: ', featureLogProb)
predictions = predict(X_test, classLogPrior, featureLogProb)
print('predictions: ', predictions)
print('actual: ', y_test)
print('accuracy: ', np.mean(predictions == y_test))



Unnamed: 0,secret,offer,low,price,valued,customer,today,dollar,million,sports,is,for,play,healthy,pizza
0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0
1,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,1,1,1,1,0,0,0,0,0,1,0,0,0
4,1,0,0,0,0,0,1,0,0,1,0,0,1,0,0
5,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0
6,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1


classLogPrior:  [-0.2876820724517809, -1.3862943611198906]
featureLogProb:  [[-2.60268969 -3.29583687 -2.19722458 -2.19722458 -2.60268969 -2.60268969
  -2.60268969 -3.29583687 -3.29583687 -2.60268969 -3.29583687 -2.60268969
  -2.60268969 -3.29583687 -2.60268969]
 [-1.79175947 -2.89037176 -2.89037176 -2.89037176 -2.89037176 -2.89037176
  -2.89037176 -2.89037176 -2.89037176 -2.89037176 -2.19722458 -2.89037176
  -2.89037176 -2.89037176 -2.89037176]]
predictions:  [1 0 1]
actual:  [1 1 0]
accuracy:  0.3333333333333333
