In [1]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from tqdm.auto import tqdm
import pandas as pd

# Maps each supported mode name to the (min_n, max_n) tuple that is passed
# to CountVectorizer as its ngram_range.
MODES: dict[str, tuple[int, int]] = {
    "unigram": (1, 1),
    "bigram": (2, 2),
    "uni-bigram": (1, 2),
}


class NaiveBayes:
    """
    Naive Bayes spam/ham classifier working on bag-of-words counts.

    All learned quantities are stored as natural-log probabilities in
    ``probability_dict``: the class priors under the keys "spam" and "ham",
    and the smoothed word likelihoods under "<word>|spam" and "<word>|ham".

    Attributes:
        count_vector: dense (N, D) count matrix of the training mails, set by fit().
        probability_dict: dictionary of the log probabilities described above.
        ngram_mode: (min_n, max_n) tuple handed to CountVectorizer.
        stop_words: stop words handed to CountVectorizer (may be None).
    """

    def __init__(self, mode: str = "unigram", stop_words = None) -> None:
        """
        Initialize NaiveBayes model

        Args:
            mode (str, optional): Mode for BagOfWords, one of the keys of MODES
                ("unigram", "bigram" or "uni-bigram"). Defaults to "unigram".
            stop_words (list, optional): Stop words to eliminate from BagOfWords

        Raises:
            AssertionError: if mode is not a key of MODES.
        """
        self.count_vector = None
        self.probability_dict = dict()
        # Vectorizer fitted on the training mails; predict() reuses it so that
        # the vocabulary is never re-learned from the data being classified.
        self._fitted_vectorizer = None

        if mode not in MODES:
            # AssertionError is kept so existing callers that catch it keep
            # working; unlike a bare `assert`, this check also runs under -O.
            raise AssertionError("Mode should be one of: %s" % ", ".join(MODES))
        self.ngram_mode = MODES[mode]
        self.stop_words = stop_words

    def fit(self, x: np.ndarray, y: np.ndarray) -> None:
        """
        Fit the data, calculate probabilities according to it

        Args:
            x (np.ndarray): N raw mails, one text per element
            y (np.ndarray): data with shape (N,), each row consists of the label of the mail (0 for ham, 1 for spam)
        """
        y = np.asarray(y)   # lets plain Python lists be used as labels too

        # forget everything learned by a previous fit() so that words which are
        # absent from the new training set do not keep their old likelihoods
        self.probability_dict.clear()

        # columns is list of every words without their counts, counts stores in count_vector
        self.count_vector, columns = self.__vectorizer(x)

        self.__calculate_class_prior(y)     # calculates P(spam) and P (ham), and adds them into probability_dict
        # calculates P(x(i)|spam) and P(x(i)|ham) values, and adds them into probability_dict
        self.__calculate_likelihoods(y, columns, 1)

    def predict(self, x_predict: np.ndarray) -> np.ndarray:
        """
        Predict the labels of the mails in x_predict

        The mails are encoded with the vocabulary learned in fit(); words that
        were never seen during training are ignored.

        Args:
            x_predict (np.ndarray): N raw mails to be predicted, one text per element

        Returns:
            y_predict (np.ndarray): Predictions for the mails, shape (N,), 1.0 for spam and 0.0 for ham

        Raises:
            RuntimeError: if the model has not been fitted yet.
        """
        vectorizer = self._fitted_vectorizer
        if vectorizer is None:
            raise RuntimeError("NaiveBayes.fit must be called before predict")

        # transform() only counts words of the training vocabulary and, unlike
        # fit_transform(), does not fail when a mail contains none of them
        counts = vectorizer.transform(x_predict)
        columns = vectorizer.get_feature_names_out()

        # log-likelihood vectors aligned with the columns of `counts`
        log_spam = np.array([self.probability_dict["%s|spam" % word] for word in columns], dtype=float)
        log_ham = np.array([self.probability_dict["%s|ham" % word] for word in columns], dtype=float)

        # log P(class) + sum_j count_j * log P(word_j | class), for every mail at once
        spam_scores = counts @ log_spam + self.probability_dict["spam"]
        ham_scores = counts @ log_ham + self.probability_dict["ham"]

        # ties go to ham, exactly like a strict "spam > ham" comparison
        return np.where(spam_scores > ham_scores, 1.0, 0.0)

    def __vectorizer(self, arr: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
        """
        Fit a CountVectorizer on the given mails and return their word counts.

        The fitted vectorizer is also remembered on the instance so that
        predict() can encode new mails with the same vocabulary.

        Args:
            arr (np.ndarray): N raw mails, one text per element

        Returns:
            A tuple (count_vector, columns): count_vector is the dense (N, D) count
            matrix and columns holds the D feature names, in column order.
        """
        # Some scikit-learn releases only accept "english", a list or None here,
        # so any other container (e.g. a frozenset) is converted to a list.
        stop_words = self.stop_words
        if stop_words is not None and not isinstance(stop_words, (str, list)):
            stop_words = list(stop_words)

        # initializes CountVectorizer item with ngram_mode
        vectorizer = CountVectorizer(ngram_range=self.ngram_mode, stop_words=stop_words)

        # learn the vocabulary and count the words, then convert to array for usability
        count_vector = vectorizer.fit_transform(arr).toarray()

        # names of columns
        columns = vectorizer.get_feature_names_out()

        self._fitted_vectorizer = vectorizer
        return count_vector, columns

    def __calculate_class_prior(self, y: np.ndarray) -> None:
        """
        Calculates class probabilities [P(spam) and P(ham)] for training examples, and adds their
        logarithms into self.probability_dict under the "spam" and "ham" keys

        Args:
            y (np.ndarray): data with shape (N,), each row consists of the label of the mail (0 for ham, 1 for spam)
        """
        total = y.shape[0]

        # labels are only 0 and 1 therefore if we sum all items we get number of 1s
        number_of_spam = np.sum(y)
        number_of_ham = total - number_of_spam

        self.probability_dict["spam"] = np.log(number_of_spam / total)  # P(spam) = number of spams / N
        self.probability_dict["ham"] = np.log(number_of_ham / total)     # P(ham) = number of hams / N

    def __calculate_likelihoods(self, y: np.ndarray, columns: np.ndarray, alpha: float) -> None:
        """
        Calculates the smoothed likelihoods P(word|spam) and P(word|ham) of every word in the
        vocabulary and adds their logarithms into self.probability_dict under the
        "word|spam" and "word|ham" keys

            P(word|class) = (count(word, class) + alpha) / (total words in class + alpha * D)

        Args:
            y (np.ndarray): data with shape (N,), each row consists of the label of the mail (0 for ham, 1 for spam)
            columns (np.ndarray): the D words of the vocabulary, aligned with the columns of self.count_vector
            alpha (float): additive smoothing value
        """
        D = self.count_vector.shape[1]

        # per-word counts summed over all spam mails / all ham mails
        spam_counts = np.sum(self.count_vector[y == 1], axis=0)
        ham_counts = np.sum(self.count_vector[y == 0], axis=0)

        # alpha is added once per vocabulary word in the numerators, so the
        # denominators need alpha * D for the likelihoods to sum to 1
        spam_denominator = np.sum(spam_counts) + alpha * D   # | Text_spam | + alpha * D
        ham_denominator = np.sum(ham_counts) + alpha * D     # | Text_ham  | + alpha * D

        log_spam = np.log((spam_counts + alpha) / spam_denominator)
        log_ham = np.log((ham_counts + alpha) / ham_denominator)

        for word, word_log_spam, word_log_ham in zip(columns, log_spam, log_ham):
            self.probability_dict["%s|spam" % word] = word_log_spam
            self.probability_dict["%s|ham" % word] = word_log_ham

## 4. Part 4 Calculation of Performance Metrics

Below, we compute the required performance metrics for each configuration of the model (n-gram mode, with and without stop-word removal).

$$\textbf{Accuracy} = \frac{TP + TN}{TP + TN + FP + FN}$$
$$\textbf{Precision} = \frac{TP}{TP + FP}$$
$$\textbf{Recall} = \frac{TP}{TP + FN}$$
$$\textbf{F1 Score} = \frac{2 * (Precision * Recall)}{Precision + Recall}$$

In [5]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import pandas as pd
import numpy as np
import os

# Load the data set (expected next to the notebook) and make a stratified 80/20 split.
file_path = os.path.join(os.getcwd(), "emails.csv")
df = pd.read_csv(file_path)
X, y = df["text"].to_numpy(), df["spam"].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=29, stratify=y
)


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


# ENGLISH_STOP_WORDS is a frozenset; it is passed as a list, which is the
# container type the stop_words argument is documented to take.
english_stop_words = list(ENGLISH_STOP_WORDS)

# (n-gram mode, stop words) combinations to evaluate
args = [("unigram", english_stop_words), ("bigram", english_stop_words), ("unigram", None), ("bigram", None)]
for arg in args:
    model = NaiveBayes(*arg)        # initializes the model
    model.fit(X_train, y_train)      # training
    y_predict = model.predict(X_test)
    # labels=[0, 1] keeps the matrix 2x2 even when one class never occurs,
    # so the four-way unpacking below cannot fail
    tn, fp, fn, tp = confusion_matrix(y_test, y_predict, labels=[0, 1]).ravel()

    # a metric whose denominator is zero is reported as 0.0 instead of nan
    acc = _safe_ratio(tp + tn, tn + fp + fn + tp)
    precision = _safe_ratio(tp, tp + fp)
    recall = _safe_ratio(tp, tp + fn)
    f1 = _safe_ratio(2 * recall * precision, recall + precision)
    print("Performance metrics with settings ngram=%s, stop_words=%s:" % (arg[0], arg[1] is not None))
    print("---------------------------")
    print("Accuracy:\t%f" % acc)
    print("Precision:\t%f" % precision)
    print("Recall:\t%f" % recall)
    print("F1 Score:\t%f" % f1)
    print("\n")

Performance metrics with settings ngram=unigram, stop_words=True:
---------------------------
Accuracy:	0.990401
Precision:	0.978182
Recall:	0.981752
F1 Score:	0.979964


Performance metrics with settings ngram=bigram, stop_words=True:
---------------------------
Accuracy:	0.990401
Precision:	0.992509
Recall:	0.967153
F1 Score:	0.979667


Performance metrics with settings ngram=unigram, stop_words=False:
---------------------------
Accuracy:	0.989529
Precision:	0.974638
Recall:	0.981752
F1 Score:	0.978182


Performance metrics with settings ngram=bigram, stop_words=False:
---------------------------
Accuracy:	0.990401
Precision:	0.996226
Recall:	0.963504
F1 Score:	0.979592


