# Third Question: Spam Email
## Ali Borzoozadeh: 810102410

### A. Preprocessing

In [1]:
import pandas as pd
import numpy as np
import nltk
import string
import re

emails = pd.read_csv('emails.csv')

def preprocess_text(text):
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    # Join tokens back to a single string
    text = ' '.join(tokens)
    return text

emails['text'] = emails['text'].apply(preprocess_text)

### B. Dividing into training and testing data

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# feature
X = emails['text']
# label
y = emails['spam']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### C. Building BoW model

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Create a CountVectorizer instance
vectorizer = CountVectorizer()

# Fit the vectorizer on the training data and transform it into a BoW matrix
X_train_bow = vectorizer.fit_transform(X_train)

# Get the feature names (unique words)
feature_names = vectorizer.get_feature_names_out()

# Convert BoW matrix to DataFrame for easier manipulation
bow_df = pd.DataFrame(X_train_bow.toarray(), columns=feature_names)

# Add the labels to the DataFrame
bow_df['spam'] = y_train.values

In [4]:
from collections import defaultdict

# Calculate P(y)
P_spam = y_train.mean()
P_non_spam = 1 - P_spam

# Calculate P(x_i | y)
spam_bow = bow_df[bow_df['spam'] == 1].drop(columns='spam')
non_spam_bow = bow_df[bow_df['spam'] == 0].drop(columns='spam')

word_counts_spam = spam_bow.sum(axis=0) + 1  # Add-one smoothing
word_counts_non_spam = non_spam_bow.sum(axis=0) + 1  # Add-one smoothing

total_spam_words = word_counts_spam.sum() + len(feature_names)  # Add-one smoothing
total_non_spam_words = word_counts_non_spam.sum() + len(feature_names)  # Add-one smoothing

P_xi_given_spam = word_counts_spam / total_spam_words
P_xi_given_non_spam = word_counts_non_spam / total_non_spam_words

print(f"P(y=spam): {P_spam}")
print(f"P(y=non-spam): {P_non_spam}")
print(f"P(x_i | y=spam): {P_xi_given_spam}")
print(f"P(x_i | y=non-spam): {P_xi_given_non_spam}")

P(y=spam): 0.23526844172850284
P(y=non-spam): 0.7647315582714972
P(x_i | y=spam): aa          0.000004
aaa         0.000004
aadedeji    0.000004
aagrawal    0.000004
aal         0.000008
              ...   
zygoma      0.000008
zymg        0.000015
zzn         0.000008
zzncacst    0.000004
zzzz        0.000027
Length: 30491, dtype: float64
P(x_i | y=non-spam): aa          0.000090
aaa         0.000019
aadedeji    0.000002
aagrawal    0.000002
aal         0.000001
              ...   
zygoma      0.000001
zymg        0.000001
zzn         0.000001
zzncacst    0.000004
zzzz        0.000001
Length: 30491, dtype: float64


### D. Prediction using Bayes' rule

In [5]:
# Convert test data to BoW matrix
X_test_bow = vectorizer.transform(X_test)

# Ensure arrays have the same length as feature names
P_xi_given_spam = P_xi_given_spam.reindex(feature_names, fill_value=1/total_spam_words)
P_xi_given_non_spam = P_xi_given_non_spam.reindex(feature_names, fill_value=1/total_non_spam_words)

# Predict the class for each email in the test set
predictions = []
for i in range(X_test_bow.shape[0]):
    email_vector = X_test_bow[i].toarray()[0]
    P_spam_email = P_spam * np.prod(np.power(P_xi_given_spam, email_vector))
    P_non_spam_email = P_non_spam * np.prod(np.power(P_xi_given_non_spam, email_vector))
    if P_spam_email > P_non_spam_email:
        predictions.append(1)
    else:
        predictions.append(0)

# Calculate accuracy
accuracy = np.mean(np.array(predictions) == y_test)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 87.87%


### Question 1:
Laplace Smoothing, also known as add-one smoothing, is a technique used to handle the problem of zero probabilities. It works by adding one to the count of each word in the vocabulary, ensuring that no word has a zero probability.
In this project we added + 1 to the word counts for both spam and non-spam emails to implement Laplace Smoothing (Part C).

### Question 2:
1. Log probabilities: Using logarithms to compute probabilities helps avoid numerical underflow.
2. Log likelihood: For each email in the test set, calculate the log likelihoods for both spam and non-spam classes.

Now we can calculate the accuracy and compare it to the former approach.

In [6]:
# Convert test data to BoW matrix
X_test_bow = vectorizer.transform(X_test)

# Ensure log probabilities arrays have the same length as feature names
log_P_xi_given_spam = np.log(P_xi_given_spam.reindex(feature_names, fill_value=1/total_spam_words))
log_P_xi_given_non_spam = np.log(P_xi_given_non_spam.reindex(feature_names, fill_value=1/total_non_spam_words))

# Predict the class for each email in the test set
log_P_spam = np.log(P_spam)
log_P_non_spam = np.log(P_non_spam)

predictions = []
for i in range(X_test_bow.shape[0]):
    email_vector = X_test_bow[i].toarray()[0]
    log_likelihood_spam = log_P_spam + (email_vector * log_P_xi_given_spam).sum()
    log_likelihood_non_spam = log_P_non_spam + (email_vector * log_P_xi_given_non_spam).sum()
    if log_likelihood_spam > log_likelihood_non_spam:
        predictions.append(1)
    else:
        predictions.append(0)

# Calculate accuracy
accuracy = np.mean(np.array(predictions) == y_test)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 98.08%


#### As we can see, we reached to 98.08% from 87.87%. This approach should be much faster and provide an accurate estimate of the model's performance.

### Question 3:
To implement this, we need to preprocess the text to remove stop words from the first and rebuild the BoW‌ model without stop words.

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import stopwords
import string
import re

emails = pd.read_csv('emails.csv')

# Define stop words
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    # Remove stop words
    tokens = [word for word in tokens if word not in stop_words]
    text = ' '.join(tokens)
    return text

emails['text'] = emails['text'].apply(preprocess_text)

# feature
X = emails['text']
# label
y = emails['spam']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a CountVectorizer instance
vectorizer = CountVectorizer()

# Fit the vectorizer on the training data and transform it into a BoW matrix
X_train_bow = vectorizer.fit_transform(X_train)

# Get the feature names (unique words)
feature_names = vectorizer.get_feature_names_out()

# Convert BoW matrix to DataFrame for easier manipulation
bow_df = pd.DataFrame(X_train_bow.toarray(), columns=feature_names)

# Add the labels to the DataFrame
bow_df['spam'] = y_train.values

# Calculate P(y)
P_spam = y_train.mean()
P_non_spam = 1 - P_spam

# Calculate P(x_i | y)
spam_bow = bow_df[bow_df['spam'] == 1].drop(columns='spam')
non_spam_bow = bow_df[bow_df['spam'] == 0].drop(columns='spam')

# Add-one smoothing
word_counts_spam = spam_bow.sum(axis=0) + 1  # Add-one smoothing
word_counts_non_spam = non_spam_bow.sum(axis=0) + 1  # Add-one smoothing

# Add-one smoothing to total words
total_spam_words = word_counts_spam.sum() + len(feature_names)  # Add-one smoothing
total_non_spam_words = word_counts_non_spam.sum() + len(feature_names)  # Add-one smoothing

P_xi_given_spam = word_counts_spam / total_spam_words
P_xi_given_non_spam = word_counts_non_spam / total_non_spam_words

In [8]:
# Convert test data to BoW matrix
X_test_bow = vectorizer.transform(X_test)

# Ensure arrays have the same length as feature names
P_xi_given_spam = P_xi_given_spam.reindex(feature_names, fill_value=1/total_spam_words)
P_xi_given_non_spam = P_xi_given_non_spam.reindex(feature_names, fill_value=1/total_non_spam_words)

# Predict the class for each email in the test set
predictions = []
for i in range(X_test_bow.shape[0]):
    email_vector = X_test_bow[i].toarray()[0]
    P_spam_email = P_spam * np.prod(np.power(P_xi_given_spam, email_vector))
    P_non_spam_email = P_non_spam * np.prod(np.power(P_xi_given_non_spam, email_vector))
    if P_spam_email > P_non_spam_email:
        predictions.append(1)
    else:
        predictions.append(0)

# Calculate accuracy
accuracy = np.mean(np.array(predictions) == y_test)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 91.27%


#### In this implementation we didn't use logarithms. Now let's use!

In [9]:
# Convert test data to BoW matrix
X_test_bow = vectorizer.transform(X_test)

# Ensure log probabilities arrays have the same length as feature names
log_P_xi_given_spam = np.log(P_xi_given_spam.reindex(feature_names, fill_value=1/total_spam_words))
log_P_xi_given_non_spam = np.log(P_xi_given_non_spam.reindex(feature_names, fill_value=1/total_non_spam_words))

# Predict the class for each email in the test set
log_P_spam = np.log(P_spam)
log_P_non_spam = np.log(P_non_spam)

predictions = []
for i in range(X_test_bow.shape[0]):
    email_vector = X_test_bow[i].toarray()[0]
    log_likelihood_spam = log_P_spam + (email_vector * log_P_xi_given_spam).sum()
    log_likelihood_non_spam = log_P_non_spam + (email_vector * log_P_xi_given_non_spam).sum()
    if log_likelihood_spam > log_likelihood_non_spam:
        predictions.append(1)
    else:
        predictions.append(0)

# Calculate accuracy
accuracy = np.mean(np.array(predictions) == y_test)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 98.34%


#### Now we see that the accuracy is higher than ever by using logarithms and removing stop words!