# Spam Detector
The idea is to implement and train a Naive Bayes classifier that distinguishes spam from ham (i.e., non-spam) emails.

- The data contains over 5500 texts from emails with their corresponding labels

In [13]:
import pandas as pd

In [14]:
# Load the labelled dataset (a 'text' column and a 'target' column holding 'spam'/'ham') into a DataFrame
emails = pd.read_csv('data/emails.csv')

In [15]:
# Preview the first 5 rows of the raw data
emails.head(5)

Unnamed: 0,text,target
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham


## Preprocessing

In [16]:
def process_email(text):
    """
    Turn an email body into its collection of distinct, lower-cased tokens.

    The text is lower-cased and split on whitespace; punctuation stays attached
    to the tokens. Duplicates are dropped, so the order of the returned list
    is not meaningful.

    Parameters:
    - text (str): The email text to be processed.

    Returns:
    - list: The distinct whitespace-separated tokens of the lower-cased text.
    """
    tokens = text.lower().split()
    distinct_tokens = set(tokens)
    return list(distinct_tokens)

# Add a 'words' column holding each email's unique, lower-cased, whitespace-split tokens
emails['words'] = emails['text'].apply(process_email)

# Show the first 5 rows to inspect the new column
emails.head(5)

Unnamed: 0,text,target,words
0,"Go until jurong point, crazy.. Available only ...",ham,"[amore, wat..., cine, la, until, buffet..., go..."
1,Ok lar... Joking wif u oni...,ham,"[lar..., wif, joking, ok, oni..., u]"
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam,"[wkly, 08452810075over18's, 2005., may, 87121,..."
3,U dun say so early hor... U c already then say...,ham,"[hor..., c, early, say, so, say..., then, alre..."
4,"Nah I don't think he goes to usf, he lives aro...",ham,"[around, lives, think, he, don't, though, usf,..."


## Frequency of a word in each class

To compute the frequency of each word in the dataset, we define the `word_freq_per_class` function below. It receives the email dataframe as input and returns a dictionary whose keys are the words found in the emails and whose values are nested dictionaries recording how many `spam` and how many `ham` emails contain that word (each word is counted at most once per email, because the `words` column holds unique words).

In [18]:
def word_freq_per_class(df):
    """
    Count, for every word, how many spam and how many ham emails contain it.

    Args:
        df (pandas.DataFrame): Email data with a 'words' column (an iterable of
        words for each email) and a 'target' column holding the class label.

    Returns:
        dict: Maps each word to a nested dictionary {'spam': int, 'ham': int}
        with the number of rows of each class whose 'words' include it.
        A word that only occurs in rows labelled neither 'spam' nor 'ham'
        still gets an entry, with both counts at 0.
    """
    counts_by_word = {}

    for _, row in df.iterrows():
        label = row['target']
        for word in row['words']:
            # Create the zeroed entry on first sight, otherwise reuse the existing one
            tally = counts_by_word.setdefault(word, {'spam': 0, 'ham': 0})

            if label == 'ham':
                tally['ham'] += 1
            elif label == 'spam':
                tally['spam'] += 1

    return counts_by_word

In [25]:
# Build the per-word class counts for the full corpus, then spot-check a few typical spam words
word_freq = word_freq_per_class(emails)
keywords = ['winner', 'sale', 'prize', 'money', 'urgent', 'lottery']

for keyword in keywords:
    if keyword in word_freq:
        print(f"Frequency in both classes for word '{keyword}': {word_freq[keyword]}")
    else:
        print(f"Word '{keyword}' not in corpus")

Frequency in both classes for word 'winner': {'spam': 11, 'ham': 0}
Frequency in both classes for word 'sale': {'spam': 2, 'ham': 2}
Frequency in both classes for word 'prize': {'spam': 54, 'ham': 0}
Frequency in both classes for word 'money': {'spam': 3, 'ham': 37}
Frequency in both classes for word 'urgent': {'spam': 15, 'ham': 2}
Word 'lottery' not in corpus


## Frequency of classes

To compute the frequency of each class in the dataset, we define the `class_frequencies` function below. It receives the email dataframe as input and returns a dictionary containing the number of spam and ham emails.

In [26]:
def class_frequencies(df):
    """
    Count how many rows of a DataFrame belong to each class.

    Args:
        df (DataFrame): The input DataFrame; its 'target' column holds the
        class label ('spam' or 'ham') of each row.

    Returns:
        dict: {'spam': n_spam, 'ham': n_ham}, where each value is the number
            of rows whose 'target' equals that label. Rows with any other
            label are not counted.
    """
    labels = df["target"]

    # Summing a boolean mask counts its True entries; int() keeps plain Python ints
    spam_count = int((labels == 'spam').sum())
    ham_count = int((labels == 'ham').sum())

    return {"spam": spam_count, "ham": ham_count}

In [27]:
# Sanity check: class counts and proportions on the first 10 rows only
class_freq = class_frequencies(emails[:10])
print(f"Small dataset:\n\nThe frequencies for each class are {class_freq}\n")
print(f"The proportion of spam in the dataset is: {100*class_freq['spam']/len(emails[:10]):.2f}%\n")
print(f"The proportion of ham in the dataset is: {100*class_freq['ham']/len(emails[:10]):.2f}%\n")

# Recompute on the full dataset; `class_freq` is rebound, so from here on it holds the full-corpus counts
class_freq = class_frequencies(emails)
print(f"\nFull dataset:\n\nThe frequencies for each class are {class_freq}\n")
print(f"The proportion of spam in the dataset is: {100*class_freq['spam']/len(emails):.2f}%\n")
print(f"The proportion of ham in the dataset is: {100*class_freq['ham']/len(emails):.2f}%")

Small dataset:

The frequencies for each class are {'spam': 4, 'ham': 6}

The proportion of spam in the dataset is: 40.00%

The proportion of ham in the dataset is: 60.00%


Full dataset:

The frequencies for each class are {'spam': 747, 'ham': 4825}

The proportion of spam in the dataset is: 13.41%

The proportion of ham in the dataset is: 86.59%


## Naive Bayes for categorical features

The function `naive_bayes_classifier` receives any text as a parameter and returns the probability of that text belonging to the `spam` class. Notice that the function also receives the two dictionaries that were created in the previous sections, which means that this probability depends on the dataset used for training. With this in mind, if you submit a text containing only words that are not in the training dataset, the probability should be equal to the proportion of `spam` in the emails.

In [28]:
def naive_bayes_classifier(text, word_freq=word_freq, class_freq=class_freq):
    """
    Implements a naive Bayes classifier to determine the probability of an email being spam.

    The text is lower-cased and split on whitespace, and each distinct word is
    considered once. Words that are not keys of `word_freq` are ignored, so a
    text made only of unseen words gets the prior proportion of spam.

    Args:
        text (str): The input email text to classify.

        word_freq (dict): A dictionary containing word frequencies in the training corpus.
        The keys are words, and the values are dictionaries containing frequencies for 'spam' and 'ham' classes.

        class_freq (dict): A dictionary containing class frequencies in the training corpus.
        The keys are class labels ('spam' and 'ham'), and the values are the respective frequencies.

    Returns:
        float: The probability of the email being spam. When the words give no
        usable evidence for either class (both class scores are 0, e.g. one
        word was only ever seen in spam and another only in ham), the prior
        proportion of spam is returned instead of dividing 0 by 0.

    Raises:
        ZeroDivisionError: If a known word is found while one of the class
        counts in `class_freq` is 0, or if the prior is needed while both
        class counts are 0.
    """

    words = set(text.lower().split())

    # Running products of P(word | class) over the known words of the text
    likelihood_given_spam = 1.0
    likelihood_given_ham = 1.0

    for word in words:
        counts = word_freq.get(word)
        if counts is None:
            # Unseen words carry no information about either class
            continue
        likelihood_given_spam *= counts['spam'] / class_freq['spam']
        likelihood_given_ham *= counts['ham'] / class_freq['ham']

    # Weight each likelihood by its class count. The counts are proportional to the
    # priors, and the common factor 1/N cancels in the ratio below.
    spam_score = likelihood_given_spam * class_freq['spam']
    ham_score = likelihood_given_ham * class_freq['ham']

    evidence = spam_score + ham_score
    if evidence == 0:
        # Both classes were ruled out (contradictory zero counts or underflow):
        # fall back to the prior rather than failing on 0/0.
        return class_freq['spam'] / (class_freq['spam'] + class_freq['ham'])

    # Posterior probability of spam given the words in the email
    return spam_score / evidence

In [36]:
# Try the classifier on a ham-like message, a spam-like message, and one made only of unseen tokens
for msg in (
    "meet me at the lobby of the hotel at nine am",
    "congratulations! you are the winner of the mega money today",
    "9898 asjfkjfdj",
):
    print(f"Probability of spam for email '{msg}': {100*naive_bayes_classifier(msg):.2f}%\n")

Probability of spam for email 'meet me at the lobby of the hotel at nine am': 0.00%

Probability of spam for email 'congratulations! you are the winner of the mega money today': 100.00%

Probability of spam for email '9898 asjfkjfdj': 13.41%

