In [66]:
### Libraries ###

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import email
import os
import string
import re
from collections import Counter

## Get the data

In [69]:
# Load the data

df = pd.read_csv("labels", sep=' ', names=['label', 'email_text'])
print("Size of the data:", df.shape)
df.head()

Size of the data: (37822, 2)


Unnamed: 0,label,email_text
0,ham,../data/000/000
1,spam,../data/000/001
2,spam,../data/000/002
3,ham,../data/000/003
4,spam,../data/000/004


In [71]:
# counting spam and ham
df.label.value_counts()

label
spam    24912
ham     12910
Name: count, dtype: int64

### Converting the labels to numerical values (0 - ham, 1 - spam)

In [74]:
df['numerical_label'] = df.label.map({'ham':0, 'spam':1})
print(df.shape)
df.head()

(37822, 3)


Unnamed: 0,label,email_text,numerical_label
0,ham,../data/000/000,0
1,spam,../data/000/001,1
2,spam,../data/000/002,1
3,ham,../data/000/003,0
4,spam,../data/000/004,1


## Extracting the message from each email

In [77]:
# Directory containing the email data
directory = "data"
email_messages = []
skip_first = True  # Flag to skip the first email

# Walk through the directory and process each file
for root, d_names, f_names in os.walk(directory):
    for file in f_names:
        # Construct the file path
        file_path = os.path.join(root, file)
        
        with open(file_path, 'rb') as f:
            # Parse the email message
            msg = email.message_from_binary_file(f)
            body = ""
            
            # Skip the first email
            if skip_first:
                skip_first = False  # Mark that the first email has been skipped
                continue  # Skip this iteration

            # Extract the body of the email
            if msg.is_multipart():
                for payload in msg.get_payload():
                    body = payload.get_payload()
            # if it is not multipart, immediately get the message
            else:
                body = msg.get_payload()
 
            # Add the extracted message to the list
            email_messages.append(body)

In [78]:
# Add the extracted messages to the dataframe under the 'email_text' column
df['email_text'] = email_messages

# Display the updated dataframe
df.head()

Unnamed: 0,label,email_text,numerical_label
0,ham,The mailing list I queried about a few weeks a...,0
1,spam,\n ...,1
2,spam,Academic Qualifications available from prestig...,1
3,ham,Greetings all. This is to verify your subscri...,0
4,spam,"<html>\n<head>\n<meta http-equiv=""Content-Lang...",1


## Splitting the dataset into training set (ham), training set (spam), and test set.

In [82]:
# Splitting the dataset into a training and test set
train_set = df[:21300]
test_set = df[21300:]

# group by label
full_train_set = train_set.groupby('label')
# training set for ham, training set for spam
train_ham = full_train_set.get_group('ham')
train_spam = full_train_set.get_group('spam')

#### Training set for ham

In [85]:
print("Size of training set for ham:", train_ham.shape)
train_ham.head()

Size of training set for ham: (7523, 3)


Unnamed: 0,label,email_text,numerical_label
0,ham,The mailing list I queried about a few weeks a...,0
3,ham,Greetings all. This is to verify your subscri...,0
5,ham,"It's quiet. Too quiet. Well, how about a str...",0
6,ham,It's working here. I have departed almost tot...,0
10,ham,Greetings all. This is a mass acknowledgement...,0


#### Training set for spam

In [88]:
print("Size of training set for spam:", train_spam.shape)
train_spam.head()

Size of training set for spam: (13777, 3)


Unnamed: 0,label,email_text,numerical_label
1,spam,\n ...,1
2,spam,Academic Qualifications available from prestig...,1
4,spam,"<html>\n<head>\n<meta http-equiv=""Content-Lang...",1
7,spam,From NBC Today Show:\n\nIt's the look everyone...,1
8,spam,The OIL sector is going crazy. This is our wee...,1


#### Test set

In [91]:
print("Size of testing set:", test_set.shape)
test_set.head()

Size of testing set: (16522, 3)


Unnamed: 0,label,email_text,numerical_label
21300,spam,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 Tr...",1
21301,ham,\n There are several things you can use to p...,0
21302,spam,\nBest offer of the month:\n\nViggra - $76.95\...,1
21303,spam,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 Tr...",1
21304,spam,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.01 T...",1


 # Cleaning the dataset

####  Dropping the alphanumeric characters, punctuation marks

In [95]:
# Function to clean the email_text
def clean_email(email_text, min_word_length=3):
    # Convert to lowercase
    email_text = str(email_text).lower()
    
    # Remove HTML tags using a regular expression
    email_text = re.sub(r'<[^>]+>', '', email_text)
    
    # Remove escape characters
    email_text = re.sub(r'(\x9B|\x1B\[)[0-?]*[ -\/]*[@-~]', '', email_text)
    
    # Remove all non-alphabetic characters and digits
    email_text = re.sub(r'[^a-z]+', ' ', email_text)
    
    # Remove stop words and words shorter than the min_word_length
    clean_text = ' '.join(
        word for word in email_text.split() 
        if len(word) >= min_word_length
    )
    
    return clean_text

### Creating a new dataframe with cleaned emails

In [98]:
# Clean the email_text column using the clean_email function
clean_df = df.copy()
clean_df['email_text'] = df['email_text'].apply(clean_email)

# Display the updated DataFrame
clean_df.head(10)

Unnamed: 0,label,email_text,numerical_label
0,ham,the mailing list queried about few weeks ago n...,0
1,spam,luxury watches buy your own rolex for only rol...,1
2,spam,academic qualifications available from prestig...,1
3,ham,greetings all this verify your subscription th...,0
4,spam,guyton sheena nbsp nbsp nbsp will help you get...,1
5,ham,quiet too quiet well how about straw poll then...,0
6,ham,working here have departed almost totally from...,0
7,spam,from nbc today show the look everyone wants bo...,1
8,spam,the oil sector going crazy this our weekly gif...,1
9,spam,little magic perfect weekends http othxu rzfzw...,1


### Splitting the clean df

In [100]:
# Splitting the dataset into a training and test set
cleaned_train_set = clean_df[:21300]
cleaned_test_set = clean_df[21300:]

# group by label
cleaned_full_train_set = cleaned_train_set.groupby('label')
# training set for ham, training set for spam
cleaned_train_ham = cleaned_full_train_set.get_group('ham')
cleaned_train_spam = cleaned_full_train_set.get_group('spam')

In [102]:
cleaned_train_ham.head()

Unnamed: 0,label,email_text,numerical_label
0,ham,the mailing list queried about few weeks ago n...,0
3,ham,greetings all this verify your subscription th...,0
5,ham,quiet too quiet well how about straw poll then...,0
6,ham,working here have departed almost totally from...,0
10,ham,greetings all this mass acknowledgement everyo...,0


In [104]:
cleaned_train_spam.head()

Unnamed: 0,label,email_text,numerical_label
1,spam,luxury watches buy your own rolex for only rol...,1
2,spam,academic qualifications available from prestig...,1
4,spam,guyton sheena nbsp nbsp nbsp will help you get...,1
7,spam,from nbc today show the look everyone wants bo...,1
8,spam,the oil sector going crazy this our weekly gif...,1


In [106]:
cleaned_test_set.head()

Unnamed: 0,label,email_text,numerical_label
21300,spam,,1
21301,ham,there are several things you can use perform y...,0
21302,spam,best offer the month viggra ialis vaiium naax ...,1
21303,spam,home wne nbsp your doesn matter you real and w...,1
21304,spam,body font size color font family verdana sans ...,1


In [108]:
clean_df[21300:21305]

Unnamed: 0,label,email_text,numerical_label
21300,spam,,1
21301,ham,there are several things you can use perform y...,0
21302,spam,best offer the month viggra ialis vaiium naax ...,1
21303,spam,home wne nbsp your doesn matter you real and w...,1
21304,spam,body font size color font family verdana sans ...,1


### Extracting the list of unique words from the training set along with its summed number of occurences from the ham and spam set.

In [28]:
# Combine all email messages from the training set for vocabulary creation
all_messages = pd.concat([cleaned_train_ham['email_text'], cleaned_train_spam['email_text']])

# Tokenize messages and create a vocabulary
def tokenize(text):
    # Remove punctuation and non-alphabetic characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Keep only letters and spaces
    return text.lower().split()  # Tokenize and convert to lowercase

# Tokenize all messages
all_tokens = all_messages.apply(tokenize).explode()

# Create a Counter to find the most common words
word_counts = Counter(all_tokens)
# Get the 10,000 most common words
vocabulary = [word for word, _ in word_counts.most_common(10000)]

# Create feature matrices for ham and spam training sets
def create_feature_matrix(dataframe, vocabulary):
    matrix = np.zeros((len(dataframe), len(vocabulary)), dtype=int)
    
    for i, message in enumerate(dataframe['email_text']):
        tokens = tokenize(message)
        for token in tokens:
            if token in vocabulary:
                matrix[i, vocabulary.index(token)] = 1  # Set 1 for word presence
                
    return matrix

# Create feature matrices
ham_feature_matrix = create_feature_matrix(cleaned_train_ham, vocabulary)
spam_feature_matrix = create_feature_matrix(cleaned_train_spam, vocabulary)

# Convert feature matrices to DataFrames for easier viewing
ham_matrix_df = pd.DataFrame(ham_feature_matrix, columns=vocabulary)
spam_matrix_df = pd.DataFrame(spam_feature_matrix, columns=vocabulary)

### Feature Matrix for the ham training data set

In [30]:
ham_matrix_df

Unnamed: 0,the,and,nbsp,you,for,that,this,with,from,have,...,swapped,slope,defect,tops,vos,cares,aaaaaaaaaaaa,receivers,wannabe,ruby
0,1,1,0,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7518,1,1,0,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
7519,1,1,0,1,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
7520,1,1,0,1,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7521,1,1,0,0,1,1,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0


### Feature Matrix for the spam training data set

In [32]:
spam_matrix_df

Unnamed: 0,the,and,nbsp,you,for,that,this,with,from,have,...,swapped,slope,defect,tops,vos,cares,aaaaaaaaaaaa,receivers,wannabe,ruby
0,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,0,1,1,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,1,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1,0,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13772,1,0,0,1,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
13773,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13774,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13775,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Computing the priors

In [34]:
# Getting the count of ham and spam emails and the total count of email in the training set
ham_count = len(cleaned_train_ham)
spam_count = len(cleaned_train_spam)
email_count = len(cleaned_train_set)

# probability of ham
prior_ham = ham_count/email_count
print("Prior probabilities for ham: ", prior_ham)

# probability of spam
prior_spam = spam_count/email_count
print("Prior probabilities for spam: ", prior_spam)

Prior probabilities for ham:  0.3531924882629108
Prior probabilities for spam:  0.6468075117370892


## Computing the likelihood

#### count the frequency of each word in the vocabulary

In [37]:
# Count occurrences of each word in ham and spam training sets
def count_word_occurrences(dataframe, vocabulary):
    counts = np.zeros(len(vocabulary), dtype=int)
    
    for message in dataframe['email_text']:
        tokens = tokenize(message)
        for token in tokens:
            if token in vocabulary:
                counts[vocabulary.index(token)] += 1
                
    return counts

#### Computing the likelihood of each word in the vocabulary

In [39]:
# Count occurrences for ham and spam
ham_word_counts = count_word_occurrences(cleaned_train_ham, vocabulary)
spam_word_counts = count_word_occurrences(cleaned_train_spam, vocabulary)

# Calculate total counts for ham and spam
total_ham_words = ham_word_counts.sum()
total_spam_words = spam_word_counts.sum()

# Apply Laplace Smoothing
lambda_value = 1
vocabulary_size = len(vocabulary)

# Calculate the likelihoods with Laplace smoothing
ham_likelihoods = (ham_word_counts + lambda_value) / (total_ham_words + lambda_value * vocabulary_size)
spam_likelihoods = (spam_word_counts + lambda_value) / (total_spam_words + lambda_value * vocabulary_size)

# Create a DataFrame for easier viewing
likelihoods_df = pd.DataFrame({
    'word': vocabulary,
    'P(word|ham)': ham_likelihoods,
    'P(word|spam)': spam_likelihoods
})

# Display the resulting DataFrame
likelihoods_df  # Display the likelihoods DataFrame

Unnamed: 0,word,P(word|ham),P(word|spam)
0,the,0.065061,3.081193e-02
1,and,0.024675,2.473797e-02
2,nbsp,0.004085,2.590862e-02
3,you,0.011367,1.336729e-02
4,for,0.014911,8.006348e-03
...,...,...,...
9995,cares,0.000012,6.643047e-06
9996,aaaaaaaaaaaa,0.000018,7.381163e-07
9997,receivers,0.000018,7.381163e-07
9998,wannabe,0.000012,6.643047e-06


## Classifying the emails

#### Computing the probabilities of ham or spam given the document (using log)

In [42]:
# Calculate log prior probabilities
log_prior_ham = np.log(prior_ham)
log_prior_spam = np.log(prior_spam)

# Classify emails in the test set
def classify_email(email_text, vocabulary, ham_likelihoods, spam_likelihoods, log_prior_ham, log_prior_spam):
    # Tokenize the email text
    tokens = tokenize(email_text)
    
    # Initialize log probabilities
    log_prob_ham = log_prior_ham
    log_prob_spam = log_prior_spam
    
    # Calculate log probabilities for each word in the vocabulary
    for word in vocabulary:
        if word in tokens:
            log_prob_ham += np.log(ham_likelihoods[vocabulary.index(word)])
            log_prob_spam += np.log(spam_likelihoods[vocabulary.index(word)])
    
    return log_prob_ham, log_prob_spam

# Create a DataFrame to store original and predicted labels
predictions = []

for _, row in cleaned_train_set.iterrows():
    email_text = row['email_text']
    log_prob_ham, log_prob_spam = classify_email(email_text, vocabulary, ham_likelihoods, spam_likelihoods, log_prior_ham, log_prior_spam)
    
    # Classify based on log probabilities
    predicted_label = 'spam' if log_prob_spam > log_prob_ham else 'ham'
    predictions.append(predicted_label)

# Create DataFrame with original labels and predicted labels
results_df = pd.DataFrame({
    'email_text': cleaned_train_set['email_text'].values,
    'original_label': cleaned_train_set['label'].values,
    'predicted_label': predictions
})

# Display the results
results_df 


Unnamed: 0,email_text,original_label,predicted_label
0,the mailing list queried about few weeks ago n...,ham,ham
1,luxury watches buy your own rolex for only rol...,spam,spam
2,academic qualifications available from prestig...,spam,spam
3,greetings all this verify your subscription th...,ham,ham
4,guyton sheena nbsp nbsp nbsp will help you get...,spam,spam
...,...,...,...
21295,http get high biz bld xin walla com,spam,spam
21296,body font size color font family verdana sans ...,spam,spam
21297,lovechocolat blovechocolatebar http pan model ...,spam,spam
21298,have mounted the infrared demodulator the but ...,ham,ham


## Testing the model on the test set

In [44]:
# Classify each email in the test set
test_predictions = []

for _, row in cleaned_test_set.iterrows():
    email_text = row['email_text']
    log_prob_ham, log_prob_spam = classify_email(email_text, vocabulary, ham_likelihoods, spam_likelihoods, log_prior_ham, log_prior_spam)
    
    # Classify based on log probabilities
    predicted_label = 'spam' if log_prob_spam > log_prob_ham else 'ham'
    test_predictions.append(predicted_label)

# Create DataFrame with original labels and predicted labels for the test set
test_results_df = pd.DataFrame({
    'email_text': cleaned_test_set['email_text'].values,
    'original_label': cleaned_test_set['label'].values,
    'predicted_label': test_predictions
})

# Display the test results
test_results_df

Unnamed: 0,email_text,original_label,predicted_label
0,,spam,spam
1,there are several things you can use perform y...,ham,ham
2,best offer the month viggra ialis vaiium naax ...,spam,spam
3,home wne nbsp your doesn matter you real and w...,spam,spam
4,body font size color font family verdana sans ...,spam,spam
...,...,...,...
16517,great news expec ted infinex ventures inc infx...,spam,spam
16518,the oil sector going crazy this our weekly gif...,spam,spam
16519,http vdtobj docscan info suffering from pain d...,spam,spam
16520,you want for prosperous future increased money...,spam,spam


## Performance Evaluation

In [46]:
# Initialize counts for TP, TN, FP, and FN
TP = TN = FP = FN = 0

for _, row in test_results_df.iterrows():
    original_label = row['original_label']
    predicted_label = row['predicted_label']
    
    if original_label == 'spam' and predicted_label == 'spam':
        TP += 1  # True Positive
    elif original_label == 'ham' and predicted_label == 'ham':
        TN += 1  # True Negative
    elif original_label == 'ham' and predicted_label == 'spam':
        FP += 1  # False Positive
    elif original_label == 'spam' and predicted_label == 'ham':
        FN += 1  # False Negative

# Calculate evaluation metrics
accuracy = (TP + TN) / (TP + TN + FP + FN)
recall = TP / (TP + FN) if (TP + FN) > 0 else 0
precision = TP / (TP + FP) if (TP + FP) > 0 else 0

# False Positive Rate and False Negative Rate
false_positive_rate = FP
false_negative_rate = FN

# Output the results
print(f"True Positives (TP): {TP}")
print(f"True Negatives (TN): {TN}")
print(f"False Positives (FP): {FP}")
print(f"False Negatives (FN): {FN}")
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")


True Positives (TP): 9884
True Negatives (TN): 5087
False Positives (FP): 300
False Negatives (FN): 1251
Accuracy: 90.61%
Recall: 88.77%
Precision: 97.05%
