## Spam Classifier
__Machine Learning & Data Science Assignment by Thöni Andreas__  
2024

---

#### Preprocessing Infrastructure
This section introduces the __EmailPreprocessor__ class, which is able to convert a raw email file into a stemmed and normalized string

In [6]:
from email import policy
from email.parser import BytesParser
from nltk.stem.snowball import SnowballStemmer
import re
from string import punctuation

class EmailPreprocessor:
    """
    class for processing emails, extracting email parts and applying regex and stemming
    """

    def __init__(self):
        pass

    def extract_email_parts(self, file_path):
        """
        Extracts the subject, sender, recipient and body of an email file
        arguments: file_path - path to the email file
        return: dictionary with the email parts
        """

        # Open the email file
        with open(file_path, 'rb') as file:
            # Parse the email using the default policy
            email_message = BytesParser(policy=policy.default).parse(file)
        
        # Extract headers, like subject, sender and recipient
        subject = email_message.get('Subject', '(No Subject)')
        sender = email_message.get('From', '(Unknown Sender)')
        recipient = email_message.get('To', '(Unknown Recipient)')
        
        # Extract body
        body = ""
        if email_message.get_body(preferencelist=('plain', 'html')):
            body_content = email_message.get_body(preferencelist=('plain', 'html'))
            body = body_content.get_content()  # Automatically decodes and gets the content
        
        # Return the email parts as a dictionary
        return {
            "Subject": subject,
            "From": sender,
            "To": recipient,
            "Body": body.strip()
        }

    def stem_and_regex(self, email_dict):
        """
        Applies stemming and regex operations to the email parts
        arguments: email_dict - dictionary with the email parts
        return: string with the processed email
        """

        # create one string from the email parts
        email_string = email_dict["Subject"] + " " + email_dict["From"] + " " + email_dict["To"] + " " + email_dict["Body"]

        # convert to lowercase
        email_string = email_string.lower()

        # perform regex operations
        # change email adresses to 'emailaddr'
        email_string = re.sub(r'\b[\w\.-]+@[\w\.-]+\.\w+\b', 'emailaddr', email_string)

        # change urls to 'httpaddr'
        email_string = re.sub(r'(http|https)://[^\s]*', 'httpaddr', email_string)

        # change time to 'time'
        email_string = re.sub(r'\b\d{1,2}:\d{1,2}(:\d{1,2})?\b', 'time', email_string)

        # change date to 'date'
        email_string = re.sub(r'\b\d{1,2}/\d{1,2}/\d{4}\b', 'date', email_string)

        # change dollar to 'dollar'
        email_string = re.sub(r'\$\S+', 'dollar', email_string)

        # change www to 'wwwaddr'
        email_string = re.sub(r'\bwww\.[^\s]*\b', 'wwwaddr', email_string)

        # change percentages to 'percent'
        email_string = re.sub(r'\b\d+%', 'percent', email_string)

        # change ip to 'ipaddr'
        email_string = re.sub(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', 'ipaddr', email_string)

        # change numbers to 'number'
        email_string = re.sub(r'\b\d+\b', 'number', email_string)

        # Remove itemization prefixes (bullets, numbered lists, etc.)
        email_string = re.sub(r'^\s*[\d\w][\.\)\-]\s*|[\u2022\u2219\u25CB·]\s*', '', email_string, flags=re.MULTILINE)

        # other processing tasks
        # remove punctuation
        email_string = email_string.translate(str.maketrans('', '', punctuation))
        email_string = email_string.replace('\n', ' ')
        email_string = email_string.replace('\t', ' ')

        # remove multiple spaces
        email_string = re.sub(r'\s+', ' ', email_string)

        # stem the words
        stemmer = SnowballStemmer('english')
        email_string = ' '.join([stemmer.stem(word) for word in email_string.split()])

        return email_string

    def process_email(self,email_file_path):
        """
        Process an email file, calls the extract_email_parts and stem_and_regex functions
        arguments: email_file_path - path to the email file
        return: string with the processed email
        """

        email_parts = self.extract_email_parts(email_file_path)
        processed = self.stem_and_regex(email_parts)

        return processed

#### Load and preprocess ham and spam emails

In [16]:
import os


def load_data_from_directories(ham_dir, spam_dir):
    """
    Load email data from given directories (ham and spam)
    arguments: ham_dir - path to the ham directory
               spam_dir - path to the spam directory
    return: emails - list of email strings
            labels - list of labels (0 for ham, 1 for spam)
    """

    # Create an instance of the EmailPreprocessor class
    preprocessor = EmailPreprocessor()

    # Initialize lists to store the emails and labels
    emails = []
    labels = []

    # Load ham emails
    for file_name in os.listdir(ham_dir):
        file_path = os.path.join(ham_dir, file_name)
        try:
            email = preprocessor.process_email(file_path)
            emails.append(email)
            labels.append(0)  # 0 for ham
        except:
            print(f"Error processing file {file_path}")

    # Load spam emails
    for file_name in os.listdir(spam_dir):
        file_path = os.path.join(spam_dir, file_name)
        try:
            email = preprocessor.process_email(file_path)
            emails.append(email)
            labels.append(1)  # 1 for spam
        except:
            print(f"Error processing file {file_path}")
    
    return emails, labels

# use the function to load the data
ham_dir = "./datasets/20021010_easy_ham/easy_ham"
spam_dir = "./datasets/20021010_spam/spam"

emails, labels = load_data_from_directories(ham_dir, spam_dir)

print(f"Number of emails loaded: {len(emails)}")

Error processing file ./datasets/20021010_spam/spam\0226.409b6577c79d85773d50cb37fde4ba79
Error processing file ./datasets/20021010_spam/spam\0329.5c22249fa35fff050675e7df4433b89f
Error processing file ./datasets/20021010_spam/spam\0399.b9eab4251d9263129290cf7fc2aa4c7a
Number of emails loaded: 3049


#### Vectorize the data using a TfidfVectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', max_features=2000)

X = vectorizer.fit_transform(emails).toarray()
y = labels

print(X.shape)

(3049, 2000)


#### Splitting into training and test set

In [22]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

---
#### Try different classifiers

In [20]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

__Naive Bayes model__ (is apparently often used for spam classifiers)

In [23]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Precision: {precision_score(y_test, y_pred)}")
print(f"Recall: {recall_score(y_test, y_pred)}")

Accuracy: 0.9754098360655737
Precision: 0.9767441860465116
Recall: 0.865979381443299
