Yash Ajay Tapadiya : EX4

In [18]:
import os
import email
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report

In [19]:
# Loading the content in notebook, Spam and Ham emails
def load_emails(directory):
    emails = []
    for filename in os.listdir(directory):
        path = os.path.join(directory, filename)
        if os.path.isfile(path):
            with open(path, "rb") as f:
                msg = email.message_from_binary_file(f)
                payload = ""
                if msg.is_multipart():
                    for part in msg.walk():
                        if part.get_content_type() == "text/plain":
                            payload += part.get_payload(decode=True).decode(errors="ignore")
                else:
                    payload = msg.get_payload(decode=True).decode(errors="ignore")
                emails.append(payload)
    return emails

spam_emails = load_emails('/content/drive/MyDrive/20021010_spam/spam')
ham_emails = load_emails('/content/drive/MyDrive/20021010_easy_ham/easy_ham')

X = spam_emails + ham_emails
y = [1]*len(spam_emails) + [0]*len(ham_emails)

print("We have Spam emails:", len(spam_emails))
print("We have Ham emails:", len(ham_emails))

We have Spam emails: 501
We have Ham emails: 2551


In [20]:
# Cleaning the data REmoving headers, removing urls, etc
def preprocess_email(text,
                     lower=True,
                     remove_headers=True,
                     replace_urls=True,
                     replace_numbers=True,
                     remove_punctuation=True):

    if remove_headers:
        text = text.split("\n\n", 1)[-1]

    if lower:
        text = text.lower()

    if replace_urls:
        text = re.sub(r"http\S+|www\S+", " URL ", text)

    if replace_numbers:
        text = re.sub(r"\d+", " NUMBER ", text)

    if remove_punctuation:
        text = re.sub(r"[^\w\s]", " ", text)

    return text

X_cleaned = [preprocess_email(email) for email in X]


In [21]:
# Splitting data to be used by our models

X_train, X_test, y_train, y_test = train_test_split(
    X_cleaned, y, test_size=0.2, random_state=42, stratify=y
)

In [22]:
# Email to vector
vectorizer = CountVectorizer(binary=True, stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [23]:
# Naive Bayes
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb.fit(X_train_vec, y_train)

y_pred_nb = nb.predict(X_test_vec)

print("Naive Bayes Results:")
print(classification_report(y_test, y_pred_nb))

Naive Bayes Results:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       511
           1       0.97      0.73      0.83       100

    accuracy                           0.95       611
   macro avg       0.96      0.86      0.90       611
weighted avg       0.95      0.95      0.95       611



In [24]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_vec, y_train)

y_pred_lr = lr.predict(X_test_vec)

print("Logistic Regression Results:")
print(classification_report(y_test, y_pred_lr))


Logistic Regression Results:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97       511
           1       0.99      0.68      0.80       100

    accuracy                           0.95       611
   macro avg       0.96      0.84      0.89       611
weighted avg       0.95      0.95      0.94       611



In [25]:
#SVM
from sklearn.svm import LinearSVC

svm = LinearSVC()
svm.fit(X_train_vec, y_train)

y_pred_svm = svm.predict(X_test_vec)

print("SVM Results:")
print(classification_report(y_test, y_pred_svm))


SVM Results:
              precision    recall  f1-score   support

           0       0.98      0.95      0.96       511
           1       0.77      0.91      0.83       100

    accuracy                           0.94       611
   macro avg       0.88      0.93      0.90       611
weighted avg       0.95      0.94      0.94       611

