Machine Learning Project 1 - Benjamin Walmer

In [1]:
# Using numpy and pandas
import numpy as np
import pandas as pd

Reading in the training datasets from a public GitHub link

In [2]:
# Importing libraries for reading files from github
import os
import requests
import zipfile

# Downloading the full repository (zipped)
url = "https://github.com/benwalmer/Email_Classification/archive/refs/heads/main.zip"
zip_path = "repo.zip"
# The timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status() stops an HTTP error page from being written to disk as
# "repo.zip" (which would only fail later with an opaque BadZipFile).
r = requests.get(url, timeout=60)
r.raise_for_status()
with open(zip_path, "wb") as f:
    f.write(r.content)

# Unzipping the repository
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(".")
    # Take the repository folder from the archive's own top-level entries
    # instead of scanning the working directory, so an unrelated "*-main"
    # folder that happens to be present cannot be picked up by mistake.
    archive_roots = sorted({name.split("/")[0] for name in zip_ref.namelist()})
repo_folder = [d for d in archive_roots if d.endswith("-main")][0]

# Iterating over all zipped training datasets inside the repo.
# os.listdir() returns entries in arbitrary order, so every listing is sorted
# to make the row order of `data` (and any seeded split made from it later)
# reproducible across machines.
data = []
for item in sorted(os.listdir(repo_folder)):
    if item.endswith("train.zip"):
        inner_zip_path = os.path.join(repo_folder, item)

        # Extracting the dataset into the working directory
        with zipfile.ZipFile(inner_zip_path, 'r') as zip_ref:
            zip_ref.extractall(".")

        dataset_name = item.replace("_train.zip", "")
        # Special case for enron2 because its file path is stored differently
        if dataset_name == "enron2":
            base_dir = "train"
        else:
            base_dir = os.path.join(dataset_name, "train")

        # Loading the spam/ham emails as (text, label, dataset) tuples
        for label in ["spam", "ham"]:
            folder_path = os.path.join(base_dir, label)
            if os.path.exists(folder_path):
                for filename in sorted(os.listdir(folder_path)):
                    file_path = os.path.join(folder_path, filename)
                    if os.path.isfile(file_path):
                        # Bytes that are not valid UTF-8 are dropped rather than raising
                        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                            text = f.read()
                        data.append((text, label, dataset_name))



Splitting dataset into different training sets

In [3]:
# One frame with every training email, then one frame per Enron dataset.
df = pd.DataFrame(data, columns=["text", "label", "dataset"])

# For each dataset: keep its rows, renumber them from 0 and drop the
# now-constant 'dataset' column.
enron1 = (
    df[df['dataset'] == 'enron1']
    .reset_index(drop=True)
    .drop(columns=['dataset'])
)
enron2 = (
    df[df['dataset'] == 'enron2']
    .reset_index(drop=True)
    .drop(columns=['dataset'])
)
enron4 = (
    df[df['dataset'] == 'enron4']
    .reset_index(drop=True)
    .drop(columns=['dataset'])
)

Reading in the test datasets from a public GitHub link

In [4]:
# Iterating over all zipped test datasets inside the repo.
# os.listdir() returns entries in arbitrary order, so every listing is sorted
# to make the row order of `test_data` reproducible across machines.
test_data = []
for item in sorted(os.listdir(repo_folder)):
    if item.endswith("test.zip"):
        inner_zip_path = os.path.join(repo_folder, item)

        # Extracting the dataset into the working directory
        with zipfile.ZipFile(inner_zip_path, 'r') as zip_ref:
            zip_ref.extractall(".")

        dataset_name = item.replace("_test.zip", "")
        # Special case for enron2 because its file path is stored differently
        if dataset_name == "enron2":
            base_dir = "test"
        else:
            base_dir = os.path.join(dataset_name, "test")

        # Loading the spam/ham emails as (text, label, dataset) tuples
        for label in ["spam", "ham"]:
            folder_path = os.path.join(base_dir, label)
            if os.path.exists(folder_path):
                for filename in sorted(os.listdir(folder_path)):
                    file_path = os.path.join(folder_path, filename)
                    if os.path.isfile(file_path):
                        # Bytes that are not valid UTF-8 are dropped rather than raising
                        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                            text = f.read()
                        test_data.append((text, label, dataset_name))


Splitting dataset into different test sets

In [5]:
# One frame with every test email, then one frame per Enron dataset.
df_test = pd.DataFrame(test_data, columns=["text", "label", "dataset"])

# For each dataset: keep its rows, renumber them from 0 and drop the
# now-constant 'dataset' column.
enron1_test = (
    df_test[df_test['dataset'] == 'enron1']
    .reset_index(drop=True)
    .drop(columns=['dataset'])
)
enron2_test = (
    df_test[df_test['dataset'] == 'enron2']
    .reset_index(drop=True)
    .drop(columns=['dataset'])
)
enron4_test = (
    df_test[df_test['dataset'] == 'enron4']
    .reset_index(drop=True)
    .drop(columns=['dataset'])
)

Importing the NLTK text-processing library

In [6]:
# Download NLTK resources
# 'punkt' / 'punkt_tab' are tokenizer data (presumably what word_tokenize in the
# preprocessing cell relies on -- confirm against the installed NLTK version);
# 'stopwords' provides the English stop-word list loaded in the preprocessing cell.
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

Training Preprocessing

In [7]:
# Preprocessing Emails
import re
from collections import Counter

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# English stop words held in a set, so the `t not in stop_words` check in
# preprocess() is a hash lookup rather than a list scan.
stop_words = set(stopwords.words('english'))

# Step 1: Preprocess text
def preprocess(text):
    """Turn one raw email into a list of cleaned, lower-case word tokens.

    Steps, in order: lower-case the text; delete every span running from
    "forwarded" to the next "subject" and from "original" to the next
    "subject" (non-greedy, across lines); delete the standalone header words
    from/to/cc/subject/re; replace newlines with spaces; tokenize with
    ``word_tokenize``; strip every character outside a-z from each token; and
    drop tokens that end up empty or that are in ``stop_words``.

    Returns:
        list of str: the surviving tokens, in document order.
    """
    lowered = text.lower()

    # Strip forwarded / original-message header blocks up to their "subject"
    for header_span in (r'forwarded.*?subject', r'original.*?subject'):
        lowered = re.sub(header_span, '', lowered, flags=re.DOTALL)

    # Strip the remaining standalone header keywords
    lowered = re.sub(r'\b(from|to|cc|subject|re)\b', '', lowered)

    # Newlines become spaces before tokenizing
    flattened = lowered.replace("\n", " ")

    # Keep letters only; the stop-word test runs on the stripped token
    letters_only = (re.sub(r'[^a-z]', '', token) for token in word_tokenize(flattened))
    return [token for token in letters_only if token and token not in stop_words]

In [8]:
def buildvocab(df):
  """Preprocess every email in ``df['text']`` and collect the vocabulary.

  Returns:
      tuple: (Series of token lists produced by ``preprocess``,
              alphabetically sorted list of the distinct tokens).
  """
  processed_texts = df['text'].apply(preprocess)
  distinct_words = set()
  for token_list in processed_texts:
      distinct_words.update(token_list)
  return processed_texts, sorted(distinct_words)

In [9]:
def generate_df(processed_texts, vocab, labels):
  """Build bag-of-words and Bernoulli feature frames over ``vocab``.

  Args:
      processed_texts: iterable of token lists, one per document.
      vocab: ordered vocabulary; each word becomes one column.
      labels: values stored in the trailing 'label' column of both frames.

  Returns:
      tuple: (count frame, 0/1 presence frame). Both have one row per
      document and the columns ``vocab`` followed by 'label'.

  Note: the label is written with ``frame['label'] = labels``, so a
  vocabulary word that is itself spelled 'label' would have its column
  overwritten by the labels.
  """
  # Bag of words: how many times each vocabulary word occurs in the document
  tallies = (Counter(tokens) for tokens in processed_texts)
  count_rows = [[tally.get(term, 0) for term in vocab] for tally in tallies]
  vocab_bow = pd.DataFrame(count_rows, columns=vocab)
  vocab_bow['label'] = labels

  # Bernoulli: 1 when the vocabulary word occurs at least once, else 0
  seen_words = (set(tokens) for tokens in processed_texts)
  presence_rows = [[int(term in seen) for term in vocab] for seen in seen_words]
  vocab_ber = pd.DataFrame(presence_rows, columns=vocab)
  vocab_ber['label'] = labels

  return vocab_bow, vocab_ber

In [10]:
# Training data: the vocabulary is learned from each training set only
p1, e1_vocab = buildvocab(enron1)
enron1_bow_train, enron1_bernoulli_train = generate_df(
    processed_texts=p1, vocab=e1_vocab, labels=enron1['label']
)
p2, e2_vocab = buildvocab(enron2)
enron2_bow_train, enron2_bernoulli_train = generate_df(
    processed_texts=p2, vocab=e2_vocab, labels=enron2['label']
)
p4, e4_vocab = buildvocab(enron4)
enron4_bow_train, enron4_bernoulli_train = generate_df(
    processed_texts=p4, vocab=e4_vocab, labels=enron4['label']
)

# Test data: encoded over the matching training vocabulary
enron1_bow_test, enron1_bernoulli_test = generate_df(
    processed_texts=enron1_test['text'].apply(preprocess),
    vocab=e1_vocab,
    labels=enron1_test['label'],
)
enron2_bow_test, enron2_bernoulli_test = generate_df(
    processed_texts=enron2_test['text'].apply(preprocess),
    vocab=e2_vocab,
    labels=enron2_test['label'],
)
enron4_bow_test, enron4_bernoulli_test = generate_df(
    processed_texts=enron4_test['text'].apply(preprocess),
    vocab=e4_vocab,
    labels=enron4_test['label'],
)

In [11]:
# Every feature frame paired with its output file: training files first,
# then test files, written without the index column.
_csv_exports = [
    (enron1_bow_train, 'enron1_bow_train.csv'),
    (enron1_bernoulli_train, 'enron1_bernoulli_train.csv'),
    (enron2_bow_train, 'enron2_bow_train.csv'),
    (enron2_bernoulli_train, 'enron2_bernoulli_train.csv'),
    (enron4_bow_train, 'enron4_bow_train.csv'),
    (enron4_bernoulli_train, 'enron4_bernoulli_train.csv'),
    (enron1_bow_test, 'enron1_bow_test.csv'),
    (enron1_bernoulli_test, 'enron1_bernoulli_test.csv'),
    (enron2_bow_test, 'enron2_bow_test.csv'),
    (enron2_bernoulli_test, 'enron2_bernoulli_test.csv'),
    (enron4_bow_test, 'enron4_bow_test.csv'),
    (enron4_bernoulli_test, 'enron4_bernoulli_test.csv'),
]
for _frame, _csv_name in _csv_exports:
    _frame.to_csv(_csv_name, index=False)

Summary Statistics for Each Dataset:

BOW:

In [15]:
# Size, class balance and vocabulary size for every bag-of-words frame.
_summary_frames = [
    ("enron1_train", enron1_bow_train),
    ("enron1_test", enron1_bow_test),
    ("enron2_train", enron2_bow_train),
    ("enron2_test", enron2_bow_test),
    ("enron4_train", enron4_bow_train),
    ("enron4_test", enron4_bow_test),
]
for _position, (_title, _frame) in enumerate(_summary_frames):
    if _position > 0:
        # Separator between consecutive summaries
        print("\n")
    print(f"{_title} Summary Statistics:")
    print("Number of emails:", _frame.shape[0])
    print("Number of spam emails:", len(_frame[_frame['label'] == 'spam']))
    print("Number of ham emails:", len(_frame[_frame['label'] == 'ham']))
    # Every column except 'label' is one vocabulary word
    print("Number of unique vocabulary words:", _frame.shape[1] - 1)



enron1_train Summary Statistics:
Number of emails: 450
Number of spam emails: 131
Number of ham emails: 319
Number of unique vocabulary words: 8238


enron1_test Summary Statistics:
Number of emails: 456
Number of spam emails: 149
Number of ham emails: 307
Number of unique vocabulary words: 8238


enron2_train Summary Statistics:
Number of emails: 463
Number of spam emails: 123
Number of ham emails: 340
Number of unique vocabulary words: 8818


enron2_test Summary Statistics:
Number of emails: 478
Number of spam emails: 130
Number of ham emails: 348
Number of unique vocabulary words: 8818


enron4_train Summary Statistics:
Number of emails: 535
Number of spam emails: 402
Number of ham emails: 133
Number of unique vocabulary words: 16322


enron4_test Summary Statistics:
Number of emails: 543
Number of spam emails: 391
Number of ham emails: 152
Number of unique vocabulary words: 16322


Modeling...

Naive Bayes Multinomial (BOW)

In [16]:
def mnb(vocab, df_original, df):
  """Fit a multinomial Naive Bayes model with add-one smoothing.

  Args:
      vocab: vocabulary; only its length is used here, and it is returned as-is.
      df_original: frame whose row count is the denominator of the class priors.
      df: word-count frame with one column per word plus a 'label' column.

  Returns:
      tuple: (spam prior, ham prior, spam table, ham table, vocab). Each table
      is indexed by word with columns 'count' (total occurrences in the class)
      and 'conditional' ((count + 1) / (class word total + len(vocab))).
  """
  vocab_size = len(vocab)
  n_docs = len(df_original)

  def class_table(rows):
    # Per-word totals for one class and their smoothed conditional probabilities
    counts = rows.drop(columns=['label']).sum()
    smoothed = (counts + 1) / (counts.sum() + vocab_size)
    return pd.DataFrame({'count': counts, 'conditional': smoothed})

  spam_rows = df[df['label'] == 'spam']
  ham_rows = df[df['label'] == 'ham']

  return (
      len(spam_rows) / n_docs,
      len(ham_rows) / n_docs,
      class_table(spam_rows),
      class_table(ham_rows),
      vocab,
  )


In [17]:
def predict_mnb (prior_spam, prior_ham, text_spam, text_ham, V, tokens):
  """Classify one tokenized email with a fitted multinomial Naive Bayes model.

  Args:
      prior_spam, prior_ham: class prior probabilities.
      text_spam, text_ham: per-class tables indexed by word, each with a
          'conditional' column.
      V: the vocabulary; tokens outside it are ignored.
      tokens: the email's tokens; repeated tokens contribute once per occurrence.

  Returns:
      'spam' or 'ham'; a tie between the two log scores resolves to 'spam'.
  """
  import math
  # V arrives as a list, where `token in V` is a linear scan repeated for every
  # token. Build a set once per call so each membership test is a hash lookup.
  vocab_lookup = set(V)
  spam_cond = text_spam["conditional"]
  ham_cond = text_ham["conditional"]

  # Work in log space so that products of many small probabilities do not underflow
  spam_likelihood = math.log(prior_spam)
  ham_likelihood = math.log(prior_ham)
  for token in tokens:
      if token in vocab_lookup:
          spam_likelihood += math.log(spam_cond.at[token])
          ham_likelihood += math.log(ham_cond.at[token])

  return 'spam' if spam_likelihood >= ham_likelihood else 'ham'

In [28]:
def evaluate(df):
  """Print accuracy, precision, recall and F1 with 'spam' as the positive class.

  Args:
      df: frame with a 'label' column (truth) and a 'pred_label' column
          (prediction), both holding 'spam' / 'ham'.

  A metric whose denominator is zero (no emails at all, no email predicted as
  spam, no actual spam, or precision + recall == 0) is reported as 0.0 instead
  of printing nan or raising ZeroDivisionError.
  """
  actual = df['label']
  predicted = df['pred_label']
  total = len(df)

  # Confusion-matrix cells, converted to plain ints so a zero denominator is
  # caught by the guard below rather than silently producing nan.
  spam_correct = int(((actual == 'spam') & (predicted == 'spam')).sum())
  spam_incorrect = int(((actual == 'ham') & (predicted == 'spam')).sum())
  ham_incorrect = int(((actual == 'spam') & (predicted == 'ham')).sum())
  ham_correct = int(((actual == 'ham') & (predicted == 'ham')).sum())

  def _ratio(numerator, denominator):
    # Undefined ratios are reported as 0.0
    return numerator / denominator if denominator else 0.0

  accuracy = _ratio(spam_correct + ham_correct, total)
  precision = _ratio(spam_correct, spam_correct + spam_incorrect)
  recall = _ratio(spam_correct, spam_correct + ham_incorrect)
  f1_score = _ratio(2 * precision * recall, precision + recall)

  print(f"Accuracy: {accuracy:.4f}")
  print(f"Precision: {precision: .4f}")
  print(f"Recall: {recall: .4f}")
  print(f"F1 Score: {f1_score: .4f}")

Enron1 MNB

In [30]:
# Fit multinomial NB on the Enron 1 training counts, then label every test email.
ps1, ph1, ts1, th1, v1 = mnb(vocab=e1_vocab, df_original=enron1, df=enron1_bow_train)
enron1_bow_test_mnb = enron1_bow_test.assign(
    pred_label=enron1_test['text'].apply(preprocess).apply(
        lambda tokens: predict_mnb(ps1, ph1, ts1, th1, v1, tokens)
    )
)
print ("Enron 1 MNB Evaluation Metrics:")
evaluate(enron1_bow_test_mnb)

Enron 1 MNB Evaluation Metrics:
Accuracy: 0.9298
Precision:  0.9333
Recall:  0.8456
F1 Score:  0.8873


Enron2 MNB



In [32]:
# Fit multinomial NB on the Enron 2 training counts, then label every test email.
ps2, ph2, ts2, th2, v2 = mnb(vocab=e2_vocab, df_original=enron2, df=enron2_bow_train)
enron2_bow_test_mnb = enron2_bow_test.assign(
    pred_label=enron2_test['text'].apply(preprocess).apply(
        lambda tokens: predict_mnb(ps2, ph2, ts2, th2, v2, tokens)
    )
)
print ("Enron 2 MNB Evaluation Metrics:")
evaluate(enron2_bow_test_mnb)

Enron 2 MNB Evaluation Metrics:
Accuracy: 0.9435
Precision:  0.9256
Recall:  0.8615
F1 Score:  0.8924


Enron4 MNB



In [33]:
# Fit multinomial NB on the Enron 4 training counts, then label every test email.
ps4, ph4, ts4, th4, v4 = mnb(vocab=e4_vocab, df_original=enron4, df=enron4_bow_train)
enron4_bow_test_mnb = enron4_bow_test.assign(
    pred_label=enron4_test['text'].apply(preprocess).apply(
        lambda tokens: predict_mnb(ps4, ph4, ts4, th4, v4, tokens)
    )
)
print ("Enron 4 MNB Evaluation Metrics:")
evaluate(enron4_bow_test_mnb)

Enron 4 MNB Evaluation Metrics:
Accuracy: 0.9540
Precision:  0.9598
Recall:  0.9770
F1 Score:  0.9683


Enron1 Bernoulli Naive Bayes:

In [34]:
def bnb(vocab, df_original, df):
  """Fit a Bernoulli Naive Bayes model with Laplace smoothing.

  Args:
      vocab: vocabulary, returned unchanged.
      df_original: accepted for symmetry with ``mnb``; not used here.
      df: 0/1 presence frame with one column per word plus a 'label' column.

  Returns:
      tuple: (spam prior, ham prior, spam table, ham table, vocab). Each table
      is indexed by word with columns 'count' (documents of the class that
      contain the word) and 'conditional' ((count + 1) / (class documents + 2)).
  """
  def class_table(rows):
    # Document frequency per word for one class, smoothed over two outcomes
    doc_freq = rows.drop(columns=['label']).sum()
    smoothed = (doc_freq + 1) / (len(rows) + 2)
    return pd.DataFrame({'count': doc_freq, 'conditional': smoothed})

  spam_rows = df[df['label'] == 'spam']
  ham_rows = df[df['label'] == 'ham']

  # Priors are computed over the labelled rows of `df` itself
  n_spam = spam_rows.shape[0]
  n_ham = ham_rows.shape[0]
  n_labelled = n_spam + n_ham

  return (
      n_spam / n_labelled,
      n_ham / n_labelled,
      class_table(spam_rows),
      class_table(ham_rows),
      vocab,
  )


In [35]:
def predict_bnb(prior_spamber, prior_hamber, textber_spam, textber_ham, V, tokens):
  """Classify one tokenized email with a fitted Bernoulli Naive Bayes model.

  Every vocabulary word contributes to the score: log(p) when the word occurs
  in the email and log(1 - p) when it does not.

  Args:
      prior_spamber, prior_hamber: class prior probabilities.
      textber_spam, textber_ham: per-class tables indexed by word, each with a
          'conditional' column.
      V: the vocabulary to iterate over.
      tokens: the email's tokens; only presence matters, not repetition.

  Returns:
      'spam' or 'ham'; a tie between the two log scores resolves to 'spam'.
  """
  import math
  # tokens arrives as a list, where `token in tokens` is a linear scan repeated
  # for every vocabulary word. A set makes each presence test a hash lookup.
  present = set(tokens)
  spam_cond = textber_spam["conditional"]
  ham_cond = textber_ham["conditional"]

  spam_likelihood = math.log(prior_spamber)
  ham_likelihood = math.log(prior_hamber)

  for token in V:
      p_spam = spam_cond.at[token]
      p_ham = ham_cond.at[token]
      if token in present:
        spam_likelihood += math.log(p_spam)
        ham_likelihood += math.log(p_ham)
      else:
        spam_likelihood += math.log(1 - p_spam)
        ham_likelihood += math.log(1 - p_ham)

  return 'spam' if spam_likelihood >= ham_likelihood else 'ham'

In [37]:
# Fit Bernoulli NB on the Enron 1 presence features, then label every test email.
ps11, ph11, ts11, th11, v11 = bnb(vocab=e1_vocab, df_original=enron1, df=enron1_bernoulli_train)
enron1_bernoulli_test_bnb = enron1_bernoulli_test.assign(
    pred_label=enron1_test['text'].apply(preprocess).apply(
        lambda tokens: predict_bnb(ps11, ph11, ts11, th11, v11, tokens)
    )
)
print("Enron 1 Bernoulli NB Evaluation Metrics:")
evaluate(enron1_bernoulli_test_bnb)

Enron 1 Bernoulli NB Evaluation Metrics:
Accuracy: 0.7215
Precision:  0.8438
Recall:  0.1812
F1 Score:  0.2983


In [38]:
# Fit Bernoulli NB on the Enron 2 presence features, then label every test email.
ps22, ph22, ts22, th22, v22 = bnb(vocab=e2_vocab, df_original=enron2, df=enron2_bernoulli_train)
enron2_bernoulli_test_bnb = enron2_bernoulli_test.assign(
    pred_label=enron2_test['text'].apply(preprocess).apply(
        lambda tokens: predict_bnb(ps22, ph22, ts22, th22, v22, tokens)
    )
)
print("Enron 2 Bernoulli NB Evaluation Metrics:")
evaluate(enron2_bernoulli_test_bnb)

Enron 2 Bernoulli NB Evaluation Metrics:
Accuracy: 0.7762
Precision:  0.8966
Recall:  0.2000
F1 Score:  0.3270


In [39]:
# Fit Bernoulli NB on the Enron 4 presence features, then label every test email.
ps44, ph44, ts44, th44, v44 = bnb(vocab=e4_vocab, df_original=enron4, df=enron4_bernoulli_train)
enron4_bernoulli_test_bnb = enron4_bernoulli_test.assign(
    pred_label=enron4_test['text'].apply(preprocess).apply(
        lambda tokens: predict_bnb(ps44, ph44, ts44, th44, v44, tokens)
    )
)
print("Enron 4 Bernoulli NB Evaluation Metrics:")
evaluate(enron4_bernoulli_test_bnb)

Enron 4 Bernoulli NB Evaluation Metrics:
Accuracy: 0.9098
Precision:  0.8886
Recall:  1.0000
F1 Score:  0.9410


Logistic Regression...

In [40]:
def log_regression(X, lr, T, ld):
    """Train L2-regularised logistic regression by batch gradient ascent.

    Args:
        X: 2-D array whose last column is the label ('ham' maps to 0, any
           other value to 1) and whose other columns are numeric features.
        lr: learning rate applied to every update.
        T: number of full-batch updates.
        ld: L2 penalty strength; the bias weight is not penalised.

    Returns:
        numpy.ndarray: weight vector of length (features + 1), bias first.
        Weights start at 1.
    """
    # Split the label column from the features and encode it as 0.0 / 1.0
    raw_labels = X[:, -1].copy()
    targets = np.where(raw_labels == 'ham', 0, 1).astype(float)

    features = X[:, :-1].astype(float)
    n_rows, n_features = features.shape

    # Leading column of ones carries the bias term
    design = np.hstack((np.ones((n_rows, 1)), features))

    W = np.ones(n_features + 1)

    for _ in range(T):
        # Logits are clipped so that exp() cannot overflow
        logits = np.clip(design @ W, -500, 500)
        probabilities = 1 / (1 + np.exp(-logits))

        # Log-likelihood gradient minus the L2 term (bias excluded)
        gradient = design.T @ (targets - probabilities)
        gradient[1:] -= ld * W[1:]

        W += lr * gradient

    return W


In [41]:
def accuracy(df):
  """Return the fraction of rows whose 'pred_label' equals 'label'.

  Only 'spam'/'spam' and 'ham'/'ham' pairs count as correct.
  """
  actual, predicted = df['label'], df['pred_label']
  correct = 0
  for cls in ('spam', 'ham'):
    correct = correct + sum((actual == cls) & (predicted == cls))
  return correct / len(df)


In [42]:
def tuning(df):
    """
    Grid-search the logistic-regression hyper-parameters on a validation split.

    df: pandas DataFrame with the feature columns followed by a 'label' column
        as its last column.

    The frame is split 70/30 into training and validation rows with a fixed
    random_state. For every (iterations, lambda, learning rate) combination in
    the grids defined below, a model is trained with log_regression on the
    training rows and scored by accuracy on the validation rows, using a 0.5
    probability threshold. One line is printed per combination, followed by
    the best setting.

    Returns the (learning_rate, iterations, lambda) tuple with the highest
    validation accuracy; on ties the combination tried first wins.
    """
    from sklearn.model_selection import train_test_split

    # Search grids: only lambda has more than one candidate value
    T = [1000]
    ld = [0, 2, 3, 4, 5, 6, 8, 10, 12, 14, 16, 18, 20]
    lr_list = [0.001]

    # Convert DataFrame to numpy array before splitting
    df_np = df.to_numpy()
    X_train, X_val = train_test_split(df_np, test_size=0.3, random_state=11)

    # The validation frame is the same for every candidate, so it is built
    # once. It is never mutated: predict() treats the LAST column as the label,
    # so predictions go into a derived frame instead.
    val_base = pd.DataFrame(X_val, columns=df.columns)
    results = {}

    for t in T:
        for l in ld:
          for lr in lr_list:
            W = log_regression(X_train.copy(), lr, t, l)
            probabilities = predict(val_base, W)
            val = val_base.assign(pred_label=np.where(probabilities > 0.5, 'spam', 'ham'))
            val_acc = accuracy(val)
            results[(lr, t, l)] = val_acc
            print(f"t: {t}  ld: {l}  lr: {lr:.4f} validation accuracy: {val_acc:.4f}")

    # max() returns the first key with the top score
    best_params = max(results, key=results.get)
    print("Best iterations:" + str(best_params[1]))
    print("Best learning rate:" + str(best_params[0]))
    print("Best lambda:" + str(best_params[2]))
    return best_params

Enron1 BOW


In [43]:
def predict(X, W):
    """Return the predicted spam probability for every row of ``X``.

    Args:
        X: DataFrame whose last column is dropped (the label in this notebook)
           and whose remaining columns are numeric features.
        W: weight vector with the bias weight first.

    Returns:
        numpy.ndarray: sigmoid of the clipped linear scores, one per row.
    """
    features = X.to_numpy()[:, :-1].astype(float)
    bias_column = np.ones((features.shape[0], 1))
    design = np.hstack((bias_column, features))
    # Same clipping as in training, so exp() cannot overflow
    logits = np.clip(design @ W, -500, 500)
    return 1 / (1 + np.exp(-logits))

In [46]:
# Pick the logistic-regression settings for the Enron 1 bag-of-words features.
print("Enron 1 Bag of Words Parameter Tuning:")
e1_bow_params = tuning(df=enron1_bow_train)

Enron 1 Bag of Words Parameter Tuning:
t: 1000  ld: 0  lr: 0.0010 validation accuracy: 0.8815
t: 1000  ld: 2  lr: 0.0010 validation accuracy: 0.9704
t: 1000  ld: 3  lr: 0.0010 validation accuracy: 0.9630
t: 1000  ld: 4  lr: 0.0010 validation accuracy: 0.9704
t: 1000  ld: 5  lr: 0.0010 validation accuracy: 0.9556
t: 1000  ld: 6  lr: 0.0010 validation accuracy: 0.9778
t: 1000  ld: 8  lr: 0.0010 validation accuracy: 0.9704
t: 1000  ld: 10  lr: 0.0010 validation accuracy: 0.9630
t: 1000  ld: 12  lr: 0.0010 validation accuracy: 0.9556
t: 1000  ld: 14  lr: 0.0010 validation accuracy: 0.9407
t: 1000  ld: 16  lr: 0.0010 validation accuracy: 0.9481
t: 1000  ld: 18  lr: 0.0010 validation accuracy: 0.9481
t: 1000  ld: 20  lr: 0.0010 validation accuracy: 0.9407
Best iterations:1000
Best learning rate:0.001
Best lambda:6


In [47]:
# Refit on the full training set; the tuple unpacks as (lr, T, ld).
W_e1_bow = log_regression(enron1_bow_train.to_numpy(), *e1_bow_params)

In [48]:
# Label a test email 'spam' when its predicted probability exceeds 0.5.
enron1_bow_test_lr = enron1_bow_test.assign(
    pred_label=np.where(predict(enron1_bow_test, W_e1_bow) > 0.5, 'spam', 'ham')
)
print("Enron 1 BOW Logistic Regression Evaluation Metrics:")
evaluate(enron1_bow_test_lr)

Enron 1 BOW Logistic Regression Evaluation Metrics:
Accuracy: 0.9320
Precision:  0.9155
Recall:  0.8725
F1 Score:  0.8935


In [64]:
# Pick the logistic-regression settings for the Enron 1 Bernoulli features.
print("Enron 1 Bernoulli Parameter Tuning")
e1_bernoulli_params = tuning(df=enron1_bernoulli_train)


Enron 1 Bernoulli Parameter Tuning
t: 1000  ld: 0  lr: 0.0010 validation accuracy: 0.9185
t: 1000  ld: 2  lr: 0.0010 validation accuracy: 0.9407
t: 1000  ld: 3  lr: 0.0010 validation accuracy: 0.9481
t: 1000  ld: 4  lr: 0.0010 validation accuracy: 0.9333
t: 1000  ld: 5  lr: 0.0010 validation accuracy: 0.9259
t: 1000  ld: 6  lr: 0.0010 validation accuracy: 0.9259
t: 1000  ld: 8  lr: 0.0010 validation accuracy: 0.9111
t: 1000  ld: 10  lr: 0.0010 validation accuracy: 0.9111
t: 1000  ld: 12  lr: 0.0010 validation accuracy: 0.9037
t: 1000  ld: 14  lr: 0.0010 validation accuracy: 0.8889
t: 1000  ld: 16  lr: 0.0010 validation accuracy: 0.8889
t: 1000  ld: 18  lr: 0.0010 validation accuracy: 0.8815
t: 1000  ld: 20  lr: 0.0010 validation accuracy: 0.8815
Best iterations:1000
Best learning rate:0.001
Best lambda:3


In [50]:
# Refit on the full training set; the tuple unpacks as (lr, T, ld).
W_e1_bernoulli = log_regression(enron1_bernoulli_train.to_numpy(), *e1_bernoulli_params)

In [51]:
# Label a test email 'spam' when its predicted probability exceeds 0.5.
enron1_bernoulli_test_lr = enron1_bernoulli_test.assign(
    pred_label=np.where(predict(enron1_bernoulli_test, W_e1_bernoulli) > 0.5, 'spam', 'ham')
)
print("Enron 1 Bernoulli Logistic Regression Evaluation Metrics:")
evaluate(enron1_bernoulli_test_lr)

Enron 1 Bernoulli Logistic Regression Evaluation Metrics:
Accuracy: 0.8947
Precision:  0.9469
Recall:  0.7181
F1 Score:  0.8168


In [52]:
# Pick the logistic-regression settings for the Enron 2 bag-of-words features.
print("Enron 2 Bag of Words Parameter Tuning:")
e2_bow_params = tuning(df=enron2_bow_train)

Enron 2 Bag of Words Parameter Tuning:
t: 1000  ld: 0  lr: 0.0010 validation accuracy: 0.8058
t: 1000  ld: 2  lr: 0.0010 validation accuracy: 0.8705
t: 1000  ld: 3  lr: 0.0010 validation accuracy: 0.8921
t: 1000  ld: 4  lr: 0.0010 validation accuracy: 0.8993
t: 1000  ld: 5  lr: 0.0010 validation accuracy: 0.9065
t: 1000  ld: 6  lr: 0.0010 validation accuracy: 0.9065
t: 1000  ld: 8  lr: 0.0010 validation accuracy: 0.9137
t: 1000  ld: 10  lr: 0.0010 validation accuracy: 0.9137
t: 1000  ld: 12  lr: 0.0010 validation accuracy: 0.9137
t: 1000  ld: 14  lr: 0.0010 validation accuracy: 0.9137
t: 1000  ld: 16  lr: 0.0010 validation accuracy: 0.9137
t: 1000  ld: 18  lr: 0.0010 validation accuracy: 0.9065
t: 1000  ld: 20  lr: 0.0010 validation accuracy: 0.9065
Best iterations:1000
Best learning rate:0.001
Best lambda:8


In [53]:
# Refit on the full training set; the tuple unpacks as (lr, T, ld).
W_e2_bow = log_regression(enron2_bow_train.to_numpy(), *e2_bow_params)

In [54]:
# Label a test email 'spam' when its predicted probability exceeds 0.5.
enron2_bow_test_lr = enron2_bow_test.assign(
    pred_label=np.where(predict(enron2_bow_test, W_e2_bow) > 0.5, 'spam', 'ham')
)
print("Enron 2 BOW Logistic Regression Evaluation Metrics:")
evaluate(enron2_bow_test_lr)

Enron 2 BOW Logistic Regression Evaluation Metrics:
Accuracy: 0.8996
Precision:  0.9184
Recall:  0.6923
F1 Score:  0.7895


In [55]:
# Pick the logistic-regression settings for the Enron 2 Bernoulli features.
print("Enron 2 Bernoulli Parameter Tuning:")
e2_bernoulli_params = tuning(df=enron2_bernoulli_train)

Enron 2 Bernoulli Parameter Tuning:
t: 1000  ld: 0  lr: 0.0010 validation accuracy: 0.8705
t: 1000  ld: 2  lr: 0.0010 validation accuracy: 0.8705
t: 1000  ld: 3  lr: 0.0010 validation accuracy: 0.8849
t: 1000  ld: 4  lr: 0.0010 validation accuracy: 0.8993
t: 1000  ld: 5  lr: 0.0010 validation accuracy: 0.9137
t: 1000  ld: 6  lr: 0.0010 validation accuracy: 0.9137
t: 1000  ld: 8  lr: 0.0010 validation accuracy: 0.9065
t: 1000  ld: 10  lr: 0.0010 validation accuracy: 0.9065
t: 1000  ld: 12  lr: 0.0010 validation accuracy: 0.9065
t: 1000  ld: 14  lr: 0.0010 validation accuracy: 0.8993
t: 1000  ld: 16  lr: 0.0010 validation accuracy: 0.8993
t: 1000  ld: 18  lr: 0.0010 validation accuracy: 0.8993
t: 1000  ld: 20  lr: 0.0010 validation accuracy: 0.8921
Best iterations:1000
Best learning rate:0.001
Best lambda:5


In [56]:
# Refit on the full training set; the tuple unpacks as (lr, T, ld).
W_e2_bernoulli = log_regression(enron2_bernoulli_train.to_numpy(), *e2_bernoulli_params)

In [57]:
# Label a test email 'spam' when its predicted probability exceeds 0.5.
enron2_bernoulli_test_lr = enron2_bernoulli_test.assign(
    pred_label=np.where(predict(enron2_bernoulli_test, W_e2_bernoulli) > 0.5, 'spam', 'ham')
)
print("Enron 2 Bernoulli Logistic Regression Evaluation Metrics:")
evaluate(enron2_bernoulli_test_lr)

Enron 2 Bernoulli Logistic Regression Evaluation Metrics:
Accuracy: 0.8808
Precision:  0.8842
Recall:  0.6462
F1 Score:  0.7467


In [58]:
# Pick the logistic-regression settings for the Enron 4 bag-of-words features.
print("Enron 4 Bag of Words Parameter Tuning:")
e4_bow_params = tuning(df=enron4_bow_train)


Enron 4 Bag of Words Parameter Tuning:
t: 1000  ld: 0  lr: 0.0010 validation accuracy: 0.9068
t: 1000  ld: 2  lr: 0.0010 validation accuracy: 0.9441
t: 1000  ld: 3  lr: 0.0010 validation accuracy: 0.9565
t: 1000  ld: 4  lr: 0.0010 validation accuracy: 0.9565
t: 1000  ld: 5  lr: 0.0010 validation accuracy: 0.9565
t: 1000  ld: 6  lr: 0.0010 validation accuracy: 0.9565
t: 1000  ld: 8  lr: 0.0010 validation accuracy: 0.9565
t: 1000  ld: 10  lr: 0.0010 validation accuracy: 0.9565
t: 1000  ld: 12  lr: 0.0010 validation accuracy: 0.9565
t: 1000  ld: 14  lr: 0.0010 validation accuracy: 0.9565
t: 1000  ld: 16  lr: 0.0010 validation accuracy: 0.9565
t: 1000  ld: 18  lr: 0.0010 validation accuracy: 0.9565
t: 1000  ld: 20  lr: 0.0010 validation accuracy: 0.9565
Best iterations:1000
Best learning rate:0.001
Best lambda:3


In [59]:
# Refit on the full training set; the tuple unpacks as (lr, T, ld).
W_e4_bow = log_regression(enron4_bow_train.to_numpy(), *e4_bow_params)

In [60]:
# Label a test email 'spam' when its predicted probability exceeds 0.5.
enron4_bow_test_lr = enron4_bow_test.assign(
    pred_label=np.where(predict(enron4_bow_test, W_e4_bow) > 0.5, 'spam', 'ham')
)
print("Enron 4 BOW Logistic Regression Evaluation Metrics:")
evaluate(enron4_bow_test_lr)

Enron 4 BOW Logistic Regression Evaluation Metrics:
Accuracy: 0.9521
Precision:  0.9376
Recall:  1.0000
F1 Score:  0.9678


In [61]:
# Pick the logistic-regression settings for the Enron 4 Bernoulli features.
print("Enron 4 Bernoulli Parameter Tuning:")
e4_bernoulli_params = tuning(df=enron4_bernoulli_train)

Enron 4 Bernoulli Parameter Tuning:
t: 1000  ld: 0  lr: 0.0010 validation accuracy: 0.9193
t: 1000  ld: 2  lr: 0.0010 validation accuracy: 0.9503
t: 1000  ld: 3  lr: 0.0010 validation accuracy: 0.9503
t: 1000  ld: 4  lr: 0.0010 validation accuracy: 0.9503
t: 1000  ld: 5  lr: 0.0010 validation accuracy: 0.9379
t: 1000  ld: 6  lr: 0.0010 validation accuracy: 0.9379
t: 1000  ld: 8  lr: 0.0010 validation accuracy: 0.9379
t: 1000  ld: 10  lr: 0.0010 validation accuracy: 0.9317
t: 1000  ld: 12  lr: 0.0010 validation accuracy: 0.9255
t: 1000  ld: 14  lr: 0.0010 validation accuracy: 0.9255
t: 1000  ld: 16  lr: 0.0010 validation accuracy: 0.9255
t: 1000  ld: 18  lr: 0.0010 validation accuracy: 0.9255
t: 1000  ld: 20  lr: 0.0010 validation accuracy: 0.9193
Best iterations:1000
Best learning rate:0.001
Best lambda:2


In [62]:
# Refit on the full training set; the tuple unpacks as (lr, T, ld).
W_e4_bernoulli = log_regression(enron4_bernoulli_train.to_numpy(), *e4_bernoulli_params)

In [63]:
# Label a test email 'spam' when its predicted probability exceeds 0.5.
enron4_bernoulli_test_lr = enron4_bernoulli_test.assign(
    pred_label=np.where(predict(enron4_bernoulli_test, W_e4_bernoulli) > 0.5, 'spam', 'ham')
)
print("Enron 4 Bernoulli Logistic Regression Evaluation Metrics:")
evaluate(enron4_bernoulli_test_lr)

Enron 4 Bernoulli Logistic Regression Evaluation Metrics:
Accuracy: 0.9558
Precision:  0.9465
Recall:  0.9949
F1 Score:  0.9701
