In [2]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report, accuracy_score
from transformers import BertTokenizer, DistilBertTokenizer, DistilBertModel, Trainer, TrainingArguments, AutoModelForSequenceClassification, DistilBertForSequenceClassification
from sklearn.model_selection import KFold
from torch.utils.data import Subset
import numpy as np

In [3]:
# load base and distil tokenizers
tokenizer_base = BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer_distil = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [4]:
# select enron1.zip file from local files
from google.colab import files
uploaded = files.upload()

Saving enron1.zip to enron1.zip


In [5]:
!unzip enron1.zip -d /content/enron/

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: /content/enron/enron/enron4/spam/2165.2004-08-26.GP.spam.txt  
  inflating: /content/enron/__MACOSX/enron/enron4/spam/._2165.2004-08-26.GP.spam.txt  
  inflating: /content/enron/enron/enron4/spam/3703.2004-12-27.GP.spam.txt  
  inflating: /content/enron/__MACOSX/enron/enron4/spam/._3703.2004-12-27.GP.spam.txt  
  inflating: /content/enron/enron/enron4/spam/5780.2005-08-04.GP.spam.txt  
  inflating: /content/enron/__MACOSX/enron/enron4/spam/._5780.2005-08-04.GP.spam.txt  
  inflating: /content/enron/enron/enron4/spam/4083.2005-01-30.GP.spam.txt  
  inflating: /content/enron/__MACOSX/enron/enron4/spam/._4083.2005-01-30.GP.spam.txt  
  inflating: /content/enron/enron/enron4/spam/1646.2004-07-06.GP.spam.txt  
  inflating: /content/enron/__MACOSX/enron/enron4/spam/._1646.2004-07-06.GP.spam.txt  
  inflating: /content/enron/enron/enron4/spam/4578.2005-03-08.GP.spam.txt  
  inflating: /content/enron/__MACOSX/enron/e

In [6]:
enron_path = "/content/enron/enron"

In [7]:
# obtain data from enron zip file
def extract_data(path):
    emails = []
    labels = []
    for i in range(1, 7):
        enron_path = os.path.join(path, f"enron{i}")
        spam_path = os.path.join(enron_path,"spam")
        ham_path = os.path.join(enron_path,"ham")
        print(spam_path)
        for filename in os.listdir(spam_path):
            if filename.endswith('.txt'):
                file_path = os.path.join(spam_path, filename)

                with open(file_path, 'r', encoding='latin-1') as f:
                    txt = f.read()
                    emails.append(txt)
                    labels.append('spam')

        for filename in os.listdir(ham_path):
            if filename.endswith('.txt'):
                file_path = os.path.join(ham_path, filename)

                with open(file_path, 'r', encoding='latin-1') as f:
                    txt = f.read()
                    emails.append(txt)
                    labels.append('ham')

    return emails, labels

In [8]:
emails, spam_labels = extract_data(enron_path)
print(len(emails))
print(len(spam_labels))

/content/enron/enron/enron1/spam
/content/enron/enron/enron2/spam
/content/enron/enron/enron3/spam
/content/enron/enron/enron4/spam
/content/enron/enron/enron5/spam
/content/enron/enron/enron6/spam
33715
33715


In [9]:
# convert spam labels to 1s and 0s
spam_labels = [1 if label == "spam" else 0 for label in spam_labels]

In [10]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [11]:
# preprocess the emails
def pre_process(text):

    stopwords = nltk.corpus.stopwords.words('english')

    lemmatizer = WordNetLemmatizer()

    text = text.lower()

    text = re.sub(r'[^a-zA-Z\s]', '', text)

    text = word_tokenize(text)

    final_text = [lemmatizer.lemmatize(word) for word in text if word not in stopwords]

    return final_text


In [12]:
preprocessed_emails = [pre_process(email) for email in emails]

In [13]:
enron_df = pd.DataFrame({"Emails" : preprocessed_emails})

In [14]:
enron_df['Labels'] = spam_labels

In [15]:
enron_df.head(10)

Unnamed: 0,Emails,Labels
0,"[subject, special, promotion, get, la, vega, v...",1
1,"[subject, xa, nax, alprazolam, cheap, message,...",1
2,"[subject, going, hey, marjorie, finally, found...",1
3,"[subject, delivery, failure, ao, coa, e, c, ao...",1
4,"[subject, injection]",1
5,"[subject, rubbed, hand, find, partner, hot, pi...",1
6,"[subject, get, back, please, company, timely, ...",1
7,"[subject, penls, enlarg, ment, pllls]",1
8,"[subject, solid, new, home, hour, hows, going,...",1
9,"[subject, home, loan, refinancing, low, rate]",1


In [16]:
# split train and test data
train_txt, test_txt, train_label, test_label = train_test_split(emails, spam_labels,
                                                                test_size=0.2, random_state=42)


In [17]:
# get the length of the train dataset
len(train_txt)

26972

In [18]:
# train and test encodings for based
train_encodings_base = tokenizer_base(
    train_txt,
    truncation=True,
    padding=True,
    max_length=128
)

test_encodings_base = tokenizer_base(
    test_txt,
    truncation=True,
    padding=True,
    max_length=128
)

In [19]:
# train and test encodings for distil
train_encodings_distil = tokenizer_distil(
    train_txt,
    truncation=True,
    padding=True,
    max_length=128
)

test_encodings_distil = tokenizer_distil(
    test_txt,
    truncation=True,
    padding=True,
    max_length=128
)

In [20]:
import torch
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        # store tokenized inputs (input_ids, attention_mask)
        self.encodings = encodings
        # Store labels
        self.labels = labels

    def __getitem__(self, idx):
        # tetch tokenized data for this index
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
      ``# total number of samples
        return len(self.labels)


In [21]:
# convert into two types of datasets for disitl and based
train_dataset_distil = TextDataset(train_encodings_distil, train_label)
test_dataset_distil = TextDataset(test_encodings_distil, test_label)
train_dataset_base = TextDataset(train_encodings_base, train_label)
test_dataset_base = TextDataset(test_encodings_base, test_label)


train_loader_base = DataLoader(
    train_dataset_base,
    batch_size=16,
    shuffle=True
)

test_loader_base = DataLoader(
    test_dataset_base,
    batch_size=16
)


train_loader_distil = DataLoader(
    train_dataset_distil,
    batch_size=16,
    shuffle=True
)

test_loader_distil = DataLoader(
    test_dataset_distil,
    batch_size=16
)

In [22]:
# initialize the classification model
from transformers import AutoModelForSequenceClassification, DistilBertForSequenceClassification

In [23]:
from torch.optim import AdamW

In [24]:
def train(model, num_labels, batch_size, lr, epoch, loader):
  # # move model to GPU if available
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model.to(device)

  # initialize the type of optimizer
  optimizer = AdamW(model.parameters(), lr=learning_rate)

  model.train()

  for epoch in range(epochs):
      total_loss = 0

      for batch in loader:
          # move batch to GPU
          input_ids = batch['input_ids'].to(device)
          attention_mask = batch['attention_mask'].to(device)
          labels = batch['labels'].to(device)

          # forward pass
          outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
          loss = outputs.loss

          # backward pass
          loss.backward()
          optimizer.step()
          optimizer.zero_grad()

          total_loss += loss.item()

      print(f"Epoch {epoch+1} | Avg Loss: {total_loss/len(loader):.2f}")


num_labels = 2
batch_size = 16
# standard for BERT fine-tuning
learning_rate = 2e-5
epochs = 3
base_model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
distil_model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
train(distil_model, num_labels, batch_size, learning_rate, epochs, train_loader_distil)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 | Avg Loss: 0.06
Epoch 2 | Avg Loss: 0.01
Epoch 3 | Avg Loss: 0.01


In [26]:
train(base_model, num_labels, batch_size, learning_rate, epochs, train_loader_base)

Epoch 1 | Avg Loss: 0.05
Epoch 2 | Avg Loss: 0.01
Epoch 3 | Avg Loss: 0.01


In [27]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate(model, val_loader):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    val_loss = 0
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()

            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            predictions.extend(preds)
            true_labels.extend(labels.cpu().numpy())

    # calculate metrics
    accuracy = accuracy_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions, average='binary')  # For multi-class, use 'weighted' or 'macro'
    recall = recall_score(true_labels, predictions, average='binary')
    f1 = f1_score(true_labels, predictions, average='binary')
    avg_loss = val_loss / len(val_loader)

    return avg_loss, accuracy, precision, recall, f1

# run evaluation and get all metrics
val_loss_distil, val_acc_distil, val_precision_distil, val_recall_distil, val_f1_distil = evaluate(distil_model, test_loader_distil)
val_loss_base, val_acc_base, val_precision_base, val_recall_base, val_f1_base = evaluate(base_model, test_loader_base)
print("BERT Based Model Performance Metrics")
print(f"Validation Loss: {val_loss_distil:.2f}")
print(f"Accuracy: {val_acc_distil:.2f}")
print(f"Precision: {val_precision_distil:.2f}")
print(f"Recall: {val_recall_distil:.2f}")
print(f"F1-Score: {val_f1_distil:.2f}")

print("-------------------------------")

print("BERT Distil Model Performance Metrics")
print(f"Validation Loss: {val_loss_base:.2f}")
print(f"Accuracy: {val_acc_base:.2f}")
print(f"Precision: {val_precision_base:.2f}")
print(f"Recall: {val_recall_base:.2f}")
print(f"F1-Score: {val_f1_base:.2f}")

BERT Based Model Performance Metrics
Validation Loss: 0.04
Accuracy: 0.99
Precision: 0.99
Recall: 0.99
F1-Score: 0.99
-------------------------------
BERT Distil Model Performance Metrics
Validation Loss: 0.02
Accuracy: 0.99
Precision: 1.00
Recall: 0.99
F1-Score: 0.99


In [31]:
# Perform k-fold cross-validation evaluation on a pre-trained BERT-like model.

def k_fold_bert(model, train_ds, test_ds, k=5, num_epochs=3, batch_size=8):
    kfold = KFold(n_splits=k, shuffle=True, random_state=42)
    fold_results = []

    for fold, (train_idx, val_idx) in enumerate(kfold.split(train_ds)):
        print(f"\nFold {fold + 1}/{k}")

        # split train dataset into train and validation subsets
        val_subset = Subset(train_ds, val_idx)

        # define training arguments
        training_args = TrainingArguments(
            output_dir=f"./results/fold_{fold}",
            per_device_eval_batch_size=batch_size,
        )

        # initialize Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            eval_dataset=val_subset
        )

        # evaluate on validation set
        eval_result = trainer.evaluate()
        fold_results.append(eval_result)

    # compute average performance across all folds
    avg_results = {key: np.mean([fold[key] for fold in fold_results]) for key in fold_results[0]}

    # final evaluation on the test dataset
    trainer = Trainer(
        model=model,
        args=training_args,
        eval_dataset=test_ds
    )
    final_test_results = trainer.evaluate()

    return {
        "avg_k_fold_results": avg_results,
        "final_test_results": final_test_results
    }

# call k_fold_bert with trained models
results1 = k_fold_bert(base_model, train_dataset_base, test_dataset_base, k=5)
results2 = k_fold_bert(distil_model, train_dataset_distil, test_dataset_distil, k=5)

print("BERT Base Results:", results1)
print("BERT Distil Results:", results2)


Fold 1/5


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkl13476[0m ([33mkl13476-northeastern-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin



Fold 2/5



Fold 3/5



Fold 4/5



Fold 5/5



Fold 1/5



Fold 2/5



Fold 3/5



Fold 4/5



Fold 5/5


BERT Base Results: {'avg_k_fold_results': {'eval_loss': np.float64(0.0015876126126386225), 'eval_model_preparation_time': np.float64(0.00648), 'eval_runtime': np.float64(40.16632), 'eval_samples_per_second': np.float64(134.34859999999998), 'eval_steps_per_second': np.float64(16.8112)}, 'final_test_results': {'eval_loss': 0.021525423973798752, 'eval_model_preparation_time': 0.0047, 'eval_runtime': 50.9228, 'eval_samples_per_second': 132.416, 'eval_steps_per_second': 16.554}}
BERT Distil Results: {'avg_k_fold_results': {'eval_loss': np.float64(6.531439830723685e-05), 'eval_model_preparation_time': np.float64(0.00234), 'eval_runtime': np.float64(20.43064), 'eval_samples_per_second': np.float64(264.0668), 'eval_steps_per_second': np.float64(33.04260000000001)}, 'final_test_results': {'eval_loss': 0.037168536335229874, 'eval_model_preparation_time': 0.0022, 'eval_runtime': 25.7844, 'eval_samples_per_second': 261.515, 'eval_steps_per_second': 32.694}}
