In [1]:
from string import digits

import torch
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import  classification_report
import pandas as pd
from tqdm import tqdm


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Run configuration shared by the cells below.
OUTPUT = "./spam_roberta.pt"  # file the model state_dict is saved to / loaded from
NUM_EPOCHS = 5                # passes over the training set when fine-tuning
PRE_TRAINED = True            # True: load OUTPUT and evaluate; False: fine-tune, then save

In [3]:
class EmailDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup.

    Args:
        texts: Indexable, sized collection of input texts.
        labels: Indexable collection of labels; ``labels[i]`` is paired with
            ``texts[i]`` (presumably integer class ids -- they are converted
            to ``torch.long``).
        tokenizer: Callable invoked as ``tokenizer(text, truncation=...,
            padding=..., max_length=..., return_tensors=...)``; its result
            must provide ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors plus label as a dict."""
        tokenizer_kwargs = dict(
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt',
        )
        encoded = self.tokenizer(self.texts[idx], **tokenizer_kwargs)

        # squeeze(0) removes the leading dimension of each tokenizer output so
        # a single sample is returned without it.
        sample = {
            key: encoded[key].squeeze(0)
            for key in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

In [4]:
# Read the labelled emails, discarding rows where either field is missing.
df = pd.read_csv("data/spam_or_not_spam.csv").dropna(subset=['email', 'label'])

# First hold out 20% of the rows for testing, then take 20% of what is left
# (16% of the total) for validation. Both splits use the same fixed seed.
X_train_temp, X_test, y_train_temp, y_test = train_test_split(
    df["email"],
    df["label"],
    test_size=0.2,
    random_state=1,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_temp,
    y_train_temp,
    test_size=0.2,
    random_state=1,
)

In [5]:
# One tokenizer instance is shared by all three splits.
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")

# The pandas Series are converted to plain lists, so EmailDataset indexes them
# by position 0..n-1 rather than by the index labels left over from the split.
train_dataset = EmailDataset(
    texts=list(X_train), labels=list(y_train), tokenizer=tokenizer
)
val_dataset = EmailDataset(
    texts=list(X_val), labels=list(y_val), tokenizer=tokenizer
)
test_dataset = EmailDataset(
    texts=list(X_test), labels=list(y_test), tokenizer=tokenizer
)

In [6]:
# Only the training batches are shuffled; validation and test keep dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=16, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)

In [7]:
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Sequence-classification model with two output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    "FacebookAI/roberta-base",
    num_labels=2,
)
model.to(device)

# The optimizer covers every model parameter.
optimizer = AdamW(model.parameters(), lr=2e-5)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
if not PRE_TRAINED:
    # Fine-tune for NUM_EPOCHS passes over the training set, printing the mean
    # training loss and a validation report after every epoch.
    for epoch in range(NUM_EPOCHS):
        model.train()
        total_loss = 0
        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Labels are passed to the model so the loss can be read from its output.
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            # Clear gradients left over from the previous step before backprop.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Mean of the per-batch losses for this epoch.
        print(f"Epoch {epoch+1} - Loss: {total_loss / len(train_loader):.4f}")

        # === Validation ===
        model.eval()
        val_preds, val_true = [], []
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                # Predicted class = index of the largest logit per row.
                preds = torch.argmax(outputs.logits, dim=1)

                val_preds.extend(preds.cpu().numpy())
                val_true.extend(labels.cpu().numpy())

        # Fixed: the original call was missing the comma between `val_preds`
        # and `digits`, which raised a TypeError after the first epoch.
        print(classification_report(val_true, val_preds, digits=4))

    # Persist only the weights once all epochs have finished.
    torch.save(model.state_dict(), OUTPUT)

In [11]:
if PRE_TRAINED:
    # map_location remaps tensors stored on another device (e.g. a checkpoint
    # saved on a GPU and opened on a CPU-only machine) onto the active `device`;
    # without it the load fails when the saving device is unavailable.
    # NOTE(review): torch.load unpickles the file -- only load checkpoints you trust.
    model.load_state_dict(torch.load(OUTPUT, map_location=device))
    model.eval()

    def _predict_split(loader):
        """Run the model over ``loader`` and return ``(predictions, true_labels)`` lists."""
        split_preds, split_true = [], []
        with torch.no_grad():
            for batch in loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                # Predicted class = index of the largest logit per row.
                preds = torch.argmax(outputs.logits, dim=1)

                split_preds.extend(preds.cpu().numpy())
                # Labels are only compared on the host, so they never need
                # to be moved to `device`.
                split_true.extend(batch['labels'].cpu().numpy())
        return split_preds, split_true

    # Validation split first, then the held-out test split.
    val_preds, val_true = _predict_split(val_loader)
    print(classification_report(val_true, val_preds, digits=4))

    test_preds, test_true = _predict_split(test_loader)
    print(classification_report(test_true, test_preds, digits=4))

              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000       400
           1     1.0000    1.0000    1.0000        80

    accuracy                         1.0000       480
   macro avg     1.0000    1.0000    1.0000       480
weighted avg     1.0000    1.0000    1.0000       480

              precision    recall  f1-score   support

           0     1.0000    0.9980    0.9990       492
           1     0.9908    1.0000    0.9954       108

    accuracy                         0.9983       600
   macro avg     0.9954    0.9990    0.9972       600
weighted avg     0.9983    0.9983    0.9983       600



In [10]:
if not PRE_TRAINED:
    # Evaluate the model currently in memory on the validation split and then
    # on the test split, printing one classification report for each.
    model.eval()

    val_preds, val_true = [], []
    with torch.no_grad():
        for val_batch in val_loader:
            val_logits = model(
                input_ids=val_batch['input_ids'].to(device),
                attention_mask=val_batch['attention_mask'].to(device),
            ).logits
            # Predicted class = index of the largest logit per row.
            val_preds.extend(val_logits.argmax(dim=1).cpu().numpy())
            val_true.extend(val_batch['labels'].cpu().numpy())
    print(classification_report(val_true, val_preds, digits=4))

    test_preds, test_true = [], []
    with torch.no_grad():
        for test_batch in test_loader:
            test_logits = model(
                input_ids=test_batch['input_ids'].to(device),
                attention_mask=test_batch['attention_mask'].to(device),
            ).logits
            test_preds.extend(test_logits.argmax(dim=1).cpu().numpy())
            test_true.extend(test_batch['labels'].cpu().numpy())
    print(classification_report(test_true, test_preds, digits=4))