In [1]:
import torch, torch.nn as nn, pandas as pd, tiktoken
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

### Classifier

In [2]:
class Classifier(nn.Module):
    def __init__(self, vocab_size, num_layers, context_length, dimension, num_heads, dropout, max_len):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, dimension)
        self.positional_encoding = nn.Embedding(context_length, dimension)
        self.dropout = nn.Dropout(dropout)

        encoder = nn.TransformerEncoderLayer(dimension, num_heads, dimension*4, dropout, "gelu", batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder, num_layers=num_layers)

        self.normalization = nn.LayerNorm(dimension)

        self.output = nn.Linear(dimension, 1)
        self.max_len = max_len


    def forward(self, x):
        b, len = x.shape
        padding_mask = (x == 0)

        emb = self.embedding(x)
        pos = self.positional_encoding(torch.arange(len, device=x.device).unsqueeze(0).expand(b, len))
        x = emb + pos
        x = self.dropout(x)

        x = self.transformer(x, src_key_padding_mask=padding_mask)
        x = self.normalization(x)

        x = x.mean(dim=1)
        logits = self.output(x)
        return logits

### Data processing

##### Load data, labels

In [3]:
df = pd.read_csv("spam_Emails_data.csv")
df['label'] = df['label'].str.strip().str.lower().map({'ham': 0.0, 'spam': 1.0})
df = df[df['text'].astype(str).str.len() <= 600]
labels = df['label'].tolist()

##### Tokenization using tiktoken with GPT2's encoding

In [4]:
tokenizer = tiktoken.get_encoding("gpt2")
tokenized = [tokenizer.encode(string) for string in tqdm(df['text'].astype(str).tolist())]
tokenized = [torch.tensor(item) for item in tokenized]

100%|██████████| 77902/77902 [00:01<00:00, 41225.47it/s]


##### Collate function for padding

In [5]:
def collate_fn(batch):
    input_ids = [item[0] for item in batch]
    labels = [item[1] for item in batch]
    input_ids = nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=0)
    labels = torch.tensor(labels)
    return input_ids, labels

##### Custom Dataset and Dataloaders with 0.6/0.2/0.2 train/validate/test split

In [6]:
class TextDataset(Dataset):
    def __init__(self, tokenized_inputs, labels):
        self.input_ids = tokenized_inputs
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return (self.input_ids[idx], self.labels[idx])

dataset = TextDataset(tokenized, labels)
train_size = int(0.6 * len(dataset))
val_size = int(0.2 * len(dataset))
test_size = len(dataset) - train_size - val_size
train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, val_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=32, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, collate_fn=collate_fn)

### Evaluation function

In [7]:
@torch.no_grad()
def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0

    for input_ids, labels in tqdm(dataloader, desc="Evaluating"):
        input_ids = input_ids.to(device)
        labels = labels.to(device).unsqueeze(1)

        outputs = model(input_ids)
        loss = criterion(outputs, labels)

        total_loss += loss.item()
        preds = (torch.sigmoid(outputs) > 0.5).float()
        correct += (preds == labels).sum().item()
        total += labels.size(0)

    avg_loss = total_loss / len(dataloader)
    acc = correct / total
    return avg_loss, acc

### Training function

In [8]:
def train_model(model, train_loader, val_loader, optimizer, criterion, epochs, device):
    model.to(device)
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        total_correct = 0
        total_samples = 0

        print(f"\nEpoch {epoch + 1}/{epochs}")
        for batch in tqdm(train_loader, desc="Training"):
            inputs, labels = batch
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels.unsqueeze(1))
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            preds = (torch.sigmoid(outputs) > 0.5).float()
            total_correct += (preds == labels.unsqueeze(1)).sum().item()
            total_samples += labels.size(0)

        avg_train_loss = total_loss / len(train_loader)
        train_accuracy = total_correct / total_samples
        print(f"Train Loss: {avg_train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")

        val_loss, val_accuracy = evaluate(model, val_loader, criterion, device)
        print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

    return model 

In [None]:
model = Classifier(tokenizer.n_vocab, 2, 1024, 768, 4, 0.1, True)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
criterion = nn.BCEWithLogitsLoss()

trained_model = train_model(model, train_loader, val_loader, optimizer, criterion, 3, "cuda")


Epoch 1/3


Training: 100%|██████████| 1461/1461 [02:01<00:00, 11.99it/s]


Train Loss: 0.2081, Train Accuracy: 0.9118


  output = torch._nested_tensor_from_mask(
Evaluating: 100%|██████████| 487/487 [00:03<00:00, 124.67it/s]


Validation Loss: 0.2514, Validation Accuracy: 0.9387

Epoch 2/3


Training: 100%|██████████| 1461/1461 [02:03<00:00, 11.83it/s]


Train Loss: 0.0969, Train Accuracy: 0.9626


Evaluating: 100%|██████████| 487/487 [00:04<00:00, 121.48it/s]


Validation Loss: 0.2063, Validation Accuracy: 0.9594

Epoch 3/3


Training: 100%|██████████| 1461/1461 [02:01<00:00, 12.01it/s]


Train Loss: 0.0638, Train Accuracy: 0.9761


Evaluating: 100%|██████████| 487/487 [00:03<00:00, 128.75it/s]

Validation Loss: 0.2017, Validation Accuracy: 0.9596





In [10]:
test_loss, test_accuracy = evaluate(trained_model, test_loader, criterion, "cuda")
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

Evaluating: 100%|██████████| 487/487 [00:03<00:00, 128.37it/s]

Test Loss: 0.1955, Test Accuracy: 0.9635





In [30]:
model2 = Classifier(tokenizer.n_vocab, 2, 1024, 768, 4, 0.3, True)
optimizer2 = torch.optim.Adam(model2.parameters(), lr=0.0001)
criterion2 = nn.BCEWithLogitsLoss()

trained_model2 = train_model(model2, train_loader, val_loader, optimizer2, criterion2, 10, "cuda")


Epoch 1/10


Training: 100%|██████████| 1461/1461 [02:04<00:00, 11.77it/s]


Train Loss: 0.2376, Train Accuracy: 0.8985


Evaluating: 100%|██████████| 487/487 [00:03<00:00, 125.79it/s]


Validation Loss: 0.2604, Validation Accuracy: 0.9412

Epoch 2/10


Training: 100%|██████████| 1461/1461 [02:06<00:00, 11.56it/s]


Train Loss: 0.1322, Train Accuracy: 0.9474


Evaluating: 100%|██████████| 487/487 [00:04<00:00, 118.11it/s]


Validation Loss: 0.2175, Validation Accuracy: 0.9581

Epoch 3/10


Training: 100%|██████████| 1461/1461 [02:05<00:00, 11.60it/s]


Train Loss: 0.1031, Train Accuracy: 0.9585


Evaluating: 100%|██████████| 487/487 [00:03<00:00, 125.38it/s]


Validation Loss: 0.2194, Validation Accuracy: 0.9601

Epoch 4/10


Training: 100%|██████████| 1461/1461 [02:05<00:00, 11.65it/s]


Train Loss: 0.0848, Train Accuracy: 0.9668


Evaluating: 100%|██████████| 487/487 [00:03<00:00, 124.84it/s]


Validation Loss: 0.1988, Validation Accuracy: 0.9616

Epoch 5/10


Training: 100%|██████████| 1461/1461 [02:25<00:00, 10.01it/s]


Train Loss: 0.0734, Train Accuracy: 0.9716


Evaluating: 100%|██████████| 487/487 [00:04<00:00, 105.52it/s]


Validation Loss: 0.1998, Validation Accuracy: 0.9669

Epoch 6/10


Training: 100%|██████████| 1461/1461 [02:39<00:00,  9.14it/s]


Train Loss: 0.0654, Train Accuracy: 0.9746


Evaluating: 100%|██████████| 487/487 [00:04<00:00, 108.92it/s]


Validation Loss: 0.1946, Validation Accuracy: 0.9673

Epoch 7/10


Training: 100%|██████████| 1461/1461 [02:33<00:00,  9.51it/s]


Train Loss: 0.0549, Train Accuracy: 0.9788


Evaluating: 100%|██████████| 487/487 [00:04<00:00, 99.81it/s] 


Validation Loss: 0.1963, Validation Accuracy: 0.9669

Epoch 8/10


Training: 100%|██████████| 1461/1461 [02:34<00:00,  9.44it/s]


Train Loss: 0.0495, Train Accuracy: 0.9805


Evaluating: 100%|██████████| 487/487 [00:04<00:00, 99.13it/s] 


Validation Loss: 0.1811, Validation Accuracy: 0.9722

Epoch 9/10


Training: 100%|██████████| 1461/1461 [02:26<00:00,  9.95it/s]


Train Loss: 0.0450, Train Accuracy: 0.9827


Evaluating: 100%|██████████| 487/487 [00:03<00:00, 126.82it/s]


Validation Loss: 0.1902, Validation Accuracy: 0.9666

Epoch 10/10


Training: 100%|██████████| 1461/1461 [02:20<00:00, 10.43it/s]


Train Loss: 0.0404, Train Accuracy: 0.9848


Evaluating: 100%|██████████| 487/487 [00:04<00:00, 111.78it/s]

Validation Loss: 0.1759, Validation Accuracy: 0.9721





In [31]:
test_loss2, test_accuracy2 = evaluate(trained_model2, test_loader, criterion2, "cuda")
print(f"Test Loss: {test_loss2:.4f}, Test Accuracy: {test_accuracy2:.4f}")

Evaluating: 100%|██████████| 487/487 [00:04<00:00, 110.76it/s]

Test Loss: 0.1721, Test Accuracy: 0.9747



