## Ensuring Reproducibility

In [1]:
import torch
import random
import numpy as np

# Set a fixed seed value for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

## Loading dataset

In [2]:
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer
from datasets import DatasetDict, load_dataset, concatenate_datasets
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

# Load the dataset
agnews = load_dataset("ag_news")

In [3]:
dataset = concatenate_datasets([agnews["train"], agnews["test"]])

SAMPLE_SIZE = 15000

dataset = dataset.shuffle(seed=SEED).select([i for i in list(range(SAMPLE_SIZE))])

train_test = dataset.train_test_split(test_size=0.3, stratify_by_column="label")
eval_test = train_test["test"].train_test_split(test_size=0.5)

agnews = DatasetDict(
    {
        "train": train_test["train"],
        "eval": eval_test["train"],
        "test": eval_test["test"],
    }
)

agnews

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 10500
    })
    eval: Dataset({
        features: ['text', 'label'],
        num_rows: 2250
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2250
    })
})

## Loading dataset

In [4]:
# Load the pre-trained transformer model and tokenizer
model_name = "google/electra-base-discriminator"
tokenizer = AutoTokenizer.from_pretrained(model_name)
transformer_model = AutoModel.from_pretrained(model_name)

In [5]:
# Freeze the pre-trained model parameters
for param in transformer_model.parameters():
    param.requires_grad = False

## Preparing data

In [6]:
# Set up the data collator and dataloaders
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

max_length = 512
batch_size = 64

def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, padding=True, max_length=max_length)

train_dataset = agnews["train"].shuffle(seed=SEED)
eval_dataset = agnews["test"].shuffle(seed=SEED)
test_dataset = agnews["test"]

tokenized_train = train_dataset.map(preprocess_function, batched=True, remove_columns=["text"])
tokenized_eval = eval_dataset.map(preprocess_function, batched=True, remove_columns=["text"])
tokenized_test = test_dataset.map(preprocess_function, batched=True, remove_columns=["text"])

train_dataloader = DataLoader(tokenized_train, shuffle=True, batch_size=batch_size, collate_fn=data_collator)
eval_dataloader = DataLoader(tokenized_eval, batch_size=batch_size, collate_fn=data_collator)
test_dataloader = DataLoader(tokenized_test, batch_size=batch_size, collate_fn=data_collator)

Map:   0%|          | 0/10500 [00:00<?, ? examples/s]

Map:   0%|          | 0/2250 [00:00<?, ? examples/s]

Map:   0%|          | 0/2250 [00:00<?, ? examples/s]

In [7]:
# Move the model to the GPU (if available)
device = "mps" if torch.backends.mps.is_available() else "cpu"
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: mps


## Designing classification head

In [8]:
# Define the custom classification head
class ClassificationHead(nn.Module):
    def __init__(self, transformer_model, num_classes):
        super().__init__()
        self.transformer_model = transformer_model.to(device)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(transformer_model.config.hidden_size, num_classes).to(device)

    def forward(self, input_ids, attention_mask):
        output = self.transformer_model(input_ids=input_ids, attention_mask=attention_mask)[0]
        output = self.dropout(output[:, 0])  # Take the CLS token representation
        output = self.classifier(output)
        return output

## Tuning model

In [9]:
print(f"Number of labels = {len(set(train_dataset['label']))}")
set(train_dataset["label"])

Number of labels = 4


{0, 1, 2, 3}

In [10]:
# Set hyperparameters
num_classes = 4
learning_rate = 2e-5
num_epochs = 5

In [11]:
# Create the classification model
model = ClassificationHead(transformer_model, num_classes)

# Set up the optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

In [12]:
set(train_dataloader.dataset["label"]), set(list(train_dataloader)[0]["labels"].detach().tolist())

({0, 1, 2, 3}, {0, 1, 2, 3})

In [13]:
from pycm import ConfusionMatrix


def train_epoch(model, train_dataloader, optimizer, loss_fn, device):
    model.train()
    train_loss = 0
    for batch in train_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        optimizer.zero_grad()
        output = model(input_ids, attention_mask)
        loss = loss_fn(output, labels)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
    return train_loss / len(train_dataloader)


def eval_epoch(model, eval_dataloader, loss_fn, device):
    model.eval()
    eval_loss = 0
    y_preds = []
    for batch in eval_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        output = model(input_ids, attention_mask)
        loss = loss_fn(output, labels)
        eval_loss += loss.item()
        y_preds.extend(output.argmax(dim=1).detach().tolist())
    return eval_loss / len(eval_dataloader), y_preds


def test_model(model, test_dataloader, loss_fn, device):
    model.eval()
    test_loss = 0
    y_preds = []
    for batch in test_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        output = model(input_ids, attention_mask)
        loss = loss_fn(output, labels)
        test_loss += loss.item()
        y_preds.extend(output.argmax(dim=1).detach().tolist())
    test_loss /= len(test_dataloader)
    test_cm = ConfusionMatrix(test_dataloader.dataset["label"], y_preds, digit=5)
    return test_loss, test_cm, y_preds


def train_model(
    model,
    train_dataloader,
    eval_dataloader,
    test_dataloader,
    optimizer,
    loss_fn,
    device,
    num_epochs,
):
    stats = {}
    for epoch in range(num_epochs):
        tloss = train_epoch(model, train_dataloader, optimizer, loss_fn, device)
        eloss, y_preds = eval_epoch(model, eval_dataloader, loss_fn, device)
        evaluation_cm = ConfusionMatrix(
            eval_dataloader.dataset["label"], y_preds, digit=5
        )

        stats[f"epoch_{epoch+1}"] = {
            "training_loss": tloss,
            "validation_loss": eloss,
            f"epoch{epoch+1}_pred": y_preds,
            "validation_metrics": evaluation_cm,
        }

        print(
            f"Epoch = {epoch+1}/{num_epochs}\t Training Loss = {tloss:.2f}\t Validation Loss = {eloss:.2f}\t Validation Accuracy = {evaluation_cm.Overall_ACC:.2f}"
        )

    test_loss, test_cm, test_preds = test_model(model, test_dataloader, loss_fn, device)
    stats["test_loss"] = test_loss
    stats["test_metrics"] = test_cm
    stats["test_pred"] = test_preds

    stats["eval_labels"] = eval_dataloader.dataset["label"]
    stats["test_labels"] = test_dataloader.dataset["label"]

    print(f"\nTest Accuracy = {test_cm.Overall_ACC:.2f}")

    return stats

In [14]:
# Train the model and evaluate on test set
stats = train_model(
    model,
    train_dataloader,
    eval_dataloader,
    test_dataloader,
    optimizer,
    loss_fn,
    device,
    num_epochs,
)

Epoch = 1/5	 Training Loss = 1.38	 Validation Loss = 1.36	 Validation Accuracy = 0.40
Epoch = 2/5	 Training Loss = 1.36	 Validation Loss = 1.34	 Validation Accuracy = 0.43
Epoch = 3/5	 Training Loss = 1.35	 Validation Loss = 1.33	 Validation Accuracy = 0.46
Epoch = 4/5	 Training Loss = 1.33	 Validation Loss = 1.31	 Validation Accuracy = 0.49
Epoch = 5/5	 Training Loss = 1.32	 Validation Loss = 1.30	 Validation Accuracy = 0.48

Test Accuracy = 0.48


## Saving the results

In [15]:
import pickle

with open(f"results/agnews_electra.pickle", "wb") as file:
    pickle.dump(stats, file)

# # with open("agnews_electra.pickle", "rb") as file:
# #     stats = pickle.load(file)

## Model Parameters

In [16]:
from torch.nn.utils import parameters_to_vector
from prettytable import PrettyTable


def count_parameters_per_layer(model):
    table = PrettyTable(["Modules", "Parameters"])
    total_params = 0
    for name, parameter in model.named_parameters():
        if not parameter.requires_grad:
            continue
        params = parameter.numel()
        table.add_row([name, params])
        total_params += params
    print(table)
    print(f"Total Number of Parameters: {parameters_to_vector(model.parameters()).numel()}")
    print(f"Total Trainable Params: {total_params}")


count_parameters_per_layer(model)

+-------------------+------------+
|      Modules      | Parameters |
+-------------------+------------+
| classifier.weight |    3072    |
|  classifier.bias  |     4      |
+-------------------+------------+
Total Number of Parameters: 108894724
Total Trainable Params: 3076
