In [2]:
# !pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [1]:
import time
import os
import pandas as pd
import torch
from transformers import RobertaModel, RobertaTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding, RobertaForSequenceClassification
# from peft import LoraConfig, get_peft_model, PeftModel
from datasets import load_dataset, Dataset, ClassLabel
import pickle
import torch.nn.functional as F
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = "cuda" if torch.cuda.is_available() else "mps" if torch.mps.is_available() else "cpu"
print(device)

mps


In [3]:
base_model = 'roberta-base'

dataset = load_dataset('ag_news', split='train', cache_dir="./data/")
tokenizer = RobertaTokenizer.from_pretrained(base_model)

# def preprocess(examples):
#     tokenized = tokenizer(examples['text'], truncation=True, padding=True)
#     return tokenized

# tokenized_dataset = dataset.map(preprocess, batched=True,  remove_columns=["text"])
# tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

In [4]:
# Extract the number of classess and their names
num_labels = dataset.features['label'].num_classes
class_names = dataset.features["label"].names
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

# Create an id2label mapping
# We will need this for our classifier.
id2label = {i: label for i, label in enumerate(class_names)}

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")


number of labels: 4
the labels: ['World', 'Sports', 'Business', 'Sci/Tech']


In [5]:
model = RobertaForSequenceClassification.from_pretrained(
    base_model,
    id2label=id2label,
    cache_dir="./model_dir")
# model

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
# Load tokenizer and model
base_model = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(base_model)

# Load datasets
train_dataset = load_dataset('ag_news', split='train', cache_dir='./data/')
test_dataset = load_dataset('ag_news', split='test', cache_dir='./data/')

# Tokenization function
def preprocess(examples):
    return tokenizer(examples['text'], truncation=True, padding=True)

# Apply tokenization
tokenized_train = train_dataset.map(preprocess, batched=True, remove_columns=["text"])
tokenized_test = test_dataset.map(preprocess, batched=True, remove_columns=["text"])

# Rename label column
tokenized_train = tokenized_train.rename_column("label", "labels")
tokenized_test = tokenized_test.rename_column("label", "labels")

# Set format for PyTorch
tokenized_train.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
tokenized_test.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

batch_size_val = 4 if device == "cpu" else 16 if device == "cuda" else 16
num_workers_val = 4 if device == "cuda" else 0
# pin_mem = True if device == "cuda" else False

# Create DataLoaders
train_dataloader = DataLoader(tokenized_train, batch_size=batch_size_val, shuffle=True, collate_fn=data_collator, num_workers=num_workers_val)
test_dataloader = DataLoader(tokenized_test, batch_size=batch_size_val, shuffle=False, collate_fn=data_collator, num_workers=num_workers_val)

In [7]:
import math
class LoRALayer(torch.nn.Module):
  def __init__(self, in_dim, out_dim, r, alpha):
    super().__init__()
    self.r = r
    self.alpha = alpha

    # Initialize A to kaiming uniform following code: https://github.com/microsoft/LoRA/blob/main/loralib/layers.py
    self.A = torch.nn.Parameter(torch.empty(r, in_dim))
    # Initialize B to zeros.
    self.B = torch.nn.Parameter(torch.empty(out_dim, r))
    torch.nn.init.kaiming_uniform_(self.A, a=math.sqrt(5))
    torch.nn.init.zeros_(self.B)

    self.scaling = self.alpha / self.r

  # def forward(self, x):
  #   x = self.scaling * (x @ self.A.transpose(0, 1) @ self.B.transpose(0, 1))
  #   return x
  def forward(self, x):
    A = self.A.to(x.device)
    B = self.B.to(x.device)
    x = self.scaling * (x @ A.transpose(0, 1) @ B.transpose(0, 1))
    return x

In [8]:
class LinearWithLoRA(torch.nn.Module):
  def __init__(self, linear, r, alpha):
    super().__init__()
    self.linear = linear
    self.lora = LoRALayer(
        linear.in_features, linear.out_features, r, alpha
    )

  def forward(self, x):
    return self.linear(x) + self.lora(x)

In [9]:
def count_parameters(model):
  return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [10]:
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [11]:
def get_accuracy(y_pred, targets):
  predictions = torch.log_softmax(y_pred, dim=1).argmax(dim=1)
  accuracy = (predictions == targets).sum() / len(targets)
  return accuracy

In [12]:
lora_model = model
# optimizer_lora = torch.optim.Adam(params=lora_model.parameters(), lr=lr)
loss_function = torch.nn.CrossEntropyLoss()

In [13]:
def train(model, train_loader, optimizer, save_every_steps=200, output_dir="./checkpoints", epochs=None, max_steps=None):
    import time, os
    os.makedirs(output_dir, exist_ok=True)
    recent_checkpoints = []

    total_time = 0
    total_steps = 0
    interval = len(train_loader) // 5 if len(train_loader) >= 5 else 1
    epoch = 0

    while True:
        if epochs is not None and epoch >= epochs:
            break

        model.train()
        total_train_loss, total_train_acc = 0, 0
        start = time.time()

        for batch_idx, batch in enumerate(train_loader):
            if max_steps is not None and total_steps >= max_steps:
                break

            optimizer.zero_grad()
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss, logits = outputs.loss, outputs.logits

            acc = get_accuracy(logits, labels)
            total_train_loss += loss.item()
            total_train_acc += acc.item()

            loss.backward()
            optimizer.step()
            total_steps += 1

            if total_steps % 100 == 0 or total_steps == 1:
                print(f"[Step {total_steps}] Loss: {loss.item():.4f} | Acc: {acc.item():.4f}")

            if total_steps % save_every_steps == 0 or total_steps == max_steps:
                ckpt_path = os.path.join(output_dir, f"model_step_{total_steps}.pt")
                torch.save({
                      'model_state_dict': model.state_dict(),
                      'optimizer_state_dict': optimizer.state_dict(),
                      'step': total_steps,
                      'epoch': epoch,
                  }, ckpt_path)

                print(f"Checkpoint saved: {ckpt_path}")

                recent_checkpoints.append(ckpt_path)
                if len(recent_checkpoints) > 2:
                    old_ckpt = recent_checkpoints.pop(0)
                    if os.path.exists(old_ckpt):
                        os.remove(old_ckpt)
                        print(f"Old checkpoint removed: {old_ckpt}")

        epoch += 1
        total_time += (time.time() - start)

        if max_steps is not None and total_steps >= max_steps:
            print(f"Reached max_steps={max_steps}, stopping training.")
            break

    print(f"Training done in {epoch} epoch(s)")


In [14]:
def train(model, train_loader, optimizer, save_every_steps=200, output_dir="./checkpoints", epochs=None, max_steps=None):
    import time, os
    os.makedirs(output_dir, exist_ok=True)
    recent_checkpoints = []

    total_training_time = 0
    total_steps = 0
    interval = len(train_loader) // 5 if len(train_loader) >= 5 else 1
    epoch = 0

    print("Starting training...\n")
    training_start = time.time()

    while True:
        if epochs is not None and epoch >= epochs:
            break

        model.train()
        total_train_loss, total_train_acc = 0, 0
        epoch_start = time.time()

        for batch_idx, batch in enumerate(train_loader):
            if max_steps is not None and total_steps >= max_steps:
                break

            step_start = time.time()

            optimizer.zero_grad()
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss, logits = outputs.loss, outputs.logits

            acc = get_accuracy(logits, labels)
            total_train_loss += loss.item()
            total_train_acc += acc.item()

            loss.backward()
            optimizer.step()
            total_steps += 1

            step_end = time.time()
            step_time = step_end - step_start

            if total_steps % 100 == 0 or total_steps == 1:
                print(f"[Step {total_steps}] Loss: {loss.item():.4f} | Acc: {acc.item():.4f} | Time: {step_time:.2f}s")

            if total_steps % save_every_steps == 0 or total_steps == max_steps:
                ckpt_path = os.path.join(output_dir, f"model_step_{total_steps}.pt")
                torch.save({
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'step': total_steps,
                    'epoch': epoch,
                }, ckpt_path)

                print(f"Checkpoint saved: {ckpt_path}")

                recent_checkpoints.append(ckpt_path)
                if len(recent_checkpoints) > 2:
                    old_ckpt = recent_checkpoints.pop(0)
                    if os.path.exists(old_ckpt):
                        os.remove(old_ckpt)
                        print(f"Old checkpoint removed: {old_ckpt}")

        epoch_end = time.time()
        epoch_time = epoch_end - epoch_start
        total_training_time += epoch_time

        print(f"Epoch {epoch+1} completed in {epoch_time:.2f}s")

        epoch += 1

        if max_steps is not None and total_steps >= max_steps:
            print(f"Reached max_steps={max_steps}, stopping training.")
            break

    overall_time = time.time() - training_start
    print(f"\nTraining done in {epoch} epoch(s)")
    print(f"Total training time: {overall_time:.2f}s")


In [15]:
def evaluate(model, test_loader):
  interval = len(test_loader) // 5

  total_test_loss = 0
  total_test_acc = 0

  model.eval()
  with torch.no_grad():
    for batch_idx, batch in enumerate(test_loader):
      input_ids = batch["input_ids"].to(device)
      attention_mask = batch["attention_mask"].to(device)
      labels = batch["labels"].to(device)

      outputs = model(
          input_ids=input_ids,
          attention_mask=attention_mask,
          labels=labels
      )
      loss = outputs.loss
      logits = outputs.logits

      acc = get_accuracy(logits, labels)

      total_test_loss += loss.item()
      total_test_acc += acc.item()

      if (batch_idx + 1) % interval == 0:
        print("Batch: %s/%s | Test loss: %.4f | accuracy: %.4f" % (batch_idx+1, len(test_loader), loss, acc))

  test_loss = total_test_loss / len(test_loader)
  test_acc = total_test_acc / len(test_loader)

  print(f"Test loss: {test_loss:.4f} acc: {test_acc:.4f}")
  print("")

In [16]:
def evaluate_unlabelled(model, data_loader):
    model.eval()
    preds = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
            )
            logits = outputs.logits  # [B, num_classes]
            predictions = torch.argmax(logits, dim=1)  # get predicted class indices
            preds.append(predictions.cpu())

    return torch.cat(preds, dim=0)  # combine into a single tensor


In [17]:
from functools import partial
import torch.nn as nn

def setup_lora_model(base_model, r, alpha):
    model = RobertaForSequenceClassification.from_pretrained(
        base_model,
        id2label=id2label,
        cache_dir="./model_dir"
    )

    # Freeze base model params
    for param in model.parameters():
        param.requires_grad = False

    # Define LoRA layers
    class LoRALayer(nn.Module):
        def __init__(self, in_dim, out_dim, r, alpha):
            super().__init__()
            self.A = nn.Parameter(torch.empty(r, in_dim))
            self.B = nn.Parameter(torch.empty(out_dim, r))
            torch.nn.init.kaiming_uniform_(self.A, a=math.sqrt(5))
            torch.nn.init.zeros_(self.B)
            self.scaling = alpha / r

        def forward(self, x):
            A = self.A.to(x.device)
            B = self.B.to(x.device)
            return self.scaling * (x @ A.T @ B.T)

    class LinearWithLoRA(nn.Module):
        def __init__(self, linear, r, alpha):
            super().__init__()
            self.linear = linear
            self.lora = LoRALayer(linear.in_features, linear.out_features, r, alpha)

        def forward(self, x):
            return self.linear(x) + self.lora(x)

    assign_lora = partial(LinearWithLoRA, r=r, alpha=alpha)

    for layer in model.roberta.encoder.layer:
        layer.attention.self.query = assign_lora(layer.attention.self.query)
        layer.attention.self.value = assign_lora(layer.attention.self.value)

    return model.to(device)


In [18]:
def load_checkpoint(model, optimizer, checkpoint_path, resume_training=True):
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])

    if resume_training:
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        step = checkpoint.get('step', 0)
        epoch = checkpoint.get('epoch', 0)
        print(f"Resumed training from epoch {epoch}, step {step}")
        return model, optimizer, step, epoch
    else:
        print("Model loaded for inference")
        return model

In [None]:
import os
import pandas as pd

lora_r_values = [8, 16, 20, 24, 28, 32]
lora_alpha_values = [16, 32, 40, 48, 56, 64]
max_steps = 100
save_every_steps = 100

# Load unlabelled test set
unlabelled_df = pd.read_pickle("test_unlabelled.pkl")
# unlabelled_dataset = Dataset.from_pandas(unlabelled_df)
tokenized_unlabelled = unlabelled_df.map(preprocess, batched=True, remove_columns=["text"])
tokenized_unlabelled.set_format(type='torch', columns=['input_ids', 'attention_mask'])
unlabelled_loader = DataLoader(tokenized_unlabelled, batch_size=batch_size_val, shuffle=False, collate_fn=data_collator)

for r, alpha in zip(lora_r_values, lora_alpha_values):
    # for alpha in lora_alpha_values:
    tag = f"r{r}_alpha{alpha}"
    print(f"\n=== Training LoRA {tag} ===")

    model = setup_lora_model(base_model, r, alpha)
    print("Trainable Parameter Count: {}".format(count_parameters(model)))
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

    ckpt_dir = f"./checkpoints/{tag}"
    train(model, train_dataloader, optimizer, save_every_steps=save_every_steps, output_dir=ckpt_dir, max_steps=max_steps)
    evaluate(lora_model, test_dataloader)
    print(f"=== Inference for {tag} ===")
    inference_start = time.time()
    preds = evaluate_unlabelled(model, unlabelled_loader)

    output_dir = "./results"
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f"inference_output_{tag}.csv")

    df_output = pd.DataFrame({
      'ID': range(len(preds)),
      'Label': preds.numpy()
    })
    df_output.to_csv(output_path, index=False)

    print(f"Predictions saved to {output_path}")
    inference_end = time.time()
    inference_time = inference_end - inference_start
    print(f"Inference time: {inference_time:.2f}s")
    # Cleanup after inference
    del model
    del optimizer
    del preds
    del df_output
    torch.cuda.empty_cache()  # safe to call on CPU too

    import gc
    gc.collect()


Map: 100%|█████████████████████████| 8000/8000 [00:01<00:00, 4859.80 examples/s]



=== Training LoRA r8_alpha16 ===


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trainable Parameter Count: 294912
Starting training...

[Step 1] Loss: 1.4121 | Acc: 0.1250 | Time: 1.27s
[Step 100] Loss: 1.3892 | Acc: 0.1250 | Time: 0.33s
Checkpoint saved: ./checkpoints/r8_alpha16/model_step_100.pt
Epoch 1 completed in 37.43s
Reached max_steps=100, stopping training.

Training done in 1 epoch(s)
Total training time: 37.43s
Batch: 190/950 | Test loss: 1.4296 | accuracy: 0.1250
Batch: 380/950 | Test loss: 1.4258 | accuracy: 0.1250
Batch: 570/950 | Test loss: 1.3613 | accuracy: 0.3750
Batch: 760/950 | Test loss: 1.3464 | accuracy: 0.3750
Batch: 950/950 | Test loss: 1.4072 | accuracy: 0.2500
Test loss: 1.3991 acc: 0.2500

=== Inference for r8_alpha16 ===
Predictions saved to ./results/inference_output_r8_alpha16.csv
Inference time: 94.54s

=== Training LoRA r16_alpha32 ===


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trainable Parameter Count: 589824
Starting training...

[Step 1] Loss: 1.3988 | Acc: 0.1250 | Time: 0.51s
[Step 100] Loss: 1.4028 | Acc: 0.1250 | Time: 0.34s
Checkpoint saved: ./checkpoints/r16_alpha32/model_step_100.pt
Epoch 1 completed in 34.05s
Reached max_steps=100, stopping training.

Training done in 1 epoch(s)
Total training time: 34.06s
Batch: 190/950 | Test loss: 1.4296 | accuracy: 0.1250
Batch: 380/950 | Test loss: 1.4258 | accuracy: 0.1250
Batch: 570/950 | Test loss: 1.3613 | accuracy: 0.3750
Batch: 760/950 | Test loss: 1.3464 | accuracy: 0.3750
Batch: 950/950 | Test loss: 1.4072 | accuracy: 0.2500
Test loss: 1.3991 acc: 0.2500

=== Inference for r16_alpha32 ===
Predictions saved to ./results/inference_output_r16_alpha32.csv
Inference time: 94.58s

=== Training LoRA r20_alpha40 ===


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trainable Parameter Count: 737280
Starting training...

[Step 1] Loss: 1.4182 | Acc: 0.0000 | Time: 0.85s
[Step 100] Loss: 1.3659 | Acc: 0.1250 | Time: 0.29s
Checkpoint saved: ./checkpoints/r20_alpha40/model_step_100.pt
Epoch 1 completed in 34.44s
Reached max_steps=100, stopping training.

Training done in 1 epoch(s)
Total training time: 34.44s
Batch: 190/950 | Test loss: 1.4296 | accuracy: 0.1250
Batch: 380/950 | Test loss: 1.4258 | accuracy: 0.1250
Batch: 570/950 | Test loss: 1.3613 | accuracy: 0.3750
Batch: 760/950 | Test loss: 1.3464 | accuracy: 0.3750
Batch: 950/950 | Test loss: 1.4072 | accuracy: 0.2500
Test loss: 1.3991 acc: 0.2500

=== Inference for r20_alpha40 ===
