### Notebook Params

- Base Training Arguments: 
    - Batch Size 32 + AdamW Optimiser + Cosine Scheduler
    - Learning rate set to 2e-4
    - using warmup ratio of 0.1
    - weight decay 0.1
    - Num of epochs is 10
- Base LoRA Config
    - Changed dropout to 0.6
    - Rank set to 4 and alpha to 8 on the query and value projections
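    - Use DoRA (`use_dora=True`)
    - Use rank-stabilized LoRA (`use_rslora=True`)
    - Use Gaussian initialization of the LoRA weights (`init_lora_weights='gaussian'`)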
- Base Data Pre-processing

In [1]:
import os
import pickle
import random

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from peft import get_peft_model, LoraConfig, TaskType
from sklearn.metrics import accuracy_score, f1_score, precision_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Seed every random number generator the notebook touches so that a
# Restart & Run All is reproducible.
seed = 42

# NOTE: PYTHONHASHSEED is read once at interpreter start-up. Setting it here
# does not change hash randomisation of the running kernel; it is only
# inherited by processes started afterwards.
os.environ['PYTHONHASHSEED'] = str(seed)

random.seed(seed)                  # Python's built-in RNG
np.random.seed(seed)               # NumPy legacy global RNG
torch.manual_seed(seed)            # PyTorch default generator
torch.cuda.manual_seed(seed)       # current CUDA device
torch.cuda.manual_seed_all(seed)   # every visible CUDA device

# cuDNN may pick non-deterministic kernels even with fixed seeds, so request
# deterministic algorithms and disable the auto-tuner that selects kernels
# by benchmarking.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [3]:
# -----------------------------
# 2. Use GPU if available
# -----------------------------
# Prefer CUDA and fall back to the CPU otherwise.
# (On Apple silicon, "mps" could be substituted when
# torch.backends.mps.is_available() is true.)
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# -----------------------------
# 3. Load and preprocess AGNEWS dataset
# -----------------------------
# The tokenizer must come from the same checkpoint as the model that is
# fine-tuned later ("roberta-base").
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

# AG News from the Hugging Face hub; the "train" and "test" splits are used below.
dataset = load_dataset("ag_news")

def tokenize_function(examples, max_length=128):
    """Tokenize a batch of examples with the module-level ``tokenizer``.

    Args:
        examples: Batch mapping with a ``"text"`` entry, as supplied by
            ``map(..., batched=True)``.
        max_length: Fixed sequence length. Each text is truncated to it and
            padded up to it. Defaults to 128, the value previously hard-coded.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

# Tokenize every split in batches, then rename the target column to
# "labels" (the name listed in the torch-formatted columns below).
tokenized_dataset = (
    dataset
    .map(tokenize_function, batched=True)
    .rename_column("label", "labels")
)

# Expose only the model inputs and the target as torch tensors.
model_columns = ["input_ids", "attention_mask", "labels"]
tokenized_dataset.set_format("torch", columns=model_columns)

Using device: cuda


In [4]:
# Re-seed all RNGs at this point of the notebook. Presumably this is so the
# randomly initialised weights created in the following cell do not depend on
# how much randomness earlier cells consumed.
seed = 42

# NOTE: PYTHONHASHSEED only takes effect for processes started after this
# point; the running interpreter fixed its hash seed at start-up.
os.environ['PYTHONHASHSEED'] = str(seed)

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# some cudnn methods can be random even after fixing the seed
# unless you tell it to be deterministic (and keep the kernel auto-tuner off)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [5]:
# Adapter configuration: rank-4 LoRA (alpha 8) on the attention query/value
# projections, with DoRA, rank-stabilized scaling and Gaussian initialisation.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    target_modules=["query", "value"],
    r=4,
    lora_alpha=8,
    lora_dropout=0.6,
    bias="none",
    use_dora=True,
    use_rslora=True,
    init_lora_weights='gaussian',
)

# RoBERTa-base with a 4-way sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=4)

# Wrap the base model with the adapters and move the result to the device.
model = get_peft_model(model, lora_config)
model.to(device)

# Report how many parameters will actually be trained.
model.print_trainable_parameters()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 759,556 || all params: 125,408,264 || trainable%: 0.6057


In [6]:
# Fine-tuning hyper-parameters: 10 epochs, batch size 32, lr 2e-4 with a
# cosine schedule and 10% warm-up, weight decay 0.1. Evaluation and
# checkpointing both happen once per epoch so the checkpoint with the lowest
# eval_loss can be restored at the end.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    num_train_epochs=10,
    weight_decay=0.1,
    logging_dir="./logs",
    report_to="none",
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # Name the label column explicitly: with a PEFT-wrapped model the Trainer
    # warns that it cannot infer label_names on its own.
    label_names=["labels"],
    # Use the notebook-wide seed instead of relying on the library default.
    seed=seed,
    # max_grad_norm = 0.5
)

def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. The predicted class of each row
            is the arg-max over the last axis of ``logits``.

    Returns:
        dict with ``"accuracy"``, weighted ``"f1"`` and weighted
        ``"precision"``.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=-1)

    metrics = {}
    metrics["accuracy"] = accuracy_score(references, predicted)
    metrics["f1"] = f1_score(references, predicted, average='weighted')
    metrics["precision"] = precision_score(references, predicted, average='weighted')
    return metrics

In [7]:
# The AG News "test" split is the evaluation set here: it drives early
# stopping and the choice of the checkpoint restored at the end.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    # `tokenizer=` is deprecated in Trainer.__init__; `processing_class=` is
    # its replacement in recent transformers releases.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    # Early stopping: patience of 3 evaluations, with 0.01 as the threshold
    # an eval_loss change must reach to count as an improvement.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.01)],
)

trainer.train()


  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision
1,0.2874,0.263342,0.915526,0.9154,0.916514
2,0.2459,0.212297,0.926053,0.925915,0.925973
3,0.2343,0.198943,0.932763,0.932683,0.933471
4,0.2103,0.200446,0.931842,0.931889,0.932075
5,0.1952,0.195283,0.936316,0.936117,0.936704
6,0.188,0.188765,0.937895,0.937758,0.937678


TrainOutput(global_step=22500, training_loss=0.24767076687282985, metrics={'train_runtime': 3432.9444, 'train_samples_per_second': 349.554, 'train_steps_per_second': 10.924, 'total_flos': 4.778084450304e+16, 'train_loss': 0.24767076687282985, 'epoch': 6.0})

In [8]:
# Evaluate with the trainer and report the accuracy entry of the result.
eval_results = trainer.evaluate()
final_accuracy = eval_results["eval_accuracy"]
print("Final Evaluation Accuracy:", final_accuracy)

Final Evaluation Accuracy: 0.9378947368421052


In [9]:
# Count the elements of every parameter that still requires gradients.
trainable_params = 0
for param in model.parameters():
    if param.requires_grad:
        trainable_params += param.numel()
print(f"Trainable parameters: {trainable_params}")

Trainable parameters: 759556


In [10]:
from datasets import Dataset
from torch.utils.data import DataLoader

# Load the pickled, unlabelled test set.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only open files from a trusted source.
with open("./data/test_unlabelled.pkl", "rb") as pkl_file:
    unlabelled_raw = pickle.load(pkl_file)

# Keep only the "text" column and wrap it in a fresh Hugging Face Dataset.
test_dataset = Dataset.from_dict({"text": unlabelled_raw["text"]})

# Tokenize function
def preprocess_function(examples):
    """Tokenize a batch of unlabelled test examples.

    Delegates to ``tokenize_function`` (defined with the training
    preprocessing) so that inference inputs are always tokenized with the
    same settings as the training data.

    Args:
        examples: Batch mapping with a ``"text"`` entry.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenize_function(examples)

# Tokenize the test texts in batches.
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)

# Only the model inputs are exposed as torch tensors (there are no labels).
inference_columns = ["input_ids", "attention_mask"]
tokenized_test_dataset.set_format(type="torch", columns=inference_columns)

# Batches of 64 in dataset order, so predictions line up with row indices.
test_dataloader = DataLoader(tokenized_test_dataset, batch_size=64, shuffle=False)

# Prediction loop: evaluation mode, no gradient tracking, one arg-max class
# id per test example collected in dataset order.
model.eval()
all_predictions = []

with torch.no_grad():
    for batch in test_dataloader:
        # Move every tensor of the batch to the device the model lives on.
        inputs_on_device = {name: tensor.to(device) for name, tensor in batch.items()}
        logits = model(**inputs_on_device).logits
        batch_classes = logits.argmax(dim=-1)
        all_predictions += list(batch_classes.cpu().numpy())

Map: 100%|██████████| 8000/8000 [00:00<00:00, 20482.69 examples/s]


In [11]:
# Write the predictions as a submission file: the row index is the ID and
# the predicted class is the label.
submission_path = "submission-v71.csv"

df = pd.DataFrame({
    "ID": list(range(len(all_predictions))),   # ID ✅
    "label": all_predictions
})
df.to_csv(submission_path, index=False)

# Report the file that was actually written (the message used to name
# "submission.csv" although the CSV is saved under a versioned name).
print(f"✅ Batched predictions complete. Saved to {submission_path}.")

✅ Batched predictions complete. Saved to submission.csv.
