# Starter Notebook

Install and import required libraries

In [4]:
!pip install transformers datasets evaluate accelerate peft trl bitsandbytes
!pip install nvidia-ml-py3
!pip install nlpaug

Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl.metadata (14 kB)
Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nlpaug
Successfully installed nlpaug-1.1.11


In [2]:
import os
import numpy as np
import pandas as pd
import torch
from transformers import RobertaModel, RobertaTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding, RobertaForSequenceClassification,EarlyStoppingCallback
from peft import LoraConfig, get_peft_model, PeftModel, TaskType
from datasets import load_dataset, Dataset, ClassLabel
import pickle

## Load Tokenizer and Preprocess Data

In [16]:
base_model = 'roberta-base'
dataset = load_dataset('ag_news')
tokenizer = RobertaTokenizer.from_pretrained(base_model)

num_labels = dataset['train'].features['label'].num_classes
class_names = dataset['train'].features["label"].names
print(f"Number of labels: {num_labels}")
print(f"The labels: {class_names}")

id2label = {i: label for i, label in enumerate(class_names)}
label2id = {v: k for k, v in id2label.items()}

import nlpaug.augmenter.word as naw

# Use WordNet-based synonym replacement
aug = naw.SynonymAug(aug_src='wordnet', aug_min=1, aug_p=0.3)

def augment_text(text):
    try:
        return aug.augment(text)
    except:
        return text  # fallback in case of errors

def preprocess_function(examples):
    texts = [augment_text(text.replace('\n', ' ').strip()) for text in examples["text"]]
    tokenized = tokenizer(
        texts,
        truncation=True,
        padding='max_length',
        max_length=128
    )
    tokenized['labels'] = examples['label']
    return tokenized

# Process dataset
columns_to_remove = [col for col in dataset['train'].column_names if col not in ['label']]
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=columns_to_remove,
    desc="Tokenizing datasets"
)

class CustomDataCollator(DataCollatorWithPadding):
    def __call__(self, features):
        batch = super().__call__(features)
        return {k: v.to(torch.long) for k, v in batch.items()}

data_collator = CustomDataCollator(
    tokenizer=tokenizer,
    padding=True
)

Number of labels: 4
The labels: ['World', 'Sports', 'Business', 'Sci/Tech']


Tokenizing datasets:   0%|          | 0/120000 [00:00<?, ? examples/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       d

Tokenizing datasets:   0%|          | 0/7600 [00:00<?, ? examples/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       d

## Load Pre-trained Model
Set up config for pretrained model and download it from hugging face

In [17]:
model = RobertaForSequenceClassification.from_pretrained(
    base_model,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)
model

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

## Anything from here on can be modified

## Setup LoRA Config
Setup PEFT config and get peft model for finetuning

In [18]:
lora_config = LoraConfig(
    r=8,  # Keep same rank as it's working well
    lora_alpha=32,  # Increased from 16 to 32 for stronger adaptation
    target_modules=["query", "value"],
    lora_dropout=0.1,  # Reduced dropout for better convergence
    bias="lora_only",
    task_type=TaskType.SEQ_CLS,
    modules_to_save=['classifier']  # Fine-tune classifier layer
)

In [19]:
peft_model = get_peft_model(model, lora_config)
peft_model.print_trainable_parameters()

trainable params: 907,012 || all params: 125,537,288 || trainable%: 0.7225


## Training Setup

In [20]:
# To track evaluation accuracy during training
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    accuracy = (predictions == labels).mean()
    return {"accuracy": accuracy}


In [21]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",    # Changed from "no" to save checkpoints
    learning_rate=1e-4,       # Reduced from 2e-4
    per_device_train_batch_size=32,  # Increased from 16
    per_device_eval_batch_size=64,
    num_train_epochs=4,       # Increased to allow proper early stopping
    weight_decay=0.03,        # Increased from 0.01
    logging_dir="./logs",
    report_to="none",
    fp16=True,
    load_best_model_at_end=True,  # Required for early stopping
    metric_for_best_model="eval_loss",  # Monitor validation loss
    greater_is_better=False,  # Lower loss is better
    warmup_steps=500,         # Added warmup steps
    lr_scheduler_type="cosine",  # Added cosine scheduler
    gradient_accumulation_steps=2,  # Effectively doubles batch size
    fp16_full_eval=True,      # Enable full fp16 for evaluation
    logging_steps=50          # Log more frequently
)



### Start Training

In [22]:
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=2,
    early_stopping_threshold=0.0005  # Tighter threshold
)

In [23]:
tokenized_dataset_aug = dataset["train"].map(
    preprocess_function,
    batched=True,
    remove_columns=columns_to_remove,
    desc="Augmenting + Tokenizing training set"
)

In [24]:
trainer = Trainer(
   model=peft_model,
    args=training_args,
    train_dataset=tokenized_dataset_aug,
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping_callback],  # Add callback
)

# 9. Train
trainer.train()

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2344,0.206918,0.930921
2,0.194,0.183675,0.938158
3,0.1766,0.178466,0.941711
4,0.1667,0.17786,0.942895


TrainOutput(global_step=7500, training_loss=0.22243072814941406, metrics={'train_runtime': 2170.7996, 'train_samples_per_second': 221.117, 'train_steps_per_second': 3.455, 'total_flos': 3.190145974272e+16, 'train_loss': 0.22243072814941406, 'epoch': 4.0})

## Evaluate Finetuned Model


In [25]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model.to(device)
eval_results = trainer.evaluate()
print("Final Evaluation Accuracy:", eval_results["eval_accuracy"])

Using device: cuda


Final Evaluation Accuracy: 0.9428947368421052


In [26]:
trainable_params = sum(p.numel() for p in peft_model.parameters() if p.requires_grad)
print(f"Trainable parameters: {trainable_params}")

Trainable parameters: 907012


### Performing Inference on Custom Input
Uncomment following functions for running inference on custom inputs

In [31]:
from datasets import Dataset
from torch.utils.data import DataLoader

# Load dataset object
with open("/content/test_unlabelled.pkl", "rb") as f:
    test_dataset = pickle.load(f)

# Convert to HuggingFace Dataset (already is, but this helps formatting)
test_dataset = Dataset.from_dict({"text": test_dataset["text"]})

# Tokenize function
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

# Apply tokenizer
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

# Create PyTorch DataLoader for batching
test_dataloader = DataLoader(tokenized_test_dataset, batch_size=64)

# Prediction loop
model.eval()
all_predictions = []

with torch.no_grad():
    for batch in test_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        preds = torch.argmax(outputs.logits, dim=-1)
        all_predictions.extend(preds.cpu().numpy())

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

In [32]:
df = pd.DataFrame({
    "ID": range(len(all_predictions)),         # ✅ actual IDs from the dataset
    "Labels": all_predictions         # ✅ correct column name "Labels"
})

df.to_csv("new_submission1.csv", index=False)      # ✅ filename doesn’t matter, but keep index=False
print("✅ Submission file created: submission.csv")

✅ Submission file created: submission.csv
