# Starter Notebook

Install and import required libraries

In [1]:
# !pip install transformers datasets evaluate accelerate peft trl bitsandbytes 
# !pip install nvidia-ml-py3
# !pip install pandas
# !pip install scikit-learn
# !pip install nlpaug

In [2]:
import os
import pandas as pd
import torch
from transformers import RobertaModel, RobertaTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding, RobertaForSequenceClassification
from peft import LoraConfig, get_peft_model, PeftModel
from datasets import load_dataset, Dataset, ClassLabel
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pickle
from torch.utils.data import DataLoader
import evaluate
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import warnings
warnings.filterwarnings('ignore')

## Load Tokenizer and Preprocess Data

In [4]:
base_model = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(base_model)

In [5]:
# Load CSV into HuggingFace Dataset
df = pd.read_csv("augmented_agnews_31.csv")
hf_dataset = Dataset.from_pandas(df)

# Tokenize
def tokenize(example):
    return tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=128
    )

train_dataset = hf_dataset.map(tokenize, batched=True)
train_dataset = train_dataset.rename_column("label", "labels")
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


raw_eval_dataset = load_dataset("ag_news", split="test")
df_eval = pd.DataFrame({'text': raw_eval_dataset["text"], 'label': raw_eval_dataset["label"]})
hf_eval_dataset = Dataset.from_pandas(df_eval)

eval_dataset = hf_eval_dataset.map(tokenize, batched=True)
eval_dataset = eval_dataset.rename_column("label", "labels")
eval_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

def preprocess(examples):
    tokenized = tokenizer(examples['text'], truncation=True, padding=True)
    return tokenized

Map: 100%|██████████| 1237984/1237984 [09:31<00:00, 2167.04 examples/s]
Map: 100%|██████████| 7600/7600 [00:04<00:00, 1698.74 examples/s]


In [6]:
dataset = load_dataset('ag_news', split='train')

# Extract the number of classess and their names
num_labels = dataset.features['label'].num_classes
class_names = dataset.features["label"].names
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

# Create an id2label mapping
# We will need this for our classifier.
id2label = {i: label for i, label in enumerate(class_names)}

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")


number of labels: 4
the labels: ['World', 'Sports', 'Business', 'Sci/Tech']


In [7]:
# Extract the number of classess and their names
num_labels = dataset.features['label'].num_classes
class_names = dataset.features["label"].names
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

# Create an id2label mapping
# We will need this for our classifier.
id2label = {i: label for i, label in enumerate(class_names)}

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")


number of labels: 4
the labels: ['World', 'Sports', 'Business', 'Sci/Tech']


## Load Pre-trained Model
Set up config for pretrained model and download it from hugging face

In [8]:
model = RobertaForSequenceClassification.from_pretrained(
    base_model,
    id2label=id2label)
model

# Freeze all parameters
for param in model.base_model.parameters():
    param.requires_grad = False

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Anything from here on can be modified

In [9]:
# # Split the original training set
# train_dataset = AugmentedDataset(
#     hf_dataset=raw_train_dataset,
#     tokenizer=tokenizer,
#     augment=True,
#     pct_augment=0.3,
#     max_length=128
# )

# eval_dataset = AugmentedDataset(
#     hf_dataset=raw_eval_dataset,
#     tokenizer=tokenizer,
#     augment=False,
#     max_length=128
# )

print("Training Samples: ", len(eval_dataset))
print("Testing Samples: ", len(train_dataset))

Training Samples:  7600
Testing Samples:  1237984


In [10]:
# Target only layers 6 through 11
top_layers = [f"encoder.layer.{i}.attention.self" for i in range(8, 12)]
target_modules = []
for layer in top_layers:
    target_modules.extend([f"{layer}.query", f"{layer}.value"])

In [11]:
# LoRA config
peft_config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_CLS",
    target_modules=target_modules
)

In [12]:
peft_model = get_peft_model(model, peft_config)
peft_model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-7): 8 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSdpaSelfAttention(
                  (query): Linear(in_features=768, out_features=768, bias=True)
                  (key): Linear(in_features=768, out_features=768, bias=True)
                  (value): Linear(in_features=768, out_features=768, bias=True)
                  (dropout): Dropout(p=0.1, inplace=False

In [13]:
print("Trainable parameters:")
for name, param in peft_model.named_parameters():
    if param.requires_grad:
        print(name)

Trainable parameters:
base_model.model.roberta.encoder.layer.8.attention.self.query.lora_A.default.weight
base_model.model.roberta.encoder.layer.8.attention.self.query.lora_B.default.weight
base_model.model.roberta.encoder.layer.8.attention.self.value.lora_A.default.weight
base_model.model.roberta.encoder.layer.8.attention.self.value.lora_B.default.weight
base_model.model.roberta.encoder.layer.9.attention.self.query.lora_A.default.weight
base_model.model.roberta.encoder.layer.9.attention.self.query.lora_B.default.weight
base_model.model.roberta.encoder.layer.9.attention.self.value.lora_A.default.weight
base_model.model.roberta.encoder.layer.9.attention.self.value.lora_B.default.weight
base_model.model.roberta.encoder.layer.10.attention.self.query.lora_A.default.weight
base_model.model.roberta.encoder.layer.10.attention.self.query.lora_B.default.weight
base_model.model.roberta.encoder.layer.10.attention.self.value.lora_A.default.weight
base_model.model.roberta.encoder.layer.10.attention

In [14]:
print('PEFT Model')
peft_model.print_trainable_parameters()

PEFT Model
trainable params: 986,884 || all params: 125,635,592 || trainable%: 0.7855


## Training Setup

In [15]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # Calculate accuracy
    accuracy = accuracy_score(labels, preds)
    return {
        'accuracy': accuracy
    }

In [16]:
# Setup Training args
output_dir = "results_batch_size_64_data_aug_31"
training_args = TrainingArguments(
    output_dir=output_dir,
    report_to=None,
    eval_strategy='steps',
    logging_steps=5000,
    learning_rate=1e-5,
    num_train_epochs=60,
    # max_steps=1200,
    use_cpu=False,
    dataloader_num_workers=8,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    optim="adamw_torch",
    weight_decay=0.01,
    gradient_checkpointing=True,
    lr_scheduler_type="reduce_lr_on_plateau",
    warmup_ratio=0.1,
    bf16=True,
    gradient_checkpointing_kwargs={'use_reentrant':True}
)

def get_trainer(model):
      return  Trainer(
          model=model,
          args=training_args,
          compute_metrics=compute_metrics,
          train_dataset=train_dataset,
          eval_dataset=eval_dataset,
          data_collator=data_collator,
      )

### Start Training

In [17]:
peft_lora_finetuning_trainer = get_trainer(peft_model)

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [19]:
result = peft_lora_finetuning_trainer.train()

Step,Training Loss,Validation Loss,Accuracy
5000,1.0691,0.668379,0.873553
10000,0.6274,0.410501,0.882632
15000,0.5285,0.3585,0.888553
20000,0.4987,0.338348,0.891184
25000,0.4804,0.32644,0.892895
30000,0.4699,0.320493,0.893816
35000,0.4634,0.316145,0.894605
40000,0.4604,0.311912,0.895921
45000,0.4538,0.308618,0.894868
50000,0.4491,0.306663,0.895658


IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



## Evaluate Finetuned Model


### Performing Inference on Custom Input
Uncomment following functions for running inference on custom inputs

In [20]:
def classify(model, tokenizer, text):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    inputs = tokenizer(text, truncation=True, padding=True, return_tensors="pt").to(device)
    output = model(**inputs)

    prediction = output.logits.argmax(dim=-1).item()

    print(f'\n Class: {prediction}, Label: {id2label[prediction]}, Text: {text}')
    return id2label[prediction]

In [21]:
classify( peft_model, tokenizer, "Kederis proclaims innocence Olympic champion Kostas Kederis today left hospital ahead of his date with IOC inquisitors claiming his ...")
classify( peft_model, tokenizer, "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.")


 Class: 0, Label: World, Text: Kederis proclaims innocence Olympic champion Kostas Kederis today left hospital ahead of his date with IOC inquisitors claiming his ...

 Class: 2, Label: Business, Text: Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindlinand of ultra-cynics, are seeing green again.


'Business'

### Run Inference on eval_dataset

In [22]:
def evaluate_model(inference_model, dataset, labelled=True, batch_size=8, data_collator=None):
    """
    Evaluate a PEFT model on a dataset.

    Args:
        inference_model: The model to evaluate.
        dataset: The dataset (Hugging Face Dataset) to run inference on.
        labelled (bool): If True, the dataset includes labels and metrics will be computed.
                         If False, only predictions will be returned.
        batch_size (int): Batch size for inference.
        data_collator: Function to collate batches. If None, the default collate_fn is used.

    Returns:
        If labelled is True, returns a tuple (metrics, predictions)
        If labelled is False, returns the predictions.
    """
    # Create the DataLoader
    eval_dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=data_collator)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    inference_model.to(device)
    inference_model.eval()

    all_predictions = []
    if labelled:
        metric = evaluate.load('accuracy')

    # Loop over the DataLoader
    for batch in tqdm(eval_dataloader):
        # Move each tensor in the batch to the device
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = inference_model(**batch)
        predictions = outputs.logits.argmax(dim=-1)
        all_predictions.append(predictions.cpu())

        if labelled:
            # Expecting that labels are provided under the "labels" key.
            references = batch["labels"]
            metric.add_batch(
                predictions=predictions.cpu().numpy(),
                references=references.cpu().numpy()
            )

    # Concatenate predictions from all batches
    all_predictions = torch.cat(all_predictions, dim=0)

    if labelled:
        eval_metric = metric.compute()
        print("Evaluation Metric:", eval_metric)
        return eval_metric, all_predictions
    else:
        return all_predictions

In [23]:
# Check evaluation accuracy
_, _ = evaluate_model(peft_model, eval_dataset, True, 8, data_collator)

100%|██████████| 950/950 [00:14<00:00, 64.88it/s]

Evaluation Metric: {'accuracy': 0.9028947368421053}





### Run Inference on unlabelled dataset

In [24]:
#Load your unlabelled data
unlabelled_dataset = pd.read_pickle("test_unlabelled.pkl")
test_dataset = unlabelled_dataset.map(preprocess, batched=True, remove_columns=["text"])
unlabelled_dataset

Map: 100%|██████████| 8000/8000 [00:06<00:00, 1297.71 examples/s]


Dataset({
    features: ['text'],
    num_rows: 8000
})

In [25]:
# Run inference and save predictions
preds = evaluate_model(peft_model, test_dataset, False, 8, data_collator)
df_output = pd.DataFrame({
    'ID': range(len(preds)),
    'Label': preds.numpy()  # or preds.tolist()
})
df_output.to_csv(os.path.join(output_dir,"inference_output_batch_size_64_data_aug_bf16_31.csv"), index=False)
print("Inference complete. Predictions saved to inference_output.csv")

100%|██████████| 1000/1000 [00:16<00:00, 61.62it/s]

Inference complete. Predictions saved to inference_output.csv





In [26]:
peft_model.save_pretrained("lora_roberta_adapter_batch_size_64_data_aug_bf16_31")
tokenizer.save_pretrained("lora_roberta_adapter_batch_size_64_data_aug_bf16_31")

('lora_roberta_adapter_batch_size_64_data_aug_bf16_31/tokenizer_config.json',
 'lora_roberta_adapter_batch_size_64_data_aug_bf16_31/special_tokens_map.json',
 'lora_roberta_adapter_batch_size_64_data_aug_bf16_31/vocab.json',
 'lora_roberta_adapter_batch_size_64_data_aug_bf16_31/merges.txt',
 'lora_roberta_adapter_batch_size_64_data_aug_bf16_31/added_tokens.json')

In [27]:
# from peft import PeftModel
# original_model = AutoModelForSequenceClassification.from_pretrained(
#   model.config["_name_or_path"]
# )
# original_with_adapter = PeftModel.from_pretrained(
#   original_model, "bert-peft" # bert-peft; the folder of the saved adapter
# )
# merged_model = original_with_adapter.merge_and_unload()
# merged_model.save_pretrained("merged-model")



# config = PeftConfig.from_pretrained("lora_roberta_adapter")
# base_model = AutoModelForSequenceClassification.from_pretrained(config.base_model_name_or_path)
# model = PeftModel.from_pretrained(base_model, "lora_roberta_adapter")
# tokenizer = AutoTokenizer.from_pretrained("lora_roberta_adapter")