# LoRA Demo

In [1]:
from datasets import load_dataset, Dataset, DatasetDict
from transformers import (
    AutoTokenizer, 
    AutoConfig, 
    AutoModelForSequenceClassification,
    DataCollatorWithPadding, 
    TrainingArguments, 
    Trainer
)
from peft import LoraConfig, get_peft_model, TaskType
import torch
import evaluate
import numpy as np


In [2]:
# SST-2 from the GLUE benchmark: single sentences labelled for binary sentiment.
# The recorded output below lists the train/validation/test splits, each with
# 'sentence', 'label' and 'idx' columns.
dataset = load_dataset(path="glue", name="sst2")
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [3]:
# Pretrained checkpoint that is fine-tuned in this notebook.
base_model_name = "roberta-base"

# Label maps between class ids and human-readable names.
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {name: idx for idx, name in id2label.items()}

# Start from RoBERTa's default configuration and override the classification fields:
#   num_labels          -> output size of the classification head (one per class)
#   id2label / label2id -> stored in the config so predictions can be reported by name
config = AutoConfig.from_pretrained(
    base_model_name,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

# Tokenizer of the same checkpoint: splits text into sub-word tokens, maps them to ids
# and inserts RoBERTa's special tokens (<s> and </s>; <pad> is used for padding).
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# roberta-base is a general-purpose encoder, not a classifier.
# AutoModelForSequenceClassification places a classification head on top of it, sized by
# config.num_labels. That head is newly initialised (see the warning in the output), so
# predictions are meaningless until the model has been fine-tuned.
base_model = AutoModelForSequenceClassification.from_pretrained(
    base_model_name,
    config=config,
)

print(f"Before adding PAD token, tokenizer vocalbulary size: {len(tokenizer)}")
print(f"Before adding PAD token, tokenizer padding token: {tokenizer.pad_token}")

# Sentences have different lengths, but a batch must be rectangular, so shorter
# sequences are filled up with a padding token.
# roberta-base already defines one (<pad>, see the output below), so this guard does
# nothing for this checkpoint. It only matters for checkpoints that ship without a pad
# token: there a new token is registered and the embedding matrix is grown to match.
if tokenizer.pad_token is None:
    print("Adding PAD token to tokenizer and resizing model embeddings...")
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    base_model.resize_token_embeddings(len(tokenizer))

print(f"After adding PAD token, tokenizer vocalbulary size: {len(tokenizer)}")
print(f"After adding PAD token, tokenizer padding token: {tokenizer.pad_token}")


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Before adding PAD token, tokenizer vocalbulary size: 50265
Before adding PAD token, tokenizer padding token: <pad>
After adding PAD token, tokenizer vocalbulary size: 50265
After adding PAD token, tokenizer padding token: <pad>


In [21]:
# Print the module tree of `base_model`.
# NOTE: the recorded output below contains lora.Linear layers, i.e. this cell was
# executed after the LoRA cell, whose get_peft_model() call injected the adapters into
# `base_model` itself. Run this cell before the LoRA cell to see the plain RoBERTa
# structure.
print(base_model)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): lora.Linear(
                (base_layer): Linear(in_features=768, out_features=768, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=768, out_features=4, bias=False)
                )
                (lora_B): ModuleDict(
      

In [4]:
# Tokenization step.
# Sentences are converted to token ids and truncated to at most 512 tokens.
# No padding happens here: sequences keep their own length and are padded per batch
# later by the data collator.

# If a sentence is longer than the limit, tokens are dropped from its beginning.
# Set once here instead of on every mapped batch.
tokenizer.truncation_side = "left"


def tokenize_function(examples):
    """Tokenize one batch of dataset records.

    Args:
        examples: Batch dictionary passed by `Dataset.map(..., batched=True)`.
            Only the "sentence" column is read.

    Returns:
        The tokenizer output for the batch (the recorded output shows the columns
        `input_ids` and `attention_mask`), truncated to at most 512 tokens and
        returned as plain Python lists. `return_tensors` is deliberately not used:
        the sequences are unpadded and of different lengths, so they do not form a
        rectangular array, and `Dataset.map` stores them as lists anyway.
    """
    return tokenizer(
        examples["sentence"],
        truncation=True,
        max_length=512,
    )


# Tokenize every split (train, validation and test) in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets


Map:   0%|          | 0/872 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [5]:
# Data collators assemble individual samples into a batch for training/evaluation.
# DataCollatorWithPadding pads the tokenized samples of each batch to a common length
# using the tokenizer's pad token (dynamic, per-batch padding), which is why the
# dataset itself can stay unpadded.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [7]:
# The `evaluate` library (by Hugging Face) provides standard evaluation metrics.
# "accuracy" calculates the fraction of predicted labels that match the true labels.
accuracy_metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for the Trainer's evaluation loop.

    Args:
        eval_pred: Pair `(logits, labels)` as passed by `Trainer`; the predicted class
            of each example is the argmax over the last axis of `logits`.

    Returns:
        A flat dictionary `{"accuracy": <float>}`.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # compute() already returns {"accuracy": value}. Return it unchanged: wrapping it in
    # another {"accuracy": ...} would hand the Trainer a nested dict instead of a number.
    return accuracy_metric.compute(predictions=predictions, references=labels)

In [8]:
# Hand-written sample sentences (alternating positive and negative sentiment) used
# further down to spot-check the model's predictions before and after fine-tuning.
text_list = [
    "I loved the new Batman movie!",
    "The food at that restaurant was terrible.",
    "What an amazing experience!",
    "I will never go back to that place again."
]

## Test the base model (before fine-tuning)

In [15]:
print("Untrained model predictions:")

# Pure inference: switch off dropout and gradient tracking, and put the inputs on the
# same device as the model so the cell also works when the model lives on a GPU.
base_model.eval()
model_device = next(base_model.parameters()).device
with torch.no_grad():
    for text in text_list:
        inputs = tokenizer(text, return_tensors="pt").to(model_device)
        outputs = base_model(**inputs)
        # Predicted class = index of the largest logit.
        predictions = torch.argmax(outputs.logits, dim=-1)
        print(f"Text: {text} - {id2label[predictions.item()]}")
        print()

Untrained model predictions:
Text: I loved the new Batman movie! - POSITIVE

Text: The food at that restaurant was terrible. - POSITIVE

Text: What an amazing experience! - POSITIVE

Text: I will never go back to that place again. - POSITIVE



## Fine-tuning using LoRA

In [None]:
# Count all parameters of `base_model` and report the total in millions.
# NOTE(review): the recorded 125.39M equals the "all params" figure printed by the LoRA
# cell, so this cell was probably executed after the adapters had been injected into
# `base_model`. Run it before the LoRA cell to measure the plain model.
base_model_size = sum(map(torch.numel, base_model.parameters()))
print(f"Base model size: {base_model_size/1e6:.2f} million parameters")
 

Base model size: 125.39 million parameters


In [20]:
# LoRA adapter configuration
lora_config = LoraConfig(
    # Sequence classification (e.g. sentiment analysis). The task type selects the PEFT
    # wrapper used for the model; for SEQ_CLS the classification head is kept trainable
    # alongside the adapters, which is why the trainable-parameter count printed below
    # is larger than the adapter matrices alone.
    task_type=TaskType.SEQ_CLS,
    # Modules that receive adapters. Each attention layer has query (Q), key (K),
    # value (V) and output (O) projections; with ["query"] only the query projections
    # are adapted, which keeps the number of trainable parameters small.
    target_modules=["query"],
    # Rank of the low-rank decomposition: each adapted weight matrix gets two small
    # trainable matrices (lora_A and lora_B) with inner dimension r, while the original
    # weight matrix stays frozen.
    r=4,
    # Scaling factor for the LoRA update (controls its magnitude relative to the
    # frozen weights).
    lora_alpha=32,
    # Dropout applied on the LoRA path during training (regularisation).
    lora_dropout=0.1,
    # False = the adapter weights are trainable. The base weights are frozen either
    # way; True would additionally freeze the adapters for inference-only use.
    inference_mode=False,
)

# Wrap the model with the adapters and report how many parameters remain trainable.
# NOTE: the adapters are injected into `base_model` itself, so `base_model` no longer
# refers to the plain pretrained model after this cell.
peft_model = get_peft_model(base_model, lora_config)
peft_model.print_trainable_parameters()


trainable params: 739,586 || all params: 125,386,756 || trainable%: 0.5898


In [25]:
# # peft_model size
# peft_model_size = sum(param.numel() for param in peft_model.parameters())
# print(f"PEFT model size: {peft_model_size/1e6:.2f} million parameters") 


In [30]:
# Hyperparameters of the fine-tuning run
learning_rate, batch_size = 2e-4, 16
num_epochs, weight_decay = 1, 0.01

# Trainer settings; checkpoints are written below output_dir.
training_args = TrainingArguments(
    output_dir="./outputs/peft-lora-sst2",
    # optimisation
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    num_train_epochs=num_epochs,
    # batching: same per-device batch size for training and evaluation
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # evaluate and save a checkpoint at the end of every epoch; the weights from the
    # end of training are kept (no reload of the best checkpoint)
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=False,
)


In [31]:
# Fine-tune the LoRA-wrapped model with the Trainer API.
# Training uses the tokenized train split; the validation split is evaluated with
# compute_metrics according to the evaluation strategy in training_args.
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # `processing_class` replaces the deprecated `tokenizer` argument of Trainer
    # (the old name triggers a deprecation warning on Trainer.__init__).
    processing_class=tokenizer,
    # Pads every batch dynamically to a common length.
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Run the training loop.
trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Evaluate the fine-tuned model on the validation split configured in the Trainer.
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

# Spot-check the fine-tuned model on the hand-written sample sentences.
print("Fine-tuned model predictions:")

# Pure inference: switch off dropout and gradient tracking. After training the model
# may live on a GPU, so the inputs are moved to the model's device before the forward
# pass (CPU inputs with a GPU model raise a device-mismatch error).
peft_model.eval()
model_device = next(peft_model.parameters()).device
with torch.no_grad():
    for text in text_list:
        inputs = tokenizer(text, return_tensors="pt").to(model_device)
        outputs = peft_model(**inputs)
        # Predicted class = index of the largest logit.
        predictions = torch.argmax(outputs.logits, dim=-1)
        print(f"Text: {text} - {id2label[predictions.item()]}")
        print()

## Upload the model to Hugging Face

In [None]:
# === Upload Fine-Tuned Model to Hugging Face Hub ===
# Make sure you have:
# 1. Installed `huggingface_hub` (`pip install huggingface_hub`)
# 2. Logged in using `huggingface-cli login` or set the HF_TOKEN environment variable

from huggingface_hub import HfApi

# Define repository ID (format: "username/model_name") on Hugging Face Hub
# Example: "your-username/roberta-lora-sst2"
repo_id = "your-username/roberta-lora-sst2"

# Local directory that receives the adapter and the tokenizer files.
adapter_dir = "./outputs/peft-lora-sst2"

# Save the LoRA adapter weights only (recommended for LoRA)
peft_model.save_pretrained(adapter_dir)

# Save tokenizer as well
tokenizer.save_pretrained(adapter_dir)

# Push to Hugging Face Hub
# This will upload both the LoRA weights and tokenizer to your repository.
# `token=True` uses the locally stored login token; it replaces the deprecated
# `use_auth_token=True` argument.
peft_model.push_to_hub(repo_id, token=True)
tokenizer.push_to_hub(repo_id, token=True)

print(f"Fine-tuned LoRA model and tokenizer successfully uploaded to Hugging Face Hub: https://huggingface.co/{repo_id}")


## Upload the MODEL card to Hugging Face

In [None]:
import os

# Make a write-scoped Hugging Face token available as HF_TOKEN.
# In Google Colab the token is read from the notebook secret "HF_TOKEN_WRITE".
# Where google.colab cannot be imported, an already exported HF_TOKEN is used instead,
# so the cell no longer fails with an ImportError outside Colab.
try:
    from google.colab import userdata
except ImportError:
    userdata = None

if userdata is not None:
    os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN_WRITE")
elif not os.environ.get("HF_TOKEN"):
    raise RuntimeError(
        "HF_TOKEN is not set. Outside Google Colab, export a write-scoped "
        "Hugging Face token as HF_TOKEN before running this cell."
    )

readme_content = """
---
license: mit
tags:
  - causal-lm
  - instruction-following
  - loRA
  - QLoRA
  - quantized
language: en
library_name: transformers
base_model: microsoft/phi-2
---

# Phi-2 QLoRA Fine-Tuned Model


**Model:** `mishrabp/phi2-qlora-finetuned`

**Base Model:** [`microsoft/phi-2`](https://huggingface.co/microsoft/phi-2)

**Fine-Tuning Method:** QLoRA (4-bit quantized LoRA)

**Task:** Instruction-following / Customer Support Responses

---

## Model Description

This repository contains a **Phi-2 language model fine-tuned using QLoRA** on a synthetic dataset of customer support instructions and responses. The fine-tuning uses **4-bit quantized LoRA adapters** for memory-efficient training and can run on GPU or CPU (slower on CPU).

The model is designed for **instruction-following tasks** like customer support, FAQs, or other dialog generation tasks.

---

## Training Data

The fine-tuning dataset is synthetic, consisting of 3000 instruction-response pairs:

**Example:**

```text
Instruction: "Customer asks about refund window #1"
Response: "Our refund window is 30 days from delivery."
```

Here is the dataset that was used for fine-tunning:
https://huggingface.co/datasets/mishrabp/customer-support-responses/resolve/main/train.csv

You can replace the dataset with your own CSV/JSON file to train on real-world data.

---

## Intended Use

* Generate responses to instructions in customer support scenarios.
* Small-scale instruction-following experiments.
* Educational or research purposes.

---

## How to Use

### Load the Fine-Tuned Model

```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# -----------------------------
# Load fine-tuned model from HF
# -----------------------------
model_name = "mishrabp/phi2-qlora-finetuned"
tokenizer = AutoTokenizer.from_pretrained(model_name)
base_model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2")
model = PeftModel.from_pretrained(base_model, model_name)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# -----------------------------
# Sample evaluation dataset
# -----------------------------
eval_data = [
    {"instruction": "Customer asks about refund window", "reference": "Our refund window is 30 days from delivery."},
    {"instruction": "Order arrived late", "reference": "Sorry for the delay. A delivery credit has been applied."},
    {"instruction": "Wrong item received", "reference": "We’ll ship the correct item and provide a return label."},
]

# -----------------------------
# Evaluation loop
# -----------------------------
for i, example in enumerate(eval_data, 1):
    prompt = f"### Instruction:\n{example['instruction']}\n\n### Response:"
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    output_ids = model.generate(**inputs, max_new_tokens=50)
    generated = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    print(f"Example {i}")
    print("Instruction:", example["instruction"])
    print("Generated Response:", generated.split("### Response:")[-1].strip())
    print("Reference Response:", example["reference"])
    print("-" * 50)

# -----------------------------
# Optional: compute simple token-level accuracy or BLEU
# -----------------------------
from nltk.translate.bleu_score import sentence_bleu

bleu_scores = []
for example in eval_data:
    prompt = f"### Instruction:\n{example['instruction']}\n\n### Response:"
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    output_ids = model.generate(**inputs, max_new_tokens=50)
    generated = tokenizer.decode(output_ids[0], skip_special_tokens=True).split("### Response:")[-1].strip()

    reference_tokens = example["reference"].split()
    generated_tokens = generated.split()
    bleu = sentence_bleu([reference_tokens], generated_tokens)
    bleu_scores.append(bleu)

print("Average BLEU score:", sum(bleu_scores)/len(bleu_scores))

```

---

## Training Script

The training script performs the following steps:

1. Loads the **Phi-2 base model**.
2. Creates a **synthetic dataset** of instruction-response pairs.
3. Tokenizes and formats the dataset for causal language modeling.
4. Applies a **LoRA adapter**.
5. Trains using **QLoRA** if GPU is available, otherwise full-precision LoRA on CPU.
6. Saves the adapter and tokenizer to `./phi2-qlora`.
7. Pushes the adapter and tokenizer to Hugging Face Hub.

### Requirements

```bash
pip install torch transformers peft datasets huggingface_hub python-dotenv
```

---

## Parameters

* `r=8`, `lora_alpha=16`, `lora_dropout=0.05`
* `target_modules=["q_proj","v_proj"]` (adjust for different base models)
* Learning rate: `2e-4`
* Batch si

"""

# Write the model card to disk.
# `readme_content` starts with a blank line (the literal opens with a newline), but the
# YAML front matter delimiter `---` is expected on the very first line of README.md, so
# leading whitespace is stripped before writing.
with open("README.md", "w", encoding="utf-8") as f:
    f.write(readme_content.lstrip())

# NOTE(review): `Repository` is not used in this cell; drop it from the import if no
# later cell needs it.
from huggingface_hub import HfApi, Repository

# NOTE(review): this repository and the model card text describe a Phi-2 QLoRA model,
# whereas this notebook fine-tunes roberta-base on SST-2 and uploads the adapter to
# "your-username/roberta-lora-sst2". Confirm that the target repo and the card content
# are the intended ones before running.
repo_id = "mishrabp/phi2-qlora-finetuned"

# Option 1: Using HfApi to upload README
api = HfApi()
api.upload_file(
    path_or_fileobj="README.md",
    path_in_repo="README.md",  # must be exactly README.md for HF Hub
    repo_id=repo_id,
    repo_type="model",
    token=os.environ["HF_TOKEN"]
)
