<a href="https://colab.research.google.com/github/adityasodani03/LLM-RAG/blob/main/lora_Qlora.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Step-1: Install Required Libraries

In [2]:
!pip install -qU transformers datasets accelerate peft safetensors

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m556.4/556.4 kB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.2/507.2 kB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
# Step-2: Import Libraries

In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import Dataset
from peft import LoraConfig, get_peft_model
import torch

In [6]:
# Step-3: Load a Small Open-Source Model

In [7]:
# AutoTokenizer.from_pretrained(...) downloads the tokenizer for gpt2. Tokenizers turn text into token ids the model understands.

# AutoModelForCausalLM.from_pretrained(...) loads the GPT-2 model weights and architecture for language generation (causal LM).

# tokenizer.pad_token = tokenizer.eos_token — GPT-2’s tokenizer typically has no pad token. Many training utilities expect a pad token for batching; setting the pad token to the eos_token is a simple safe shortcut.

# Note: When you add a pad token, the model embeddings size may need resize_token_embeddings if you added a new token. For GPT-2 setting pad token to eos_token reuses existing token so resizing isn’t required, but it's harmless to call:

In [8]:
model_name = "gpt2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [9]:
# Step-4: Create a Small Training Dataset in Notebook

In [10]:
training_sentences = [
    "A data scientist uses statistics and machine learning to solve problems.",
    "LoRA helps large models train faster and cheaper.",
    "QLoRA allows model fine tuning even on low RAM computers.",
    "Transformers library makes NLP easy and powerful.",
]

dataset = Dataset.from_dict({"text": training_sentences})

dataset

Dataset({
    features: ['text'],
    num_rows: 4
})

In [11]:
# Step-5: Tokenize Dataset

In [12]:
# tokenizer(...) converts raw strings to input_ids and attention_mask.

# truncation=True ensures long lines are cut to max_length.

# padding="max_length" pads all examples to the same length (128). This makes batching straightforward.

# .map(..., batched=True) applies tokenization to all examples efficiently.

# Why max_length=128?
# Small demo length that keeps memory usage low. For real tasks pick a length that fits your data and hardware.

In [13]:
def tokenize(batch):
  return tokenizer(
      batch["text"],
      truncation=True,
      padding="max_length",
      max_length=128,
  )

tokenized_dataset = dataset.map(tokenize, batched=True)

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

In [14]:
# Step-6: Data Collator (Helps MLM Training)

In [15]:
# The collator builds mini-batches and creates labels appropriate for language modeling.

# mlm=False because GPT-2 is a causal language model (not masked LM). The collator will shift inputs into labels for next-token prediction.

In [16]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

In [17]:
# PART-A: TRAIN LoRA ON CPU

In [18]:
# Step-7: Configure LoRA

In [19]:
# LoraConfig defines the LoRA adapter hyperparameters:

# r (rank): low-rank dimension. Small r → fewer trainable params. 8 is a reasonable demo default.

# lora_alpha: scaling factor applied to adapter outputs (larger alpha scales updates).

# target_modules: module names where LoRA will be injected. For GPT-2, "c_attn" (combined QKV) is common. If your model uses different internal names you must adapt this.

# lora_dropout: small dropout on adapter outputs to regularize.

# get_peft_model(model, lora_config) inserts the LoRA adapter modules into the model and returns a wrapped model that exposes only adapter parameters for training.

# print_trainable_parameters() prints how many parameters are trainable (should be a small fraction).

In [20]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["c_attn"],
    lora_dropout=0.05,
)

In [21]:
# freezes the base model and trains only adapter parameters.

In [25]:
for param in model.parameters():
    param.requires_grad = False

In [26]:
# Step-8: Apply LoRA

In [27]:
model_lora = get_peft_model(model, lora_config)
model_lora.print_trainable_parameters()



trainable params: 294,912 || all params: 124,734,720 || trainable%: 0.2364


In [28]:
# Step-9: Training Arguments

In [29]:
# output_dir: where model / checkpoints are written.

# per_device_train_batch_size: batch size per device (1 on CPU to reduce RAM).

# gradient_accumulation_steps=2: accumulate gradients across N mini-batches so effective batch size = batch_size * accumulation_steps. Useful when memory is small.

# num_train_epochs=1: number of passes over the dataset (kept tiny for demo). Increase for real training.

# logging_steps=5: how often to print logs.

# report_to="none": disables external loggers (e.g., WandB). This fixes the API key prompt you saw earlier.

# Trainer handles the training loop (forward/backward/optimizer/step) for you.

In [30]:
training_args = TrainingArguments(
    output_dir="./lora-gpt2",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    num_train_epochs=1,
    logging_steps=5,
    report_to="none"
)

In [31]:
# Step-10: Start LoRA Fine-Tuning

In [32]:
trainer = Trainer(
    model=model_lora,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
)

trainer.train()

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss


TrainOutput(global_step=2, training_loss=5.326165199279785, metrics={'train_runtime': 25.1908, 'train_samples_per_second': 0.159, 'train_steps_per_second': 0.079, 'total_flos': 262198001664.0, 'train_loss': 5.326165199279785, 'epoch': 1.0})

In [33]:
# Step-11: Test the Model

In [34]:
input_text = "LoRA is helpful because"
inputs = tokenizer(input_text, return_tensors="pt")

output = model_lora.generate(**inputs, max_length=80)
print(tokenizer.decode(output[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


LoRA is helpful because it allows you to make a decision about your own life.

The first thing you need to know about the RA is that it is a very important thing.

The RA is a very important thing. It is a very important thing. It is a very important thing. It is a very important. It is a very important thing. It is a very important


In [35]:
# step-12
# PART-B: QLoRA (Low RAM Version)

In [36]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model
import torch

model_name = "gpt2"

# Load normally (no bitsandbytes)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_name)

# Convert weights to lower precision (optional)
model = model.to(torch.float32)

In [37]:
# Add LoRA (acts like QLoRA on CPU)

In [38]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["c_attn"],  # GPT2 attention layer
    lora_dropout=0.05,
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 294,912 || all params: 124,734,720 || trainable%: 0.2364




In [39]:
# Train the model

In [40]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./cpu-qlora-sim",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    num_train_epochs=1,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator
)

trainer.train()



Step,Training Loss


TrainOutput(global_step=2, training_loss=5.326672554016113, metrics={'train_runtime': 25.807, 'train_samples_per_second': 0.155, 'train_steps_per_second': 0.077, 'total_flos': 262198001664.0, 'train_loss': 5.326672554016113, 'epoch': 1.0})

In [41]:
# Test the model

In [42]:
input_text = "QLoRA helps small machines"
inputs = tokenizer(input_text, return_tensors="pt")

output = model.generate(**inputs, max_length=80)
print(tokenizer.decode(output[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


QLoRA helps small machines to run on a single machine.

The following is a list of the most popular Linux distributions that are supported by the Linux Foundation.

Linux Mint

Linux Mint is a Linux Mint-based operating system. It is a Linux Mint-based operating system.

Linux Mint is a Linux Mint-based operating system. It is a Linux Mint-
