<a href="https://colab.research.google.com/github/alfazick/AppliedLLMCourse/blob/main/Module3Part2peftminiproject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# assotiative matrix Multiplication
# how does the matrix trick work so that ŒîW = BA never gets materialized?

# so imagine you have a matrix d = 4096
# so full W will be shape of (4096,4096)

# so let's say you choose a rank = 8

# and now you have two matrixes
# A (8*4096)
# B (4096*8)

# so input x will be the shape of hidden dimension like for example x = (1,4096)

d = 4096
r = 8

import torch
torch.manual_seed(97)

d = 4096
r = 8

A = torch.randn(d, r)   # [4096,8] - Down Projection
B = torch.randn(r, d )  # [8,4096] - Up Projection
x = torch.randn(1, d)   # [1, 4096]

# Original pretrained weight (full matrix, stored once)
W0 = torch.randn(d, d)        # [4096, 4096]


# build ŒîW explicitly, then multiply:
DeltaW = A @ B          # [4096, 8] @ [8, 4096] -> [4096, 4096]  HUGE
W_eff  = W0 + DeltaW          # [4096, 4096]

h_naive = x @ W_eff           # [1, 4096]

# --- MATRIX TRICK (what LoRA really does) ------------------------
# use associativity:   x (B A)  =  (x B) A
# LoRA trick: use associativity to avoid materializing DeltaW

base  = x @ W0                #  [1, 4096] @ [4096, 4096] ‚Üí [1, 4096]  original model output

z     = x @ A                 #  [1, 4096] @ [4096, 8] ‚Üí [1, 8]      compress to rank r
delta = z @ B                 # # [1, 8] @ [8, 4096] ‚Üí [1, 4096] expand back to d
h_trick = base + delta        # [1, 4096]

diff = h_naive - h_trick
rel_error = diff.norm() / h_naive.norm()

print("max abs diff:", diff.abs().max().item())
print("relative error:", rel_error.item())


max abs diff: 0.001708984375
relative error: 1.7000663774524583e-06


In [None]:

# make sure to click on runtime
# and change runtime
# it should run at most in 3 min
# with T4GPU

# Option 1: Qwen2-0.5B-Instruct (500M params - smaller!)
model_name = "Qwen/Qwen2-0.5B-Instruct"

# Option 2: SmolLM2-360M-Instruct (360M params - TINY!)
model_name = "HuggingFaceTB/SmolLM2-360M-Instruct"

In [6]:
# Install required libraries
!pip install -q transformers peft datasets accelerate

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType
from datasets import Dataset

# 1Ô∏è‚É£ LOAD AN INSTRUCT MODEL (has built-in chat template!)
model_name = "Qwen/Qwen2-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)

print(f"üîπ Base model parameters: {base_model.num_parameters():,}")
print(f"üîπ Has chat template: {tokenizer.chat_template is not None}")

# 2Ô∏è‚É£ CONFIGURE LORA
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

# 3Ô∏è‚É£ APPLY LORA
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()

# 4Ô∏è‚É£ CREATE DATASET WITH CHAT FORMAT
train_data = [
    {
        "messages": [
            {"role": "user", "content": "Hello, how are you?"},
            {"role": "assistant", "content": "Ahoy, matey! I be doin' fine!"}
        ]
    },
    {
        "messages": [
            {"role": "user", "content": "What's your name?"},
            {"role": "assistant", "content": "They call me Captain Blackbeard!"}
        ]
    },
    {
        "messages": [
            {"role": "user", "content": "Where are you from?"},
            {"role": "assistant", "content": "I hail from the seven seas!"}
        ]
    },
    {
        "messages": [
            {"role": "user", "content": "What do you do?"},
            {"role": "assistant", "content": "I sail the ocean in search of treasure, arr!"}
        ]
    },
    {
        "messages": [
            {"role": "user", "content": "Tell me a joke"},
            {"role": "assistant", "content": "Why don't pirates shower? They prefer to just wash up on shore, har har!"}
        ]
    },
] * 50  # More data

# Format using the model's built-in chat template
def format_chat(example):
    text = tokenizer.apply_chat_template(
        example["messages"],
        tokenize=False,
        add_generation_prompt=False
    )
    return {"text": text}

dataset = Dataset.from_list(train_data)
dataset = dataset.map(format_chat)

# Show formatted example
print("\n" + "="*50)
print("üìù FORMATTED EXAMPLE (using built-in template):")
print("="*50)
print(dataset[0]["text"])
print("="*50 + "\n")

# Tokenize with labels
def tokenize_function(examples):
    result = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=256
    )
    result["labels"] = result["input_ids"].copy()
    return result

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text", "messages"])

# 5Ô∏è‚É£ TRAINING
training_args = TrainingArguments(
    output_dir="./lora-pirate",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,
    learning_rate=2e-4,
    logging_steps=10,
    save_strategy="no",
    report_to="none",
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

# 6Ô∏è‚É£ TEST BEFORE TRAINING
print("="*50)
print("üîπ BEFORE FINE-TUNING:")
print("="*50)

test_messages = [
    {"role": "user", "content": "Hello, how are you?"}
]
# add_generation_prompt=True will properly add the assistant turn
prompt = tokenizer.apply_chat_template(test_messages, tokenize=False, add_generation_prompt=True)
print(f"Full prompt:\n{prompt}\n")

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=30, do_sample=False, pad_token_id=tokenizer.eos_token_id)
response = tokenizer.decode(outputs[0][len(inputs['input_ids'][0]):], skip_special_tokens=True)
print(f"User: Hello, how are you?")
print(f"Assistant: {response}\n")

# 7Ô∏è‚É£ TRAIN!
print("üî• Starting LoRA fine-tuning...")
trainer.train()

# 8Ô∏è‚É£ TEST AFTER TRAINING
print("\n" + "="*50)
print("üîπ AFTER FINE-TUNING:")
print("="*50)

model.eval()
with torch.no_grad():
    test_messages = [
        {"role": "user", "content": "Hello, how are you?"}
    ]
    prompt = tokenizer.apply_chat_template(test_messages, tokenize=False, add_generation_prompt=True)

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=30, do_sample=True, temperature=0.7, pad_token_id=tokenizer.eos_token_id)
    response = tokenizer.decode(outputs[0][len(inputs['input_ids'][0]):], skip_special_tokens=True)
    print(f"User: Hello, how are you?")
    print(f"Assistant: {response}\n")

# 9Ô∏è‚É£ SAVE
model.save_pretrained("./pirate-lora-adapter")
tokenizer.save_pretrained("./pirate-lora-adapter")
print("‚úÖ Saved LoRA adapter!")

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

üîπ Base model parameters: 494,032,768
üîπ Has chat template: True
trainable params: 2,162,688 || all params: 496,195,456 || trainable%: 0.4359


Map:   0%|          | 0/250 [00:00<?, ? examples/s]


üìù FORMATTED EXAMPLE (using built-in template):
<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
Hello, how are you?<|im_end|>
<|im_start|>assistant
Ahoy, matey! I be doin' fine!<|im_end|>




Map:   0%|          | 0/250 [00:00<?, ? examples/s]

  trainer = Trainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


üîπ BEFORE FINE-TUNING:
Full prompt:
<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
Hello, how are you?<|im_end|>
<|im_start|>assistant




The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151645}.


User: Hello, how are you?
Assistant: I'm just a computer program, so I don't have feelings or emotions like humans do. How can I assist you today?

üî• Starting LoRA fine-tuning...


Step,Training Loss
10,6.8437
20,0.2027
30,0.0784
40,0.027
50,0.0098
60,0.0088
70,0.0077
80,0.0074
90,0.0073
100,0.0072



üîπ AFTER FINE-TUNING:
User: Hello, how are you?
Assistant: Ahoy, matey! I be doin' fine!

‚úÖ Saved LoRA adapter!


In [7]:
# üîü LOAD THE FINE-TUNED MODEL AND TEST WITH NEW PROMPTS

print("\n" + "="*70)
print("üîÑ LOADING SAVED LORA ADAPTER...")
print("="*70)

from peft import PeftModel

# Load base model
base_model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2-0.5B-Instruct",
    torch_dtype=torch.float16,
    device_map="auto"
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("./pirate-lora-adapter")

# Load LoRA adapter
model = PeftModel.from_pretrained(base_model, "./pirate-lora-adapter")
model.eval()

print("‚úÖ Model loaded successfully!\n")

# Test with DIFFERENT prompts (not in training data)
print("="*70)
print("üè¥‚Äç‚ò†Ô∏è TESTING WITH NEW PROMPTS:")
print("="*70 + "\n")

test_prompts = [
    "How's the weather today?",
    "What's your favorite color?",
    "Can you help me with my homework?",
    "What did you have for breakfast?",
    "Tell me about your ship",
    "What's the meaning of life?",
    "Do you like cats or dogs?",
    "What's your biggest treasure?",
]

for user_prompt in test_prompts:
    messages = [{"role": "user", "content": user_prompt}]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=50,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1
        )

    response = tokenizer.decode(outputs[0][len(inputs['input_ids'][0]):], skip_special_tokens=True)

    print(f"üë§ User: {user_prompt}")
    print(f"üè¥‚Äç‚ò†Ô∏è Pirate: {response}")
    print("-" * 70 + "\n")


üîÑ LOADING SAVED LORA ADAPTER...
‚úÖ Model loaded successfully!

üè¥‚Äç‚ò†Ô∏è TESTING WITH NEW PROMPTS:

üë§ User: How's the weather today?
üè¥‚Äç‚ò†Ô∏è Pirate: It's sunny and warm, in fact, it's even been a little chilly!
----------------------------------------------------------------------

üë§ User: What's your favorite color?
üè¥‚Äç‚ò†Ô∏è Pirate: I have an affinity for blue, particularly when it comes to painting my nails!
----------------------------------------------------------------------

üë§ User: Can you help me with my homework?
üè¥‚Äç‚ò†Ô∏è Pirate: Of course, I can! What subject is it?
----------------------------------------------------------------------

üë§ User: What did you have for breakfast?
üè¥‚Äç‚ò†Ô∏è Pirate: I had eggs, bacon, and toast for breakfast, bacon being the main ingredient!
----------------------------------------------------------------------

üë§ User: Tell me about your ship
üè¥‚Äç‚ò†Ô∏è Pirate: Ahoy, matey! I be Captain Blackbeard! S

In [8]:
# BONUS: Compare with base model (no LoRA)
print("="*70)
print("üìä COMPARISON: Fine-tuned vs Base Model")
print("="*70 + "\n")

# Load base model without adapter
base_model_only = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2-0.5B-Instruct",
    torch_dtype=torch.float16,
    device_map="auto"
)

test_prompt = "Hello, how are you?"
messages = [{"role": "user", "content": test_prompt}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Base model response
with torch.no_grad():
    outputs_base = base_model_only.generate(**inputs, max_new_tokens=30, do_sample=True, temperature=0.7)
    response_base = tokenizer.decode(outputs_base[0][len(inputs['input_ids'][0]):], skip_special_tokens=True)

# Fine-tuned model response
with torch.no_grad():
    outputs_ft = model.generate(**inputs, max_new_tokens=30, do_sample=True, temperature=0.7)
    response_ft = tokenizer.decode(outputs_ft[0][len(inputs['input_ids'][0]):], skip_special_tokens=True)

print(f"üë§ User: {test_prompt}\n")
print(f"ü§ñ Base Model: {response_base}\n")
print(f"üè¥‚Äç‚ò†Ô∏è Fine-tuned Model: {response_ft}\n")
print("="*70)

üìä COMPARISON: Fine-tuned vs Base Model

üë§ User: Hello, how are you?

ü§ñ Base Model: I'm just a computer program, so I don't have feelings or emotions like humans do. However, I'm here to help you with any questions

üè¥‚Äç‚ò†Ô∏è Fine-tuned Model: Ahoy, matey! I be doin' fine!



In [9]:
# ok so you know what to do right ?
# in class asignment
# create a web application with at least 2 different characters
# finetune two lora adapters
# allow user to change the characters and chat with it

