In [2]:
%pip install -q "transformers==4.37.2" "datasets==2.16.1" "peft==0.8.2" "accelerate==0.26.1" "bitsandbytes==0.46.1" "trl==0.7.10" "huggingface_hub[hf_xet]"

Note: you may need to restart the kernel to use updated packages.


In [3]:
%pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126

Note: you may need to restart the kernel to use updated packages.


In [4]:
%pip install -q -U jupyter ipywidgets

Note: you may need to restart the kernel to use updated packages.


In [1]:
pip list

Package                   Version
------------------------- --------------
accelerate                0.26.1
aiohappyeyeballs          2.6.1
aiohttp                   3.12.14
aiosignal                 1.4.0
anyio                     4.9.0
argon2-cffi               25.1.0
argon2-cffi-bindings      21.2.0
arrow                     1.3.0
asttokens                 3.0.0
async-lru                 2.0.5
attrs                     25.3.0
babel                     2.17.0
beautifulsoup4            4.13.4
bitsandbytes              0.43.1
bleach                    6.2.0
certifi                   2025.7.14
cffi                      1.17.1
charset-normalizer        3.4.2
colorama                  0.4.6
comm                      0.2.2
datasets                  2.16.1
debugpy                   1.8.15
decorator                 5.2.1
defusedxml                0.7.1
dill                      0.3.7
docstring_parser          0.17.0
executing                 2.2.0
fastjsonschema            2.21.1
filelock   

In [7]:
import os
import textwrap
import warnings

import torch
from datasets import load_dataset
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from trl import SFTTrainer

warnings.filterwarnings("ignore")

In [2]:
# --- Check for GPU availability ---
if torch.cuda.is_available():
    gpu_count = torch.cuda.device_count()
    print(f"✅ {gpu_count} GPU(s) detected.")
    print(f"   - Primary GPU: {torch.cuda.get_device_name(0)}")
    device = "cuda:0"
else:
    print("⚠️ No GPU detected, falling back to CPU. This will be very slow.")
    device = "cpu"
print("-" * 30)

✅ 1 GPU(s) detected.
   - Primary GPU: NVIDIA GeForce RTX 3050 6GB Laptop GPU
------------------------------


In [3]:
# 2. Define model, dataset, and directory paths
# model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
model_id = "microsoft/phi-2"
dataset_id = "HuggingFaceH4/no_robots"
raw_model_dir = "./phi-2/raw"
finetuned_model_dir = "./phi-2/fine-tuned"

os.makedirs(raw_model_dir, exist_ok=True)
os.makedirs(finetuned_model_dir, exist_ok=True)

# 3. Configure Quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

In [4]:
# 4. Load the tokenizer and the base model
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the model with the 4-bit quantization config.
# `device_map="auto"` will automatically place the model on your available GPU.
model = AutoModelForCausalLM.from_pretrained(
    model_id, quantization_config=bnb_config, device_map="auto", trust_remote_code=True
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [5]:
print(f"Model successfully loaded on device: {model.device}")
print("-" * 30)

Model successfully loaded on device: cuda:0
------------------------------


In [6]:
# 5. Interact with the Base Model (Before Fine-Tuning) ---
print("\n--- Testing Base Model (Before Fine-Tuning) ---")

# Get input from the user
user_question = input("Enter your question for the model: ")

# Define the chat structure for the prompt using the user's input
chat = [
    {
        "role": "system",
        "content": "You are a helpful, respectful and honest assistant.",
    },
    {"role": "user", "content": user_question},
]
# Apply the chat template
prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)

# Tokenize the input and move to GPU
inputs = tokenizer(prompt, return_tensors="pt", return_attention_mask=False).to("cuda")

# Generate the response
outputs = model.generate(**inputs, max_length=100)
response = tokenizer.batch_decode(outputs)[0]

print("\n--- Base Model Response ---")
# Decode and print only the assistant's part of the response
print(response.split("<|assistant|>")[1].strip())
print("-" * 30)


--- Testing Base Model (Before Fine-Tuning) ---


Enter your question for the model:  hi



No chat template is defined for this tokenizer - using a default chat template that implements the ChatML format (without BOS/EOS tokens!). If the default is not appropriate for your model, please set `tokenizer.chat_template` to an appropriate template. See https://huggingface.co/docs/transformers/main/chat_templating for more information.

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



--- Base Model Response ---


IndexError: list index out of range

In [7]:
# 6. Configure LoRA (Parameter-Efficient Fine-Tuning)
model = prepare_model_for_kbit_training(model)
# lora_config = LoraConfig(
#     r=16,
#     lora_alpha=32,
#     target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
#     lora_dropout=0.05,
#     bias="none",
#     task_type="CAUSAL_LM",
# )

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "dense"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
peft_model = get_peft_model(model, lora_config)

In [8]:
# 7. Load the dataset
dataset = load_dataset(dataset_id, split="train[:10%]")

# --- MODIFIED: Create a formatting function to handle batches ---
# def formatting_func(batch):
#     # The function receives a batch of examples and must return a list of formatted strings.
#     # We iterate over each conversation in the batch.
#     output_texts = []
#     for i in range(len(batch['messages'])):
#         # `batch['messages'][i]` is a single conversation (a list of dictionaries).
#         # `apply_chat_template` formats this conversation into a single string.
#         text = tokenizer.apply_chat_template(batch['messages'][i], tokenize=False)
#         output_texts.append(text)
#     return output_texts


def formatting_func(batch):
    output_texts = []
    for i in range(len(batch["messages"])):
        messages = batch["messages"][i]
        # We're assuming a simple user-assistant turn for the instruction format
        user_message = next(
            (msg["content"] for msg in messages if msg["role"] == "user"), None
        )
        assistant_message = next(
            (msg["content"] for msg in messages if msg["role"] == "assistant"), None
        )

        if user_message and assistant_message:
            text = f"Instruct: {user_message}\nOutput: {assistant_message}"
            output_texts.append(text)
    return output_texts


# 8. Define Training Arguments
training_args = TrainingArguments(
    # output_dir="./tinyllama-no_robots-finetuned",
    output_dir="./phi2-no_robots-finetuned",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    bf16=True,
    logging_steps=10,
    max_steps=100,
    optim="paged_adamw_8bit",
)

# 9. Initialize the SFTTrainer
# We use the new `formatting_func` that correctly processes batches.
trainer = SFTTrainer(
    model=peft_model,
    train_dataset=dataset,
    peft_config=lora_config,
    formatting_func=formatting_func,
    max_seq_length=1024,
    tokenizer=tokenizer,
    args=training_args,
)

Downloading readme: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/950 [00:00<?, ? examples/s]

In [9]:
print(dataset)

Dataset({
    features: ['prompt', 'prompt_id', 'messages', 'category'],
    num_rows: 950
})


In [10]:
# 10. Start the fine-tuning process
print("\n--- Starting Fine-Tuning Process ---")
trainer.train()
print("Fine-tuning completed!")

print("--- Saving final adapter ---")
trainer.save_model()
print("Adapter saved!")

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...



--- Starting Fine-Tuning Process ---


Step,Training Loss
10,1.8205
20,1.8833
30,1.7811
40,1.8306
50,1.8822
60,1.8207
70,1.7474
80,1.7345
90,1.69
100,1.8205


Fine-tuning completed!
--- Saving final adapter ---
Adapter saved!


In [None]:
# The model we want to fine-tune
model_id = "microsoft/phi-2"
# The new model name for the fine-tuned version
new_model_id = "phi-2-no-robots-finetuned"

# Configuration for 4-bit quantization
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

# Load the model with quantization
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    quantization_config=quantization_config,
    device_map={"": 0},  # map model to the first GPU
)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
# Phi-2 doesn't have a pad token, so we set it to the End-Of-Sequence token
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training

# Prepare the model for k-bit training
model.config.use_cache = False
model = prepare_model_for_kbit_training(model)

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers and GPU quantization are unavailable.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

In [None]:
# Load the dataset
dataset = load_dataset(
    "HuggingFaceH4/no_robots", split="train[:1000]"
)  # Using 1000 samples for a quick fine-tune


# Formatting function to structure the data
def format_instruction(sample):
    # The 'no_robots' dataset has a 'messages' field, which is a list of conversations.
    # We'll take the first two messages as prompt and response.
    prompt = sample["messages"][0]["content"]
    response = sample["messages"][1]["content"]
    return f"### Instruction:\n{prompt}\n\n### Response:\n{response}"


# You can see an example here
# print(format_instruction(dataset[0]))

In [None]:
# LoRA configuration
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    # Target modules for Phi-2
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "dense",
        "fc1",
        "fc2",
    ],
)

# Get the PEFT model
model = get_peft_model(model, peft_config)

In [None]:
# Training arguments
training_arguments = TrainingArguments(
    output_dir="./results",  # Output directory for checkpoints
    num_train_epochs=1,  # A single epoch is often enough for fine-tuning
    per_device_train_batch_size=2,  # Batch size per device
    gradient_accumulation_steps=2,  # Accumulate gradients over 2 steps
    optim="paged_adamw_32bit",  # Memory-efficient optimizer
    save_steps=50,  # Save a checkpoint every 50 steps
    logging_steps=10,  # Log training metrics every 10 steps
    learning_rate=2e-4,  # Learning rate
    weight_decay=0.001,  # Weight decay for regularization
    fp16=False,  # Use fp16 precision
    bf16=True,  # Use bf16 precision for better performance on new GPUs
    max_grad_norm=0.3,  # Gradient clipping
    max_steps=-1,  # Number of training steps (overrides num_train_epochs)
    warmup_ratio=0.03,  # Warmup ratio
    group_by_length=True,  # Group sequences of similar length to save memory
    lr_scheduler_type="constant",  # Learning rate scheduler
    report_to="tensorboard",
)

In [None]:
from transformers import pipeline

# The instruction you want to test
prompt = "Explain the importance of low-rank adaptation in large language models."

# Set up the generation pipeline
pipe = pipeline(
    task="text-generation", model=model, tokenizer=tokenizer, max_length=200
)

# Generate the response
result = pipe(f"### Instruction:\n{prompt}\n\n### Response:\n")
print(result[0]["generated_text"])

In [41]:
# 11. Merge the model for inference
base_model_for_merge = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True
)
# MODIFIED: The path should be the main output directory, not the specific checkpoint folder.
# The trainer saves the final adapter config and weights to the root of the output directory.
final_adapter_path = "./tinyllama-no_robots-finetuned"
fine_tuned_model = PeftModel.from_pretrained(base_model_for_merge, final_adapter_path)
fine_tuned_model = fine_tuned_model.merge_and_unload()



In [43]:
# 12. Test the fine-tuned model with the same prompt
print("\n--- Testing Fine-Tuned Model (After Fine-Tuning) ---")
# Get new input from the user for the fine-tuned model
finetuned_user_question = input("Enter your question for the FINE-TUNED model: ")

# Define the chat structure for the new prompt
finetuned_chat = [
    {
        "role": "system",
        "content": "You are a helpful, respectful and honest assistant.",
    },
    {"role": "user", "content": finetuned_user_question},
]
# Apply the chat template to the new prompt
finetuned_prompt = tokenizer.apply_chat_template(
    finetuned_chat, tokenize=False, add_generation_prompt=True
)


# Tokenize the new input and move to the detected device
inputs = tokenizer(
    finetuned_prompt, return_tensors="pt", return_attention_mask=False
).to(device)

# Generate the response
outputs = fine_tuned_model.generate(**inputs, max_length=200)
response = tokenizer.batch_decode(outputs)[0]

print("\n--- Fine-Tuned Model Response ---")
print(response.split("<|assistant|>")[1].strip())


--- Testing Fine-Tuned Model (After Fine-Tuning) ---


Enter your question for the FINE-TUNED model:  hi



--- Fine-Tuned Model Response ---
Hi! I'm a helpful, respectful and honest assistant. I'm glad you're here. Can you help me with a task? I need to write a thank you note to my boss. I'm not sure what to say. Can you help me? I'm really grateful for all the help I've received from you. Thank you! 

Best regards, 
[Your Name] 
[Your Address] 
[City, State ZIP Code] 
[Email Address] 
[Phone Number] 
[Date] 

[Your Name] 
[Your Address] 
[City, State ZIP Code] 
[Email Address] 
[Phone Number] 
[Date]


In [45]:
# %pip install textwrap
import textwrap

In [46]:
# Step 13 - Side-by-Side Comparison ---
print("\n\n" + "=" * 85)
print(f"{'--- COMPARISON ---':^85}")
print("=" * 85)
print(f"\nQUESTION: {user_question}\n")
print("-" * 85)
print(f"| {'BASE MODEL':^38} | {'FINE-TUNED MODEL':^40} |")
print("-" * 85)

# Wrap text for clean side-by-side display
wrapper = textwrap.TextWrapper(width=38)
base_lines = wrapper.wrap(base_model_response_text)
finetuned_lines = wrapper.wrap(fine_tuned_model_response_text)

# Make the number of lines in both response lists equal for easier printing
max_lines = max(len(base_lines), len(finetuned_lines))
base_lines += [""] * (max_lines - len(base_lines))
finetuned_lines += [""] * (max_lines - len(finetuned_lines))

# Print each line of the responses side-by-side
for i in range(max_lines):
    print(f"| {base_lines[i]:<38} | {finetuned_lines[i]:<40} |")

print("-" * 85)



                                 --- COMPARISON ---                                  

QUESTION: hi

-------------------------------------------------------------------------------------
|               BASE MODEL               |             FINE-TUNED MODEL             |
-------------------------------------------------------------------------------------


NameError: name 'base_model_response_text' is not defined

In [48]:
# Load models for comparison
print("\n--- Loading models for interactive comparison ---")

# Load the original base model (quantized)
base_model_for_comparison = AutoModelForCausalLM.from_pretrained(
    model_id, quantization_config=bnb_config, device_map="auto", trust_remote_code=True
)

# Load the fine-tuned model
base_model_for_merge = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True
)
final_adapter_path = "./tinyllama-no_robots-finetuned"
fine_tuned_model = PeftModel.from_pretrained(base_model_for_merge, final_adapter_path)
fine_tuned_model = fine_tuned_model.merge_and_unload()

print("✅ Models ready for comparison.")


# --- Step 11 - Interactive Side-by-Side Comparison ---
while True:
    print("\n\n" + "=" * 85)
    print(f"{'--- INTERACTIVE COMPARISON ---':^85}")
    print("=" * 85)
    user_question = input("Enter your question (or type 'quit' to exit): ")

    if user_question.lower() in ["quit", "exit"]:
        print("Exiting comparison.")
        break

    # Define the chat structure for the prompt
    chat = [
        {
            "role": "system",
            "content": "You are a helpful, respectful and honest assistant.",
        },
        {"role": "user", "content": user_question},
    ]
    # Apply the chat template
    prompt = tokenizer.apply_chat_template(
        chat, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer(prompt, return_tensors="pt", return_attention_mask=False).to(
        "cuda"
    )

    # --- Generate from Base Model ---
    base_outputs = base_model_for_comparison.generate(**inputs, max_length=200)
    base_response = tokenizer.batch_decode(base_outputs)[0]
    base_model_response_text = base_response.split("<|assistant|>")[1].strip()

    # --- Generate from Fine-Tuned Model ---
    finetuned_outputs = fine_tuned_model.generate(**inputs, max_length=200)
    finetuned_response = tokenizer.batch_decode(finetuned_outputs)[0]
    fine_tuned_model_response_text = finetuned_response.split("<|assistant|>")[
        1
    ].strip()

    # --- Print Comparison Table ---
    print("\n" + "-" * 85)
    print(f"| {'BASE MODEL':^38} | {'FINE-TUNED MODEL':^40} |")
    print("-" * 85)

    wrapper = textwrap.TextWrapper(width=38)
    base_lines = wrapper.wrap(base_model_response_text)
    finetuned_lines = wrapper.wrap(fine_tuned_model_response_text)

    max_lines = max(len(base_lines), len(finetuned_lines))
    base_lines += [""] * (max_lines - len(base_lines))
    finetuned_lines += [""] * (max_lines - len(finetuned_lines))

    for i in range(max_lines):
        print(f"| {base_lines[i]:<38} | {finetuned_lines[i]:<40} |")

    print("-" * 85)


--- Loading models for interactive comparison ---




ValueError: We need an `offload_dir` to dispatch this model according to this `device_map`, the following submodules need to be offloaded: base_model.model.model.layers.10, base_model.model.model.layers.11, base_model.model.model.layers.12, base_model.model.model.layers.13, base_model.model.model.layers.14, base_model.model.model.layers.15, base_model.model.model.layers.16, base_model.model.model.layers.17, base_model.model.model.layers.18, base_model.model.model.layers.19, base_model.model.model.layers.20, base_model.model.model.layers.21, base_model.model.model.norm, base_model.model.lm_head.