## Setup

In [None]:
%pip install -q "transformers==4.37.2" "datasets==2.16.1" "peft==0.8.2" "accelerate==0.26.1" "bitsandbytes==0.46.1" "trl==0.7.10" "huggingface_hub[hf_xet]"

In [None]:
%pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126

In [None]:
%pip install -q -U jupyter ipywidgets

In [None]:
%pip install scikit-learn matplotlib evaluate

## Finetuning

In [2]:
# 1. Import necessary libraries
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, 
    TrainingArguments, EarlyStoppingCallback
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel
from trl import SFTTrainer
import warnings
import textwrap
import os
import json
import numpy as np
import matplotlib.pyplot as plt
from transformers.trainer_utils import EvalPrediction
from torch.nn import CrossEntropyLoss

warnings.filterwarnings("ignore")

# --- Check for GPU availability ---
if torch.cuda.is_available():
    gpu_count = torch.cuda.device_count()
    print(f"✅ {gpu_count} GPU(s) detected.")
    print(f"   - Primary GPU: {torch.cuda.get_device_name(0)}")
    device = "cuda:0"
else:
    print("⚠️ No GPU detected, falling back to CPU. This will be very slow.")
    device = "cpu"
print("-" * 50)

✅ 1 GPU(s) detected.
   - Primary GPU: NVIDIA GeForce RTX 3050 6GB Laptop GPU
--------------------------------------------------


In [1]:
# 2. Define model, dataset, and new directory paths
model_id = "microsoft/phi-2"
dataset_id = "HuggingFaceH4/no_robots"
raw_model_dir = "./phi-2/raw"
finetuned_adapter_dir = "./phi-2/fine-tuned-adapter-enhanced"
finetuned_merged_dir = "./phi-2/fine-tuned-merged-enhanced"
offload_dir = "./phi-2/offload"
logs_dir = "./phi-2/logs"
results_dir = "./phi-2/results"

os.makedirs(raw_model_dir, exist_ok=True)
os.makedirs(finetuned_adapter_dir, exist_ok=True)
os.makedirs(finetuned_merged_dir, exist_ok=True)
os.makedirs(offload_dir, exist_ok=True)
os.makedirs(logs_dir, exist_ok=True)
os.makedirs(results_dir, exist_ok=True)

# 3. Configure Quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

# 4. Load Tokenizer and Base Model
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)
print(f"Base model loaded on device: {base_model.device}")
print("-" * 50)

# --- PHASE 1: Interact with the Base Model (Before Fine-Tuning) ---
print("\n--- PHASE 1: TESTING BASE MODEL ---")
user_question_base = input("Enter your question for the BASE model: ")
prompt = f"### Instruction:\n{user_question_base}\n\n### Response:\n"
inputs = tokenizer(prompt, return_tensors="pt").to(device) 
base_outputs = base_model.generate(
    **inputs, 
    max_new_tokens=150, 
    pad_token_id=tokenizer.eos_token_id,
    do_sample=True,
    temperature=0.7,
    top_p=0.9
)
base_response = tokenizer.batch_decode(base_outputs)[0]
print("\n--- Base Model Response ---")
print(base_response.split("### Response:\n")[1].strip().replace("<|endoftext|>", ""))
print("-" * 50)

# --- Save the raw model for later comparison ---
print(f"--- Saving base model to {raw_model_dir} ---")
base_model.save_pretrained(raw_model_dir)
tokenizer.save_pretrained(raw_model_dir)
print("✅ Base model saved!")
print("-" * 30)


# 5. Configure LoRA and Fine-Tune
peft_model = prepare_model_for_kbit_training(base_model)
lora_config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=["q_proj", "k_proj", "v_proj", "dense", "fc1", "fc2"],
    lora_dropout=0.1, #0.05
    bias="none",
    task_type="CAUSAL_LM",
)
peft_model = get_peft_model(peft_model, lora_config)
peft_model.print_trainable_parameters()

train_dataset = load_dataset(dataset_id, split="train[:20%]")
eval_dataset = load_dataset(dataset_id, split="train[20%:25%]")

# dataset = load_dataset(dataset_id, split="train[:10%]")

def formatting_func(batch):
    output_texts = []
    for i in range(len(batch['messages'])):
        messages = batch['messages'][i]
        user_message = next((msg['content'] for msg in messages if msg['role'] == 'user'), None)
        assistant_message = next((msg['content'] for msg in messages if msg['role'] == 'assistant'), None)
        if user_message and assistant_message:
            text = f"Instruct: {user_message}\nOutput: {assistant_message}"
            output_texts.append(text)
    return output_texts

training_args = TrainingArguments(
    output_dir=finetuned_adapter_dir, # Save checkpoints and final adapter here
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    bf16=True,
    logging_steps=5,
    max_steps=110,
    optim="paged_adamw_8bit",
)

trainer = SFTTrainer(
    model=peft_model,
    train_dataset=dataset,
    peft_config=lora_config,
    formatting_func=formatting_func,
    max_seq_length=1024,
    tokenizer=tokenizer,
    args=training_args,
)

print("\n--- Starting Fine-Tuning Process for Phi-2 ---")
trainer.train()
print("✅ Fine-tuning completed!")
trainer.save_model() 
print(f"✅ Fine-tuned adapter saved to {finetuned_adapter_dir}!")
print("-" * 30)

W0724 21:30:25.178000 10256 Lib\site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


✅ 1 GPU(s) detected.
   - Primary GPU: NVIDIA GeForce RTX 3050 6GB Laptop GPU
------------------------------


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Base model loaded on device: cuda:0
------------------------------

--- PHASE 1: TESTING BASE MODEL ---


Enter your question for the BASE model:  technically explain Transformers in ML



--- Base Model Response ---
Transformers are a type of machine learning model that are commonly used for natural language processing tasks such as language translation, text summarization, and sentiment analysis. They are based on the concept of self-attention, which allows the model to focus on different parts of the input text at different times. Transformers consist of an encoder and a decoder, with the encoder responsible for extracting features from the input text and the decoder responsible for generating the output text. They have been shown to achieve state
------------------------------
--- Saving base model to ./phi-2/raw ---
✅ Base model saved!
------------------------------

--- Starting Fine-Tuning Process for Phi-2 ---


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
5,2.2237
10,2.0218
15,2.2922
20,2.2646
25,2.0681
30,2.2446
35,2.1293
40,2.0613
45,2.2292
50,2.2506


✅ Fine-tuning completed!
✅ Fine-tuned adapter saved to ./phi-2/fine-tuned-adapter!
------------------------------


In [None]:
print("\n--- Starting Fine-Tuning Process for Phi-2 ---")
trainer.train()
print("✅ Fine-tuning completed!")
trainer.save_model() # Saves the adapter to the output_dir
print(f"✅ Fine-tuned adapter saved to {finetuned_model_dir}!")
print("-" * 30)

In [3]:
# --- Merge adapter and save the full fine-tuned model to disk ---
print("--- Merging adapter and saving full fine-tuned model ---")
# MODIFIED: Load the base model fully on the CPU for a stable merge.
# This avoids device_map complexities during the merge operation.
base_model_for_merge = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
fine_tuned_model = PeftModel.from_pretrained(
    base_model_for_merge, 
    finetuned_adapter_dir,
)
fine_tuned_model = fine_tuned_model.merge_and_unload()
fine_tuned_model.save_pretrained(finetuned_merged_dir)
tokenizer.save_pretrained(finetuned_merged_dir)
print(f"✅ Fully merged fine-tuned model saved to {finetuned_merged_dir}!")
print("-" * 30)

--- Merging adapter and saving full fine-tuned model ---


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Fully merged fine-tuned model saved to ./phi-2/fine-tuned-merged!
------------------------------


In [4]:
# --- Clear all models from memory before comparison phases ---
print("--- Clearing all models from memory to free up VRAM ---")
del base_model
del peft_model
del base_model_for_merge
del fine_tuned_model
torch.cuda.empty_cache()
print("✅ GPU Memory Cleared.")
print("-" * 30)

--- Clearing all models from memory to free up VRAM ---
✅ GPU Memory Cleared.
------------------------------


In [5]:
# --- PHASE 2: Interact with the Fine-Tuned Model ---
print("\n--- PHASE 2: TESTING FINE-TUNED MODEL ---")
# Load the saved merged model from disk
fine_tuned_model_for_test = AutoModelForCausalLM.from_pretrained(
    finetuned_merged_dir,
    device_map="auto",
    trust_remote_code=True,
    offload_folder=offload_dir
)
user_question_finetuned = input("Enter your question for the FINE-TUNED model: ")
prompt = f"Instruct: {user_question_finetuned}\nOutput:"
inputs = tokenizer(prompt, return_tensors="pt").to(device)
finetuned_outputs = fine_tuned_model_for_test.generate(**inputs, max_new_tokens=100, pad_token_id=tokenizer.eos_token_id)
finetuned_response = tokenizer.batch_decode(finetuned_outputs)[0]
print("\n--- Fine-Tuned Model Response ---")
print(finetuned_response.split("Output:")[1].strip())
del fine_tuned_model_for_test # Clear memory
torch.cuda.empty_cache()
print("-" * 30)


--- PHASE 2: TESTING FINE-TUNED MODEL ---


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Enter your question for the FINE-TUNED model:  technically explain Transformers in ML



--- Fine-Tuned Model Response ---
Transformers are a type of neural network architecture that are commonly used in natural language processing (NLP) tasks such as machine translation, text classification, and sentiment analysis. They are based on the idea of self-attention, which allows the model to focus on different parts of the input text at different times. Transformers consist of an encoder and a decoder, which are both made up of multiple layers of self-attention blocks. The encoder takes in the input text and outputs a sequence
------------------------------


                       --- PHASE 3: INTERACTIVE COMPARISON ---                       


Enter your question (or type 'quit' to exit):  What is your personal opinion on modern art?


Loading RAW model for generation...


ValueError: 
                        Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit
                        the quantized model. If you want to dispatch the model on the CPU or the disk while keeping
                        these modules in 32-bit, you need to set `load_in_8bit_fp32_cpu_offload=True` and pass a custom
                        `device_map` to `from_pretrained`. Check
                        https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu
                        for more details.
                        

In [7]:
while True:
    print("\n\n" + "="*85)
    print(f"{'--- PHASE 3: INTERACTIVE COMPARISON ---':^85}")
    print("="*85)
    user_question_comp = input("Enter your question (or type 'quit' to exit): ")
    
    if user_question_comp.lower() in ['quit', 'exit']:
        print("Exiting comparison.")
        break
        
    prompt = f"Instruct: {user_question_comp}\nOutput:"
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    
    # --- Generate from RAW Model ---
    print("Loading RAW model for generation...")
    # MODIFIED: Explicitly set device_map to the target device instead of "auto"
    raw_model_for_comparison = AutoModelForCausalLM.from_pretrained(
        model_id, 
        quantization_config=bnb_config, 
        device_map=device, 
        trust_remote_code=True
    )
    raw_outputs = raw_model_for_comparison.generate(**inputs, max_new_tokens=100, pad_token_id=tokenizer.eos_token_id)
    raw_response_text = tokenizer.batch_decode(raw_outputs)[0].split("Output:")[1].strip()
    del raw_model_for_comparison
    torch.cuda.empty_cache()
    print("RAW model unloaded.")

    # --- Generate from FINE-TUNED Model ---
    print("Loading FINE-TUNED model for generation...")
    fine_tuned_model_for_comp = AutoModelForCausalLM.from_pretrained(
        finetuned_merged_dir, 
        device_map="auto", 
        trust_remote_code=True, 
        offload_folder=offload_dir
    )
    finetuned_outputs_comp = fine_tuned_model_for_comp.generate(**inputs, max_new_tokens=100, pad_token_id=tokenizer.eos_token_id)
    finetuned_response_text = tokenizer.batch_decode(finetuned_outputs_comp)[0].split("Output:")[1].strip()
    del fine_tuned_model_for_comp
    torch.cuda.empty_cache()
    print("FINE-TUNED model unloaded.")

    # --- Print Comparison Table ---
    print("\n" + "-" * 85)
    print(f"| {'RAW MODEL':^38} | {'FINE-TUNED MODEL':^40} |")
    print("-" * 85)

    wrapper = textwrap.TextWrapper(width=38)
    raw_lines = wrapper.wrap(raw_response_text)
    finetuned_lines = wrapper.wrap(finetuned_response_text)

    max_lines = max(len(raw_lines), len(finetuned_lines))
    raw_lines += [''] * (max_lines - len(raw_lines))
    finetuned_lines += [''] * (max_lines - len(finetuned_lines))

    for i in range(max_lines):
        print(f"| {raw_lines[i]:<38} | {finetuned_lines[i]:<40} |")

    print("-" * 85)




                       --- PHASE 3: INTERACTIVE COMPARISON ---                       


Enter your question (or type 'quit' to exit):  What is your personal opinion on modern art?


Loading RAW model for generation...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

RAW model unloaded.
Loading FINE-TUNED model for generation...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



FINE-TUNED model unloaded.

-------------------------------------------------------------------------------------
|               RAW MODEL                |             FINE-TUNED MODEL             |
-------------------------------------------------------------------------------------
| I!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | I think modern art is a reflection of    |
| !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | the times we live in. It's a way for     |
| !!!!!!!!!!!!!!!!!!!!!!!!               | artists to express their thoughts and    |
|                                        | emotions in a way that is unique to      |
|                                        | them. I appreciate the creativity and    |
|                                        | innovation that goes into modern art,    |
|                                        | even if it's not my personal taste.      |
-------------------------------------------------------------------------------------


                       -

Enter your question (or type 'quit' to exit):  Can you tell me a secret?


Loading RAW model for generation...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

RAW model unloaded.
Loading FINE-TUNED model for generation...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



FINE-TUNED model unloaded.

-------------------------------------------------------------------------------------
|               RAW MODEL                |             FINE-TUNED MODEL             |
-------------------------------------------------------------------------------------
| What!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | Sure, what is it? <|endoftext|>          |
| !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! |                                          |
| !!!!!!!!!!!!!!!!!!!!!!!!!!!            |                                          |
-------------------------------------------------------------------------------------


                       --- PHASE 3: INTERACTIVE COMPARISON ---                       


Enter your question (or type 'quit' to exit):  Give me some advice on how to procrastinate more effectively.


Loading RAW model for generation...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

RAW model unloaded.
Loading FINE-TUNED model for generation...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



FINE-TUNED model unloaded.

-------------------------------------------------------------------------------------
|               RAW MODEL                |             FINE-TUNED MODEL             |
-------------------------------------------------------------------------------------
| Well!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | Well, first of all, you should always    |
| !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! | start with the most boring and tedious   |
| !!!!!!!!!!!!!!!!!!!!!!!!!!!            | tasks, because that way you can avoid    |
|                                        | the ones that actually matter. Then,     |
|                                        | you should take frequent breaks and      |
|                                        | reward yourself with snacks and games.   |
|                                        | And finally, you should never finish     |
|                                        | anything, because that would be too      |
|                         

Enter your question (or type 'quit' to exit):  quit


Exiting comparison.
