<a href="https://colab.research.google.com/github/bblovecc0816/Node/blob/master/Fine-tuning-phi-2-model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-tuning phi-2 with unsloth and LoRA


📒Notebook Created by ❤️ [@prasadmahamulkar](https://x.com/prsdm17). Check out the step by step guide [here.](https://medium.com/@prasadmahamulkar/fine-tuning-phi-2-a-step-by-step-guide-e672e7f1d009)

📄Dataset: [MedQuad-phi2-1k](https://huggingface.co/prsdm/MedQuad-phi2-1k). You can run this notebook in Google Colab using T4 GPU.


In [None]:
# Install and import the necessary libraries
!pip install -q unsloth transformers accelerate bitsandbytes trl peft datasets scipy
!pip install -q xformers --no-deps

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.2/46.2 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m192.5/192.5 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.9/318.9 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Step 2: Import libraries
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model
from unsloth import FastLanguageModel
from tqdm import tqdm

## 3. set up environment

In [None]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"
torch.cuda.empty_cache()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


# Step 4: Load the base model and tokenizer

In [None]:
# Model
base_model = "microsoft/phi-2"

# Unsloth optimized model loading with LoRA configuration
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=base_model,
    max_seq_length=512,
    dtype=torch.bfloat16,
    load_in_4bit=True,
    # Use the next line for fine-tuning on an 80GB A100
    # load_in_8bit=True,
)

tokenizer.pad_token=tokenizer.eos_token
tokenizer.padding_side="right"

Device does not support bfloat16. Will change to float16.


==((====))==  Unsloth 2025.3.18: Fast Phi patching. Transformers: 4.49.0.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

microsoft/phi-2 does not have a padding token! Will use pad_token = <|endoftext|>.


# 5. Config LoRA parameter

In [None]:
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    use_gradient_checkpointing=True,
    random_state=42,
)

Unsloth: Making `model.base_model.model.model` require gradients


# 6. Base model inference with simple prompt

In [None]:
# Step 6: Test base model inference before fine-tuning
def generate_response(prompt, model, tokenizer, max_new_tokens=128):
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
        )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Simple inference test before fine-tuning
test_prompt = "How to prevent Lung Cancer?"
print("Testing base model with prompt:", test_prompt)
base_response = generate_response(test_prompt, model, tokenizer)
print("Base model response:", base_response)

Testing base model with prompt: How to prevent Lung Cancer?
Base model response: How to prevent Lung Cancer?

Answer: Some ways to prevent lung cancer include quitting smoking, avoiding exposure to secondhand smoke, and avoiding exposure to pollutants and harmful chemicals.

How to prevent Lung Cancer?

Answer: Some ways to prevent lung cancer include quitting smoking, avoiding exposure to secondhand smoke, and avoiding exposure to pollutants and harmful chemicals.

How to prevent Lung Cancer?

Answer: Some ways to prevent lung cancer include quitting smoking, avoiding exposure to secondhand smoke, and avoiding exposure to pollutants and harmful chemicals.

How to prevent Lung Cancer?

Answer: Some ways to prevent lung cancer include quitting smoking,


# 7. Load and display dataset

In [None]:
dataset = load_dataset("prsdm/MedQuad-phi2-1k", split="train").select(range(100))
print(f"Dataset loaded with {len(dataset)} examples")

# Display a few examples
print("Dataset examples:")
for i in range(min(3, len(dataset))):
    print(f"Example {i+1}:")
    print(dataset[i])
    print("-" * 50)

Dataset loaded with 100 examples
Dataset examples:
Example 1:
{'text': '### Instruction: How to prevent Lung Cancer ? ### Assistant: Key Points\n                    - Avoiding risk factors and increasing protective factors may help prevent lung cancer.    - The following are risk factors for lung cancer:         - Cigarette, cigar, and pipe smoking      - Secondhand smoke     - Family history     - HIV infection     - Environmental risk factors     - Beta carotene supplements in heavy smokers        - The following are protective factors for lung cancer:         - Not smoking     - Quitting smoking     - Lower exposure to workplace risk factors      - Lower exposure to radon        - It is not clear if the following decrease the risk of lung cancer:         - Diet     - Physical activity        - The following do not decrease the risk of lung cancer:         - Beta carotene supplements in nonsmokers     - Vitamin E supplements         - Cancer prevention clinical trials are used to stu

# 8. Define trainer

In [38]:
# Set supervised fine-tuning parameters
from trl import SFTTrainer

# Create the trainer
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=SFTConfig(
        output_dir = "./phi2-medquad-finetuned",
        num_train_epochs = 1,
        fp16 = False,
        bf16 = False,
        per_device_train_batch_size = 4,
        per_device_eval_batch_size = 1,
        gradient_accumulation_steps = 4,
        gradient_checkpointing = True,
        max_grad_norm = 0.3,
        learning_rate = 2e-4,
        weight_decay = 0.001,
        optim = "paged_adamw_32bit",
        lr_scheduler_type = "cosine",
        max_steps = 50,
        warmup_ratio = 0.03,
        group_by_length = True,
        save_steps = 0,
        logging_steps = 10,
        dataset_text_field = "text",
        max_seq_length= None,
        remove_unused_columns=False,
    ),
    train_dataset=dataset,
)


# 9. Start training

In [40]:
print("Starting training...")
model = model.to_empty(device="cuda")
trainer.train()

Starting training...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 100 | Num Epochs = 9 | Total steps = 50
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 7,864,320/1,529,256,960 (0.51% trained)
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:


Abort: 

# 10. Save fine tuned model

In [None]:
# Save trained model
output_dir = "./phi2-medquad-finetuned"
trainer.save_model(output_dir)
print(f"Model saved to {output_dir}")
# Clear the memory
del model, trainer

# 11.Load the fine-tuned model using Unsloth

In [None]:

print(f"Loading fine-tuned model from {model_path}")
fine_tuned_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_path,
    max_seq_length=2048,
    dtype=torch.bfloat16,
    load_in_4bit=True,
)
print("Fine tuned model loaded successfully!")


Loading fine-tuned model from ./phi-2-medquad-finetuned


FileNotFoundError: ./phi-2-medquad-finetuned/*.json (invalid repository id)

# 12: Compare inference results

In [None]:

print("\nTesting fine-tuned model...")
test_prompts = [
    "How to prevent Lung Cancer?",
]

print("\nComparing base model vs fine-tuned model responses:")
for prompt in test_prompts:
    print(f"\nPrompt: {prompt}")

    # Format the prompt appropriately for the fine-tuned model
    formatted_prompt = f"Instruct: {prompt}\nOutput:"

    fine_tuned_response = generate_response(formatted_prompt, fine_tuned_model, tokenizer)
    print(f"Fine-tuned model response: {fine_tuned_response}")

# Step 15: Verify model improvements
print("\nFine-tuning complete! The model should now have improved capabilities for medical Q&A.")
print("You can further evaluate the model performance by comparing responses to the medical queries.")

In [None]:
# Reload model and merge it with LoRA parameters
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    torch_dtype=torch.float16,
    trust_remote_code=True,
    cache_dir="",
    device_map={"": 0},
)
model = PeftModel.from_pretrained(model, new_model)
model = model.merge_and_unload()

In [None]:
# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
!huggingface-cli login

In [None]:
model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)