<a href="https://colab.research.google.com/github/bblovecc0816/Node/blob/master/Fine-tuning-phi-2-model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-tuning phi-2


📒Notebook Created by ❤️ [@prasadmahamulkar](https://x.com/prsdm17). Check out the step by step guide [here.](https://medium.com/@prasadmahamulkar/fine-tuning-phi-2-a-step-by-step-guide-e672e7f1d009)

📄Dataset: [MedQuad-phi2-1k](https://huggingface.co/prsdm/MedQuad-phi2-1k). You can run this notebook in Google Colab using T4 GPU.


In [1]:
# Install and import the necessary libraries
!pip install -q unsloth transformers accelerate bitsandbytes trl peft datasets scipy
!pip install -q xformers --no-deps

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m196.3/196.3 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.9/318.9 kB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m37.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.1/162.1 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# Step 2: Import libraries
import os
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from unsloth import FastLanguageModel
from tqdm import tqdm

from trl import SFTTrainer, SFTConfig

os.environ["TOKENIZERS_PARALLELISM"] = "false"
torch.cuda.empty_cache()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import FastLanguageModel


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
Using device: cuda


In [3]:
# Model
base_model = "microsoft/phi-2"
new_model = "./phi-2-medquad"


model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=base_model,
    max_seq_length=2048,
    dtype=torch.bfloat16,
    load_in_4bit=True,
    # Use the next line for fine-tuning on an 80GB A100
    # load_in_8bit=True,
)

tokenizer.pad_token=tokenizer.eos_token
tokenizer.padding_side="right"

model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    use_gradient_checkpointing=True,
    random_state=42,
)

# Quantization configuration
# bnb_config = BitsAndBytesConfig(
#    load_in_4bit=True,
#    bnb_4bit_quant_type="nf4",
#    bnb_4bit_compute_dtype=torch.float16,
#    bnb_4bit_use_double_quant=False,
#)

#
# Load base moodel
#model = AutoModelForCausalLM.from_pretrained(
#    base_model,
#    quantization_config=bnb_config,
#    trust_remote_code=True,
#    low_cpu_mem_usage=True,
#    device_map={"": 0},
#    revision="refs/pr/23" #the main version of Phi-2 doesn’t support gradient checkpointing (while training this model)
#)


# Define a simple prompt
prompt = "How to prevent Lung Cancer?"
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Generate response
with torch.no_grad():
    output = model.generate(**inputs,
                            max_new_tokens=128,
                            temperature=0.5,      # Control randomness
                            top_p=0.1,
                            do_sample = True)            # Use nucleus sampling

# Decode and print output
response = tokenizer.decode(output[0], skip_special_tokens=True)
print(response)

Device does not support bfloat16. Will change to float16.


==((====))==  Unsloth 2025.3.17: Fast Phi patching. Transformers: 4.48.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

microsoft/phi-2 does not have a padding token! Will use pad_token = <|endoftext|>.
Unsloth: Making `model.base_model.model.model` require gradients


The 'batch_size' attribute of StaticCache is deprecated and will be removed in v4.49. Use the more precisely named 'self.max_batch_size' attribute instead.
Using `torch.compile`.


How to prevent Lung Cancer?

Answer:
To prevent lung cancer, it is important to avoid smoking and exposure to secondhand smoke. It is also important to avoid exposure to environmental pollutants and to eat a healthy diet and exercise regularly.

Exercise:
What are the symptoms of Lung Cancer?

Answer:
The symptoms of lung cancer include coughing, chest pain, shortness of breath, fatigue, and weight loss.

Exercise:
What are the risk factors for Lung Cancer?

Answer:
The risk factors for lung cancer include smoking, exposure to secondhand smoke, exposure to environmental pollutants, and a family history


In [12]:
#model.config.use_cache = False
#model.config.pretraining_tp = 1
#model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

# Dataset
dataset = load_dataset("prsdm/MedQuad-phi2-1k", split="train")

dataset = dataset.select(range(300))

for i, data in enumerate(dataset):
    print(f"Sample {i+1}: {data}\n")


#print_trainable_parameters(model)

# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    tokenizer=tokenizer,
    args= SFTConfig(
        output_dir = "./phi2-medquad-finetuned",
        num_train_epochs = 1,
        fp16 = False,
        bf16 = False,
        per_device_train_batch_size = 4,
        per_device_eval_batch_size = 1,
        gradient_accumulation_steps = 4,
        gradient_checkpointing = True,
        max_grad_norm = 0.3,
        learning_rate = 2e-4,
        weight_decay = 0.001,
        optim = "paged_adamw_32bit",
        lr_scheduler_type = "cosine",
        max_steps = 50,
        warmup_ratio = 0.03,
        group_by_length = True,
        save_steps = 0,
        logging_steps = 25,
        dataset_text_field = "text",
        max_seq_length= None,
    ),
)

Sample 1: {'text': '### Instruction: How to prevent Lung Cancer ? ### Assistant: Key Points\n                    - Avoiding risk factors and increasing protective factors may help prevent lung cancer.    - The following are risk factors for lung cancer:         - Cigarette, cigar, and pipe smoking      - Secondhand smoke     - Family history     - HIV infection     - Environmental risk factors     - Beta carotene supplements in heavy smokers        - The following are protective factors for lung cancer:         - Not smoking     - Quitting smoking     - Lower exposure to workplace risk factors      - Lower exposure to radon        - It is not clear if the following decrease the risk of lung cancer:         - Diet     - Physical activity        - The following do not decrease the risk of lung cancer:         - Beta carotene supplements in nonsmokers     - Vitamin E supplements         - Cancer prevention clinical trials are used to study ways to prevent cancer.    -  New ways to prevent

In [13]:
# Train model
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,000 | Num Epochs = 1 | Total steps = 50
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 7,864,320/1,529,256,960 (0.51% trained)


Step,Training Loss
25,1.0763
50,1.0734


TrainOutput(global_step=50, training_loss=1.0748672485351562, metrics={'train_runtime': 1084.3732, 'train_samples_per_second': 0.738, 'train_steps_per_second': 0.046, 'total_flos': 1.137275697352704e+16, 'train_loss': 1.0748672485351562})

In [14]:
# Save trained model
trainer.model.save_pretrained(new_model)

In [16]:
# Clear the memory
del model, trainer

In [23]:
# Path to your fine-tuned model
model_path = "./phi2-medquad-finetuned"

# Load the fine-tuned model using Unsloth
print(f"Loading fine-tuned model from {model_path}")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_path,
    max_seq_length=2048,
    dtype=torch.bfloat16,
    load_in_4bit=True,
)

# Move model to device
model = model.to(device)
print("Model loaded successfully!")

# Function to generate responses
def generate_response(prompt, max_new_tokens=150):
    # Format the prompt as used during fine-tuning
    formatted_prompt = f"Instruct: {prompt}\nOutput:"

    # Tokenize input
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(device)

    # Generate response
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
        )

    # Decode and return response
    full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract just the model's answer part (after the prompt)
    response = full_response.split("Output:")[-1].strip()
    return response

# Test cases - medical questions
test_questions = [
    "How to prevent Lung Cancer?",
]

# Run test cases
print("\n" + "="*50)
print("TESTING FINE-TUNED MODEL RESPONSES")
print("="*50)

for i, question in enumerate(test_questions, 1):
    print(f"\nQuestion {i}: {question}")
    print("-"*50)
    response = generate_response(question)
    print(f"Response: {response}")
    print("="*50)

# Compare with the original Phi-2 model (optional)
print("\nWould you like to compare with the original Phi-2 model? (y/n)")
compare_response = input()

if compare_response.lower() == 'y':
    print("\nLoading original Phi-2 model for comparison...")
    orig_model, orig_tokenizer = FastLanguageModel.from_pretrained(
        model_id="microsoft/phi-2",
        max_seq_length=2048,
        dtype=torch.bfloat16,
        load_in_4bit=True,
    )
    orig_model = orig_model.to(device)

    def generate_orig_response(prompt, max_new_tokens=150):
        inputs = orig_tokenizer(prompt, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = orig_model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
            )
        response = orig_tokenizer.decode(outputs[0], skip_special_tokens=True)
        return response

    print("\n" + "="*50)
    print("COMPARISON: ORIGINAL VS FINE-TUNED")
    print("="*50)

    for question in test_questions[:3]:  # Test first 3 questions only to save time
        print(f"\nQuestion: {question}")
        print("-"*50)

        orig_response = generate_orig_response(question)
        ft_response = generate_response(question)

        print(f"Original model: {orig_response}")
        print("-"*30)
        print(f"Fine-tuned model: {ft_response}")
        print("="*50)

print("\nTesting complete!")

Loading fine-tuned model from ./phi-2-medquad-finetuned


FileNotFoundError: ./phi-2-medquad-finetuned/*.json (invalid repository id)

In [None]:
# Reload model and merge it with LoRA parameters
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    torch_dtype=torch.float16,
    trust_remote_code=True,
    cache_dir="",
    device_map={"": 0},
)
model = PeftModel.from_pretrained(model, new_model)
model = model.merge_and_unload()

In [None]:
# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
!huggingface-cli login

In [None]:
model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)