# Phi-3 Mini Fine-tuning V3 on Google Colab T4 GPU

**Configuration:**
- Model: microsoft/Phi-3-mini-4k-instruct
- Dataset: 4,500 train / 500 valid (enriched with diverse meat/fish)
- Batch size: 2 per device (gradient accumulation: 2, effective batch size: 4)
- Learning rate: 5e-5
- Epochs: 2
- LoRA rank: 8, alpha: 16

**Estimated time on T4**: 2-2.5 hours

## 1. Setup Environment

In [None]:
# Check GPU: run nvidia-smi (notebook shell command) to confirm that a GPU
# runtime is attached before installing packages or loading the model.
!nvidia-smi

In [None]:
# Install required packages (without trl to avoid compatibility issues).
# -q keeps pip quiet; no versions are pinned, so whatever releases pip
# resolves at run time are installed.
!pip install -q transformers datasets peft bitsandbytes accelerate tensorboard

## 2. Upload Training Data

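Upload the `train.jsonl` and `valid.jsonl` files using the file upload button in the left sidebar. Each line of both files must be a JSON object with a `text` field containing the full training example.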

In [None]:
# Verify uploaded files
import os
import json

print("Checking files...")

# Fail with an explicit exception rather than `assert`: assertions are skipped
# when Python runs with -O, which would let a missing file slip through to a
# later cell and fail there with a less helpful error.
for required_file in ('train.jsonl', 'valid.jsonl'):
    if not os.path.exists(required_file):
        raise FileNotFoundError(f"Please upload {required_file}")

# Count samples: one record per non-blank line. Blank lines (for example a
# trailing empty line at the end of the file) are not samples and are skipped.
# The encoding is given explicitly so the count does not depend on the
# platform's default text encoding.
with open('train.jsonl', 'r', encoding='utf-8') as f:
    train_count = sum(1 for line in f if line.strip())

with open('valid.jsonl', 'r', encoding='utf-8') as f:
    valid_count = sum(1 for line in f if line.strip())

print(f"✅ Train samples: {train_count:,}")
print(f"✅ Valid samples: {valid_count:,}")

## 3. Load Model and Setup LoRA

In [None]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset

# Model configuration
model_name = "microsoft/Phi-3-mini-4k-instruct"
output_dir = "./phi3-recipe-lora-v3"

print(f"Loading model: {model_name}")

# Pick the compute dtype from the GPU generation. bfloat16 is implemented in
# hardware only from compute capability 8.0 (Ampere) upwards; the T4 this
# notebook targets is 7.5 (Turing), so it has to use float16 instead.
if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8:
    compute_dtype = torch.bfloat16
else:
    compute_dtype = torch.float16

# 4-bit quantization config (NF4 weights with double quantization)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)

# Load model
# NOTE: trust_remote_code=True allows Python code shipped in the model
# repository to run locally; keep it only for repositories you trust.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=compute_dtype,
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Reuse EOS as the padding token and pad on the right-hand side of sequences
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

print("✅ Model and tokenizer loaded")

In [None]:
# Prepare the quantized model for k-bit training
model = prepare_model_for_kbit_training(model)

# LoRA configuration (V3 settings)
peft_config = LoraConfig(
    r=8,                    # LoRA rank
    lora_alpha=16,          # LoRA alpha (reduced from 20)
    lora_dropout=0.05,
    # Phi-3 fuses its projections: attention uses a single `qkv_proj` and the
    # MLP a single `gate_up_proj`. The separate q_proj/k_proj/v_proj and
    # gate_proj/up_proj names used by Llama-style models do not exist in this
    # architecture, so listing them left the adapters attached only to
    # `o_proj` and `down_proj`.
    target_modules=[
        "qkv_proj",
        "o_proj",
        "gate_up_proj",
        "down_proj",
    ],
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, peft_config)

# Print trainable parameters (those with requires_grad) against the total count
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())
print(f"\n📊 Trainable parameters: {trainable_params:,} ({100 * trainable_params / total_params:.3f}%)")
print(f"📊 Total parameters: {total_params:,}")

## 4. Load and Tokenize Dataset

In [None]:
# Read both JSONL files into one dataset object indexed by split name
dataset = load_dataset(
    "json",
    data_files=dict(train="train.jsonl", validation="valid.jsonl"),
)

print("‚úÖ Dataset loaded:")
print(f"   Train: {len(dataset['train'])} samples")
print(f"   Valid: {len(dataset['validation'])} samples")

# Preview the first training record: its length and its opening characters
print("\nüìù Sample data:")
sample = dataset['train'][0]
print(f"Text length: {len(sample['text'])} chars")
print(f"First 200 chars: {sample['text'][:200]}...")

In [None]:
# Tokenize function
def tokenize_function(examples):
    """Tokenize a batch of raw examples for causal-LM fine-tuning.

    Args:
        examples: Batch mapping whose ``"text"`` entry holds the raw strings
            to tokenize (this function is applied with ``batched=True``).

    Returns:
        The tokenizer output for the batch, truncated to at most 2048 tokens
        per example and left unpadded.
    """
    # No padding here: padding every example to 2048 tokens makes each
    # training step pay for a full-length sequence however short the text is.
    # The data collator pads each batch to its longest member instead.
    #
    # No "labels" column either: DataCollatorForLanguageModeling(mlm=False)
    # rebuilds the labels from the padded input_ids and masks the padding
    # positions, and a ragged "labels" column would prevent it from padding
    # the batch into tensors.
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=2048,
        padding=False,
        return_tensors=None,
    )

print("Tokenizing datasets...")


def _tokenize_split(split_name, progress_label):
    """Run tokenize_function over one split, dropping its original columns."""
    split = dataset[split_name]
    return split.map(
        tokenize_function,
        batched=True,
        remove_columns=split.column_names,
        desc=progress_label,
    )


# Same mapping for both splits; only the split name and progress label differ
tokenized_train = _tokenize_split("train", "Tokenizing train dataset")
tokenized_valid = _tokenize_split("validation", "Tokenizing validation dataset")

print(f"‚úÖ Tokenized train: {len(tokenized_train)} samples")
print(f"‚úÖ Tokenized valid: {len(tokenized_valid)} samples")

## 5. Training Configuration

In [None]:
# Training arguments (V3 optimized)
import torch

# Mixed-precision mode. bf16 is implemented in hardware only from compute
# capability 8.0 (Ampere) upwards; the T4 this notebook targets is 7.5, so
# fp16 is used there. Both flags stay off when no CUDA device is present.
use_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
use_fp16 = torch.cuda.is_available() and not use_bf16

training_args = TrainingArguments(
    output_dir=output_dir,

    # Training schedule
    num_train_epochs=2,              # Reduced from 3 (prevent overfitting)
    per_device_train_batch_size=2,   # Batch size 2
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=2,   # Effective batch size = 4

    # Optimizer
    learning_rate=5e-5,              # Conservative (prevent overfitting)
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    weight_decay=0.01,
    optim="paged_adamw_8bit",        # Memory efficient

    # Logging and evaluation
    logging_steps=50,
    eval_steps=200,
    save_steps=400,                  # Must be multiple of eval_steps (200)
    eval_strategy="steps",           # Changed from evaluation_strategy
    save_strategy="steps",

    # Memory optimization
    fp16=use_fp16,
    bf16=use_bf16,
    max_grad_norm=1.0,
    gradient_checkpointing=True,

    # Other
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,         # Lower eval_loss is better
    report_to="tensorboard",
    seed=42,
)

print("✅ Training arguments configured")
print(f"\n📊 Training details:")
print(f"   Epochs: {training_args.num_train_epochs}")
print(f"   Batch size: {training_args.per_device_train_batch_size}")
print(f"   Gradient accumulation: {training_args.gradient_accumulation_steps}")
print(f"   Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
print(f"   Learning rate: {training_args.learning_rate}")
# Estimate only: samples // effective batch size, times the number of epochs
print(f"   Total steps: {len(dataset['train']) // (training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps) * training_args.num_train_epochs}")

## 6. Initialize Trainer

In [None]:
# Collator that batches tokenized examples for language modeling;
# mlm=False selects causal LM rather than masked LM
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Wire the model, arguments, both tokenized splits and the collator together
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
)

print("‚úÖ Trainer initialized")

## 7. Start Training

In [None]:
# 70-character rule printed above and below the time estimate
banner = "=" * 70

print("üöÄ Starting fine-tuning...")
print(banner)
print("‚è±Ô∏è  Estimated time: 2-2.5 hours on T4 GPU")
print(banner)

# Run the training loop
trainer.train()

print("\n‚úÖ Training completed!")

## 8. Save Model

In [None]:
# Write the trained model and the tokenizer side by side into output_dir
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"‚úÖ Model saved to {output_dir}")
print("\nüì¶ Adapter files saved:")
# One entry per line, emitted through a single print call
print(
    "   - adapter_config.json",
    "   - adapter_model.safetensors",
    "   - tokenizer files",
    sep="\n",
)

## 9. Test Inference (Optional)

In [None]:
# Quick test: generate one recipe for a vegetarian request and check that the
# forbidden ingredient (chicken) does not appear in the model's answer.
import torch

test_prompt = """<|system|>
You are a creative recipe generator with access to the user's pantry inventory.

Available ingredients in pantry:
- tomato (vegetable)
- onion (vegetable)
- garlic (vegetable)
- chicken breast (meat) [FORBIDDEN for vegetarian]
- olive oil (oil)
- salt (seasoning)
- pepper (seasoning)

User dietary preference: vegetarian

Instructions:
1. Based on the user's request, select appropriate ingredients from the available inventory
2. IMPORTANT: Respect dietary restrictions - do NOT select meat/fish for vegetarian recipes
3. Generate a complete, practical recipe with clear steps<|end|>
<|user|>
I want a healthy vegetarian dinner<|end|>
<|assistant|>
"""

# Switch to inference mode so that LoRA dropout is not applied while sampling
model.eval()

# Place the inputs on the model's own device instead of assuming "cuda"
inputs = tokenizer(test_prompt, return_tensors="pt").to(model.device)
prompt_length = inputs["input_ids"].shape[1]

# No gradients are needed for generation
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )

# Full transcript (prompt + answer, special tokens kept) for display
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)
print("\n🧪 Test Generation:")
print("="*70)
print(generated_text)
print("="*70)

# Check if it correctly avoided chicken breast. Only the newly generated tokens
# are inspected: the prompt itself mentions chicken, and slicing by prompt
# length avoids relying on the "<|assistant|>" marker being present exactly
# once in the decoded text.
response_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
if "chicken" not in response_text.lower():
    print("\n✅ SUCCESS: Model correctly avoided chicken breast for vegetarian recipe!")
else:
    print("\n⚠️ WARNING: Model selected chicken for vegetarian recipe")

## 10. Download Trained Adapter

Zip and download the adapter files:

In [None]:
# Zip the adapter
!zip -r phi3-recipe-lora-v3.zip {output_dir}

print("‚úÖ Adapter zipped: phi3-recipe-lora-v3.zip")
print("\nüì• Download the zip file using the file browser on the left")

# Show file size
import os
size_mb = os.path.getsize('phi3-recipe-lora-v3.zip') / (1024 * 1024)
print(f"\nüì¶ File size: {size_mb:.2f} MB")
print("\nüí° After downloading, extract and use the adapter on your Mac M4 Pro with MLX!")