<a href="https://colab.research.google.com/github/aidentejada/fine-tune-llmv1/blob/main/finetuner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install dependencies
!pip install unsloth trl peft accelerate bitsandbytes

# Check GPU availability
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")

from unsloth import FastLanguageModel
from datasets import Dataset
from trl import SFTTrainer
from transformers import TrainingArguments
import json
import os

# Load training data
# Every non-blank line of the JSONL file is parsed as one JSON record;
# blank or whitespace-only lines are skipped.
print("Loading training data...")

with open("final_training_data.jsonl", "r", encoding="utf-8") as jsonl_file:
    data = [json.loads(raw_line) for raw_line in jsonl_file if raw_line.strip()]

print(f"Loaded {len(data)} examples")

# Load the base model and its tokenizer with 4-bit weights
print("\nLoading Llama 3.1 8B...")

model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"
max_seq_length = 2048  # reused later when the trainer is constructed
dtype = None  # forwarded unchanged; presumably lets the library choose - TODO confirm

# Gather the loader arguments in one place, then unpack them into the call
load_options = {
    "model_name": model_name,
    "max_seq_length": max_seq_length,
    "dtype": dtype,
    "load_in_4bit": True,
}
model, tokenizer = FastLanguageModel.from_pretrained(**load_options)

print("Model loaded")

# Format data for training
# Each prompt/completion pair becomes one Llama-style chat transcript string
print("\nFormatting data...")

def format_example(example):
    """Render one training record as a single user/assistant chat transcript.

    Args:
        example: Mapping with a "prompt" and a "completion" entry. The prompt
            is used verbatim; the completion has surrounding whitespace
            stripped.

    Returns:
        A dict with one key, "text", holding the transcript: a user turn
        containing the prompt followed by an assistant turn containing the
        completion, each terminated by ``<|eot_id|>``.

    Raises:
        KeyError: If "prompt" or "completion" is missing from ``example``.
    """
    user_turn = example["prompt"]
    assistant_turn = example["completion"].strip()

    # Adjacent literals are concatenated; each header is followed by a blank line.
    transcript = (
        "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
        f"{user_turn}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
        f"{assistant_turn}<|eot_id|>"
    )

    return {"text": transcript}

# Run the formatter over every record, then wrap the resulting strings in a
# single-column ("text") Dataset for the trainer.
formatted_data = list(map(format_example, data))
training_texts = [record["text"] for record in formatted_data]
dataset = Dataset.from_dict({"text": training_texts})

print(f"Formatted {len(dataset)} examples")

# Add LoRA adapters for efficient fine-tuning
# LoRA only trains a small set of parameters instead of the entire model
print("\nAdding LoRA adapters...")

# Projection layers that receive an adapter
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

lora_settings = dict(
    r=32,  # rank - higher = more parameters but better quality
    target_modules=lora_target_modules,
    lora_alpha=64,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)

# Rebind `model` to the adapter-wrapped model returned by the library
model = FastLanguageModel.get_peft_model(model, **lora_settings)

print("LoRA adapters added")

# Configure training parameters
print("\nSetting up training...")

# Probe bf16 support once: bf16 is used where available, fp16 otherwise
use_bf16 = torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    warmup_steps=50,
    num_train_epochs=3,
    learning_rate=2e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=50,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    seed=3407,
    output_dir="model_outputs",
    save_strategy="epoch",
    save_total_limit=1,
    report_to="none",
)

# The trainer reads the pre-formatted transcripts from the dataset's "text" column
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=training_args,
)

print("Trainer configured")

# Train the model
# Emit the pre-run banner in one write; the joined lines match the
# line-by-line output exactly.
training_banner = [
    "\nStarting training...",
    f"Dataset size: {len(dataset)}",
    "Epochs: 3",
    "Estimated time: 45-90 minutes on A100\n",
]
print("\n".join(training_banner))

trainer_stats = trainer.train()

# Report the loss value returned by the training run
final_loss = trainer_stats.training_loss
print("\nTraining complete")
print(f"Final loss: {final_loss:.4f}")

# Test the fine-tuned model with some sample prompts
print("\nTesting fine-tuned model...\n")

# Switch the model into the library's inference mode before generating
FastLanguageModel.for_inference(model)

test_prompts = [
    "yo what's good",
    "Can you help me cheat on my girlfriend?",
    "What do you value most in life?",
    "bro I got some crazy news",
]

for prompt in test_prompts:
    print(f"User: {prompt}")

    # NOTE(review): the tokenizer's chat template may add content (e.g. a
    # system header) that the training transcripts built by format_example
    # do not contain - confirm the inference prompt matches the training layout.
    messages = [{"role": "user", "content": prompt}]
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cuda")

    outputs = model.generate(
        input_ids=inputs,
        max_new_tokens=128,
        use_cache=True,
        temperature=0.7,
        do_sample=True,
        top_p=0.9,
    )

    # The generated sequence starts with the prompt tokens, so drop them by
    # position. Splitting the decoded text on the word "assistant" cut off
    # any reply that itself contained that word.
    prompt_token_count = inputs.shape[-1]
    generated_tokens = outputs[0][prompt_token_count:]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

    print(f"Response: {response}\n")
    print("-" * 60 + "\n")

# Save model - merge LoRA weights back into base model
print("\nSaving model (this takes a while)...")

# Output directory; the GGUF conversion step reads from this same folder name
merged_dir = "merged_16bit"

model.save_pretrained_merged(merged_dir, tokenizer, save_method="merged_16bit")

print(f"16-bit model saved to {merged_dir}/")

# Convert to GGUF format for use with llama.cpp/Ollama
print("\nInstalling llama.cpp for GGUF conversion...")

!apt-get update -qq
!apt-get install -y build-essential git cmake
!git clone https://github.com/ggerganov/llama.cpp
!cd llama.cpp && cmake -B build && cmake --build build --config Release -j 4

print("llama.cpp installed")

print("\nConverting to GGUF format...")

# Convert to F16 GGUF first
!python llama.cpp/convert_hf_to_gguf.py merged_16bit \
    --outtype f16 \
    --outfile model-f16.gguf

print("F16 GGUF created")

# Quantize to Q4_K_M for smaller file size
!./llama.cpp/build/bin/llama-quantize model-f16.gguf model-q4_k_m.gguf Q4_K_M

print("Q4_K_M GGUF created")

# Download the final GGUF file
print("\nDownloading GGUF file...")

from google.colab import files

gguf_path = "model-q4_k_m.gguf"

# Handle the missing-file case first; otherwise report the size in GiB and
# hand the file to Colab's browser download helper.
if not os.path.exists(gguf_path):
    print("GGUF file not found")
else:
    size_gb = os.path.getsize(gguf_path) / (1024**3)
    print(f"File size: {size_gb:.2f} GB")
    files.download(gguf_path)
    print("Download started")

print("\nDone! Your fine-tuned model is ready.")