<a href="https://colab.research.google.com/github/aidentejada/fine-tune-llmv1/blob/main/finetuner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install dependencies
!pip install unsloth trl peft accelerate bitsandbytes

# Check GPU availability
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")

from unsloth import FastLanguageModel
from datasets import Dataset
from trl import SFTTrainer
from transformers import TrainingArguments
import json
import os

# Load training data
# Every non-blank line of the JSONL file must be a JSON object carrying the
# "prompt" and "completion" fields that format_example() reads later on.
print("Loading training data...")

data = []
with open("final_training_data.jsonl", "r", encoding="utf-8") as f:
    for line_no, line in enumerate(f, start=1):
        if not line.strip():
            continue  # blank lines are tolerated and skipped
        try:
            record = json.loads(line)
        except json.JSONDecodeError as err:
            # Re-raise with the line number so the bad record can be located
            raise ValueError(
                f"final_training_data.jsonl line {line_no}: invalid JSON ({err})"
            ) from err
        # Fail here, with context, rather than with a bare KeyError during
        # formatting thousands of records later.
        if not isinstance(record, dict) or not {"prompt", "completion"} <= record.keys():
            raise ValueError(
                f"final_training_data.jsonl line {line_no}: "
                'expected a JSON object with "prompt" and "completion" keys'
            )
        data.append(record)

print(f"Loaded {len(data)} examples")

# Load the pre-quantized 4-bit base checkpoint together with its tokenizer
print("\nLoading Llama 3.1 8B...")

max_seq_length = 2048  # also handed to the trainer further down
# NOTE(review): None presumably leaves dtype selection to Unsloth - confirm
dtype = None
model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"

load_options = dict(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=True,
)
model, tokenizer = FastLanguageModel.from_pretrained(**load_options)

print("Model loaded")

# Format data for training
# Each prompt/completion pair becomes one single-turn Llama chat transcript
print("\nFormatting data...")

def format_example(example):
    """Render one training record as a user/assistant chat string.

    Args:
        example: Mapping with "prompt" and "completion" entries.

    Returns:
        A dict with a single "text" key holding the formatted transcript.
        The completion is whitespace-stripped; the prompt is used verbatim.
    """
    user_turn = example["prompt"]
    assistant_turn = example["completion"].strip()

    # Assemble the transcript piece by piece; headers are followed by a
    # blank line, and each turn is closed with <|eot_id|>.
    pieces = [
        "<|begin_of_text|>",
        "<|start_header_id|>user<|end_header_id|>\n\n",
        f"{user_turn}<|eot_id|>",
        "<|start_header_id|>assistant<|end_header_id|>\n\n",
        f"{assistant_turn}<|eot_id|>",
    ]
    return {"text": "".join(pieces)}

# Run every record through the formatter, then wrap the texts in a Dataset
formatted_data = list(map(format_example, data))
dataset = Dataset.from_dict(
    {"text": [entry["text"] for entry in formatted_data]}
)

print(f"Formatted {len(dataset)} examples")

# Attach LoRA adapters for efficient fine-tuning:
# only a small set of adapter parameters is trained, not the entire model
print("\nAdding LoRA adapters...")

# Names of the modules that receive adapters
lora_targets = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

model = FastLanguageModel.get_peft_model(
    model,
    target_modules=lora_targets,
    r=32,  # rank: higher means more trainable parameters
    lora_alpha=64,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)

print("LoRA adapters added")

# Build the SFT trainer and its hyper-parameters
print("\nSetting up training...")

# Prefer bf16 when the GPU supports it, otherwise fall back to fp16
use_bf16 = torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    # Output and checkpointing: one checkpoint per epoch, keep only the latest
    output_dir="model_outputs",
    save_strategy="epoch",
    save_total_limit=1,
    report_to="none",
    logging_steps=50,
    # Batching: 4 sequences x 4 accumulation steps per optimizer update
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    # Schedule
    num_train_epochs=3,
    warmup_steps=50,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    # Optimizer
    optim="adamw_8bit",
    weight_decay=0.01,
    # Precision and reproducibility
    bf16=use_bf16,
    fp16=not use_bf16,
    seed=3407,
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=training_args,
)

print("Trainer configured")

# Train the model
print("\nStarting training...")
print(f"Dataset size: {len(dataset)}")
# Read the epoch count from the trainer's own config so this message cannot
# drift from what will actually run (":g" prints 3 and 3.0 alike as "3").
print(f"Epochs: {trainer.args.num_train_epochs:g}")
print("Estimated time: 45-90 minutes on A100\n")

trainer_stats = trainer.train()

print("\nTraining complete")
# NOTE(review): training_loss is presumably the run-wide average reported by
# the trainer rather than the last logged step - confirm before relabelling.
print(f"Final loss: {trainer_stats.training_loss:.4f}")

# Test the fine-tuned model with some sample prompts
print("\nTesting fine-tuned model...\n")

FastLanguageModel.for_inference(model)

test_prompts = [
    "yo what's good",
    "Can you help me cheat on my girlfriend?",
    "What do you value most in life?",
    "bro I got some crazy news",
]

for prompt in test_prompts:
    print(f"User: {prompt}")

    messages = [{"role": "user", "content": prompt}]
    # NOTE(review): the training text was hand-formatted by format_example()
    # with no system turn; confirm the tokenizer's chat template produces the
    # same layout, otherwise inference prompts differ from training prompts.
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cuda")

    outputs = model.generate(
        input_ids=inputs,
        max_new_tokens=128,
        use_cache=True,
        temperature=0.7,
        do_sample=True,
        top_p=0.9,
    )

    # The generated sequence starts with the prompt tokens; slice them off so
    # only the reply is decoded. Splitting the decoded text on "assistant"
    # would cut any reply that itself contains that word.
    reply_tokens = outputs[0][inputs.shape[-1]:]
    response = tokenizer.decode(reply_tokens, skip_special_tokens=True).strip()

    print(f"Response: {response}\n")
    print("-" * 60 + "\n")

# Persist the result: merge the LoRA weights back into the base model and
# write the merged 16-bit checkpoint to disk
print("\nSaving model (this takes a while)...")

merged_dir = "merged_16bit"  # directory the GGUF conversion step reads from
model.save_pretrained_merged(merged_dir, tokenizer, save_method="merged_16bit")

print("16-bit model saved to merged_16bit/")

# Convert to GGUF format for use with llama.cpp/Ollama
print("\nInstalling llama.cpp for GGUF conversion...")

!apt-get update -qq
!apt-get install -y build-essential git cmake
!git clone https://github.com/ggerganov/llama.cpp
!cd llama.cpp && cmake -B build && cmake --build build --config Release -j 4

print("llama.cpp installed")

print("\nConverting to GGUF format...")

# Convert to F16 GGUF first
!python llama.cpp/convert_hf_to_gguf.py merged_16bit \
    --outtype f16 \
    --outfile model-f16.gguf

print("F16 GGUF created")

# Quantize to Q4_K_M for smaller file size
!./llama.cpp/build/bin/llama-quantize model-f16.gguf model-q4_k_m.gguf Q4_K_M

print("Q4_K_M GGUF created")

# Download the final GGUF file
print("\nDownloading GGUF file...")

from google.colab import files

gguf_path = "model-q4_k_m.gguf"

if os.path.exists(gguf_path):
    print(f"File size: {os.path.getsize(gguf_path) / (1024**3):.2f} GB")
    files.download(gguf_path)
    print("Download started")
    # Announce success only when there really is a model file to hand over
    print("\nDone! Your fine-tuned model is ready.")
else:
    print("GGUF file not found")

CUDA available: True
GPU: NVIDIA A100-SXM4-80GB
📖 Loading training data...
✅ Loaded 21726 examples

🤖 Loading Llama 3.1 8B...
==((====))==  Unsloth 2025.11.6: Fast Llama patching. Transformers: 4.57.2.
   \\   /|    NVIDIA A100-SXM4-80GB. Num GPUs = 1. Max memory: 79.318 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 8.0. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
✅ Model loaded

📝 Formatting data...
✅ Formatted 21726 examples

🔧 Adding LoRA adapters...
✅ LoRA adapters added

⚙️ Setting up training...


Unsloth: Tokenizing ["text"] (num_proc=16):   0%|          | 0/21726 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


✅ Trainer configured

🚀 Starting training...
   Dataset size: 21726
   Epochs: 2 (faster)
   Estimated time: 45-60 minutes on A100



==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 21,726 | Num Epochs = 3 | Total steps = 4,074
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 83,886,080 of 8,114,147,328 (1.03% trained)


Step,Training Loss
50,3.4862
100,2.5684
150,2.5629
200,2.5774
250,2.5293
300,2.5011
350,2.4724
400,2.4958
450,2.5304
500,2.5046



✅ Training complete!
   Final loss: 1.9735

🧪 Testing fine-tuned model...

📥 User: yo what's good
🤖 Aiden: yo

------------------------------------------------------------

📥 User: Can you help me cheat on my girlfriend?
🤖 Aiden: nope

------------------------------------------------------------

📥 User: What do you value most in life?
🤖 Aiden: time and health over money and materials, always tried to live my life the way i wanted to, i hate being held back

------------------------------------------------------------

📥 User: bro I got some crazy news
🤖 Aiden: What

------------------------------------------------------------


💾 Saving model (this takes a while)...
Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...
Cache check failed: model-00001-of-00004.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Checking cache directory fo

Unsloth: Preparing safetensor model files: 100%|██████████| 4/4 [00:00<00:00, 36235.89it/s]


Note: tokenizer.model not found (this is OK for non-SentencePiece models)


Unsloth: Merging weights into 16bit: 100%|██████████| 4/4 [01:01<00:00, 15.44s/it]


Unsloth: Merge process complete. Saved to `/content/aiden_merged_16bit`
✅ 16-bit model saved to aiden_merged_16bit/

🔧 Installing llama.cpp for GGUF conversion...
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
build-essential is already the newest version (12.9ubuntu3).
cmake is already the newest version (3.22.1-1ubuntu1.22.04.2).
git is already the newest version (1:2.34.1-1ubuntu1.15).
0 upgraded, 0 newly installed, 0 to remove and 65 not upgraded.
Cloning into 'llama.cpp'...
remote: Enumerating objects: 70955, done.
remote: Counting objects: 100% (330/330), done.
remote: Compressing objects: 100% (236/236), done.
remote: Total 70955 (delta 222), reused 94 (delta 94), pack-reused 70625 (from 4)
Receiving objects: 100% (70955/70955)