# Campus_GPT Finetuning with Unsloth

This notebook fine-tunes Llama 3.1 8B Instruct on the generated RAFT dataset (`raft_dataset.jsonl`).
It uses 4-bit quantization and memory-efficient LoRA settings to fit within 12 GB of VRAM.

In [None]:
%%capture
# Install Unsloth and its pinned companion packages (Colab-style install; adjust if running on native Windows rather than WSL2)
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

In [None]:
from unsloth import FastLanguageModel
import torch

# Loading options; max_seq_length is reused further down when the trainer is built.
load_in_4bit = True    # 4-bit quantization to reduce memory usage; may be set to False.
dtype = None           # None = auto detection. Float16 for Tesla T4/V100, Bfloat16 for Ampere+.
max_seq_length = 4096  # Any value works; RoPE scaling is handled internally.

# Load the pre-quantized Llama 3.1 8B Instruct checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    load_in_4bit=load_in_4bit,
    dtype=dtype,
    max_seq_length=max_seq_length,
    model_name="unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
)

In [None]:
# 1. Prompt template in the Llama 3.1 chat layout: a system turn, a user turn
#    carrying the context and question, and the assistant turn to be learned.
# The assistant turn is split by the <|thought|> and <|answer|> markers so the
# model is trained to write its reasoning before the final answer.
# The string is assembled from one literal per output line so that every
# newline, and the trailing space after "question.", is explicit.
# NOTE(review): the template starts with <|begin_of_text|>; if the tokenizer
# also prepends a BOS token when the text is tokenized for training, examples
# would carry it twice - confirm.
prompt_template = (
    "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n"
    "\n"
    "You are Campus_GPT. Use the provided context to answer the user's question. \n"
    "Ignore distractors and show your reasoning before the final answer."
    "<|eot_id|><|start_header_id|>user<|end_header_id|>\n"
    "\n"
    "Context: {context}\n"
    "Question: {instruction}"
    "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n"
    "\n"
    "<|thought|>\n"
    "{thought}\n"
    "<|answer|>\n"
    "{answer}<|eot_id|>"
)

# End-of-sequence marker taken from the loaded tokenizer; formatting_prompts_func
# reads it to terminate each training example.
EOS_TOKEN = tokenizer.eos_token # Must be defined after loading model/tokenizer

def formatting_prompts_func(examples, *, template=None, eos_token=None):
    """Render a batch of RAFT records into training strings.

    Intended for ``dataset.map(..., batched=True)``: every value in
    ``examples`` is a list holding one entry per record.

    Args:
        examples: Mapping with the keys ``"question"``, ``"context"``,
            ``"thought_process"`` and ``"answer"``. A context entry that is a
            list is rendered as one ``"- "`` bullet per item; any other value
            is converted with ``str``.
        template: Format string with ``{context}``, ``{instruction}``,
            ``{thought}`` and ``{answer}`` fields. Defaults to the
            module-level ``prompt_template``.
        eos_token: End-of-sequence marker. Defaults to the module-level
            ``EOS_TOKEN``.

    Returns:
        ``{"text": [...]}`` with one formatted string per record.
    """
    if template is None:
        template = prompt_template
    if eos_token is None:
        eos_token = EOS_TOKEN

    texts = []
    for question, context, thought, answer in zip(
        examples["question"],
        examples["context"],
        examples["thought_process"],
        examples["answer"],
    ):
        # Context may arrive as a list of passages or as a single value.
        if isinstance(context, list):
            context_str = "\n".join(f"- {passage}" for passage in context)
        else:
            context_str = str(context)

        text = template.format(
            context=context_str,
            instruction=question,
            thought=thought,
            answer=answer,
        )
        # The default template already closes the assistant turn with
        # "<|eot_id|>". When that is also the tokenizer's EOS (presumably the
        # case for the Instruct checkpoint - verify), appending it again would
        # train on a doubled end marker, so only add it when it is missing.
        if not text.endswith(eos_token):
            text += eos_token
        texts.append(text)

    return {"text": texts}

In [None]:
from datasets import load_dataset
import os

dataset_filename = "raft_dataset.jsonl"

# The dataset is ignored by git, so a fresh checkout will not contain it.
# In Colab, offer an interactive upload; elsewhere just report the problem.
if not os.path.exists(dataset_filename):
    print(f"{dataset_filename} not found in current directory.")
    print("Since this file is ignored by git, you must upload it manually.")
    try:
        from google.colab import files
    except ImportError:
        print("Not running in Colab, or google.colab not found.")
    else:
        print("Please upload 'raft_dataset.jsonl' now...")
        uploaded = files.upload()
        if dataset_filename in uploaded:
            print("Upload successful!")

# Stop here with an actionable message if the file is still missing (upload
# skipped, cancelled, or stored under a different name) instead of letting
# load_dataset fail further down.
if not os.path.exists(dataset_filename):
    raise FileNotFoundError(
        f"{dataset_filename} is still missing; place it in the current directory "
        "(it is generated by generate_raft.py) and re-run this cell."
    )

# Load the dataset generated by generate_raft.py
dataset = load_dataset("json", data_files=dataset_filename, split="train")

# Add the "text" column produced by formatting_prompts_func to every record
dataset = dataset.map(formatting_prompts_func, batched=True)

In [None]:
# Attach LoRA adapters to the loaded model so that only a small share of the
# parameters (1-10%) is trained.
model = FastLanguageModel.get_peft_model(
    model,
    # Adapter rank: any number > 0; suggested values are 8, 16, 32, 64, 128.
    r=16,
    lora_alpha=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    # Any dropout / bias setting is supported, but 0 and "none" are the optimized ones.
    lora_dropout=0,
    bias="none",
    # True or "unsloth"; "unsloth" is noted to use 30% less VRAM and fit 2x
    # larger batch sizes, which helps with very long context.
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    # Rank-stabilized LoRA and LoftQ are supported but left disabled here.
    use_rslora=False,
    loftq_config=None,
)

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Supervised fine-tuning on the "text" column built by formatting_prompts_func.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length, # High enough for university policies
    dataset_num_proc = 2,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4, # Effective batch size of 2 * 4 = 8 per device
        warmup_steps = 5,
        max_steps = 60, # Increase to 300+ for a full run
        learning_rate = 2e-4,
        # The model is loaded with dtype = None (auto detection), so the
        # mixed-precision mode has to follow the GPU as well: hard-coding
        # fp16 = True conflicts with a model that was loaded in bfloat16.
        # NOTE(review): if dtype is ever pinned explicitly at load time,
        # set these two flags to match it.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        optim = "adamw_8bit", # Saves ~2GB of VRAM
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

In [None]:
#@title Show current memory stats
# Record the device properties and the peak reserved memory before training;
# both figures are reported in GB, rounded to three decimals.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 1024**3, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
# Train!
# Runs training with the trainer configured earlier; the value returned by
# train() is kept in trainer_stats for later inspection.
trainer_stats = trainer.train()

In [None]:
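# Inference Example (commented out; uncomment after training to try the model)
# NOTE: formatting the full training template with thought="" and answer=""
# leaves "<|answer|>" and the closing "<|eot_id|>" in the prompt. For real
# generation, cut the prompt right after the "<|thought|>" line so the model
# writes the reasoning and the answer itself.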
# FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# inputs = tokenizer(
# [
#     prompt_template.format(
#         context="Tuition at NKU is $450 per credit hour.",
#         instruction="How much does it cost to take a class?",
#         thought="", 
#         answer="",
#     )
# ], return_tensors = "pt").to("cuda")

# outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
# tokenizer.batch_decode(outputs)

In [None]:
# Save the model
model.save_pretrained("lora_model") # Local saving
# Store the tokenizer in the same directory so the saved output can be
# reloaded without fetching the tokenizer separately.
tokenizer.save_pretrained("lora_model")
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving