# 🧡 Track A – Unsloth Pre-Training (Memory Optimized)

This notebook uses **Unsloth** to continue pre-training `Qwen/Qwen2.5-Coder-3B` on the raw text of the Track A corpus, training LoRA adapters on top of a 4-bit quantized base model so that it does not run out of memory (OOM) on a free Colab T4 GPU.

### Why Unsloth?
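- Trains on a 4-bit quantized base model (QLoRA); Unsloth's optimized kernels are exact rather than approximate, so they add no accuracy loss compared with a standard QLoRA setup.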
- Reduces VRAM usage significantly (fits 3B/7B models easily on T4).
- 2x faster training.

---

## 📦 Dataset: [`archit11/hyperswitch-code-corpus-track-a`](https://huggingface.co/datasets/archit11/hyperswitch-code-corpus-track-a)

In [None]:
# Cell 1 – Install Unsloth & dependencies
%%capture
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

print("✓ Unsloth installed")

In [None]:
# Cell 2 – Load Model (4-bit)
from unsloth import FastLanguageModel
import torch

# Loading options. `max_seq_length` is reused by the trainer in Cell 5.
max_seq_length = 2048  # maximum sequence length passed to the loader and the trainer
dtype = None           # None lets the loader pick the dtype (float16 on T4/V100, bfloat16 on Ampere+)
load_in_4bit = True    # load 4-bit quantized weights to cut memory usage; may be set to False

# Fetch the base model together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="Qwen/Qwen2.5-Coder-3B",
    load_in_4bit=load_in_4bit,
    dtype=dtype,
    max_seq_length=max_seq_length,
)

print("✓ Model loaded in 4-bit mode")

In [None]:
# Cell 3 – Add LoRA adapters
# Wrap the loaded model with LoRA adapters on the attention projections
# (q/k/v/o) and the MLP projections (gate/up/down).
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    r=16,                 # LoRA rank; any value > 0 (8, 16, 32, 64, 128 are the suggested choices)
    lora_alpha=16,
    lora_dropout=0,       # other values are supported, 0 is the optimized setting
    bias="none",          # other values are supported, "none" is the optimized setting
    use_rslora=False,     # rank-stabilized LoRA is available but switched off here
    loftq_config=None,    # LoftQ is available but not used here
    use_gradient_checkpointing="unsloth",  # True or "unsloth"; the latter targets very long context
    random_state=3407,
)

print("✓ LoRA adapters added")

In [None]:
# Cell 4 – Load Dataset & chunking
from datasets import load_dataset

dataset = load_dataset(
    "archit11/hyperswitch-code-corpus-track-a",
    split="train",
)
print(f"✓ Loaded {len(dataset)} files")


def formatting_prompts_func(examples):
    """Return the batch's "text" column unchanged under the key "text".

    Pre-training consumes the raw text as-is; splitting it into fixed-length
    training sequences is left to the trainer's packing (see Cell 5).
    """
    raw_text = examples["text"]
    return {"text": raw_text}


# The trainer in Cell 5 reads the "text" field, so keep the data under that name.
dataset = dataset.map(formatting_prompts_func, batched=True)
print("✓ Dataset formatted")

In [None]:
# Cell 5 – Train
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Optimizer, schedule and precision settings.
# Each optimizer step sees 2 sequences x 4 accumulation steps = 8 sequences per device.
training_args = TrainingArguments(
    output_dir="outputs",
    seed=3407,
    num_train_epochs=1,  # raise this for a full training run
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    warmup_steps=5,
    weight_decay=0.01,
    optim="adamw_8bit",
    # Use bf16 where the GPU supports it, otherwise fall back to fp16.
    bf16=is_bfloat16_supported(),
    fp16=not is_bfloat16_supported(),
    logging_steps=1,
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    dataset_num_proc=2,
    max_seq_length=max_seq_length,
    packing=True,  # pack several short examples into one sequence for efficiency
    args=training_args,
)

trainer_stats = trainer.train()

In [None]:
# Cell 6 – Inference / Perplexity Check
# Note: this cell only generates a short completion as a sanity check;
# it does not compute a perplexity value.
FastLanguageModel.for_inference(model)  # switch the model to Unsloth's faster inference mode

# Prompt in the corpus' file-header style, followed by the start of a Rust main().
text = "// FILE: crates/router/src/main.rs\nfn main() {"

# Tokenize as a batch of one, then move the tensors to the GPU.
inputs = tokenizer([text], return_tensors="pt")
inputs = inputs.to("cuda")

outputs = model.generate(max_new_tokens=64, use_cache=True, **inputs)
print(tokenizer.decode(outputs[0]))