# 🧠 Track A – Extended Pre-Training Demo

Continue pre-training `Qwen/Qwen2.5-Coder-3B` on a curated Rust code corpus extracted from the [juspay/hyperswitch](https://github.com/juspay/hyperswitch) repository, then measure perplexity improvement.

---

## 📦 Dataset: [`archit11/hyperswitch-code-corpus-track-a`](https://huggingface.co/datasets/archit11/hyperswitch-code-corpus-track-a)

| Field | Detail |
|-------|--------|
| **Source** | `juspay/hyperswitch` – `crates/` Rust files only |
| **Total files** | 300 (top-ranked by quality score) |
| **Train** | 270 files |
| **Validation** | 30 files |
| **File format** | `file_name` + `text` (full file contents) |
| **License** | Apache 2.0 |

### Data Card Summary

| Filter | Detail |
|--------|--------|
| **Path filter** | `crates/` only, excludes `tests/`, `docs/`, `examples/`, `migrations/` |
| **Line count** | 25 – 4000 lines per file |
| **Quality filter** | Structurally rich files (functions + types ≥ 2) |
| **Ranking** | Top 300 by quality score from 1,526 candidates |
| **Chunking** | Fixed-size non-overlapping windows; `// FILE:` header per file |
| **Curriculum** | Sequence lengths 768 → 1024 → 1536 |

---

## 🤖 Model: [`archit11/qwen2.5-coder-3b-hyperswitch-track-a-merged`](https://huggingface.co/archit11/qwen2.5-coder-3b-hyperswitch-track-a-merged)

| Field | Detail |
|-------|--------|
| **Base** | `Qwen/Qwen2.5-Coder-3B` |
| **Method** | LoRA continued pre-training → merged |
| **LR** | 1e-3, cosine schedule |
| **Batch size** | 1 per device × 8 gradient-accumulation steps (effective batch of 8 sequences) |
| **Curriculum** | 768 → 1024 → 1536 token chunks |
| **Hardware** | T4 GPU, fp16 |

---

## 📊 Results (Reproduced Below)

| Metric | Baseline | Post-Training | Δ |
|--------|----------|---------------|---|
| **Perplexity** | 2.2832 | **1.5429** | **−32.42%** |

> ⚡ **Make sure Runtime → Change runtime type → T4 GPU is selected before running.**

> 📌 Lower perplexity = better: the model assigns higher probability to real Hyperswitch code.

In [None]:
# Cell 1 – Install dependencies
# The leading `!` is a notebook shell escape, so this cell only runs inside a
# Jupyter/Colab kernel, not as plain Python.
# transformers and peft are pinned to exact versions; datasets, accelerate and
# huggingface_hub are unpinned and resolve to whatever pip selects at run time.
!pip install -q transformers==5.2.0 peft==0.18.1 datasets accelerate huggingface_hub
print("✓ Dependencies installed")

In [None]:
# Cell 2 – Imports & config
import math, time, json, random
import torch
import numpy as np
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    TrainingArguments, Trainer,
    DataCollatorForLanguageModeling,
)
from peft import LoraConfig, get_peft_model, PeftModel

# Hub identifiers and local paths used by every later cell.
BASE_MODEL    = "Qwen/Qwen2.5-Coder-3B"
MERGED_HF     = "archit11/qwen2.5-coder-3b-hyperswitch-track-a-merged"
DATASET_REPO  = "archit11/hyperswitch-code-corpus-track-a"
OUTPUT_DIR    = "/content/track_a_lora"
DEVICE        = "cuda" if torch.cuda.is_available() else "cpu"

# Training hyperparameters (T4-safe)
BLOCK_SIZES   = [768, 1024, 1536]   # curriculum sequence lengths
EPOCHS_PER    = 1                   # 1 epoch per curriculum stage
BATCH_SIZE    = 1
GRAD_ACCUM    = 8                   # effective batch = BATCH_SIZE * GRAD_ACCUM
LR            = 1e-3
MAX_EVAL_CHUNKS = 160               # cap validation chunks for speed

# Seed every RNG the notebook touches so that the random validation-chunk
# sub-sample (and therefore the reported perplexities) is repeatable across runs.
SEED          = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

print(f"✓ Device : {DEVICE}")
print(f"✓ Base   : {BASE_MODEL}")
print(f"✓ Dataset: {DATASET_REPO}")
print(f"✓ Curriculum: {BLOCK_SIZES}")
print(f"✓ Seed   : {SEED}")

In [None]:
# Cell 3 – Load dataset & tokenizer
print(f"Loading {DATASET_REPO} ...")
ds = load_dataset(DATASET_REPO)

all_train_files = list(ds["train"])
if "validation" in ds:
    train_files = all_train_files
    val_files   = list(ds["validation"])
else:
    # No published validation split: hold out the trailing files for evaluation
    # AND remove them from the training set, otherwise the model would be
    # evaluated on files it was trained on. Never hold out more than half.
    n_holdout   = min(30, len(all_train_files) // 2)
    split_at    = len(all_train_files) - n_holdout
    train_files = all_train_files[:split_at]
    val_files   = all_train_files[split_at:]

print(f"✓ Train files : {len(train_files)}")
print(f"✓ Val files   : {len(val_files)}")
print(f"\nSample file: {train_files[0]['file_name']}")
print(f"  Length: {len(train_files[0]['text'])} chars")

print(f"\nLoading tokenizer: {BASE_MODEL} ...")
tokenizer = AutoTokenizer.from_pretrained(
    BASE_MODEL, trust_remote_code=True, fix_mistral_regex=True
)
# Some tokenizers ship without a pad token; fall back to EOS so that any
# padding-aware utility has a valid id to work with.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
print(f"✓ Tokenizer loaded (vocab size: {tokenizer.vocab_size:,})")

In [None]:
# Cell 4 – Chunking & perplexity helpers

def build_chunks(file_list, block_size, max_chunks=None, seed=0):
    """
    Concatenate files with // FILE: headers, tokenize, and split into
    fixed-size non-overlapping chunks of `block_size` tokens.

    Args:
        file_list: iterable of mappings with "file_name" and "text" keys.
        block_size: number of tokens per chunk; must be positive.
        max_chunks: optional cap on the number of chunks returned. ``None``
            (or 0) means "keep everything". When the cap applies, a random
            subset of `max_chunks` chunks is returned.
        seed: seed for the sub-sampling RNG. With the default, repeated calls
            on the same inputs return the same subset, so separate evaluations
            score identical chunks. Pass ``None`` for a non-deterministic draw.

    Returns:
        A list of token-id lists, each exactly `block_size` long. The trailing
        partial window (fewer than `block_size` tokens) is dropped.

    Raises:
        ValueError: if `block_size` is not positive.
    """
    if block_size <= 0:
        raise ValueError(f"block_size must be positive, got {block_size}")

    all_ids = []
    for item in file_list:
        header = f"// FILE: {item['file_name']}\n"
        # EOS after each file marks the document boundary in the token stream.
        text   = header + item["text"] + tokenizer.eos_token
        ids    = tokenizer(text, add_special_tokens=False)["input_ids"]
        all_ids.extend(ids)

    # Split into fixed-size chunks
    chunks = [
        all_ids[i : i + block_size]
        for i in range(0, len(all_ids) - block_size + 1, block_size)
    ]

    if max_chunks and len(chunks) > max_chunks:
        # A private RNG keeps the draw reproducible and leaves the global
        # `random` state untouched.
        chunks = random.Random(seed).sample(chunks, max_chunks)

    return chunks


def chunks_to_dataset(chunks):
    """Wrap token-id chunks in a HF Dataset for causal-LM training.

    Columns: `input_ids` and `labels` both hold the chunks themselves, and
    `attention_mask` is all ones with the same length as each chunk.
    """
    all_ones = [[1] * len(chunk) for chunk in chunks]
    columns = {
        "input_ids":      chunks,
        "attention_mask": all_ones,
        "labels":         chunks,
    }
    return Dataset.from_dict(columns)


@torch.no_grad()
def compute_perplexity(model, chunks, batch_size=4):
    """
    Compute perplexity as exp(token-weighted mean CE loss) over all chunks.
    PPL = exp( sum(loss * n_predicted) / total_predicted )

    Args:
        model: causal LM callable as ``model(input_ids=..., labels=...)`` whose
            output exposes a scalar ``.loss``.
        chunks: list of equal-length token-id lists.
        batch_size: number of chunks scored per forward pass.

    Returns:
        The perplexity as a Python float.

    Raises:
        ValueError: if there is nothing to score (no chunks, or chunks too
            short to contain a single next-token prediction).
    """
    # Prefer the model's own device; fall back to the notebook-wide DEVICE for
    # models that do not expose one.
    device = getattr(model, "device", None)
    if device is None:
        device = DEVICE

    model.eval()
    total_loss = 0.0
    total_toks = 0

    for i in range(0, len(chunks), batch_size):
        batch = chunks[i : i + batch_size]
        ids   = torch.tensor(batch, dtype=torch.long).to(device)
        out   = model(input_ids=ids, labels=ids)
        # The causal-LM loss is averaged over shifted positions, i.e. one
        # prediction fewer than the sequence length per row.
        n_tok = ids.shape[0] * (ids.shape[1] - 1)
        total_loss += out.loss.item() * n_tok
        total_toks += n_tok

    if total_toks == 0:
        raise ValueError("compute_perplexity needs at least one chunk of length >= 2")

    return math.exp(total_loss / total_toks)


print("✓ Helpers defined")

In [None]:
# Cell 5 – Baseline perplexity (Qwen2.5-Coder-3B, no fine-tuning)
# Scores the untouched base model on the validation chunks, then releases it
# so the training cell can load a fresh copy.
print(f"Loading base model: {BASE_MODEL} ...")
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.float16,
)
model.eval()

# Validation chunks use the longest curriculum length for a stable PPL estimate
print("Building validation chunks (block_size=1536) ...")
val_chunks = build_chunks(val_files, max_chunks=MAX_EVAL_CHUNKS, block_size=1536)
print(f"✓ {len(val_chunks)} validation chunks")

print("Computing baseline perplexity ...")
t0 = time.time()
baseline_ppl = compute_perplexity(model, val_chunks)
baseline_secs = time.time() - t0
print(f"✓ Baseline perplexity : {baseline_ppl:.4f}  ({baseline_secs:.1f}s)")

# Drop the only reference to the model, then release cached GPU blocks
del model
torch.cuda.empty_cache()
print("✓ GPU memory freed")

In [None]:
# Cell 6 – LoRA continued pre-training with sequence-length curriculum
#
# Curriculum: train on progressively longer chunks (768 -> 1024 -> 1536)
# This matches how the uploaded model was trained.
# Loss: standard next-token prediction (causal LM), no masking needed.
#
# The same PEFT-wrapped model is carried through every stage; each stage gets
# a fresh Trainer (and therefore a fresh optimizer and LR schedule).

# Local import: the chunks already carry equal-length `labels`, so a plain
# tensor-stacking collator is all that is required.
from transformers import default_data_collator

print(f"Loading model for training: {BASE_MODEL} ...")
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float16,
    trust_remote_code=True,
    device_map="auto",
)

# LoRA config: adapters on all attention and MLP projection layers
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Validation perplexity after each stage, keyed by that stage's block size
curriculum_ppl = {}

for block_size in BLOCK_SIZES:
    print(f"\n{'='*55}")
    print(f"  Curriculum stage: block_size={block_size}")
    print(f"{'='*55}")

    train_chunks = build_chunks(train_files, block_size=block_size)
    print(f"  Train chunks: {len(train_chunks)}")

    train_dataset = chunks_to_dataset(train_chunks)

    stage_output = f"{OUTPUT_DIR}/stage_{block_size}"
    training_args = TrainingArguments(
        output_dir=stage_output,
        num_train_epochs=EPOCHS_PER,
        per_device_train_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=GRAD_ACCUM,
        learning_rate=LR,
        lr_scheduler_type="cosine",
        warmup_ratio=0.05,
        fp16=True,
        logging_steps=20,
        save_strategy="no",
        report_to="none",
        dataloader_num_workers=0,
        remove_unused_columns=False,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        # DataCollatorForLanguageModeling rebuilds `labels` from `input_ids`
        # and sets every pad-token id to -100. If the pad token is the EOS
        # token (Cell 3 falls back to it), the end-of-file tokens inserted by
        # build_chunks would be excluded from the loss. Chunks are already
        # fixed-length and labelled, so stack them as-is instead.
        data_collator=default_data_collator,
    )

    t0 = time.time()
    trainer.train()
    elapsed = time.time() - t0

    # Eval PPL at this stage (validation chunks cut at the stage's own length)
    stage_chunks = build_chunks(val_files, block_size=block_size, max_chunks=MAX_EVAL_CHUNKS)
    stage_ppl    = compute_perplexity(model, stage_chunks)
    curriculum_ppl[block_size] = stage_ppl
    print(f"  ✓ Stage PPL: {stage_ppl:.4f}  (trained in {elapsed:.1f}s)")

# Save LoRA adapter (plus tokenizer files) for the optional merge cell
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"\n✓ LoRA adapter saved to {OUTPUT_DIR}")

In [None]:
# Cell 7 – Post-training perplexity (final evaluation)
print("Computing post-training perplexity on validation set ...")
# Score exactly the chunks the baseline was scored on in Cell 5. Rebuilding
# them would draw a new random sub-sample whenever the cap applies, making
# the before/after comparison unpaired.
final_chunks = val_chunks
posttrain_ppl = compute_perplexity(model, final_chunks)
print(f"✓ Post-training perplexity: {posttrain_ppl:.4f}")

# Also evaluate the uploaded merged model for reference
del model
torch.cuda.empty_cache()

print(f"\nLoading uploaded HF merged model: {MERGED_HF} ...")
# Bound up front so later cells can test it whether or not the load succeeds.
hf_ppl = None
try:
    hf_model = AutoModelForCausalLM.from_pretrained(
        MERGED_HF, torch_dtype=torch.float16, trust_remote_code=True, device_map="auto"
    )
    hf_ppl = compute_perplexity(hf_model, final_chunks)
    print(f"✓ HF merged model perplexity: {hf_ppl:.4f}")
    del hf_model
    torch.cuda.empty_cache()
except Exception as e:
    # Deliberately best-effort: the reference model is optional (network or
    # memory problems must not abort the notebook), so report and carry on.
    print(f"  (Could not load HF model: {e})")
    hf_ppl = None

In [None]:
# Cell 8 – Final comparison & curriculum summary
# Prints the before/after table and the per-stage perplexities, then writes
# the same numbers to a JSON file.
improvement = (baseline_ppl - posttrain_ppl) / baseline_ppl * 100
banner = "=" * 55

print("\n" + banner)
print("  FINAL COMPARISON")
print(banner)
print(f"  {'Metric':<25}  {'Value':>10}")
print(f"  {'-'*25}  {'-'*10}")
for row_label, row_value in (
    ("Baseline PPL", baseline_ppl),
    ("Post-training PPL", posttrain_ppl),
):
    print(f"  {row_label:<25}  {row_value:>10.4f}")
print(f"  {'Improvement':<25}  {improvement:>9.2f}%")
if hf_ppl:
    print(f"  {'HF merged model PPL':<25}  {hf_ppl:>10.4f}")
print(banner)

print("\n  Curriculum PPL progression:")
print(f"  {'Block size':<12}  {'Val PPL':>8}")
print(f"  {'-'*12}  {'-'*8}")
print(f"  {'baseline':<12}  {baseline_ppl:>8.4f}")
for stage_len in curriculum_ppl:
    print(f"  {stage_len:<12}  {curriculum_ppl[stage_len]:>8.4f}")

# Save metrics JSON (block sizes become string keys)
metrics = {}
metrics["baseline_ppl"] = baseline_ppl
metrics["posttrain_ppl"] = posttrain_ppl
metrics["improvement_pct"] = improvement
metrics["curriculum_ppl"] = {
    str(stage_len): stage_score for stage_len, stage_score in curriculum_ppl.items()
}
if hf_ppl:
    metrics["hf_merged_ppl"] = hf_ppl

with open("/content/track_a_metrics.json", "w") as metrics_file:
    json.dump(metrics, metrics_file, indent=2)
print("\n✓ Metrics saved to /content/track_a_metrics.json")
print("✓ Track A complete!")

In [None]:
# Cell 9 – [Optional] Merge LoRA adapter and push to Hugging Face
# Everything below is intentionally commented out; as written, this cell only
# prints a "skipping" message. Uncomment to merge the adapter saved in
# OUTPUT_DIR into a freshly loaded base model and upload model + tokenizer.
# NOTE(review): avoid pasting a real token into the `login(token=...)` line —
# it would be saved with the notebook. Consider calling `login()` with no
# arguments (interactive prompt) or reading the token from a secret instead.

# from huggingface_hub import login
# login(token="hf_YOUR_TOKEN_HERE")
#
# print("Merging LoRA adapter into base model ...")
# base = AutoModelForCausalLM.from_pretrained(
#     BASE_MODEL, torch_dtype=torch.float16, trust_remote_code=True
# )
# merged = PeftModel.from_pretrained(base, OUTPUT_DIR).merge_and_unload()
# merged.push_to_hub("YOUR_HF_USERNAME/track_a_merged")
# tokenizer.push_to_hub("YOUR_HF_USERNAME/track_a_merged")
# print("✓ Merged model pushed to Hugging Face Hub")

print("Skipping merge/upload (uncomment above to push to HF Hub)")