# Track 1: Dataset + Model Training + Perplexity

Self-contained notebook for Track 1:

1. Load dataset from Hugging Face
2. Build token chunks
3. Train LoRA model (optional)
4. Load model from HF (or local output) and show perplexity


In [None]:
from pathlib import Path
import json
import math
import os
import random

import torch
from datasets import load_dataset, Dataset
from peft import LoraConfig, TaskType, get_peft_model, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# Set UV_CACHE_DIR for this process (presumably consumed by the `uv` tool —
# nothing else in this notebook reads it).
os.environ.update({'UV_CACHE_DIR': '/tmp/uv-cache'})

# Absolute, resolved path of the current working directory.
ROOT = Path('.').resolve()
print(f'ROOT = {ROOT}')


## Config

In [None]:
# ---------------------------------------------------------------------------
# Data source
# ---------------------------------------------------------------------------
# Hugging Face dataset repo that is loaded in section 1.
HF_DATASET_REPO = 'archit11/hyperswitch-code-corpus-track-a'

# ---------------------------------------------------------------------------
# Models
# ---------------------------------------------------------------------------
# Checkpoint used both for the baseline measurement and as the LoRA base.
BASE_MODEL_ID = 'Qwen/Qwen2.5-Coder-3B'

# Checkpoint loaded in the last section for the final perplexity display.
# A merged full-model repo can be loaded directly.
HF_FINAL_MODEL_REPO = 'archit11/qwen2.5-coder-3b-hyperswitch-track-a-merged'

# Switch: when True, the LoRA training section actually trains.
RUN_TRAINING = False

# ---------------------------------------------------------------------------
# Training hyperparameters
# ---------------------------------------------------------------------------
SEED = 42
# Block sizes (in tokens) used by the successive training stages, in order.
SEQUENCE_CURRICULUM = [768, 1024, 1536]
# Block size (in tokens) used for every perplexity measurement.
EVAL_BLOCK_SIZE = 1536
# Epoch budget that is divided across the curriculum stages.
TOTAL_EPOCHS = 3.0
BATCH_SIZE = 1
EVAL_BATCH_SIZE = 1
GRAD_ACCUM_STEPS = 8
LEARNING_RATE = 1e-3
MAX_GRAD_NORM = 0.5
# Upper bounds on the number of chunks kept per split (see maybe_trim).
MAX_TRAIN_CHUNKS = 900
MAX_VAL_CHUNKS = 160

# ---------------------------------------------------------------------------
# Output locations (relative to the working directory)
# ---------------------------------------------------------------------------
OUTPUT_DIR = Path('results/track1_hf_self_contained')
MODEL_DIR = OUTPUT_DIR.joinpath('model')
METRICS_FILE = OUTPUT_DIR.joinpath('track1_metrics.json')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Seed both Python's and PyTorch's global RNGs.
random.seed(SEED)
torch.manual_seed(SEED)

# Half precision when a CUDA device is available, full precision on CPU.
if torch.cuda.is_available():
    device, dtype = 'cuda', torch.float16
else:
    device, dtype = 'cpu', torch.float32

print(f'device = {device}')
print(f'run_training = {RUN_TRAINING}')
print(f'output_dir = {OUTPUT_DIR}')


## 1) Load Dataset from HF

In [None]:
# Load every split published in the dataset repo and report their sizes.
hf_ds = load_dataset(HF_DATASET_REPO)
print(hf_ds)
for split_name, split_data in hf_ds.items():
    print(split_name, len(split_data))

# Validate with an explicit raise: an `assert` would be stripped when Python
# runs with -O, letting a missing split surface later as an obscure KeyError.
if 'train' not in hf_ds:
    raise ValueError('Dataset must contain train split')

train_split = hf_ds['train']
if 'validation' in hf_ds:
    val_split = hf_ds['validation']
else:
    # No validation split published: hold out 10% of train, seeded by SEED
    # so the same files are held out on every run.
    split = train_split.train_test_split(test_size=0.1, seed=SEED)
    train_split = split['train']
    val_split = split['test']

print('train files =', len(train_split))
print('val files   =', len(val_split))
print('columns     =', train_split.column_names)
print('sample file =', train_split[0]['file_name'])


## 2) Helpers (Chunking + Perplexity)

In [None]:
def file_to_text(row):
    """Render one dataset row as a single text document.

    The result is a ``// FILE: <file_name>`` header line followed by the
    row's ``text`` field.
    """
    name, body = row['file_name'], row['text']
    return '// FILE: {}\n{}'.format(name, body)

def build_chunk_dataset(split_ds, tokenizer, block_size):
    """Tokenize every row of ``split_ds`` and cut it into fixed-size chunks.

    Each row is rendered with ``file_to_text`` and tokenized without special
    tokens. Only full, non-overlapping windows of ``block_size`` tokens are
    kept: a trailing remainder shorter than ``block_size`` is dropped, and so
    is any file that is shorter than one block. Chunks never span two files.

    Returns a ``Dataset`` whose rows each hold one ``input_ids`` list.
    """
    records = []
    for example in split_ds:
        encoded = tokenizer(
            file_to_text(example),
            add_special_tokens=False,
            return_attention_mask=False,
        )
        ids = encoded['input_ids']
        # Last index at which a complete block can still start.
        last_start = len(ids) - block_size
        records.extend(
            {'input_ids': ids[start:start + block_size]}
            for start in range(0, last_start + 1, block_size)
        )
    return Dataset.from_list(records)

def maybe_trim(ds, limit, seed):
    """Reproducibly down-sample ``ds`` to at most ``limit`` rows.

    ``ds`` is returned unchanged when ``limit`` is not positive or when it
    already holds no more than ``limit`` rows. Otherwise a subset of
    ``limit`` row indices is drawn with a private RNG seeded by ``seed`` and
    passed to ``ds.select`` in ascending order, so the kept rows retain
    their original relative order.
    """
    if not (limit > 0 and len(ds) > limit):
        return ds
    order = list(range(len(ds)))
    random.Random(seed).shuffle(order)
    keep = order[:limit]
    keep.sort()
    return ds.select(keep)

@torch.no_grad()
def compute_ppl(model, ds, batch_size=1):
    """Return the token-level perplexity of ``model`` over ``ds``.

    Args:
        model: Causal language model called as
            ``model(input_ids=..., labels=...)`` whose output exposes a scalar
            ``loss`` tensor. The weighting below assumes that loss is the mean
            over next-token targets, i.e. labels are shifted by one inside the
            model (the Hugging Face causal-LM convention).
        ds: Sliceable dataset; ``ds[i:j]['input_ids']`` must yield token-id
            lists of equal length so they can be stacked into one tensor.
        batch_size: Number of chunks evaluated per forward pass.

    Returns:
        ``exp`` of the mean loss per predicted token across all batches.

    Raises:
        ValueError: If no batch contributes a predicted token (empty dataset,
            or only sequences of length 1).
    """
    model.eval()
    dev = next(model.parameters()).device
    total_loss = 0.0
    total_tokens = 0
    for i in range(0, len(ds), batch_size):
        b = ds[i:i + batch_size]
        input_ids = torch.tensor(b['input_ids'], device=dev)
        # A sequence of length L has only L - 1 next-token targets, so the
        # batch mean must be weighted by numel - batch_rows, not by numel.
        # Weighting by numel skews the average when chunk lengths differ.
        n = input_ids.numel() - input_ids.shape[0]
        if n <= 0:
            # Nothing to predict in this batch; its loss would be undefined.
            continue
        out = model(input_ids=input_ids, labels=input_ids)
        total_loss += out.loss.item() * n
        total_tokens += n
    if total_tokens == 0:
        raise ValueError('compute_ppl needs at least one sequence with two or more tokens')
    return math.exp(total_loss / total_tokens)


## 3) Baseline Perplexity (Base Model)

In [None]:
# Baseline: measure the untouched base checkpoint on the validation chunks.
base_tokenizer = AutoTokenizer.from_pretrained(
    BASE_MODEL_ID,
    trust_remote_code=True,
)
# Fall back to the EOS token when the tokenizer defines no padding token.
if base_tokenizer.pad_token is None:
    base_tokenizer.pad_token = base_tokenizer.eos_token

base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_ID, trust_remote_code=True, torch_dtype=dtype)
base_model.to(device)
# NOTE(review): base_model stays on `device` after this cell while later cells
# load further copies of the model — consider releasing it if memory is tight.

# Chunk the validation files at the evaluation block size, then cap the chunk
# count (trim seed SEED + 1).
val_ds_eval = maybe_trim(
    build_chunk_dataset(val_split, base_tokenizer, EVAL_BLOCK_SIZE),
    MAX_VAL_CHUNKS,
    SEED + 1,
)
print(f'eval val chunks = {len(val_ds_eval)}')

baseline_val_ppl = compute_ppl(base_model, val_ds_eval, batch_size=EVAL_BATCH_SIZE)
print(f'baseline validation perplexity = {round(baseline_val_ppl, 4)}')


## 4) Optional Training from HF Dataset

In [None]:
# Optional LoRA fine-tuning over a sequence-length curriculum.
# The four names below are the cell's outputs; they keep these defaults when
# RUN_TRAINING is False.
trained_model = None
trained_tokenizer = None
stage_summaries = []
post_val_ppl = None

if not RUN_TRAINING:
    print('RUN_TRAINING=False, skipping training. Notebook can still evaluate HF model in final step.')
else:
    # Fresh tokenizer/model pair loaded from the base checkpoint.
    trained_tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, trust_remote_code=True)
    # Fall back to the EOS token when the tokenizer defines no padding token.
    if trained_tokenizer.pad_token is None:
        trained_tokenizer.pad_token = trained_tokenizer.eos_token

    trained_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL_ID,
        trust_remote_code=True,
        torch_dtype=dtype,
    )

    # LoRA adapter configuration: rank 16, alpha 32, dropout 0.05, applied to
    # the listed projection modules.
    lora_cfg = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj'],
    )
    trained_model = get_peft_model(trained_model, lora_cfg)
    trained_model.to(device)
    trained_model.print_trainable_parameters()

    # mlm=False selects the causal-LM (non-masked) collation mode.
    data_collator = DataCollatorForLanguageModeling(tokenizer=trained_tokenizer, mlm=False)
    # The epoch budget is split evenly across the curriculum stages.
    epochs_per_stage = TOTAL_EPOCHS / len(SEQUENCE_CURRICULUM)

    # One training stage per block size, in the order given by SEQUENCE_CURRICULUM.
    for stage_idx, block_size in enumerate(SEQUENCE_CURRICULUM, start=1):
        print(f'\n[Stage {stage_idx}] block_size={block_size}')

        # Re-chunk both splits at this stage's block size.
        train_ds = build_chunk_dataset(train_split, trained_tokenizer, block_size)
        val_ds = build_chunk_dataset(val_split, trained_tokenizer, block_size)

        # Cap the chunk counts; the trim seed varies with the stage index.
        train_ds = maybe_trim(train_ds, MAX_TRAIN_CHUNKS, SEED + stage_idx)
        val_ds = maybe_trim(val_ds, MAX_VAL_CHUNKS, SEED + 100 + stage_idx)

        print('train_chunks =', len(train_ds), '| val_chunks =', len(val_ds))

        # Each stage writes to its own output directory and uses its own seed.
        args = TrainingArguments(
            output_dir=str(OUTPUT_DIR / f'stage_{stage_idx}_bs{block_size}'),
            num_train_epochs=epochs_per_stage,
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=EVAL_BATCH_SIZE,
            gradient_accumulation_steps=GRAD_ACCUM_STEPS,
            learning_rate=LEARNING_RATE,
            max_grad_norm=MAX_GRAD_NORM,
            eval_strategy='epoch',
            save_strategy='no',
            logging_steps=10,
            report_to='none',
            fp16=(device == 'cuda'),
            do_train=True,
            do_eval=True,
            seed=SEED + stage_idx,
        )

        # A new Trainer is built per stage, but the same `trained_model` object
        # is passed every time, so the stages train it successively.
        trainer = Trainer(
            model=trained_model,
            args=args,
            train_dataset=train_ds,
            eval_dataset=val_ds,
            data_collator=data_collator,
        )

        train_result = trainer.train()
        eval_result = trainer.evaluate(eval_dataset=val_ds)

        # Per-stage record; validation_loss is NaN when 'eval_loss' is missing.
        stage_summaries.append({
            'stage': stage_idx,
            'block_size': block_size,
            'epochs': epochs_per_stage,
            'train_chunks': len(train_ds),
            'val_chunks': len(val_ds),
            'training_loss': float(train_result.training_loss),
            'validation_loss': float(eval_result.get('eval_loss', float('nan'))),
        })

    # Post-training perplexity at EVAL_BLOCK_SIZE, using the same trim seed
    # (SEED + 1) as the baseline measurement.
    post_val_ds = build_chunk_dataset(val_split, trained_tokenizer, EVAL_BLOCK_SIZE)
    post_val_ds = maybe_trim(post_val_ds, MAX_VAL_CHUNKS, SEED + 1)
    post_val_ppl = compute_ppl(trained_model, post_val_ds, batch_size=EVAL_BATCH_SIZE)
    print('post-training validation perplexity =', round(post_val_ppl, 4))

    # NOTE(review): trained_model is PEFT-wrapped, so this presumably saves the
    # adapter rather than merged full weights — confirm before loading MODEL_DIR.
    MODEL_DIR.mkdir(parents=True, exist_ok=True)
    trained_model.save_pretrained(str(MODEL_DIR))
    trained_tokenizer.save_pretrained(str(MODEL_DIR))

    # Run summary written to METRICS_FILE. `baseline_val_ppl` is defined by the
    # baseline perplexity cell, which therefore has to run before this one.
    metrics = {
        'track': 'Track 1',
        'dataset_repo': HF_DATASET_REPO,
        'base_model_id': BASE_MODEL_ID,
        'sequence_curriculum': SEQUENCE_CURRICULUM,
        'eval_block_size': EVAL_BLOCK_SIZE,
        'epochs': TOTAL_EPOCHS,
        'batch_size': BATCH_SIZE,
        'eval_batch_size': EVAL_BATCH_SIZE,
        'grad_accum_steps': GRAD_ACCUM_STEPS,
        'learning_rate': LEARNING_RATE,
        'max_grad_norm': MAX_GRAD_NORM,
        'train_files': len(train_split),
        'val_files': len(val_split),
        'baseline_validation_perplexity': round(baseline_val_ppl, 4),
        'post_training_validation_perplexity': round(post_val_ppl, 4),
        'perplexity_reduction': round(baseline_val_ppl - post_val_ppl, 4),
        'improvement_percent': round(((baseline_val_ppl - post_val_ppl) / baseline_val_ppl) * 100, 4),
        'stage_summaries': stage_summaries,
    }

    METRICS_FILE.write_text(json.dumps(metrics, indent=2))
    print('saved metrics ->', METRICS_FILE)
    print('saved model   ->', MODEL_DIR)


## 5) Final Perplexity Display from HF Model

In [None]:
# Evaluate the checkpoint named by HF_FINAL_MODEL_REPO. In-notebook training
# is not required, but `val_split`, the helper functions and the config
# values from the earlier cells must already be defined.
final_tokenizer = AutoTokenizer.from_pretrained(
    HF_FINAL_MODEL_REPO,
    trust_remote_code=True,
    fix_mistral_regex=True,
)
# Fall back to the EOS token when the tokenizer defines no padding token.
if final_tokenizer.pad_token is None:
    final_tokenizer.pad_token = final_tokenizer.eos_token

final_model = AutoModelForCausalLM.from_pretrained(HF_FINAL_MODEL_REPO, trust_remote_code=True, torch_dtype=dtype)
final_model.to(device)

# Chunk the validation files at the evaluation block size and cap the chunk
# count (trim seed SEED + 1), tokenizing with the final model's own tokenizer.
final_val_ds = maybe_trim(
    build_chunk_dataset(val_split, final_tokenizer, EVAL_BLOCK_SIZE),
    MAX_VAL_CHUNKS,
    SEED + 1,
)
final_val_ppl = compute_ppl(final_model, final_val_ds, batch_size=EVAL_BATCH_SIZE)

print(f'HF final model repo                  = {HF_FINAL_MODEL_REPO}')
print(f'validation chunks used              = {len(final_val_ds)}')
print(f'final model validation perplexity   = {round(final_val_ppl, 4)}')

# When a metrics file exists on disk, show the perplexity it recorded for
# comparison.
if METRICS_FILE.exists():
    m = json.loads(METRICS_FILE.read_text())
    print(f'recorded post-training val perplexity = {m.get("post_training_validation_perplexity")}')


## 6) Notes

- Set `RUN_TRAINING=True` if you want end-to-end training in this notebook.
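- If you only need a final perplexity check, keep `RUN_TRAINING=False`, run the import, config, dataset-loading, and helper cells first (the last cell depends on `val_split`, `build_chunk_dataset`, `maybe_trim`, and `compute_ppl`), and then run the last cell.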
- For adapter-only HF repos, load base model + adapter with `PeftModel.from_pretrained(...)`.
