# LoRA Fine-Tuning Notebook

This notebook mirrors the functionality of `lora_finetune.py` and lets you fine-tune a causal language model with LoRA adapters on a JSONL dataset. To launch training, update the configuration in the final cell, uncomment the `run_training(script_args)` call there, and execute the notebook from top to bottom.


In [None]:
from dataclasses import dataclass
from typing import Dict, List, Optional

from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)


## Configuration


In [None]:
@dataclass
class ScriptArguments:
    """Default paths and hyperparameters for a LoRA fine-tuning run.

    The field names and defaults mirror the CLI script; pass keyword
    overrides when constructing an instance to change any of them.
    """

    # --- Model and data locations ---
    model_name_or_path: str = "meta-llama/Llama-2-7b-hf"
    train_file: str = "train.jsonl"
    validation_file: Optional[str] = None  # no validation split when unset
    output_dir: str = "lora-finetuned-model"

    # --- Tokenization ---
    max_seq_length: int = 1024  # used as the tokenizer's max_length

    # --- Optimization schedule ---
    per_device_train_batch_size: int = 1
    per_device_eval_batch_size: int = 1
    learning_rate: float = 2e-4
    num_train_epochs: float = 3.0
    lr_scheduler_type: str = "cosine"
    warmup_ratio: float = 0.03
    weight_decay: float = 0.0
    gradient_accumulation_steps: int = 16

    # --- Logging and checkpointing ---
    logging_steps: int = 10
    save_steps: int = 500
    save_total_limit: int = 2

    # --- LoRA adapter shape ---
    lora_r: int = 64
    lora_alpha: int = 16
    lora_dropout: float = 0.05

    # --- Prompt construction ---
    # The template must contain the literal "{text}" and "{target}" markers.
    template: str = "### Input\n{text}\n\n### Response\n{target}"
    input_field: str = "text"  # dataset column substituted for {text}
    target_field: str = "target"  # dataset column substituted for {target}

    # --- Memory / precision options ---
    gradient_checkpointing: bool = False
    use_4bit: bool = False
    bnb_dtype: str = "bfloat16"  # name of a torch dtype attribute


## Helper Functions


In [None]:
def build_prompt(template: str, text: str, target: str) -> str:
    """Format a single prompt-response pair according to the provided template.

    Args:
        template: Prompt layout containing the literal placeholders
            ``{text}`` and ``{target}``.
        text: Value substituted for every ``{text}`` placeholder.
        target: Value substituted for every ``{target}`` placeholder.

    Returns:
        The template with both placeholders filled in. Placeholder-looking
        substrings inside ``text`` or ``target`` are kept verbatim.

    Raises:
        ValueError: If either placeholder is missing from ``template``.
    """
    if "{text}" not in template or "{target}" not in template:
        raise ValueError("Template must include '{text}' and '{target}' placeholders.")
    # Fill both placeholders in a single pass over the *template* only.
    # Chaining two str.replace calls would re-scan the already inserted text,
    # so an input that happens to contain "{target}" would be corrupted.
    return text.join(
        segment.replace("{target}", target) for segment in template.split("{text}")
    )


def preprocess_dataset(
    tokenizer: AutoTokenizer,
    dataset,
    template: str,
    input_field: str,
    target_field: str,
    max_seq_length: int,
):
    """Apply the prompt template and tokenize the dataset.

    Args:
        tokenizer: Callable tokenizer used to encode the formatted prompts.
        dataset: Object exposing ``map`` and ``column_names`` (for example one
            split returned by ``load_dataset``).
        template: Prompt template forwarded to ``build_prompt``.
        input_field: Column whose values replace ``{text}`` in the template.
        target_field: Column whose values replace ``{target}`` in the template.
        max_seq_length: Length every example is truncated or padded to.

    Returns:
        The result of ``dataset.map`` with the original columns removed and
        the tokenizer outputs plus a ``labels`` column in their place.
    """

    def _format_and_tokenize(batch: Dict[str, List[str]]):
        # Build one full prompt per row from the two configured columns.
        prompts = [
            build_prompt(template, text, target)
            for text, target in zip(batch[input_field], batch[target_field])
        ]
        tokenized = tokenizer(
            prompts,
            max_length=max_seq_length,
            truncation=True,
            padding="max_length",
            return_tensors="np",
        )
        labels = tokenized["input_ids"].copy()
        if "attention_mask" in tokenized:
            # Padded positions carry no training signal. -100 is the label
            # value that Hugging Face causal-LM losses ignore.
            labels[tokenized["attention_mask"] == 0] = -100
        tokenized["labels"] = labels
        return tokenized

    return dataset.map(_format_and_tokenize, batched=True, remove_columns=dataset.column_names)


## Training Routine


In [None]:
def run_training(args: ScriptArguments):
    """Fine-tune a causal language model with LoRA adapters and save the result.

    The routine loads the tokenizer and base model (optionally quantized to
    4 bits), wraps the model with LoRA adapters, loads and tokenizes the JSONL
    splits, trains with ``Trainer``, and finally writes the model and the
    tokenizer to ``args.output_dir``.

    Args:
        args: Run configuration; see ``ScriptArguments`` for the fields read.
    """
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
    if tokenizer.pad_token is None:
        # Padding to max_length needs a pad token; fall back to EOS.
        tokenizer.pad_token = tokenizer.eos_token

    quantization_config = None
    if args.use_4bit:
        import torch
        from transformers import BitsAndBytesConfig

        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=getattr(torch, args.bnb_dtype),
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
        )

    model = AutoModelForCausalLM.from_pretrained(
        args.model_name_or_path,
        device_map="auto" if quantization_config else None,
        quantization_config=quantization_config,
    )

    if quantization_config is not None:
        from peft import prepare_model_for_kbit_training

        # NOTE(review): prepare_model_for_kbit_training defaults to
        # use_gradient_checkpointing=True, so the 4-bit path appears to enable
        # checkpointing regardless of args.gradient_checkpointing - confirm
        # whether that is intended before changing it (memory impact).
        model = prepare_model_for_kbit_training(model)
    elif args.gradient_checkpointing:
        # With a frozen base model and checkpointing, the embedding outputs
        # must require grad or no gradient reaches the adapter weights.
        model.enable_input_require_grads()

    lora_config = LoraConfig(
        r=args.lora_r,
        lora_alpha=args.lora_alpha,
        lora_dropout=args.lora_dropout,
        bias="none",
        task_type="CAUSAL_LM",
    )

    model = get_peft_model(model, lora_config)

    dataset_dict = {"train": args.train_file}
    if args.validation_file:
        dataset_dict["validation"] = args.validation_file

    dataset = load_dataset("json", data_files=dataset_dict)

    tokenized_datasets = {
        split: preprocess_dataset(
            tokenizer,
            dataset[split],
            args.template,
            args.input_field,
            args.target_field,
            args.max_seq_length,
        )
        for split in dataset
    }

    # Keep the trainer's mixed-precision mode in step with the 4-bit compute
    # dtype instead of assuming bfloat16 whenever quantization is on.
    if args.use_4bit:
        use_bf16 = args.bnb_dtype == "bfloat16"
        use_fp16 = args.bnb_dtype == "float16"
    else:
        use_bf16 = False
        use_fp16 = True

    training_args = TrainingArguments(
        output_dir=args.output_dir,
        per_device_train_batch_size=args.per_device_train_batch_size,
        per_device_eval_batch_size=args.per_device_eval_batch_size,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        learning_rate=args.learning_rate,
        num_train_epochs=args.num_train_epochs,
        lr_scheduler_type=args.lr_scheduler_type,
        warmup_ratio=args.warmup_ratio,
        weight_decay=args.weight_decay,
        logging_steps=args.logging_steps,
        save_steps=args.save_steps,
        save_total_limit=args.save_total_limit,
        # Only evaluate when a validation split was actually loaded.
        evaluation_strategy="steps" if "validation" in tokenized_datasets else "no",
        fp16=use_fp16,
        bf16=use_bf16,
        gradient_checkpointing=args.gradient_checkpointing,
        report_to="none",
    )

    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets.get("validation"),
        data_collator=data_collator,
    )

    trainer.train()
    trainer.save_model()
    tokenizer.save_pretrained(args.output_dir)


## Launch Training


In [None]:
# The settings most likely to change between runs are spelled out here; every
# other field keeps its ScriptArguments default.
script_args = ScriptArguments(
    output_dir="lora-finetuned-model",
    model_name_or_path="meta-llama/Llama-2-7b-hf",
    train_file="train.jsonl",
    validation_file=None,
)

# Training is left disabled on purpose. Uncomment the call below to start it.
# run_training(script_args)
