# DPO Training Notebook

This notebook allows you to train an LLM using Direct Preference Optimization (DPO). Parameters are configured in the `Config` class below for easy interactive modification.

In [None]:
import os
from dataclasses import dataclass

import torch
from datasets import Dataset
from peft import LoraConfig
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import DPOTrainer, DPOConfig

In [None]:
@dataclass
class Config:
    """
    Configuration class to replace argparse for notebook usage.

    Every field has a default, so ``Config()`` yields the stock setup.
    Individual values can be overridden at construction time, e.g.
    ``Config(lora_r=0, beta=0.2)``, or by assigning to the attribute
    afterwards (``args.batch_size = 4``).
    """

    # Model & Data Paths
    base_model_path: str = "mistralai/Mistral-7B-v0.1"  # Replace with your model path
    data_path: str = "data.json"  # Replace with your data path
    output_dir: str = "./results"

    # Training Hyperparameters
    num_train_epochs: int = 3
    learning_rate: float = 5e-5
    batch_size: int = 2  # forwarded as per_device_train_batch_size

    # LoRA Configuration
    lora_r: int = 16  # a value that is not > 0 disables LoRA in the LoRA cell
    lora_alpha: int = 32
    lora_dropout: float = 0.05

    # Text Generation / Processing
    max_length: int = 1024
    max_prompt_length: int = 512

    # DPO Specific
    beta: float = 0.1

    # Optimization
    # Both warmup settings are forwarded to DPOConfig.
    warmup_steps: int = 100
    warmup_ratio: float = 0.1
    max_grad_norm: float = 0.3


args = Config()

In [None]:
def training_data_processor(args):
    """
    Return the preference data for DPO training (demo stand-in).

    The function announces ``args.data_path`` but does not read it; it
    returns two hard-coded examples instead. Replace the body with real
    loading/formatting logic for an actual run.

    The result is a dict of three equally long lists keyed 'prompt',
    'chosen', and 'rejected'.
    """
    print(f"Loading data from {args.data_path}")

    # Example dummy data, one row per example: (prompt, chosen, rejected)
    demo_examples = [
        (
            "What is the capital of France?",
            "The capital of France is Paris.",
            "France's capital is Berlin.",
        ),
        (
            "Tell me about large language models.",
            "Large language models (LLMs) are deep learning models that can understand and generate human-like text.",
            "LLMs are small, simple neural networks.",
        ),
    ]

    # Transpose the rows into the column-oriented layout.
    column_names = ("prompt", "chosen", "rejected")
    return {
        name: [example[position] for example in demo_examples]
        for position, name in enumerate(column_names)
    }

In [None]:
# Load the tokenizer for the base model, padding on the left
tokenizer = AutoTokenizer.from_pretrained(
    args.base_model_path,
    padding_side="left",
)

# When the tokenizer has no pad token, reuse the end-of-sequence token
pad_token_missing = tokenizer.pad_token is None
if pad_token_missing:
    tokenizer.pad_token = tokenizer.eos_token  # Or another suitable token

In [None]:
# Load model
# Pick the weight dtype from the hardware:
#   - no CUDA device               -> float32
#   - CUDA device with bf16        -> bfloat16
#   - CUDA device without bf16     -> float16
# Checking bf16 support explicitly avoids requesting bfloat16 on GPUs that
# cannot run it.
if not torch.cuda.is_available():
    torch_dtype = torch.float32
    print("CUDA not available, using float32 for model.")
elif torch.cuda.is_bf16_supported():
    torch_dtype = torch.bfloat16
    print("CUDA available, using bfloat16 for model.")
else:
    torch_dtype = torch.float16
    print("CUDA available but bfloat16 is not supported, using float16 for model.")

model = AutoModelForCausalLM.from_pretrained(
    args.base_model_path,
    # NOTE(review): trust_remote_code runs Python code shipped with the
    # checkpoint repository; keep it only for sources you trust.
    trust_remote_code=True,
    # NOTE(review): ignore_mismatched_sizes presumably lets loading continue
    # when checkpoint and model weight shapes differ, which could hide a
    # wrong checkpoint; confirm it is actually needed.
    ignore_mismatched_sizes=True,
    torch_dtype=torch_dtype,
)

In [None]:
# Build the column-oriented preference data and wrap it in a `datasets.Dataset`
data_dict = training_data_processor(args)
dataset = Dataset.from_dict(mapping=data_dict)

In [None]:
# LoRA configuration
# A positive rank builds an adapter config; otherwise lora_config is None
# and that None is what gets passed on as the trainer's peft_config.
lora_config = (
    LoraConfig(
        task_type="CAUSAL_LM",
        target_modules="all-linear",
        r=args.lora_r,
        lora_alpha=args.lora_alpha,
        lora_dropout=args.lora_dropout,
        bias="none",
    )
    if args.lora_r > 0
    else None
)

if lora_config is None:
    print("LoRA disabled (lora_r is 0).")

In [None]:
# DPO training arguments
# Mixed precision follows the dtype the model was loaded with (`torch_dtype`,
# set in the model-loading cell). The earlier `fp16=not torch.cuda.is_available()`
# was inverted: it switched fp16 mixed precision on exactly when no GPU was
# present, while the model itself is loaded in float32 on CPU.
use_bf16 = torch_dtype == torch.bfloat16
use_fp16 = torch_dtype == torch.float16

training_args = DPOConfig(
    output_dir=args.output_dir,
    num_train_epochs=args.num_train_epochs,
    learning_rate=args.learning_rate,
    per_device_train_batch_size=args.batch_size,
    gradient_checkpointing=True,  # Enable for memory efficiency
    gradient_checkpointing_kwargs={"use_reentrant": True},
    max_grad_norm=args.max_grad_norm,
    lr_scheduler_type="cosine",
    logging_steps=5,
    optim="adamw_torch",
    loss_type="sigmoid",
    # NOTE(review): both warmup settings are passed. The transformers
    # TrainingArguments docs describe warmup_steps as overriding
    # warmup_ratio, so with a non-zero warmup_steps the ratio is
    # presumably ignored; confirm for the installed version.
    warmup_steps=args.warmup_steps,
    warmup_ratio=args.warmup_ratio,
    do_eval=False,
    max_prompt_length=args.max_prompt_length,
    max_length=args.max_length,
    seed=42,
    remove_unused_columns=False,
    fp16=use_fp16,
    bf16=use_bf16,
    beta=args.beta,
)

In [None]:
# Initialize DPOTrainer
# NOTE(review): no explicit reference model is supplied (ref_model=None);
# TRL is presumably expected to derive the reference policy itself. Confirm
# for the installed TRL version.
dpo_trainer = DPOTrainer(
    model=model,
    ref_model=None,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
    peft_config=lora_config,
)

In [None]:
# Where the final model is written: <output_dir>/final_checkpoint
final_path = os.path.join(args.output_dir, "final_checkpoint")

# Train the model
print("Starting DPO training...")
dpo_trainer.train()
print("Training complete. Saving model...")

# Save the trained model
dpo_trainer.save_model(output_dir=final_path)
print(f"Model saved to {final_path}")