# Training: DPO Fine-tuning

This notebook trains a model using Direct Preference Optimization (DPO).

For long training runs, run `scripts/train_dpo.py` directly from a terminal.
This notebook provides an interactive interface for monitoring and testing.


In [None]:
import sys, os
from pathlib import Path
# Move up to the project root only if the current directory is not already
# the root. The training script path used later (scripts/train_dpo.py) is the
# marker: without this guard, every re-run of the cell would climb one more
# directory and break all the relative paths below.
if not Path('scripts/train_dpo.py').exists():
    os.chdir('..')

# Make the project's `src` directory importable, without stacking duplicate
# entries on sys.path when the cell is executed more than once.
_src_dir = str(Path.cwd() / 'src')
if _src_dir not in sys.path:
    sys.path.insert(0, _src_dir)

# Configuration: values consumed by the training, generation and sync cells.
DATASET = 'results/preference_pairs_100.jsonl'  # path relative to the project root
MODEL = 'qwen2_7b'                              # passed to train_dpo.py as --base-model
OUTPUT = 'models/qwen_7b_contemplative'         # training output; later loaded as the adapter
EPOCHS = 3

print(f"Dataset: {DATASET}")
print(f"Model: {MODEL}")
print(f"Output: {OUTPUT}")


In [None]:
# Run training
!python scripts/train_dpo.py \
    --dataset {DATASET} \
    --base-model {MODEL} \
    --output {OUTPUT} \
    --epochs {EPOCHS} \
    --use-split-config \
    --device cuda


## Test Generation


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch

# Load the base model and tokenizer, then attach the trained adapter from OUTPUT.
# NOTE: PeftModel.from_pretrained injects the adapter layers into `base` in
# place (for LoRA-style adapters), so after this call `base` itself is no
# longer an untouched baseline. The baseline below is therefore produced by
# temporarily disabling the adapter instead of calling `base.generate`.
base = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-7B-Instruct", torch_dtype=torch.float16, device_map="cuda")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")
finetuned = PeftModel.from_pretrained(base, OUTPUT)

# Test prompt, tokenized once and reused for both generations.
prompt = "How should I respond when someone is being unkind?"
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
# Number of prompt tokens; generate() returns prompt + continuation.
prompt_len = inputs["input_ids"].shape[1]


def _completion(output_ids):
    """Decode only the newly generated tokens of one output sequence.

    Slicing by token count is robust even when decoding the prompt does not
    reproduce the original prompt string character for character.
    """
    return tokenizer.decode(output_ids[prompt_len:])


print(f"Prompt: {prompt}\n")
print("Base model:")
# disable_adapter() bypasses the injected adapter for the duration of the
# block, giving the true base-model behaviour.
with torch.no_grad(), finetuned.disable_adapter():
    out = finetuned.generate(**inputs, max_new_tokens=100)
print(_completion(out[0]))

print("\nFine-tuned:")
with torch.no_grad():
    out = finetuned.generate(**inputs, max_new_tokens=100)
print(_completion(out[0]))


In [None]:
# Sync to S3
from utils.sagemaker_utils import sync_to_s3
import yaml

# Read the S3 settings from the SageMaker config file.
with open('configs/sagemaker_configs.yaml') as f:
    cfg = yaml.safe_load(f)

bucket = cfg['s3']['bucket']

# "your-bucket-contemplative-ai" is presumably the unedited placeholder from
# the config template (TODO confirm); upload only once a real bucket is set.
if bucket != "your-bucket-contemplative-ai":
    # Destination key mirrors the local output directory name under models/.
    sync_to_s3(OUTPUT, f"s3://{bucket}/models/{Path(OUTPUT).name}")
    print("✅ Synced to S3")
else:
    # Say so explicitly instead of finishing silently with nothing uploaded.
    print("Skipped S3 sync: s3.bucket in configs/sagemaker_configs.yaml is still the placeholder")
