# RLHF with PPOTrainer and PEFT (LoRA)
This notebook demonstrates how to fine-tune a base language model using Proximal Policy Optimization (PPO) with Parameter-Efficient Fine-Tuning (PEFT) via LoRA adapters. The PPOTrainer will instantiate a reference model internally (i.e., `ref_model=None`).

In [1]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [2]:
from pathlib import Path
import torch
import numpy as np

# Hugging Face Transformers & Datasets
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForCausalLM,
    GenerationConfig,
    DataCollatorWithPadding
)
from datasets import Dataset, load_dataset

# PEFT for Parameter-Efficient Fine-Tuning
from peft import PeftModel, LoraConfig, TaskType, get_peft_model

# TRL for RLHF
from trl import (
    PPOTrainer,
    PPOConfig,
    AutoModelForCausalLMWithValueHead
)
from trl.core import LengthSampler

# Evaluation
import evaluate

# Utilities
from tqdm.auto import tqdm

tqdm.pandas()

## 1. Load and Prepare Data

In [3]:
DATA_PATH = Path("../data/hellaswag_format/personal_chat_sessions_train_hellaswag.jsonl")
MIN_WORDS = 3

def load_jsonl_pydantic(path: Path):
    from shared_models import HellaSwagEntry
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            yield HellaSwagEntry.model_validate_json(line)

# Starting from your raw `Dataset`
def has_enough_words(example):
    return len(example["context"].split()) >= MIN_WORDS

data_pairs = []
for ex in load_jsonl_pydantic(DATA_PATH):
    endings = [ex.ending0, ex.ending1, ex.ending2, ex.ending3, ex.ending4]
    human_resp = endings[ex.label].strip()
    data_pairs.append({
        "context": ex.context.strip(),
        "human_resp": human_resp
    })

raw_dataset = Dataset.from_list(data_pairs)
raw_dataset = raw_dataset.filter(has_enough_words)

train_test = raw_dataset.train_test_split(test_size=0.1, seed=42)
train_ds, test_ds = train_test["train"], train_test["test"]

Filter:   0%|          | 0/22282 [00:00<?, ? examples/s]

## 2. Tokenization

In [4]:
BASE_MODEL_NAME = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
tokenizer.max_length =55

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    
def tokenize_fn(examples):
    return tokenizer(
        examples["context"],
        truncation=True,
        padding="max_length",
        max_length=55
    )

train_ds = train_ds.map(tokenize_fn, batched=True, remove_columns=train_ds.column_names)
test_ds  = test_ds.map(tokenize_fn, batched=True, remove_columns=test_ds.column_names)

data_collator = DataCollatorWithPadding(tokenizer, return_tensors="pt")

Map:   0%|          | 0/19955 [00:00<?, ? examples/s]

Map:   0%|          | 0/2218 [00:00<?, ? examples/s]

## 3. Configure PEFT (LoRA)

In [5]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    task_type=TaskType.CAUSAL_LM
)

## 4. Load Base Model with Value Head and Attach LoRA

In [6]:
model = AutoModelForCausalLMWithValueHead.from_pretrained(
    BASE_MODEL_NAME,
    is_trainable=True
)

print("pad_token_id:", tokenizer.pad_token_id)
print("vocab size:", len(tokenizer))



pad_token_id: 151643
vocab size: 151665


## 5. Generation Configuration

In [7]:
gen_config = GenerationConfig.from_pretrained(BASE_MODEL_NAME)

model.generation_config = gen_config
model.generation_config.pad_token_id = tokenizer.pad_token_id

## 6. Reward Model Setup

In [8]:
REWARD_MODEL_PATH = "../data/models/reward_model_ckpts_test/checkpoint-3753"

reward_tokenizer = AutoTokenizer.from_pretrained(REWARD_MODEL_PATH)
reward_tokenizer.max_length = 128

if reward_tokenizer.pad_token is None:
    reward_tokenizer.pad_token = reward_tokenizer.eos_token

from transformers import AutoModelForSequenceClassification
reward_model = AutoModelForSequenceClassification.from_pretrained(
    REWARD_MODEL_PATH,
    num_labels=1
)

value_model = AutoModelForSequenceClassification.from_pretrained(
    REWARD_MODEL_PATH,
    num_labels=1
)

value_model.config.pad_token_id = reward_tokenizer.pad_token_id
reward_model.config.pad_token_id = reward_tokenizer.pad_token_id

preference_pipe = pipeline(
    'text-classification',
    model=reward_model,
    tokenizer=reward_tokenizer,
    framework="pt",
    return_all_scores=False
)

Device set to use cuda:0


## 7. PPO Trainer Configuration

In [9]:
import torch
import traceback
from transformers import GenerationConfig
import trl.trainer.utils as utils  # adjust if your utils path is different

# Keep a reference if you ever want to restore the original
_orig_generate = utils.generate

def debug_generate(
    lm_backbone: torch.nn.Module,
    queries: torch.Tensor,
    pad_token_id: int,
    generation_config: GenerationConfig,
) -> tuple[torch.Tensor, torch.Tensor]:
    context_length = queries.shape[1]
    attention_mask = queries != pad_token_id
    input_ids = torch.masked_fill(queries, ~attention_mask, 0)

     # ── NEW: skip any example with zero non-pad tokens ────────────────────────────
    # If *every* token in *all* queries is pad, we bail out in one go:
    if attention_mask.sum().item() == 0:
        # Build a “no-context” output: echo the queries + a single EOS per example
        batch_size = queries.size(0)
        # sequences: [original_queries | eos_token]
        eos_seq = queries.new_full((batch_size, 1), lm_backbone.config.eos_token_id)
        seqs = torch.cat((queries, eos_seq), dim=1)
        # logits: zeros of shape [batch_size, max_new_tokens, vocab_size]
        max_new = generation_config.max_new_tokens
        vocab   = lm_backbone.config.vocab_size
        logits  = queries.new_zeros((batch_size, max_new, vocab))
        return seqs, logits
    # ──────────────────────────────────────────────────────────────────────────────

    # ─── Pre‐generate diagnostics ───────────────────────────────────────────────
    print(f"[debug_generate] queries.shape       = {queries.shape}   device={queries.device}")
    print(f"[debug_generate] pad_token_id        = {pad_token_id}")
    print(f"[debug_generate] context_length      = {context_length}")
    print(f"[debug_generate] max(input_ids)      = {input_ids.max().item()}   vocab_size={lm_backbone.config.vocab_size}")
    print(f"[debug_generate] attention_mask.sum = {attention_mask.sum().item()} tokens unmasked")
    print(f"[debug_generate] generation_config   = {generation_config}")

    # ─── Call into the model and catch CUDA asserts exactly here ───────────────
    try:
        torch.cuda.synchronize()
        output = lm_backbone.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
        )
        torch.cuda.synchronize()
    except Exception:
        print("✖ Error inside lm_backbone.generate()")
        traceback.print_exc()
        raise

    # ─── Post‐generate diagnostics ──────────────────────────────────────────────
    print(f"[debug_generate] output.sequences.shape = {output.sequences.shape}")
    print(f"[debug_generate] # of output.scores     = {len(output.scores)}")
    if len(output.scores) > 0:
        print(f"[debug_generate] output.scores[0].shape = {output.scores[0].shape}")

    # ─── Stack and concat ──────────────────────────────────────────────────────
    try:
        logits = torch.stack(output.scores, dim=1)
    except Exception:
        print("✖ Error stacking logits")
        traceback.print_exc()
        raise

    seqs = torch.cat((queries, output.sequences[:, context_length:]), dim=1)
    print(f"[debug_generate] final seqs.shape        = {seqs.shape}")
    print(f"[debug_generate] final logits.shape      = {logits.shape}")

    return seqs, logits

# Monkey-patch the TRL utils.generate to our debug version
utils.generate = debug_generate

In [10]:
ppo_config = PPOConfig(
    output_dir   = "../data/models/rlhf_ckpts",
    num_ppo_epochs = 3,
    batch_size      = 16,
    mini_batch_size = 4
)

ppo_trainer = PPOTrainer(
    ppo_config,
    model        = model,
    ref_model    = None,
    peft_config = lora_config,
    processing_class    = tokenizer,
    reward_model = reward_model,
    value_model = value_model,
    train_dataset= train_ds,
    eval_dataset = test_ds,
    data_collator= data_collator
)

## 8. Training Loop

In [11]:
ppo_trainer.train()

===training policy===
[debug_generate] queries.shape       = torch.Size([8, 55])   device=cuda:0
[debug_generate] pad_token_id        = 151643
[debug_generate] context_length      = 55
[debug_generate] max(input_ids)      = 90159   vocab_size=151665
[debug_generate] attention_mask.sum = 351 tokens unmasked
[debug_generate] generation_config   = GenerationConfig {
  "do_sample": true,
  "max_new_tokens": 53,
  "temperature": 0.7000000999999999,
  "top_k": 0.0
}

[debug_generate] output.sequences.shape = torch.Size([8, 108])
[debug_generate] # of output.scores     = 53
[debug_generate] output.scores[0].shape = torch.Size([8, 151665])
[debug_generate] final seqs.shape        = torch.Size([8, 108])
[debug_generate] final logits.shape      = torch.Size([8, 53, 151665])


Step,Training Loss


[debug_generate] queries.shape       = torch.Size([8, 55])   device=cuda:0
[debug_generate] pad_token_id        = 151643
[debug_generate] context_length      = 55
[debug_generate] max(input_ids)      = 144736   vocab_size=151665
[debug_generate] attention_mask.sum = 399 tokens unmasked
[debug_generate] generation_config   = GenerationConfig {
  "do_sample": true,
  "max_new_tokens": 53,
  "temperature": 0.0100001,
  "top_k": 0.0
}

✖ Error inside lm_backbone.generate()


Traceback (most recent call last):
  File "C:\Users\aadhu\AppData\Local\Temp\ipykernel_1692\729358063.py", line 45, in debug_generate
    output = lm_backbone.generate(
        input_ids=input_ids,
    ...<3 lines>...
        output_scores=True,
    )
  File "C:\Users\aadhu\source\Anton\anton\.venv\Lib\site-packages\peft\peft_model.py", line 1875, in generate
    outputs = self.base_model.generate(*args, **kwargs)
  File "C:\Users\aadhu\source\Anton\anton\.venv\Lib\site-packages\trl\models\modeling_value_head.py", line 199, in generate
    return self.pretrained_model.generate(*args, **kwargs)
           ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^
  File "C:\Users\aadhu\source\Anton\anton\.venv\Lib\site-packages\torch\utils\_contextlib.py", line 116, in decorate_context
    return func(*args, **kwargs)
  File "C:\Users\aadhu\source\Anton\anton\.venv\Lib\site-packages\transformers\generation\utils.py", line 2465, in generate
    result = self._sample(
        input_ids,
    ...<5 li

RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
