# RLHF with PPOTrainer and PEFT (LoRA)
This notebook demonstrates how to fine-tune a base language model using Proximal Policy Optimization (PPO) with Parameter-Efficient Fine-Tuning (PEFT) via LoRA adapters. The PPOTrainer will instantiate a reference model internally (i.e., `ref_model=None`).

In [1]:
from pathlib import Path
import torch
import numpy as np

# Hugging Face Transformers & Datasets
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForCausalLM,
    GenerationConfig,
    DataCollatorWithPadding
)
from datasets import Dataset, load_dataset

# PEFT for Parameter-Efficient Fine-Tuning
from peft import PeftModel, LoraConfig, TaskType, get_peft_model

# TRL for RLHF
from trl import (
    PPOTrainer,
    PPOConfig,
    AutoModelForCausalLMWithValueHead
)
from trl.core import LengthSampler

# Evaluation
import evaluate

# Utilities
from tqdm.auto import tqdm

tqdm.pandas()

## 1. Load and Prepare Data

In [2]:
DATA_PATH = Path("../data/hellaswag_format/personal_chat_sessions_train_hellaswag.jsonl")

def load_jsonl_pydantic(path: Path):
    from shared_models import HellaSwagEntry
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            yield HellaSwagEntry.model_validate_json(line)

data_pairs = []
for ex in load_jsonl_pydantic(DATA_PATH):
    endings = [ex.ending0, ex.ending1, ex.ending2, ex.ending3, ex.ending4]
    human_resp = endings[ex.label].strip()
    data_pairs.append({
        "context": ex.context.strip(),
        "human_resp": human_resp
    })

raw_dataset = Dataset.from_list(data_pairs)
train_test = raw_dataset.train_test_split(test_size=0.1, seed=42)
train_ds, test_ds = train_test["train"], train_test["test"]

## 2. Tokenization

In [3]:
BASE_MODEL_NAME = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
max_len = 128

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})

def tokenize_fn(examples):
    return tokenizer(
        examples["context"],
        truncation=True,
        padding="max_length",
        max_length=max_len
    )

train_ds = train_ds.map(tokenize_fn, batched=True, remove_columns=train_ds.column_names)
test_ds  = test_ds.map(tokenize_fn, batched=True, remove_columns=test_ds.column_names)

data_collator = DataCollatorWithPadding(tokenizer, return_tensors="pt")

Map:   0%|          | 0/20053 [00:00<?, ? examples/s]

Map:   0%|          | 0/2229 [00:00<?, ? examples/s]

## 3. Configure PEFT (LoRA)

In [4]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    task_type=TaskType.CAUSAL_LM
)

## 4. Load Base Model with Value Head and Attach LoRA

In [5]:
model = AutoModelForCausalLMWithValueHead.from_pretrained(
    BASE_MODEL_NAME,
    is_trainable=True
)

model = get_peft_model(model, lora_config)



## 5. Generation Configuration

In [6]:
gen_config = GenerationConfig.from_pretrained(BASE_MODEL_NAME)
gen_config.min_length = 1
gen_config.max_new_tokens = 50

model.generation_config = gen_config

## 6. Reward Model Setup

In [7]:
REWARD_MODEL_PATH = "../data/models/reward_model_ckpts/checkpoint-3753"

reward_tokenizer = AutoTokenizer.from_pretrained(REWARD_MODEL_PATH)
if reward_tokenizer.pad_token is None:
    reward_tokenizer.add_special_tokens({"pad_token": "[PAD]"})

from transformers import AutoModelForSequenceClassification
reward_model = AutoModelForSequenceClassification.from_pretrained(
    REWARD_MODEL_PATH,
    num_labels=1
)

value_model = AutoModelForSequenceClassification.from_pretrained(
    REWARD_MODEL_PATH,
    num_labels=1
)

preference_pipe = pipeline(
    'text-classification',
    model=reward_model,
    tokenizer=reward_tokenizer,
    framework="pt",
    return_all_scores=False
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0


## 7. PPO Trainer Configuration

In [8]:
ppo_config = PPOConfig(
    output_dir   = "../data/models/rlhf_ckpts",
    num_ppo_epochs = 3,
    batch_size      = 16,
    mini_batch_size = 4
)

ppo_trainer = PPOTrainer(
    ppo_config,
    model        = model,
    ref_model    = None,
    processing_class    = tokenizer,
    reward_model = reward_model,
    value_model = value_model,
    train_dataset= train_ds,
    eval_dataset = test_ds,
    data_collator= data_collator
)

## 8. Training Loop

In [9]:
ppo_trainer.train()

===training policy===


ValueError: Ref Output of shape torch.Size([8, 181, 151665]) doesn't have logits