### Reward Model with TRL RewardTrainer
This notebook uses the TRL `RewardTrainer` (v0.17.0) to train a reward model on HellaSwag-style chat data.


In [1]:
import random
from pathlib import Path

from datasets import Dataset
from shared_models import HellaSwagEntry

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import LoraConfig, TaskType, get_peft_model
from trl import RewardTrainer, RewardConfig

#### Data Collection

In [2]:
DATA_PATH = Path("../data/hellaswag_format/personal_chat_sessions_train_hellaswag.jsonl")

def load_jsonl_pydantic(path: Path):
    """Lazily parse a JSON Lines file into ``HellaSwagEntry`` objects.

    Args:
        path: Location of a UTF-8 encoded file holding one JSON document
            per line.

    Yields:
        One ``HellaSwagEntry`` per non-blank line, validated by Pydantic.

    Raises:
        pydantic.ValidationError: If a non-blank line does not validate
            against ``HellaSwagEntry``.
    """
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (typically a trailing newline at end of file) are
            # not records; validating them would abort the whole load.
            if not line.strip():
                continue
            yield HellaSwagEntry.model_validate_json(line)

In [3]:
# Build pairwise examples: one (positive, negative) pair per entry.
# A dedicated, seeded RNG keeps the sampled negatives and the A/B ordering
# reproducible, including when this cell is re-run in the same kernel.
PAIR_SEED = 42
pair_rng = random.Random(PAIR_SEED)

pairs = []
for ex in load_jsonl_pydantic(DATA_PATH):
    endings = [ex.ending0, ex.ending1, ex.ending2, ex.ending3, ex.ending4]
    pos_id = ex.label
    # Draw the negative uniformly from every ending except the labelled one.
    neg_id = pair_rng.choice([i for i in range(len(endings)) if i != pos_id])

    pos_txt, neg_txt = endings[pos_id].strip(), endings[neg_id].strip()
    context = ex.context.strip()

    # Randomly order A/B; `label` is 1 when the positive ending comes first.
    if pair_rng.random() < 0.5:
        first, second, lbl = pos_txt, neg_txt, 1
    else:
        first, second, lbl = neg_txt, pos_txt, 0

    pairs.append({
        "context": context,
        "first_resp": first,
        "second_resp": second,
        "label": lbl
    })

In [4]:
# Wrap the list of pair dicts in a Hugging Face Dataset, then carve off
# 10% of the rows as a test split (fixed seed so the split is repeatable).
dataset = Dataset.from_list(pairs)
train_test = dataset.train_test_split(seed=42, test_size=0.1)

#### Prepare for RewardTrainer
Convert each pair to the `"chosen"` / `"rejected"` format required by `RewardTrainer`.

In [5]:
def map_to_reward(examples):
    """Turn a batch of A/B pairs into ``chosen`` / ``rejected`` columns.

    Args:
        examples: Batch dict with parallel lists under ``"label"``,
            ``"first_resp"`` and ``"second_resp"``. A label equal to 1 means
            the first response is the preferred one; any other value means
            the second response is preferred.

    Returns:
        Dict with two parallel lists, ``"chosen"`` and ``"rejected"``.

    NOTE(review): only the responses are emitted here; the ``"context"``
    column is not read by this function. Confirm that scoring endings
    without their context is intended.
    """
    ordered = [
        (first, second) if flag == 1 else (second, first)
        for flag, first, second in zip(
            examples["label"], examples["first_resp"], examples["second_resp"]
        )
    ]
    return {
        "chosen": [winner for winner, _ in ordered],
        "rejected": [loser for _, loser in ordered],
    }

# Apply the conversion to every split in batches, dropping all of the
# original columns so only the columns returned by the mapper remain.
rm_dataset = train_test.map(
    map_to_reward,
    remove_columns=train_test["train"].column_names,
    batched=True,
)

Map:   0%|          | 0/20053 [00:00<?, ? examples/s]

Map:   0%|          | 0/2229 [00:00<?, ? examples/s]

#### Model & Tokenizer

In [6]:
model_ckpt = "trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5"
# Load the tokenizer from the same checkpoint that the model is loaded from.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, padding_side="left")
# `model_max_length` is the length attribute tokenizers actually read;
# a plain `max_length` attribute is ignored.
tokenizer.model_max_length = 128

if tokenizer.pad_token is None:
    if tokenizer.eos_token is not None:
        # Reuse an existing token so the vocabulary size does not change.
        tokenizer.pad_token = tokenizer.eos_token
    else:
        # Last resort: this grows the vocabulary, so the model's embedding
        # matrix must be resized to len(tokenizer) after the model is loaded.
        tokenizer.add_special_tokens({"pad_token": "[PAD]"})

# Guarantee the attribute exists (None when the checkpoint ships no template).
tokenizer.chat_template = getattr(tokenizer, "chat_template", None)

In [7]:
# Single-scalar head for reward: num_labels=1 gives one output score per sequence.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=1,
)

# If the tokenizer holds more tokens than the embedding matrix has rows
# (e.g. a pad token was added to it), grow the embeddings so every token id
# is a valid index. Never shrink: some checkpoints pad their embeddings.
if len(tokenizer) > model.get_input_embeddings().num_embeddings:
    model.resize_token_embeddings(len(tokenizer))

# Tell the model which id is padding, matching the tokenizer.
model.config.pad_token_id = tokenizer.pad_token_id

#### LoRA Setup

In [8]:
# LoRA adapter settings for a sequence-classification model: rank-4 adapters
# on the q_proj / v_proj modules, with dropout applied to the adapter path.
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    target_modules=["q_proj", "v_proj"],
    r=4,
    lora_alpha=32,
    lora_dropout=0.05,
    inference_mode=False,
)

# Rebind `model` to the PEFT-wrapped model.
# NOTE: this cell is not idempotent; re-running it passes the already
# wrapped model to get_peft_model again.
model = get_peft_model(model, peft_config)

#### Training Configuration

In [9]:
training_args = RewardConfig(
    # Where checkpoints go; existing contents may be overwritten.
    output_dir="../data/models/reward_model_ckpts_test",
    overwrite_output_dir=True,
    # Run both training and evaluation.
    do_train=True,
    do_eval=True,
    # Evaluate and checkpoint once per epoch, then restore the checkpoint
    # selected by the "loss" metric when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    # Optimisation settings.
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    # Reward-specific settings.
    max_length=128,
    disable_dropout=False,  # keep dropout active during training
)

#### Initialize & Run RewardTrainer

In [10]:
# Assemble the trainer from the model, tokenizer, config and the
# chosen/rejected splits.
# NOTE(review): `model` has already been passed through get_peft_model, and
# the same `peft_config` is passed again here - confirm the trainer does
# not wrap it a second time.
trainer = RewardTrainer(
    model=model,
    processing_class=tokenizer,
    args=training_args,
    peft_config=peft_config,
    train_dataset=rm_dataset["train"],
    eval_dataset=rm_dataset["test"],
)


Map:   0%|          | 0/20053 [00:00<?, ? examples/s]

Map:   0%|          | 0/20053 [00:00<?, ? examples/s]

Filter:   0%|          | 0/20053 [00:00<?, ? examples/s]

Map:   0%|          | 0/2229 [00:00<?, ? examples/s]

Map:   0%|          | 0/2229 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2229 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [11]:
# Evaluate before any training step to get a reference point.
baseline_metrics = trainer.evaluate()
print("Baseline:", baseline_metrics)


You're using a Qwen2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Baseline: {'eval_loss': 0.6993284225463867, 'eval_model_preparation_time': 0.0012, 'eval_accuracy': 0.48742138364779874, 'eval_runtime': 1.863, 'eval_samples_per_second': 1194.852, 'eval_steps_per_second': 37.574}


In [12]:
# Run training; as the last expression in the cell, the value returned by
# train() is shown as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss,Model Preparation Time,Accuracy
1,0.6762,0.660428,0.0012,0.730458
2,0.6403,0.634826,0.0012,0.810872
3,0.6285,0.626181,0.0012,0.822102


TrainOutput(global_step=3753, training_loss=0.6526059222608892, metrics={'train_runtime': 200.7758, 'train_samples_per_second': 298.99, 'train_steps_per_second': 18.692, 'total_flos': 0.0, 'train_loss': 0.6526059222608892, 'epoch': 3.0})

In [13]:
# Evaluate again after training, for comparison with the baseline numbers.
final_metrics = trainer.evaluate()
print("Final:", final_metrics)

Final: {'eval_loss': 0.6261805891990662, 'eval_model_preparation_time': 0.0012, 'eval_accuracy': 0.8221024258760108, 'eval_runtime': 1.751, 'eval_samples_per_second': 1271.254, 'eval_steps_per_second': 39.977, 'epoch': 3.0}
