# rlhf.py

Auto-generated implementation from the Agentic RL PhD codebase.

### Original Implementations & References
The following links point to the official or high-quality reference implementations for the papers covered in this notebook:

- https://github.com/mrahtz/learning-from-human-preferences

*Note: The code below is a simplified pedagogical implementation.*

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Paper: "Deep reinforcement learning from human preferences" (Christiano et al., 2017)
# Category: RLHF / Reward Modeling

class RewardModel(nn.Module):
    """
    The "Secret" Reward Model: an MLP mapping (state, action) features to a
    scalar reward.

    Training Objective: CrossEntropy( P(sigma_1 > sigma_2) )
    where P(sigma_1 > sigma_2) = exp(r1) / (exp(r1) + exp(r2))
    and r1, r2 are the summed predicted rewards of the two segments.

    Args:
        state_dim: Size of the state feature vector.
        action_dim: Size of the action feature vector.
        hidden_dim: Width of the two hidden layers (default 64).
    """
    def __init__(self, state_dim, action_dim, hidden_dim=64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim + action_dim, hidden_dim),
            nn.LeakyReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.LeakyReLU(),
            nn.Linear(hidden_dim, 1)  # Outputs a scalar reward 'r'
        )

    def forward(self, state, action=None):
        """
        Predict the reward for each (state, action) pair.

        Args:
            state: (..., state_dim) tensor, or, when ``action`` is omitted,
                an already-concatenated (..., state_dim + action_dim) tensor.
            action: Optional (..., action_dim) tensor with the same leading
                dimensions as ``state``.

        Returns:
            (..., 1) tensor of predicted rewards.
        """
        # Accept pre-concatenated features so the model can be applied
        # directly to segments laid out as (Batch, Len, State+Act).
        if action is None:
            features = state
        else:
            features = torch.cat([state, action], dim=-1)
        return self.net(features)

def compute_preference_loss(reward_model, segment_1, segment_2, labels):
    """
    Bradley-Terry preference loss over pairs of trajectory segments.

    Args:
        reward_model: Callable mapping a (Batch, Len, State+Act) tensor to
            per-step rewards of shape (Batch, Len, 1) or (Batch, Len).
        segment_1: (Batch, Len, State+Act) - Trajectory A
        segment_2: (Batch, Len, State+Act) - Trajectory B
        labels: (Batch,) - 0 if A preferred, 1 if B preferred, 0.5 if tie.
            Integer labels are cast to the dtype of the logits.

    Returns:
        Scalar tensor: mean binary cross-entropy between
        sigmoid(r2 - r1) and ``labels``.
    """
    # Sum rewards over the trajectory segment. Flattening everything after
    # the batch dim first removes the trailing singleton reward dim, so the
    # result is (Batch,) and lines up with `labels` instead of (Batch, 1).
    r1 = reward_model(segment_1).flatten(start_dim=1).sum(dim=1)
    r2 = reward_model(segment_2).flatten(start_dim=1).sum(dim=1)

    # Bradley-Terry Model: P(1 > 2) = sigmoid(r1 - r2).
    # Label 1 means segment 2 is preferred, so the logit of the positive
    # class is r2 - r1.
    logits = r2 - r1

    # BCE-with-logits needs float targets on the same device as the logits.
    targets = torch.as_tensor(labels, dtype=logits.dtype, device=logits.device)

    # Cross Entropy Loss
    return F.binary_cross_entropy_with_logits(logits, targets)
