# Day 17: Reinforcement Learning from Human Feedback (RLHF) Components

In this notebook, we'll explore the key components of the RLHF pipeline and implement simplified versions to understand how they work together. We'll focus on:

1. Generating response pairs for comparison
2. Training a reward model from human preferences
3. Implementing a simplified PPO algorithm for language models
4. Comparing RLHF results with SFT

## Overview

RLHF consists of three main stages:
1. Start with an SFT model
2. Train a reward model on human preferences
3. Optimize the model with PPO using the reward model

Let's implement simplified versions of each component to understand the process.

In [None]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments
)
from datasets import Dataset

# Seed every random number generator used in this notebook so runs are repeatable
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

## 1. Simulating an SFT Model

In a real RLHF pipeline, we would start with a model that has already been fine-tuned with SFT. For this demonstration, we'll use a small pre-trained model (GPT-2) as a stand-in for the SFT model.

In [None]:
# Checkpoint used as the starting policy; kept small so the demo runs quickly
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load the causal LM and place it on the selected device in one step
sft_model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

# When the tokenizer defines no pad token, reuse EOS for padding and
# record the same id on the model config so both stay in agreement
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    sft_model.config.pad_token_id = sft_model.config.eos_token_id

print(f"Model loaded: {model_name}")
print(f"Number of parameters: {sft_model.num_parameters():,}")

## 2. Generating Response Pairs for Comparison

In RLHF, we need pairs of responses to the same prompt so that humans can express a preference between them. Let's generate some response pairs using our simulated SFT model.

In [None]:
def generate_responses(model, prompt, num_responses=2, max_length=100, temperature=1.0):
    """Sample several continuations of the same prompt.

    Uses the module-level ``tokenizer`` and ``device``.

    Args:
        model: Model exposing a Hugging Face style ``generate`` method.
        prompt: Text to condition the generation on.
        num_responses: Number of independent samples to draw.
        max_length: Forwarded to ``model.generate`` as ``max_length``
            (in Hugging Face this caps prompt plus generated tokens).
        temperature: Sampling temperature forwarded to ``model.generate``.

    Returns:
        A list of ``num_responses`` strings, each holding only the newly
        generated text (prompt removed, surrounding whitespace stripped).
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    # Number of prompt tokens; everything after this index is new text
    prompt_length = inputs["input_ids"].shape[1]
    responses = []

    for _ in range(num_responses):
        with torch.no_grad():
            output = model.generate(
                inputs["input_ids"],
                # Pass the mask explicitly: pad may be aliased to EOS, so it
                # cannot be inferred reliably from the token ids alone
                attention_mask=inputs.get("attention_mask"),
                max_length=max_length,
                do_sample=True,
                temperature=temperature,
                top_p=0.9,
                pad_token_id=tokenizer.pad_token_id,
            )

        # Drop the prompt at the token level. Slicing the decoded string by
        # len(prompt) is fragile because decoding is not guaranteed to
        # reproduce the prompt character-for-character.
        generated_ids = output[0][prompt_length:]
        response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
        responses.append(response)

    return responses

# Prompts used to collect the first batch of comparison data
prompts = [
    "Explain the concept of artificial intelligence to a 5-year-old.",
    "Write a short poem about nature.",
    "What are three ways to reduce stress?",
    "Describe the process of photosynthesis briefly.",
    "Give me a recipe for chocolate chip cookies.",
]

# For each prompt, sample one response at temperature 0.7 (A) and one at
# temperature 1.0 (B) so the two candidates differ
response_pairs = []
for prompt in prompts:
    response_a, response_b = (
        generate_responses(sft_model, prompt, num_responses=1, temperature=temperature)[0]
        for temperature in (0.7, 1.0)
    )
    response_pairs.append(
        {"prompt": prompt, "response_a": response_a, "response_b": response_b}
    )

# Show the first pair as a sanity check
example = response_pairs[0]
print(f"Prompt: {example['prompt']}\n")
print(f"Response A:\n{example['response_a']}\n")
print(f"Response B:\n{example['response_b']}")

## 3. Simulating Human Preferences

In a real RLHF pipeline, human annotators would provide preferences between response pairs. For this demonstration, we'll simulate human preferences using some heuristics.

In [None]:
def _content_words(text):
    """Return the lowercase words of ``text`` with surrounding punctuation removed."""
    import string

    stripped = (word.strip(string.punctuation) for word in text.lower().split())
    # Tokens made only of punctuation become empty after stripping; drop them
    return {word for word in stripped if word}


def _count_short_sentences(text, min_words=3):
    """Count non-empty '.'-separated fragments with fewer than ``min_words`` words."""
    # Empty fragments (e.g. the one after a final period, or inside "...")
    # are not sentences and must not be penalised
    return sum(1 for fragment in text.split(".") if 0 < len(fragment.split()) < min_words)


def simulate_human_preference(prompt, response_a, response_b):
    """Simulate a human preference between two responses with simple heuristics.

    Each response gets a score made of:
      * +1 if it has between 20 and 100 words (inclusive),
      * +1 per distinct word shared with the prompt (case-insensitive,
        surrounding punctuation ignored),
      * -1 per sentence fragment shorter than three words.

    Args:
        prompt: The prompt both responses answer.
        response_a: First candidate response.
        response_b: Second candidate response.

    Returns:
        0 if response_a is preferred, 1 if response_b is preferred.
        Ties are broken with ``random.randint(0, 1)``.
    """
    prompt_words = _content_words(prompt)

    def score(response):
        total = 0
        # Length preference: not too short, not too long
        if 20 <= len(response.split()) <= 100:
            total += 1
        # Relevance proxy: distinct words shared with the prompt
        total += len(prompt_words & _content_words(response))
        # Coherence proxy: penalise very short sentences
        total -= _count_short_sentences(response)
        return total

    score_a = score(response_a)
    score_b = score(response_b)

    if score_a > score_b:
        return 0  # Prefer response_a
    if score_b > score_a:
        return 1  # Prefer response_b
    # If tied, randomly choose
    return random.randint(0, 1)

# Label every pair in place with the simulated annotator's choice
for pair in response_pairs:
    pair["preference"] = simulate_human_preference(
        pair["prompt"], pair["response_a"], pair["response_b"]
    )

# Report which side won for each prompt (0 means A, anything else means B)
for i, pair in enumerate(response_pairs):
    preferred = "B" if pair["preference"] != 0 else "A"
    print(f"Prompt {i+1}: Preferred response {preferred}")

## 4. Training a Reward Model

Now we'll train a reward model to predict human preferences between response pairs. The reward model takes a prompt and response as input and outputs a scalar reward value.

In [None]:
# Extra prompts that enlarge the synthetic preference dataset
additional_prompts = [
    "What are the benefits of exercise?",
    "Explain how the internet works.",
    "Give me tips for learning a new language.",
    "What causes climate change?",
    "How do airplanes fly?",
    "Describe the water cycle.",
    "What are the main features of democracy?",
    "How does the human digestive system work?",
    "What are the benefits of meditation?",
    "Explain the concept of supply and demand.",
]

# Sample an (A, B) pair per prompt at temperatures 0.7 and 1.0, label it with
# the simulated preference, and append it to the existing pairs
for prompt in additional_prompts:
    response_a, response_b = (
        generate_responses(sft_model, prompt, num_responses=1, temperature=temperature)[0]
        for temperature in (0.7, 1.0)
    )
    response_pairs.append(
        {
            "prompt": prompt,
            "response_a": response_a,
            "response_b": response_b,
            "preference": simulate_human_preference(prompt, response_a, response_b),
        }
    )

print(f"Total preference pairs: {len(response_pairs)}")

In [None]:
# Prepare data for reward model training
def prepare_reward_data(response_pairs):
    """Convert labelled response pairs into chosen/rejected training records.

    Args:
        response_pairs: Iterable of dicts with keys ``prompt``,
            ``response_a``, ``response_b`` and ``preference``
            (0 means response A was preferred; any other value means B).

    Returns:
        A list of dicts with keys ``prompt``, ``chosen`` and ``rejected``,
        in the same order as the input.
    """
    fields = ("prompt", "response_a", "response_b", "preference")
    examples = []

    for pair in response_pairs:
        prompt, first, second, label = (pair[key] for key in fields)
        # The preferred response becomes "chosen", the other one "rejected"
        chosen, rejected = (first, second) if label == 0 else (second, first)
        examples.append(dict(prompt=prompt, chosen=chosen, rejected=rejected))

    return examples

reward_data = prepare_reward_data(response_pairs)

# Shuffle in place, then keep the first 80% (rounded down) for training
# and the remainder for validation
random.shuffle(reward_data)
train_size = int(0.8 * len(reward_data))
train_data, val_data = reward_data[:train_size], reward_data[train_size:]

print(f"Training examples: {len(train_data)}")
print(f"Validation examples: {len(val_data)}")

In [None]:
# Create a simple reward model
class RewardModel(nn.Module):
    """Sequence-classification model with a single output used as a reward score."""

    def __init__(self, model_name):
        """Load ``model_name`` as a sequence classifier with one output label."""
        super().__init__()
        # num_labels=1 requests a single logit per input, used as the reward
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name, num_labels=1
        )

    def forward(self, input_ids, attention_mask):
        """Return the wrapped model's logits with the trailing dimension squeezed out."""
        logits = self.model(input_ids=input_ids, attention_mask=attention_mask).logits
        return logits.squeeze(-1)

# Checkpoint backing the reward model; a small one keeps the demo light
reward_model_name = "distilbert-base-uncased"
reward_tokenizer = AutoTokenizer.from_pretrained(reward_model_name)

# Build the reward model, then move it to the selected device
reward_model = RewardModel(reward_model_name)
reward_model = reward_model.to(device)

# Fall back to the EOS token for padding when the tokenizer defines no pad token
if reward_tokenizer.pad_token is None:
    reward_tokenizer.pad_token = reward_tokenizer.eos_token

print(f"Reward model initialized: {reward_model_name}")

In [None]:
# Prepare data for reward model training
def tokenize_for_reward_model(examples):
    """Tokenize a batch of chosen/rejected examples for the reward model.

    Args:
        examples: Batch mapping with parallel sequences under the keys
            ``prompt``, ``chosen`` and ``rejected``.

    Returns:
        Dict with ``chosen_input_ids``, ``chosen_attention_mask``,
        ``rejected_input_ids`` and ``rejected_attention_mask``, produced by
        the module-level ``reward_tokenizer`` with padding and truncation
        to 512 tokens.
    """
    def render(responses):
        # Pair each response with its prompt in the text layout fed to the model
        return [
            f"Prompt: {p}\nResponse: {r}"
            for p, r in zip(examples["prompt"], responses)
        ]

    def encode(texts):
        # Same tokenizer settings for both sides of every pair
        return reward_tokenizer(
            texts,
            padding="max_length",
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )

    chosen_texts = render(examples["chosen"])
    rejected_texts = render(examples["rejected"])

    chosen_tokens = encode(chosen_texts)
    rejected_tokens = encode(rejected_texts)

    return {
        "chosen_input_ids": chosen_tokens["input_ids"],
        "chosen_attention_mask": chosen_tokens["attention_mask"],
        "rejected_input_ids": rejected_tokens["input_ids"],
        "rejected_attention_mask": rejected_tokens["attention_mask"],
    }

# Training split: wrap the records in a Hugging Face Dataset, then tokenize in
# batches and drop the raw text columns
train_dataset = Dataset.from_pandas(pd.DataFrame(train_data))
tokenized_train_dataset = train_dataset.map(
    tokenize_for_reward_model,
    batched=True,
    remove_columns=["prompt", "chosen", "rejected"],
)

# Validation split: same conversion and tokenization
val_dataset = Dataset.from_pandas(pd.DataFrame(val_data))
tokenized_val_dataset = val_dataset.map(
    tokenize_for_reward_model,
    batched=True,
    remove_columns=["prompt", "chosen", "rejected"],
)