### RLHF to check ACD works
This notebook uses ACD (Adversarial Contrastive Distillation), a HellaSwag-inspired data-generation method, to produce the training data for RLHF. It also holds out a portion of the dataset to measure improvements in model performance.

In [21]:
from pathlib import Path
import torch
import numpy as np

# Hugging Face Transformers & Datasets
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,    
    AutoModelForSequenceClassification,
    GenerationConfig,
    DataCollatorWithPadding
)
from datasets import Dataset

# PEFT & TRL for RLHF
from peft import PeftModel, LoraConfig, TaskType, get_peft_model
from trl import (
    PPOTrainer,
    PPOConfig, 
    AutoModelForCausalLMWithValueHead,
    AutoModelForSeq2SeqLMWithValueHead,
    create_reference_model
)

from trl.core import LengthSampler

# Evaluation
import evaluate

# Utilities
from shared_models import HellaSwagEntry

from tqdm import tqdm
tqdm.pandas()

### Load Data

In [None]:
# Paths are relative to the notebook's working directory.
DATA_PATH = Path("../data/hellaswag_format/personal_chat_sessions_train_hellaswag.jsonl")
REWARD_MODEL_PATH = "../data/models/reward_model_ckpts/checkpoint-3762"
RLHF_CKPTS_DIR = "../data/models/rlhf_ckpts"
# The policy is loaded below with AutoModelForSeq2SeqLM and LoRA targets "q"/"v",
# so this must be an encoder-decoder checkpoint; an encoder-only BERT checkpoint
# cannot be loaded that way. The parameter count recorded further down
# (60,802,049) is consistent with t5-small plus the LoRA adapters and value head.
BASE_MODEL_NAME = "google-t5/t5-small"

In [23]:
# Function to read JSONL via Pydantic
def load_jsonl_pydantic(path: Path):
    """Lazily parse a JSONL file into HellaSwagEntry objects.

    Args:
        path: Location of a UTF-8 encoded file holding one JSON document per line.

    Yields:
        One ``HellaSwagEntry`` per non-blank line, in file order.

    Raises:
        Whatever ``HellaSwagEntry.model_validate_json`` raises for a line that
        is not a valid entry.
    """
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            # Blank / whitespace-only lines (e.g. padding at the end of the
            # file) are not records; skip them instead of failing validation.
            if not line.strip():
                continue
            yield HellaSwagEntry.model_validate_json(line)

# Materialise the lazily parsed entries so they can be counted and reused
records = [entry for entry in load_jsonl_pydantic(DATA_PATH)]
print(f"Loaded {len(records):,} examples")

Loaded 22,282 examples


In [24]:
# `load_jsonl_pydantic` and `records` are defined in the previous cell. The file
# is parsed once there; this cell reuses the in-memory entries instead of
# redefining the loader and re-reading the file two more times.
print(f"Loaded {len(records):,} examples from {DATA_PATH}")

# Build context-response pairs: keep only the ending selected by `label`
# (stored under "human_resp") next to its context.
data_pairs = []
for ex in records:
    endings    = [ex.ending0, ex.ending1, ex.ending2, ex.ending3, ex.ending4]
    human_resp = endings[ex.label].strip()
    data_pairs.append({
        "context": ex.context.strip(),
        "human_resp": human_resp
    })

# 90/10 train/test split with a fixed seed so the split is reproducible
raw_dataset = Dataset.from_list(data_pairs)
train_test  = raw_dataset.train_test_split(test_size=0.1, seed=42)
train_ds    = train_test["train"]
test_ds     = train_test["test"]


# Load tokenizer
# `device_map` is a model-loading option; a tokenizer has no weights to place,
# so it is not passed here.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
if tokenizer.pad_token is None:
    # tokenize_fn pads to a fixed length, which needs a pad token.
    # NOTE(review): if this branch ever runs, the vocabulary grows by one and
    # the policy model's embeddings must be resized to len(tokenizer).
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})

# Tokenization function
max_len = 128

def tokenize_fn(examples):
    """Tokenize the "context" column of a batch.

    Every sequence is truncated and padded to exactly ``max_len`` tokens.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": max_len,
    }
    contexts = examples["context"]
    return tokenizer(contexts, **encode_options)

# Map tokenization over the datasets.
# The inputs are taken from the untouched `train_test` splits rather than from
# train_ds/test_ds themselves. Otherwise re-running this cell fails, because
# the "context"/"human_resp" columns it removes are already gone.
train_ds = train_test["train"].map(
    tokenize_fn,
    batched=True,
    remove_columns=["context", "human_resp"]
)
test_ds = train_test["test"].map(
    tokenize_fn,
    batched=True,
    remove_columns=["context", "human_resp"]
)

# Use a data collator to batch and pad
data_collator = DataCollatorWithPadding(tokenizer, return_tensors="pt")

Loaded 22,282 examples from ..\data\hellaswag_format\personal_chat_sessions_train_hellaswag.jsonl


Map: 100%|██████████| 20053/20053 [00:32<00:00, 623.90 examples/s]
Map: 100%|██████████| 2229/2229 [00:02<00:00, 986.72 examples/s] 


#### Load and Prepare Base LLM + LoRA Adapter

In [25]:
def print_number_of_trainable_model_parameters(model):
    """Print how many of a model's parameters are trainable.

    Args:
        model: Any object exposing ``named_parameters()`` whose parameters
            provide ``numel()`` and ``requires_grad``.

    Prints the trainable count, the total count and the trainable share as a
    percentage (two decimals). A model with no parameters reports 0.00%
    instead of raising ZeroDivisionError.
    """
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        n_elements = param.numel()
        all_model_params += n_elements
        if param.requires_grad:
            trainable_model_params += n_elements

    # Guard the ratio so an empty model does not divide by zero.
    if all_model_params:
        trainable_pct = 100 * trainable_model_params / all_model_params
    else:
        trainable_pct = 0.0

    # Adjacent f-strings instead of backslash continuations: the continuations
    # pulled the next line's indentation into the printed text.
    print(
        f"\ntrainable model parameters: {trainable_model_params}"
        f"\nall model parameters: {all_model_params}"
        f"\npercentage of trainable model parameters: {trainable_pct:.2f}%"
    )

In [26]:
# LoRA configuration: rank-8 adapters (alpha 32, dropout 0.05) on the modules
# named "q" and "v"; no bias terms are trained.
# NOTE(review): "q"/"v" presumably refer to the attention query/value
# projections of the base checkpoint -- confirm they exist for BASE_MODEL_NAME.
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q","v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

# Load base model (bfloat16 weights) and attach PEFT adapter
base_model = AutoModelForSeq2SeqLM.from_pretrained(
    BASE_MODEL_NAME,
    torch_dtype=torch.bfloat16
)

peft_model = get_peft_model(base_model,lora_config)


# Wrap the adapter model with TRL's value-head class; this is the policy that
# is handed to PPOTrainer further down.
# NOTE(review): this notebook combines the value-head wrapper with the
# PPOTrainer(processing_class=..., reward_model=..., value_model=...) API --
# confirm the installed TRL version supports that combination for seq2seq models.
dppo_model = AutoModelForSeq2SeqLMWithValueHead.from_pretrained(
    peft_model,
    torch_dtype=torch.bfloat16,
    is_trainable=True
)
# Reference copy of the policy, passed to PPOTrainer as `ref_model`.
ref_model = create_reference_model(dppo_model)

# Load the saved generation_config.json from the base model
# TRL throws an error otherwise.
gen_config = GenerationConfig.from_pretrained(BASE_MODEL_NAME)
gen_config.min_length = 1
gen_config.max_new_tokens = 50

# Policy and reference share the same generation settings object.
dppo_model.generation_config = gen_config
ref_model.generation_config = gen_config

print_number_of_trainable_model_parameters(dppo_model)


trainable model parameters: 295425    
all model parameters: 60802049    
percentage of trainable model parameters: 0.49%


#### Prepare Human-Preference Reward Model

In [27]:
# Tokenizer that belongs to the reward-model checkpoint.
# `device_map` is a model-loading option and has no meaning for a tokenizer,
# so it is not passed here.
reward_tokenizer = AutoTokenizer.from_pretrained(REWARD_MODEL_PATH)
pad_token_was_added = reward_tokenizer.pad_token is None
if pad_token_was_added:
    reward_tokenizer.add_special_tokens({"pad_token": "[PAD]"})

# Two independent copies of the same checkpoint: one is used as the reward
# model, the other is passed to PPOTrainer as the value model.
reward_model = AutoModelForSequenceClassification.from_pretrained(
    REWARD_MODEL_PATH,
    num_labels=2
)

value_model = AutoModelForSequenceClassification.from_pretrained(
    REWARD_MODEL_PATH,
    num_labels=2
)

if pad_token_was_added:
    # A newly added token has an id beyond the checkpoint's embedding table;
    # grow the table and register the pad id so padded batches stay valid.
    for scoring_model in (reward_model, value_model):
        scoring_model.resize_token_embeddings(len(reward_tokenizer))
        scoring_model.config.pad_token_id = reward_tokenizer.pad_token_id

print("Reward model labels:", reward_model.config.id2label)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Reward model labels: {0: 'LABEL_0', 1: 'LABEL_1'}


In [28]:
# Call-time options for the pipeline below; "function_to_apply": "none" is what
# makes it hand back raw logits.
reward_kwargs = dict(top_k=None, function_to_apply="none", batch_size=16)

# Inference pipeline (raw logits) built on the reward model and its tokenizer
preference_pipe = pipeline(
    "text-classification",
    framework="pt",
    tokenizer=reward_tokenizer,
    model=reward_model,
)


Device set to use cuda:0


#### Set up PPO-Trainer

In [29]:
# PPO hyper-parameters: checkpoints go to RLHF_CKPTS_DIR; each batch of 16
# samples is optimised for 3 PPO epochs in mini-batches of 4.
config = PPOConfig(
    output_dir   = RLHF_CKPTS_DIR,
    num_ppo_epochs = 3,
    mini_batch_size = 4,
    batch_size      = 16
)

# Initialize the trainer with the LoRA policy, its reference copy, the two
# sequence-classification models and the tokenised train/test splits.
# NOTE(review): prompts are tokenised with `tokenizer` (the policy tokenizer),
# while reward_model/value_model come from a separate checkpoint with its own
# tokenizer -- confirm the vocabularies are compatible.
# NOTE(review): the run recorded below fails inside train(); confirm that this
# TRL version's PPOTrainer accepts an encoder-decoder policy with a value head.
ppo_trainer = PPOTrainer(
    config,
    model        = dppo_model,
    ref_model    = ref_model,
    processing_class    = tokenizer,
    reward_model = reward_model,
    value_model  = value_model,
    train_dataset = train_ds,
    eval_dataset  = test_ds,
    data_collator = data_collator
)

In [30]:
# Start RLHF training loop
# NOTE(review): the recorded run aborts here with a tensor-size mismatch
# (0 vs 53). This looks like a prompt-length slice applied to seq2seq
# generate() output, which does not contain the prompt -- TODO confirm against
# the installed TRL PPOTrainer before re-running.
ppo_trainer.train()

===training policy===


RuntimeError: The size of tensor a (0) must match the size of tensor b (53) at non-singleton dimension 1