### RLHF to check ACD works
This notebook uses ACD (Adversarial Contrastive Distillation) a HELLASWAG inspired data generation to fit RLHF. It also uses a portion of the dataset to measure model performance improvements.

In [1]:
from pathlib import Path
import torch
import numpy as np

# Hugging Face Transformers & Datasets
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForCausalLM,
    #AutoModelForSeq2SeqLM,    
    AutoModelForSequenceClassification,
    GenerationConfig,
    DataCollatorWithPadding
)
from datasets import Dataset

# PEFT & TRL for RLHF
from peft import PeftModel, LoraConfig, TaskType, get_peft_model
from trl import (
    PPOTrainer,
    PPOConfig, 
    AutoModelForCausalLMWithValueHead,
    #AutoModelForSeq2SeqLMWithValueHead,
    create_reference_model
)

from trl.core import LengthSampler

# Evaluation
import evaluate

# Utilities
from shared_models import HellaSwagEntry

from tqdm import tqdm
tqdm.pandas()

### Load Data

In [None]:
DATA_PATH = Path("../data/hellaswag_format/personal_chat_sessions_train_hellaswag.jsonl")
REWARD_MODEL_PATH = "../data/models/reward_model_ckpts/checkpoint-3762"
#REWARD_MODEL_PATH = "trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5"
RLHF_CKPTS_DIR = "../data/models/rlhf_ckpts"
BASE_MODEL_NAME = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"

In [3]:
# Function to read JSONL via Pydantic
def load_jsonl_pydantic(path: Path):
    """Yield HellaSwagEntry objects parsed with Pydantic."""
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            yield HellaSwagEntry.model_validate_json(line)

# Load records
records = list(load_jsonl_pydantic(DATA_PATH))
print(f"Loaded {len(records):,} examples")

Loaded 22,282 examples


In [4]:
def load_jsonl_pydantic(path: Path):
    """Yield HellaSwagEntry objects parsed with Pydantic."""
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            yield HellaSwagEntry.model_validate_json(line)

records = list(load_jsonl_pydantic(DATA_PATH))
print(f"Loaded {len(records):,} examples from {DATA_PATH}")

# Build context-response pairs
data_pairs = []
for ex in load_jsonl_pydantic(DATA_PATH):
    endings    = [ex.ending0, ex.ending1, ex.ending2, ex.ending3, ex.ending4]
    human_resp = endings[ex.label].strip()
    data_pairs.append({
        "context": ex.context.strip(),
        "human_resp": human_resp
    })

raw_dataset = Dataset.from_list(data_pairs)
train_test  = raw_dataset.train_test_split(test_size=0.1, seed=42)
train_ds    = train_test["train"]
test_ds     = train_test["test"]


# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})

# Tokenization function
max_len = 128

def tokenize_fn(examples):
    return tokenizer(
        examples["context"],
        truncation=True,
        padding="max_length",
        max_length=max_len,
    )

# Map tokenization over the datasets
train_ds = train_ds.map(
    tokenize_fn,
    batched=True,
    remove_columns=train_ds.column_names
)
test_ds = test_ds.map(
    tokenize_fn,
    batched=True,
    remove_columns=test_ds.column_names
)

# Use a data collator to batch and pad
data_collator = DataCollatorWithPadding(tokenizer, return_tensors="pt")

Loaded 22,282 examples from ..\data\hellaswag_format\personal_chat_sessions_train_hellaswag.jsonl


Map:   0%|          | 0/20053 [00:00<?, ? examples/s]

Map:   0%|          | 0/2229 [00:00<?, ? examples/s]

#### Load and Prepare Base LLM + LoRA Adapter

In [5]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    print(f"\ntrainable model parameters: {trainable_model_params}\
    \nall model parameters: {all_model_params}\
    \npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%")

In [6]:
# LoRA configuration
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj","v_proj"],
    lora_dropout=0.05,
    task_type=TaskType.CAUSAL_LM,
)

# Load base model and attach PEFT adapter
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_NAME,
    is_decoder=True
)

for name, module in base_model.named_modules():
    print(name)
    print(module)

dppo_model = AutoModelForCausalLMWithValueHead.from_pretrained(
    BASE_MODEL_NAME,
    is_trainable=True,
)
# Create reference model (frozen copy)
ref_model = create_reference_model(dppo_model)

gen_config = GenerationConfig.from_pretrained(BASE_MODEL_NAME)
gen_config.min_length = 1
gen_config.max_new_tokens = 50

dppo_model.generation_config = gen_config
ref_model.generation_config = gen_config


Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151665, 8)
    (layers): ModuleList(
      (0-1): 2 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=8, out_features=8, bias=True)
          (k_proj): Linear(in_features=8, out_features=4, bias=True)
          (v_proj): Linear(in_features=8, out_features=4, bias=True)
          (o_proj): Linear(in_features=8, out_features=8, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=8, out_features=32, bias=False)
          (up_proj): Linear(in_features=8, out_features=32, bias=False)
          (down_proj): Linear(in_features=32, out_features=8, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((8,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((8,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((8,), eps=1e-06)
    (rotary_emb): Qwen2RotaryEmbedding()
  )
  (lm_head): Linear(in_fe



#### Prepare Human-Preference Reward Model

In [7]:
reward_tokenizer = AutoTokenizer.from_pretrained(REWARD_MODEL_PATH, device_map="auto")
if reward_tokenizer.pad_token is None:
    reward_tokenizer.add_special_tokens({"pad_token": "[PAD]"})

reward_model = AutoModelForSequenceClassification.from_pretrained(
    REWARD_MODEL_PATH,
    num_labels=1
)

value_model = AutoModelForSequenceClassification.from_pretrained(
    REWARD_MODEL_PATH,
    num_labels=1
)

print("Reward model labels:", reward_model.config.id2label)

Reward model labels: {0: 'LABEL_0'}


In [None]:
# 1. Save the original generate method
orig_generate = dppo_model.generate

# 2. Define a wrapper that injects our kwarg
def generate_with_dict(*args, **kwargs):
    # Ensure we get a ModelOutput with .logits
    kwargs.setdefault("return_dict_in_generate", True)
    return orig_generate(*args, **kwargs)

# 3. Apply the patch to both models
dppo_model.generate = generate_with_dict
ref_model.generate  = generate_with_dict

In [9]:
# Inference pipeline (raw logits)
preference_pipe = pipeline(
    'text-classification',
    model=reward_model,
    tokenizer=reward_tokenizer,
    framework="pt"
)
reward_kwargs = {"top_k": None, "function_to_apply": "none", "batch_size": 16}


Device set to use cuda:0


#### Set up PPO-Trainer

In [10]:
config = PPOConfig(
    output_dir   = RLHF_CKPTS_DIR,
    num_ppo_epochs = 3,
    mini_batch_size = 4,
    batch_size      = 16
)

# Initialize the trainer
ppo_trainer = PPOTrainer(
    config,
    model        = dppo_model,
    ref_model    = ref_model,
    processing_class    = tokenizer,
    reward_model = reward_model,
    value_model=value_model,
    peft_config      = lora_config,  
    train_dataset = train_ds,
    eval_dataset  = test_ds,
    data_collator = data_collator
)

In [11]:
# Start RLHF training loop
ppo_trainer.train()

===training policy===


AttributeError: 'tuple' object has no attribute 'logits'