### Reward Model
This notebook constructs the Reward Model that will be used in a PPO step at a later stage. Still takes the dilbert uncased and builds on top of it

In [1]:
import random
from pathlib import Path
from datasets import Dataset
from shared_models import HellaSwagEntry
from transformers import (AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification,
                          Trainer, TrainingArguments)

from peft import LoraConfig, get_peft_model

import torch
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


#### Data Collection

In [2]:
DATA_PATH = Path("../data/hellaswag_format/personal_chat_sessions_train_hellaswag.jsonl")

In [3]:
def load_jsonl_pydantic(path):
    """Yield HellaSwagEntry objects parsed with Pydantic."""
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            yield HellaSwagEntry.model_validate_json(line)

# Build pairwise examples
pairs = []
for ex in load_jsonl_pydantic(DATA_PATH):
    endings = [ex.ending0, ex.ending1, ex.ending2, ex.ending3, ex.ending4]
    pos_id = ex.label
    neg_id = random.choice([i for i in range(5) if i != pos_id])

    pos_txt, neg_txt = endings[pos_id].strip(), endings[neg_id].strip()
    context = ex.context.strip()

    # randomly order A/B
    if random.random() < 0.5:
        first, second, lbl = pos_txt, neg_txt, 1
    else:
        first, second, lbl = neg_txt, pos_txt, 0

    pairs.append({
        "context": context,
        "first_resp": first,
        "second_resp": second,
        "label": lbl
    })

# Create HF Dataset
dataset = Dataset.from_list(pairs)
dataset = dataset.rename_column("label", "labels")
train_test = dataset.train_test_split(test_size=0.1, seed=42)

dataset

Dataset({
    features: ['context', 'first_resp', 'second_resp', 'labels'],
    num_rows: 22282
})

#### Tokenization

In [4]:
model_ckpt = "distilbert-base-uncased"
tokenizer  = AutoTokenizer.from_pretrained(model_ckpt)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})

def tokenize_fn(examples):
    # Hugging Face will do: [CLS] context [SEP] first_resp [SEP] second_resp [SEP]
    return tokenizer(
        examples["context"],
        [f"{a} {tokenizer.sep_token} {b}" for a, b in zip(examples["first_resp"], examples["second_resp"])],
        truncation=True,
        max_length=128,
    )

tokenized = train_test.map(tokenize_fn,
                           batched=True)
data_collator = DataCollatorWithPadding(tokenizer)

Map: 100%|██████████| 20053/20053 [00:18<00:00, 1085.51 examples/s]
Map: 100%|██████████| 2229/2229 [00:01<00:00, 1334.53 examples/s]


#### Model LoRA Setup for Sequence Classification

In [5]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    print(f"\ntrainable model parameters: {trainable_model_params}\
    \nall model parameters: {all_model_params}\
    \npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%")

In [6]:
base_model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=2
)
base_model.resize_token_embeddings(len(tokenizer))

# <-- Optional: attach LoRA for parameter-efficient fine-tuning -->
peft_config = LoraConfig(
    task_type="SEQ_CLS",
    inference_mode=False,
    r=8,
    lora_alpha=32,
    target_modules=["q_lin", "v_lin"],
    lora_dropout=0.05,
)
model = get_peft_model(base_model, peft_config)

print_number_of_trainable_model_parameters(model)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



trainable model parameters: 739586    
all model parameters: 67694596    
percentage of trainable model parameters: 1.09%


#### Training with Trainer

In [7]:
accuracy = evaluate.load("accuracy")
def compute_metrics(p):
    preds = p.predictions.argmax(axis=-1)
    return accuracy.compute(predictions=preds, references=p.label_ids)

# TrainingArguments
args = TrainingArguments(
    output_dir="../data/models/reward_model_ckpts",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    learning_rate=2e-5,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Evaluate before training
print("Baseline:", trainer.evaluate())
# Train
trainer.train()
# Final eval
print("Final:", trainer.evaluate())

  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Baseline: {'eval_loss': 0.6938586235046387, 'eval_model_preparation_time': 0.0027, 'eval_accuracy': 0.4845222072678331, 'eval_runtime': 121.8379, 'eval_samples_per_second': 18.295, 'eval_steps_per_second': 0.575}


Epoch,Training Loss,Validation Loss,Model Preparation Time,Accuracy
1,0.6926,0.340719,0.0027,0.858232
2,0.1963,0.157722,0.0027,0.938537
3,0.1624,0.13672,0.0027,0.950202


Final: {'eval_loss': 0.13672012090682983, 'eval_model_preparation_time': 0.0027, 'eval_accuracy': 0.9502018842530283, 'eval_runtime': 88.629, 'eval_samples_per_second': 25.15, 'eval_steps_per_second': 0.79, 'epoch': 3.0}
