In [None]:
# Install project dependencies
!pip install -q git+https://github.com/CarperAI/trlx.git@main
!pip install -q peft==0.3.0
!pip install -q wandb==0.15.8
!pip install -q numpy==1.25.2
!pip install -q transformers==4.32.0
!pip install -q accelerate==0.22.0
!pip install datasets==2.14.5
!pip install pyarrow==18.1.0
!pip install scikit-learn==1.6.1

## Create RLHF Config

In [None]:
import random
import json
import os
os.environ["TRANSFORMERS_NO_TF"] = "1"
import sys
from typing import List

import numpy as np
from datasets import load_dataset
from transformers import RobertaTokenizer, pipeline

from sklearn.model_selection import train_test_split

import trlx
from trlx.data.default_configs import (
    ModelConfig,
    OptimizerConfig,
    PPOConfig,
    SchedulerConfig,
    TokenizerConfig,
    TrainConfig,
    TRLConfig,
)

import torch
import numpy as np

def set_seed(seed):
    """Seed the torch, NumPy and stdlib RNGs and set the cuDNN determinism flags.

    Args:
        seed: Value passed unchanged to each seeding call; its string form is
            also written to the ``PYTHONHASHSEED`` environment variable.
    """
    # PyTorch seeding calls, in the same order as before.
    for seed_torch in (torch.manual_seed, torch.cuda.manual_seed_all):
        seed_torch(seed)

    # cuDNN flags: deterministic on, benchmark off.
    cudnn = torch.backends.cudnn
    cudnn.deterministic, cudnn.benchmark = True, False

    # NumPy global RNG, stdlib RNG, then the hash-seed environment variable.
    np.random.seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

seed = 319  # Change this value to get different results
set_seed(seed)

import wandb
# Log in to Weights & Biases without writing the API key into the notebook:
# the key is read from the WANDB_API_KEY environment variable. When the variable
# is unset or empty, key=None is passed; presumably wandb then falls back to
# credentials stored by `wandb login` or to an interactive prompt (confirm
# against the wandb docs).
wandb.login(key=os.environ.get("WANDB_API_KEY") or None)

def get_default_config():
    """Assemble the baseline PPO ``TRLConfig`` used before per-size overrides.

    The training seed is read from the module-level ``seed`` variable.

    Returns:
        TRLConfig: A newly constructed config made of the train, model,
        tokenizer, optimizer, scheduler and PPO method sections built below.
    """
    train_section = TrainConfig(
        seq_length=1024,
        epochs=1000,
        total_steps=1000,
        batch_size=32,  # 4,
        seed=seed,
        checkpoint_interval=10000,
        eval_interval=5,
        pipeline="PromptPipeline",
        trainer="AcceleratePPOTrainer",
        entity_name='eleutherai',
        project_name='pythia-rlhf',
        save_best=False,
    )

    model_section = ModelConfig(
        model_path="EleutherAI/pythia-70m",
        num_layers_unfrozen=-1,
    )

    tokenizer_section = TokenizerConfig(
        tokenizer_path="EleutherAI/pythia-70m",
        truncation_side="left",
        padding_side="left",
    )

    optimizer_section = OptimizerConfig(
        name="adamw",
        kwargs={"lr": 8e-6, "betas": (0.9, 0.95), "eps": 1.0e-8, "weight_decay": 1e-6},
    )

    scheduler_section = SchedulerConfig(
        name="cosine_annealing",
        kwargs={"T_max": 10000, "eta_min": 8e-6},
    )

    # Generation settings forwarded through the PPO method config.
    sampling_kwargs = {
        "max_new_tokens": 128,
        "top_k": 30,
        "top_p": 1.0,
        "do_sample": True,
    }

    method_section = PPOConfig(
        name="PPOConfig",
        num_rollouts=64,
        chunk_size=16,
        ppo_epochs=4,
        init_kl_coef=0.2,
        target=6,
        horizon=10000,
        gamma=1,
        lam=0.95,
        cliprange=0.2,
        cliprange_value=0.2,
        vf_coef=1,
        scale_reward="running",
        ref_mean=None,
        ref_std=None,
        cliprange_reward=10,
        gen_kwargs=sampling_kwargs,
    )

    return TRLConfig(
        train=train_section,
        model=model_section,
        tokenizer=tokenizer_section,
        optimizer=optimizer_section,
        scheduler=scheduler_section,
        method=method_section,
    )

def get_config(model_size, default_config):
    """Apply the per-model-size overrides to ``default_config`` in place.

    Args:
        model_size: Size tag selecting the override set. Only ``"70M"`` has
            overrides here; any other value leaves the config untouched.
        default_config: Config object exposing ``train``, ``model``,
            ``tokenizer``, ``optimizer``, ``scheduler`` and ``method``
            sections. It is mutated in place.

    Returns:
        The same ``default_config`` object. Earlier versions returned ``None``
        despite the ``get_`` name; callers that ignore the return value and
        rely on the in-place mutation keep working.
    """
    if model_size == "70M":
        # Following params from https://wandb.ai/eleutherai/pythia-rlhf/runs/do2vbz2o
        default_config.train.batch_size = 4
        default_config.train.seq_length = 512
        # NOTE(review): a single total step looks like a smoke-test value;
        # confirm before launching a real training run.
        default_config.train.total_steps = 1
        default_config.model.model_path = "lomahony/eleuther-pythia70m-hh-sft"
        default_config.model.num_layers_unfrozen = 4
        default_config.train.checkpoint_dir = "checkpoints/ppo_hh/pythia-70m/"
        default_config.tokenizer.tokenizer_path = "EleutherAI/pythia-70m"
        default_config.optimizer.kwargs["lr"] = 3e-6
        default_config.optimizer.kwargs["weight_decay"] = 0.0006
        default_config.scheduler.kwargs["eta_min"] = 5.45e-6
        default_config.method.num_rollouts = 32
        default_config.method.target = 5.71
        default_config.method.ppo_epochs = 8
        default_config.method.chunk_size = 4

    return default_config

def get_toxicity_score(scores):
    """Turn classifier outputs into negated toxicity values.

    Args:
        scores: Iterable of mappings that each carry a ``'label'`` and a
            ``'score'`` entry.

    Returns:
        list: One value per entry, in input order: ``-score`` when the label
        is ``'toxic'``, otherwise ``-(1 - score)``.
    """
    return [
        -entry['score'] if entry['label'] == 'toxic' else -(1-entry['score'])
        for entry in scores
    ]



In [None]:
## Set params
# hparams is handed to TRLConfig.update below together with the default config;
# it is empty here, so no extra overrides are supplied.
hparams = {}
model_size = '70M'

## Initiate wandb project
wandb.init(project='rlhf')
default_config = get_default_config()
# get_config mutates default_config in place for the chosen model size.
get_config(model_size, default_config)

# Printed before TRLConfig.update is applied.
print(f"Config : {default_config}")
config = TRLConfig.update(default_config, hparams)

## Load data
# Jigsaw: keep only the prompt and original_output fields of each training row.
dataset = load_dataset('jaredjoss/jigsaw-long-2000')["train"]
all_prompts = [{"prompt": x["prompt"], "original_output": x["original_output"]} for x in dataset]
# Hold out 20% of the prompts for evaluation. The split uses its own fixed
# random_state (0) rather than the notebook-wide `seed`.
prompts, eval_prompts = train_test_split(all_prompts, test_size=0.2, random_state=0)

# Checkpoint used both for the classification pipeline here and for the
# tokenizer loaded inside reward_fn.
model_path = 'SkolkovoInstitute/roberta_toxicity_classifier'
print(model_path)

# Text-classification pipeline that reward_fn calls to score samples.
# NOTE(review): padding/truncation/max_length are given at construction time;
# presumably they are forwarded to the pipeline's tokenizer - confirm.
rob_tox_fn = pipeline(
    "text-classification",
    model=model_path,
    framework="pt",  # Explicitly specify PyTorch
    padding="max_length",
    truncation=True,
    max_length=512
)

# create reward function
# EOS tokens keyed by checkpoint path, so the tokenizer is loaded once instead of
# on every reward computation.
_REWARD_EOS_TOKENS = {}


def _reward_eos_token(path):
    """Return the EOS token of the RoBERTa tokenizer at ``path``, loading it once."""
    if path not in _REWARD_EOS_TOKENS:
        _REWARD_EOS_TOKENS[path] = RobertaTokenizer.from_pretrained(path).eos_token
    return _REWARD_EOS_TOKENS[path]


def reward_fn(samples, prompts, original_output, **kwargs) -> List[float]:
    """Score generated samples relative to the dataset's original outputs.

    Each sample, and each ``prompt + original_output`` pair, gets the reward
    tokenizer's EOS token appended and is scored with ``rob_tox_fn`` followed
    by ``get_toxicity_score``.

    Args:
        samples: Generated texts to score.
        prompts: Prompt strings, paired positionally with ``original_output``.
        original_output: Original continuations for the prompts.
        **kwargs: Accepted and ignored.

    Returns:
        One float per sample: the sample's score minus the score of the
        corresponding original text. ``get_toxicity_score`` returns negated
        values, so a positive reward means the sample scored higher (less
        negative) than the original.

    Note:
        Earlier code set ``truncation_side = "left"`` on a tokenizer that was
        only used for its EOS token and never encoded text, so the assignment
        had no effect and is dropped here.
        NOTE(review): if left truncation of the classifier input was intended,
        it has to be configured on the tokenizer used by ``rob_tox_fn``.
    """
    eos_token = _reward_eos_token(model_path)

    scored_samples = [s + eos_token for s in samples]
    rewards = get_toxicity_score(rob_tox_fn(scored_samples))

    original_samples = [p + o + eos_token for p, o in zip(prompts, original_output)]
    original_rewards = get_toxicity_score(rob_tox_fn(original_samples))

    return [new - old for new, old in zip(rewards, original_rewards)]



Config : {
    "method": {
        "name": "PPOConfig",
        "ppo_epochs": 8,
        "num_rollouts": 32,
        "chunk_size": 4,
        "init_kl_coef": 0.2,
        "target": 5.71,
        "horizon": 10000,
        "gamma": 1,
        "lam": 0.95,
        "cliprange": 0.2,
        "cliprange_value": 0.2,
        "vf_coef": 1,
        "scale_reward": "running",
        "ref_mean": null,
        "ref_std": null,
        "cliprange_reward": 10,
        "gen_kwargs": {
            "max_new_tokens": 128,
            "top_k": 30,
            "top_p": 1.0,
            "do_sample": true
        },
        "gen_experience_kwargs": null,
        "num_value_layers_unfrozen": 0
    },
    "model": {
        "model_path": "lomahony/eleuther-pythia70m-hh-sft",
        "model_arch_type": "causal",
        "num_layers_unfrozen": 4,
        "peft_config": null,
        "model_extra_configs": {}
    },
    "optimizer": {
        "name": "adamw",
        "kwargs": {
            "lr": 3e-06,
       

Some weights of the model checkpoint at SkolkovoInstitute/roberta_toxicity_classifier were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Start Training using RLHF

In [None]:
# Launch training through trlx with the config built earlier in the notebook
# (PPOConfig method, AcceleratePPOTrainer trainer).
# reward_fn supplies the reward values; prompts / eval_prompts are the two parts
# of the train_test_split made when the data was loaded.
trainer = trlx.train(
    reward_fn=reward_fn,
    prompts=prompts,
    eval_prompts=eval_prompts,
    config=config,
)

In [None]:
# Save the trained model.
# The output directory is created when missing and the model is written on
# every run. Previously the save call sat inside the `if not os.path.exists(...)`
# branch, so it was silently skipped whenever the folder already existed.
folder_name = './output/roberta_tox_classifier_custom_jigsaw_70'
os.makedirs(folder_name, exist_ok=True)
trainer.save_pretrained(folder_name)

Saving:
