In [1]:
import random
import pandas as pd
from operator import itemgetter
import warnings
warnings.filterwarnings('ignore')
from datasets import Dataset, load_dataset
from transformers import AutoModelForSequenceClassification,AutoTokenizer,TrainingArguments, HfArgumentParser
from trl import RewardTrainer
import torch
from tqdm import tqdm
from trl import ModelConfig, RewardConfig, RewardTrainer, get_kbit_device_map, get_peft_config, get_quantization_config

In [2]:
# Report which CUDA devices (if any) PyTorch can see before training starts.
if not torch.cuda.is_available():
    print("CUDA GPU is not available.")
else:
    n_devices = torch.cuda.device_count()
    print(f"Number of CUDA devices available: {n_devices}")
    for idx in range(n_devices):
        print(f"CUDA Device {idx}: {torch.cuda.get_device_name(idx)}")

Number of CUDA devices available: 1
CUDA Device 0: NVIDIA GeForce RTX 3060 Laptop GPU


In [3]:
# Invoke tqdm's pandas hook.
# NOTE(review): no pandas operation appears in the visible cells — confirm this call is still needed.
tqdm.pandas()

In [4]:
"""
python examples/scripts/reward_modeling.py \
    --model_name_or_path=facebook/opt-350m \
    --output_dir="reward_modeling_anthropic_hh" \
    --per_device_train_batch_size=64 \
    --num_train_epochs=1 \
    --gradient_accumulation_steps=16 \
    --gradient_checkpointing=True \
    --learning_rate=1.41e-5 \
    --report_to="wandb" \
    --remove_unused_columns=False \
    --optim="adamw_torch" \
    --logging_steps=10 \
    --evaluation_strategy="steps" \
    --max_length=512 \
"""
# Arguments actually used in this notebook, written as CLI-style flags so they
# can be parsed the same way as the reference command quoted above.
# Differences from that reference: per_device_train_batch_size is 32 (not 64)
# and max_length is 56 (not 512).
# NOTE(review): the filter step later in this notebook drops every pair whose
# chosen or rejected sequence is longer than max_length — confirm 56 is intended.
cmd_args = [
    "--per_device_train_batch_size=32",
    "--model_name_or_path=facebook/opt-350m",
    "--output_dir=reward_modeling_anthropic_hh",
    "--num_train_epochs=1",
    "--gradient_accumulation_steps=16",
    "--gradient_checkpointing=True",
    "--learning_rate=1.41e-5",
    "--report_to=wandb",
    "--remove_unused_columns=False",
    "--optim=adamw_torch",
    "--logging_steps=10",
    "--evaluation_strategy=steps",
    "--max_length=56",
]
################
# Config parsing
################
# Parse cmd_args (instead of sys.argv) into a RewardConfig and a ModelConfig.
parser = HfArgumentParser((RewardConfig, ModelConfig))
reward_config, model_config = parser.parse_args_into_dataclasses(args=cmd_args)
# Override the gradient-checkpointing kwargs on the parsed config (use_reentrant=False).
reward_config.gradient_checkpointing_kwargs = dict(use_reentrant=False)
# Echo the parsed batch size as a quick check that the arguments were picked up.
print(reward_config.per_device_train_batch_size)

32


In [5]:
################
# Model & Tokenizer
################
# Resolve the requested dtype: "auto" and None pass through unchanged; any other
# string is looked up as an attribute of the torch module.
torch_dtype = (
    model_config.torch_dtype
    if model_config.torch_dtype in ["auto", None]
    else getattr(torch, model_config.torch_dtype)
)
quantization_config = get_quantization_config(model_config)
model_kwargs = dict(
    revision=model_config.model_revision,
    trust_remote_code=model_config.trust_remote_code,
    # A device map is only supplied when a quantization config is in use.
    device_map=get_kbit_device_map() if quantization_config is not None else None,
    quantization_config=quantization_config,
    # Forward the resolved dtype; it was previously computed but never passed on,
    # so a --torch_dtype argument had no effect on how the model was loaded.
    torch_dtype=torch_dtype,
)
tokenizer = AutoTokenizer.from_pretrained(model_config.model_name_or_path, use_fast=True)
# num_labels=1 gives the classification head a single output per sequence.
model = AutoModelForSequenceClassification.from_pretrained(
    model_config.model_name_or_path, num_labels=1, **model_kwargs
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Only move the model manually when it was loaded without quantization: with a
# quantization config the model is already placed through device_map, and
# transformers can reject .to() on bitsandbytes-quantized models.
if quantization_config is None:
    model = model.to(device)


Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
################
# Dataset
################
# Load the "Anthropic/hh-rlhf" preference dataset; the preprocessing below reads
# its "chosen" and "rejected" columns.
raw_datasets = load_dataset("Anthropic/hh-rlhf")
# Tokenize chosen/rejected pairs of inputs
# Adapt this section to your needs for custom datasets

def preprocess_function(examples, tokenizer):
    """Tokenize a batch of chosen/rejected text pairs.

    Args:
        examples: Mapping with parallel sequences under the keys "chosen" and
            "rejected".
        tokenizer: Callable applied to each text; its result must be indexable
            by "input_ids" and "attention_mask".

    Returns:
        A dict with the keys "input_ids_chosen", "attention_mask_chosen",
        "input_ids_rejected" and "attention_mask_rejected", each holding one
        entry per input pair, in input order.
    """
    ids_chosen, mask_chosen = [], []
    ids_rejected, mask_rejected = [], []

    pairs = zip(examples["chosen"], examples["rejected"])
    for chosen_text, rejected_text in pairs:
        # Tokenize the chosen text first, then the rejected one, pair by pair.
        enc_chosen = tokenizer(chosen_text)
        enc_rejected = tokenizer(rejected_text)

        ids_chosen.append(enc_chosen["input_ids"])
        mask_chosen.append(enc_chosen["attention_mask"])
        ids_rejected.append(enc_rejected["input_ids"])
        mask_rejected.append(enc_rejected["attention_mask"])

    return {
        "input_ids_chosen": ids_chosen,
        "attention_mask_chosen": mask_chosen,
        "input_ids_rejected": ids_rejected,
        "attention_mask_rejected": mask_rejected,
    }


# Display the loaded dataset object (splits, features and row counts).
raw_datasets



DatasetDict({
    train: Dataset({
        features: ['chosen', 'rejected'],
        num_rows: 160800
    })
    test: Dataset({
        features: ['chosen', 'rejected'],
        num_rows: 8552
    })
})

In [7]:
# Shuffle both splits with a fixed seed so the subsample is reproducible.
shuffled_dataset = raw_datasets["train"].shuffle(seed=42)
shuffled_test = raw_datasets["test"].shuffle(seed=42)

# Keep only the first N shuffled samples.
N = 100  # number of training samples to keep
small_dataset = shuffled_dataset.select(range(N))
# The test subset is fixed at the first 10 shuffled samples.
small_test = shuffled_test.select(range(10))

In [8]:
# Preprocess both splits and filter out examples that are longer than
# reward_config.max_length.
def _tokenize_and_filter(dataset):
    """Tokenize a chosen/rejected dataset and drop over-length pairs.

    Args:
        dataset: A dataset exposing ``map`` and ``filter`` whose rows carry the
            "chosen" and "rejected" columns read by ``preprocess_function``.

    Returns:
        The tokenized dataset restricted to rows where both the chosen and the
        rejected token sequences have at most ``reward_config.max_length`` tokens.
    """
    tokenized = dataset.map(
        preprocess_function,
        batched=True,
        num_proc=4,
        fn_kwargs={"tokenizer": tokenizer},
    )
    # A pair is kept only if BOTH sides fit within the length limit.
    return tokenized.filter(
        lambda x: len(x["input_ids_chosen"]) <= reward_config.max_length
        and len(x["input_ids_rejected"]) <= reward_config.max_length
    )


# Apply the same pipeline to the train and test subsets.
small_dataset = _tokenize_and_filter(small_dataset)
small_test = _tokenize_and_filter(small_test)
train_dataset = small_dataset
eval_dataset = small_test

Filter: 100%|██████████| 100/100 [00:00<00:00, 1818.13 examples/s]
Filter: 100%|██████████| 10/10 [00:00<00:00, 624.93 examples/s]


In [9]:
################
# Training
################
# Build the PEFT configuration from the parsed model arguments.
peft_config = get_peft_config(model_config)

# Assemble the trainer from the model, tokenizer, parsed training config and
# the tokenized/filtered train and test subsets.
trainer = RewardTrainer(
    model=model,
    tokenizer=tokenizer,
    args=reward_config,
    train_dataset=small_dataset,
    eval_dataset=small_test,
    peft_config=peft_config,
)

# Run training, then save the model to the configured output directory.
trainer.train()
trainer.save_model(reward_config.output_dir)


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mhaolongli_neko[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/1 [00:00<?, ?it/s]You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
Could not estimate the number of tokens of the input, floating-point operations will not be computed
100%|██████████| 1/1 [00:03<00:00,  3.37s/it]


{'train_runtime': 12.286, 'train_samples_per_second': 0.488, 'train_steps_per_second': 0.081, 'train_loss': 0.05599110573530197, 'epoch': 1.0}
