In [None]:
# I'll wrap this up to a .py file later

In [20]:
from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser, AutoConfig, AutoModelForSequenceClassification
from datasets import load_dataset
from trl import ModelConfig, RewardConfig, RewardTrainer, get_kbit_device_map, get_peft_config, get_quantization_config

In [2]:
# load model from hub
import torch

# I SFT-ed a placeholder model just for testing
# the base model is facebook/opt-350m, I SFT-ed the model on some random small datasets.
# It's an AutoModelForCausalLM model (i.e. regression head, num_vocab of layer size)
model = AutoModelForCausalLM.from_pretrained("Tachi67/temp_model_sft")
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 512, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 1024)
      (project_out): Linear(in_features=1024, out_features=512, bias=False)
      (project_in): Linear(in_features=512, out_features=1024, bias=False)
      (layers): ModuleList(
        (0-23): 24 x OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=409

In [8]:
# hh-data
# our dataset for reward model training
dataset = load_dataset("Anthropic/hh-rlhf", data_dir="harmless-base")

In [10]:
# sample a small subset of the dataset for testing
small_sample_train = dataset['train'].shuffle(seed=42).select(range(20))
small_sample_test = dataset['test'].shuffle(seed=42).select(range(10))

In [11]:
# data process
# seen from the huggingface tutorial
def preprocess_function(examples, tokenizer=tokenizer):
    new_examples = {
        "input_ids_chosen": [],
        "attention_mask_chosen": [],
        "input_ids_rejected": [],
        "attention_mask_rejected": [],
    }
    for chosen, rejected in zip(examples["chosen"], examples["rejected"]):
        tokenized_chosen = tokenizer(chosen)
        tokenized_rejected = tokenizer(rejected)

        new_examples["input_ids_chosen"].append(tokenized_chosen["input_ids"])
        new_examples["attention_mask_chosen"].append(tokenized_chosen["attention_mask"])
        new_examples["input_ids_rejected"].append(tokenized_rejected["input_ids"])
        new_examples["attention_mask_rejected"].append(tokenized_rejected["attention_mask"])

    return new_examples

small_sample_train = small_sample_train.map(
    preprocess_function,
    batched=True,
    num_proc=4,
)
small_sample_test = small_sample_test.map(
    preprocess_function,
    batched=True,
    num_proc=4,
)

Map (num_proc=4):   0%|          | 0/20 [00:00<?, ? examples/s]

Map (num_proc=4): 100%|██████████| 20/20 [00:07<00:00,  2.68 examples/s]
Map (num_proc=4): 100%|██████████| 10/10 [00:04<00:00,  2.13 examples/s]


In [16]:
# training configs

cmd_args = [
    "--per_device_train_batch_size=32",
    "--model_name_or_path=facebook/opt-350m",
    "--output_dir=reward_modeling_anthropic_hh",
    "--num_train_epochs=1",
    "--gradient_accumulation_steps=16",
    "--gradient_checkpointing=True",
    "--learning_rate=1.41e-5",
    "--report_to=wandb",
    "--remove_unused_columns=False",
    "--optim=adamw_torch",
    "--logging_steps=10",
    "--evaluation_strategy=steps",
    "--max_length=56",
]
################
# Config parsing
################
parser = HfArgumentParser((RewardConfig, ModelConfig))
reward_config, model_config = parser.parse_args_into_dataclasses(args=cmd_args)
reward_config.gradient_checkpointing_kwargs = dict(use_reentrant=False)
print(reward_config.per_device_train_batch_size)

quantization_config = get_quantization_config(model_config)
model_kwargs = dict(
    revision=model_config.model_revision,
    trust_remote_code=model_config.trust_remote_code,
    device_map=get_kbit_device_map() if quantization_config is not None else None,
    quantization_config=quantization_config,
)

32


In [24]:
# model conversion
config = AutoConfig.from_pretrained("Tachi67/temp_model_sft", num_labels=2)
model_for_classification = AutoModelForSequenceClassification.from_config(config)

# copy the original weights from the original model to the new model, except the classification head
model_for_classification.base_model = model.base_model
model_for_classification.to(device)

OPTForSequenceClassification(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 512, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 1024)
      (project_out): Linear(in_features=1024, out_features=512, bias=False)
      (project_in): Linear(in_features=512, out_features=1024, bias=False)
      (layers): ModuleList(
        (0-23): 24 x OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(i

In [26]:
# train
trainer = RewardTrainer(
    model=model_for_classification,
    tokenizer=tokenizer,
    args=reward_config,
    train_dataset=small_sample_train,
    eval_dataset=small_sample_test,
    peft_config=get_peft_config(model_config),
)
trainer.train()

  0%|          | 0/1 [04:06<?, ?it/s]
Could not estimate the number of tokens of the input, floating-point operations will not be computed
100%|██████████| 1/1 [06:32<00:00, 392.04s/it]

{'train_runtime': 392.0451, 'train_samples_per_second': 0.051, 'train_steps_per_second': 0.003, 'train_loss': 0.04423974081873894, 'epoch': 1.0}





TrainOutput(global_step=1, training_loss=0.04423974081873894, metrics={'train_runtime': 392.0451, 'train_samples_per_second': 0.051, 'train_steps_per_second': 0.003, 'train_loss': 0.04423974081873894, 'epoch': 1.0})

In [29]:
input = "this is a conversation"
input_ids = tokenizer(input, return_tensors="pt").input_ids.to(device)
output = model_for_classification(input_ids)
output

SequenceClassifierOutputWithPast(loss=None, logits=tensor([[-0.1221, -0.2968]], device='cuda:0', grad_fn=<IndexBackward0>), past_key_values=None, hidden_states=None, attentions=None)

In [1]:
from utils import load_harmful_data

data = load_harmful_data()

  from .autonotebook import tqdm as notebook_tqdm
42537it [00:01, 36325.49it/s]
2312it [00:00, 36697.32it/s]


In [3]:
len(data)

44849