In [1]:
!pip install -q -U transformers datasets trl==0.11.3 peft  huggingface_hub

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.2/40.2 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.6/316.6 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m118.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m39.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m509.4/509.4 kB[0m [31m40.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m119.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import re

import torch
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoTokenizer
from trl import PPOTrainer, PPOConfig, AutoModelForSeq2SeqLMWithValueHead, AutoModelForCausalLMWithValueHead

In [3]:
## PPO settings plus the model / dataset identifiers the other cells read from this config
ppo_config = PPOConfig(
    # Identifiers (alternative policy checkpoint: "gpt2")
    model_name="distilbert/distilgpt2",
    query_dataset="gsm8k",
    # NOTE(review): this reward-model id is not referenced by the cells shown here;
    # rewards come from the heuristic get_reward function. Confirm it is still wanted.
    reward_model="sentence-transformers/all-MiniLM-L6-v2",
    # Optimisation
    learning_rate=1.41e-05,
    batch_size=16,
    mini_batch_size=16,
    ppo_epochs=3,
    # Reproducibility / data handling
    seed=0,
    remove_unused_columns=False,
)



## Step 1: Prepare dataset

In [4]:
## Build the prompt dataset for PPO; questions are tokenized up front so training can use the ids directly

def build_dataset(config):
    """Load the train split of ``config.query_dataset`` and add token ids per question.

    Args:
        config: object exposing ``model_name`` (checkpoint whose tokenizer is
            used) and ``query_dataset`` (identifier handed to ``load_dataset``).

    Returns:
        The ``main`` / ``train`` split with an extra ``input_ids`` column,
        with its output format set to torch.
    """
    tok = AutoTokenizer.from_pretrained(config.model_name)
    tok.pad_token = tok.eos_token  # reuse EOS as the padding token

    def add_input_ids(example):
        # Encode only the question text; the other columns are left as they are.
        example["input_ids"] = tok.encode(example["question"])
        return example

    # For quicker experiments, shuffle and select a subset here before mapping.
    prompts = load_dataset(config.query_dataset, name='main', split="train")
    prompts = prompts.map(add_input_ids, batched=False)
    prompts.set_format(type="torch")

    return prompts

In [5]:
def collator(data):
    """Turn a list of per-sample dicts into one dict of per-field lists.

    The field names are taken from the first sample; every sample must
    provide each of those fields. Values are collected as plain lists.
    """
    batch = {}
    for field in data[0]:
        batch[field] = [example[field] for example in data]
    return batch

## Step 2: Prepare model & ref model

In [6]:
## Create model. Change to AutoModelForSeq2SeqLMWithValueHead for seq2seq models such as t5
## The policy and its reference copy are loaded directly in bfloat16 so that both run at the
## same precision (loading as float16 and re-casting only the policy left the two mismatched).
model     = AutoModelForCausalLMWithValueHead.from_pretrained(ppo_config.model_name, torch_dtype=torch.bfloat16)
ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(ppo_config.model_name, torch_dtype=torch.bfloat16)

## Cast any remaining parameters of the policy wrapper to bfloat16 as well; as the last
## expression of the cell, the returned model is displayed.
model.bfloat16()

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/353M [00:00<?, ?B/s]

AutoModelForCausalLMWithValueHead(
  (pretrained_model): GPT2LMHeadModel(
    (transformer): GPT2Model(
      (wte): Embedding(50257, 768)
      (wpe): Embedding(1024, 768)
      (drop): Dropout(p=0.1, inplace=False)
      (h): ModuleList(
        (0-5): 6 x GPT2Block(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): GPT2Attention(
            (c_attn): Conv1D(nf=2304, nx=768)
            (c_proj): Conv1D(nf=768, nx=768)
            (attn_dropout): Dropout(p=0.1, inplace=False)
            (resid_dropout): Dropout(p=0.1, inplace=False)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): GPT2MLP(
            (c_fc): Conv1D(nf=3072, nx=768)
            (c_proj): Conv1D(nf=768, nx=3072)
            (act): NewGELUActivation()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    )
    (lm_head): Linear

In [7]:
## Tokenizer of the policy checkpoint (fast implementation requested); EOS doubles as the pad token
tokenizer = AutoTokenizer.from_pretrained(
    ppo_config.model_name,
    use_fast=True,
)
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [8]:
## Build the pre-tokenized prompt dataset from the identifiers stored in the PPO config
dataset = build_dataset(config=ppo_config)

README.md:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

## Step 3: Create PPO Trainer

In [9]:
## Wire the policy, reference model, tokenizer and prompt data into the PPO trainer
ppo_trainer = PPOTrainer(
    model=model,
    ref_model=ref_model,
    config=ppo_config,
    tokenizer=tokenizer,
    dataset=dataset,
    data_collator=collator,  # keeps every field as a list of per-sample values
)



In [10]:
## Pick the device: single-process runs use GPU index 0 when CUDA is available (else "cpu"),
## which sidesteps a `pipeline` bug; multi-process runs keep the accelerator's own device.
if ppo_trainer.accelerator.num_processes == 1:
    if torch.cuda.is_available():
        device = 0
    else:
        device = "cpu"
else:
    device = ppo_trainer.accelerator.device

## Step 4: Define the reward function

In [11]:
## Define heuristic reward function
def get_reward(answer, response, question=None):
    """Score a generated response against a GSM8K-style reference answer.

    One point is given for every calculator annotation ``<<expr=value>>`` in
    ``answer`` whose ``expr`` occurs in ``response`` (spaces ignored), plus one
    point when the final answer that follows ``####`` in ``answer`` appears in
    ``response`` as a standalone number.

    Args:
        answer: reference solution text.
        response: decoded model output to score.
        question: unused; accepted so callers can pass the prompt along.

    Returns:
        A 0-dim ``torch.float32`` tensor holding the total score.
    """
    no_spaces_response = response.replace(' ', '')

    score = 0
    for annotated in answer.split('<<')[1:]:
        # Keep only the left-hand side of "expr=value" inside <<...>>.
        step = annotated.split('>>')[0].split('=')[0].replace(' ', '')
        # An empty expression is a substring of every string; never reward it.
        if step and step in no_spaces_response:
            score += 1

    # References look like "... #### 72": strip the padding around the final answer and
    # tolerate references without the marker instead of raising IndexError.
    parts = answer.split('####')
    final_answer = parts[1].strip() if len(parts) > 1 else ''
    if final_answer:
        # Digit-boundary match so "72" is not credited inside "172", "720" or "72.5".
        pattern = r'(?<![\d.])' + re.escape(final_answer) + r'(?!\.?\d)'
        if re.search(pattern, response):
            score += 1

    return torch.tensor(score, dtype=torch.float32)

## Step 5: Run trainer

In [12]:
## Sampling settings forwarded to ppo_trainer.generate; padding uses the tokenizer's EOS id
generation_kwargs = dict(
    min_length=-1,
    max_new_tokens=128,
    top_k=0.0,
    top_p=1.0,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
)

In [None]:
## PPO training loop: for every batch, sample responses, score them with get_reward,
## then run one PPO update and log its statistics.
## NOTE(review): the outer pass count reuses config.ppo_epochs; presumably that setting also
## controls the optimisation epochs inside each ppo_trainer.step -- confirm the reuse is intended.
for epoch in tqdm(range(ppo_trainer.config.ppo_epochs), 'epoch: '):
    ## Iterate through the batches produced by the trainer's dataloader
    for batch in tqdm(ppo_trainer.dataloader):
        query_tensors = batch["input_ids"]

        ## Sample a response per query (prompt excluded, no reference response), then decode to text
        response_tensors = ppo_trainer.generate(query_tensors, return_prompt=False, generate_ref_response=False, **generation_kwargs)
        batch['response'] = tokenizer.batch_decode(response_tensors)

        # Score each decoded response against its reference answer (one scalar reward per sample)
        rewards = [get_reward(a, r, q) for a, r, q in zip(batch['answer'], batch['response'], batch['question'])]

        # Run the PPO update on (query, response, reward) and log the returned stats
        stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
        ppo_trainer.log_stats(stats, batch, rewards)

## Step 6: Save the trained model

In [None]:
## Persist the result of training via the trainer's save_pretrained, under the relative path 'test_ppo_model'
ppo_trainer.save_pretrained('test_ppo_model')