In [1]:
!nvidia-smi
import os
os.environ["CUDA_VISIBLE_DEVICES"]

Sun Nov 26 20:23:55 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.12             Driver Version: 535.104.12   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          On  | 00000000:41:00.0 Off |                    0 |
| N/A   30C    P0              62W / 500W |      4MiB / 81920MiB |      0%   E. Process |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

'0'

In [2]:
from dataclasses import dataclass, field
from typing import Dict, Optional

import torch
from tqdm import tqdm
from datasets import Dataset, load_dataset
from peft import AutoPeftModelForCausalLM, LoraConfig
from transformers import AutoTokenizer, HfArgumentParser, TrainingArguments, AutoModelForCausalLM, pipeline, AutoModelForSequenceClassification

from trl import DPOTrainer, PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead
from trl.core import LengthSampler

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
@dataclass
class ScriptArguments:
    """
    The arguments for the DPO training script.
    """

    # data parameters
    beta: Optional[float] = field(default=0.1, metadata={"help": "the beta parameter for DPO loss"})

    # training parameters
    model_name_or_path: Optional[str] = field(
        default="gpt2",
        metadata={"help": "the location of the SFT model name or path"},
    )
    learning_rate: Optional[float] = field(default=5e-4, metadata={"help": "optimizer learning rate"})
    lr_scheduler_type: Optional[str] = field(default="cosine", metadata={"help": "the lr scheduler type"})
    warmup_steps: Optional[int] = field(default=100, metadata={"help": "the number of warmup steps"})
    weight_decay: Optional[float] = field(default=0.05, metadata={"help": "the weight decay"})
    optimizer_type: Optional[str] = field(default="paged_adamw_32bit", metadata={"help": "the optimizer type"})

    per_device_train_batch_size: Optional[int] = field(default=4, metadata={"help": "train batch size per device"})
    per_device_eval_batch_size: Optional[int] = field(default=1, metadata={"help": "eval batch size per device"})
    gradient_accumulation_steps: Optional[int] = field(
        default=4, metadata={"help": "the number of gradient accumulation steps"}
    )
    gradient_checkpointing: Optional[bool] = field(
        default=True, metadata={"help": "whether to use gradient checkpointing"}
    )

    lora_alpha: Optional[float] = field(default=16, metadata={"help": "the lora alpha parameter"})
    lora_dropout: Optional[float] = field(default=0.05, metadata={"help": "the lora dropout parameter"})
    lora_r: Optional[int] = field(default=8, metadata={"help": "the lora r parameter"})

    max_prompt_length: Optional[int] = field(default=256, metadata={"help": "the maximum prompt length"})
    max_length: Optional[int] = field(default=512, metadata={"help": "the maximum sequence length"})
    
    max_steps: Optional[int] = field(default=1000, metadata={"help": "max number of training steps"})
    logging_steps: Optional[int] = field(default=10, metadata={"help": "the logging frequency"})
    save_steps: Optional[int] = field(default=100, metadata={"help": "the saving frequency"})
    eval_steps: Optional[int] = field(default=100, metadata={"help": "the evaluation frequency"})

    output_dir: Optional[str] = field(default="./results", metadata={"help": "the output directory"})
    log_freq: Optional[int] = field(default=1, metadata={"help": "the logging frequency"})

    # instrumentation
    sanity_check: Optional[bool] = field(default=False, metadata={"help": "only train on 1000 samples"})
    report_to: Optional[str] = field(
        default="none",
        metadata={
            "help": 'The list of integrations to report the results and logs to. Supported platforms are `"azure_ml"`,'
            '`"comet_ml"`, `"mlflow"`, `"neptune"`, `"tensorboard"`,`"clearml"` and `"wandb"`. '
            'Use `"all"` to report to all integrations installed, `"none"` for no integrations.'
        },
    )
    # debug argument for distributed training
    ignore_bias_buffers: Optional[bool] = field(
        default=False,
        metadata={
            "help": "fix for DDP issues with LM bias/mask buffers - invalid scalar type,`inplace operation. See"
            "https://github.com/huggingface/transformers/issues/22482#issuecomment-1595790992"
        },
    )


# DPO pipeline

In [5]:
parser = HfArgumentParser(ScriptArguments)
script_args, _ = parser.parse_args_into_dataclasses(return_remaining_strings=True)

# 1. load a pretrained model
model = AutoModelForCausalLM.from_pretrained(
    script_args.model_name_or_path,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    load_in_4bit=True,
)
model.config.use_cache = False

if script_args.ignore_bias_buffers:
    # torch distributed hack
    model._ddp_params_and_buffers_to_ignore = [
        name for name, buffer in model.named_buffers() if buffer.dtype == torch.bool
    ]
    
model_ref = AutoModelForCausalLM.from_pretrained(
    script_args.model_name_or_path,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    load_in_4bit=True,
)
tokenizer = AutoTokenizer.from_pretrained(script_args.model_name_or_path)
tokenizer.pad_token = tokenizer.eos_token

In [7]:
def get_dataset(
    data_dir: str = "data/rl",
    sanity_check: bool = False,
    cache_dir: str = None,
    num_proc=24,
) -> Dataset:
    """Load the stack-exchange-paired dataset from Hugging Face and convert it to the necessary format.

    The dataset is converted to a dictionary with the following structure:
    {
        'prompt': List[str],
        'chosen': List[str],
        'rejected': List[str],
    }

    Prompts are structured as follows:
      "Question: " + <prompt> + "\n\nAnswer: "
    """
    dataset = load_dataset(
        "Anthropic/hh-rlhf",
        split="train",
        cache_dir=cache_dir
    )
    original_columns = dataset.column_names

    if sanity_check:
        dataset = dataset.select(range(min(len(dataset), 1000)))

    def return_prompt_and_responses(samples) -> Dict[str, str]:
        return {
            "prompt": [sample.split("Assistant: ")[0] + "Assistant: " for sample in samples["chosen"]],
            "chosen": [sample.split("Assistant: ")[-1] for sample in samples["chosen"]],
            "rejected": [sample.split("Assistant: ")[-1] for sample in samples["rejected"]],
        }

    return dataset.map(
        return_prompt_and_responses,
        batched=True,
        num_proc=num_proc,
        remove_columns=original_columns,
    )


In [12]:
# 2. Load the Stack-exchange paired dataset
train_dataset = get_dataset(data_dir="data/rl", sanity_check=script_args.sanity_check)
train_dataset = train_dataset.filter(
    lambda x: len(x["prompt"]) + len(x["chosen"]) <= script_args.max_length
    and len(x["prompt"]) + len(x["rejected"]) <= script_args.max_length
)

# 3. Load evaluation dataset
eval_dataset = get_dataset(data_dir="data/evaluation", sanity_check=True)
eval_dataset = eval_dataset.filter(
    lambda x: len(x["prompt"]) + len(x["chosen"]) <= script_args.max_length
    and len(x["prompt"]) + len(x["rejected"]) <= script_args.max_length
)

In [9]:
def run_dpo(
    model, 
    model_ref, 
    tokenizer,
    train_dataset,
    eval_dataset,
    script_args
):
    # 4. initialize training arguments:
    training_args = TrainingArguments(
        per_device_train_batch_size=script_args.per_device_train_batch_size,
        per_device_eval_batch_size=script_args.per_device_eval_batch_size,
        max_steps=script_args.max_steps,
        logging_steps=script_args.logging_steps,
        save_steps=script_args.save_steps,
        gradient_accumulation_steps=script_args.gradient_accumulation_steps,
        gradient_checkpointing=script_args.gradient_checkpointing,
        learning_rate=script_args.learning_rate,
        evaluation_strategy="steps",
        eval_steps=script_args.eval_steps,
        output_dir=script_args.output_dir,
        report_to=script_args.report_to,
        lr_scheduler_type=script_args.lr_scheduler_type,
        warmup_steps=script_args.warmup_steps,
        optim=script_args.optimizer_type,
        bf16=False,
        remove_unused_columns=False,
        run_name="dpo_llama2"
    )
    
    peft_config = LoraConfig(
        r=script_args.lora_r,
        lora_alpha=script_args.lora_alpha,
        lora_dropout=script_args.lora_dropout,
        target_modules=[
            "q_proj",
            "v_proj",
            "k_proj",
            "out_proj",
            "fc_in",
            "fc_out",
            "wte",
        ],
        bias="none",
        task_type="CAUSAL_LM",
    )
    
    # 5. initialize the DPO trainer
    dpo_trainer = DPOTrainer(
        model,
        model_ref,
        args=training_args,
        beta=script_args.beta,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        peft_config=peft_config,
        max_prompt_length=script_args.max_prompt_length,
        max_length=script_args.max_length,
    )
    
    # 6. train
    dpo_trainer.train()
    dpo_trainer.save_model(script_args.output_dir)
    
    # 7. save
    output_dir = os.path.join(script_args.output_dir, "final_checkpoint")
    dpo_trainer.model.save_pretrained(output_dir)

In [10]:
run_dpo(
    model = model,
    model_ref = model_ref,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset,
    script_args = script_args
)

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/rejected,Logps/chosen,Logits/rejected,Logits/chosen
100,0.6876,0.690043,0.005284,-0.002445,0.548077,0.007729,-133.418121,-120.824295,-108.246735,-107.212982
200,0.6948,0.692166,0.01565,0.007167,0.512019,0.008482,-133.322006,-120.72065,-108.023605,-106.927162
300,0.6841,0.693842,0.040703,0.028799,0.521635,0.011904,-133.105682,-120.470123,-108.418587,-107.341927
400,0.6886,0.690667,0.04188,0.020679,0.522837,0.021201,-133.186874,-120.458336,-108.100784,-107.038437
500,0.6916,0.68808,-0.02741,-0.051015,0.522837,0.023605,-133.903824,-121.151253,-107.72818,-106.608017
600,0.6931,0.686454,0.039968,0.012762,0.549279,0.027206,-133.266037,-120.477463,-107.919304,-106.817551
700,0.692,0.686655,0.050998,0.022612,0.551683,0.028386,-133.167557,-120.367172,-108.017403,-106.949471
800,0.7059,0.686112,0.046842,0.017403,0.545673,0.029439,-133.219635,-120.408722,-108.031456,-106.971756
900,0.684,0.686319,0.04204,0.013054,0.543269,0.028987,-133.263123,-120.456741,-108.01252,-106.955231
1000,0.6961,0.68628,0.045004,0.015904,0.545673,0.0291,-133.234619,-120.427109,-108.01326,-106.957031




# PPO Pipeline
Below is an inplementation of PPO

In [14]:
parser = HfArgumentParser(ScriptArguments)
script_args, _ = parser.parse_args_into_dataclasses(return_remaining_strings=True)

# 1. load a pretrained model
peft_config = LoraConfig(
    r=script_args.lora_r,
    lora_alpha=script_args.lora_alpha,
    lora_dropout=script_args.lora_dropout,
    target_modules=[
        "q_proj",
        "v_proj",
        "k_proj",
        "out_proj",
        "fc_in",
        "fc_out",
        "wte",
    ],
    bias="none",
    task_type="CAUSAL_LM",
)

model = AutoModelForCausalLMWithValueHead.from_pretrained(
    script_args.model_name_or_path,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    load_in_4bit=True,
    device_map='cuda:0',
    peft_config=peft_config,
)
model.config.use_cache = False

if script_args.ignore_bias_buffers:
    # torch distributed hack
    model._ddp_params_and_buffers_to_ignore = [
        name for name, buffer in model.named_buffers() if buffer.dtype == torch.bool
    ]
    
model_ref = AutoModelForCausalLMWithValueHead.from_pretrained(
    script_args.model_name_or_path,
    torch_dtype=torch.float16,
    device_map='cuda:0',
)

tokenizer = AutoTokenizer.from_pretrained(script_args.model_name_or_path)
tokenizer.pad_token = tokenizer.eos_token



In [13]:
model_seqcls = AutoModelForSequenceClassification.from_pretrained(
    'weqweasdas/hh_rlhf_rm',
    torch_dtype=torch.float16,
    device_map='cuda:0',
)

In [28]:
f'{sum(p.numel() for p in model.parameters()):,}'

'82,381,449'

In [20]:
all_inputs = tokenizer(train_dataset[0]['chosen'], return_tensors='pt')
all_inputs = {k: v.cuda() for k, v in all_inputs.items()}

In [24]:
model_seqcls(**all_inputs)[0], model_seqcls(**all_inputs).logits

(tensor([[-2.4395]], device='cuda:0', dtype=torch.float16,
        grad_fn=<IndexBackward0>),
 tensor([[-2.4395]], device='cuda:0', dtype=torch.float16,
        grad_fn=<IndexBackward0>))

In [26]:
f'{sum(p.numel() for p in model_seqcls.parameters()):,}'

'2,651,310,080'

In [12]:
def build_dataset(config, query_dataset, input_min_text_length=2, input_max_text_length=8):
    """
    Build dataset for training. This builds the dataset from `load_dataset`, one should
    customize this function to train the model on its own dataset.

    Args:
        query_dataset (`str`):
            The name of the dataset to be loaded.

    Returns:
        dataloader (`torch.utils.data.DataLoader`):
            The dataloader for the dataset.
    """
    tokenizer = AutoTokenizer.from_pretrained(config.model_name)
    tokenizer.pad_token = tokenizer.eos_token
    # load imdb with datasets
    ds = load_dataset(query_dataset, split="train")
    ds = ds.rename_columns({"chosen": "review"})
    ds = ds.filter(lambda x: len(x["review"]) > 200, batched=False)

    input_size = LengthSampler(input_min_text_length, input_max_text_length)

    def tokenize(sample):
        sample["input_ids"] = tokenizer.encode(
            sample["review"].split("Assistant: ")[0] + "Assistant: "
        )[: input_size()]
        sample["query"] = tokenizer.decode(sample["input_ids"])
        return sample

    ds = ds.map(tokenize, batched=False)
    ds.set_format(type="torch")
    return ds

ppo_config = PPOConfig(
    model_name=script_args.model_name_or_path,
    query_dataset="Anthropic/hh-rlhf",
    reward_model="sentiment-analysis:weqweasdas/hh_rlhf_rm",

    learning_rate=script_args.learning_rate,
    log_with=None,
    batch_size=script_args.per_device_train_batch_size,
    # gradient_accumulation_steps=script_args.gradient_accumulation_steps,
    
    early_stopping=False,
    target_kl=6.0,
    kl_penalty="kl",
    seed=0,
    use_score_scaling=False,
    use_score_norm=False,
    score_clip=None
)

# We retrieve the dataloader by calling the `build_dataset` function.
dataset = build_dataset(ppo_config, ppo_config.query_dataset)

def collator(data):
    return dict((key, [d[key] for d in data]) for key in data[0])

Filter: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 160800/160800 [00:00<00:00, 247064.59 examples/s]
Map:  51%|███████████████████████████████████████████████████████████████████████████████▎                                                                            | 73924/145372 [00:15<00:15, 4723.85 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1351 > 1024). Running this sequence through the model will result in indexing errors
Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 145372/145372 [00:37<00:00, 3922.26 examples/s]


In [49]:
def run_ppo( 
    model,
    model_ref,
    dataset,
    data_collator,
    ppo_config,
    script_args
):
    # 4. initialize training arguments:
    
    sent_kwargs = {"top_k": None, "function_to_apply": "none", "batch_size": ppo_config.batch_size}
    
    # 5. initialize the DPO trainer
    ppo_trainer = PPOTrainer(
        ppo_config, 
        model, 
        model_ref, 
        tokenizer, 
        dataset=dataset,
        data_collator=data_collator,
    )
    
    # We then build the sentiment analysis pipeline, passing the model name and the
    # sentiment analysis pipeline arguments. Let's also make sure to set the device
    # to the same device as the PPOTrainer.
    device = ppo_trainer.accelerator.device
    if ppo_trainer.accelerator.num_processes == 1:
        device = 0 if torch.cuda.is_available() else "cpu"  # to avoid a `pipeline` bug
        
    ds_plugin = ppo_trainer.accelerator.state.deepspeed_plugin
    task, model_name = ppo_config.reward_model.split(":")
    if ds_plugin is not None and ds_plugin.is_zero3_init_enabled():
        with ds_plugin.zero3_init_context_manager(enable=False):
            sentiment_pipe = pipeline(task, model=model_name, device=device)
    else:
        sentiment_pipe = pipeline(task, model=model_name, device=device)
    
    # Some tokenizers like GPT-2's don't have a padding token by default, so we set one here.
    if sentiment_pipe.tokenizer.pad_token_id is None:
        sentiment_pipe.tokenizer.pad_token_id = tokenizer.pad_token_id
    
    if sentiment_pipe.model.config.pad_token_id is None:
        sentiment_pipe.model.config.pad_token_id = tokenizer.pad_token_id
    
    # We then define the arguments to pass to the `generate` function. These arguments
    # are passed to the `generate` function of the PPOTrainer, which is a wrapper around
    # the `generate` function of the trained model.
    generation_kwargs = {
        "min_length": -1,
        "top_k": 0.0,
        "top_p": 1.0,
        # "do_sample": True,
        "pad_token_id": tokenizer.eos_token_id,
        "max_new_tokens": 32,
    }
    
    for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)):
        query_tensors = batch["input_ids"]
    
        # Get response from gpt2
        response_tensors = ppo_trainer.generate(query_tensors, return_prompt=False, **generation_kwargs)
        batch["response"] = tokenizer.batch_decode(response_tensors)
    
        # Compute sentiment score
        texts = [q + r for q, r in zip(batch["query"], batch["response"])]
        pipe_outputs = sentiment_pipe(texts, **sent_kwargs)
        rewards = [torch.tensor(output[0]["score"]) for output in pipe_outputs]
    
        # Run PPO step
        stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
        ppo_trainer.log_stats(stats, batch, rewards)
    
    # 7. save
    output_dir = os.path.join(script_args.output_dir, "final_checkpoint")
    ppo_trainer.model.save_pretrained(output_dir)

In [29]:
run_ppo(
    model = model,
    model_ref = model_ref,
    dataset = dataset,
    data_collator = collator,
    ppo_config = ppo_config,
    script_args = script_args
)

NameError: name 'run_ppo' is not defined

In [62]:
from tqdm import trange

ppo_trainer = PPOTrainer(
    ppo_config, 
    model, 
    model_ref, 
    tokenizer, 
    dataset=dataset,
    data_collator=collator,
)
dl = iter(ppo_trainer.dataloader)
# for i in trange(100):
batch = next(dl)
    



In [66]:
batch.keys()

dict_keys(['input_ids', 'query'])

In [67]:
next(dl).keys()

dict_keys(['input_ids', 'query'])

In [73]:
from torch.utils.data import DataLoader
new_dl = DataLoader(dataset, batch_size=8, collate_fn=collator)

In [129]:
tokenizer.encode(next(iter(new_dl))['review'], return_tensors='pt')

TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]

In [135]:
input_ids = tokenizer(next(iter(new_dl))['review'], return_tensors='pt', padding=True, truncation=True)['input_ids'].cuda()

In [163]:
out = model(input_ids).logits

AttributeError: 'tuple' object has no attribute 'logits'

In [None]:
out

In [155]:
input_ids

{'input_ids': tensor([[  198,   198, 20490,  ...,   546,   340,    13],
        [  198,   198, 20490,  ..., 50256, 50256, 50256],
        [  198,   198, 20490,  ..., 50256, 50256, 50256],
        ...,
        [  198,   198, 20490,  ..., 50256, 50256, 50256],
        [  198,   198, 20490,  ..., 50256, 50256, 50256],
        [  198,   198, 20490,  ..., 50256, 50256, 50256]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [167]:
new_dataset = Dataset.from_dict(dataset)

AttributeError: 'Dataset' object has no attribute 'items'

In [170]:
Dataset.from_dict(dataset[[1, 2, 3]])

Dataset({
    features: ['review', 'rejected', 'input_ids', 'query'],
    num_rows: 3
})

In [172]:
from random import sample

In [173]:
new_idx = sample(range(len(dataset)), k=10000)
subset_dataset = Dataset.from_dict(dataset[new_idx])

In [174]:
len(subset_dataset)

10000

In [178]:
import copy

In [180]:
copy.deepcopy(dataset), len(dataset)

(Dataset({
     features: ['review', 'rejected', 'input_ids', 'query'],
     num_rows: 145372
 }),
 145372)

In [None]:
new_dl = iter(DataLoader(dataset, batch_size=16, collate_fn=collator))
with torch.inference_mode():
    for i in trange(500):
        batch = next(new_dl)
        input_ids = tokenizer(batch['review'], return_tensors='pt', padding=True, truncation=True)['input_ids']
        rewards = model(input_ids)[2][:, -1]