In [1]:
from trl import PPOConfig, PPOTrainer
import utils
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    AutoModel,
    Trainer,
    TrainingArguments,
    BertModel,
    pipeline,
    AutoModelForSequenceClassification,
    BitsAndBytesConfig,
)
import yaml
import getpass
import wandb
from typing import Dict, Any
import torch as t
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType, get_peft_model
from tqdm import tqdm
import trl
import importlib

device = t.device("cuda" if t.cuda.is_available() else "cpu")

In [2]:
# RUN THIS BLOCK IF YOU CHANGE UTILS BUT DON'T WANT TO RERUN WHOLE NOTEBOOK

importlib.reload(utils)

<module 'utils' from '/workspace/DPO-RLHF_generalization/utils.py'>

In [3]:
def reward_fn(
    model: AutoModel,
    tokenizer: AutoTokenizer,
    prompt_text: list[str],
    response_text: list[str],
    device: str,
) -> list[t.FloatTensor]:
    """Compute the reward for a given response to a prompt.

    Args:
        model (AutoModel): Huggingface model.
        tokenizer (AutoTokenizer): Huggingface tokenizer.
        prompt_text (list[str]): List of strings representing the prompt.
        response_text (list[str]): List of strings representing the response.
        device (str, optional): Device to run the model on. Defaults to 'cpu'.

    Returns:
        list[float]: A list of floats representing the reward.

    """
    with t.no_grad():
        encoding = tokenizer(
            prompt_text,
            response_text,
            truncation=True,
            max_length=512,
            padding='max_length',
            return_tensors='pt',
        )
        encoding = encoding.to(device)

        logits = model(**encoding).logits
        # scores = logits.cpu().numpy().flatten().tolist()

        return logits

def setup_logging(hps: Dict[str, Any], log_wandb):
    # Choose logging and checkpoint saving directory
    logdir = utils.choose_log_dir(
        f"{utils.run_dir}/{hps['dataset_name']}/training/{hps['training_algorithm']}",
        debug=hps["debug"],
    )

    # Add a couple of keys to the hps object and save it as a yaml file
    hps["logdir"] = logdir

    hps["training_kwargs"]["run_name"] = "/".join(logdir.split("/")[-2:])
    hps["user"] = getpass.getuser()
    hps["tags"] += [
        hps["dataset"]["name"],
        "training",
        hps["training_algorithm"],
    ]
    with open(f"{logdir}/hps.yaml", "w") as f:
        yaml.dump(hps, f)

    # If not in debug mode, setup wandb logging
    if not hps["debug"] or log_wandb:
        wandb.init(
            project="dpo_rlhf_generalization",
            dir=logdir,
            name=hps["training_kwargs"]["run_name"],
            config=utils.wandb_configify(hps),
            tags=hps["tags"],
            save_code=True,
            settings=wandb.Settings(code_dir="."),
        )

    print(f"Hyperparameters:\n{hps}\n")
    return logdir

In [4]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

In [5]:
def custom_collate(batch):
    input_ids = [item['input_ids'] for item in batch]
    queries = [item['query'] for item in batch]

    max_length = max(len(ids) for ids in input_ids)
    input_ids = [[tokenizer.pad_token_id] * (max_length - len(ids)) + ids for ids in input_ids]

    input_ids = t.tensor(input_ids)
    return {'input_ids': input_ids, 'queries': queries}
    
def tokenize(sample):
    # sample["input_ids"] = tokenizer.encode(
    #     sample["query"],
    # )

    sample["input_ids"] = tokenizer(
        sample["query"],
        truncation=True,
        max_length=512,
        padding='max_length',
        return_tensors='pt',
    )['input_ids']
    sample["input_ids"] = sample['input_ids'].squeeze(0)
    return sample


In [6]:
# RUN THIS BLOCK IF YOU CHANGE YAML FILE BUT DON'T WANT TO RERUN WHOLE NOTEBOOK

args = 'hyperparams/rlhf.yaml'
with open(
    args
) as f:
    hps = yaml.load(f, Loader=yaml.FullLoader)


In [7]:
# load model
tokenizer, model = utils.load_model(
    hps["model"],
    reward_model=False,
    eval=False,
    quantized=True,
    bnb_config=bnb_config,
)
# tokenizer.padding_side = 'left'

print(tokenizer)

model = trl.AutoModelForCausalLMWithValueHead.from_pretrained(model, load_in_4bit=True, device_map = {"": current_device}, peft_config=hps["peft_config_kwargs"])

# load reward model
reward_model = AutoModelForSequenceClassification.from_pretrained(hps["rm_path"], num_labels=1,torch_dtype=torch.bfloat16,
    load_in_4bit=True)
reward_model = reward_model(reward_model, hps["rm_peft_config"])
reward_model = reward_model.to(t.device("cuda:0")).eval()

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



LlamaTokenizerFast(name_or_path='mistralai/Mistral-7B-Instruct-v0.2', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '</s>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}




In [8]:
# Load and process dataset. Make eval set smaller for speed reasons.
dataset = utils.load_dataset(tokenizer, **hps["dataset"], debug=True)
test_size = min(len(dataset["test"]), 2_000)
dataset["test"] = dataset["test"].shuffle(seed=42).select(range(test_size))

dataset = dataset.rename_column("prompt", "query")
dataset = dataset.map(tokenize, batched=False)
dataset = dataset.remove_columns(["chosen", "rejected"])

print("Dataset size:", len(dataset['train']))

Dataset size: 1000


In [10]:
# To keep debug runs short
hps["debug"] = True
if hps["debug"]:
    hps["training_kwargs"]["max_steps"] = 5

config = PPOConfig(
    model_name="mistralai/Mistral-7B-Instruct-v0.2",
    batch_size=hps["training_kwargs"]["batch_size"],
    gradient_accumulation_steps=hps["training_kwargs"]["gradient_accumulation_steps"],
    mini_batch_size=hps["training_kwargs"]["mini_batch_size"],
    learning_rate=float(hps["training_kwargs"]["learning_rate"]),
    is_peft_model = True
    # log_with = `wandb`,
)

# sent_kwargs = {
#     "return_all_scores": True,
#     "function_to_apply": "none",
#     "batch_size": 4,
# }
t.cuda.empty_cache()

ppo_trainer = PPOTrainer(
    model=model,
    config=config,
    dataset=dataset['train'],
    tokenizer=tokenizer,  
)

dl = ppo_trainer.prepare_dataloader(dataset['train'], data_collator=custom_collate)
num_epochs = 2

generation_kwargs = {
    "min_length": 0.0,
    # "temperature": 0.7,
    "top_k": 1,
    "top_p": 0.9,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
    "max_new_tokens": 100,
}

# ppo_trainer.train(dl, num_epochs = 1)

# wandb.init()

# import torch.nn.functional as F

# # Training loop
# epochs = 10
# accumulation_steps = 4  # Define your accumulation steps
# # optimizer.zero_grad()  # Reset gradients tensors

# for epoch in tqdm(range(epochs), "epoch: "):
#     for i, batch in enumerate(tqdm(ppo_trainer.dataloader)): 
#         t.cuda.empty_cache()
#         query_tensors = t.stack(batch["input_ids"])
        
#         # Manually pad the input on the left side
#         query_tensors = F.pad(query_tensors, (1, 0), value=tokenizer.pad_token_id)
        
#         # Reshape the tensor
#         query_tensors = query_tensors.view(-1)
    
#         # Get response from SFTModel
#         response_tensors = ppo_trainer.generate(query_tensors, **generation_kwargs)
#         batch["response"] = [tokenizer.decode(r.squeeze()) for r in response_tensors]
    
#         # Compute reward score
#         texts = [q + r for q, r in zip(batch["query"], batch["response"])]
        
#         # Convert texts to tensors
#         input_ids = [tokenizer.encode(t, return_tensors='pt') for t in texts]
#         input_ids = t.cat(input_ids)
        
#         # Pass tensors to reward_model
#         pipe_outputs = reward_model(input_ids.to('cuda:0'))
        
#         rewards = [t.tensor(output[1]["score"]) for output in pipe_outputs]
        
#         # Run PPO step
#         stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
        
#         if (i+1) % accumulation_steps == 0:  # Wait for several backward steps
#             optimizer.step()  # Now we can do an optimizer step
#             optimizer.zero_grad()  # Reset gradients tensors
            
#         ppo_trainer.log_stats(stats, batch, rewards)

allocated_memory = t.cuda.memory_allocated()
print(f"memory allocated: {allocated_memory / (2**30)} / ~80 GBs")

In [11]:
# Setting logging
logdir = setup_logging(hps, True)

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmgerov[0m. Use [1m`wandb login --relogin`[0m to force relogin


Hyperparameters:
{'model': 'mistralai/Mistral-7B-Instruct-v0.2', 'calibrated_model_path': 'sileod/deberta-v3-large-tasksource-rlhf-reward-model', 'peft_config_class': <class 'peft.tuners.lora.config.LoraConfig'>, 'peft_config_kwargs': {'r': 16, 'lora_alpha': 16, 'lora_dropout': 0.05}, 'dataset': {'name': 'Anthropic/hh-rlhf', 'data_dir': 'default'}, 'dataset_name': 'instruct', 'training_algorithm': 'rlhf', 'debug': True, 'training_kwargs': {'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'gradient_accumulation_steps': 8, 'steps': 10000, 'learning_rate': 1.41e-05, 'batch_size': 16, 'mini_batch_size': 2, 'max_steps': 5, 'run_name': 'debug/run_32'}, 'tags': ['Anthropic/hh-rlhf', 'training', 'rlhf'], 'logdir': 'data/instruct/training/rlhf/debug/run_32', 'user': 'root'}



In [None]:
epochs = 10
for epoch in tqdm(range(epochs), "epoch: "):
    for batch in tqdm(ppo_trainer.dataloader):
        
        allocated_memory = t.cuda.memory_allocated()
        print(f"memory allocated: {allocated_memory / (2**30)}")

        t.cuda.empty_cache()

        query_tensors = t.stack(batch['input_ids'],1)
        # print(query_tensors.shape)
        query_tensors = [tensor.view(-1) for tensor in query_tensors]
        #### Get response from SFTModel
        response_tensors = ppo_trainer.generate(query_tensors, **generation_kwargs)

        batch["response"] = [
            tokenizer.decode(r.squeeze()) for r in response_tensors
        ]
        print(batch["response"])
        
        #### Compute reward score
        chosen_scores = list(reward_fn(reward_model, tokenizer, batch["query"], batch["response"], device).flatten())
        t.cuda.empty_cache()
        #### Run PPO step
        stats = ppo_trainer.step(query_tensors, response_tensors, chosen_scores)
        ppo_trainer.log_stats(stats, batch, chosen_scores)
        wandb.log(stats)

#### Save model
ppo_trainer.save_pretrained("my_ppo_model")

epoch:   0%|          | 0/10 [00:00<?, ?it/s]
  0%|          | 0/62 [00:00<?, ?it/s][AYou're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


memory allocated: 10.409392833709717



  2%|▏         | 1/62 [00:36<36:43, 36.12s/it][A

memory allocated: 11.402983665466309



  3%|▎         | 2/62 [01:12<36:28, 36.47s/it][A

memory allocated: 11.402983665466309



  5%|▍         | 3/62 [01:58<39:54, 40.59s/it][A

memory allocated: 11.402983665466309



  6%|▋         | 4/62 [02:33<37:12, 38.48s/it][A

memory allocated: 11.402983665466309



  8%|▊         | 5/62 [03:08<35:15, 37.12s/it][A

memory allocated: 11.402983665466309



 10%|▉         | 6/62 [03:43<34:06, 36.54s/it][A

memory allocated: 11.402983665466309



 11%|█▏        | 7/62 [04:21<34:01, 37.11s/it][A

memory allocated: 11.402983665466309



 13%|█▎        | 8/62 [04:56<32:44, 36.37s/it][A

memory allocated: 11.402983665466309



 15%|█▍        | 9/62 [05:31<31:41, 35.88s/it][A

memory allocated: 11.402983665466309



 16%|█▌        | 10/62 [06:08<31:18, 36.12s/it][A

memory allocated: 11.402983665466309



 18%|█▊        | 11/62 [06:53<33:03, 38.88s/it][A

memory allocated: 11.402983665466309



 19%|█▉        | 12/62 [07:28<31:23, 37.66s/it][A

memory allocated: 11.402983665466309



 21%|██        | 13/62 [08:12<32:29, 39.79s/it][A

memory allocated: 11.402983665466309



 23%|██▎       | 14/62 [08:47<30:39, 38.33s/it][A

memory allocated: 11.402983665466309



 24%|██▍       | 15/62 [09:23<29:18, 37.41s/it][A

memory allocated: 11.402983665466309



 26%|██▌       | 16/62 [09:58<28:13, 36.81s/it][A

memory allocated: 11.402983665466309



 27%|██▋       | 17/62 [10:33<27:11, 36.26s/it][A

memory allocated: 11.402983665466309



 29%|██▉       | 18/62 [11:09<26:29, 36.12s/it][A

memory allocated: 11.402983665466309



 31%|███       | 19/62 [11:43<25:31, 35.62s/it][A

memory allocated: 11.402983665466309



 32%|███▏      | 20/62 [12:19<24:56, 35.63s/it][A

memory allocated: 11.402983665466309



 34%|███▍      | 21/62 [12:54<24:07, 35.30s/it][A

memory allocated: 11.402983665466309



 35%|███▌      | 22/62 [13:29<23:33, 35.35s/it][A

memory allocated: 11.402983665466309



 37%|███▋      | 23/62 [14:04<22:50, 35.15s/it][A

memory allocated: 11.402983665466309



 39%|███▊      | 24/62 [14:40<22:26, 35.43s/it][A

memory allocated: 11.402983665466309



 40%|████      | 25/62 [15:15<21:44, 35.26s/it][A

memory allocated: 11.402983665466309



 42%|████▏     | 26/62 [15:50<21:11, 35.31s/it][A

memory allocated: 11.402983665466309



 44%|████▎     | 27/62 [16:25<20:35, 35.30s/it][A

memory allocated: 11.402983665466309



 45%|████▌     | 28/62 [17:05<20:42, 36.54s/it][A

memory allocated: 11.402983665466309



 47%|████▋     | 29/62 [17:44<20:34, 37.41s/it][A

memory allocated: 11.402983665466309



 48%|████▊     | 30/62 [18:29<21:04, 39.51s/it][A

memory allocated: 11.402983665466309



 50%|█████     | 31/62 [19:04<19:49, 38.36s/it][A

memory allocated: 11.402983665466309



 52%|█████▏    | 32/62 [19:40<18:50, 37.69s/it][A

memory allocated: 11.402983665466309



 53%|█████▎    | 33/62 [20:16<17:51, 36.96s/it][A

memory allocated: 11.402983665466309



 55%|█████▍    | 34/62 [20:51<16:59, 36.42s/it][A

memory allocated: 11.402983665466309



 56%|█████▋    | 35/62 [21:28<16:30, 36.68s/it][A

memory allocated: 11.402983665466309



 58%|█████▊    | 36/62 [22:03<15:40, 36.16s/it][A

memory allocated: 11.402983665466309



 60%|█████▉    | 37/62 [22:38<14:55, 35.80s/it][A

memory allocated: 11.402983665466309



 61%|██████▏   | 38/62 [23:14<14:20, 35.84s/it][A

memory allocated: 11.402983665466309



 63%|██████▎   | 39/62 [23:50<13:45, 35.90s/it][A

memory allocated: 11.402983665466309



 65%|██████▍   | 40/62 [24:35<14:09, 38.62s/it][A

memory allocated: 11.402983665466309



 66%|██████▌   | 41/62 [25:10<13:11, 37.70s/it][A

memory allocated: 11.402983665466309



 68%|██████▊   | 42/62 [25:45<12:17, 36.85s/it][A

memory allocated: 11.402983665466309



 69%|██████▉   | 43/62 [26:21<11:35, 36.61s/it][A

memory allocated: 11.402983665466309



 71%|███████   | 44/62 [26:56<10:47, 36.00s/it][A

memory allocated: 11.402983665466309



 73%|███████▎  | 45/62 [27:31<10:04, 35.57s/it][A

memory allocated: 11.402983665466309



 74%|███████▍  | 46/62 [28:16<10:14, 38.40s/it][A

memory allocated: 11.402983665466309



 76%|███████▌  | 47/62 [28:51<09:23, 37.59s/it][A

memory allocated: 11.402983665466309



 77%|███████▋  | 48/62 [29:31<08:54, 38.15s/it][A

memory allocated: 11.402983665466309



 79%|███████▉  | 49/62 [30:10<08:21, 38.57s/it][A

memory allocated: 11.402983665466309



 81%|████████  | 50/62 [30:48<07:38, 38.22s/it][A

memory allocated: 11.402983665466309



 82%|████████▏ | 51/62 [31:26<06:59, 38.16s/it][A

memory allocated: 11.402983665466309



 84%|████████▍ | 52/62 [32:04<06:22, 38.24s/it][A

memory allocated: 11.402983665466309



 85%|████████▌ | 53/62 [32:39<05:34, 37.13s/it][A

memory allocated: 11.402983665466309



 87%|████████▋ | 54/62 [33:13<04:50, 36.32s/it][A

memory allocated: 11.402983665466309



 89%|████████▊ | 55/62 [33:47<04:09, 35.69s/it][A

memory allocated: 11.402983665466309



 90%|█████████ | 56/62 [34:27<03:40, 36.77s/it][A

memory allocated: 11.402983665466309



 92%|█████████▏| 57/62 [35:07<03:09, 37.97s/it][A

memory allocated: 11.402983665466309



 94%|█████████▎| 58/62 [35:52<02:39, 39.96s/it][A

memory allocated: 11.402983665466309



 95%|█████████▌| 59/62 [36:37<02:04, 41.39s/it][A

memory allocated: 11.402983665466309



 97%|█████████▋| 60/62 [37:12<01:18, 39.43s/it][A

memory allocated: 11.402983665466309



 98%|█████████▊| 61/62 [37:50<00:39, 39.25s/it][A

memory allocated: 11.402983665466309



100%|██████████| 62/62 [38:25<00:00, 37.18s/it][A
epoch:  10%|█         | 1/10 [38:25<5:45:47, 2305.25s/it]
  0%|          | 0/62 [00:00<?, ?it/s][A

memory allocated: 11.402983665466309



  2%|▏         | 1/62 [00:35<36:28, 35.88s/it][A

memory allocated: 11.402983665466309



  3%|▎         | 2/62 [01:19<40:43, 40.72s/it][A

memory allocated: 11.402983665466309


In [1]:
assert False

AssertionError: 

# ignore below? 

In [None]:
batch = next(iter(ppo_trainer.dataloader))


In [26]:
len(dataset['train']['input_ids'])

1000

In [17]:
len(batch['queries'])

NameError: name 'batch' is not defined

In [25]:
query_tensors = batch["input_ids"]
# print(query_tensors.shape)
query_tensors = [tensor.view(-1) for tensor in query_tensors]

In [26]:
#### Get response from SFTModel
response_tensors = ppo_trainer.generate(query_tensors, **generation_kwargs)

batch["response"] = [
    tokenizer.decode(r.squeeze()) for r in response_tensors
]

In [27]:
#### Compute reward score
# texts = [q + r for q, r in zip(batch["queries"], batch["response"])]
chosen_scores = list(reward_fn(reward_model, tokenizer, batch["queries"], batch["response"], device).flatten())
# rewards = [t.tensor(output[1]["score"]) for output in pipe_outputs]
print(chosen_scores)

t.cuda.empty_cache()

[tensor(0.4236, device='cuda:0')]


In [10]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Sat May 11 09:36:52 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.161.08             Driver Version: 535.161.08   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          On  | 00000000:CA:00.0 Off |                    0 |
| N/A   37C    P0              68W / 400W |  81013MiB / 81920MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [29]:
#### Run PPO step
stats = ppo_trainer.step(query_tensors, response_tensors, chosen_scores)
ppo_trainer.log_stats(stats, batch, chosen_scores)

In [None]:
chosen_scores = list(reward_fn(reward_model, tokenizer, batch["queries"], batch["response"], device).flatten())

In [None]:
stats = ppo_trainer.step(query_tensors, response_tensors, chosen_scores)

In [None]:
    # I think PPO trainer fine tunes already, so we don't need this
#     peft_config = LoraConfig(
    
#     task_type=TaskType.CAUSAL_LM, inference_mode=False, r=32, lora_alpha=16, lora_dropout=0.1,
# ) # create LoRA config for the finetuning

#     model = get_peft_model(model, peft_config) # create a model ready for LoRA finetuning

#     tokenizer.pad_token = tokenizer.eos_token # need this because tokenizer doesn't have default padding

#     # fine tune!
#     training_args = TrainingArguments(
#         output_dir="./results",
#         num_train_epochs=3,
#         per_device_train_batch_size=1,
#         per_device_eval_batch_size=2,
#         warmup_steps=500,
#         weight_decay=0.01,
#         logging_dir=logdir,
#         logging_steps=10,
#         learning_rate = 1e-3,
#     )

#     trainer = Trainer(
#         model=model,
#         args=training_args,
#         train_dataset=dataset,
#     )
#     trainer.train()