In [1]:
from trl import PPOConfig, PPOTrainer
import utils
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    AutoModel,
    Trainer,
    TrainingArguments,
    BertModel,
    pipeline,
    AutoModelForSequenceClassification,
    BitsAndBytesConfig,
)
import yaml
import getpass
import wandb
from typing import Dict, Any
import torch as t
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
from tqdm import tqdm
import trl
import importlib

device = t.device("cuda" if t.cuda.is_available() else "cpu")

In [2]:
# RUN THIS BLOCK IF YOU CHANGE UTILS BUT DON'T WANT TO RERUN WHOLE NOTEBOOK

importlib.reload(utils)

<module 'utils' from '/workspace/DPO-RLHF_generalization/utils.py'>

In [3]:
def reward_fn(
    model: AutoModel,
    reward_tokenizer: AutoTokenizer,
    prompt_text: list[str],
    response_text: list[str],
    device: str,
) -> list[t.FloatTensor]:
    """Compute the reward for a given response to a prompt.

    Args:
        model (AutoModel): Huggingface model.
        tokenizer (AutoTokenizer): Huggingface tokenizer.
        prompt_text (list[str]): List of strings representing the prompt.
        response_text (list[str]): List of strings representing the response.
        device (str, optional): Device to run the model on. Defaults to 'cpu'.

    Returns:
        list[float]: A list of floats representing the reward.

    """
    with t.no_grad():
        encoding = reward_tokenizer(
            prompt_text,
            response_text,
            truncation=True,
            max_length=512,
            padding='max_length',
            return_tensors='pt',
        )
        encoding = encoding.to(device)

        logits = model(**encoding).logits
        # scores = logits.cpu().numpy().flatten().tolist()

        return logits

def setup_logging(hps: Dict[str, Any], log_wandb):
    # Choose logging and checkpoint saving directory
    logdir = utils.choose_log_dir(
        f"{utils.run_dir}/{hps['dataset_name']}/training/{hps['training_algorithm']}",
        debug=hps["debug"],
    )

    # Add a couple of keys to the hps object and save it as a yaml file
    hps["logdir"] = logdir

    hps["training_kwargs"]["run_name"] = "/".join(logdir.split("/")[-2:])
    hps["user"] = getpass.getuser()
    hps["tags"] += [
        hps["dataset"]["name"],
        "training",
        hps["training_algorithm"],
    ]
    with open(f"{logdir}/hps.yaml", "w") as f:
        yaml.dump(hps, f)

    # If not in debug mode, setup wandb logging
    if not hps["debug"] or log_wandb:
        wandb.init(
            project="dpo_rlhf_generalization",
            dir=logdir,
            name=hps["training_kwargs"]["run_name"],
            config=utils.wandb_configify(hps),
            tags=hps["tags"],
            save_code=True,
            settings=wandb.Settings(code_dir="."),
        )

    print(f"Hyperparameters:\n{hps}\n")
    return logdir

In [4]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=t.bfloat16,
)

In [13]:
def custom_collate(batch):
    input_ids = [item['input_ids'] for item in batch]
    queries = [item['query'] for item in batch]

    max_length = max(len(ids) for ids in input_ids)
    input_ids = [[tokenizer.pad_token_id] * (max_length - len(ids)) + ids for ids in input_ids]

    input_ids = t.tensor(input_ids)
    return {'input_ids': input_ids, 'queries': queries}
    
def tokenize(sample):
    sample["input_ids"] = tokenizer.encode(
        sample["query"],
    )

    # sample = tokenizer(
    #     sample["query"],
    #     truncation=True,
    #     max_length=512,
    #     padding='max_length',
    #     return_tensors='pt',
    # )

    # sample["attention_mask"] = sample["attention_mask"].squeeze(0)
    # sample["input_ids"] = sample['input_ids'].squeeze(0)
    return sample

def collator(data):
    return {key: [d[key] for d in data] for key in data[0]}



In [6]:
# RUN THIS BLOCK IF YOU CHANGE YAML FILE BUT DON'T WANT TO RERUN WHOLE NOTEBOOK

args = 'hyperparams/rlhf.yaml'
with open(
    args
) as f:
    hps = yaml.load(f, Loader=yaml.FullLoader)


In [14]:
# load model
tokenizer, model = utils.load_model(
    hps["model"],
    reward_model=False,
    eval=False,
    quantized=True,
    bnb_config=bnb_config,
)
tokenizer.pad_token = 'p'

model.config.pad_token_id = tokenizer.pad_token_id # model.config.eos_token_id
model.padding_side = "left"
print(tokenizer)

model = trl.AutoModelForCausalLMWithValueHead.from_pretrained(model) #, load_in_4bit=True, peft_config=hps["peft_config_class"](hps["generator_peft_config_kwargs"]))

# load reward model
reward_model = AutoModelForSequenceClassification.from_pretrained(hps["rm_path"], num_labels=1)
reward_model = reward_model.to(device).eval()

reward_tokenizer = AutoTokenizer.from_pretrained(hps["rm_path"])
# reward_model.config.pad_token_id = tokenizer_reward.eos_token_id
reward_tokenizer.pad_token = 'p'
reward_model.config.pad_token_id = reward_tokenizer.pad_token_id


`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



LlamaTokenizerFast(name_or_path='mistralai/Mistral-7B-Instruct-v0.2', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': 'p'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}


In [8]:
# reward_model.config.pad_token_id

reward_tokenizer.pad_token_id

1492

In [15]:
# Load and process dataset. Make eval set smaller for speed reasons.
dataset = utils.load_dataset(tokenizer, **hps["dataset"], debug=True)
test_size = min(len(dataset["test"]), 2_000)
dataset["test"] = dataset["test"].shuffle(seed=42).select(range(test_size))

dataset = dataset.rename_column("prompt", "query")
dataset = dataset.map(tokenize, batched=False)
dataset = dataset.remove_columns(["chosen", "rejected"])

print("Dataset size:", len(dataset['train']))

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset size: 1000


In [37]:
# To keep debug runs short
hps["debug"] = True
# if hps["debug"]:
#     hps["training_kwargs"]["max_steps"] = 5

config = PPOConfig(
    model_name="mistralai/Mistral-7B-Instruct-v0.2",
    # **hps["training_kwargs"]
    batch_size=hps["training_kwargs"]["batch_size"],
    gradient_accumulation_steps=hps["training_kwargs"]["gradient_accumulation_steps"],
    mini_batch_size=hps["training_kwargs"]["mini_batch_size"],
    learning_rate=float(hps["training_kwargs"]["learning_rate"]),
    log_with="wandb", 
    is_peft_model = True,
)

# sent_kwargs = {
#     "return_all_scores": True,
#     "function_to_apply": "none",
#     "batch_size": 4,
# }
t.cuda.empty_cache()

ppo_trainer = PPOTrainer(
    model=model,
    config=config,
    dataset=dataset['train'],
    tokenizer=tokenizer,  
    data_collator=collator,
)

# dl = ppo_trainer.prepare_dataloader(dataset['train'], data_collator=custom_collate)
# num_epochs = 2

generation_kwargs = {
    "min_length": 100,
    # "temperature": 0.7,
    "top_k": 0,
    "top_p": .9,
    "do_sample": True,
    "pad_token_id": tokenizer.pad_token_id, # tokenizer.eos_token_id,
    "max_new_tokens": 400,
}

# ppo_trainer.train(dl, num_epochs = 1)



VBox(children=(Label(value='0.007 MB of 0.007 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112699978467491, max=1.0…

In [36]:
tokenizer.decode

AttributeError: 'LlamaTokenizerFast' object has no attribute 'start_token'

In [35]:
tokenizer.decode(inputs[0])

'<s><s> [INST] Should you make the kids eat everything on their plate? [/INST]Hm, this is really a value judgment, and one with many different answers.  It’s going to depend a lot on the culture and traditions in your community.  In the US it’s fairly standard to give kids lots of options at meals, and for them to decide what they’re going to eat, although a number of people believe that kids should eat everything put in front of them.  \n\nYou could look to see what value judgment is preferred by experts in nutrition or pediatric medicine, but you could also just ask around your local community, or your family, or even just do your own informal test.  For example, would you like me to try to paraphrase your question?</s> [INST] I was wondered that. Should I serve them a certain amount and tell them to eat it all or just let them eat whatever they want? [/INST]'

allocated_memory = t.cuda.memory_allocated()
print(f"memory allocated: {allocated_memory / (2**30)} / ~80 GBs")

In [12]:
# Setting logging
# logdir = setup_logging(hps, True)

In [38]:
epochs = 10
for epoch in tqdm(range(epochs), "epoch: "):
    for batch in tqdm(ppo_trainer.dataloader):
        allocated_memory = t.cuda.memory_allocated()
        print(f"memory allocated: {allocated_memory / (2**30)}")

        t.cuda.empty_cache()
        inputs = [t.tensor(sublist) for sublist in batch['input_ids']]
        # query_tensors = t.stack(batch['input_ids'],1)
        # attention_mask = t.stack(batch['attention_mask'],1)

        # print(query_tensors.shape)
        # query_tensors = [tensor.view(-1) for tensor in query_tensors]
        # attention_mask = [tensor.view(-1) for tensor in attention_mask]
        # inputs={
        #     "input_ids": query_tensors,
        #     "attention_mask": attention_mask
        # }
        #### Get response from SFTModel
    
        response_tensors = ppo_trainer.generate(inputs, **generation_kwargs)
        # print(query_tensors)
        batch["response"] = [
            tokenizer.decode(r.squeeze()) for r in response_tensors
        ]
        # print(batch['input_ids'])
        # print(batch['response'])
        #### Compute reward score
        # responses = []
        # for response in batch['response']:
        #     # print(response)
        #     responses += response.split('[/INST]')[1:]
        #     # ''.join(str(x) for x in response.split('[/INST]')[1:])
        #     # print(score)

        allocated_memory = t.cuda.memory_allocated()
        print(f"memory allocated: {allocated_memory / (2**30)}")

        for (query, response) in zip(batch['query'],  batch['response']):
            print('QUERY: ' + query)
            print('RESPONSE: ' + response)
            # print('SCORES: ' + str(score))
            print("\n\n")
        
        chosen_scores = list(reward_fn(reward_model, reward_tokenizer, batch["query"],  batch['response'], device).flatten())
        t.cuda.empty_cache()
        #### Run PPO step

        for (i, response) in enumerate(response_tensors):
            if len(response) == 1:
                chosen_scores[i] -= 5

        # for (query, response, score) in zip(batch['query'],  batch['response'], chosen_scores):
        #     print('QUERY: ' + query)
        #     print('RESPONSE: ' + response)
        #     print('SCORES: ' + str(score))
        #     print("\n\n")
        # print(chosen_scores)
        
        stats = ppo_trainer.step(query_tensors, response_tensors, chosen_scores)
        ppo_trainer.log_stats(stats, batch, chosen_scores)

        wandb.log(stats)

#### Save model
ppo_trainer.save_pretrained("my_ppo_model")

epoch:   0%|          | 0/10 [00:00<?, ?it/s]
  0%|          | 0/62 [00:00<?, ?it/s][A

memory allocated: 19.308135986328125
memory allocated: 19.308152675628662
QUERY: <s>[INST] Should you make the kids eat everything on their plate? [/INST]Hm, this is really a value judgment, and one with many different answers.  It’s going to depend a lot on the culture and traditions in your community.  In the US it’s fairly standard to give kids lots of options at meals, and for them to decide what they’re going to eat, although a number of people believe that kids should eat everything put in front of them.  

You could look to see what value judgment is preferred by experts in nutrition or pediatric medicine, but you could also just ask around your local community, or your family, or even just do your own informal test.  For example, would you like me to try to paraphrase your question?</s>[INST] I was wondered that. Should I serve them a certain amount and tell them to eat it all or just let them eat whatever they want? [/INST]
RESPONSE: <s><s> [INST] Should you make the kids eat 

  0%|          | 0/62 [01:14<?, ?it/s]
epoch:   0%|          | 0/10 [01:14<?, ?it/s]


NameError: name 'query_tensors' is not defined

In [46]:
for response in batch['response']:
    # print(response)
    
    # ''.join(str(x) for x in xs)
    responses += ''.join(str(x) for x in response.split('[/INST]')[1:])

print(*responses, sep='\n\n')

There are many solutions to your problem. One option is to start saving money, by opening an IRA, or an Individual Retirement Account. If you do this, you should contribute the maximum allowable each year (around $6,000 this year), as that gives you a powerful boost in savings. This option also allows you to put your contributions into mutual funds, rather than just having them sit in a cash-like account. Finally, you can make the contributions "automatic" by using an "automatic-enrollment" IRA, and that could help make sure you stay motivated.</s>

A baby’s motor skills develop very gradually. By the time they reach 2 months, they’ll have become more dexterous and their movements more fluid. They’ll also be ready to develop object constancy and have a sense of spatial memory. Object constancy is the ability to keep track of items or objects and be able to identify them.</s>

 To successfully steal or clear your browsing or cache somewhat somewhat somewhatlymagn){copy("[pattern=%patter

In [25]:
b = next(iter(ppo_trainer.dataloader))

RuntimeError: each element in list of batch should be of equal size

In [21]:
b.keys()

dict_keys(['query', 'input_ids'])

In [13]:
t = t.tensor([    2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
            2,     1,     1,   733, 16289, 28793,   315,  1073, 18354, 23439,
          403,   264,  1215, 21712, 26458, 28723,   330, 11036,   418,  3000,
        22516,   349,   264,  3468, 10032,  4034, 28723,  2418,   368,  1912,
          528,   264,  1664,   302,   516,  1598, 14121, 28804,   733, 28748,
        16289, 28793])

In [16]:
tokenizer.decode(t)

'</s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s

# ignore below? 

In [None]:
batch = next(iter(ppo_trainer.dataloader))


In [None]:
len(dataset['train']['input_ids'])

In [None]:
len(batch['queries'])

In [None]:
query_tensors = batch["input_ids"]
# print(query_tensors.shape)
query_tensors = [tensor.view(-1) for tensor in query_tensors]

In [None]:
#### Get response from SFTModel
response_tensors = ppo_trainer.generate(query_tensors, **generation_kwargs)

batch["response"] = [
    tokenizer.decode(r.squeeze()) for r in response_tensors
]

In [None]:
#### Compute reward score
# texts = [q + r for q, r in zip(batch["queries"], batch["response"])]
chosen_scores = list(reward_fn(reward_model, tokenizer, batch["queries"], batch["response"], device).flatten())
# rewards = [t.tensor(output[1]["score"]) for output in pipe_outputs]
print(chosen_scores)

t.cuda.empty_cache()

In [None]:
!nvidia-smi

In [None]:
#### Run PPO step
stats = ppo_trainer.step(query_tensors, response_tensors, chosen_scores)
ppo_trainer.log_stats(stats, batch, chosen_scores)

In [None]:
chosen_scores = list(reward_fn(reward_model, tokenizer, batch["queries"], batch["response"], device).flatten())

In [None]:
stats = ppo_trainer.step(query_tensors, response_tensors, chosen_scores)

In [None]:
    # I think PPO trainer fine tunes already, so we don't need this
#     peft_config = LoraConfig(
    
#     task_type=TaskType.CAUSAL_LM, inference_mode=False, r=32, lora_alpha=16, lora_dropout=0.1,
# ) # create LoRA config for the finetuning

#     model = get_peft_model(model, peft_config) # create a model ready for LoRA finetuning

#     tokenizer.pad_token = tokenizer.eos_token # need this because tokenizer doesn't have default padding

#     # fine tune!
#     training_args = TrainingArguments(
#         output_dir="./results",
#         num_train_epochs=3,
#         per_device_train_batch_size=1,
#         per_device_eval_batch_size=2,
#         warmup_steps=500,
#         weight_decay=0.01,
#         logging_dir=logdir,
#         logging_steps=10,
#         learning_rate = 1e-3,
#     )

#     trainer = Trainer(
#         model=model,
#         args=training_args,
#         train_dataset=dataset,
#     )
#     trainer.train()