In [1]:
%%capture
!pip install transformers[torch]
!pip install trl
!pip install evaluate


In [2]:

pip install --upgrade transformers accelerate

Note: you may need to restart the kernel to use updated packages.


In [3]:

from typing import Any, Dict, List, Optional, Union
from dataclasses import dataclass, field
import torch
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments
)
from peft import LoraConfig, TaskType, get_peft_model,PeftModelForSequenceClassification
from transformers.utils import PaddingStrategy
import torch.nn as nn

In [4]:
%%capture
# Install the `datasets` library pinned to version 2.16.1; %%capture hides pip's output.
!pip install datasets==2.16.1

In [5]:
from datasets import load_dataset

In [6]:
# Load the paired StackExchange preference dataset by its Hub identifier.
# Alternative kept for reference: load from a local copy and skip verification checks.
#dataset = load_dataset('lvwerra/stack-exchange-paired',  data_dir='/Users/anantvirsingh/Desktop/huggingface_datasets/rlhf_stackexchange_paired/', verification_mode="no_checks")
dataset = load_dataset("lvwerra/stack-exchange-paired")



Resolving data files:   0%|          | 0/72 [00:00<?, ?it/s]

In [7]:
# Display the loaded DatasetDict: its splits, column names and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['qid', 'question', 'date', 'metadata', 'response_j', 'response_k'],
        num_rows: 26801833
    })
    test: Dataset({
        features: ['qid', 'question', 'date', 'metadata', 'response_j', 'response_k'],
        num_rows: 4483004
    })
})

In [8]:
# Peek at the first three training rows (returned as a dict of column name -> list of values).
dataset["train"][:3]

{'qid': [6073935, 6073935, 6073935],
 'question': ["I have installed the Java 3D API on PC via the exe installer, which simply created a new directory with `j3dcore.jar`, `vecmath.jar`, `j3dutils.jar` in a lib sub-directory and `j3dcore-ogl.dll` in a bin sub-directory.\n\nNetbeans had no issues and my code compiled and executed smoothly, however once I built my project and tried to run it from the command prompt I got an `UnsatisfiedLinkError` saying that `no j3dcore-ogl in java.library.path`. \n\nGoogle came to the rescue and gave me 3 viable solutions:\n\n* by copying the dll file into my JRE's bin directory\n* by adding the path of the dll file to the library path (`java -Djava.library.path=dllpath`)\n* load the dll in the program with `System.load()` (I couldn't get this one to work, actually)\n\nMy question is: Is there an elegant solution to this problem, that I missed? \n\nIt seems tedious that for each different PC someone would like to use this program on, he'd have to either 

In [9]:
# Convenience aliases for the two splits of the loaded DatasetDict.
train_dataset, eval_dataset = dataset["train"], dataset["test"]

In [10]:
import os

# Run configuration for the reward-model experiment.
# NOTE(review): the TrainingArguments cell currently hard-codes its own
# learning rate and epoch count instead of reading the values below — confirm
# which ones are intended.

# Presumably the name under which the trained adapter is stored — TODO confirm.
output_model_name = "reward_model_peft_lora"

# Directory passed to TrainingArguments(output_dir=...). The original
# machine-specific path stays the default; set REWARD_MODEL_OUTPUT_DIR to run
# the notebook elsewhere without editing this cell.
output_dir = os.environ.get(
    "REWARD_MODEL_OUTPUT_DIR",
    "/Users/anantvirsingh/Desktop/huggingface/transformers/output_models/",
)

# Stored as a float (it used to be the string "1e-5") so it can be handed to an
# optimizer or TrainingArguments without conversion.
learning_rate = 1e-5
train_epochs = 10

# Base checkpoints for the reward model and the RL policy model.
reward_model_checkpoint = "gpt2"
rl_model_checkpoint = "gpt2"

# Keep a token that is already exported in the environment; only fall back to
# an empty value when none is set (a plain assignment would erase a real token).
os.environ.setdefault("HUGGINGFACE_TOKEN", "")

In [11]:

# Settings handed to the HuggingFace Trainer (the argument is optional there,
# but every value is spelled out explicitly for this run).
training_args = TrainingArguments(
    # Destination for checkpoints.
    output_dir=output_dir,
    # Optimisation settings.
    num_train_epochs=3,
    learning_rate=1e-5,
    bf16=True,
    # Step-based cadence: evaluate every 100 steps, checkpoint and log every 10.
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=10,
    logging_strategy="steps",
    logging_steps=10,
    # Keep the custom *_j / *_k columns in each batch and declare no label columns.
    remove_unused_columns=False,
    label_names=[],
)

In [12]:
# Load the tokenizer that belongs to the reward-model checkpoint.
# `token` is the current name of the deprecated `use_auth_token` argument; an
# empty HUGGINGFACE_TOKEN is turned into None so that no blank credential is sent.
tokenizer = AutoTokenizer.from_pretrained(
    reward_model_checkpoint,
    token=os.getenv("HUGGINGFACE_TOKEN") or None,
)
# Use the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token



In [13]:
# LoRA hyper-parameters used to build the PEFT reward model.
peft_config = LoraConfig(
    # The task type lets PEFT pick the matching head and loss function.
    task_type=TaskType.SEQ_CLS,
    # The adapter is going to be trained, not merely used for inference.
    inference_mode=False,
    # Rank of the LoRA matrices.
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)

In [14]:
# Build the reward model from the checkpoint. num_labels=1 because the reward
# model must output a single scalar score per sequence.
# `token` is the current name of the deprecated `use_auth_token` argument; an
# empty HUGGINGFACE_TOKEN is turned into None so that no blank credential is sent.
reward_model = AutoModelForSequenceClassification.from_pretrained(
    reward_model_checkpoint,
    num_labels=1,
    token=os.getenv("HUGGINGFACE_TOKEN") or None,
)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# With the LoRA config and the base reward model both defined, wrap the model
# in PEFT's sequence-classification class to obtain the PEFT reward model.
# The helper form would be get_peft_model(reward_model, peft_config) — model
# first, config second.
reward_model_peft = PeftModelForSequenceClassification(reward_model, peft_config)
# Print how many parameters are trainable compared with the total.
reward_model_peft.print_trainable_parameters()

trainable params: 295,680 || all params: 124,736,256 || trainable%: 0.2370




In [16]:
# GPT-2 has no official padding token, so the end-of-sequence token is used
# as padding by both the model config and the tokenizer.
reward_model_peft.config.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

In [17]:
"""
Input : input_data_batch i.e batch of Dataset :
        Dataset({
                features: ['qid', 'question', 'date', 'metadata', 'response_j', 'response_k'],
                num_rows: 26801833
            })
For each question + response, we combine them together as 
(
    Question : input_sample["question"]
    
    Answer : input_sample["response"]
)
and we create 2 columns

Column 1 : (
    Question : input_sample["question"]
    
    Answer : input_sample["response_j"] # This is the response ranked better by human annotators
)
Column 2 : (
    Question : input_sample["question"]
    
    Answer : input_sample["response_k"] # This is the response ranked lower by human annotators
)
Each column will be a large string and hence will be tokenized. So the tokenizer will return 'input_ids' for that column. 
Along with that, tokenizer will also return attention_mask.

So in total we will return 4 columns
Column 1 : tokenized_j
Column 2 : attention_mask_j
Column 3 : tokenized_k
Column 4 : attention_mask_k
"""
def preprocess_function(input_data_batch):
    """Tokenize a batch of (question, response_j, response_k) examples.

    Each example yields two prompts of the form
    `` Question : <question>\\n \\n Answer : <response>`` — one built with
    ``response_j`` and one with ``response_k`` — and both are passed through
    the module-level ``tokenizer`` with truncation to 512 tokens.

    Args:
        input_data_batch: mapping with the keys "question", "response_j" and
            "response_k", each holding a list of strings of equal length.

    Returns:
        Dict with four parallel lists: "input_ids_j", "attention_mask_j",
        "input_ids_k" and "attention_mask_k". When used with Dataset.map(),
        these become the columns of the resulting dataset.
    """

    def _encode(question_text, answer_text):
        # Shared prompt template for both responses.
        prompt = " Question : " + question_text + "\n \n Answer : " + answer_text
        return tokenizer(prompt, truncation=True, max_length=512)

    column_names = ("input_ids_j", "attention_mask_j", "input_ids_k", "attention_mask_k")
    new_data_dict = {column: [] for column in column_names}

    examples = zip(
        input_data_batch["question"],
        input_data_batch["response_j"],
        input_data_batch["response_k"],
    )
    for question, answer_j, answer_k in examples:
        encoded_j = _encode(question, answer_j)
        encoded_k = _encode(question, answer_k)
        for suffix, encoded in (("j", encoded_j), ("k", encoded_k)):
            new_data_dict["input_ids_" + suffix].append(encoded["input_ids"])
            new_data_dict["attention_mask_" + suffix].append(encoded["attention_mask"])

    return new_data_dict
    

In [18]:
# Run preprocess_function (batched) over the first 1000 rows of each split.
# The original text columns are dropped, so only the numeric columns
# (input_ids / attention_mask for both responses) remain.
final_train_dataset, final_eval_dataset = [
    dataset[split_name].select(range(1000)).map(
        preprocess_function,
        batched=True,
        remove_columns=dataset[split_name].column_names,
    )
    for split_name in ("train", "test")
]

In [19]:
#output = reward_model_peft(input_ids = final_train_dataset["input_ids_chosen"], attention_mask = final_train_dataset["attention_mask_chosen"])

In [20]:
# Inspect the tokenized training split (column names and row count).
final_train_dataset

Dataset({
    features: ['input_ids_j', 'attention_mask_j', 'input_ids_k', 'attention_mask_k'],
    num_rows: 1000
})

In [21]:
@dataclass
class RewardDataCollatorWithPadding:
    """Collate paired (j / k) tokenized examples into one padded batch.

    Every incoming feature dict must carry "input_ids_j", "attention_mask_j",
    "input_ids_k" and "attention_mask_k". The j and k sequences are padded
    independently through ``tokenizer.pad`` and returned under the same four
    keys, together with ``"return_loss": True``.

    Attributes are forwarded unchanged to ``tokenizer.pad``:
        tokenizer: object exposing a ``pad(...)`` method (the tokenizer used
            to encode the examples).
        padding, max_length, pad_to_multiple_of, return_tensors: passed
            straight through as keyword arguments of the same name.
    """

    # The field used to be annotated with the global tokenizer *instance*,
    # which is not a type and made the class definition depend on that global.
    tokenizer: Any
    # Quoted so the annotation is not evaluated when the class is defined.
    padding: "Union[bool, str, PaddingStrategy]" = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    return_tensors: str = "pt"

    def _pad_side(self, features: List[Dict[str, Any]], suffix: str) -> Dict[str, Any]:
        """Pad the sequences of one side ("j" or "k") of every example."""
        side_features = [
            {
                "input_ids": feature["input_ids_" + suffix],
                "attention_mask": feature["attention_mask_" + suffix],
            }
            for feature in features
        ]
        return self.tokenizer.pad(
            side_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors=self.return_tensors,
        )

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Return the padded batch for a list of paired feature dicts."""
        batch_j = self._pad_side(features, "j")
        batch_k = self._pad_side(features, "k")
        return {
            "input_ids_j": batch_j["input_ids"],
            "attention_mask_j": batch_j["attention_mask"],
            "input_ids_k": batch_k["input_ids"],
            "attention_mask_k": batch_k["attention_mask"],
            # Included in every batch alongside the tensors.
            "return_loss": True,
        }


In [22]:
# Metric used for validation: load the "accuracy" metric through the `evaluate` library.
import evaluate
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute how often the chosen answer receives the higher reward.

    Args:
        eval_pred: pair ``(predictions, labels)``. ``predictions`` holds the
            rewards of the chosen answers (j) followed by the rewards of the
            rejected answers (k), stacked along axis 0; ``labels`` is ignored.

    Returns:
        The result of ``accuracy.compute``, where a prediction counts as
        correct when index 0 (the chosen answer) holds the larger reward.
    """
    # Imported here because numpy is not imported anywhere else in the notebook.
    import numpy as np

    predictions, _ = eval_pred
    # argmax over the (j, k) axis: 0 means rewards_j was at least as large.
    predictions = np.argmax(predictions, axis=0)
    # Flatten any trailing singleton dimension so the metric gets 1-D arrays.
    predictions = np.ravel(predictions)
    # The chosen answer always sits at index 0, so the reference is all zeros.
    labels = np.zeros(predictions.shape)
    return accuracy.compute(predictions=predictions, references=labels)

In [23]:
# Since we are using a custom pairwise loss as per : 
# 1. https://huyenchip.com/2023/05/02/rlhf.html#phase_3_rlhf
# 2. https://huggingface.co/blog/stackllama
# We override the compute_loss() function of Trainer class
class RewardTrainer(Trainer):
    """Trainer that optimises a pairwise ranking loss between two answers."""

    def compute_loss(self, reward_model_peft, inputs, return_outputs = False, num_items_in_batch = None):
        """Return the pairwise loss ``-logsigmoid(r_chosen - r_rejected).mean()``.

        Args:
            reward_model_peft: model being trained; called once per answer side.
            inputs: batch holding "input_ids_j" / "attention_mask_j" (chosen
                answer) and "input_ids_k" / "attention_mask_k" (rejected answer).
            return_outputs: when True, also return the two reward tensors.
                The Trainer's evaluation step requests this and unpacks a
                ``(loss, outputs)`` pair, so it must be honoured.
            num_items_in_batch: accepted because recent Trainer versions pass
                it to compute_loss; not used by this loss.

        Returns:
            ``loss`` alone, or ``(loss, {"rewards_j": ..., "rewards_k": ...})``
            when ``return_outputs`` is True.
        """
        # Forward pass with batch j i.e chosen answer
        rewards_for_chosen_answers = reward_model_peft(
            input_ids = inputs["input_ids_j"], attention_mask = inputs["attention_mask_j"]
        )[0]

        # Forward pass with batch k i.e rejected answer
        rewards_for_rejected_answers = reward_model_peft(
            input_ids = inputs["input_ids_k"], attention_mask = inputs["attention_mask_k"]
        )[0]

        # Pairwise loss: push the chosen reward above the rejected reward.
        loss = -nn.functional.logsigmoid(rewards_for_chosen_answers - rewards_for_rejected_answers).mean()

        if return_outputs:
            # These become the "predictions" handed to compute_metrics.
            return loss, {
                "rewards_j": rewards_for_chosen_answers,
                "rewards_k": rewards_for_rejected_answers,
            }
        return loss

In [24]:
# RewardTrainer is used exactly like the stock Trainer: give it the model, the
# training arguments, the datasets, the metric function and a data collator.
reward_collator = RewardDataCollatorWithPadding(tokenizer=tokenizer, max_length=256)

trainer = RewardTrainer(
    model=reward_model_peft,
    args=training_args,
    data_collator=reward_collator,
    train_dataset=final_train_dataset,
    eval_dataset=final_eval_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Start fine-tuning the reward model with the trainer configured above.
trainer.train()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss


