In [1]:
%%capture
!pip install transformers[torch]
!pip install trl
!pip install evaluate
!pip install transformers accelerate
!pip install bitsandbytes==0.44.0
!pip install accelerate --upgrade



In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, BitsAndBytesConfig
from typing import Any, Dict, List, Optional, Union
from dataclasses import dataclass, field
from peft import LoraConfig, TaskType, get_peft_model,PeftModelForSequenceClassification
from transformers.utils import PaddingStrategy
import torch.nn as nn

In [3]:
#reward_model_checkpoint = "prajjwal1/bert-small"
reward_model_checkpoint = "ibm-granite/granite-7b-base"
dataset_checkpoint = "Anthropic/hh-rlhf"
output_dir = "D:\huggingface"

In [4]:
# Load dataset

train_dataset = load_dataset(dataset_checkpoint, split="train")
test_dataset = load_dataset(dataset_checkpoint, split="test")

Downloading readme:   0%|          | 0.00/5.77k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/13.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/16.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/25.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/743k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/875k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [20]:
# Define Tokenizer
tokenizer = AutoTokenizer.from_pretrained(reward_model_checkpoint, truncation=True)
# Set the pad_token to be the same as eos_token
tokenizer.pad_token = tokenizer.eos_token
print(tokenizer.pad_token)

</s>


In [6]:
train_dataset

Dataset({
    features: ['chosen', 'rejected'],
    num_rows: 160800
})

In [7]:
for feature in train_dataset.features:
    print(feature)

chosen
rejected


In [8]:
""" Preprocessing : tokenize the data

Input -> 'chosen' , output ->  'input_ids_chosen' , 'attention_mask_chosen'
Input -> 'rejected' , output ->  'input_ids_rejected' , 'attention_mask_rejected'

So total 4 columns in output dataset and each column to be tokenized


Returns : new HuggingFace Dataset object. map() function will create a new Dataset from the value we return from map().
map() will be applied to each element of original dataset and since the same value is returned for every map() operation,
it becomes easier to append it to the new Dataset object

"""
def preprocess_function(examples_batch):
    new_dataset_schema = {
        "input_ids_chosen" : [],
        "attention_mask_chosen" : [],
        "input_ids_rejected" : [],
        "attention_mask_rejected" : []
    }
    
    # Iterate over columns in Dataset
    for chosen, rejected in zip(examples_batch["chosen"], examples_batch["rejected"]):
        tokenized_chosen = tokenizer(chosen, max_length=512, truncation=True)
        tokenized_rejected = tokenizer(rejected, max_length=512, truncation=True)

        new_dataset_schema["input_ids_chosen"].append(tokenized_chosen["input_ids"])
        new_dataset_schema["attention_mask_chosen"].append(tokenized_chosen["attention_mask"])
        new_dataset_schema["input_ids_rejected"].append(tokenized_rejected["input_ids"])
        new_dataset_schema["attention_mask_rejected"].append(tokenized_rejected["attention_mask"])
    
    return new_dataset_schema
    
final_tokenized_train_dataset = train_dataset.map(
                                    preprocess_function,
                                    batched = True,
                                    remove_columns = train_dataset.column_names
                                )

final_tokenized_test_dataset = test_dataset.map(
                                    preprocess_function,
                                    batched = True,
                                    remove_columns = test_dataset.column_names
                                )

Map:   0%|          | 0/160800 [00:00<?, ? examples/s]

Map:   0%|          | 0/8552 [00:00<?, ? examples/s]

In [9]:
final_tokenized_train_dataset

Dataset({
    features: ['input_ids_chosen', 'attention_mask_chosen', 'input_ids_rejected', 'attention_mask_rejected'],
    num_rows: 160800
})

In [10]:
"""
Define DataCollator. This will pad the batch. Since accepted column needs to be passed to model separately in 1st pass along with attention_mask_accepted
and rejected column needs to be passed in second pass along with attention_mask_rejected, so we separate out these two
i.e we create a separate padding logic just like in previous reward model training.

Input : Batch of examples generated from below Dataset (Unpadded) in a list of dictionaries of type List[Dict[str, Any]].
This batch is generated by HuggingFace Trainer class using PyTorch DataLoader API

[
    {
        "input_ids_chosen" :[],
        "attention_mask_chosen" : [],
        "input_ids_rejected" : [],
        "attention_mask_rejected" : []
    },
    {
        "input_ids_chosen" :[],
        "attention_mask_chosen" : [],
        "input_ids_rejected" : [],
        "attention_mask_rejected" : []
    }, 
    .
    .
    .
    . 
]

[Dataset({
    features: ['input_ids_chosen', 'attention_mask_chosen', 'input_ids_rejected', 'attention_mask_rejected'],
    num_rows: 160800
})] ---> unpadded 

Output : Padded batch

Dataset({
    features: ['input_ids_chosen', 'attention_mask_chosen', 'input_ids_rejected', 'attention_mask_rejected'],
    num_rows: 160800
}) ---> padded 

Since 'chosen' and 'rejected' columns are of different lengths, we need to pad their input_ids and attention_mask separately

"""
@dataclass
class RewardDataCollatorWithPadding:
    tokenizer: tokenizer
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    return_tensors: str = "pt"

    def __call__(self, batch: List[Dict[str, Any]]) -> Dict[str, Any]:
        accepted = [] # we process accepted and rejected separately in 2 forward passes. So we pad them separately too to make process more efficient
        rejected = []

        for example in batch:
            accepted.append(
                {
                    # Here we use "input_ids" as key because this batch will be passed to our tokenizers pad() method
                    # and pad() only expects "input_ids" and "attention_maks" as keys
                    "input_ids" : example["input_ids_chosen"],
                    "attention_mask" : example["attention_mask_chosen"]
                }
            )
            rejected.append(
                {
                    "input_ids" : example["input_ids_rejected"],
                    "attention_mask" : example["attention_mask_rejected"]
                }
            )
        padded_batch_accepeted = tokenizer.pad(
                                accepted,
                                padding=True,
                                return_tensors=self.return_tensors,
                                verbose=True)

        padded_batch_rejected = tokenizer.pad(
                                rejected,
                                padding=True,
                                return_tensors=self.return_tensors,
                                verbose=True)
        
    
        batch = {
            "input_ids_chosen" : padded_batch_accepeted["input_ids"],
            "attention_mask_chosen" : padded_batch_accepeted["attention_mask"],
            "input_ids_rejected" : padded_batch_rejected["input_ids"],
            "attention_mask_rejected" : padded_batch_rejected["attention_mask"],
            "return_loss" : True
        }
        return batch

In [11]:
# # Set up quantization config for 4-bit
# quant_config = BitsAndBytesConfig(
#     load_in_4bit=True,     # Enables 4-bit quantization
#     bnb_4bit_use_double_quant=True,  # Optional: Uses double quantization for further compression
#     bnb_4bit_quant_type='nf4',  # Quantization type: 'nf4' recommended for LLaMA
# )

In [12]:
"""
Since we need to fine tune an existing LLM for our reward model, LoRA would be a good choice here. We will just train
the LoRA layers and keep rest of the model frozen
"""

# LoRA config
peft_config = LoraConfig(
    task_type = TaskType.SEQ_CLS, # based on task specified here, task_type helps set correct head & correct loss fn.
    inference_mode = False,
    r = 4, # Set rank for LoRA matrices
    lora_alpha=32,
    lora_dropout=0.1
)



In [13]:
import bitsandbytes as bnb
print(bnb.__version__)


0.44.0


In [14]:
# Get base reward model from huggingface hub which we will tunr with LoRA
reward_model = AutoModelForSequenceClassification.from_pretrained(reward_model_checkpoint, num_labels = 1, load_in_4bit=True)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at ibm-granite/granite-7b-base and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# We now have peft_config and base model. We can easily get a LoRA model through the method get_peft_model()

peft_reward_model = get_peft_model(reward_model, peft_config)
peft_reward_model.print_trainable_parameters()

trainable params: 2,101,248 || all params: 6,609,448,960 || trainable%: 0.0318


In [16]:
# Lets define custom pairwise loss by overriding compute_loss function of Trainer class

class RewardTrainer(Trainer):
    
    # Input -> a batch coming from our custom DataCollator 
    def compute_loss(self, peft_reward_model, inputs, return_outputs = False):
        
        # R_a reward for forward pass 1 with chosen inputs
        rewards_for_chosen_answers = peft_reward_model(input_ids = inputs["input_ids_chosen"], attention_mask = inputs["attention_mask_chosen"])[0]
        
        # R_b reward for forward pass 2 with rejected inputs
        rewards_for_rejected_answers = peft_reward_model(input_ids = inputs["input_ids_rejected"], attention_mask = inputs["attention_mask_rejected"])[0]
        
        # Pairwise loss calculation for batch with dimenions x * y
        loss = -nn.functional.logsigmoid(rewards_for_chosen_answers - rewards_for_rejected_answers).mean()
        
        return loss

In [21]:
# Now we have our final dataset, data_collator, PEFT model, custom loss function defined. All we need to do it
# create training arguments for our Trainer 

# HuggingFace Trainer API takes TrainingArguments as input (although optional). So lets set config for training

training_args = TrainingArguments(
    output_dir = output_dir,
    learning_rate=1e-6,
    num_train_epochs=3,
    eval_strategy="steps",
    eval_steps=100,
    remove_unused_columns=False,
    label_names=[],
    logging_strategy="steps",
    logging_steps=10,
    per_device_train_batch_size = 1
)

In [22]:
trainer = RewardTrainer(
            model = peft_reward_model,
            args = training_args,
            train_dataset=final_tokenized_train_dataset,
            eval_dataset=final_tokenized_test_dataset,
            data_collator=RewardDataCollatorWithPadding(tokenizer=tokenizer, max_length=128))

In [None]:
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss
