## Optimizing the finetuned custom GPT2 using Reinforcement Learning from Human Feedback (RLHF) 

Instead of human feedback as a reward mechanism, we use a generation evaluation metric like BERTScore here.

##### Prerequisite

In [None]:
!pip install jupyter==1.0.0
!pip install ipywidgets==8.0.4
!pip install transformers==4.26.0
!pip install datasets==2.9.0
!pip install wandb==0.13.9
!pip install -e git+https://arunprsh:43211b1b75fad82266961eff3b85a061b53daae5@github.com/lvwerra/trl.git@v0.2.1#egg=trl

#### Imports 

In [2]:
from trl import AutoModelForCausalLMWithValueHead
from transformers import GPT2Tokenizer
from transformers import set_seed
from datasets import load_dataset
import matplotlib.pyplot as plt
from datasets import Dataset
from random import choices
from trl import PPOTrainer
from trl import PPOConfig
from tqdm import tqdm
import transformers 
import pandas as pd
import numpy as np
import ipywidgets
import datasets
import logging
import jupyter
import random
import torch
import wandb
import time
import trl
import os

  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)
  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)


In [3]:
from transformers import pipeline, AutoTokenizer
from datasets import load_dataset

from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead
from trl.core import LengthSampler

##### Setup logging

In [4]:
logger = logging.getLogger('sagemaker')
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler())

##### Log versions of dependencies 

In [5]:
logger.info(f'[Using transformers version: {transformers.__version__}]')
logger.info(f'[Using datasets version: {datasets.__version__}]')
logger.info(f'[Using wandb version: {wandb.__version__}]')
logger.info(f'[Using trl version: {trl.__version__}]')

[Using transformers version: 4.26.0]
[Using datasets version: 2.9.0]
[Using wandb version: 0.13.9]
[Using trl version: 0.2.1]


#### Setup essentials 

In [6]:
np.random.seed(123)
tqdm.pandas()
set_seed(123)

In [7]:
!wandb login 8489739d838b89d2f424147f354f9db40517c1c9

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [8]:
path = os.path.abspath('01-rlhf-Copy1.ipynb')
os.environ['WANDB_NOTEBOOK_NAME'] = path

##### Set constants 

In [9]:
config = PPOConfig(
    model_name="lvwerra/gpt2-imdb",
    learning_rate=1.41e-5,
    log_with="wandb",
)

sent_kwargs = {
    "return_all_scores": True,
    "function_to_apply": "none",
    "batch_size": config.forward_batch_size
}

In [32]:
def build_dataset(config, dataset_name="imdb", input_min_text_length=2, input_max_text_length=8):
    """
    Build dataset for training. This builds the dataset from `load_dataset`, one should 
    customize this function to train the model on its own dataset.
    
    Args:
        dataset_name (`str`): 
            The name of the dataset to be loaded.
    
    Returns:
        dataloader (`torch.utils.data.DataLoader`):
            The dataloader for the dataset.
    """
    tokenizer = AutoTokenizer.from_pretrained(config.model_name)
    tokenizer.pad_token = tokenizer.eos_token
    # load imdb with datasets
    ds = load_dataset(dataset_name, split='train[:2%]')

    ds = ds.rename_columns({'text': 'review'})
    ds = ds.filter(lambda x: len(x["review"])>200, batched=False)

    input_size = LengthSampler(input_min_text_length, input_max_text_length)

    def tokenize(sample):
        sample["input_ids"] = tokenizer.encode(sample["review"])[:input_size()]
        sample["query"] = tokenizer.decode(sample["input_ids"])
        return sample

    ds = ds.map(tokenize, batched=False)
    ds.set_format(type='torch')
    return ds

In [33]:
dataset = build_dataset(config)
dataset



  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/495 [00:00<?, ?ex/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1168 > 1024). Running this sequence through the model will result in indexing errors


Dataset({
    features: ['review', 'label', 'input_ids', 'query'],
    num_rows: 495
})

In [34]:
dataset[0]

{'review': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far 

In [35]:
def collator(data):
    return dict((key, [d[key] for d in data]) for key in data[0])

In [36]:
collator(dataset).keys()

dict_keys(['review', 'label', 'input_ids', 'query'])

In [37]:
coll = collator(dataset)

In [38]:
len(coll['label'])

495

In [39]:
model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name)
ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name)
tokenizer = AutoTokenizer.from_pretrained(config.model_name)

tokenizer.pad_token = tokenizer.eos_token

In [40]:
tokenizer

GPT2TokenizerFast(name_or_path='lvwerra/gpt2-imdb', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'})

In [41]:
ppo_trainer = PPOTrainer(config, model, ref_model, tokenizer, dataset=dataset, data_collator=collator)

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016668582783313467, max=1.0…

In [42]:
sentiment_pipe = pipeline("sentiment-analysis", model="lvwerra/distilbert-imdb")
sentiment_pipe

<transformers.pipelines.text_classification.TextClassificationPipeline at 0x7f4d72b9fbb0>

In [43]:
text = 'this movie was really bad!!'
sentiment_pipe(text, **sent_kwargs)



[[{'label': 'NEGATIVE', 'score': 2.335048198699951},
  {'label': 'POSITIVE', 'score': -2.726576089859009}]]

In [44]:
output_min_length = 4
output_max_length = 16
output_length_sampler = LengthSampler(output_min_length, output_max_length)


generation_kwargs = {
    "min_length":-1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id
}

In [45]:
for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)):
    print(epoch)

2it [00:00, 17.89it/s]

0
1





In [46]:
for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)):
    query_tensors = batch['input_ids']
    print('QT', query_tensors)

    #### Get response from gpt2
    response_tensors = []
    for query in query_tensors:
        gen_len = output_length_sampler()
        generation_kwargs["max_new_tokens"] = gen_len
        response = ppo_trainer.generate(query, **generation_kwargs)
        response_tensors.append(response.squeeze()[-gen_len:])
        
    print('RT', response_tensors)
    batch['response'] = [tokenizer.decode(r.squeeze()) for r in response_tensors]
    
    print('Res', [tokenizer.decode(r.squeeze()) for r in response_tensors])
    
    
    

    #### Compute sentiment score
    texts = [q + r for q,r in zip(batch['query'], batch['response'])]
    print('Text', texts)
    for text in texts:
        print('Text--->', text)
        print()
    
    pipe_outputs = sentiment_pipe(texts, **sent_kwargs)
    print('Pipe_output', pipe_outputs)
    rewards = [torch.tensor(output[1]["score"]) for output in pipe_outputs]
    print('Rewards', rewards)
    
    #### Run PPO step 
    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
    print('Stats', stats)
    print()
    print()
    print('Batch', batch)
    ppo_trainer.log_stats(stats, batch, rewards)
    

0it [00:00, ?it/s]

QT [tensor([ 39, 603]), tensor([37603,  2431,   286]), tensor([14385,    11,  2921,   340,   257,   352]), tensor([ 1639,   423,   284, 21099,  8114,  1632]), tensor([1858,  389, 2089, 6918,   11, 7818, 6918]), tensor([1212, 3807,  468]), tensor([  32, 1256]), tensor([ 3198,   286,   262,   749, 23374,  7328]), tensor([  40, 2048, 1444]), tensor([   40,   373,   523, 13321,  6568]), tensor([ 2782, 43011,    11,   314,  1064,   978,  6319]), tensor([ 3666,  1393,   287, 40349,  4285]), tensor([1212,  318,  340]), tensor([8332,  257]), tensor([15496,    13,   314,   716,  3362,   371]), tensor([  464, 22732, 11102]), tensor([30513,   706]), tensor([9693, 7036, 3913,  402, 3861, 6089]), tensor([1890,  262, 1700,   11,  428, 2646,  318]), tensor([  40, 6209]), tensor([  40,  550, 1239]), tensor([  40, 6497,  510]), tensor([ 818,  281, 2230,  284]), tensor([2061,  373,  281]), tensor([1212, 2646,  318]), tensor([2504,  338,  407]), tensor([19156,  8121]), tensor([2396,   11,  810]), tensor(

1it [07:17, 437.77s/it]

QT [tensor([   40,   651,   284,   262, 22041]), tensor([24102,   257,  9082,   286]), tensor([22017,    11,   262]), tensor([  40, 2497]), tensor([   32,  1787,   532, 36741,   611]), tensor([1212, 2646,  318,  655]), tensor([28065,   373,  3668,  3596,   357]), tensor([  464, 48177, 29512,  2196,   286,  5199, 26431]), tensor([1212,  318,  257, 3807]), tensor([ 1532,   345,  1053,  1683,   587, 26775]), tensor([ 1212,  3807, 25669]), tensor([22017,    11,   257,  3807,   546, 19170,  4819]), tensor([1212,  460]), tensor([986, 482, 323]), tensor([ 464, 7235]), tensor([1212,  983,  373,  925,  416]), tensor([  16, 1374,  318,  340,  326]), tensor([ 464, 5290, 3807,  314,  423, 1775,  287]), tensor([    7,  4303, 49713]), tensor([1212,  468,  284,  307,  262]), tensor([ 464, 3195]), tensor([    1,   818, 25325,    11, 46675, 23343]), tensor([1212,  318, 1107,  257,  649, 1877]), tensor([1212,  318,  281, 1242, 2646,  326]), tensor([   7, 8130]), tensor([ 2061,   281, 13277,   284]), ten

1it [08:18, 498.42s/it]

Pipe_output [[{'label': 'NEGATIVE', 'score': -1.1404156684875488}, {'label': 'POSITIVE', 'score': 1.2314541339874268}], [{'label': 'NEGATIVE', 'score': 0.7295585870742798}, {'label': 'POSITIVE', 'score': -1.0518388748168945}], [{'label': 'NEGATIVE', 'score': -0.6965909600257874}, {'label': 'POSITIVE', 'score': 0.6715760231018066}], [{'label': 'NEGATIVE', 'score': -0.8130236864089966}, {'label': 'POSITIVE', 'score': 0.8802645802497864}], [{'label': 'NEGATIVE', 'score': 0.4954832196235657}, {'label': 'POSITIVE', 'score': -0.6878676414489746}], [{'label': 'NEGATIVE', 'score': 2.539947509765625}, {'label': 'POSITIVE', 'score': -2.918731212615967}], [{'label': 'NEGATIVE', 'score': 0.18088404834270477}, {'label': 'POSITIVE', 'score': -0.33418038487434387}], [{'label': 'NEGATIVE', 'score': -0.8305007219314575}, {'label': 'POSITIVE', 'score': 0.9284577369689941}], [{'label': 'NEGATIVE', 'score': -1.6749396324157715}, {'label': 'POSITIVE', 'score': 1.8882185220718384}], [{'label': 'NEGATIVE', '




ValueError: Batch size (256) does not match number of examples - but got 239 for: queries