Fine tune the FLAN T5 model with reinforcement learning (PPO) and PEFT to generate less toxic summaries.
The reward model is a binary classifier which predicts 'hate' or 'not hate'. You will use PPO (Proximal Policy  Optimization) to fine tune and reduce the toxicity of the model. 

In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM, GenerationConfig
from datasets import load_dataset
from peft import PeftModel, PeftConfig, LoraConfig, TaskType

#AutoModelForSequenceClassification is used for the reward model and toxicity evaluation


#trl = transformer reinforcement learning lybrary
from trl import PPOTrainer, PPOConfig, AutoModelForSeq2SeqLMWithValueHead
from trl import create_reference_model
from trl.core import LengthSampler 

import torch 
import evaluate 

import numpy as np
import pandas as pd 

from tqdm import tqdm 
tqdm.pandas()


In [2]:
model_name = 'google/flan-t5-base'
hugging_face_dataset_name = 'knkarthick/dialogsum'
dataset_original = load_dataset(hugging_face_dataset_name)
dataset_original

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [3]:
#DATASET PREPROCESSING

#Take only a part of the dataset, with dialogues of a particular length. Wrap the dialogue in each prompt
# and tokenize it. Save token ids in the field 'input_ids' and decoded version of the prompts in the 
# field 'query'.

def build_dataset(model_name,
                dataset_name,
                input_min_text_length,
                input_max_text_length):

    #Use only the train part of the dataset
    dataset = load_dataset(dataset_name, split='train')
    #Filter the dialogues between min_length and max_length
    dataset = dataset.filter(lambda x: len(x['dialogue']) > input_min_text_length and len(x['dialogue']) < input_max_text_length)

    #prepare the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name, device_map='auto')
    #Setting devoce_map = 'auto' allows to automatically switch between CPU and GPU

    def tokenize(sample):
        prompt = f"""
        Summarize the following conversation.
        
        {sample['dialogue']}
        
        Summary:"""

        sample['input_ids'] = tokenizer.encode(prompt)
        #This is a query requirement for the PPO library
        sample['query'] = tokenizer.decode(sample['input_ids'])
        return sample

    #Tokenize each dialogue 
    dataset = dataset.map(tokenize, batched=False)
    dataset.set_format(type = 'torch')

    #Split the dataset into training and test 
    dataset_splits = dataset.train_test_split(test_size=0.2, shuffle=False, seed=42)

    return dataset_splits

dataset = build_dataset(model_name=model_name,
                        dataset_name = hugging_face_dataset_name,
                        input_min_text_length = 200,
                        input_max_text_length = 1000)

print(dataset)

    

Filter:   0%|          | 0/12460 [00:00<?, ? examples/s]

Map:   0%|          | 0/10016 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'input_ids', 'query'],
        num_rows: 8012
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'input_ids', 'query'],
        num_rows: 2004
    })
})


In [4]:
#Here we pull the PEFT model of the 2nd LAB, but the checkpoint trained on the full dataset, not only a part of it
aws s3 cp --recursive s3://dlai-generative-ai/models/peft-dialogue-summary-checkpoint/ ./peft-dialogue-summary-checkpoint-from-s3/

SyntaxError: invalid syntax (3492281895.py, line 2)

In [5]:
ls -alh ./peft-dialogue-summary-checkpoint-from-s3/adapter_model.bin

Formato del parametro non corretto - "peft-dialogue-summary-checkpoint-from-s3".


In [6]:
def print_number_of_trainable_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f'Trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of model parameters: {100.0*trainable_model_params/all_model_params}'

In [None]:
#Add the adapter to the original FLAN T5 model. In the previous lab we were adding the fully adapter only 
#for inferences, so there was no need for LoRA configs to do that. Now you need to pass the configs to the 
#constructed PEFT model, also putting the is_trainable to True.

lora_config = LoraConfig(
    r=32,
    lora_alpha=32,
    target_modules=['q','v'],
    lora_dropout = 0.05,
    bias='none',
    task_type = TaskType.SEQ_2_SEQ_LM # FLAN T5
)

model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)

peft_model = PeftModel.from_pretrained(model,
                                        './peft-dialogue-summary-checkpoint-from-s3/',
                                        lora_config=lora_config,
                                        torch_dtype=torch.float16,
                                        device_map='auto',
                                        is_trainable=True)

#Note that only 1.4 % of the model's original parameters is trainable
print(f'PEFT model parameters to be updated: \n {print_number_of_trainable_parameters(peft_model)} \n')

In [None]:
#Prepare the PPO model passing the instruct-fine-tuned-PEFT model to it.
#PPO will be used to optimize the RL policy against the reward model.
ppo_model = AutoModelForSeq2SeqLMWithValueHead.from_pretrained(peft_model,
                                                                torch_dtype=torch.bfloat16,
                                                                is_trainable=True)

#During PPO only a few parameters are updated. Specifically the parameters of the ValueHead. The number of 
#trainable parameters can be computed as (n+1)*m where n is the number of input units (here 768) and m is 
#the number of output units (you have m=1). The +1 term accounts for the bias term.

print(f'PPO model parameters to be updated (ValueHead  769 params): \n {print_number_of_trainable_parameters(ppo_model)}\n')


In [None]:
#Now create a copy of the PPO which will not be fine tuned (reference model). The reference model represents
#the LLM before detoxification. None of the parameters of the PPO reference model will be updated.
ref_model = create_reference_model(ppo_model)
print(f'Reference model parameters to be updated: \n {print_number_of_trainable_parameters(ref_model)}\n')


In [None]:
#Reinforcement Learning is a part of Machine Learning, in which agents take actions in an environment 
#aimed at maximizing their cumulative rewards. The agent's behavior is defined by the policy.
#The goal of reinforcement learning is to learn an optimal or nearly-optimal policy that maximizes 
# the reward function.

# In the previous section the original policy is based on the instruct PEFT model. This is the LLM before detoxification.
# Now you need to define the reward model encouraging the agent to detoxify the dialogue summaries. 
# The intuitive approach would be to do some form of sentiment analysis across two classes (hate-nothate)
# and give a higher reward if there is a chanche of getting class nothate as output.
# Use facebook ROBERTA based hate speech model for REWARD MODEL. This model will output logits and then 
# predict probabilities across two classes: HATE or NOTHATE. The logits of the output NOTHATE will be taken as 
# a positive reward. Then the model will be fine-tuned with PPO using those reward values. 
# Create the reqired model class for the ROBERTA model. You will also need to create the corresponding tokenizer.
# hothate = 1 hate = 0 classes.

In [None]:
toxicity_model_name = 'facebook/roberta-hate-speech-dynabench-r4-target'
toxicity_tokenizer = AutoTokenizer.from_pretrained(toxicity_model_name, device_map='auto')
toxicity_model = AutoModelForSequenceClassification.from_pretrained(toxicity_model_name, device_map='auto')
print(toxicity_model.config.id2label)

In [None]:
non_toxic_context = 'I want to kiss you.'

toxicity_input_ids = toxicity_tokenizer(non_toxic_context, return_tensors='pt').input_ids

logits = toxicity_model(input_ids=toxicity_input_ids).logits
print(f'logits [nothate, hate]' {logits.tolist()[0]})

probabilities = logits.softmax(dim=-1).tolist()[0]
print(f'probabilities [nothate, hate]: {probabilities}')

not_hate_index = 0
nothate_reward = (logits[:, not_hate_index]).to_list()
print(f'Reward (high): {nothate_reward}')

In [None]:
#USE hugging face pipeline to simplify the usage of the reward model

device = 0 if torch.cuda.is_available() else 'cpu'

sentiment_pipe = pipeline('sentiment-analysis',
                            model = toxicity_model_name,
                            device=device)

reward_logits_kwargs = {
    'top_k': None,
    'function_to_apply': 'none',
    'batch_size': 16
}

reward_probabilities_kwargs = {
    'top_k': None,
    'function_to_apply': 'softmax',
    'batch_size': 16
}

print('Reward model output for non toxic context:')
print(sentiment_pipe(non_toxic_context, **reward_logits_kwargs))
print(sentiment_pipe(non_toxic_context, **reward_probabilities_kwargs))

#CAN ALSO TRY WITH TOXIC TEXTS 

In [None]:
#EVALUATE TOXICITY
#To evaluate the model before and after fine-funing/detoxification you need to set up a metric. The toxicity 
#score is a value between 0 and 1 where 1 indicates the maximum toxicity.

toxicity_evaluator = evaluate.load('toxicity',
                                    toxicity_model_name,
                                    module_type='measurement',
                                    toxic_label='hate')



In [None]:
toxicity_score = toxicity_evaluator.compute(predictions=[non_toxic_context])

print('Toxicity score: ')
print(toxicity_score['toxicity'])

#ALSO FOR NEGATIVE INPUT SENTENCE 

In [None]:
#This evaluator can be used to compute the toxicity of the dialogues prepared in the forst part of the lab.
# You will need to pass the test dataset (dataset['test']), the same tokenizer, the frozen PEFT model and the 
# toxicity evaluator. Wrap all of this in the evaluate toxicity function

def evaluate_toxicity(model,
                    toxicity_evaluator,
                    tokenizer,
                    dataset,
                    num_samples):

    max_new_tokens = 100

    toxicities = []
    input_texts = []

    for i, sample in tqdm(enumerate(dataset)):
        input_text = sample['query']

        if i > num_samples:
            break

        input_ids = tokenizer(input_text, return_tensors='pt', padding=True).input_ids

        generation_config = GenerationConfig(max_new_tokens=max_new_tokens,
                                            top_k=0.0,
                                            top_p=1.0,
                                            do_sample=True)

        response_token_ids = model.generate(input_ids=input_ids, generation_config=generation_config)

        generated_text = tokenizer.decode(response_token_ids[0], skip_special_tokens=True)

        toxicity_score = toxicity_evaluator.compute(predictions = [(input_text + '' + generated_text)])
        toxicities.append(toxicity_score['toxicity'])

    mean = np.mean(toxicities)
    std = np.std(toxicities)

    return mean, std 

In [None]:
#Perform the calculation of toxicity before fine-tuning / detoxification

mean_before_detoxification, std_before_detoxification = evaluate_toxicity(
                                                                            model=ref_model,
                                                                            toxicity_evaluator=toxicity_evaluator,
                                                                            tokenizer=tokenizer,
                                                                            dataset=dataset['test'],
                                                                            num_samples=10)

print(f'Toxicity [mean, std] before DETOX: [{mean_before_detoxification}, {std_before_detoxification}]' )


In [None]:
# INITIALIZE PPO Trainer 
# Set up the configuration parameters. Load the PPO model and the tokenizer. You will also load a frozen version 
# of the model: ref-model. The first model is optimized while the second one is frozen and serves as a 
# reference for the KL divergence term.

In [None]:
learning_rate = 1.41e-5
max_ppo_epochs = 1
mini_batch_size = 4
batch_size = 16 

config = PPOConfig(
    model_name = model_name,
    learning_rate = learning_rate,
    ppo_epochs = max_ppo_epochs,
    mini_batch_size=mini_batch_size,
    batch_size=batch_size
)

def collator(data):
    return dict((key, [d[key] for d in data]) for key in data[0])

ppo_trainer = PPOTrainer(config = config, 
                        model = ppo_model,
                        ref_model = ref_model,
                        tokenizer = tokenizer,
                        dataset = dataset['train'],
                        data_collator = collator)

In [None]:
#FINE TUNE THE MODEL 
#fine tuning loop consists in the following steps 
# 1) Get the query responses from the policy LLM (PEFT model)
# 2) Get sentiments for query/responses from hate speech roberta
# 3) Optimize policy with PPO using (query, response, reward) triplets

# You will see the following metrics running:
# objective KL: minimize kl divergence 
# ppo/returns/mean maximize mean returns 
# ppo/policy/advantages_mean maximize advantages 



In [None]:
output_min_length = 100
output_max_length = 400

output_length_sampler = LengthSampler(output_min_length, output_max_length)

generation_kwargs = {
    'min_length': 5,
    'top_k': 0.0,
    'top_p': 1.0,
    'do_sample': True
}


reward_kwargs = {
    'top_k': None,
    'function_to_apply': 'none',
    'batch_size': 16
}


max_ppo_steps =10

for step, batch in tqdm(enumerate(ppo_trainer.data_loader)):
    if step >= max_ppo_steps:
        break
    
    prompt_tensors = batch['input_ids']
    #GET RESPONSE FROM FLAN T5 / PEFT LLM
    summary_tensors = []

    for prompt_tensor in prompt_tensors:
        max_new_tokens = output_length_sampler()
        generation_kwargs['max_new_tokens'] = max_new_tokens
        summary = ppo_trainer.generate(prompt_tensor, **generation_kwargs)

        summary_tensors.append(summary.squeeze()[-max_new_tokens:])

    #This is needed to be called response
    batch['response'] = [tokenizer.decode(r.squeeze()) for r in summary_tensors]

    #Compute reward outputs 
    query_response_pairs = [q + r for q,r in zip(batch['query'], batch['response'])]

    rewards = sentiment_pipe(query_response_pairs, **reward_kwargs)

    #Use nothate item because this is the score for the positive nothate class 

    reward_tensors = [torch.tensor(reward[not_hate_index]['score']) for reward in rewards]

    #run PPO step 

    stats = ppo_trainer.step(prompt_tensors, summary_tensors, reward_tensors)

    ppo_trainer.log_stats(stats, batch, reward_tensors)

    print(f'objective KL': {stats['objective/kl']})
    print(f'ppo/returns mean': {stats['ppo/returns/mean']})
    print(f'ppo/policy/advantage mean': {stats['ppo/policy/advantages_mean']})
    


In [None]:
#compare original LLM and detoxified LLM with toxicity score 

batch_size=20
compare_results = {}

df_batch = dataset['test'][0:batch_size]

compare_results['query'] = df_batch['query']
prompt_tensors = df_batch['input_ids']

summary_tensors_ref = []
summary_tensors = []

for i in tqdm(range(batch_size)):
    gen_len = output_length_sampler()

    generation_kwargs['max_new_tokens'] = gen_len

    summary = ref_model.generate(
        input_ids = torch.as_tensor(prompt_tensors[1]).unsqueeze(dim=0).to(device),
        **generation_kwargs
    ).squeeze()[-gen_len:]

    summary_tensors_ref.append(summary)


    summary = ppo_model.generate(
        input_ids = torch.as_tensor(prompt_tensors[1]).unsqueeze(dim=0).to(device),
        **generation_kwargs
    ).squeeze()[-gen_len:]

    summary_tensors.append(summary)

#Decoded responses

compare_results['response_before'] = [tokenizer.decode(summary_tensors_ref[i]) for i in range(batch_size)]
compare_results['response_after'] = [tokenizer.decode(summary_tensors[i]) for i in range(batch_size)]

# Sentiment analysis 
texts_before = [d + s for d, s in zip(compare_results['query'], compare_results['response_before'])]
rewards_before = sentiment_pipe(texts_before, **reward_kwargs)
compare_results['reward_before'] = [reward[not_hate_index]['score'] for reward in rewards_before]

texts_after = [d + s for d, s in zip(compare_results['query'], compare_results['response_after'])]
rewards_after = sentiment_pipe(texts_after, **reward_kwargs)
compare_results['reward_after'] = [reward[not_hate_index]['score'] for reward in rewards_after]