Fine tune a LLM (FLAN-T5) specifically for summarization. First try FULL fine tuning
then PEFT fine tuning. ROUGE metrics for evaluation is used.
PEFT has more memory/computational advantages, despite the slightly lower performance when 
compared with FULL FINE TUNING.
NB: Check to have 8 GB CPU installed 32 GB

In [1]:
#CHECK TO HAVE THESE PACKAGES INSTALLED:
#torch, torchdata, transformers, datasets, evaluate, rouge_score, loralib, peft

In [2]:
#imports 
import torch
import time
import evaluate 
import pandas as pd
import numpy as np
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
from datasets import load_dataset

In [3]:
#load the DialogSum Dataset
hugging_face_dataset_name = 'knkarthick/dialogsum'
dataset = load_dataset(hugging_face_dataset_name)
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [4]:
model_name = 'google/flan-t5-base'
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [5]:
def print_number_of_trainable_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f'Trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of model parameters: {100.0*trainable_model_params/all_model_params}'

In [6]:
print(print_number_of_trainable_parameters(original_model))

Trainable model parameters: 247577856
all model parameters: 247577856
percentage of model parameters: 100.0


In [7]:
#Zero shot inference testing
index = 200
dialogue = dataset['test'][index]['dialogue']
summary = dataset['test'][index]['summary']

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:"""

inputs = tokenizer(prompt, return_tensors='pt')
output = tokenizer.decode(
    original_model.generate(
        inputs['input_ids'], max_new_tokens=200)[0],
        skip_special_tokens=True)

dash_line = '-'.join('' for x in range(100))

print(dash_line)
print(f'Input prompt: \n{prompt}')
print(dash_line)
print(f'Baseline summary: \n{summary}')
print(dash_line)
print(f'Model generated summary: \n {output}')


---------------------------------------------------------------------------------------------------
Input prompt: 

Summarize the following conversation.

#Person1#: Have you considered upgrading your system?
#Person2#: Yes, but I'm not sure what exactly I would need.
#Person1#: You could consider adding a painting program to your software. It would allow you to make up your own flyers and banners for advertising.
#Person2#: That would be a definite bonus.
#Person1#: You might also want to upgrade your hardware because it is pretty outdated now.
#Person2#: How can we do that?
#Person1#: You'd probably need a faster processor, to begin with. And you also need a more powerful hard disc, more memory and a faster modem. Do you have a CD-ROM drive?
#Person2#: No.
#Person1#: Then you might want to add a CD-ROM drive too, because most new software programs are coming out on Cds.
#Person2#: That sounds great. Thanks.

Summary:
-------------------------------------------------------------------

In [8]:
def tokenize_function(example):
    start_prompt = 'Summarize the following conversation. \n\n'
    end_prompt = '\n\n Summary: '
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example['dialogue']]
    example['input_ids'] = tokenizer(prompt, padding='max_length', truncation=True, return_tensors='pt').input_ids
    example['labels'] = tokenizer(example['summary'], padding='max_length', truncation=True, return_tensors='pt').input_ids
    return example


In [9]:
#The dataset is divided into train, validation and test
#The tokenizer works with different batches

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['id', 'topic', 'dialogue', 'summary'])


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [10]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 500
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 1500
    })
})

In [11]:
tokenized_datasets = tokenized_datasets.filter(lambda example, index: index % 100 == 0, with_indices=True)

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

In [12]:
print(f'Shapes of the datasets:')
print(f'Training: {tokenized_datasets['train'].shape}')
print(f'Validation: {tokenized_datasets['validation'].shape}')
print(f'Test: {tokenized_datasets['test'].shape}')

Shapes of the datasets:
Training: (125, 2)
Validation: (5, 2)
Test: (15, 2)


In [13]:
print(tokenized_datasets)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 125
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 5
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 15
    })
})


In [14]:
output_dir = f'./dialogue-summary-training-{str(int(time.time()))}'
training_args = TrainingArguments(
    output_dir = output_dir,
    learning_rate = 1e-5,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=1,
    max_steps=1
)

trainer = Trainer(
    model = original_model,
    args = training_args,
    train_dataset = tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation']
)

In [None]:
#trainer.train()
#Instruction fine tuning of the model here.

In [22]:
#Download a checkpoint
aws s3 cp --recursive s3://dlai-generative-ai/models/flan-dialogue-summary-checkpoint/ ./flan-dialogue-summary-checkpoint/

SyntaxError: invalid syntax (2003088059.py, line 2)

In [23]:
ls -alh ./flan-dialogue-summary-checkpoint/pytorch_model.bin

Opzione non valida - "flan-dialogue-summary-checkpoint".


In [24]:
instruct_model = AutoModelForSeq2SeqLM.from_pretrained('./flan-dialogue-summary-checkpoint', torch_dtype=torch.bfloat16)


HFValidationError: Repo id must use alphanumeric chars or '-', '_', '.', '--' and '..' are forbidden, '-' and '.' cannot start or end the name, max length is 96: './flan-dialogue-summary-checkpoint'.

OBVIOUSLY DOESN'T WORK ;)

In [None]:
#TEST QUALITATIVELY THE FINE TUNED MODEL

index = 200
dialogue = dataset['test'][index]['dialogue']
human_baseline_summary = dataset['test'][index]['summary']

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:"""

input_ids = tokenizer(prompt, return_tensors='pt').input_ids

original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

instruct_model_outputs = instruct_model.generate(input_ids=input_ids, generation_config = GenerationConfig(max_new_tokens=200, num_beams=1))
instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)

print(dash_line)
print(f'Baseline human summary:\n {human_baseline_summary}')
print(dash_line)
print(f'Original_model: \n {original_model_text_output}')
print(dash_line)
print(f'Instruct model: \n {instruct_model_text_output}')


In [None]:
rouge = evaluate.load('rouge')

In [None]:
#Evaluate Quantitatively the model
dialogues = dataset['test'][0:10]['dialogue']
human_baseline_summaries = dataset['test'][0:10]['summary']

original_model_summaries = []
instruct_model_summaries = []

for _, dialogue in enumerate(dialogues):
    prompt= f"""
Summarize the following conversation.

{dialogue}

Summary:"""
    
    input_ids = tokenizer(prompt, return_tensors='pt').input_ids

    original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)
    original_model_summaries.append(original_model_text_output)

    instruct_model_outputs = instruct_model.generate(input_ids = input_ids, generation_config = GenerationConfig(max_new_tokens=200))
    instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True) 
    instruct_model_summaries.append(instruct_model_text_output)

zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries, instruct_model_summaries))

df = pd.DataFrame(zipped_summaries, columns=['human_based_summaries', 'original_model_summaries', 'instruct_model_summaries'])
df

In [None]:
#Evaluate ROUGE metrics
original_model_results = rouge.compute(
    predictions=original_model_summaries,
    references=human_baseline_summaries[0:len(original_model_summaries)],
    use_aggregator=True,
    use_stemmer=True
)

instruct_model_results = rouge.compute(
    predictions=instruct_model_summaries,
    references=human_baseline_summaries[0:len(instruct_model_summaries)],
    use_aggregator=True,
    use_stemmer=True
)

print('Original model')
print(original_model_results)
print('Instruct model')
print(instruct_model_results)

In [None]:
#See results in a larger test dataset
results = pd.read_csv('data/dialogue-summary-training-results.csv')

human_baseline_summaries = results['human_baseline_summaries'].values
original_model_summaries = results['original_model_summaries'].values
instruct_model_summaries = results['instruct_model_summaries'].values 

original_model_results = rouge.compute(
    predictions=original_model_summaries,
    references = human_baseline_summaries[0:len(original_model_summaries)],
    use_aggregator=True,
    use_stemmer=True
)

instruct_model_results = rouge.compute(
    predictions=instruct_model_summaries,
    references=human_baseline_summaries[0:len(instruct_model_summaries)],
    use_aggregator=True,
    use_stemmer=True
)

print('Original model')
print(original_model_results)
print('Instruct model')
print(instruct_model_results)

In [None]:
print('Absolute Percentage improvement of INSTRUCT MODEL over HUMAN BASELINE')
improvement = (np.array(list(instruct_model_results.values())) - np.array(list(original_model_results.values())))

for key, value in zip(instruct_model_results.keys(), improvement):
    print(f'{key}, {value*100:.2f}%')

Perform PEFT: instead of full fine tuning as did before, here we focus on Parameter Efficient Fine Tuning 
such as LoRA and prompt tuning (not prompt engineering!). 
LoRA allows to fine tune a low rank representation of the original model, with fewer computational resources, and at inference time the LoRA fine tuned model is added to the original model to evaluate the results.
REDUCTION OF MEMORY.

In [None]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=32, #RANK
    lora_alpha = 32,
    target_modules=['q', 'v'],
    lora_dropout = 0.05,
    bias = 'none',
    task_type = TaskType.SEQ_2_SEQ_LM # FLAN T5
)

In [None]:
peft_model = get_peft_model(original_model, lora_config)
print(print_number_of_trainable_parameters(peft_model))
#HERE we train only 1.4% of the full model parameters

In [None]:
output_dir = f'./peft-dialogue-summary-training-{str(int(time.time()))}'

peft_training_args = TrainingArguments(
    output=output_dir,
    auto_find_batch_size = True,
    learning_rate = 1e-3,
    num_training_epochs = 1,
    logging_steps = 1,
    max_steps=1
)

peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset = tokenized_datasets['train']
)


In [None]:
peft_trainer.train()

peft_model_path = './peft-dialogue-summary-checkpoint-local'

peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

In [3]:

#PEFT model is trained only on a subset of data. To obtain the model trained on the full dataset:
aws s3 cp --recursive s3://dlai-generative-ai/models/peft-dialogue-summary-checkpoint/ ./peft-dialogue-summary-checkpoint-from-s3/


SyntaxError: invalid syntax (3552743787.py, line 2)

In [4]:
ls -al ./peft-dialogue-summary-checkpoint-from-s3/adapter_model.bin

Formato del parametro non corretto - "peft-dialogue-summary-checkpoint-from-s3".


Reload the original NON fine tuned model FLAN-T5. You set trainable=False because the plan is to make
inferences.

In [None]:
from peft import PeftModel, PeftConfig

peft_model_base = AutoModelForSeq2SeqLM.from_pretrained('google/flan-t5-base', torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained('google/flan-t5-base')

peft_model = PeftModel.from_pretrained(
    peft_model_base,
    './peft-dialogue-summary-checkpoint-from-s3/',
    torch_dtype=torch.bfloat16,
    is_trainable=False
)


In [None]:
print(print_number_of_trainable_parameters(peft_model))

In [None]:
#EVALUATE THE MODEL

index = 200
dialogue = dataset['test'][index]['dialogue']
baseline_human_summary = dataset['test'][index]['summary']

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:"""


input_ids = tokenizer(prompt, return_tensors='pt').input_ids

original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

instruct_model_outputs = instruct_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)

peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

print(dash_line)
print(f'Baseline human summary:\n {human_baseline_summary} ')
print(dash_line)
print(f'Original model:\n {original_model_text_output}')
print(dash_line)
print(f'Instruct Model: \n {instruct_model_text_output}')
print(dash_line)
print(f'Peft Model: \n {peft_model_text_output}')


In [None]:
dialogues = dataset['test'][0:10]['dialogue']
human_baseline_summaries = dataset['test'][0:10]['summary']

original_model_summaries = []
instruct_model_summaries = []
peft_model_summaries = []

for idx, dialogue in enumerate(dialogues):
    prompt = f"""
Summarize the following conversation. 

{dialogue}

Summary:"""
    
    input_ids = tokenizer(prompt, return_tensors = 'pt').input_ids 

    human_baseline_text_output = human_baseline_summaries[idx]

    original_model_outputs = original_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)
    original_model_summaries.append(original_model_text_output)

    instruct_model_outputs = instruct_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
    instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)
    instruct_model_summaries.append(instruct_model_text_output)

    peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
    peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)
    peft_model_summaries.append(peft_model_text_output)

zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries, peft_model_summaries))
df = pd.DataFrame(zipped_summaries, columns = ['human_baseline_summaries', 'original_model_summaries',
                                               'instruct_model_summaries', 'peft_model_summaries'])   

In [None]:
rouge = evaluate.load('rouge')

original_model_results = rouge.compute(
    predictions = original_model_summaries,
    references = human_baseline_summaries[0:len(original_model_summaries)],
    use_aggregator=True,
    use_stemmer=True
)

instruct_model_results = rouge.compute(
    predictions = instruct_model_summaries,
    references = human_baseline_summaries[0:len(instruct_model_summaries)],
    use_aggregator = True,
    use_stemmer=True
)

peft_model_results = rouge.compute(
    predictions = peft_model_summaries,
    references = human_baseline_summaries[0:len(peft_model_summaries)],
    use_aggregator = True,
    use_stemmer=True
)

print('Original model')
print(original_model_results)
print('Instruct model')
print(instruct_model_results)
print('Peft model')
print(peft_model_results)


In [None]:
#You can also look the performance on the larger dataset