In [1]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from peft import PeftModel
import torch

In [2]:
dataset_name = "knkarthick/dialogsum"
dataset = load_dataset(dataset_name)

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [4]:
checkpoint = "google/flan-t5-small"

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

full_finetuned_model = AutoModelForSeq2SeqLM.from_pretrained('./dialogue-summary-training/checkpoint-31000')

peft_finetuned_checkpoint = "./peft-dialogue-summary-checkpoint-local"
peft_finetuned_model = PeftModel.from_pretrained(model, peft_finetuned_checkpoint,
                                  torch_dtype=torch.bfloat16, is_trainable=False)
peft_finetuned_tokenizer = AutoTokenizer.from_pretrained(peft_finetuned_checkpoint)

In [5]:
def print_number_of_trainable_model_params(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()


    print("Trainable model parameters: ", trainable_model_params)
    print("All model parameters: ", all_model_params)
    print("Percentage of trainable model parameters: ", (trainable_model_params / all_model_params) * 100)

In [6]:
print_number_of_trainable_model_params(model)

Trainable model parameters:  0
All model parameters:  77305216
Percentage of trainable model parameters:  0.0


In [7]:
print_number_of_trainable_model_params(full_finetuned_model)

Trainable model parameters:  76961152
All model parameters:  76961152
Percentage of trainable model parameters:  100.0


In [8]:
print_number_of_trainable_model_params(peft_finetuned_model)

Trainable model parameters:  0
All model parameters:  77305216
Percentage of trainable model parameters:  0.0


In [None]:
index = 200

dialogue = dataset['train'][index]['dialogue']
summary = dataset['train'][index]['summary']

prompt1 = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""

for prompt in [prompt1]:
    inputs = tokenizer(prompt, return_tensors='pt')
    output = tokenizer.decode(
        model.generate(
            inputs['input_ids'],
            max_new_tokens=200
        )[0],
        skip_special_tokens=True
    )

    full_finetuned_model_output = tokenizer.decode(
        full_finetuned_model.generate(
            inputs['input_ids'],
            max_new_tokens=200
        )[0],
        skip_special_tokens=True
    )

    peft_finetuned_model_output = peft_finetuned_tokenizer.decode(
        peft_finetuned_model.generate(
            input_ids=inputs['input_ids'],
            max_new_tokens=200
        )[0],
        skip_special_tokens=True
    )

    dash_line = '-'.join('' for x in range(100))
    print(dash_line)
    print(f'INPUT PROMPT:\n{prompt}')
    print(dash_line)
    print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
    print(dash_line)
    print(f'MODEL GENERATION - ZERO SHOT:\n{output}')
    print(dash_line)
    print(f'FULL FINETUNED MODEL GENERATION:\n{full_finetuned_model_output}')
    print(dash_line)
    print(f'PEFT LoRA FINETUNED MODEL GENERATION:\n{peft_finetuned_model_output}')

In [None]:
def tokenize_function(example):
    start_prompt = 'Summarize the following conversation.\n\n'
    end_prompt = '\n\nSummary'
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example['dialogue']]
    example['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids
    example['labels'] = tokenizer(example['summary'], padding="max_length", truncation=True, return_tensors="pt").input_ids

    return example

tokenized_dataset = dataset.map(tokenize_function, batched=True)

In [None]:
tokenized_dataset = tokenized_dataset.remove_columns(['id', 'dialogue', 'summary', 'topic'])

In [None]:
tokenized_dataset

In [None]:
# for sampling purpose
tokenized_dataset.filter(lambda example, index: index % 1000 == 0, with_indices=True)

In [None]:
full_finetuned_model = AutoModelForSeq2SeqLM.from_pretrained('./dialogue-summary-training/checkpoint-31000')

In [None]:
from transformers import GenerationConfig

In [None]:
index = 200
dialogue = dataset['test'][index]['dialogue']
human_baseline_summary = dataset['test'][index]['summary']

prompt = f"""
Summarize the following conversation.

{dialogue}

Summary:
"""

input_ids = tokenizer(prompt, return_tensors="pt").input_ids

original_model_outputs = model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

instruct_model_outputs = full_finetuned_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))
instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)

print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{human_baseline_summary}')
print(dash_line)
print(f'ORIGINAL MODEL:\n{original_model_text_output}')
print(dash_line)
print(f'INSTRUCT MODEL:\n{instruct_model_text_output}')