In [5]:
%pip install --upgrade pip
%pip install --disable-pip-version-check \
    torch \
    torchdata --quiet

%pip install \
    transformers \
    datasets \
    evaluate \
    rouge_score \
    loralib \
    peft==0.3.0 --quiet

Defaulting to user installation because normal site-packages is not writeable
[33mDEPRECATION: distro-info 0.23ubuntu1 has a non-standard version number. pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of distro-info or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m[33mDEPRECATION: python-debian 0.1.36ubuntu1 has a non-standard version number. pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of python-debian or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.
[33mDEPRECATION: distro-info 0.23ubuntu1 has a non-standard version number. pip 23.3 will enforce this behaviou

In [6]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np

### Load dataset and LLM pre-trained model

In [7]:
huggingface_dataset_name = "knkarthick/dialogsum"

dataset = load_dataset(huggingface_dataset_name)

dataset

Found cached dataset csv (/home/azadeh/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-931380d0e19583fc/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
})

In [8]:
# Load the model and tokenizer
model_name='google/flan-t5-base'
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_name)




In [9]:
# count the number of trainable parameters in the model 
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

print(print_number_of_trainable_model_parameters(original_model))

trainable model parameters: 247577856
all model parameters: 247577856
percentage of trainable model parameters: 100.00%


In [10]:
# tokenize the dataset
def tokenize_modified_prompt(example):
    prompt = "Summarize the following dialogue:\n\n"
    conclusion = "\n\nSummary:"
    prompt = [prompt+dialogue+conclusion for dialogue in example["dialogue"]]
    
    example['input_ids'] = tokenizer(prompt, padding="max_length", truncation = True, return_tensors = "pt").input_ids
    
    example['labels'] = tokenizer(example["summary"], padding = "max_length", truncation= True, return_tensors="pt").input_ids
    return example

tokenized_datasets = dataset.map(tokenize_modified_prompt,batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['id', 'topic', 'dialogue', 'summary',])
    



Loading cached processed dataset at /home/azadeh/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-931380d0e19583fc/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-72dbc6c7491dfbea.arrow
Loading cached processed dataset at /home/azadeh/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-931380d0e19583fc/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-9d6b2cff6318e8ed.arrow
Loading cached processed dataset at /home/azadeh/.cache/huggingface/datasets/knkarthick___csv/knkarthick--dialogsum-931380d0e19583fc/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-55d1958fb83e126a.arrow


In [11]:
#To save some time of training, I will subsample the dataset:
# tokenized_datasets = tokenized_datasets.filter(lambda example, index: index % 100 == 0, with_indices=True)
print(tokenized_datasets)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 12460
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 1500
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 500
    })
})


###  Full weight Fine-Tuning the Model with the Preprocessed Dataset

In [21]:
output_dir = "./models/"

training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-3,
    num_train_epochs=1,
    weight_decay=0.01
#     per_device_train_batch_size=8
#     max_steps = 8
    
)

trainer = Trainer(
    model=original_model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation']
)


trainer.train()
trainer.model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)



Step,Training Loss
500,0.3907
1000,0.1008
1500,0.0954


('./models/tokenizer_config.json',
 './models/special_tokens_map.json',
 './models/tokenizer.json')

In [22]:
# generate test samples
rouge = evaluate.load('rouge')
GroundTruth = []
original_model_text = []
instruct_model_text = []
peft_model_text = []




model_name='google/flan-t5-base'
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)









model_name='./models'
tokenizer = AutoTokenizer.from_pretrained(model_name)
instruct_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)









for idx,data in enumerate(dataset["test"][0:10]["dialogue"]):
    
    prompt = "Summarize the following dialogue:\n\n"
    conclusion = "\n\nSummary:"
    
    prompt = prompt+data+conclusion
    
    
    GroundTruth.append(dataset["test"][idx]["summary"])
    
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids

    
    

    original_model_outputs = original_model.generate(input_ids = input_ids,generation_config=GenerationConfig(max_new_tokens=200))
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)
    original_model_text.append(original_model_text_output)
    
    
    instruct_model_outputs = instruct_model.generate(input_ids = input_ids,generation_config=GenerationConfig(max_new_tokens=200))
    instruct_model_text_output = tokenizer.decode(instruct_model_outputs[0], skip_special_tokens=True)
    instruct_model_text.append(instruct_model_text_output)
    
    
    
    
print(len(original_model_text))
print(len(GroundTruth))

original_model_results = rouge.compute(
predictions=original_model_text,
references=GroundTruth,
use_aggregator=True,
use_stemmer=True,
)

instruct_model_results = rouge.compute(
    predictions=instruct_model_text,
    references=GroundTruth,
    use_aggregator=True,
    use_stemmer=True,
)


print('ORIGINAL MODEL:')
print(original_model_results)

print('INSTRUCT MODEL:')
print(instruct_model_results)
   


10
10
ORIGINAL MODEL:
{'rouge1': 0.30248625548625546, 'rouge2': 0.10409523809523809, 'rougeL': 0.2452330902330902, 'rougeLsum': 0.2488035838035838}
INSTRUCT MODEL:
{'rouge1': 0.39837704797137347, 'rouge2': 0.14291248120480274, 'rougeL': 0.3049506389012045, 'rougeLsum': 0.3066952433947717}


### Setup the PEFT/LoRA model for Fine-Tuning

In [23]:
from peft import LoraConfig, get_peft_model, TaskType



lora_config = LoraConfig(
    r=32, # Rank
    lora_alpha=32, #scaling factor for the weight matrices
    target_modules=["q", "v"],
    lora_dropout=0.05,  #dropout probability of the LoRA layers
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM # FLAN-T5
)

In [24]:
peft_model = get_peft_model(original_model, 
                            lora_config)
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 3538944
all model parameters: 251116800
percentage of trainable model parameters: 1.41%


In [25]:
from huggingface_hub import login
import os




peft_training_args = TrainingArguments(
    output_dir="./peft_models/",
    auto_find_batch_size=True,
    learning_rate=1e-3, # Higher learning rate than full fine-tuning.
    num_train_epochs=1,
       
)
    
peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_datasets["train"],
)

In [26]:
peft_trainer.train()

peft_model_path="./peft_models/"

peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)




Step,Training Loss
500,1.6837
1000,0.1217
1500,0.116


('./peft_models/tokenizer_config.json',
 './peft_models/special_tokens_map.json',
 './peft_models/tokenizer.json')

In [27]:
from peft import PeftModel, PeftConfig
rouge = evaluate.load('rouge')
model_name='google/flan-t5-base'


peft_model_path="./peft_models/"
peft_model_base = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base", torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
tokenizer_ins = AutoTokenizer.from_pretrained("./models")

peft_model_ = PeftModel.from_pretrained(peft_model_base, 
                                       './peft_models', 
                                       torch_dtype=torch.bfloat16,
                                       is_trainable=False)
tokenizer_peft = AutoTokenizer.from_pretrained(peft_model_path)


# peft_model_ = peft_model_.to('cuda')

original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
instruct_model = AutoModelForSeq2SeqLM.from_pretrained("./models/checkpoint-1500")


In [28]:

# generate test samples

GroundTruth = []
original_model_text = []
instruct_model_text = []
peft_model_text = []


for idx,data in enumerate(dataset["test"][0:10]["dialogue"]):
    prompt = "Summarize the following dialogue:\n\n"
    conclusion = "\n\nSummary:"
    prompt = prompt+data+conclusion
    
    
    GroundTruth.append(dataset["test"][idx]["summary"])
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    input_ids_peft = tokenizer_peft(prompt, return_tensors="pt").input_ids
    
    original_model_outputs = original_model.generate(input_ids = input_ids,generation_config=GenerationConfig(max_new_tokens=200))
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)
    original_model_text.append(original_model_text_output)
    
    
    instruct_model_outputs = instruct_model.generate(input_ids = input_ids,generation_config=GenerationConfig(max_new_tokens=200))
    instruct_model_text_output = tokenizer_ins.decode(instruct_model_outputs[0], skip_special_tokens=True)
    instruct_model_text.append(instruct_model_text_output)
    
    
    peft_model_outputs = peft_model_.generate(input_ids = input_ids_peft,generation_config=GenerationConfig(max_new_tokens=200))
    peft_model_text_output = tokenizer_peft.decode(peft_model_outputs[0], skip_special_tokens=True)
    peft_model_text.append(peft_model_text_output)
    
    
    
    
print(len(original_model_text))
print(len(GroundTruth))

original_model_results = rouge.compute(
predictions=original_model_text,
references=GroundTruth,
use_aggregator=True,
use_stemmer=True,
)

instruct_model_results = rouge.compute(
    predictions=instruct_model_text,
    references=GroundTruth,
    use_aggregator=True,
    use_stemmer=True,
)


peft_model_results = rouge.compute(
    predictions=peft_model_text,
    references=GroundTruth,
    use_aggregator=True,
    use_stemmer=True,
)

print('ORIGINAL MODEL:')
print(original_model_results)
print('INSTRUCT MODEL:')
print(instruct_model_results)
print('PEFT MODEL:')
print(peft_model_results)

    
    
    
    

10
10
ORIGINAL MODEL:
{'rouge1': 0.30248625548625546, 'rouge2': 0.10409523809523809, 'rougeL': 0.2452330902330902, 'rougeLsum': 0.2488035838035838}
INSTRUCT MODEL:
{'rouge1': 0.40351388618903283, 'rouge2': 0.14508081280186477, 'rougeL': 0.31029032869186324, 'rougeLsum': 0.31187514206166267}
PEFT MODEL:
{'rouge1': 0.40059691413991805, 'rouge2': 0.129298581499801, 'rougeL': 0.3025987734334751, 'rougeLsum': 0.3015411951637584}
