<a href="https://colab.research.google.com/github/badoil/ML/blob/master/peft_model_fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers datasets evaluate peft

Installing collected packages: tokenizers, safetensors, xxhash, dill, responses, multiprocess, huggingface-hub, transformers, datasets, evaluate, accelerate, peft
Successfully installed accelerate-0.22.0 datasets-2.14.5 dill-0.3.7 evaluate-0.4.0 huggingface-hub-0.16.4 multiprocess-0.70.15 peft-0.5.0 responses-0.18.0 safetensors-0.3.3 tokenizers-0.13.3 transformers-4.33.1 xxhash-3.3.0


In [2]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer

In [4]:
import torch
import pandas as pd
import numpy
import evaluate

In [5]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device_type = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_type)

In [6]:
# Display which device was selected.
device

device(type='cuda')

In [9]:
!unzip drive-download-20230908T114816Z-001.zip

Archive:  drive-download-20230908T114816Z-001.zip
  inflating: adapter_config.json     
  inflating: tokenizer_config.json   
  inflating: README.md               
  inflating: special_tokens_map.json  
  inflating: tokenizer.json          
  inflating: adapter_model.bin       


In [None]:
# Load the dialogue-summarisation dataset by name via `datasets.load_dataset`.
# Later cells read its 'train', 'validation' and 'test' splits and the
# 'id', 'dialogue', 'summary' and 'topic' columns.
dataset_name = "knkarthick/dialogsum"
dataset = load_dataset(dataset_name)
# Bare expression: display the loaded dataset object.
dataset

In [None]:
# Load the base seq2seq model and its matching tokenizer from the same checkpoint name.
model_name = "google/flan-t5-base"
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)  # load in 16-bit float instead of the original 32-bit float to make the model lighter
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [11]:
# Count the parameters of the base model that will receive gradients.
parameter_num = sum(
    weight.numel() for weight in original_model.parameters() if weight.requires_grad
)
print(parameter_num)

247577856


In [None]:
# Pick one test example and wrap its dialogue in a summarisation prompt.
i = 5
example = dataset['test'][i]
X = example['dialogue']  # dialogue to summarise
Y = example['summary']   # reference summary, shown later for comparison
prompt = f"\nsummarize the following dialogue\n{X}\nsummary:\n"
print(prompt)

In [None]:
# Tokenize the prompt and return PyTorch tensors (return_tensors='pt').
# NOTE(review): `input` shadows the Python builtin of the same name. The next
# cell reads `input['input_ids']`, so any rename has to be made in both cells.
input = tokenizer(prompt, return_tensors = 'pt')
# Bare expression: display the tokenizer output.
input

In [None]:
# Generate a summary with the untouched base model (at most 200 new tokens),
# keep the first (only) sequence of the batch, and decode it back to text.
generated_batch = original_model.generate(input['input_ids'], max_new_tokens=200)
output_tokens = generated_batch[0]
output = tokenizer.decode(output_tokens, skip_special_tokens=True)
output

In [None]:
# Display the dataset's reference summary for the same example, for comparison.
Y

In [None]:
def tokenize_function(dataset):
  """Tokenize a batch of examples into model inputs and training labels.

  Intended for `Dataset.map(..., batched=True)`: despite its name, `dataset`
  is one batch, i.e. a mapping from column name to a list of values. It must
  provide the 'dialogue' and 'summary' columns.

  Every dialogue is wrapped as
  "summarize the following dialogue\\n\\n<dialogue>\\n\\nsummary:" before
  tokenization. Both prompts and summaries are padded/truncated with
  padding="max_length" and truncation=True.

  Returns:
    The same batch with two added columns:
      input_ids: token ids of the prompts.
      labels: token ids of the summaries, with padding positions set to -100.
  """
  instruction = "summarize the following dialogue"
  end_prompt = "summary:"

  prompts = [f"{instruction}\n\n{text}\n\n{end_prompt}" for text in dataset['dialogue']]
  dataset['input_ids'] = tokenizer(prompts, padding="max_length", truncation=True, return_tensors='pt').input_ids

  labels = tokenizer(dataset['summary'], padding="max_length", truncation=True, return_tensors='pt').input_ids
  # Padding must not contribute to the training loss. -100 is the label value
  # the model's loss ignores; leaving the pad token id in place would make most
  # of each max-length label sequence count as a "predict padding" target.
  labels[labels == tokenizer.pad_token_id] = -100
  dataset['labels'] = labels
  # NOTE(review): no attention_mask column is produced, so the model will also
  # attend to the padded prompt positions -- consider adding one.
  return dataset

# Apply the tokenization batch-wise to every split of the dataset.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

In [17]:
# Drop the raw text and metadata columns; the `input_ids` and `labels`
# columns added during tokenization are kept.
columns_to_drop = ['id', 'dialogue', 'summary', 'topic']
tokenized_dataset = tokenized_dataset.remove_columns(columns_to_drop)

In [None]:
# Display the tokenized dataset after the raw columns were removed.
tokenized_dataset

In [23]:
# Configuration for the full fine-tuning demo: a single optimisation step only.
out_dir = './flant5_dialogsum_finetuned'
training_args = TrainingArguments(
    output_dir=out_dir,
    # One step means processing one batch. With roughly 250M trainable
    # parameters, only a single step is run here.
    max_steps=1,
    num_train_epochs=1,
    logging_steps=1,
    learning_rate=1e-5,
    weight_decay=0.01,
)

In [25]:
# Build the trainer for full fine-tuning of the base model, using the
# tokenized train split for training and the validation split for evaluation.
train_split = tokenized_dataset['train']
validation_split = tokenized_dataset['validation']
trainer = Trainer(
    model=original_model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=validation_split,
)

In [None]:
# Run training with the arguments configured above (max_steps=1, i.e. a single step).
trainer.train()

In [27]:
# Keep a handle on the model held by the trainer after the one-step run.
# NOTE(review): presumably this is the same object as `original_model`
# (it was passed in as `model=`), not a copy -- confirm.
one_step_ft_t5 = trainer.model

In [None]:
# Display the model object taken from the trainer.
one_step_ft_t5

In [29]:
# Free GPU memory: drop the trainer object, then ask PyTorch to release its cached CUDA memory.
# NOTE(review): the model itself is still referenced by `original_model` /
# `one_step_ft_t5`, so presumably its weights stay allocated -- confirm.
del trainer
torch.cuda.empty_cache()

In [19]:
# PEFT: Parameter-Efficient Fine-Tuning
# LoRA: Low-Rank Adaptation of Large Language Models
from peft import LoraConfig, get_peft_model, TaskType, PeftModel

In [20]:
# LoRA settings for the seq2seq model; adapters target the modules named 'q' and 'v'.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    target_modules=['q', 'v'],
    r=32,
    lora_alpha=32,
    lora_dropout=0.01,
)

In [21]:
# Wrap the base model with the LoRA configuration. The trainable-parameter
# count printed in the next cell is 3,538,944, versus 247,577,856 for the base model.
# NOTE(review): in a top-to-bottom run `original_model` has already gone through
# the one-step full fine-tune above -- confirm whether a freshly loaded base
# model is intended here.
peft_model = get_peft_model(original_model, lora_config)

In [22]:
# Count the parameters of the LoRA-wrapped model that will receive gradients.
parameter_num = sum(
    weight.numel() for weight in peft_model.parameters() if weight.requires_grad
)
print(parameter_num)

3538944


In [None]:
# LoRA fine-tuning run: configure the arguments, build the trainer, then train for one epoch.
out_dir = './peft_flant5_dialogsum_finetuned'
training_args = TrainingArguments(
    output_dir=out_dir,
    num_train_epochs=1,
    # Far fewer trainable parameters than in full fine-tuning, so the
    # learning rate is raised somewhat.
    learning_rate=1e-3,
    weight_decay=0.01,
    auto_find_batch_size=True,
)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
)

trainer.train()

In [24]:
# Load a saved LoRA adapter from local disk on top of a freshly loaded base model.
base_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
peft_model_from_local = PeftModel.from_pretrained(
    base_model,
    '/content/peft_model',
    torch_dtype=torch.bfloat16,
    is_trainable=False,  # loaded for inference, not further training
)

In [None]:
# Move the adapter-loaded model to the selected device. As the last expression
# of the cell, the value returned by `.to(...)` is also displayed.
peft_model_from_local.to(device)

In [None]:
!pip install rouge_score

In [None]:
# Load the ROUGE metric through the `evaluate` library.
# Presumably this relies on the `rouge_score` package installed in the previous cell.
rouge = evaluate.load('rouge')