In [1]:
# !pip install peft

출처
- https://huggingface.co/docs/peft/quicktour
- https://huggingface.co/bigscience/mt0-large

#### 기본 모델 설정 및 테스트

In [1]:
import os
import torch

os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]= "0,1,2,3"

In [2]:
# 기본 모델 설정
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
checkpoint = "bigscience/mt0-large"
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, 
                                              torch_dtype=torch.float16,
                                              device_map='auto'
                                              )
    
# 토크나이저 지정
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# 테스트
input_texts = ["Translate to English: Je t’aime.",
               "Translate to Korean: Je t’aime."]

for input_text in input_texts:
    print("-"*30)
    inputs = tokenizer.encode(input_text, return_tensors="pt").to("cuda")
    outputs = model.generate(inputs)
    output_text = tokenizer.decode(outputs[0])
    print(f"입력 : {input_text}")
    print(f"출력 : {output_text}")

------------------------------




입력 : Translate to English: Je t’aime.
출력 : <pad> I love you.</s>
------------------------------
입력 : Translate to Korean: Je t’aime.
출력 : <pad> 我爱你。</s>


#### 데이터세트 가공
- (1) 가공후 json으로 저장
- (2) load_dataset 모듈을 사용하여 데이터세트 로드

In [4]:
# 데이터 로드 및 지정
import json
DATA_PATH = "dataset/raw/TL_한국어-다국어_kofr_교양/"
SAVE_PATH = "dataset/processed/fr_ko.json"
save_lst = []
for i in os.listdir(DATA_PATH):
    data = json.load(open(DATA_PATH+i))
    save_lst.append({"source":"Translate to Korean: "+data["최종번역문"],
                     "target":data["원문"]})
with open(SAVE_PATH, "w") as json_file:
    json.dump(save_lst, json_file, indent=4, ensure_ascii=False)

In [5]:
from datasets import load_dataset
dataset = load_dataset('json', data_files=SAVE_PATH)
dataset

Generating train split: 16350 examples [00:00, 278032.00 examples/s]


DatasetDict({
    train: Dataset({
        features: ['target', 'source'],
        num_rows: 16350
    })
})

In [6]:
def preprocess_function(examples):
    '''기본적인 토크나이징 방식'''
    return tokenizer(examples["source"], examples["target"], padding="max_length", truncation=True, max_length=128)

def tokenize_function(examples):
    '''Seq2Seq에서의 토크나이징 방식'''
    # 입력 문장을 토크나이징
    model_inputs = tokenizer(examples["source"], padding="max_length", truncation=True, max_length=128)
    
    # 타겟 문장을 토크나이징하여 디코더 입력을 생성
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["target"], padding="max_length", truncation=True, max_length=128)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/16350 [00:00<?, ? examples/s]

Map: 100%|██████████| 16350/16350 [00:01<00:00, 8228.56 examples/s]


In [8]:
# 학습 및 테스트로 분할
train_test_split_dataset = tokenized_dataset["train"].train_test_split(test_size=0.2)
train_dataset = train_test_split_dataset['train']
test_dataset = train_test_split_dataset['test']

Trainer Args 지정

In [9]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

training_args = Seq2SeqTrainingArguments(
    output_dir="model/bigscience/mt0-large-lora",
    learning_rate=1e-3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

In [10]:
os.environ["WANDB_PROJECT"]="huggingface"
os.environ["WANDB_ENTITY"]="changwoochoi"
os.environ["WANDB_NAME"]="ccw_finetuning_test"
os.environ["TOKENIZERS_PARALLELISM"]="false"

In [11]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)

trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mchangwoo7463[0m ([33mchangwoochoi[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,14.1924,


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


TrainOutput(global_step=818, training_loss=8.67505539425428, metrics={'train_runtime': 738.8741, 'train_samples_per_second': 17.703, 'train_steps_per_second': 1.107, 'total_flos': 9778890276864000.0, 'train_loss': 8.67505539425428, 'epoch': 1.0})

In [12]:
trainer.save_model(output_dir=training_args.output_dir)

In [13]:
# 테스트
input_texts = ["Translate to English: Je t’aime.",
               "Translate to Korean: Je t’aime."]

for input_text in input_texts:
    print("-"*30)
    inputs = tokenizer.encode(input_text, return_tensors="pt").to("cuda")
    outputs = model.generate(inputs)
    output_text = tokenizer.decode(outputs[0])
    print(f"입력 : {input_text}")
    print(f"출력 : {output_text}")

------------------------------




입력 : Translate to English: Je t’aime.
출력 : <pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
------------------------------
입력 : Translate to Korean: Je t’aime.
출력 : <pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>


#### PEFT 적용

In [10]:
from peft import LoraConfig, TaskType

In [11]:
# peft 관련 configuration
'''
task_type : task 종류
inference_mode : 추론에 사용할지 유무
r : rank 값
lora_alpha : LoRA 레이어에 사용되는 스케일링 인자
lora_dropout : LoRA 레이어에 사용되는 드롭아웃 값
'''
peft_config = LoraConfig(task_type=TaskType.SEQ_2_SEQ_LM, 
                         inference_mode=False, 
                         r=8, 
                         lora_alpha=32, 
                         lora_dropout=0.1)

In [12]:
# 기본 모델에 PEFT 적용
# from peft import get_peft_model
# peft_model = get_peft_model(model, peft_config)
# peft_model.print_trainable_parameters()

In [16]:
# if torch.cuda.is_available():
#     peft_model = torch.nn.DataParallel(peft_model)
if torch.cuda.is_available():
    model = torch.nn.DataParallel(model)

In [17]:
# 학습 관련 configuration
from transformers import TrainingArguments
from datasets import load_metric

training_args = TrainingArguments(
    output_dir="model/bigscience/mt0-large-lora",
    learning_rate=1e-3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

def compute_metrics(pred):
    bleu_metric = load_metric("bleu")
    predictions, labels = pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # 토크나이저가 패딩으로 추가한 부분을 제거
    labels = [[label] for label in labels]
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # BLEU 점수 계산
    result = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["bleu"]}

In [18]:
# os.environ["MASTER_ADDR"] = "localhost"
# os.environ["MASTER_PORT"] = "9994"  # modify if RuntimeError: Address already in use
# os.environ["RANK"] = "0"
# os.environ["LOCAL_RANK"] = "0"
# os.environ["WORLD_SIZE"] = "1"
# os.environ["TOKENIZERS_PARALLELISM"]= "true"

# 학습 수행
from trl import SFTTrainer
# trainer = SFTTrainer(
#     model=peft_model,
#     peft_config=peft_config,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=eval_dataset,
#     tokenizer=tokenizer,
#     compute_metrics=compute_metrics,
#     packing=False,
#     dataset_text_field = 'text'
# )

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    packing=False,
    dataset_text_field = 'text'
)


trainer.train()



ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided []

In [28]:
len(eval_dataset[-1]['input_ids'])

2048

In [None]:
model.save_pretrained("")

In [None]:
from huggingface_hub import notebook_login

notebook_login()
model.push_to_hub("your-name/bigscience/mt0-large-lora")