## 데이터 불러오기

In [4]:
from datasets import load_from_disk

# Location of the saved training split on disk.
train_data_dir = "resources/raw_data/decoder_data/train"
datasets = load_from_disk(train_data_dir)

In [5]:
# Display the dataset summary (feature names and row count).
datasets

Dataset({
    features: ['ID', 'text', 'noise_added_text'],
    num_rows: 496
})

In [8]:
# Show the first noisy sample followed by its clean counterpart.
for column in ('noise_added_text', 'text'):
    print(datasets[column][0])

듀얼심 아이& -반기 출시설 솔+…알뜰> &대]
듀얼심 아이폰 하반기 출시설 솔솔…알뜰폰 기대감


## Pre-trained 모델 및 토크나이저 불러오기

In [None]:
from transformers import AutoConfig,AutoModelForSeq2SeqLM,AutoTokenizer

In [None]:
# Checkpoint identifier passed to every from_pretrained call below.
model_name = "KETI-AIR/ke-t5-large"

In [None]:
# Load the config, tokenizer and seq2seq model from the same checkpoint.
# Every call receives cache_dir=None.
pretrained_kwargs = {"cache_dir": None}

config = AutoConfig.from_pretrained(model_name, **pretrained_kwargs)
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True,
    **pretrained_kwargs,
)
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    config=config,
    **pretrained_kwargs,
)

## 설정하기

In [None]:
def tokenize_count_function(examples):
    """Add per-example token counts for the clean and noisy text columns.

    Args:
        examples: A batch mapping column names to lists of strings; it must
            contain the 'text' and 'noise_added_text' columns.

    Returns:
        The same ``examples`` object with two columns added:
        'text_tokenized_length' and 'noise_text_tokenized_length', each
        holding one token count per example.
    """
    def count_tokens(column):
        # Token count per string, using the module-level tokenizer.
        return [len(tokenizer.tokenize(sentence)) for sentence in examples[column]]

    clean_counts = count_tokens('text')
    noisy_counts = count_tokens('noise_added_text')

    examples["text_tokenized_length"] = clean_counts
    examples["noise_text_tokenized_length"] = noisy_counts
    return examples

In [None]:
# Attach the token-count columns, then print the dataset summary and its first row.
tokenized_datasets = datasets.map(tokenize_count_function, batched=True)
for preview in (tokenized_datasets, tokenized_datasets[0]):
    print(preview)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Token-count columns for the clean and the noisy text.
text_tokenized_length = tokenized_datasets['text_tokenized_length']
noise_text_tokenized_length = tokenized_datasets['noise_text_tokenized_length']

# Summary statistics shown as an annotation on each histogram.
cleaned_Max = max(text_tokenized_length)
cleaned_Min = min(text_tokenized_length)
cleaned_Mean = np.mean(text_tokenized_length)

noise_Max = max(noise_text_tokenized_length)
noise_Min = min(noise_text_tokenized_length)
noise_Mean = np.mean(noise_text_tokenized_length)

fig, ax = plt.subplots(1, 2, figsize=(16, 7))

# One panel per column: (axes, column label, lengths, max, min, mean).
panels = (
    (ax[0], 'text', text_tokenized_length,
     cleaned_Max, cleaned_Min, cleaned_Mean),
    (ax[1], 'noise_added_text', noise_text_tokenized_length,
     noise_Max, noise_Min, noise_Mean),
)
for axis, column, lengths, longest, shortest, average in panels:
    axis.hist(lengths, bins=10, edgecolor='black')
    # Name the column in the title so the two panels can be told apart.
    axis.set_title(f'Histogram of Tokenized Lengths ({column})')
    axis.set_xlabel('Tokenized Length')
    axis.set_ylabel('Frequency')
    # Annotate through the panel's own Axes: plt.text would attach the text
    # to whichever axes is current, regardless of the transform supplied.
    axis.text(
        0.7, 0.9,
        f'max:{longest}|min:{shortest}|mean:{int(average)}',
        color='black',
        transform=axis.transAxes,
    )

plt.show()

In [None]:
# Tokenization limits and training hyperparameters shared by the cells below.
max_source_length = 30  # max_length given to the tokenizer in preprocess_function
max_target_length = 30  # presumably the cap for target (label) tokens -- confirm where it is consumed
padding = "max_length"  # padding mode given to the tokenizer in preprocess_function
preprocessing_num_workers = 12  # num_proc for the preprocessing datasets.map call
num_beams = 3  # presumably the beam width for generation -- confirm where it is consumed
num_train_epochs = 5  # forwarded to Seq2SeqTrainingArguments
train_batch_size = 64  # forwarded as per_device_train_batch_size
eval_batch_size = 8  # NOTE(review): evaluation is disabled in the training arguments; confirm this is needed
learning_rate = 5e-4  # forwarded to Seq2SeqTrainingArguments

## 전처리하기

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of noisy/clean text pairs into model inputs and labels.

    The noisy text becomes the encoder input and the clean text becomes the
    labels. Source and target are tokenized in separate calls so that each
    side is truncated/padded with its own limit (``max_source_length`` and
    ``max_target_length`` respectively).

    Args:
        examples: A batch mapping column names to lists of strings; it must
            contain the 'noise_added_text' and 'text' columns.

    Returns:
        The tokenizer output for the noisy text ('input_ids',
        'attention_mask') with an added 'labels' entry holding the token ids
        of the clean text.
    """
    # Encoder side: the noisy text, limited to max_source_length tokens.
    # Plain Python lists are returned; no tensor conversion is needed here
    # because batching into tensors is done later by the data collator.
    model_inputs = tokenizer(
        examples['noise_added_text'],
        max_length=max_source_length,
        truncation=True,
        padding=padding,
    )

    # Decoder side: tokenized on its own so max_target_length is honoured
    # instead of silently reusing the source limit.
    targets = tokenizer(
        text_target=examples['text'],
        max_length=max_target_length,
        truncation=True,
        padding=padding,
    )
    model_inputs["labels"] = targets["input_ids"]

    # NOTE(review): with padding="max_length" the labels keep pad-token ids,
    # so padded positions are presumably counted in the training loss.
    # Consider replacing them with -100 (and skipping -100 when decoding).
    return model_inputs


In [None]:
# Replace the raw columns with the tokenized features; the cache is bypassed
# so preprocess_function is always re-run.
raw_columns = datasets.column_names
datasets = datasets.map(
    preprocess_function,
    batched=True,
    num_proc=preprocessing_num_workers,
    load_from_cache_file=False,
    remove_columns=raw_columns,
)

In [None]:
# Display the dataset summary after preprocessing.
datasets

In [None]:
# Decode the first example's input ids and labels back to text as a sanity check.
first_example = datasets[0]
for field in ('input_ids', 'labels'):
    print(tokenizer.decode(first_example[field]))

## Fine-tuning하기

In [None]:
from transformers import (
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments
)

In [None]:
# Collator that batches the tokenized examples; it is given the tokenizer and
# the model and is later passed to the trainer.
data_collator = DataCollatorForSeq2Seq(
            tokenizer,
            model=model,
        )

In [None]:
# Sanity check: run the collator over the whole dataset and list the keys of
# the resulting batch.
examples = data_collator(datasets)
examples.keys()

In [None]:
import os
# Set before the training arguments are created; presumably this turns off
# the Weights & Biases logging integration -- confirm for the installed version.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# Training-only configuration: checkpoints and logs are written once per
# epoch, and evaluation is switched off.
args = Seq2SeqTrainingArguments(
    output_dir='outputs',
    do_train=True,          # run training
    do_eval=False,          # do not run evaluation
    per_device_train_batch_size=train_batch_size,
    predict_with_generate=False,  # no generation-based prediction (not needed, so set to False)
    num_train_epochs=num_train_epochs,
    save_strategy = 'epoch',
    evaluation_strategy = 'no',   # evaluation strategy set to 'no'
    save_total_limit = 2,
    logging_strategy = 'epoch',
    load_best_model_at_end = False,  # no evaluation, so there is no best model to reload
    learning_rate = learning_rate,
    remove_unused_columns = True,
    # report_to="none"
)


In [None]:
# Trainer wired with the model, training arguments, preprocessed training
# set, tokenizer and collator; no evaluation dataset is supplied.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=datasets,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [None]:
# Run fine-tuning and keep the object returned by the trainer.
train_result = trainer.train()

In [None]:
import torch

# Noisy input sentence to restore.
text="듀얼심 아이& -반기 출시설 솔+...알뜰> &대]"

# Device for inference: CUDA when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to that device.
model.to(device)
# The model is still in training mode after trainer.train(); switch to eval
# mode so dropout is disabled while generating.
model.eval()

# Tokenize the input text and move the tensors to the same device.
inputs = tokenizer(text, return_tensors="pt").to(device)

# Generate the restored sequence. This is inference, so gradients are disabled.
with torch.no_grad():
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        # Allow as many generated tokens as the labels were allowed during
        # preprocessing, instead of relying on the default length cap.
        max_new_tokens=max_target_length,
        # Use the beam width configured with the other hyperparameters.
        num_beams=num_beams,
    )

predicted_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(generated_ids[0])
print(predicted_text)


In [None]:
# Print the input text and the model's prediction, one per line.
print(f"입력 텍스트: {text}", f"모델의 예측: {predicted_text}", sep="\n")

In [18]:
# Sentinel-token (span-filling) prompts to try the model on.
my_masked_text = [
  "The kid went to the <extra_id_0>.",
  "The dog likes <extra_id_0> and also <extra_id_1>."
]

inputs = tokenizer(
  my_masked_text,    # tokenizer will encode each string in your list
  padding="longest", # need to pad if encoded strings are different of lengths
  return_tensors="pt",
).to(model.device)   # keep the inputs on the model's device to avoid a device mismatch

sequence_ids = model.generate(
  input_ids=inputs["input_ids"],
  attention_mask=inputs["attention_mask"]
)
# Decoded outputs, special tokens included.
sequences = tokenizer.batch_decode(sequence_ids)

### **콘텐츠 라이선스**

<font color='red'><b>**WARNING**</b></font> : **본 교육 콘텐츠의 지식재산권은 재단법인 네이버커넥트에 귀속됩니다. 본 콘텐츠를 어떠한 경로로든 외부로 유출 및 수정하는 행위를 엄격히 금합니다.** 다만, 비영리적 교육 및 연구활동에 한정되어 사용할 수 있으나 재단의 허락을 받아야 합니다. 이를 위반하는 경우, 관련 법률에 따라 책임을 질 수 있습니다. 모델 라이선스 : MIT License

