In [1]:
import pandas as pd
import torch
import os
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer, TrainerCallback
from datasets import Dataset
import gc

In [2]:
torch.cuda.empty_cache()
gc.collect()

114

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
# 모델과 토크나이저 로드
model_name = "paust/pko-t5-base"  # 또는 "paust/pko-t5-large"
fine_tuned_model_path = "./pko_best_model"  # FT된 모델 가중치 경로

In [5]:
# 토크나이저 로드
tokenizer = AutoTokenizer.from_pretrained(model_name)
# SafeTensors 지원 옵션 추가
model = AutoModelForSeq2SeqLM.from_pretrained(
    fine_tuned_model_path,
    use_safetensors=True,  # SafeTensors 모델 로드
    device_map="auto"  # 자동으로 GPU 메모리 최적화
)

In [6]:
# 데이터 로드
augmented_data = pd.read_csv("./data/augmented_train_input.csv", encoding="utf-8-sig")

In [7]:
# 데이터 전처리 함수
def preprocess_data(examples):
    prefix = (
        "당신은 한국어 리뷰 복원 전문가입니다.\n"
        "당신의 임무는 난독화된 한글 리뷰를 분석하고, 이를 자연스럽고 명확한 원래 의미의 한글 리뷰로 복원하는 것입니다.\n"
        "난독화된 리뷰의 단어를 원본 단어로 복원하고, 띄어쓰기와 문장 구조도 원래대로 복원하세요.\n"
        "문맥을 분석하여 자연스럽고 의미 있는 복원을 수행하며, 출력은 오직 한국어로만 작성하십시오.\n\n"
    )
    
    inputs = [prefix + text for text in examples["input"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["output"], max_length=512, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [8]:
# Hugging Face Dataset 변환
train_dataset = Dataset.from_pandas(augmented_data).map(preprocess_data, batched=True)

Map:   0%|          | 0/22526 [00:00<?, ? examples/s]



In [9]:

# Best Model 저장 콜백
class SaveBestModelCallback(TrainerCallback):
    def __init__(self):
        self.best_loss = float('inf')
        self.best_model_path = "./best_model_augmented"

    def on_log(self, args, state, control, **kwargs):
        for log in state.log_history:
            if "loss" in log and log["loss"] < self.best_loss:
                self.best_loss = log["loss"]
                print(f"New best training loss: {self.best_loss}. Saving model to {self.best_model_path}")
                kwargs["model"].save_pretrained(self.best_model_path)
                kwargs["tokenizer"].save_pretrained(self.best_model_path)

In [10]:
# 훈련 파라미터 설정
training_args = TrainingArguments(
    output_dir="./pko_augmented",
    evaluation_strategy="no",
    learning_rate=3e-5,  # Fine-tuning에는 일반적으로 작은 학습률 사용
    per_device_train_batch_size=8,
    num_train_epochs=15,  # 추가 학습이므로 15 Epoch 정도로 설정
    weight_decay=0.01,
    save_total_limit=2,
    save_steps=500,
    logging_dir="./logs_augmented",
    logging_steps=100,
    fp16=torch.cuda.is_available(),
)



In [11]:
# Trainer 정의
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    callbacks=[SaveBestModelCallback()]
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [12]:
# 모델 추가 훈련
trainer.train()

[2025-02-16 14:53:45,806] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/opt/conda/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/opt/conda/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
[34m[1mwandb[0m: Currently logged in as: [33m20211367[0m ([33m20211367-sungshin-women-s-university[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
100,0.0059
200,0.0061
300,0.0057
400,0.0058
500,0.0062
600,0.0049
700,0.0047
800,0.0054
900,0.005
1000,0.0049


New best training loss: 0.0059. Saving model to ./best_model_augmented
New best training loss: 0.0057. Saving model to ./best_model_augmented
New best training loss: 0.0049. Saving model to ./best_model_augmented
New best training loss: 0.0047. Saving model to ./best_model_augmented
New best training loss: 0.0045. Saving model to ./best_model_augmented
New best training loss: 0.0041. Saving model to ./best_model_augmented
New best training loss: 0.0038. Saving model to ./best_model_augmented
New best training loss: 0.0035. Saving model to ./best_model_augmented
New best training loss: 0.0034. Saving model to ./best_model_augmented
New best training loss: 0.0032. Saving model to ./best_model_augmented
New best training loss: 0.0031. Saving model to ./best_model_augmented
New best training loss: 0.0028. Saving model to ./best_model_augmented
New best training loss: 0.0027. Saving model to ./best_model_augmented
New best training loss: 0.0025. Saving model to ./best_model_augmented
New be

TrainOutput(global_step=42240, training_loss=0.001574061460048666, metrics={'train_runtime': 24731.2995, 'train_samples_per_second': 13.662, 'train_steps_per_second': 1.708, 'total_flos': 2.4590529925742592e+17, 'train_loss': 0.001574061460048666, 'epoch': 15.0})