In [1]:
import pandas as pd
import torch
import os
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer, TrainerCallback
from datasets import Dataset
import gc

In [2]:
torch.cuda.empty_cache()
gc.collect()

114

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
# Î™®Îç∏Í≥º ÌÜ†ÌÅ¨ÎÇòÏù¥Ï†Ä Î°úÎìú
model_name = "paust/pko-t5-base"  # ÎòêÎäî "paust/pko-t5-large"
fine_tuned_model_path = "./pko_best_model"  # FTÎêú Î™®Îç∏ Í∞ÄÏ§ëÏπò Í≤ΩÎ°ú

In [5]:
# ÌÜ†ÌÅ¨ÎÇòÏù¥Ï†Ä Î°úÎìú
tokenizer = AutoTokenizer.from_pretrained(model_name)
# SafeTensors ÏßÄÏõê ÏòµÏÖò Ï∂îÍ∞Ä
model = AutoModelForSeq2SeqLM.from_pretrained(
    fine_tuned_model_path,
    use_safetensors=True,  # SafeTensors Î™®Îç∏ Î°úÎìú
    device_map="auto"  # ÏûêÎèôÏúºÎ°ú GPU Î©îÎ™®Î¶¨ ÏµúÏ†ÅÌôî
)

In [6]:
# Îç∞Ïù¥ÌÑ∞ Î°úÎìú
augmented_data = pd.read_csv("./data/augmented_train_input.csv", encoding="utf-8-sig")

In [7]:
# Îç∞Ïù¥ÌÑ∞ Ï†ÑÏ≤òÎ¶¨ Ìï®Ïàò
def preprocess_data(examples):
    prefix = (
        "ÎãπÏã†ÏùÄ ÌïúÍµ≠Ïñ¥ Î¶¨Î∑∞ Î≥µÏõê Ï†ÑÎ¨∏Í∞ÄÏûÖÎãàÎã§.\n"
        "ÎãπÏã†Ïùò ÏûÑÎ¨¥Îäî ÎÇúÎèÖÌôîÎêú ÌïúÍ∏Ä Î¶¨Î∑∞Î•º Î∂ÑÏÑùÌïòÍ≥†, Ïù¥Î•º ÏûêÏó∞Ïä§ÎüΩÍ≥† Î™ÖÌôïÌïú ÏõêÎûò ÏùòÎØ∏Ïùò ÌïúÍ∏Ä Î¶¨Î∑∞Î°ú Î≥µÏõêÌïòÎäî Í≤ÉÏûÖÎãàÎã§.\n"
        "ÎÇúÎèÖÌôîÎêú Î¶¨Î∑∞Ïùò Îã®Ïñ¥Î•º ÏõêÎ≥∏ Îã®Ïñ¥Î°ú Î≥µÏõêÌïòÍ≥†, ÎùÑÏñ¥Ïì∞Í∏∞ÏôÄ Î¨∏Ïû• Íµ¨Ï°∞ÎèÑ ÏõêÎûòÎåÄÎ°ú Î≥µÏõêÌïòÏÑ∏Ïöî.\n"
        "Î¨∏Îß•ÏùÑ Î∂ÑÏÑùÌïòÏó¨ ÏûêÏó∞Ïä§ÎüΩÍ≥† ÏùòÎØ∏ ÏûàÎäî Î≥µÏõêÏùÑ ÏàòÌñâÌïòÎ©∞, Ï∂úÎ†•ÏùÄ Ïò§ÏßÅ ÌïúÍµ≠Ïñ¥Î°úÎßå ÏûëÏÑ±ÌïòÏã≠ÏãúÏò§.\n\n"
    )
    
    inputs = [prefix + text for text in examples["input"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["output"], max_length=512, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [8]:
# Hugging Face Dataset Î≥ÄÌôò
train_dataset = Dataset.from_pandas(augmented_data).map(preprocess_data, batched=True)

Map:   0%|          | 0/22526 [00:00<?, ? examples/s]



In [9]:

# Best Model Ï†ÄÏû• ÏΩúÎ∞±
class SaveBestModelCallback(TrainerCallback):
    def __init__(self):
        self.best_loss = float('inf')
        self.best_model_path = "./best_model_augmented"

    def on_log(self, args, state, control, **kwargs):
        for log in state.log_history:
            if "loss" in log and log["loss"] < self.best_loss:
                self.best_loss = log["loss"]
                print(f"New best training loss: {self.best_loss}. Saving model to {self.best_model_path}")
                kwargs["model"].save_pretrained(self.best_model_path)
                kwargs["tokenizer"].save_pretrained(self.best_model_path)

In [10]:
# ÌõàÎ†® ÌååÎùºÎØ∏ÌÑ∞ ÏÑ§Ï†ï
training_args = TrainingArguments(
    output_dir="./pko_augmented",
    evaluation_strategy="no",
    learning_rate=3e-5,  # Fine-tuningÏóêÎäî ÏùºÎ∞òÏ†ÅÏúºÎ°ú ÏûëÏùÄ ÌïôÏäµÎ•† ÏÇ¨Ïö©
    per_device_train_batch_size=8,
    num_train_epochs=15,  # Ï∂îÍ∞Ä ÌïôÏäµÏù¥ÎØÄÎ°ú 15 Epoch Ï†ïÎèÑÎ°ú ÏÑ§Ï†ï
    weight_decay=0.01,
    save_total_limit=2,
    save_steps=500,
    logging_dir="./logs_augmented",
    logging_steps=100,
    fp16=torch.cuda.is_available(),
)



In [11]:
# Trainer Ï†ïÏùò
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    callbacks=[SaveBestModelCallback()]
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [12]:
# Î™®Îç∏ Ï∂îÍ∞Ä ÌõàÎ†®
trainer.train()

[2025-02-16 14:53:45,806] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/opt/conda/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/opt/conda/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
[34m[1mwandb[0m: Currently logged in as: [33m20211367[0m ([33m20211367-sungshin-women-s-university[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
100,0.0059
200,0.0061
300,0.0057
400,0.0058
500,0.0062
600,0.0049
700,0.0047
800,0.0054
900,0.005
1000,0.0049


New best training loss: 0.0059. Saving model to ./best_model_augmented
New best training loss: 0.0057. Saving model to ./best_model_augmented
New best training loss: 0.0049. Saving model to ./best_model_augmented
New best training loss: 0.0047. Saving model to ./best_model_augmented
New best training loss: 0.0045. Saving model to ./best_model_augmented
New best training loss: 0.0041. Saving model to ./best_model_augmented
New best training loss: 0.0038. Saving model to ./best_model_augmented
New best training loss: 0.0035. Saving model to ./best_model_augmented
New best training loss: 0.0034. Saving model to ./best_model_augmented
New best training loss: 0.0032. Saving model to ./best_model_augmented
New best training loss: 0.0031. Saving model to ./best_model_augmented
New best training loss: 0.0028. Saving model to ./best_model_augmented
New best training loss: 0.0027. Saving model to ./best_model_augmented
New best training loss: 0.0025. Saving model to ./best_model_augmented
New be

TrainOutput(global_step=42240, training_loss=0.001574061460048666, metrics={'train_runtime': 24731.2995, 'train_samples_per_second': 13.662, 'train_steps_per_second': 1.708, 'total_flos': 2.4590529925742592e+17, 'train_loss': 0.001574061460048666, 'epoch': 15.0})