In [None]:
def preprocess_data(dataframe, max_input_length=128, max_target_length=128):
    """Build the two-column text-to-text frame used for training.

    Args:
        dataframe: Table with at least an ``input`` and an ``output`` column.
        max_input_length: Accepted but not used by this function.
        max_target_length: Accepted but not used by this function.

    Returns:
        A new DataFrame with exactly two columns: ``input_text`` (each
        ``input`` value prefixed with ``"복원: "``) and ``target_text`` (the
        ``output`` value). The frame passed in is left untouched.
    """
    working = dataframe.copy()
    working['target_text'] = working['output']
    # str.format with a bare "{}" renders each value exactly like an f-string would.
    working['input_text'] = working['input'].apply("복원: {}".format)
    return working.loc[:, ['input_text', 'target_text']]

# Preprocess the training data.
# NOTE(review): `train_df` is only assigned further down (in the CSV-loading
# cell), so on a fresh kernel this cell raises NameError when run in order.
preprocessed_train_df = preprocess_data(train_df)

In [1]:
# Print the GPU status table (device, driver/CUDA version, memory in use).
!nvidia-smi

Sun Jan 19 14:25:11 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA TITAN RTX               Off | 00000000:3B:00.0 Off |                  N/A |
| 40%   27C    P8               7W / 280W |      6MiB / 24576MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

# Fine-Tuning

In [1]:
import torch
from torch.utils.data import DataLoader
from torch.nn.utils import clip_grad_norm_
from transformers import AutoTokenizer, T5ForConditionalGeneration, get_scheduler
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
import gc

In [2]:
class Config:
    """Hyperparameters and run settings, exposed as instance attributes."""

    def __init__(self):
        settings = {
            "model_name": "paust/pko-t5-base",
            "max_length": 512,
            "batch_size": 1,  # batch size reduced
            "num_epochs": 3,
            "learning_rate": 1e-4,
            "gradient_accumulation_steps": 16,  # gradient accumulation increased
            "weight_decay": 0.01,
            "seed": 42,
            "data_sample_frac": 0.1,  # dataset sampling fraction (for debugging)
        }
        # Attributes are created in the order listed above.
        for attribute, value in settings.items():
            setattr(self, attribute, value)

In [3]:
def clear_gpu_memory():
    """Run the garbage collector, then empty PyTorch's CUDA cache."""
    # Order matters: collect first so tensors freed by the GC can leave the cache.
    for release in (gc.collect, torch.cuda.empty_cache):
        release()

In [4]:
# System prompt prepended to every model input. English gloss of the Korean text:
#   "You are an expert at restoring obfuscated Korean reviews to their original form.
#    Restore the given obfuscated review to natural Korean.
#    Word spacing and length must stay identical to the input."
SYSTEM_PROMPT = (
    "당신은 난독화된 한글 리뷰를 원래 형태로 복원하는 전문가입니다.\n"
    "주어진 난독화된 리뷰를 자연스러운 한국어로 복원해주세요.\n"
    "단어 간격과 길이는 입력값과 동일하게 유지해야 합니다."
)

In [5]:
# Instantiate the run configuration and seed PyTorch's random number generator
# from it. `Config.seed` was previously defined but never applied, so runs
# (e.g. shuffled batch order) were not repeatable.
config = Config()
torch.manual_seed(config.seed);  # trailing ';' hides the returned Generator repr

In [6]:
# Step 1. Load and preprocess the data
def preprocess_data(dataframe):
    """Return a new frame holding only the model input and target text.

    ``input_text`` is the ``input`` column with ``"복원: "`` prepended to every
    value; ``target_text`` is the ``output`` column unchanged. The frame passed
    in is not modified.
    """
    output_columns = ['input_text', 'target_text']
    # assign() works on a copy, so the caller's frame keeps its original columns.
    enriched = dataframe.assign(
        input_text=lambda frame: frame['input'].apply("복원: {}".format),
        target_text=lambda frame: frame['output'],
    )
    return enriched[output_columns]

# Read the raw training table ('utf-8-sig' strips a leading BOM if present)
# and turn it into input/target text pairs.
train_df = pd.read_csv('../data/train.csv', encoding='utf-8-sig')
preprocessed_df = preprocess_data(train_df)

# Hold out 10% of the rows for validation. The split seed comes from the
# configuration instead of a second hard-coded 42, so there is a single place
# to change it.
# NOTE(review): `config.data_sample_frac` is not applied here; the full
# dataset is always used.
train_data, val_data = train_test_split(
    preprocessed_df, test_size=0.1, random_state=config.seed
)
print("Data Preprocessed.")

Data Preprocessed.


In [7]:
# Step 2: Load the model and tokenizer
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(config.model_name)
# Module.to() moves the weights and returns the same module, so it can be chained.
model = T5ForConditionalGeneration.from_pretrained(config.model_name).to(device)
print("Model and Tokenizer Loaded.")

Model and Tokenizer Loaded.


In [8]:
# Step 3: Define the dataset class
class ReviewDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one (input_text, target_text) row per item.

    Args:
        dataframe: Table with ``input_text`` and ``target_text`` columns.
        tokenizer: Callable tokenizer that returns ``input_ids`` and
            ``attention_mask`` tensors and exposes ``pad_token_id``.
        max_input_length: Truncation limit (in tokens) for the prompted input.
        max_target_length: Truncation limit (in tokens) for the target.
        system_prompt: Text placed in front of every input. Defaults to the
            module-level ``SYSTEM_PROMPT``.

    Each item is a dict with 1-D tensors ``input_ids``, ``attention_mask`` and
    ``labels``. The tensors are NOT padded to a common length, so a DataLoader
    with ``batch_size > 1`` needs a padding ``collate_fn``.
    """

    def __init__(self, dataframe, tokenizer, max_input_length, max_target_length, system_prompt=None):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length
        # Resolve the default lazily so an explicit prompt never touches the global.
        self.system_prompt = SYSTEM_PROMPT if system_prompt is None else system_prompt

    def __len__(self):
        return len(self.dataframe)

    def _encode(self, text, max_length):
        """Tokenize a single string into (1, seq_len) tensors, truncating at ``max_length``."""
        # A lone sequence has nothing to be padded against, so the previous
        # padding=True never padded anything; padding=False states that plainly.
        return self.tokenizer(
            text,
            max_length=max_length,
            padding=False,
            truncation=True,
            return_tensors="pt"
        )

    def __getitem__(self, idx):
        item = self.dataframe.iloc[idx]
        input_text = f"{self.system_prompt}\n\n입력: {item['input_text']}"
        input_encoding = self._encode(input_text, self.max_input_length)
        target_encoding = self._encode(item['target_text'], self.max_target_length)

        # Replace pad ids with -100 so that, if padding is ever introduced,
        # padded positions are excluded from the loss.
        labels = target_encoding['input_ids'].clone()
        labels[labels == self.tokenizer.pad_token_id] = -100

        # squeeze(0) drops only the leading batch axis. A bare squeeze() would
        # also collapse a length-1 sequence into a 0-d tensor.
        return {
            'input_ids': input_encoding['input_ids'].squeeze(0),
            'attention_mask': input_encoding['attention_mask'].squeeze(0),
            'labels': labels.squeeze(0),
        }

In [9]:
# Token ids of the system prompt alone, shape (1, prompt_length).
prompt_tokens = tokenizer(SYSTEM_PROMPT, return_tensors="pt")['input_ids']
prompt_length = prompt_tokens.shape[1]

# Input budget = prompt tokens + review tokens + 10 extra tokens, capped at the
# tokenizer's own limit.
# NOTE(review): the +10 presumably covers the "\n\n입력: " separator — confirm.
input_budget = prompt_length + config.max_length + 10
max_input_length = min(input_budget, tokenizer.model_max_length)
print(f"Prompt length: {prompt_length}, Final max_input_length: {max_input_length}")

Prompt length: 65, Final max_input_length: 587


In [10]:
# Wrap both splits with identical tokenizer and length settings.
train_dataset, val_dataset = (
    ReviewDataset(split, tokenizer, max_input_length, config.max_length)
    for split in (train_data, val_data)
)
print("Datasets Created.")

Datasets Created.


In [11]:
# Step 4: Set up the optimizer and the learning-rate scheduler
optimizer = AdamW(model.parameters(), lr=config.learning_rate, weight_decay=config.weight_decay)

# train_model looks up a module-level `gradient_accumulation_steps`; define it
# from the configuration so the name exists on a fresh kernel.
gradient_accumulation_steps = config.gradient_accumulation_steps

# The scheduler is stepped once per optimizer update, not once per batch, so
# the schedule length must count optimizer updates. Counting batches would make
# the linear decay `gradient_accumulation_steps` times too long and the
# learning rate would barely decrease.
batches_per_epoch = -(-len(train_dataset) // config.batch_size)  # ceiling division
updates_per_epoch = -(-batches_per_epoch // gradient_accumulation_steps)
num_training_steps = updates_per_epoch * config.num_epochs

lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [14]:
# Step 5: Training function
def _print_sample_generation(model, tokenizer, batch, device, step):
    """Beam-search decode the first example of ``batch`` and print the result."""
    model.eval()
    with torch.no_grad():
        sample_input_ids = batch['input_ids'][0].unsqueeze(0).to(device)
        sample_attention_mask = batch['attention_mask'][0].unsqueeze(0).to(device)
        generated_output = model.generate(
            input_ids=sample_input_ids,
            attention_mask=sample_attention_mask,
            max_length=512,
            num_beams=5,
            early_stopping=True
        )
        decoded_output = tokenizer.decode(generated_output[0], skip_special_tokens=True)
        print(f"Step {step}: Generated Output: {decoded_output}")
    model.train()


def _mean_validation_loss(model, val_loader, device):
    """Return the mean per-batch loss over ``val_loader`` (0.0 when it is empty)."""
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in val_loader:
            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                labels=batch['labels'].to(device)
            )
            val_loss += outputs.loss.item()
    return val_loss / max(len(val_loader), 1)


def train_model(
    model, train_dataset, val_dataset, tokenizer, optimizer, scheduler, device, num_epochs=3, batch_size=4, max_grad_norm=1.0,
    gradient_accumulation_steps=None, log_every=20, sample_every=100
):
    """Train ``model`` with gradient accumulation and validate after every epoch.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` returning an object with ``.loss``; must also
            provide ``generate`` when ``sample_every`` is non-zero.
        train_dataset, val_dataset: Datasets yielding dicts with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        tokenizer: Used only to decode sample generations.
        optimizer, scheduler: Stepped once per optimizer update.
        device: Device the batches are moved to.
        num_epochs: Number of passes over ``train_dataset``.
        batch_size: Micro-batch size for both loaders.
        max_grad_norm: Gradient-norm clipping threshold applied before each update.
        gradient_accumulation_steps: Micro-batches accumulated per optimizer
            update. When ``None``, a module-level variable of the same name is
            used if it exists (the behaviour of earlier revisions), else 1.
        log_every: Print the loss every this many optimizer updates (0 disables).
        sample_every: Print a sample generation every this many optimizer
            updates (0 disables).

    Returns:
        A list with one dict per epoch: ``{"epoch", "train_loss", "val_loss"}``,
        where both losses are means over batches.

    Raises:
        ValueError: If ``gradient_accumulation_steps`` resolves to less than 1.
    """
    if gradient_accumulation_steps is None:
        # Earlier revisions read this name from module scope (and failed with
        # NameError when it was missing); keep honouring it, default to 1.
        gradient_accumulation_steps = globals().get("gradient_accumulation_steps", 1)
    if gradient_accumulation_steps < 1:
        raise ValueError("gradient_accumulation_steps must be >= 1")

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    history = []

    for epoch in range(num_epochs):
        model.train()
        # Clear gradients once per accumulation window. Zeroing on every
        # micro-batch would discard everything accumulated so far.
        optimizer.zero_grad()
        total_loss = 0.0
        step = 0  # optimizer updates performed in this epoch
        num_batches = len(train_loader)

        for batch_index, batch in enumerate(train_loader, start=1):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            batch_loss = outputs.loss
            # Scale only the gradient; the reported loss stays un-normalized so
            # the epoch average is a true per-batch mean.
            (batch_loss / gradient_accumulation_steps).backward()
            total_loss += batch_loss.item()

            window_full = batch_index % gradient_accumulation_steps == 0
            last_batch = batch_index == num_batches
            if not (window_full or last_batch):
                continue

            # Apply the accumulated gradients. The final, possibly shorter,
            # window of an epoch is flushed too instead of being dropped (its
            # gradient keeps the 1/gradient_accumulation_steps scaling).
            clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            step += 1

            # Logging and sampling run only right after an update, so they
            # fire once per matching step rather than on every micro-batch.
            if log_every and step % log_every == 0:
                print(f"Epoch {epoch + 1}, Step {step}, Loss: {batch_loss.item():.4f}")

            if sample_every and step % sample_every == 0:
                _print_sample_generation(model, tokenizer, batch, device, step)

        train_loss = total_loss / max(num_batches, 1)
        print(f"Epoch {epoch + 1}, Training Loss: {train_loss:.4f}")

        # Validation
        val_loss = _mean_validation_loss(model, val_loader, device)
        print(f"Epoch {epoch + 1}, Validation Loss: {val_loss:.4f}")
        model.train()

        history.append({"epoch": epoch + 1, "train_loss": train_loss, "val_loss": val_loss})

    return history

In [15]:
# Step 6: Train the model
# Collect the call arguments first so the training call itself stays short.
training_arguments = dict(
    model=model,
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    tokenizer=tokenizer,
    optimizer=optimizer,
    scheduler=lr_scheduler,
    device=device,
    num_epochs=config.num_epochs,
    batch_size=config.batch_size,
)

# Free cached GPU memory before and after the run.
clear_gpu_memory()
train_model(**training_arguments)
clear_gpu_memory()

OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 MiB. GPU 0 has a total capacity of 23.64 GiB of which 1.56 MiB is free. Process 1085502 has 4.72 GiB memory in use. Process 1089231 has 4.72 GiB memory in use. Process 1160209 has 10.44 GiB memory in use. Process 1356499 has 1.88 GiB memory in use. Process 1643078 has 1.87 GiB memory in use. Of the allocated memory 1.66 GiB is allocated by PyTorch, and 13.89 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Step 9: Save the model
# No `trainer` object is created in the cells visible here (training uses the
# hand-written loop), so the previous `trainer.save_model(...)` call raised
# NameError. Save through the model's own `save_pretrained` instead, into the
# same directory as the tokenizer.
model.save_pretrained("./final-model")
tokenizer.save_pretrained("./final-model")
print("Model and tokenizer saved.")

In [None]:
# 12. Simple inference test
def generate_text(text, model, tokenizer, max_length=128):
    """Restore one obfuscated string with beam search and return the decoded text.

    ``max_length`` is used twice: as the truncation limit for the tokenized
    input and as the generation length limit.

    NOTE(review): ReviewDataset builds training inputs as
    ``SYSTEM_PROMPT + "\\n\\n입력: " + input_text``, whereas this helper sends
    only ``"복원: {text}"`` — confirm the mismatch is intended.
    """
    prompt = "복원: {}".format(text)
    encoded = tokenizer(prompt, return_tensors="pt", max_length=max_length, truncation=True)
    # Move the encoded tensors to wherever the model lives.
    encoded = encoded.to(model.device)
    generated = model.generate(**encoded, max_length=max_length, num_beams=5)
    best_sequence = generated[0]
    return tokenizer.decode(best_sequence, skip_special_tokens=True)

# Take one sample from the test data and restore it.
# NOTE(review): `test_df` is not assigned in any cell visible here; it has to
# be loaded before this cell can run.
test_sample = test_df['input'].iloc[0]
restored_text = generate_text(test_sample, model, tokenizer)
print(f"Input: {test_sample}", f"Restored: {restored_text}", sep="\n")