In [13]:
# !pip install pytorch-lightning torchmetrics  datasets transformers

In [14]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer
import pytorch_lightning as pl
import random
import pandas as pd
from sklearn.model_selection import train_test_split

In [15]:
# 1. 데이터셋 정의
class KoEnTranslationDataset(Dataset):
    """Parallel Korean-English sentence dataset that tokenizes each pair on access.

    Args:
        data: pandas DataFrame holding one sentence pair per row, with a column
            named ``src_lang`` and a column named ``tgt_lang``.
        src_lang: Name of the source-language column (Korean by default).
        tgt_lang: Name of the target-language column (English by default).
        max_length: Every sequence is padded or truncated to exactly this many tokens.
        src_tokenizer: Optional already-loaded source tokenizer. When omitted,
            ``bert-base-multilingual-cased`` is loaded.
        tgt_tokenizer: Optional already-loaded target tokenizer. When omitted,
            ``bert-base-uncased`` is loaded.

    Passing tokenizers in lets several datasets (train/val/test) share one
    loaded instance instead of each loading its own copy.
    """
    def __init__(self, data, src_lang='ko', tgt_lang='en', max_length=128,
                 src_tokenizer=None, tgt_tokenizer=None):
        self.src_lang = src_lang  # source-language column name
        self.tgt_lang = tgt_lang  # target-language column name
        self.max_length = max_length  # fixed sequence length after padding/truncation

        # Fall back to the original pretrained tokenizers only when none were supplied.
        if src_tokenizer is None:
            # Multilingual tokenizer that covers Korean
            src_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        if tgt_tokenizer is None:
            # English-only tokenizer
            tgt_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.src_tokenizer = src_tokenizer
        self.tgt_tokenizer = tgt_tokenizer

        self.data = data  # DataFrame with the sentence-pair columns

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.data)

    def _encode(self, tokenizer, text):
        """Tokenize one sentence into fixed-length ``input_ids`` / ``attention_mask`` tensors."""
        return tokenizer(text, max_length=self.max_length, padding='max_length',
                         truncation=True, return_tensors='pt')

    def __getitem__(self, idx):
        """Return the tokenized pair at positional index ``idx``.

        Returns:
            dict with ``src_input_ids``, ``src_attention_mask``, ``tgt_input_ids``
            and ``tgt_attention_mask``; each is a 1-D tensor of length ``max_length``.
        """
        row = self.data.iloc[idx]  # look the row up once instead of once per column
        src_encoding = self._encode(self.src_tokenizer, row[self.src_lang])
        tgt_encoding = self._encode(self.tgt_tokenizer, row[self.tgt_lang])

        # The tokenizer returns a leading batch dimension of 1; drop it so the
        # DataLoader can stack items into [batch, max_length].
        return {
            'src_input_ids': src_encoding['input_ids'].squeeze(0),
            'src_attention_mask': src_encoding['attention_mask'].squeeze(0),
            'tgt_input_ids': tgt_encoding['input_ids'].squeeze(0),
            'tgt_attention_mask': tgt_encoding['attention_mask'].squeeze(0)
        }

In [16]:
# 2. 데이터 모듈
class KoEnTranslationDataModule(pl.LightningDataModule):
    """Lightning data module that serves train/validation/test loaders for the Ko-En corpus.

    Args:
        file_path: Path or URL of the CSV file; it is read through ``csv_reader``
            and is expected to contain ``ko`` and ``en`` columns.
        batch_size: Batch size used by all three loaders.
        max_length: Token length every sentence is padded/truncated to.
    """
    def __init__(self, file_path, batch_size=32, max_length=128):
        super().__init__()
        self.file_path = file_path
        self.batch_size = batch_size
        self.max_length = max_length
        # Populated by setup(); None means "not built yet".
        self.train_dataset = None
        self.val_dataset = None
        self.test_dataset = None

    def setup(self, stage=None):
        """Load the CSV and split it 70% / 15% / 15% into train / val / test datasets.

        The Trainer calls this once per stage (fit, test, ...). The split is
        deterministic (fixed ``random_state``), so the datasets are built only on
        the first call and reused afterwards instead of re-reading the file.
        """
        if self.train_dataset is not None:
            return

        data = csv_reader(self.file_path)
        # First carve off 30%, then halve it into validation and test.
        train_data, temp_data = train_test_split(data, test_size=0.3, random_state=42)
        val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

        # Forward max_length so the configured value is honoured rather than
        # the dataset's own default.
        self.train_dataset = KoEnTranslationDataset(train_data, max_length=self.max_length)
        self.val_dataset = KoEnTranslationDataset(val_data, max_length=self.max_length)
        self.test_dataset = KoEnTranslationDataset(test_data, max_length=self.max_length)

    def _make_loader(self, dataset, shuffle):
        """Build a DataLoader with the settings shared by all three splits."""
        return DataLoader(dataset, batch_size=self.batch_size, shuffle=shuffle, num_workers=2)

    def train_dataloader(self):
        """Return the shuffled training loader."""
        return self._make_loader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        """Return the validation loader (fixed order)."""
        return self._make_loader(self.val_dataset, shuffle=False)

    def test_dataloader(self):
        """Return the test loader (fixed order)."""
        return self._make_loader(self.test_dataset, shuffle=False)

In [17]:
# 3. Seq2Seq 번역 모델
class Seq2SeqTranslator(pl.LightningModule):
    """LSTM encoder-decoder (Seq2Seq) model for Korean-to-English translation.

    Args:
        src_vocab_size: Size of the source (Korean) vocabulary.
        tgt_vocab_size: Size of the target (English) vocabulary.
        tgt_tokenizer: Target tokenizer; its ``cls_token_id`` / ``sep_token_id`` are
            used as start / end-of-sequence markers and ``decode`` turns ids into text.
            It is excluded from the saved hyperparameters.
        embedding_dim: Embedding width for both vocabularies.
        hidden_dim: LSTM hidden size for encoder and decoder.
        num_layers: Number of stacked LSTM layers in encoder and decoder.
        dropout: Dropout probability (embeddings, and between LSTM layers when
            ``num_layers > 1``).
        learning_rate: Adam learning rate.

    Token id 0 is treated as padding throughout (embedding ``padding_idx``, loss
    ``ignore_index`` and source-length computation).
    """
    def __init__(self, src_vocab_size, tgt_vocab_size, tgt_tokenizer, embedding_dim=256, hidden_dim=512,
                 num_layers=2, dropout=0.5, learning_rate=1e-3):
        super().__init__()
        self.save_hyperparameters(ignore=['tgt_tokenizer'])  # the tokenizer is not a hyperparameter
        self.tgt_tokenizer = tgt_tokenizer

        # nn.LSTM only applies dropout between layers, so it must be 0 for a single layer.
        rnn_dropout = dropout if num_layers > 1 else 0

        # Encoder: source sentence -> final LSTM state
        self.src_embedding = nn.Embedding(src_vocab_size, embedding_dim, padding_idx=0)
        self.encoder = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers,
                               batch_first=True, dropout=rnn_dropout)
        # Decoder: generates the target sentence one token at a time
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, embedding_dim, padding_idx=0)
        self.decoder = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers,
                               batch_first=True, dropout=rnn_dropout)
        self.fc = nn.Linear(hidden_dim, tgt_vocab_size)  # hidden state -> vocabulary logits

        self.dropout = nn.Dropout(dropout)
        self.criterion = nn.CrossEntropyLoss(ignore_index=0)  # padding positions do not contribute

    def _encode(self, src_input_ids):
        """Encode a batch of source ids and return the final LSTM state ``(h_n, c_n)``.

        The sequences are packed by their true length so the returned state is the
        one after the last real token, not after the trailing padding positions.
        Assumes padding (id 0) is appended on the right.
        """
        src_embedded = self.dropout(self.src_embedding(src_input_ids))  # [batch, src_len, embedding_dim]
        # pack_padded_sequence needs CPU lengths >= 1; clamp guards an all-padding row.
        src_lengths = (src_input_ids != 0).sum(dim=1).clamp(min=1).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            src_embedded, src_lengths, batch_first=True, enforce_sorted=False)
        _, state = self.encoder(packed)  # state = (h_n, c_n), each [num_layers, batch, hidden_dim]
        return state

    def forward(self, src_input_ids, tgt_input_ids, teacher_forcing_ratio=0.5):
        """Encode the source batch, then decode step by step.

        Args:
            src_input_ids: Source token ids, ``[batch, src_len]``.
            tgt_input_ids: Target token ids, ``[batch, tgt_len]``; column 0 is the start token.
            teacher_forcing_ratio: Probability, drawn once per time step for the whole
                batch, of feeding the ground-truth token instead of the model's prediction.

        Returns:
            Logits of shape ``[batch, tgt_len, tgt_vocab_size]``; position 0 is left as zeros.
        """
        state = self._encode(src_input_ids)

        batch_size = src_input_ids.size(0)
        max_len = tgt_input_ids.size(1)
        # Allocate directly on the input's device; building this large buffer on
        # the CPU and copying it over every step is needlessly expensive.
        outputs = torch.zeros(batch_size, max_len, self.hparams.tgt_vocab_size,
                              device=src_input_ids.device)

        decoder_input = tgt_input_ids[:, 0].unsqueeze(1)  # start token
        for t in range(1, max_len):
            decoder_embedded = self.dropout(self.tgt_embedding(decoder_input))
            decoder_output, state = self.decoder(decoder_embedded, state)
            logits = self.fc(decoder_output.squeeze(1))
            outputs[:, t, :] = logits

            # Teacher forcing: next input is either the gold token or the argmax prediction.
            teacher_force = random.random() < teacher_forcing_ratio
            if teacher_force:
                decoder_input = tgt_input_ids[:, t].unsqueeze(1)
            else:
                decoder_input = logits.argmax(1).unsqueeze(1)

        return outputs

    def _compute_loss(self, batch, teacher_forcing_ratio):
        """Run the model on ``batch`` and return the cross-entropy loss over non-start positions."""
        src_input_ids = batch['src_input_ids']
        tgt_input_ids = batch['tgt_input_ids']

        outputs = self(src_input_ids, tgt_input_ids, teacher_forcing_ratio=teacher_forcing_ratio)
        # Position 0 is the start token, which is never predicted, so it is dropped.
        logits = outputs[:, 1:, :].reshape(-1, self.hparams.tgt_vocab_size)
        targets = tgt_input_ids[:, 1:].reshape(-1)
        return self.criterion(logits, targets)

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss (teacher forcing ratio 0.5)."""
        loss = self._compute_loss(batch, teacher_forcing_ratio=0.5)
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss without teacher forcing."""
        loss = self._compute_loss(batch, teacher_forcing_ratio=0.0)
        self.log('val_loss', loss, on_epoch=True, prog_bar=True)

    def test_step(self, batch, batch_idx):
        """Compute and log the test loss without teacher forcing."""
        loss = self._compute_loss(batch, teacher_forcing_ratio=0.0)
        self.log('test_loss', loss, on_epoch=True)

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters."""
        return torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate)

    def translate(self, src_input_ids, max_len=128):
        """Greedily translate a single source sentence.

        Args:
            src_input_ids: Source token ids of shape ``[1, src_len]`` (batch size 1,
                because the predicted token is read with ``.item()``).
            max_len: Maximum number of tokens to generate.

        Returns:
            The decoded target string; generation stops early at the tokenizer's
            ``sep_token_id``.
        """
        was_training = self.training
        self.eval()
        token_ids = []
        try:
            with torch.no_grad():
                state = self._encode(src_input_ids)
                device = src_input_ids.device

                # Start decoding from the tokenizer's [CLS] id, used as the start token.
                decoder_input = torch.tensor([[self.tgt_tokenizer.cls_token_id]], device=device)
                for _ in range(max_len):
                    decoder_embedded = self.tgt_embedding(decoder_input)
                    decoder_output, state = self.decoder(decoder_embedded, state)
                    logits = self.fc(decoder_output.squeeze(1))
                    pred_token = logits.argmax(1).item()
                    if pred_token == self.tgt_tokenizer.sep_token_id:  # end of sequence
                        break
                    token_ids.append(pred_token)
                    decoder_input = torch.tensor([[pred_token]], device=device)
        finally:
            # Put the module back in whatever mode the caller had it in.
            self.train(was_training)

        return self.tgt_tokenizer.decode(token_ids, skip_special_tokens=True)

In [22]:
# AI Hub: 한국어-영어 번역 말뭉치 다운로드
#
# 4. 학습 실행

def csv_reader(file_path, encoding='cp949'):
    """Read a CSV file into a pandas DataFrame.

    Args:
        file_path: Path or URL accepted by ``pandas.read_csv``.
        encoding: Text encoding of the file. Defaults to ``'cp949'``, the value
            this notebook has always used; pass e.g. ``'utf-8'`` for other files.

    Returns:
        The parsed DataFrame.
    """
    return pd.read_csv(file_path, encoding=encoding)

def main():
    """Train the Seq2Seq translator for one epoch, evaluate it, and print one sample translation."""
    batch_size = 32
    max_epochs = 1

    # Teacher forcing (random.random), weight initialisation and batch shuffling
    # are all stochastic; seed them so a rerun reproduces the same result.
    pl.seed_everything(42, workers=True)

    # Dataset location (CSV hosted on Google Drive)
    file_path = 'https://drive.google.com/uc?id=1X3OhxmD6huuChSjIovKlawXUItnXK-El'  # google drive
    data_module = KoEnTranslationDataModule(file_path=file_path, batch_size=batch_size)

    # Only the tokenizers are needed here (vocabulary sizes + decoding), so build
    # the dataset over an empty frame rather than downloading the whole CSV just
    # to keep its first row.
    tokenizer_source = KoEnTranslationDataset(pd.DataFrame(columns=['ko', 'en']))
    tgt_tokenizer = tokenizer_source.tgt_tokenizer
    model = Seq2SeqTranslator(
        src_vocab_size=tokenizer_source.src_tokenizer.vocab_size,
        tgt_vocab_size=tgt_tokenizer.vocab_size,
        tgt_tokenizer=tgt_tokenizer
    )

    # Trainer: single device, GPU when available
    trainer = pl.Trainer(
        max_epochs=max_epochs,
        accelerator='gpu' if torch.cuda.is_available() else 'cpu',
        devices=1,
        log_every_n_steps=10,
        enable_progress_bar=True
    )

    # Fit, then evaluate on the held-out test split
    trainer.fit(model, data_module)
    trainer.test(model, datamodule=data_module)

    # Example translation of the first test sentence; add a batch dimension and
    # move it to wherever the model currently lives.
    sample = data_module.test_dataset[0]['src_input_ids'].unsqueeze(0).to(model.device)
    translated = model.translate(sample)
    print(f"Translated: {translated}")

# Entry point: run the whole pipeline when this code is executed as the main
# module (this is also the case when the cell is run in a notebook kernel).
if __name__ == "__main__":
    main()

INFO:pytorch_lightning.utilities.rank_zero:You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name          | Type             | Params | Mode 
-----------------------------------------------------------
0 | src_embedding | Embedding        | 30.6 M | train
1 | encoder       | LSTM             | 3.7 M  | train
2 | tgt_embedding | Embedding        | 7.8 M  | train
3 | decoder       | LSTM             | 3.7 M  | train
4 | fc            | Linear           | 15.7 M | train
5 | dropout       | Dropout          | 0      | train
6 | criter

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=1` reached.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]

{'src_input_ids': tensor([  101,  9663, 11018,  9670, 89523,  9966, 70915, 12453,  9435, 33654,
         9056, 64722,  8870,  8855, 16985, 48549,   119,   102,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,    