In [1]:
!pip install pytorch-lightning torchmetrics  datasets transformers

Defaulting to user installation because normal site-packages is not writeable
Collecting pytorch-lightning
  Downloading pytorch_lightning-2.5.5-py3-none-any.whl (832 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m832.4/832.4 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting torchmetrics
  Downloading torchmetrics-1.8.2-py3-none-any.whl (983 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m983.2/983.2 kB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting datasets
  Downloading datasets-4.3.0-py3-none-any.whl (506 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m506.8/506.8 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.57.1-py3-none-any.whl (12.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting lightning-uti

In [3]:
!pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (9.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting scipy>=1.8.0
  Downloading scipy-1.15.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (37.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.7/37.7 MB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting threadpoolctl>=3.1.0
  Downloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Collecting joblib>=1.2.0
  Downloading joblib-1.5.2-py3-none-any.whl (308 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m308.4/308.4 kB[0m [31m63.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed jo

In [4]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer
import pytorch_lightning as pl
import random
import pandas as pd
from sklearn.model_selection import train_test_split

In [5]:
# 1. 데이터셋 정의
class KoEnTranslationDataset(Dataset):
    """AI Hub 한국어-영어 번역 데이터셋을 로드하고 토큰화하는 클래스"""
    def __init__(self, data, src_lang='ko', tgt_lang='en', max_length=128):
        self.src_lang = src_lang  # 소스 언어 (한국어)
        self.tgt_lang = tgt_lang  # 타겟 언어 (영어)
        self.max_length = max_length  # 최대 시퀀스 길이
        self.src_texts = data['ko'].tolist()  # 원본 한국어 텍스트 저장
        self.src_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.tgt_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.data = data  # pandas DataFrame 형식의 데이터 (ko, en 컬럼 포함)


    def __len__(self):
        """데이터셋 크기 반환"""
        return len(self.data)

    def __getitem__(self, idx):
        """특정 인덱스의 데이터 반환"""
        src_text = self.data.iloc[idx][self.src_lang]  # 한국어 문장
        tgt_text = self.data.iloc[idx][self.tgt_lang]  # 영어 문장
        src_encoding = self.src_tokenizer(src_text, max_length=self.max_length, padding='max_length',
                                          truncation=True, return_tensors='pt')
        tgt_encoding = self.tgt_tokenizer(tgt_text, max_length=self.max_length, padding='max_length',
                                          truncation=True, return_tensors='pt')
        return {
            'src_input_ids': src_encoding['input_ids'].squeeze(0),  # [max_length]
            'src_attention_mask': src_encoding['attention_mask'].squeeze(0),
            'tgt_input_ids': tgt_encoding['input_ids'].squeeze(0),
            'tgt_attention_mask': tgt_encoding['attention_mask'].squeeze(0),
            'src_text': self.src_texts[idx]  # 원본 텍스트 추가
        }

In [6]:
# 2. 데이터 모듈
class KoEnTranslationDataModule(pl.LightningDataModule):
    """PyTorch Lightning 데이터 모듈로 학습, 검증, 테스트 데이터 제공"""
    def __init__(self, file_path, batch_size=32, max_length=128):
        super().__init__()
        self.file_path = file_path  # 한영 데이터셋 CSV 파일 경로
        self.batch_size = batch_size
        self.max_length = max_length

    def setup(self, stage=None):
        """데이터 로드 및 train/val/test 분리"""
        data = csv_reader(self.file_path)
        train_data, temp_data = train_test_split(data, test_size=0.3, random_state=42)
        val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)
        self.train_dataset = KoEnTranslationDataset(train_data)
        self.val_dataset = KoEnTranslationDataset(val_data)
        self.test_dataset = KoEnTranslationDataset(test_data)

    def train_dataloader(self):
        """학습 데이터 로더"""
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=2)

    def val_dataloader(self):
        """검증 데이터 로더"""
        return DataLoader(self.val_dataset, batch_size=self.batch_size, shuffle=False, num_workers=2)

    def test_dataloader(self):
        """테스트 데이터 로더"""
        return DataLoader(self.test_dataset, batch_size=self.batch_size, shuffle=False, num_workers=2)

In [7]:
# 3. Transformer 모델 정의
class TransformerTranslator(pl.LightningModule):
    def __init__(self, src_vocab_size, tgt_vocab_size, tgt_tokenizer, d_model=512, nhead=8, num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=2048, dropout=0.1):
        super().__init__()
        self.tgt_tokenizer = tgt_tokenizer
        self.d_model = d_model

        # 변경: 임베딩 레이어 (Transformer에 맞게 d_model 크기로 설정)
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = self._generate_positional_encoding(max_len=128, d_model=d_model)

        # 변경: Transformer 모델 정의 (PyTorch의 nn.Transformer 사용)
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout
        )

        # 변경: 최종 출력 레이어 (d_model -> tgt_vocab_size)
        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.criterion = nn.CrossEntropyLoss(ignore_index=self.tgt_tokenizer.pad_token_id)

    # 추가: 위치 인코딩 생성 함수
    def _generate_positional_encoding(self, max_len, d_model):
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        return pe.unsqueeze(0)

    # 추가: 마스크 생성 함수 (패딩 및 미래 토큰 마스킹)
    def _generate_square_subsequent_mask(self, sz):
        mask = torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)
        return mask

    def forward(self, src_input_ids, tgt_input_ids, src_attention_mask, tgt_attention_mask):
        src_embedded = self.src_embedding(src_input_ids) * torch.sqrt(torch.tensor(self.d_model, dtype=torch.float))
        tgt_embedded = self.tgt_embedding(tgt_input_ids) * torch.sqrt(torch.tensor(self.d_model, dtype=torch.float))

        # 변경: 위치 인코딩 추가
        src_embedded = src_embedded + self.positional_encoding[:, :src_input_ids.size(1), :].to(self.device)
        tgt_embedded = tgt_embedded + self.positional_encoding[:, :tgt_input_ids.size(1), :].to(self.device)

        src_embedded = src_embedded.permute(1, 0, 2)  # (batch_size, seq_len, d_model) -> (seq_len, batch_size, d_model)
        tgt_embedded = tgt_embedded.permute(1, 0, 2)

        # 변경: Transformer 입력에 맞는 마스크 생성
        tgt_mask = self._generate_square_subsequent_mask(tgt_input_ids.size(1)).to(self.device)
        src_key_padding_mask = ~src_attention_mask.bool()
        tgt_key_padding_mask = ~tgt_attention_mask.bool()

        # 변경: Transformer 모델 호출
        output = self.transformer(
            src=src_embedded,
            tgt=tgt_embedded,
            tgt_mask=tgt_mask,
            src_key_padding_mask=src_key_padding_mask,
            tgt_key_padding_mask=tgt_key_padding_mask
        )

        output = output.permute(1, 0, 2)  # (seq_len, batch_size, d_model) -> (batch_size, seq_len, d_model)
        output = self.fc(output)
        return output

    def training_step(self, batch, batch_idx):
        src_input_ids = batch['src_input_ids']
        tgt_input_ids = batch['tgt_input_ids']
        src_attention_mask = batch['src_attention_mask']
        tgt_attention_mask = batch['tgt_attention_mask']

        # 변경: Teacher forcing을 위해 tgt_input_ids를 한 칸씩 이동
        tgt_input = tgt_input_ids[:, :-1]
        tgt_output = tgt_input_ids[:, 1:]
        tgt_attention_mask = tgt_attention_mask[:, :-1]

        output = self(src_input_ids, tgt_input, src_attention_mask, tgt_attention_mask)
        loss = self.criterion(output.reshape(-1, output.size(-1)), tgt_output.reshape(-1))
        self.log('train_loss', loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        src_input_ids = batch['src_input_ids']
        tgt_input_ids = batch['tgt_input_ids']
        src_attention_mask = batch['src_attention_mask']
        tgt_attention_mask = batch['tgt_attention_mask']

        tgt_input = tgt_input_ids[:, :-1]
        tgt_output = tgt_input_ids[:, 1:]
        tgt_attention_mask = tgt_attention_mask[:, :-1]

        output = self(src_input_ids, tgt_input, src_attention_mask, tgt_attention_mask)
        loss = self.criterion(output.reshape(-1, output.size(-1)), tgt_output.reshape(-1))
        self.log('val_loss', loss, prog_bar=True)
        return loss

    def test_step(self, batch, batch_idx):
        src_input_ids = batch['src_input_ids']
        tgt_input_ids = batch['tgt_input_ids']
        src_attention_mask = batch['src_attention_mask']
        tgt_attention_mask = batch['tgt_attention_mask']

        tgt_input = tgt_input_ids[:, :-1]
        tgt_output = tgt_input_ids[:, 1:]
        tgt_attention_mask = tgt_attention_mask[:, :-1]

        output = self(src_input_ids, tgt_input, src_attention_mask, tgt_attention_mask)
        loss = self.criterion(output.reshape(-1, output.size(-1)), tgt_output.reshape(-1))
        self.log('test_loss', loss, prog_bar=True)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=0.0001)

    # 변경: 번역 함수 (Greedy Decoding 방식으로 Transformer 디코딩)
    def translate(self, src_input_ids, max_len=128):
        self.eval()
        with torch.no_grad():
            src_input_ids = src_input_ids.to(self.device)
            src_embedded = self.src_embedding(src_input_ids) * torch.sqrt(torch.tensor(self.d_model, dtype=torch.float))
            src_embedded = src_embedded + self.positional_encoding[:, :src_input_ids.size(1), :].to(self.device)
            src_embedded = src_embedded.permute(1, 0, 2)

            # 변경: 디코딩 시작 토큰 설정
            decoder_input = torch.tensor([[self.tgt_tokenizer.cls_token_id]], dtype=torch.long).to(self.device)
            outputs = []

            for _ in range(max_len):
                decoder_embedded = self.tgt_embedding(decoder_input) * torch.sqrt(torch.tensor(self.d_model, dtype=torch.float))
                decoder_embedded = decoder_embedded + self.positional_encoding[:, :decoder_input.size(1), :].to(self.device)
                decoder_embedded = decoder_embedded.permute(1, 0, 2)

                tgt_mask = self._generate_square_subsequent_mask(decoder_input.size(1)).to(self.device)
                output = self.transformer(src=src_embedded, tgt=decoder_embedded, tgt_mask=tgt_mask)
                output = output.permute(1, 0, 2)
                output = self.fc(output[:, -1, :])
                pred_token = output.argmax(1).item()

                if pred_token == self.tgt_tokenizer.sep_token_id:
                    break
                outputs.append(pred_token)
                decoder_input = torch.cat((decoder_input, torch.tensor([[pred_token]], dtype=torch.long).to(self.device)), dim=1)

        return self.tgt_tokenizer.decode(outputs, skip_special_tokens=True)

In [None]:
# 4. 학습 실행 (기존과 동일)
def csv_reader(file_path):
    return pd.read_csv(file_path, encoding='cp949')

def main():
    batch_size = 32
    max_epochs = 10
    file_path = 'https://drive.google.com/uc?id=1X3OhxmD6huuChSjIovKlawXUItnXK-El'
    data_module = KoEnTranslationDataModule(file_path=file_path, batch_size=batch_size)

    sample_dataset = KoEnTranslationDataset(csv_reader(file_path).head(1))
    src_vocab_size = sample_dataset.src_tokenizer.vocab_size
    tgt_vocab_size = sample_dataset.tgt_tokenizer.vocab_size
    tgt_tokenizer = sample_dataset.tgt_tokenizer

    # 변경: Transformer 모델로 초기화
    model = TransformerTranslator(
        src_vocab_size=src_vocab_size,
        tgt_vocab_size=tgt_vocab_size,
        tgt_tokenizer=tgt_tokenizer
    )

    trainer = pl.Trainer(
        max_epochs=max_epochs,
        accelerator='gpu' if torch.cuda.is_available() else 'cpu',
        devices=1,
        log_every_n_steps=10,
        enable_progress_bar=True
    )

    trainer.fit(model, data_module)
    trainer.test(model, datamodule=data_module)

    # 테스트 데이터셋에서 첫 번째 샘플 가져오기
    sample = data_module.test_dataset[0]
    src_input_ids = sample['src_input_ids'].unsqueeze(0).to(model.device)
    src_text = sample['src_text']  # 원본 한국어 텍스트 직접 사용

    # 번역 수행
    translated = model.translate(src_input_ids)

    # 출력
    print(f"원본 한국어: {src_text}")
    print(f"번역된 영어: {translated}")

if __name__ == "__main__":
    main()

💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/elicer/.local/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
  return _C._get_float32_matmul_precision()
You are using a CUDA device ('NVIDIA A100 80GB PCIe MIG 1g.10gb') that has Tensor Cores. To properly u

Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:00<00:00,  1.49it/s]

/home/elicer/.local/lib/python3.10/site-packages/pytorch_lightning/utilities/data.py:79: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 32. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.


Epoch 0:  45%|████▌     | 99/219 [01:21<01:38,  1.22it/s, v_num=0, train_loss=5.990]