# 한-영 번역기 만들기 (v5.1: Transformer 최종 수정)
---

## Step 1. 라이브러리 설치 및 임포트

In [1]:
# !pip install spacy
# !python -m spacy download en_core_web_sm

# !apt-get install -y fonts-nanum

# !pip install Korpora
# !git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git
# !ls
# %cd Mecab-ko-for-Google-Colab/
# !bash install_mecab-ko_on_colab_light_220429.sh

# %cd ..

In [2]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import os
import re
import numpy as np
from konlpy.tag import Mecab
from collections import Counter
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torch.optim as optim
import random
import time
from tqdm import tqdm
import spacy
import math

import logging

# Only surface ERROR-level records from matplotlib's font-manager logger.
logging.getLogger("matplotlib.font_manager").setLevel(logging.ERROR)

# Korean-capable font used for plot labels.
fontpath = "/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf"
# Register the font file explicitly: setting rcParams["font.family"] selects a
# font by *name*, and a font that is not in matplotlib's font list would
# otherwise be replaced by a fallback (with the warning muted above).
fm.fontManager.addfont(fontpath)
fontprop = fm.FontProperties(fname=fontpath, size=12)
plt.rcParams["font.family"] = fontprop.get_name()

print(f"설정된 폰트: {fontprop.get_name()}")

설정된 폰트: NanumBarunGothic


## Step 2. 데이터 준비 및 전처리

In [3]:
# Locations of the parallel Korean/English corpus splits.
data_dir = 'data'
train_kor_path = os.path.join(data_dir, 'korean-english-park.train.ko')
train_eng_path = os.path.join(data_dir, 'korean-english-park.train.en')
dev_kor_path = os.path.join(data_dir, 'korean-english-park.dev.ko')
dev_eng_path = os.path.join(data_dir, 'korean-english-park.dev.en')
test_kor_path = os.path.join(data_dir, 'korean-english-park.test.ko')
test_eng_path = os.path.join(data_dir, 'korean-english-park.test.en')


def _read_lines(path):
    """Return the lines of the text file at *path*, without line endings.

    The encoding is pinned to UTF-8 so that Korean text is decoded the same
    way regardless of the platform's default locale encoding.
    """
    with open(path, "r", encoding="utf-8") as f:
        return f.read().splitlines()


train_kor_raw = _read_lines(train_kor_path)
train_eng_raw = _read_lines(train_eng_path)
dev_kor_raw = _read_lines(dev_kor_path)
dev_eng_raw = _read_lines(dev_eng_path)
test_kor_raw = _read_lines(test_kor_path)
test_eng_raw = _read_lines(test_eng_path)

In [4]:
# Tokenizers shared by the preprocessing cells below:
# spaCy's English pipeline (only its tokenizer is used in this notebook)
# and the MeCab morphological analyser for Korean.
spacy_eng = spacy.load('en_core_web_sm')
mecab = Mecab()

def preprocess_corpus(kor_sentence, eng_sentence):
    """Normalise one Korean/English sentence pair before tokenisation.

    Both sentences are lower-cased and stripped first. In the Korean
    sentence, every run of characters other than Hangul syllables and the
    punctuation ``? . ! ,`` is collapsed into a single space. In the English
    sentence, ``? . ! ,`` are padded with spaces, every run of other
    non-letter characters becomes a space, whitespace is squeezed, and the
    result is stripped.

    Returns:
        A ``(korean, english)`` tuple of cleaned strings.
    """
    korean = re.sub(r"[^가-힣?.!,]+", " ", kor_sentence.lower().strip())

    english = eng_sentence.lower().strip()
    # Order matters: isolate punctuation, drop everything else, then squeeze.
    for pattern, replacement in (
        (r"([?.!,])", r" \1 "),
        (r'[^a-zA-Z?.!,]+', ' '),
        (r'\s+', ' '),
    ):
        english = re.sub(pattern, replacement, english)

    return korean, english.strip()

def tokenize_corpus(kor_raw, eng_raw, max_len=40):
    """Clean, de-duplicate and tokenise aligned Korean/English sentences.

    Args:
        kor_raw: Iterable of raw Korean sentences.
        eng_raw: Iterable of raw English sentences, aligned with ``kor_raw``.
        max_len: A pair is kept only if both token lists (the English one
            including ``<start>``/``<end>``) have at most this many tokens.

    Returns:
        ``(kor_corpus, eng_corpus)``: two parallel lists of token lists.
        English token lists are wrapped in ``<start>`` ... ``<end>``.
    """
    kor_corpus, eng_corpus = [], []
    # dict.fromkeys drops duplicate pairs while keeping first-seen order.
    # A set would iterate in a run-dependent order (string hashing is
    # randomised), which makes the corpus order -- and therefore frequency
    # tie-breaking in the vocabulary -- irreproducible between runs.
    unique_pairs = dict.fromkeys(zip(kor_raw, eng_raw))
    for kor, eng in unique_pairs:
        kor_prep, eng_prep = preprocess_corpus(kor, eng)
        kor_tokens = mecab.morphs(kor_prep)
        eng_tokens_raw = [token.text for token in spacy_eng.tokenizer(eng_prep)]
        eng_tokens = ['<start>'] + eng_tokens_raw + ['<end>']
        if len(kor_tokens) <= max_len and len(eng_tokens) <= max_len:
            kor_corpus.append(kor_tokens)
            eng_corpus.append(eng_tokens)
    return kor_corpus, eng_corpus

# Tokenise the training split (de-duplicated and length-filtered with the
# function's default max_len).
train_kor_corpus, train_eng_corpus = tokenize_corpus(
    kor_raw=train_kor_raw,
    eng_raw=train_eng_raw,
)

In [5]:
def build_vocab(corpus, max_vocab_size=10000):
    """Build word<->index lookup tables from a tokenised corpus.

    Ids 0-3 are reserved for ``<pad>``, ``<unk>``, ``<start>`` and ``<end>``;
    the most frequent remaining words receive consecutive ids starting at 4.

    Args:
        corpus: Iterable of token lists.
        max_vocab_size: Upper bound on the vocabulary size, including the
            four reserved tokens.

    Returns:
        ``(word_to_idx, idx_to_word)`` dictionaries that are exact inverses
        of each other.
    """
    special_tokens = ('<pad>', '<unk>', '<start>', '<end>')

    counter = Counter()
    for sentence in corpus:
        counter.update(sentence)

    # Reserved tokens occur in the corpus itself (every target sentence is
    # wrapped in <start>/<end>). If they were counted as ordinary words they
    # would occupy regular slots and then be overwritten with their reserved
    # ids, wasting vocabulary entries and leaving gaps in idx_to_word.
    for token in special_tokens:
        counter.pop(token, None)

    n_regular = max(max_vocab_size - len(special_tokens), 0)
    word_to_idx = {token: idx for idx, token in enumerate(special_tokens)}
    for offset, (word, _) in enumerate(counter.most_common(n_regular)):
        word_to_idx[word] = offset + len(special_tokens)

    idx_to_word = {idx: word for word, idx in word_to_idx.items()}
    return word_to_idx, idx_to_word

# The max_vocab_size passed to build_vocab must match the model's VOCAB_SIZE.
MAX_VOCAB_SIZE = 10000
kor_word_to_idx, kor_idx_to_word = build_vocab(
    train_kor_corpus, max_vocab_size=MAX_VOCAB_SIZE
)
eng_word_to_idx, eng_idx_to_word = build_vocab(
    train_eng_corpus, max_vocab_size=MAX_VOCAB_SIZE
)

In [6]:
def text_to_sequence(corpus, word_to_idx):
    """Map every token of every sentence to its vocabulary id.

    Tokens missing from ``word_to_idx`` are replaced by the id stored under
    ``'<unk>'``.

    Returns:
        A list with one list of integer ids per sentence in ``corpus``.
    """
    return [
        [word_to_idx.get(token, word_to_idx['<unk>']) for token in tokens]
        for tokens in corpus
    ]

def _tokenize_eval_pairs(kor_raw, eng_raw):
    """Tokenise aligned raw sentences without de-duplication or length filtering.

    Every input pair is kept, in order. English token lists are wrapped in
    ``<start>`` ... ``<end>``.
    """
    kor_corpus, eng_corpus = [], []
    for kor, eng in zip(kor_raw, eng_raw):
        kor_prep, eng_prep = preprocess_corpus(kor, eng)
        kor_corpus.append(mecab.morphs(kor_prep))
        eng_words = [token.text for token in spacy_eng.tokenizer(eng_prep)]
        eng_corpus.append(['<start>'] + eng_words + ['<end>'])
    return kor_corpus, eng_corpus


# Dev and test splits go through exactly the same steps, so share one helper
# instead of two copy-pasted loops.
dev_kor_corpus, dev_eng_corpus = _tokenize_eval_pairs(dev_kor_raw, dev_eng_raw)
test_kor_corpus, test_eng_corpus = _tokenize_eval_pairs(test_kor_raw, test_eng_raw)

# Convert every split to integer id sequences, using the Korean vocabulary
# for the source side and the English vocabulary for the target side.
train_kor_sequences, dev_kor_sequences, test_kor_sequences = (
    text_to_sequence(split, kor_word_to_idx)
    for split in (train_kor_corpus, dev_kor_corpus, test_kor_corpus)
)
train_eng_sequences, dev_eng_sequences, test_eng_sequences = (
    text_to_sequence(split, eng_word_to_idx)
    for split in (train_eng_corpus, dev_eng_corpus, test_eng_corpus)
)

In [7]:
class TranslationDataset(Dataset):
    """Parallel dataset of (source ids, target ids) sequence pairs."""

    def __init__(self, src_sequences, trg_sequences):
        """Store two aligned lists of integer id sequences."""
        self.src_sequences = src_sequences
        self.trg_sequences = trg_sequences

    def __len__(self):
        """Return the number of source sequences."""
        return len(self.src_sequences)

    def __getitem__(self, idx):
        """Return the ``idx``-th pair as two 1-D ``torch.long`` tensors."""
        # dtype is explicit because torch.tensor([]) would otherwise be
        # float32, e.g. for a sentence that is empty after preprocessing.
        return (
            torch.tensor(self.src_sequences[idx], dtype=torch.long),
            torch.tensor(self.trg_sequences[idx], dtype=torch.long),
        )

def collate_fn(batch):
    """Pad a list of (source, target) tensor pairs into two batch-first tensors.

    Source sequences are padded with the Korean ``<pad>`` id and target
    sequences with the English ``<pad>`` id, each up to the longest sequence
    in the batch.

    Returns:
        ``(src_padded, trg_padded)``.
    """
    sources = [src_sample for src_sample, _ in batch]
    targets = [trg_sample for _, trg_sample in batch]
    return (
        pad_sequence(sources, batch_first=True, padding_value=kor_word_to_idx['<pad>']),
        pad_sequence(targets, batch_first=True, padding_value=eng_word_to_idx['<pad>']),
    )

BATCH_SIZE = 64

# One dataset per split, pairing Korean source ids with English target ids.
train_dataset = TranslationDataset(
    src_sequences=train_kor_sequences, trg_sequences=train_eng_sequences
)
valid_dataset = TranslationDataset(
    src_sequences=dev_kor_sequences, trg_sequences=dev_eng_sequences
)
test_dataset = TranslationDataset(
    src_sequences=test_kor_sequences, trg_sequences=test_eng_sequences
)

# Only the training loader shuffles; all loaders pad per batch via collate_fn.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
)
valid_loader = DataLoader(
    valid_dataset,
    batch_size=BATCH_SIZE,
    collate_fn=collate_fn,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    collate_fn=collate_fn,
)

## Step 3. 트랜스포머 모델 설계 (수정된 최종 버전)

In [8]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to embeddings, then apply dropout."""

    def __init__(self, emb_size: int, dropout: float, maxlen: int = 5000):
        """Precompute the encoding table.

        Args:
            emb_size: Size of the last (feature) dimension of the embeddings.
                Both even and odd sizes are supported.
            dropout: Dropout probability applied after adding the encoding.
            maxlen: Number of positions to precompute.
        """
        super().__init__()
        div_term = torch.exp(torch.arange(0, emb_size, 2) * (-math.log(10000.0) / emb_size))
        position = torch.arange(maxlen).unsqueeze(1)
        angles = position * div_term

        pos_embedding = torch.zeros(maxlen, emb_size)
        pos_embedding[:, 0::2] = torch.sin(angles)
        # For odd emb_size there is one fewer odd-indexed column than
        # even-indexed ones, so trim the cosine terms to fit.
        pos_embedding[:, 1::2] = torch.cos(angles[:, :emb_size // 2])
        # Leading axis of size 1 so the table broadcasts over the batch.
        pos_embedding = pos_embedding.unsqueeze(0)

        self.dropout = nn.Dropout(dropout)
        # A buffer moves with .to(device) and is saved in the state dict,
        # but is not a trainable parameter.
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding):
        """Return ``dropout(token_embedding + encoding)``.

        The encoding is sliced to ``token_embedding.size(1)`` positions, i.e.
        the input is indexed as (batch, sequence, feature).
        """
        return self.dropout(token_embedding + self.pos_embedding[:, :token_embedding.size(1), :])

class TokenEmbedding(nn.Module):
    """Embedding lookup whose output is scaled by ``sqrt(emb_size)``."""

    def __init__(self, vocab_size: int, emb_size):
        """Create an embedding table with ``vocab_size`` rows of width ``emb_size``."""
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens):
        """Return the scaled embeddings for ``tokens`` (cast to long first)."""
        scale = math.sqrt(self.emb_size)
        vectors = self.embedding(tokens.long())
        return vectors * scale

class Seq2SeqTransformer(nn.Module):
    """Encoder-decoder translation model built on a batch-first ``nn.Transformer``.

    Source and target ids are embedded by separate ``TokenEmbedding`` tables,
    share one ``PositionalEncoding``, and decoder outputs are projected to
    target-vocabulary logits by a linear ``generator`` layer.
    """

    def __init__(self, num_encoder_layers, num_decoder_layers, emb_size, nhead, src_vocab_size, tgt_vocab_size, dim_feedforward, dropout):
        super().__init__()
        # Submodules are created in a fixed order so that parameter
        # initialisation under a given random seed does not change.
        self.transformer = nn.Transformer(
            d_model=emb_size,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True,
        )
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)

    def forward(self, src, trg, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, memory_key_padding_mask):
        """Return target-vocabulary logits for every position of ``trg``.

        No memory mask is used; the remaining masks are forwarded to
        ``nn.Transformer`` unchanged.
        """
        encoder_in = self.positional_encoding(self.src_tok_emb(src))
        decoder_in = self.positional_encoding(self.tgt_tok_emb(trg))
        hidden = self.transformer(
            src=encoder_in,
            tgt=decoder_in,
            src_mask=src_mask,
            tgt_mask=tgt_mask,
            memory_mask=None,
            src_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=tgt_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask,
        )
        return self.generator(hidden)

    def encode(self, src, src_mask):
        """Run only the encoder stack on ``src`` and return its output (the memory)."""
        embedded = self.positional_encoding(self.src_tok_emb(src))
        return self.transformer.encoder(embedded, mask=src_mask)

    def decode(self, tgt, memory, tgt_mask):
        """Run only the decoder stack on ``tgt`` against ``memory``.

        The result is the decoder's hidden states; apply ``generator`` to
        obtain logits.
        """
        embedded = self.positional_encoding(self.tgt_tok_emb(tgt))
        return self.transformer.decoder(embedded, memory, tgt_mask=tgt_mask)

## Step 4. 모델 학습 및 검증

In [9]:
# Hyperparameters (v5.1 revision)
# The embedding tables and the output layer must have exactly as many rows as
# the id space produced by build_vocab, so derive the size from the constant
# that was passed to build_vocab instead of repeating the literal.
VOCAB_SIZE = MAX_VOCAB_SIZE
SRC_VOCAB_SIZE = VOCAB_SIZE
TGT_VOCAB_SIZE = VOCAB_SIZE

EMB_SIZE = 256
NHEAD = 8
FFN_HID_DIM = 512
# NOTE: the DataLoaders were already created with the earlier BATCH_SIZE;
# changing this value here does not affect them.
BATCH_SIZE = 64
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialise the model with the fixed vocabulary sizes.
transformer = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE, NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM, 0.1)
model = transformer.to(device)

# Padding positions in the target do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=eng_word_to_idx['<pad>'])
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

### 학습 및 평가 함수 정의 (트랜스포머용, v5.1 수정)

In [10]:
def generate_square_subsequent_mask(sz):
    """Return an ``(sz, sz)`` float causal mask on the global ``device``.

    Entries strictly above the main diagonal are ``-inf`` (position i may not
    attend to a later position j > i); all other entries are ``0.0``.
    """
    blocked = torch.full((sz, sz), float('-inf'), device=device)
    # Keep only the strictly-upper triangle; everything else becomes 0.
    return torch.triu(blocked, diagonal=1)

def create_mask(src, tgt):
    """Build the attention and padding masks for one batch.

    Args:
        src: Batch of source ids, compared element-wise against the Korean
            ``<pad>`` id.
        tgt: Batch of decoder-input ids; its second dimension is taken as the
            target sequence length.

    Returns:
        ``(src_mask, tgt_mask, src_padding_mask, tgt_padding_mask)`` where
        ``src_mask`` is always ``None`` (the encoder attends everywhere),
        ``tgt_mask`` is the causal mask from
        ``generate_square_subsequent_mask``, and the padding masks are boolean
        tensors that are ``True`` at ``<pad>`` positions.
    """
    tgt_mask = generate_square_subsequent_mask(tgt.shape[1])
    src_mask = None  # the encoder needs no attention mask

    src_padding_mask = (src == kor_word_to_idx['<pad>'])
    tgt_padding_mask = (tgt == eng_word_to_idx['<pad>'])
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

def train_epoch(model, iterator, optimizer):
    """Train ``model`` for one pass over ``iterator``.

    Uses the module-level ``criterion`` and ``device``. Returns the sum of
    the per-batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0
    batches = tqdm(iterator, desc="Training...")
    for src, tgt in batches:
        src, tgt = src.to(device), tgt.to(device)

        # Teacher forcing: the decoder reads all tokens but the last and is
        # scored against all tokens but the first.
        decoder_in, decoder_gold = tgt[:, :-1], tgt[:, 1:]
        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, decoder_in)

        optimizer.zero_grad()
        # The source padding mask is also used as the memory padding mask.
        logits = model(src, decoder_in, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask)
        loss = criterion(logits.reshape(-1, logits.shape[-1]), decoder_gold.reshape(-1))
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        running_loss += batch_loss
        batches.set_postfix(loss=batch_loss)

    return running_loss / len(iterator)

def evaluate(model, iterator):
    """Compute the average per-batch loss of ``model`` over ``iterator``.

    Runs in eval mode with gradients disabled, using the module-level
    ``criterion`` and ``device``. Returns the sum of the per-batch losses
    divided by the number of batches.
    """
    model.eval()
    running_loss = 0
    batches = tqdm(iterator, desc="Evaluating...")
    with torch.no_grad():
        for src, tgt in batches:
            src, tgt = src.to(device), tgt.to(device)

            # Same input/target shift as in training.
            decoder_in, decoder_gold = tgt[:, :-1], tgt[:, 1:]
            src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, decoder_in)

            logits = model(src, decoder_in, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, src_padding_mask)
            batch_loss = criterion(logits.reshape(-1, logits.shape[-1]), decoder_gold.reshape(-1))
            running_loss += batch_loss.item()

    return running_loss / len(iterator)

### 학습 루프 실행

In [11]:
NUM_EPOCHS = 20
best_valid_loss = float('inf')

for epoch in range(1, NUM_EPOCHS+1):
    # Only the training pass is timed; validation is excluded.
    start_time = time.time()
    train_loss = train_epoch(model, train_loader, optimizer)
    end_time = time.time()

    valid_loss = evaluate(model, valid_loader)

    # Keep the checkpoint with the lowest validation loss seen so far.
    improved = valid_loss < best_valid_loss
    if improved:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'translator-ko-en-v5.1-transformer.pt')

    epoch_seconds = end_time - start_time
    print(f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {valid_loss:.3f}, Epoch time = {epoch_seconds:.3f}s")

Training...: 100%|██████████| 987/987 [00:52<00:00, 18.65it/s, loss=5.78]
  output = torch._nested_tensor_from_mask(
Evaluating...: 100%|██████████| 16/16 [00:00<00:00, 44.79it/s]


Epoch: 1, Train loss: 6.135, Val loss: 5.610, Epoch time = 52.935s


Training...: 100%|██████████| 987/987 [00:52<00:00, 18.90it/s, loss=5.28]
Evaluating...: 100%|██████████| 16/16 [00:00<00:00, 47.59it/s]


Epoch: 2, Train loss: 5.554, Val loss: 5.288, Epoch time = 52.222s


Training...: 100%|██████████| 987/987 [00:52<00:00, 18.80it/s, loss=5.3] 
Evaluating...: 100%|██████████| 16/16 [00:00<00:00, 48.47it/s]


Epoch: 3, Train loss: 5.304, Val loss: 5.099, Epoch time = 52.509s


Training...: 100%|██████████| 987/987 [00:52<00:00, 18.85it/s, loss=5.27]
Evaluating...: 100%|██████████| 16/16 [00:00<00:00, 48.48it/s]


Epoch: 4, Train loss: 5.129, Val loss: 4.955, Epoch time = 52.372s


Training...: 100%|██████████| 987/987 [00:52<00:00, 18.85it/s, loss=4.94]
Evaluating...: 100%|██████████| 16/16 [00:00<00:00, 48.91it/s]


Epoch: 5, Train loss: 4.992, Val loss: 4.852, Epoch time = 52.352s


Training...: 100%|██████████| 987/987 [00:52<00:00, 18.84it/s, loss=4.56]
Evaluating...: 100%|██████████| 16/16 [00:00<00:00, 48.24it/s]


Epoch: 6, Train loss: 4.877, Val loss: 4.764, Epoch time = 52.391s


Training...: 100%|██████████| 987/987 [00:52<00:00, 18.83it/s, loss=4.6] 
Evaluating...: 100%|██████████| 16/16 [00:00<00:00, 48.57it/s]


Epoch: 7, Train loss: 4.777, Val loss: 4.684, Epoch time = 52.406s


Training...: 100%|██████████| 987/987 [00:52<00:00, 18.82it/s, loss=4.65]
Evaluating...: 100%|██████████| 16/16 [00:00<00:00, 48.55it/s]


Epoch: 8, Train loss: 4.689, Val loss: 4.618, Epoch time = 52.457s


Training...: 100%|██████████| 987/987 [00:52<00:00, 18.84it/s, loss=4.47]
Evaluating...: 100%|██████████| 16/16 [00:00<00:00, 49.34it/s]


Epoch: 9, Train loss: 4.609, Val loss: 4.558, Epoch time = 52.379s


Training...: 100%|██████████| 987/987 [00:52<00:00, 18.84it/s, loss=4.45]
Evaluating...: 100%|██████████| 16/16 [00:00<00:00, 48.03it/s]


Epoch: 10, Train loss: 4.536, Val loss: 4.509, Epoch time = 52.381s


Training...: 100%|██████████| 987/987 [00:52<00:00, 18.85it/s, loss=4.29]
Evaluating...: 100%|██████████| 16/16 [00:00<00:00, 48.41it/s]


Epoch: 11, Train loss: 4.468, Val loss: 4.462, Epoch time = 52.378s


Training...: 100%|██████████| 987/987 [00:52<00:00, 18.86it/s, loss=4.65]
Evaluating...: 100%|██████████| 16/16 [00:00<00:00, 48.38it/s]


Epoch: 12, Train loss: 4.407, Val loss: 4.426, Epoch time = 52.344s


Training...: 100%|██████████| 987/987 [00:52<00:00, 18.82it/s, loss=4.51]
Evaluating...: 100%|██████████| 16/16 [00:00<00:00, 49.29it/s]


Epoch: 13, Train loss: 4.348, Val loss: 4.388, Epoch time = 52.437s


Training...: 100%|██████████| 987/987 [00:52<00:00, 18.80it/s, loss=4.49]
Evaluating...: 100%|██████████| 16/16 [00:00<00:00, 48.42it/s]


Epoch: 14, Train loss: 4.294, Val loss: 4.355, Epoch time = 52.501s


Training...: 100%|██████████| 987/987 [00:52<00:00, 18.84it/s, loss=4.15]
Evaluating...: 100%|██████████| 16/16 [00:00<00:00, 47.82it/s]


Epoch: 15, Train loss: 4.241, Val loss: 4.318, Epoch time = 52.394s


Training...: 100%|██████████| 987/987 [00:52<00:00, 18.86it/s, loss=4.13]
Evaluating...: 100%|██████████| 16/16 [00:00<00:00, 49.37it/s]


Epoch: 16, Train loss: 4.192, Val loss: 4.297, Epoch time = 52.326s


Training...: 100%|██████████| 987/987 [00:52<00:00, 18.87it/s, loss=4.14]
Evaluating...: 100%|██████████| 16/16 [00:00<00:00, 48.10it/s]


Epoch: 17, Train loss: 4.145, Val loss: 4.272, Epoch time = 52.302s


Training...: 100%|██████████| 987/987 [00:52<00:00, 18.86it/s, loss=4.4] 
Evaluating...: 100%|██████████| 16/16 [00:00<00:00, 49.29it/s]


Epoch: 18, Train loss: 4.100, Val loss: 4.244, Epoch time = 52.345s


Training...: 100%|██████████| 987/987 [00:52<00:00, 18.87it/s, loss=4.11]
Evaluating...: 100%|██████████| 16/16 [00:00<00:00, 47.21it/s]


Epoch: 19, Train loss: 4.057, Val loss: 4.230, Epoch time = 52.300s


Training...: 100%|██████████| 987/987 [00:52<00:00, 18.85it/s, loss=3.68]
Evaluating...: 100%|██████████| 16/16 [00:00<00:00, 49.50it/s]


Epoch: 20, Train loss: 4.017, Val loss: 4.198, Epoch time = 52.364s


## Step 5. 최종 테스트 및 번역

In [12]:
# Reload the best checkpoint. map_location remaps the saved tensors onto the
# current device, so a checkpoint written on a GPU also loads on a CPU-only
# machine.
model.load_state_dict(torch.load('translator-ko-en-v5.1-transformer.pt', map_location=device))
test_loss = evaluate(model, test_loader)
print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

Evaluating...: 100%|██████████| 32/32 [00:00<00:00, 47.30it/s]

| Test Loss: 4.244 | Test PPL:  69.703 |





In [14]:
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    """Greedily decode a single source sentence.

    Args:
        model: Model exposing ``encode``, ``decode`` and ``generator``.
        src: Source id tensor for one sentence (batch size 1, batch first).
        src_mask: Encoder attention mask, or ``None`` for no mask.
        max_len: Maximum length of the returned sequence, including the
            start symbol.
        start_symbol: Id of the token that starts every output.

    Returns:
        A ``(1, length)`` long tensor that begins with ``start_symbol`` and
        ends with the English ``<end>`` id if that was produced before
        ``max_len`` was reached.
    """
    src = src.to(device)
    if src_mask is not None:
        src_mask = src_mask.to(device)
    end_symbol = eng_word_to_idx['<end>']

    # Inference only: skip autograd bookkeeping for every decoding step.
    with torch.no_grad():
        # The encoder output does not change between steps; compute it once.
        memory = model.encode(src, src_mask)
        ys = torch.full((1, 1), start_symbol, dtype=torch.long, device=device)
        for _ in range(max_len - 1):
            # Boolean causal mask: True marks positions that may not be attended.
            tgt_mask = generate_square_subsequent_mask(ys.size(1)).type(torch.bool)
            out = model.decode(ys, memory, tgt_mask)
            # Only the last position predicts the next token.
            prob = model.generator(out[:, -1])
            _, next_word = torch.max(prob, dim=1)
            next_word = next_word.item()

            ys = torch.cat([ys, torch.full((1, 1), next_word, dtype=ys.dtype, device=device)], dim=1)
            if next_word == end_symbol:
                break
    return ys

def translate(model, src_sentence):
    """Translate one Korean sentence to English with greedy decoding.

    Args:
        model: Trained translation model (switched to eval mode here).
        src_sentence: Raw Korean sentence.

    Returns:
        The decoded English tokens joined by spaces, with the ``<start>`` and
        ``<end>`` markers removed; ``""`` if the input has no tokens after
        preprocessing.
    """
    model.eval()
    # Apply the same normalisation the training data went through, so the
    # model sees input from the distribution it was trained on.
    kor_prep, _ = preprocess_corpus(src_sentence, "")
    src_tokens = mecab.morphs(kor_prep)
    if not src_tokens:
        # Nothing to encode; avoid feeding a zero-length sequence to the model.
        return ""

    unk_idx = kor_word_to_idx['<unk>']
    src_tensor = torch.LongTensor([kor_word_to_idx.get(t, unk_idx) for t in src_tokens]).unsqueeze(0).to(device)
    num_tokens = src_tensor.shape[1]
    # All-False boolean mask: no source position is blocked.
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    # Allow the output to be a few tokens longer than the input.
    tgt_tokens = greedy_decode(model, src_tensor, src_mask, max_len=num_tokens + 5, start_symbol=eng_word_to_idx['<start>']).flatten()
    return " ".join([eng_idx_to_word.get(tok.item(), '<unk>') for tok in tgt_tokens]).replace("<start>", "").replace("<end>", "")

# A few Korean sentences to sanity-check the trained model.
example_sentences = [
    "오바마는 대통령이다.",
    "시민들은 도시 속에 산다.",
    "커피는 필요 없다.",
    "일곱 명의 사망자가 발생했다.",
]

for sentence in example_sentences:
    translated = translate(model, sentence)
    # One source/translation pair per block, followed by a blank line.
    print(f'Original: {sentence}\nTranslated: {translated}\n')

Original: 오바마는 대통령이다.
Translated:  obama is the president . 

Original: 시민들은 도시 속에 산다.
Translated:  they are <unk> in the city of <unk> . 

Original: 커피는 필요 없다.
Translated:  it s not clear how many people do . 

Original: 일곱 명의 사망자가 발생했다.
Translated:  the death toll from the death toll in the country s death toll

