In [1]:
# =========================================
# 1. 라이브러리 설치 및 임포트
# =========================================
!pip install -q sentencepiece
!pip install -q nltk

# mecab 추가 설치 (필요 시 주석 해제해서 실행)
'''
!pip install konlpy
!git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git
%cd Mecab-ko-for-Google-Colab/
!bash install_mecab-ko_on_colab_light_220429.sh
%cd -
'''

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import sentencepiece as spm
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import re
import os
from tqdm.notebook import tqdm
import math
import random
import time
from collections import defaultdict

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# === [변경 지점] 추가: 데이터 증강용 WordNet 및 NLTK 리소스 ===
import nltk
from nltk.corpus import wordnet as wn
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

True

In [2]:
# =========================================
# 2. 하이퍼파라미터 및 설정
# =========================================

# Model Hyperparameters
SRC_VOCAB_SIZE = 20000
TGT_VOCAB_SIZE = 20000
D_MODEL = 512
N_LAYERS = 6
N_HEADS = 8
D_FF = 2048
DROPOUT = 0.1
MAX_LEN = 50

# Training Hyperparameters
BATCH_SIZE = 64
EPOCHS = 10

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# === [변경 지점] 실험 고도화 옵션 ===
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
if torch.cuda.is_available(): torch.cuda.manual_seed_all(SEED)

USE_AMP = True                 # 혼합정밀
GRAD_ACCUM_STEPS = 2           # 그래디언트 누적
MAX_GRAD_NORM = 1.0            # 그라디언트 클립
LABEL_SMOOTHING = 0.1          # 라벨 스무딩
WEIGHT_DECAY = 1e-4            # 가중치 감쇠
WARMUP_STEPS = 4000            # Noam warmup steps
PATIENCE = 3                   # 얼리스탑 인내심
CHECKPOINT_DIR = "./checkpoints"
BEST_MODEL_PATH = "transformer-best.pt"
LAST_CKPT_PATH = os.path.join(CHECKPOINT_DIR, "last.pt")
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# === [핵심] 재개 여부 플래그: 기본 False → 항상 1에포크부터 시작 ===
RESUME = False


Using device: cuda


In [3]:
# =========================================
# 3. 데이터 준비 및 전처리
# =========================================

# 1) 데이터 경로
data_dir = 'data'
train_kor_path = os.path.join(data_dir, 'korean-english-park.train.ko')
train_eng_path = os.path.join(data_dir, 'korean-english-park.train.en')
dev_kor_path   = os.path.join(data_dir, 'korean-english-park.dev.ko')
dev_eng_path   = os.path.join(data_dir, 'korean-english-park.dev.en')
test_kor_path  = os.path.join(data_dir, 'korean-english-park.test.ko')
test_eng_path  = os.path.join(data_dir, 'korean-english-park.test.en')

# 2) 원본 데이터 로딩
with open(train_kor_path, "r", encoding='utf-8') as f: train_kor_raw = f.read().splitlines()
with open(train_eng_path, "r", encoding='utf-8') as f: train_eng_raw = f.read().splitlines()
with open(dev_kor_path,   "r", encoding='utf-8') as f: dev_kor_raw   = f.read().splitlines()
with open(dev_eng_path,   "r", encoding='utf-8') as f: dev_eng_raw   = f.read().splitlines()
with open(test_kor_path,  "r", encoding='utf-8') as f: test_kor_raw  = f.read().splitlines()
with open(test_eng_path,  "r", encoding='utf-8') as f: test_eng_raw  = f.read().splitlines()

print(f"Train: {len(train_kor_raw)}, Dev: {len(dev_kor_raw)}, Test: {len(test_kor_raw)}")

# 3) 전처리
def preprocess_sentence(sentence):
    """구두점, 특수문자 등 불필요한 부분을 제거하고 소문자로 변환합니다."""
    sentence = sentence.lower().strip()
    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)
    sentence = re.sub(r"[^a-zA-Z가-힣?.!,]+", " ", sentence)
    sentence = sentence.strip()
    return sentence

def clean_and_preprocess_corpus(kor_raw, eng_raw):
    """문장 쌍 중복 제거 + 전처리"""
    cleaned_pairs = list(set(zip(kor_raw, eng_raw)))
    kor_corpus, eng_corpus = [], []
    for kor, eng in cleaned_pairs:
        kor_corpus.append(preprocess_sentence(kor))
        eng_corpus.append(preprocess_sentence(eng))
    return kor_corpus, eng_corpus

# === [변경 지점] 추가: 길이 필터(과도한 메모리/노이즈 방지) ===
def filter_by_length(srcs, tgts, max_len=MAX_LEN):
    f_src, f_tgt = [], []
    for s, t in zip(srcs, tgts):
        if len(s.split()) <= max_len and len(t.split()) <= max_len:
            f_src.append(s); f_tgt.append(t)
    return f_src, f_tgt

# 각 데이터셋에 대해 전처리
train_kor_corpus, train_eng_corpus = clean_and_preprocess_corpus(train_kor_raw, train_eng_raw)
dev_kor_corpus,   dev_eng_corpus   = clean_and_preprocess_corpus(dev_kor_raw,   dev_eng_raw)
test_kor_corpus,  test_eng_corpus  = clean_and_preprocess_corpus(test_kor_raw,  test_eng_raw)

# === [변경 지점] 길이 필터 적용 ===
train_kor_corpus, train_eng_corpus = filter_by_length(train_kor_corpus, train_eng_corpus, MAX_LEN)
dev_kor_corpus,   dev_eng_corpus   = filter_by_length(dev_kor_corpus,   dev_eng_corpus,   MAX_LEN)
test_kor_corpus,  test_eng_corpus  = filter_by_length(test_kor_corpus,  test_eng_corpus,  MAX_LEN)

print(f"After filter → Train: {len(train_kor_corpus)}, Dev: {len(dev_kor_corpus)}, Test: {len(test_kor_corpus)}")

Train: 94123, Dev: 1000, Test: 2000
After filter → Train: 77496, Dev: 984, Test: 1953


In [4]:
#=========================================================
#================추가된 Mecab 단계(+안전 폴백)=============
#=========================================================
try:
    from konlpy.tag import Mecab
    mecab = Mecab()
    def mecab_tokenize_corpus(corpus):
        mecab_corpus = []
        for sentence in corpus:
            morphs = mecab.morphs(sentence)
            mecab_corpus.append(" ".join(morphs))
        return mecab_corpus
except Exception as e:
    print("[경고] Mecab 사용 불가:", e, "\n[대체] KoNLPy Okt로 폴백합니다(제출 시 Mecab 권장).")
    from konlpy.tag import Okt
    _okt = Okt()
    def mecab_tokenize_corpus(corpus):
        return [" ".join(_okt.morphs(sentence)) for sentence in corpus]

# 한국어 데이터셋 Mecab/Okt 처리
train_kor_mecab = mecab_tokenize_corpus(train_kor_corpus)
dev_kor_mecab   = mecab_tokenize_corpus(dev_kor_corpus)
test_kor_mecab  = mecab_tokenize_corpus(test_kor_corpus)

print("Before:", train_kor_corpus[0])
print("After :", train_kor_mecab[0])
#=========================================================
#=========================================================

Before: 이 자금은 대부분 이라크 주둔 군대 조직운영에 투자될 예정이며 아프간 파병군도 이 기금의 일부를 얻게 될 것이다 .
After : 이 자금 은 대부분 이라크 주둔 군대 조직 운영 에 투자 될 예정 이 며 아프간 파병 군 도 이 기금 의 일부 를 얻 게 될 것 이 다 .


In [5]:
def generate_tokenizer(corpus, vocab_size, lang, pad_id=0, bos_id=1, eos_id=2, unk_id=3):
    file = f'./{lang}_corpus.txt'
    model_prefix = f'{lang}_spm'
    with open(file, 'w', encoding='utf-8') as f:
        for row in corpus:
            f.write(str(row) + '\n')
    spm.SentencePieceTrainer.Train(
        f'--input={file} --model_prefix={model_prefix} --vocab_size={vocab_size}'
        f' --pad_id={pad_id} --bos_id={bos_id} --eos_id={eos_id} --unk_id={unk_id}'
    )
    tokenizer = spm.SentencePieceProcessor()
    tokenizer.Load(f'{model_prefix}.model')
    return tokenizer

# 한국어는 mecab(또는 폴백)의 토큰화 결과, 영어는 원문 코퍼스 기준
ko_tokenizer = generate_tokenizer(train_kor_mecab, SRC_VOCAB_SIZE, "ko")
en_tokenizer = generate_tokenizer(train_eng_corpus, TGT_VOCAB_SIZE, "en")

sentencepiece_trainer.cc(178) LOG(INFO) Running command: --input=./ko_corpus.txt --model_prefix=ko_spm --vocab_size=20000 --pad_id=0 --bos_id=1 --eos_id=2 --unk_id=3
sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: ./ko_corpus.txt
  input_format: 
  model_prefix: ko_spm
  model_type: UNIGRAM
  vocab_size: 20000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 3
 

In [6]:
# =========================================
# 4. 데이터 증강 (Lexical Substitution)
# =========================================

def synonym_replace_en(sentence, p=0.2):
    tokens = sentence.split()
    new_tokens = tokens[:]
    for i, w in enumerate(tokens):
        if random.random() < p and w.isalpha() and len(w) > 2:
            syns = wn.synsets(w, lang='eng')
            lemmas = set()
            for s in syns:
                for l in s.lemmas():
                    lemma = l.name().replace('_', ' ').lower()
                    if lemma.isalpha() and lemma != w:
                        lemmas.add(lemma)
            if lemmas:
                new_tokens[i] = random.choice(list(lemmas))
    return ' '.join(new_tokens)

def random_swap(sentence, n_swaps=1):
    tokens = sentence.split()
    if len(tokens) < 2: return sentence
    for _ in range(n_swaps):
        i, j = random.sample(range(len(tokens)), 2)
        tokens[i], tokens[j] = tokens[j], tokens[i]
    return ' '.join(tokens)

def random_deletion(sentence, p=0.1):
    tokens = sentence.split()
    if len(tokens) <= 1: return sentence
    kept = [t for t in tokens if random.random() > p]
    if not kept: kept = [random.choice(tokens)]
    return ' '.join(kept)

def augment_pair(kor, eng):
    # 한국어: 스왑/삭제 위주
    kor_aug = kor
    if random.random() < 0.5: kor_aug = random_swap(kor_aug, n_swaps=1)
    if random.random() < 0.5: kor_aug = random_deletion(kor_aug, p=0.1)

    # 영어: 동의어 치환 + 스왑/삭제
    eng_aug = eng
    if random.random() < 0.7: eng_aug = synonym_replace_en(eng_aug, p=0.2)
    if random.random() < 0.3: eng_aug = random_swap(eng_aug, n_swaps=1)
    if random.random() < 0.3: eng_aug = random_deletion(eng_aug, p=0.1)
    return kor_aug, eng_aug

def build_augmented_corpus(kor_mecab, eng_corpus, ratio=0.5, seed=SEED):
    random.seed(seed)
    n = len(kor_mecab)
    k = int(n * ratio)  # 0.5배 → 전체 1.5배
    indices = random.sample(range(n), k)
    aug_ko, aug_en = [], []
    for idx in indices:
        k_aug, e_aug = augment_pair(kor_mecab[idx], eng_corpus[idx])
        if len(k_aug.split()) <= MAX_LEN and len(e_aug.split()) <= MAX_LEN:
            aug_ko.append(k_aug)
            aug_en.append(e_aug)
    return aug_ko, aug_en

# 실제 증강 적용
aug_ko, aug_en = build_augmented_corpus(train_kor_mecab, train_eng_corpus, ratio=0.5, seed=SEED)
train_kor_mecab = train_kor_mecab + aug_ko
train_eng_corpus = train_eng_corpus + aug_en

# 셔플(동일 시드)
tmp = list(zip(train_kor_mecab, train_eng_corpus))
random.shuffle(tmp)
train_kor_mecab, train_eng_corpus = zip(*tmp)
train_kor_mecab, train_eng_corpus = list(train_kor_mecab), list(train_eng_corpus)
print(f"[Augment] +{len(aug_ko)} → Total Train: {len(train_kor_mecab)}")

[Augment] +37622 → Total Train: 115118


In [7]:
# =========================================
# 5. 데이터셋 및 DataLoader 구축
# =========================================

class TranslationDataset(Dataset):
    def __init__(self, src_corpus, tgt_corpus, src_tokenizer, tgt_tokenizer):
        self.src_corpus = src_corpus
        self.tgt_corpus = tgt_corpus
        self.src_tokenizer = src_tokenizer
        self.tgt_tokenizer = tgt_tokenizer

    def __len__(self):
        return len(self.src_corpus)

    def __getitem__(self, idx):
        src = self.src_tokenizer.encode_as_ids(self.src_corpus[idx])
        tgt = self.tgt_tokenizer.encode_as_ids(self.tgt_corpus[idx])
        return torch.tensor(src, dtype=torch.long), torch.tensor(tgt, dtype=torch.long)

def collate_fn(batch):
    src_batch, tgt_batch = [], []
    for src_sample, tgt_sample in batch:
        src_batch.append(src_sample)
        tgt_batch.append(tgt_sample)
    src_padded = torch.nn.utils.rnn.pad_sequence(src_batch, batch_first=True, padding_value=ko_tokenizer.pad_id())
    tgt_padded = torch.nn.utils.rnn.pad_sequence(tgt_batch, batch_first=True, padding_value=en_tokenizer.pad_id())
    return src_padded, tgt_padded

train_dataset = TranslationDataset(train_kor_mecab, train_eng_corpus, ko_tokenizer, en_tokenizer)
valid_dataset = TranslationDataset(dev_kor_mecab,   dev_eng_corpus,   ko_tokenizer, en_tokenizer)
test_dataset  = TranslationDataset(test_kor_mecab,  test_eng_corpus,  ko_tokenizer, en_tokenizer)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,  collate_fn=collate_fn, num_workers=4)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn, num_workers=4)
test_loader  = DataLoader(test_dataset,  batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn, num_workers=4)

print(f"Number of batches in train_loader: {len(train_loader)}")
print(f"Number of batches in valid_loader: {len(valid_loader)}")
print(f"Number of batches in test_loader: {len(test_loader)}")

Number of batches in train_loader: 1799
Number of batches in valid_loader: 16
Number of batches in test_loader: 31


In [8]:
# =========================================
# 6. 트랜스포머 모델 정의 (마스크/타이잉 개선)
# =========================================

class PositionalEncoding(nn.Module):
    def __init__(self, emb_size: int, dropout: float, maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        div_term = torch.exp(torch.arange(0, emb_size, 2) * (-math.log(10000.0) / emb_size))
        position = torch.arange(maxlen).unsqueeze(1)
        pos_embedding = torch.zeros(maxlen, emb_size)
        pos_embedding[:, 0::2] = torch.sin(position * div_term)
        pos_embedding[:, 1::2] = torch.cos(position * div_term)
        pos_embedding = pos_embedding.unsqueeze(0)
        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding):
        return self.dropout(token_embedding + self.pos_embedding[:, :token_embedding.size(1), :])

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        self.depth = d_model // num_heads
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.linear = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        d_k = Q.size(-1)
        scores = torch.matmul(Q, K.transpose(-1, -2)) / math.sqrt(d_k)  # (B, H, Lq, Lk)
        if mask is not None:
            if mask.dim() == 3:
                mask = mask.unsqueeze(1)
            scores = scores.masked_fill(mask, float('-inf'))
        attentions = torch.softmax(scores, dim=-1)
        out = torch.matmul(attentions, V)
        return out, attentions

    def split_heads(self, x):
        bsz, seq_len, _ = x.size()
        x = x.view(bsz, seq_len, self.num_heads, self.depth)
        return x.permute(0, 2, 1, 3)

    def combine_heads(self, x):
        bsz, _, seq_len, _ = x.size()
        x = x.permute(0, 2, 1, 3).contiguous()
        return x.view(bsz, seq_len, self.d_model)

    def forward(self, Q, K, V, mask=None):
        WQ = self.split_heads(self.W_q(Q))
        WK = self.split_heads(self.W_k(K))
        WV = self.split_heads(self.W_v(V))
        out, attention_weights = self.scaled_dot_product_attention(WQ, WK, WV, mask)
        out = self.combine_heads(out)
        out = self.linear(out)
        return out, attention_weights

class PoswiseFeedForwardNet(nn.Module):
    def __init__(self, d_model, d_ff):
        super(PoswiseFeedForwardNet, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()
    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))

class EncoderLayer(nn.Module):
    def __init__(self, d_model, n_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()
        self.enc_self_attn = MultiHeadAttention(d_model, n_heads)
        self.ffn = PoswiseFeedForwardNet(d_model, d_ff)
        self.norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.do = nn.Dropout(dropout)
    def forward(self, x, mask):
        residual = x
        out, enc_attn = self.enc_self_attn(self.norm_1(x), self.norm_1(x), self.norm_1(x), mask)
        out = self.do(out) + residual
        residual = out
        out = self.ffn(self.norm_2(out))
        out = self.do(out) + residual
        return out, enc_attn

class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()
        self.dec_self_attn = MultiHeadAttention(d_model, num_heads)
        self.enc_dec_attn = MultiHeadAttention(d_model, num_heads)
        self.ffn = PoswiseFeedForwardNet(d_model, d_ff)
        self.norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.norm_3 = nn.LayerNorm(d_model, eps=1e-6)
        self.do = nn.Dropout(dropout)

    def forward(self, x, enc_out, dec_enc_mask, padding_mask):
        residual = x
        out, dec_attn = self.dec_self_attn(self.norm_1(x), self.norm_1(x), self.norm_1(x), mask=padding_mask)
        out = self.do(out) + residual
        residual = out
        out, dec_enc_attn = self.enc_dec_attn(self.norm_2(out), enc_out, enc_out, mask=dec_enc_mask)
        out = self.do(out) + residual
        residual = out
        out = self.ffn(self.norm_3(out))
        out = self.do(out) + residual
        return out, dec_attn, dec_enc_attn

class Encoder(nn.Module):
    def __init__(self, n_layers, d_model, n_heads, d_ff, dropout, vocab_size):
        super(Encoder, self).__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, dropout)
        self.enc_layers = nn.ModuleList([EncoderLayer(d_model, n_heads, d_ff, dropout) for _ in range(n_layers)])

    def forward(self, x, mask):
        out = self.embedding(x) * math.sqrt(self.d_model)
        out = self.pos_encoding(out)
        enc_attns = []
        for layer in self.enc_layers:
            out, enc_attn = layer(out, mask)
            enc_attns.append(enc_attn)
        return out, enc_attns

class Decoder(nn.Module):
    def __init__(self, n_layers, d_model, n_heads, d_ff, dropout, vocab_size):
        super(Decoder, self).__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, dropout)
        self.dec_layers = nn.ModuleList([DecoderLayer(d_model, n_heads, d_ff, dropout) for _ in range(n_layers)])

    def forward(self, x, enc_out, dec_enc_mask, padding_mask):
        out = self.embedding(x) * math.sqrt(self.d_model)
        out = self.pos_encoding(out)
        dec_attns, dec_enc_attns = [], []
        for layer in self.dec_layers:
            out, dec_attn, dec_enc_attn = layer(out, enc_out, dec_enc_mask, padding_mask)
            dec_attns.append(dec_attn)
            dec_enc_attns.append(dec_enc_attn)
        return out, dec_attns, dec_enc_attns

# === [변경 지점] 마스크 유틸리티 추가 ===
def create_padding_mask(seq, pad_id):
    return (seq == pad_id).unsqueeze(1).unsqueeze(2)  # (B,1,1,L)

def create_look_ahead_mask(size, device):
    return torch.triu(torch.ones(size, size, device=device, dtype=torch.bool), diagonal=1)  # (L,L)

class Transformer(nn.Module):
    def __init__(self, n_layers, d_model, n_heads, d_ff, src_vocab_size, tgt_vocab_size, dropout):
        super(Transformer, self).__init__()
        self.encoder = Encoder(n_layers, d_model, n_heads, d_ff, dropout, src_vocab_size)
        self.decoder = Decoder(n_layers, d_model, n_heads, d_ff, dropout, tgt_vocab_size)
        self.fc = nn.Linear(d_model, tgt_vocab_size)
        # === [변경 지점] Weight Tying (Decoder 임베딩과 출력층 공유) ===
        self.fc.weight = self.decoder.embedding.weight

    def forward(self, src, tgt):
        src_mask = create_padding_mask(src, ko_tokenizer.pad_id())           # (B,1,1,SrcL)
        tgt_pad_mask = create_padding_mask(tgt, en_tokenizer.pad_id())       # (B,1,1,TgtL)
        lookahead = create_look_ahead_mask(tgt.size(1), device).unsqueeze(0).unsqueeze(1)  # (1,1,TgtL,TgtL)
        dec_self_mask = tgt_pad_mask | lookahead                             # (B,1,TgtL,TgtL)

        enc_out, enc_attns = self.encoder(src, src_mask)
        dec_out, dec_attns, dec_enc_attns = self.decoder(tgt, enc_out, src_mask, dec_self_mask)
        logits = self.fc(dec_out)
        return logits, enc_attns, dec_attns, dec_enc_attns

In [9]:
# =========================================
# 7. 학습 설정 (고도화)
# =========================================

model = Transformer(N_LAYERS, D_MODEL, N_HEADS, D_FF, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, DROPOUT).to(device)

# === [변경 지점] Label Smoothing + ignore_index=영어 PAD ===
class LabelSmoothingLoss(nn.Module):
    def __init__(self, classes, smoothing=0.0, ignore_index=-100):
        super().__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.cls = classes
        self.ignore_index = ignore_index
        self.log_softmax = nn.LogSoftmax(dim=-1)

    def forward(self, pred, target):
        pred = self.log_softmax(pred)  # (N,C)
        with torch.no_grad():
            true_dist = torch.full_like(pred, self.smoothing / (self.cls - 1))
            ignore_mask = target.eq(self.ignore_index)
            target_clamped = target.clone()
            target_clamped[ignore_mask] = 0  # dummy
            true_dist.scatter_(1, target_clamped.unsqueeze(1), self.confidence)
            true_dist[ignore_mask] = 0
        loss = torch.sum(-true_dist * pred, dim=1)
        loss = loss.masked_select(~ignore_mask).mean()
        return loss

criterion = LabelSmoothingLoss(classes=TGT_VOCAB_SIZE, smoothing=LABEL_SMOOTHING, ignore_index=en_tokenizer.pad_id())
optimizer = optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9, weight_decay=WEIGHT_DECAY)

# === [변경 지점] Noam 스케줄러 ===
class NoamScheduler:
    def __init__(self, optimizer, d_model, warmup_steps=4000):
        self.optimizer = optimizer
        self.warmup = warmup_steps
        self._step = 0
        self.factor = d_model ** (-0.5)
    def step(self):
        self._step += 1
        lr = self.factor * min(self._step ** (-0.5), self._step * (self.warmup ** -1.5))
        for pg in self.optimizer.param_groups:
            pg['lr'] = lr
        return lr
    @property
    def step_num(self):
        return self._step

scheduler = NoamScheduler(optimizer, d_model=D_MODEL, warmup_steps=WARMUP_STEPS)
scaler = torch.cuda.amp.GradScaler(enabled=USE_AMP)

  scaler = torch.cuda.amp.GradScaler(enabled=USE_AMP)


In [10]:
# =========================================
# 8. 학습 및 검증 (AMP/누적/체크포인트/얼리스탑)
# =========================================

# === 체크포인트 유틸 ===
def save_checkpoint(path, model, optimizer, scheduler, scaler, epoch, best_valid):
    torch.save({
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'scheduler_step': scheduler.step_num,
        'scaler': scaler.state_dict() if scaler is not None else None,
        'epoch': epoch,                 # 현재 완료한 에포크 인덱스(0-based)
        'best_valid': best_valid
    }, path)

def load_checkpoint(path, model, optimizer, scheduler, scaler):
    ckpt = torch.load(path, map_location=device)
    model.load_state_dict(ckpt['model'])
    optimizer.load_state_dict(ckpt['optimizer'])
    if 'scheduler_step' in ckpt and ckpt['scheduler_step'] is not None:
        scheduler._step = ckpt['scheduler_step']
    if scaler is not None and ckpt.get('scaler') is not None:
        scaler.load_state_dict(ckpt['scaler'])
    return ckpt.get('epoch', 0), ckpt.get('best_valid', float('inf'))

# === 얼리스탑 ===
class EarlyStopping:
    def __init__(self, patience=PATIENCE, verbose=True):
        self.patience = patience
        self.counter = 0
        self.best = None
        self.early_stop = False
        self.verbose = verbose
    def step(self, metric):
        if self.best is None or metric < self.best - 1e-9:
            self.best = metric
            self.counter = 0
        else:
            self.counter += 1
            if self.verbose:
                print(f"  [EarlyStop] no improv for {self.counter}/{self.patience}")
            if self.counter >= self.patience:
                self.early_stop = True

def train(model, iterator, optimizer, criterion, clip, log_interval=100, scheduler=None, scaler=None):
    model.train()
    epoch_loss = 0.0
    optimizer.zero_grad(set_to_none=True)
    for step, batch in enumerate(iterator, start=1):
        src = batch[0].to(device)
        tgt = batch[1].to(device)
        with torch.cuda.amp.autocast(enabled=USE_AMP):
            output, _, _, _ = model(src, tgt[:, :-1])
            output_dim = output.shape[-1]
            output = output.contiguous().view(-1, output_dim)
            tgt_flat = tgt[:, 1:].contiguous().view(-1)
            loss = criterion(output, tgt_flat) / GRAD_ACCUM_STEPS

        if scaler is not None and USE_AMP:
            scaler.scale(loss).backward()
        else:
            loss.backward()

        if step % GRAD_ACCUM_STEPS == 0:
            if MAX_GRAD_NORM is not None:
                if scaler is not None and USE_AMP:
                    scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)

            if scaler is not None and USE_AMP:
                scaler.step(optimizer)
                scaler.update()
            else:
                optimizer.step()
            optimizer.zero_grad(set_to_none=True)
            if scheduler is not None:
                scheduler.step()

        epoch_loss += loss.item() * GRAD_ACCUM_STEPS

        if (step) % log_interval == 0:
            curr_lr = optimizer.param_groups[0]['lr']
            print(f"  - Step {step}/{len(iterator)} | Batch Loss: {loss.item()*GRAD_ACCUM_STEPS:.4f} | LR: {curr_lr:.6f}")

    return epoch_loss / len(iterator)

def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0.0
    with torch.no_grad():
        for batch in iterator:
            src = batch[0].to(device)
            tgt = batch[1].to(device)
            output, _, _, _ = model(src, tgt[:, :-1])
            output_dim = output.shape[-1]
            output = output.contiguous().view(-1, output_dim)
            tgt_flat = tgt[:, 1:].contiguous().view(-1)
            loss = criterion(output, tgt_flat)
            epoch_loss += loss.item()
    return epoch_loss / len(iterator)

# --- 학습 루프 (체크포인트/얼리스탑/재개 안전) ---
best_valid_loss = float('inf')
early_stopper = EarlyStopping(patience=PATIENCE, verbose=True)

START_EPOCH = 0
if RESUME and os.path.exists(LAST_CKPT_PATH):
    print(f"[Resume] Loading last checkpoint from {LAST_CKPT_PATH}")
    last_epoch, best_valid_loss = load_checkpoint(LAST_CKPT_PATH, model, optimizer, scheduler, scaler)
    START_EPOCH = last_epoch + 1  # ← 저장된 마지막 에포크의 '다음'부터 시작
    print(f"[Resume] Will start from epoch {START_EPOCH+1}/{EPOCHS} (best_valid={best_valid_loss:.4f})")

if START_EPOCH >= EPOCHS:
    print("[Resume] Training already completed. Nothing to do.")
else:
    for epoch in range(START_EPOCH, EPOCHS):
        start_time = time.time()
        print(f"\nEpoch {epoch+1:02} / {EPOCHS:02}")
        print("Training...")
        train_loss = train(model, train_loader, optimizer, criterion, clip=1, log_interval=200, scheduler=scheduler, scaler=scaler)

        print("Evaluating...")
        valid_loss = evaluate(model, valid_loader, criterion)

        # 체크포인트 저장(라스트)
        save_checkpoint(LAST_CKPT_PATH, model, optimizer, scheduler, scaler, epoch, best_valid_loss)

        # 베스트 저장
        if valid_loss < best_valid_loss - 1e-9:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), BEST_MODEL_PATH)
            print("best model saved.")

        # 얼리스탑 판정
        early_stopper.step(valid_loss)
        end_time = time.time()
        epoch_mins, epoch_secs = divmod(end_time - start_time, 60)

        print(f'Time: {int(epoch_mins)}m {int(epoch_secs)}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(min(20, train_loss)) :7.3f}')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(min(20, valid_loss)):7.3f}')
        print("-" * 30)

        if early_stopper.early_stop:
            print("[EarlyStop] Stopping training early.")
            break


Epoch 01 / 10
Training...


  with torch.cuda.amp.autocast(enabled=USE_AMP):


  - Step 200/1799 | Batch Loss: 10486.0215 | LR: 0.000017
  - Step 400/1799 | Batch Loss: 8091.2275 | LR: 0.000035
  - Step 600/1799 | Batch Loss: 5964.3398 | LR: 0.000052
  - Step 800/1799 | Batch Loss: 4917.2529 | LR: 0.000070
  - Step 1000/1799 | Batch Loss: 4588.4536 | LR: 0.000087
  - Step 1200/1799 | Batch Loss: 3863.1375 | LR: 0.000105
  - Step 1400/1799 | Batch Loss: 3692.7334 | LR: 0.000122
  - Step 1600/1799 | Batch Loss: 3712.6096 | LR: 0.000140
Evaluating...
best model saved.
Time: 5m 30s
	Train Loss: 5815.647 | Train PPL: 485165195.410
	 Val. Loss: 2732.062 |  Val. PPL: 485165195.410
------------------------------

Epoch 02 / 10
Training...
  - Step 200/1799 | Batch Loss: 2934.5679 | LR: 0.000175
  - Step 400/1799 | Batch Loss: 2938.9463 | LR: 0.000192
  - Step 600/1799 | Batch Loss: 2775.8247 | LR: 0.000209
  - Step 800/1799 | Batch Loss: 2679.3535 | LR: 0.000227
  - Step 1000/1799 | Batch Loss: 2506.5425 | LR: 0.000244
  - Step 1200/1799 | Batch Loss: 2623.6423 | LR: 0.0

In [11]:
# =========================================
# 9. 번역 및 성능 평가 (BLEU)
# =========================================

def translate_sentence(sentence, src_tokenizer, tgt_tokenizer, model, device, max_len=50):
    model.eval()
    src_tokens = src_tokenizer.encode_as_ids(sentence)
    src_tensor = torch.LongTensor(src_tokens).unsqueeze(0).to(device)
    tgt_tokens = [tgt_tokenizer.bos_id()]
    dec_enc_attns_all = None
    for i in range(max_len):
        tgt_tensor = torch.LongTensor(tgt_tokens).unsqueeze(0).to(device)
        with torch.no_grad():
            output, _, _, dec_enc_attns = model(src_tensor, tgt_tensor)
        pred_token = output.argmax(2)[:,-1].item()
        tgt_tokens.append(pred_token)
        dec_enc_attns_all = dec_enc_attns  # 마지막 스텝의 어텐션들
        if pred_token == tgt_tokenizer.eos_id():
            break
    tgt_sentence = tgt_tokenizer.decode_ids(tgt_tokens)
    return tgt_sentence, dec_enc_attns_all

# (베스트 모델 로드하여 추론)
if os.path.exists(BEST_MODEL_PATH):
    model.load_state_dict(torch.load(BEST_MODEL_PATH, map_location=device))
    print("[Info] Loaded best model for inference.")

example_idx = 0
src = test_kor_raw[example_idx]
trg = test_eng_raw[example_idx]
translation, attention = translate_sentence(src, ko_tokenizer, en_tokenizer, model, device)
print(f'src = {src}')
print(f'trg = {trg}')
print(f'predicted trg = {translation}')

[Info] Loaded best model for inference.
src = 토론에 참여한 사람들은 법 집행과 국가 안전보장에 대한 우려를 표명해야 할 필요성을 진지하게 받아 들이고 있습니다.
trg = Those involved in the discussions do take seriously the need to address concerns of law enforcement and national security.
predicted trg = nis nis nis nis nis nis nis nis nis nis nis nis nis nis nis nis nis nis nis nis nis nis nis nis nis nis nis nis nis nis nis nis nis nis nis nis nis nis nis nis nis nis nis nis nis nis nis nis nis nis


In [12]:
# =========================================
# 10. 어텐션 시각화
# =========================================

def display_attention(sentence, translation, attention, n_heads=8, n_rows=4, n_cols=2):
    """어텐션 맵을 시각화합니다."""
    assert n_rows * n_cols == n_heads

    font_path = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
    font_prop = fm.FontProperties(fname=font_path, size=8)

    fig = plt.figure(figsize=(12, 28))

    sentence_tokens = sentence.split()
    translation_tokens = translation.split()

    # 디코더-인코더 어텐션의 마지막 레이어 사용
    attn = attention[-1]  # shape: (B, H, TgtL, SrcL)

    for i in range(n_heads):
        ax = fig.add_subplot(n_rows, n_cols, i + 1)
        _attention = attn.squeeze(0)[i].detach().cpu().numpy()
        src_len = len(sentence_tokens)
        tgt_len = len(translation_tokens)
        cax = ax.matshow(_attention[:, :src_len], cmap='viridis',
                         extent=[-0.5, src_len - 0.5, tgt_len - 0.5, -0.5])
        ax.set_xticks(range(src_len))
        ax.set_yticks(range(tgt_len))
        ax.set_xticklabels(sentence_tokens, rotation=90, fontproperties=font_prop, ha='center', va='center')
        ax.set_yticklabels(translation_tokens, fontproperties=font_prop, ha='right', va='center')
        ax.tick_params(labelsize=8, pad=15)

    plt.tight_layout()
    plt.show()

display_attention(src, translation, attention)


FileNotFoundError: [Errno 2] No such file or directory: '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'

Error in callback <function _draw_all_if_interactive at 0x787ce8202520> (for post_execute), with arguments args (),kwargs {}:


FileNotFoundError: [Errno 2] No such file or directory: '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'

FileNotFoundError: [Errno 2] No such file or directory: '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'

<Figure size 1200x2800 with 8 Axes>