[nn.Transformer와 Torchtext로 언어 번역하기](https://tutorials.pytorch.kr/beginner/translation_transformer.html)

In [1]:
# Transformer를 사용한 번역 모델을 스크래치부터 학습
# Multi30k 데이터셋을 사용한 독일어 -> 영어 번역 모델 학습

In [2]:
# Install the required dependencies
!pip install -U torchdata
!pip install -U spacy
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm
!pip install portalocker

Collecting spacy
  Downloading spacy-3.5.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: spacy
  Attempting uninstall: spacy
    Found existing installation: spacy 3.5.3
    Uninstalling spacy-3.5.3:
      Successfully uninstalled spacy-3.5.3
Successfully installed spacy-3.5.4
2023-07-04 15:10:43.510995: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-07-04 15:10:46.058715: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See 

## 데이터 구하고 처리하기

In [3]:
"""
torchtext library: provides tools for building the datasets used to train
language-translation models.

What this notebook does with it:
    - uses torchtext's built-in dataset
    - tokenizes the raw text sentences and numericalizes them into tensors
"""

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import multi30k, Multi30k

# Override the dataset download links (the original data links no longer work).
multi30k.URL["train"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/training.tar.gz"
multi30k.URL["valid"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/validation.tar.gz"

# Language codes of the translation pair: source is German, target is English.
SRC_LANGUAGE = 'de'
TGT_LANGUAGE = 'en'

# Placeholders keyed by language code; they are populated further down
# (token_transform with tokenizers, vocab_transform with vocabularies).
token_transform = {}
vocab_transform = {}

In [4]:
from typing import Iterable, List

# Register one spaCy tokenizer per language, keyed by language code.
token_transform.update({
    SRC_LANGUAGE: get_tokenizer('spacy', language='de_core_news_sm'),
    TGT_LANGUAGE: get_tokenizer('spacy', language='en_core_web_sm'),
})

# Helper that produces the token lists used to build a vocabulary.
def yield_tokens(data_iter: Iterable, language: str) -> Iterable[List[str]]:
    """Lazily tokenize one side of every sample in ``data_iter``.

    Each sample is indexed by position: index 0 is used when ``language`` is
    ``SRC_LANGUAGE`` and index 1 when it is ``TGT_LANGUAGE``. The selected
    element is passed to the tokenizer registered in ``token_transform`` for
    that language.

    Args:
        data_iter: Iterable of samples that support integer indexing.
        language: Either ``SRC_LANGUAGE`` or ``TGT_LANGUAGE``.

    Yields:
        The tokenizer output for each sample, one per iteration.

    Raises:
        KeyError: On first iteration, if ``language`` is not one of the two
            configured languages or has no tokenizer registered.
    """
    language_index = {SRC_LANGUAGE: 0, TGT_LANGUAGE: 1}

    # Both lookups are loop-invariant, so resolve them once up front.
    sample_position = language_index[language]
    tokenize = token_transform[language]

    for data_sample in data_iter:
        yield tokenize(data_sample[sample_position])

# Special symbols; they are inserted first into each vocabulary, so a symbol's
# position in this list matches the index constant defined for it below.
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = range(len(special_symbols))

for ln in (SRC_LANGUAGE, TGT_LANGUAGE):
    # Create a fresh training-data iterator for this language's pass.
    train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))

    # Build the vocabulary from the tokenized training sentences.
    vocab_transform[ln] = build_vocab_from_iterator(
        yield_tokens(train_iter, ln),
        min_freq=1,
        specials=special_symbols,
        special_first=True,
    )

    # Use UNK_IDX as the default index for out-of-vocabulary tokens.
    # Without a default index, looking up an unknown token raises RuntimeError.
    vocab_transform[ln].set_default_index(UNK_IDX)

## Transformer를 이용한 seq2seq 신경망

In [6]:
"""
Transformer
    - a seq2seq model
    - components
        1. embedding layer: with positional encoding added
        2. transformer model
        3. un-normalized probabilities
"""

from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import Transformer
import math
# Select the CUDA device when one is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Helper module that adds positional encoding to token embeddings.
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to embeddings, then apply dropout.

    The encoding table has shape ``(maxlen, 1, emb_size)``: even feature
    columns hold sines and odd feature columns hold cosines of
    ``position * 10000 ** (-2i / emb_size)``.

    Args:
        emb_size: Size of the embedding (feature) dimension. Both even and odd
            sizes are supported.
        dropout: Dropout probability applied to the summed output.
        maxlen: Number of positions for which encodings are precomputed.
    """

    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 5000):
        super().__init__()

        # torch.arange(0, emb_size, 2) yields 0, 2, 4, ... below emb_size,
        # i.e. one frequency per sine column (ceil(emb_size / 2) of them).
        den = torch.exp(-torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        angles = pos * den

        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(angles)
        # There are only floor(emb_size / 2) odd columns, so for an odd
        # emb_size the last frequency has no cosine slot and must be dropped;
        # for an even emb_size this slice keeps every column.
        pos_embedding[:, 1::2] = torch.cos(angles[:, :emb_size // 2])
        # Insert a singleton dimension so the table broadcasts over dim -2
        # of the input.
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)

        # Registered as a buffer so it is not treated as a model parameter.
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(
        self,
        token_embedding: Tensor
    ):
        """Return ``dropout(token_embedding + positional_encoding)``.

        Dimension 0 of ``token_embedding`` is treated as the sequence
        position: the first ``token_embedding.size(0)`` rows of the encoding
        table are added and broadcast across dimension -2.
        """
        seq_len = token_embedding.size(0)
        return self.dropout(token_embedding + self.pos_embedding[:seq_len, :])

In [7]:
# Helper module that converts a tensor of input indices into the tensor of
# corresponding token embeddings.
class TokenEmbedding(nn.Module):
    """Embedding lookup whose output is scaled by ``sqrt(emb_size)``.

    Args:
        vocab_size: Number of rows in the embedding table.
        emb_size: Width of each embedding vector; also stored on the module
            and used for the output scaling factor.
    """

    def __init__(
        self,
        vocab_size: int,
        emb_size
    ):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        """Look up ``tokens`` (cast to int64) and scale the embeddings."""
        indices = tokens.long()
        # Scaling by sqrt(emb_size) follows the formulation in section 3.4
        # of the Transformer paper.
        scale = math.sqrt(self.emb_size)
        return self.embedding(indices) * scale

In [None]:
# Seq2Seq network
class Seq2SeqTransformer(nn.Module):
    """Sequence-to-sequence model built around ``torch.nn.Transformer``.

    Source and target token indices are embedded with separate
    ``TokenEmbedding`` tables, passed through a shared ``PositionalEncoding``,
    run through the transformer, and projected to ``tgt_vocab_size`` outputs
    by a linear layer.

    Args:
        num_encoder_layers: Number of encoder layers in the transformer.
        num_decoder_layers: Number of decoder layers in the transformer.
        emb_size: Embedding size, used as the transformer's ``d_model``.
        nhead: Number of attention heads.
        src_vocab_size: Size of the source embedding table.
        tgt_vocab_size: Size of the target embedding table and output layer.
        dim_feedforward: Hidden size of the transformer feed-forward layers.
        dropout: Dropout rate for the transformer and positional encoding.
    """

    def __init__(
        self,
        num_encoder_layers: int,
        num_decoder_layers: int,
        emb_size: int,
        nhead: int,
        src_vocab_size: int,
        tgt_vocab_size: int,
        dim_feedforward: int = 512,
        dropout: float = 0.1,
    ):
        super().__init__()

        # Submodules are created in a fixed order: transformer, output
        # projection, source/target embeddings, positional encoding.
        self.transformer = Transformer(
            d_model=emb_size,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
        )
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)

    def forward(
        self,
        src: Tensor,
        trg: Tensor,
        src_mask: Tensor,
        tgt_mask: Tensor,
        src_padding_mask: Tensor,
        tgt_padding_mask: Tensor,
        memory_key_padding_mask: Tensor,
    ):
        """Run the full encoder-decoder pass and project to vocabulary scores.

        ``src`` and ``trg`` are token-index tensors. The mask arguments are
        forwarded to ``nn.Transformer`` as ``src_mask``, ``tgt_mask``,
        ``src_key_padding_mask``, ``tgt_key_padding_mask`` and
        ``memory_key_padding_mask``; no ``memory_mask`` is used.
        """
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))

        # Explanation of the mask parameters:
        # https://stackoverflow.com/questions/62170439/difference-between-src-mask-and-src-key-padding-mask
        outs = self.transformer(
            src_emb,
            tgt_emb,
            src_mask=src_mask,
            tgt_mask=tgt_mask,
            memory_mask=None,  # additive mask for the encoder output
            src_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=tgt_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask,
        )
        return self.generator(outs)

    def encode(
        self,
        src: Tensor,
        src_mask: Tensor,
    ):
        """Embed ``src`` and return the transformer encoder output."""
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        return self.transformer.encoder(src_emb, mask=src_mask)

    def decode(
        self,
        tgt: Tensor,
        memory: Tensor,
        tgt_mask: Tensor,
    ):
        """Embed ``tgt`` and return the transformer decoder output for ``memory``.

        The result is the raw decoder output; ``self.generator`` is not
        applied here.
        """
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(tgt))
        return self.transformer.decoder(tgt_emb, memory, tgt_mask=tgt_mask)