#### **Transformer Implementation**
* Attention is All You Need (NIPS2017) 기반 코드 실습

In [1]:
!pip install torchtext==0.6.0

Collecting torchtext==0.6.0
  Downloading torchtext-0.6.0-py3-none-any.whl (64 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/64.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from torchtext==0.6.0)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m36.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentencepiece, torchtext
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.15.2
    Uninstalling torchtext-0.15.2:
      Successfully uninstalled torchtext-0.15.2
Successfully installed sentencepiece-0.1.99 torchtext-0.6.0


In [2]:
%%capture
!python -m spacy download en
!python -m spacy download de

In [3]:
# 전처리 모듈 spaCy 설치
import spacy
spacy_en = spacy.load('en_core_web_sm')
spacy_de = spacy.load('de_core_news_sm')

In [4]:
# 토큰화 테스트
tokenized = spacy_en.tokenizer("My name is YYS")
for i, token in enumerate(tokenized):
    print(f"Index {i} : {token.text}")

Index 0 : My
Index 1 : name
Index 2 : is
Index 3 : YYS


In [5]:
# 영어 토큰화 함수
def tokenize_en(text):
    return [token.text for token in spacy_en.tokenizer(text)]

# 독일어 토큰화 함수
def tokenize_de(text):
    return [token.text for token in spacy_de.tokenizer(text)]

In [6]:
# 문장 전처리 내역 명시
from torchtext.data import Field, BucketIterator, TabularDataset

SRC = Field(tokenize=tokenize_de, init_token="<sos>", eos_token="<eos>", lower=True, batch_first=True)
TRG = Field(tokenize=tokenize_en, init_token="<sos>", eos_token="<eos>", lower=True, batch_first=True)

In [7]:
# Multi30k load를 위한 과정
import os
from torchtext.datasets import Multi30k

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [46]:
!gzip -d test_2017_mscoco.en.gz
!gzip -d test_2017_mscoco.de.gz

In [47]:
# 경로 설정 및 파일 불러오기
data_folder = "/content/drive/MyDrive/dataset-master/data/task1/raw"

train_de_path = os.path.join(data_folder, "train.de")
train_en_path = os.path.join(data_folder, "train.en")
valid_de_path = os.path.join(data_folder, "val.de")
valid_en_path = os.path.join(data_folder, "val.en")
test_2016_de_path = os.path.join(data_folder, "test_2016_flickr.de")
test_2016_en_path = os.path.join(data_folder, "test_2016_flickr.en")
test_2017_de_path = os.path.join(data_folder, "test_2017_flickr.de")
test_2017_en_path = os.path.join(data_folder, "test_2017_flickr.en")
test_2017mcc_de_path = os.path.join(data_folder, "test_2017_mscoco.de")
test_2017mcc_en_path = os.path.join(data_folder, "test_2017_mscoco.en")
test_2018_de_path = os.path.join(data_folder, "test_2018_flickr.de")
test_2018_en_path = os.path.join(data_folder, "test_2018_flickr.en")

train_de_en_tsv_path = os.path.join(data_folder, "train.de-en.tsv")
valid_de_en_tsv_path = os.path.join(data_folder, "val.de-en.tsv")
test_2016_de_en_tsv_path = os.path.join(data_folder, "test_2016_flickr.de-en.tsv")
test_2017_de_en_tsv_path = os.path.join(data_folder, "test_2017_flickr.de-en.tsv")
test_2017mcc_de_en_tsv_path = os.path.join(data_folder, "test_2017_mscoco.de-en.tsv")
test_2018_de_en_tsv_path = os.path.join(data_folder, "test_2018_flickr.de-en.tsv")

In [48]:
# train.de-en.tsv 파일 만들기
with open(train_de_path, 'r', encoding='utf-8') as de_file:
    de_lines = de_file.readlines()

with open(train_en_path, 'r', encoding='utf-8') as en_file:
    en_lines = en_file.readlines()

with open(train_de_en_tsv_path, 'w', encoding='utf-8') as tsv_file:
    for de_line, en_line in zip(de_lines, en_lines):
        tsv_file.write(f"{de_line.strip()}\t{en_line.strip()}\n")

# valid.de-en.tsv 파일 만들기
with open(valid_de_path, 'r', encoding='utf-8') as de_file:
    de_lines = de_file.readlines()

with open(valid_en_path, 'r', encoding='utf-8') as en_file:
    en_lines = en_file.readlines()

with open(valid_de_en_tsv_path, 'w', encoding='utf-8') as tsv_file:
    for de_line, en_line in zip(de_lines, en_lines):
        tsv_file.write(f"{de_line.strip()}\t{en_line.strip()}\n")

# test_2016_flickr.de-en.tsv 파일 만들기
with open(test_2016_de_path, 'r', encoding='utf-8') as de_file:
    de_lines = de_file.readlines()

with open(test_2016_en_path, 'r', encoding='utf-8') as en_file:
    en_lines = en_file.readlines()

with open(test_2016_de_en_tsv_path, 'w', encoding='utf-8') as tsv_file:
    for de_line, en_line in zip(de_lines, en_lines):
        tsv_file.write(f"{de_line.strip()}\t{en_line.strip()}\n")

# test_2017_flickr.de-en.tsv 파일 만들기
with open(test_2017_de_path, 'r', encoding='utf-8') as de_file:
    de_lines = de_file.readlines()

with open(test_2017_en_path, 'r', encoding='utf-8') as en_file:
    en_lines = en_file.readlines()

with open(test_2017_de_en_tsv_path, 'w', encoding='utf-8') as tsv_file:
    for de_line, en_line in zip(de_lines, en_lines):
        tsv_file.write(f"{de_line.strip()}\t{en_line.strip()}\n")

# test_2017_mscoco.de-en.tsv 파일 만들기
with open(test_2017mcc_de_path, 'r', encoding='utf-8') as de_file:
    de_lines = de_file.readlines()

with open(test_2017mcc_en_path, 'r', encoding='utf-8') as en_file:
    en_lines = en_file.readlines()

with open(test_2017mcc_de_en_tsv_path, 'w', encoding='utf-8') as tsv_file:
    for de_line, en_line in zip(de_lines, en_lines):
        tsv_file.write(f"{de_line.strip()}\t{en_line.strip()}\n")

# test_2018_flickr.de-en.tsv 파일 만들기
with open(test_2018_de_path, 'r', encoding='utf-8') as de_file:
    de_lines = de_file.readlines()

with open(test_2018_en_path, 'r', encoding='utf-8') as en_file:
    en_lines = en_file.readlines()

with open(test_2018_de_en_tsv_path, 'w', encoding='utf-8') as tsv_file:
    for de_line, en_line in zip(de_lines, en_lines):
        tsv_file.write(f"{de_line.strip()}\t{en_line.strip()}\n")

In [53]:
# 데이터셋 로드
train_dataset, valid_dataset, test_dataset = TabularDataset.splits(
    path=data_folder,
    train='train.de-en.tsv',  # 실제 데이터셋 파일 이름에 맞게 변경
    validation='val.de-en.tsv',  # 실제 데이터셋 파일 이름에 맞게 변경
    test='test_2016_flickr.de-en.tsv',  # 실제 데이터셋 파일 이름에 맞게 변경
    format='tsv',  # 데이터셋 파일 형식에 맞게 변경
    fields=[('src', SRC), ('trg', TRG)]
)

In [54]:
# 데이터셋 잘 불러왔는지 테스트
print(f"Training dataset : {len(train_dataset.examples)}개")
print(f"Valid dataset : {len(valid_dataset.examples)}개")
print(f"Test dataset : {len(test_dataset.examples)}개")

# 데이터셋 인덱스 1번 example 문장 출력
print(vars(train_dataset.examples[1])['src'])
print(vars(train_dataset.examples[1])['trg'])

Training dataset : 29000개
Valid dataset : 1014개
Test dataset : 1000개
['mehrere', 'männer', 'mit', 'schutzhelmen', 'bedienen', 'ein', 'antriebsradsystem', '.']
['several', 'men', 'in', 'hard', 'hats', 'are', 'operating', 'a', 'giant', 'pulley', 'system', '.']


In [57]:
# vocab 세트 정의, 최소 2번 이상 등장한 단어만을 선택
SRC.build_vocab(train_dataset, min_freq=2)
TRG.build_vocab(train_dataset, min_freq=2)

# vocab 인덱스 테스트
print(TRG.vocab.stoi["asdf"]) # Unknown (없는단어)
print(TRG.vocab.stoi[TRG.pad_token]) # Padding
print(TRG.vocab.stoi["<sos>"]) # <sos>
print(TRG.vocab.stoi["<eos>"]) # <eos>
print(TRG.vocab.stoi["hello"])

0
1
2
3
4112


In [56]:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

BATCH_SIZE = 128

# Iterator 선언
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_dataset, valid_dataset, test_dataset),
    batch_size=BATCH_SIZE,
    device=device, sort=False)

# 첫번째 batch만 테스트
for i, batch in enumerate(train_iterator):
    src = batch.src
    trg = batch.trg

    # 현재 batch 있는 문장에 포함된 정보 출력
    for i in range(src.shape[1]):
        print(f"Index {i} : {src[0][i].item()}")

    break

Index 0 : 2
Index 1 : 43
Index 2 : 6823
Index 3 : 307
Index 4 : 8
Index 5 : 3254
Index 6 : 965
Index 7 : 11
Index 8 : 24
Index 9 : 5625
Index 10 : 0
Index 11 : 4
Index 12 : 3
Index 13 : 1
Index 14 : 1
Index 15 : 1
Index 16 : 1
Index 17 : 1
Index 18 : 1
Index 19 : 1
Index 20 : 1
Index 21 : 1
Index 22 : 1
Index 23 : 1
Index 24 : 1


In [58]:
# Multi-head Attention 정의
import torch.nn as nn

class MultiHeadAttentionLayer(nn.Module):
    def __init__(self, hidden_dim, n_heads, dropout_ratio, device):
        super().__init__()

        assert hidden_dim % n_heads == 0

        self.hidden_dim = hidden_dim # 임베딩 차원
        self.n_heads = n_heads # head의 개수: 서로 다른 attention의 수
        self.head_dim = hidden_dim // n_heads # 각 head에서의 임베딩 차원

        self.fc_q = nn.Linear(hidden_dim, hidden_dim) # Query 값에 적용될 FC 레이어
        self.fc_k = nn.Linear(hidden_dim, hidden_dim) # Key 값에 적용될 FC 레이어
        self.fc_v = nn.Linear(hidden_dim, hidden_dim) # Value 값에 적용될 FC 레이어

        self.fc_o = nn.Linear(hidden_dim, hidden_dim)

        self.dropout = nn.Dropout(dropout_ratio)

        self.scale = torch.sqrt(torch.FloatTensor([self.head_dim])).to(device)

    def forward(self, query, key, value, mask = None):

        batch_size = query.shape[0]

        Q = self.fc_q(query)
        K = self.fc_k(key)
        V = self.fc_v(value)

        # hidden_dim → n_heads X head_dim 형태로 변형
        # h개의 서로 다른 attention을 학습하도록 유도
        Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        K = K.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        V = V.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)

        # Attention Energy 계산
        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale

        # mask를 하는 경우 : mask값이 0인 부분을 -1e10으로 채우기
        if mask is not None:
            energy = energy.masked_fill(mask==0, -1e10)

        # Attention Score 계산: 각 단어에 대한 확률 값
        attention = torch.softmax(energy, dim=-1)

        # 여기에서 Scaled Dot-Product Attention을 계산
        x = torch.matmul(self.dropout(attention), V)

        # concat 후 output linear을 거쳐 return

        x = x.permute(0, 2, 1, 3).contiguous()
        x = x.view(batch_size, -1, self.hidden_dim)
        x = self.fc_o(x)

        return x, attention

In [59]:
# FFN 정의
class PositionwiseFeedforwardLayer(nn.Module):
    def __init__(self, hidden_dim, pf_dim, dropout_ratio):
        super().__init__()

        self.fc_1 = nn.Linear(hidden_dim, pf_dim)
        self.fc_2 = nn.Linear(pf_dim, hidden_dim)

        self.dropout = nn.Dropout(dropout_ratio)

    def forward(self, x):
        x = self.dropout(torch.relu(self.fc_1(x)))
        x = self.fc_2(x)
        return x

In [60]:
# Encoder Layer 내부 정의
class EncoderLayer(nn.Module):
    def __init__(self, hidden_dim, n_heads, pf_dim, dropout_ratio, device):
        super().__init__()

        self.self_attn_layer_norm = nn.LayerNorm(hidden_dim)
        self.ff_layer_norm = nn.LayerNorm(hidden_dim)
        self.self_attention = MultiHeadAttentionLayer(hidden_dim, n_heads, dropout_ratio, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(hidden_dim, pf_dim, dropout_ratio)
        self.dropout = nn.Dropout(dropout_ratio)

    def forward(self, src, src_mask):
        # self attention
        # 필요한 경우 mask matrix로 attention을 할 단어들을 조정
        _src, _ = self.self_attention(src, src, src, src_mask)

        # Add & Norm
        src = self.self_attn_layer_norm(src + self.dropout(_src))

        # FFN
        _src = self.positionwise_feedforward(src)

        # Add & Norm
        src = self.ff_layer_norm(src + self.dropout(_src))

        return src

In [61]:
# Transformer Encoder 정의
class Encoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, n_layers, n_heads, pf_dim, dropout_ratio, device, max_length=100):
        super().__init__()

        self.device = device

        self.tok_embedding = nn.Embedding(input_dim, hidden_dim)
        self.pos_embedding = nn.Embedding(max_length, hidden_dim)

        # 레이어 수만큼 encoder layer을 반복
        self.layers = nn.ModuleList([EncoderLayer(hidden_dim, n_heads, pf_dim, dropout_ratio, device) for _ in range(n_layers)])

        self.dropout = nn.Dropout(dropout_ratio)

        self.scale = torch.sqrt(torch.FloatTensor([hidden_dim])).to(device)

    def forward(self, src, src_mask):
        # Input 설정
        batch_size = src.shape[0]
        src_len = src.shape[1]
        pos = torch.arange(0, src_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)

        # Input embedding + Positional embedding
        src = self.dropout((self.tok_embedding(src) * self.scale) + self.pos_embedding(pos))

        for layer in self.layers:
            src = layer(src, src_mask)

        # 마지막 레이어 결과를 return
        return src

In [63]:
# Decoder Layer 내부 정의
class DecoderLayer(nn.Module):
    def __init__(self, hidden_dim, n_heads, pf_dim, dropout_ratio, device):
        super().__init__()

        self.self_attn_layer_norm = nn.LayerNorm(hidden_dim)
        self.enc_attn_layer_norm = nn.LayerNorm(hidden_dim)
        self.ff_layer_norm = nn.LayerNorm(hidden_dim)
        self.self_attention = MultiHeadAttentionLayer(hidden_dim, n_heads, dropout_ratio, device)
        self.encoder_attention = MultiHeadAttentionLayer(hidden_dim, n_heads, dropout_ratio, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(hidden_dim, pf_dim, dropout_ratio)
        self.dropout = nn.Dropout(dropout_ratio)

    def forward(self, trg, enc_src, trg_mask, src_mask):

        # self attention
        _trg, _ = self.self_attention(trg, trg, trg, trg_mask)

        # Add & Norm
        trg = self.self_attn_layer_norm(trg + self.dropout(_trg))

        # encoder attention
        # 인코더의 출력 값(enc_src)을 attention하는 구조
        # 디코더의 쿼리를 이용해 인코더를 attention
        _trg, attention = self.encoder_attention(trg, enc_src, enc_src, src_mask)

        # Add & Norm
        trg = self.enc_attn_layer_norm(trg + self.dropout(_trg))

        # FFN
        _trg = self.positionwise_feedforward(trg)

        # Add & Norm
        trg = self.ff_layer_norm(trg + self.dropout(_trg))

        return trg, attention

In [73]:
# Transformer Decoder 정의
class Decoder(nn.Module):
    def __init__(self, output_dim, hidden_dim, n_layers, n_heads, pf_dim, dropout_ratio, device, max_length=100):
        super().__init__()

        self.device = device

        self.tok_embedding = nn.Embedding(output_dim, hidden_dim)
        self.pos_embedding = nn.Embedding(max_length, hidden_dim)

        self.layers = nn.ModuleList([DecoderLayer(hidden_dim, n_heads, pf_dim, dropout_ratio, device) for _ in range(n_layers)])

        self.fc_out = nn.Linear(hidden_dim, output_dim)

        self.dropout = nn.Dropout(dropout_ratio)

        self.scale = torch.sqrt(torch.FloatTensor([hidden_dim])).to(device)

    def forward(self, trg, enc_src, trg_mask, src_mask):

        batch_size = trg.shape[0]
        trg_len = trg.shape[1]
        pos = torch.arange(0, trg_len).unsqueeze(0).repeat(batch_size, 1).to(self.device)

        # Output embedding + Positional embedding
        trg = self.dropout((self.tok_embedding(trg) * self.scale) + self.pos_embedding(pos))

        for layer in self.layers:
            trg, attention = layer(trg, enc_src, trg_mask, src_mask)

        output = self.fc_out(trg)

        # 마지막 레이어 결과(최종 번역 결과)를 return
        return output, attention

In [75]:
# Transoformer 전체 구조 정의
class Transformer(nn.Module):
    def __init__(self, encoder, decoder, src_pad_idx, trg_pad_idx, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device

    # encoder masking
    def make_src_mask(self, src):
        # padding 토큰에 대하여 mask 값을 0으로 설정
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        return src_mask

    # decoder masking
    def make_trg_mask(self, trg):
        # padding 토큰에 대하여 mask 값을 0으로 설정
        """ (마스크 예시)
        1 0 0 0 0
        1 1 0 0 0
        1 1 1 0 0
        1 1 1 0 0
        1 1 1 0 0
        """
        trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)

        # 타겟 문장에서 각 단어는 다음 단어가 무엇인지 알 수 없도록(이전 단어만 보도록) masking
        """ (마스크 예시)
        1 0 0 0 0
        1 1 0 0 0
        1 1 1 0 0
        1 1 1 1 0
        1 1 1 1 1
        """
        trg_len = trg.shape[1]
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device = self.device)).bool()

        trg_mask = trg_pad_mask & trg_sub_mask
        return trg_mask

    def forward(self, src, trg):
        # masking
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)

        # encoder
        enc_src = self.encoder(src, src_mask)

        # decoder
        output, attention = self.decoder(trg, enc_src, trg_mask, src_mask)

        return output, attention

In [76]:
# 하이퍼 파라미터 설정 및 모델 초기화

INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
HIDDEN_DIM = 256
ENC_LAYERS = 3
DEC_LAYERS = 3
ENC_HEADS = 8
DEC_HEADS = 8
ENC_PF_DIM = 512
DEC_PF_DIM = 512
ENC_DROPOUT = 0.1
DEC_DROPOUT = 0.1

SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

# Encoder, Decoder 객체 선언
enc = Encoder(INPUT_DIM, HIDDEN_DIM, ENC_LAYERS, ENC_HEADS, ENC_PF_DIM, ENC_DROPOUT, device)
dec = Decoder(OUTPUT_DIM, HIDDEN_DIM, DEC_LAYERS, DEC_HEADS, DEC_PF_DIM, DEC_DROPOUT, device)

# Transformer 객체 선언
model = Transformer(enc, dec, SRC_PAD_IDX, TRG_PAD_IDX, device).to(device)

In [77]:
# 모델 가중치 파라미터 초기화
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def initialize_weights(m):
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.xavier_uniform_(m.weight.data)

model.apply(initialize_weights)

Transformer(
  (encoder): Encoder(
    (tok_embedding): Embedding(7853, 256)
    (pos_embedding): Embedding(100, 256)
    (layers): ModuleList(
      (0-2): 3 x EncoderLayer(
        (self_attn_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (ff_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (self_attention): MultiHeadAttentionLayer(
          (fc_q): Linear(in_features=256, out_features=256, bias=True)
          (fc_k): Linear(in_features=256, out_features=256, bias=True)
          (fc_v): Linear(in_features=256, out_features=256, bias=True)
          (fc_o): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (positionwise_feedforward): PositionwiseFeedforwardLayer(
          (fc_1): Linear(in_features=256, out_features=512, bias=True)
          (fc_2): Linear(in_features=512, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
     

In [78]:
# 학습 및 평가 함수 정의
import torch.optim as optim

# Adam optimizer로 학습 최적화
LEARNING_RATE = 0.0005
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Padding 값 무시하도록 설정
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

# 모델 학습 함수 정의
def train(model, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0

    # 전체 학습 데이터를 확인하며
    for i, batch in enumerate(iterator):
        src = batch.src
        trg = batch.trg

        optimizer.zero_grad()

        # 출력 단어의 마지막 인덱스(eos)는 제외
        # 입력을 할 때는 sos부터 시작하도록 처리
        output, _ = model(src, trg[:,:-1])
        output_dim = output.shape[-1]
        output = output.contiguous().view(-1, output_dim)

        # 출력 단어의 인덱스 0(sos)은 제외
        trg = trg[:,1:].contiguous().view(-1)

        # 모델의 출력 결과와 타겟 문장을 비교하여 손실 계산
        loss = criterion(output, trg)
        loss.backward() # gradient 계산

        # gradient clipping 진행
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        # 파라미터 업데이트
        optimizer.step()

        # 전체 손실 값 계산
        epoch_loss += loss.item()

    return epoch_loss / len(iterator)

# 모델 평가 함수 정의
def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0

    with torch.no_grad():
        # 전체 평가 데이터를 확인하며
        for i, batch in enumerate(iterator):
            src = batch.src
            trg = batch.trg

            # 출력 단어의 마지막 인덱스(eos)는 제외
            # 입력을 할 때는 sos부터 시작하도록 처리
            output, _ = model(src, trg[:,:-1])
            output_dim = output.shape[-1]
            output = output.contiguous().view(-1, output_dim)

            # 출력 단어의 인덱스 0(sos)은 제외
            trg = trg[:,1:].contiguous().view(-1)

            # 모델의 출력 결과와 타겟 문장을 비교하여 손실 계산
            loss = criterion(output, trg)

            # 전체 손실 값 계산
            epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [79]:
# 학습 및 평가 진행 (학습횟수 epoch : 10)
import math
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [80]:
import time
import math
import random

N_EPOCHS = 10
CLIP = 1
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time() # 시작 시간 기록

    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)

    end_time = time.time() # 종료 시간 기록
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'transformer_german_to_english.pt')

    print(f'Epoch: {epoch + 1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):.3f}')
    print(f'\tValidation Loss: {valid_loss:.3f} | Validation PPL: {math.exp(valid_loss):.3f}')

Epoch: 01 | Time: 0m 15s
	Train Loss: 4.255 | Train PPL: 70.476
	Validation Loss: 3.115 | Validation PPL: 22.537
Epoch: 02 | Time: 0m 15s
	Train Loss: 2.853 | Train PPL: 17.340
	Validation Loss: 2.392 | Validation PPL: 10.934
Epoch: 03 | Time: 0m 14s
	Train Loss: 2.254 | Train PPL: 9.527
	Validation Loss: 2.060 | Validation PPL: 7.845
Epoch: 04 | Time: 0m 14s
	Train Loss: 1.893 | Train PPL: 6.640
	Validation Loss: 1.887 | Validation PPL: 6.601
Epoch: 05 | Time: 0m 14s
	Train Loss: 1.643 | Train PPL: 5.171
	Validation Loss: 1.763 | Validation PPL: 5.828
Epoch: 06 | Time: 0m 14s
	Train Loss: 1.456 | Train PPL: 4.289
	Validation Loss: 1.725 | Validation PPL: 5.615
Epoch: 07 | Time: 0m 14s
	Train Loss: 1.303 | Train PPL: 3.681
	Validation Loss: 1.702 | Validation PPL: 5.484
Epoch: 08 | Time: 0m 15s
	Train Loss: 1.178 | Train PPL: 3.248
	Validation Loss: 1.685 | Validation PPL: 5.392
Epoch: 09 | Time: 0m 15s
	Train Loss: 1.069 | Train PPL: 2.911
	Validation Loss: 1.701 | Validation PPL: 5.4

In [81]:
# 학습된 모델 저장
from google.colab import files

files.download('transformer_german_to_english.pt')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [82]:
# 번역(translation) 함수
def translate_sentence(sentence, src_field, trg_field, model, device, max_len=50, logging=True):
    model.eval() # 평가 모드

    if isinstance(sentence, str):
        nlp = spacy.load('de')
        tokens = [token.text.lower() for token in nlp(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    # 처음에 <sos> 토큰, 마지막에 <eos> 토큰 붙이기
    tokens = [src_field.init_token] + tokens + [src_field.eos_token]
    if logging:
        print(f"전체 소스 토큰: {tokens}")

    src_indexes = [src_field.vocab.stoi[token] for token in tokens]
    if logging:
        print(f"소스 문장 인덱스: {src_indexes}")

    src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)

    # 소스 문장에 따른 마스크 생성
    src_mask = model.make_src_mask(src_tensor)

    # 인코더(endocer)에 소스 문장을 넣어 출력 값 구하기
    with torch.no_grad():
        enc_src = model.encoder(src_tensor, src_mask)

    # 처음에는 <sos> 토큰 하나만 가지고 있도록 하기
    trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]

    for i in range(max_len):
        trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)

        # 출력 문장에 따른 마스크 생성
        trg_mask = model.make_trg_mask(trg_tensor)

        with torch.no_grad():
            output, attention = model.decoder(trg_tensor, enc_src, trg_mask, src_mask)

        # 출력 문장에서 가장 마지막 단어만 사용
        pred_token = output.argmax(2)[:,-1].item()
        trg_indexes.append(pred_token) # 출력 문장에 더하기

        # <eos>를 만나는 순간 끝
        if pred_token == trg_field.vocab.stoi[trg_field.eos_token]:
            break

    # 각 출력 단어 인덱스를 실제 단어로 변환
    trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]

    # 첫 번째 <sos>는 제외하고 출력 문장 반환
    return trg_tokens[1:], attention

In [83]:
example_idx = 10

src = vars(test_dataset.examples[example_idx])['src']
trg = vars(test_dataset.examples[example_idx])['trg']

print(f'소스 문장: {src}')
print(f'타겟 문장: {trg}')

translation, attention = translate_sentence(src, SRC, TRG, model, device, logging=True)

print("모델 출력 결과:", " ".join(translation))

소스 문장: ['eine', 'mutter', 'und', 'ihr', 'kleiner', 'sohn', 'genießen', 'einen', 'schönen', 'tag', 'im', 'freien', '.']
타겟 문장: ['a', 'mother', 'and', 'her', 'young', 'song', 'enjoying', 'a', 'beautiful', 'day', 'outside', '.']
전체 소스 토큰: ['<sos>', 'eine', 'mutter', 'und', 'ihr', 'kleiner', 'sohn', 'genießen', 'einen', 'schönen', 'tag', 'im', 'freien', '.', '<eos>']
소스 문장 인덱스: [2, 8, 364, 10, 134, 70, 624, 565, 19, 780, 200, 20, 88, 4, 3]
모델 출력 결과: a mother and young son enjoy an outdoor day . <eos>


In [84]:
# BLEU Score 계산
from torchtext.data.metrics import bleu_score

def show_bleu(data, src_field, trg_field, model, device, max_len=50):
    trgs = []
    pred_trgs = []
    index = 0

    for datum in data:
        src = vars(datum)['src']
        trg = vars(datum)['trg']

        pred_trg, _ = translate_sentence(src, src_field, trg_field, model, device, max_len, logging=False)

        # 마지막 <eos> 토큰 제거
        pred_trg = pred_trg[:-1]

        pred_trgs.append(pred_trg)
        trgs.append([trg])

        index += 1
        if (index + 1) % 100 == 0:
            print(f"[{index + 1}/{len(data)}]")
            print(f"예측: {pred_trg}")
            print(f"정답: {trg}")

    bleu = bleu_score(pred_trgs, trgs, max_n=4, weights=[0.25, 0.25, 0.25, 0.25])
    print(f'Total BLEU Score = {bleu*100:.2f}')

    individual_bleu1_score = bleu_score(pred_trgs, trgs, max_n=4, weights=[1, 0, 0, 0])
    individual_bleu2_score = bleu_score(pred_trgs, trgs, max_n=4, weights=[0, 1, 0, 0])
    individual_bleu3_score = bleu_score(pred_trgs, trgs, max_n=4, weights=[0, 0, 1, 0])
    individual_bleu4_score = bleu_score(pred_trgs, trgs, max_n=4, weights=[0, 0, 0, 1])

    print(f'Individual BLEU1 score = {individual_bleu1_score*100:.2f}')
    print(f'Individual BLEU2 score = {individual_bleu2_score*100:.2f}')
    print(f'Individual BLEU3 score = {individual_bleu3_score*100:.2f}')
    print(f'Individual BLEU4 score = {individual_bleu4_score*100:.2f}')

    cumulative_bleu1_score = bleu_score(pred_trgs, trgs, max_n=4, weights=[1, 0, 0, 0])
    cumulative_bleu2_score = bleu_score(pred_trgs, trgs, max_n=4, weights=[1/2, 1/2, 0, 0])
    cumulative_bleu3_score = bleu_score(pred_trgs, trgs, max_n=4, weights=[1/3, 1/3, 1/3, 0])
    cumulative_bleu4_score = bleu_score(pred_trgs, trgs, max_n=4, weights=[1/4, 1/4, 1/4, 1/4])

    print(f'Cumulative BLEU1 score = {cumulative_bleu1_score*100:.2f}')
    print(f'Cumulative BLEU2 score = {cumulative_bleu2_score*100:.2f}')
    print(f'Cumulative BLEU3 score = {cumulative_bleu3_score*100:.2f}')
    print(f'Cumulative BLEU4 score = {cumulative_bleu4_score*100:.2f}')

In [85]:
show_bleu(test_dataset, SRC, TRG, model, device)

[100/1000]
예측: ['a', 'group', 'of', 'asian', 'children', 'are', 'sitting', 'down', 'chairs', 'in', 'blue', 'chairs', '.']
정답: ['a', 'group', 'of', 'mostly', 'asian', 'children', 'sitting', 'at', 'cubicles', 'in', 'blue', 'chairs', '.']
[200/1000]
예측: ['all', 'standing', 'in', 'the', 'group', 'of', 'people', 'standing', 'under', 'umbrellas', '.']
정답: ['the', 'group', 'of', 'people', 'are', 'all', 'covered', 'by', 'umbrellas', '.']
[300/1000]
예측: ['a', 'goalie', 'in', 'a', 'yellow', 'jersey', 'is', 'blowing', 'the', 'goal', '.']
정답: ['a', 'goalie', 'in', 'a', 'yellow', 'field', 'is', 'protecting', 'the', 'goal', '.']
[400/1000]
예측: ['two', 'young', 'children', 'on', 'the', 'sand', '.']
정답: ['two', 'young', 'children', 'are', 'on', 'sand', '.']
[500/1000]
예측: ['two', 'medium', 'sized', 'dogs', 'run', 'across', 'the', 'snow', '.']
정답: ['two', 'medium', 'sized', 'dogs', 'run', 'across', 'the', 'snow', '.']
[600/1000]
예측: ['a', 'group', 'of', 'men', 'sit', 'behind', 'talk', 'while', 'sitting