**참고문헌**  
\[1\] [16-01 트랜스포머(Transformer)](https://wikidocs.net/31379)  
\[2\] [The Annotated Transformer](https://nlp.seas.harvard.edu/2018/04/03/attention.html#prelims)  

In [1]:
from transformers import *
from konlpy.tag import Okt
import spacy
import numpy as np
import time
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.autograd import Variable
from functools import partial
from tqdm import tqdm
import seaborn
seaborn.set_context(context="talk")
%matplotlib inline


![](../images/the-annotated-transformer_14_0.png)



Figure 1: The Transformer - model architecture.

In [2]:
# Tokenizers used when building the dataset: `okt.morphs` is applied to the
# Korean sentences and the spaCy pipeline `nlp` to the English sentences.
okt = Okt()
nlp = spacy.load("en_core_web_sm")
# Reserved vocabulary entries (the same list is En2KoDataset's default).
special_tokens = ["<pad>", "<unk>", "<sos>", "<eos>"]

In [3]:
# Toy parallel corpus: each entry holds one English sentence under "en" and
# its Korean counterpart under "ko".
parallel_corpus = [
    {"en": "Hello.", "ko": "안녕하세요."},
    {"en": "How are you?", "ko": "잘 지내세요?"},
    {"en": "I am a student.", "ko": "저는 학생입니다."},
    {"en": "What is your name?", "ko": "당신의 이름은 무엇인가요?"},
    {"en": "My name is John.", "ko": "제 이름은 존입니다."},
    {"en": "Nice to meet you.", "ko": "만나서 반갑습니다."},
    {"en": "Good morning!", "ko": "좋은 아침이에요!"},
    {"en": "Good night.", "ko": "안녕히 주무세요."},
    {"en": "Where are you going?", "ko": "어디 가세요?"},
    {"en": "I’m going home.", "ko": "집에 가는 중이에요."},
    {"en": "See you later.", "ko": "나중에 봐요."},
    {"en": "Thank you very much.", "ko": "정말 감사합니다."},
    {"en": "You're welcome.", "ko": "천만에요."},
    {"en": "Excuse me.", "ko": "실례합니다."},
    {"en": "I’m sorry.", "ko": "죄송합니다."},
    {"en": "No problem.", "ko": "괜찮아요."},
    {"en": "Do you speak English?", "ko": "영어 할 줄 아세요?"},
    {"en": "I speak a little Korean.", "ko": "한국어 조금 할 줄 알아요."},
    {"en": "How much is this?", "ko": "이거 얼마예요?"},
    {"en": "Where is the restroom?", "ko": "화장실 어디예요?"},
    {"en": "Can you help me?", "ko": "도와주실 수 있나요?"},
    {"en": "I don’t understand.", "ko": "이해하지 못했어요."},
    {"en": "Please speak slowly.", "ko": "천천히 말씀해 주세요."},
    {"en": "I’m hungry.", "ko": "배고파요."},
    {"en": "I’m tired.", "ko": "피곤해요."},
    {"en": "What time is it?", "ko": "지금 몇 시예요?"},
    {"en": "Today is Monday.", "ko": "오늘은 월요일이에요."},
    {"en": "It’s raining.", "ko": "비가 오고 있어요."},
    {"en": "I like music.", "ko": "저는 음악을 좋아해요."},
    {"en": "Let’s go together.", "ko": "같이 가요."},
    {"en": "Have a nice day!", "ko": "좋은 하루 되세요!"},
    {"en": "Goodbye!", "ko": "안녕히 가세요!"}
]

In [4]:
class En2KoDataset(Dataset):
    """Parallel-sentence dataset yielding ``(korean_ids, english_ids)`` pairs.

    Note that, despite the class name, the Korean sentence is stored as the
    source (``src``) and the English sentence as the target (``trg``).

    Args:
        corpus: iterable of dicts, each with an ``'en'`` and a ``'ko'`` string.
        okt: Korean tokenizer exposing ``morphs(sentence) -> list[str]``.
        nlp: English tokenizer; ``nlp(sentence)`` must yield objects with a
            ``.text`` attribute.
        random: stored on the instance as ``self.random``; not otherwise used
            by this class.
        special_tokens: reserved tokens placed at the front of both
            vocabularies. Defaults to ``["<pad>", "<unk>", "<sos>", "<eos>"]``.
            Encoding requires ``"<unk>"``, ``"<sos>"`` and ``"<eos>"`` to be
            present.
    """

    def __init__(self, corpus, okt, nlp, random=False, special_tokens=None):
        # Avoid a shared mutable default; copy so later caller-side mutation
        # cannot alter the vocabularies.
        if special_tokens is None:
            special_tokens = ["<pad>", "<unk>", "<sos>", "<eos>"]
        else:
            special_tokens = list(special_tokens)

        # Materialize once so a one-shot iterable can be traversed twice.
        corpus = list(corpus)

        self.random = random
        # Keep the injected tokenizers so encoding does not depend on globals.
        self.okt = okt
        self.nlp = nlp
        self.special_tokens = special_tokens

        self.src_sentences = []
        self.trg_sentences = []
        self.en_tokens = set()
        self.ko_tokens = set()

        # First pass: collect raw sentences and the token sets of both sides.
        for line in corpus:
            self.trg_sentences.append(line['en'])
            self.src_sentences.append(line['ko'])
            self.ko_tokens.update(self.okt.morphs(line['ko']))
            self.en_tokens.update(token.text for token in self.nlp(line['en']))

        # Special tokens come first, followed by the sorted corpus tokens, so
        # the index of each special token equals its position in the list.
        self.en_vocab = special_tokens + sorted(self.en_tokens)
        self.en_token2idx = {token: idx for idx, token in enumerate(self.en_vocab)}
        self.en_idx2token = {idx: token for token, idx in self.en_token2idx.items()}

        self.ko_vocab = special_tokens + sorted(self.ko_tokens)
        self.ko_token2idx = {token: idx for idx, token in enumerate(self.ko_vocab)}
        self.ko_idx2token = {idx: token for token, idx in self.ko_token2idx.items()}

        self.n_en_vocab = len(self.en_vocab)
        self.n_ko_vocab = len(self.ko_vocab)

        # Second pass: encode every sentence pair into id sequences.
        self.srcs = []
        self.trgs = []
        for line in corpus:
            self.trgs.append(self.en_encode(line['en'], self.en_token2idx))
            self.srcs.append(self.ko_encode(line['ko'], self.ko_token2idx))

        self.n_sentences = len(self.src_sentences)
        print(self.n_sentences)

    def ko_encode(self, sentence, token2idx):
        """Encode a Korean sentence as ``[<sos>] + token ids + [<eos>]``.

        Tokens missing from ``token2idx`` map to the ``"<unk>"`` id.
        """
        tokens = self.okt.morphs(sentence)
        ids = [token2idx.get(tok, token2idx["<unk>"]) for tok in tokens]
        return [token2idx["<sos>"]] + ids + [token2idx["<eos>"]]

    def en_encode(self, sentence, token2idx):
        """Encode an English sentence as ``[<sos>] + token ids + [<eos>]``.

        Tokens missing from ``token2idx`` map to the ``"<unk>"`` id.
        """
        tokens = [token.text for token in self.nlp(sentence)]
        ids = [token2idx.get(tok, token2idx["<unk>"]) for tok in tokens]
        return [token2idx["<sos>"]] + ids + [token2idx["<eos>"]]

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.src_sentences)

    def __getitem__(self, index):
        """Return the ``(source ids, target ids)`` pair stored at ``index``."""
        return self.srcs[index], self.trgs[index]

def custom_collate_fn(raw_data, device):
    """Collate ``(src_ids, trg_ids)`` samples into one padded ``Batch``.

    Each side is converted to tensors and right-padded with 0 to the longest
    sequence of the batch (batch-first layout); the padded tensors are then
    handed to ``Batch`` together with ``pad=0`` and ``device``.
    """
    def _to_padded(sequences):
        tensors = [torch.tensor(ids) for ids in sequences]
        return pad_sequence(tensors, batch_first=True, padding_value=0)

    # Split the list of (src, trg) pairs into two parallel tuples.
    src_seqs, trg_seqs = zip(*raw_data)

    return Batch(_to_padded(src_seqs), _to_padded(trg_seqs), pad=0, device=device)

In [12]:
#``` python
def run_epoch(epoch, dataloader, model, loss_compute, batch_size, n_totals):
    """Run one pass over ``dataloader`` and return the mean loss per token.

    Args:
        epoch: zero-based epoch index; shown one-based in the progress bar.
        dataloader: iterable of batches exposing ``src``, ``trg``,
            ``src_mask``, ``trg_mask``, ``trg_y`` and ``ntokens``.
        model: object whose ``forward(src, trg, src_mask, trg_mask)`` produces
            the output passed to ``loss_compute``.
        loss_compute: callable invoked as
            ``loss_compute(out, batch.trg_y, batch.ntokens)``; its return value
            is accumulated as the batch loss.
        batch_size: number of samples per batch (progress-bar length only).
        n_totals: number of samples in the dataset (progress-bar length only).

    Returns:
        Accumulated loss divided by the accumulated token count.
    """
    # Ceiling division: a trailing partial batch still counts as one step, and
    # a dataset smaller than one batch yields 1 rather than 0.
    total = (n_totals + batch_size - 1) // batch_size
    total_tokens = 0
    total_loss = 0
    data_iterator = tqdm(dataloader, total=total, ncols=150, bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}')
    for batch in data_iterator:
        out = model.forward(batch.src, batch.trg,
                            batch.src_mask, batch.trg_mask)
        loss = loss_compute(out, batch.trg_y, batch.ntokens)
        total_loss += loss
        total_tokens += batch.ntokens
        # Show the running per-token average, prefixed by the 1-based epoch.
        data_iterator.set_description(f'{epoch + 1} loss: {total_loss / total_tokens:.6f}')
    return total_loss / total_tokens
#```

In [6]:
# Model hyperparameters.
N = 2
d_model = 512
d_ff = 2048
h = 8
dropout = 0.1

# Number of sentence pairs per mini-batch.
batch_size = 32

In [7]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Bind the device so the DataLoader can call the collate function with the
# batch as its only argument.
collate_with_device = partial(custom_collate_fn, device=device)
# The constructor prints the number of sentence pairs it loaded.
dataset = En2KoDataset(parallel_corpus, okt, nlp)
loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_with_device)

32


### Set hyperparameters

In [None]:
## loading datasets:

In [8]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Bind the device so the DataLoader can call the collate function with the
# batch as its only argument.
collate_with_device = partial(custom_collate_fn, device=device)
dataset = En2KoDataset(parallel_corpus, okt, nlp)
# Use the shared batch_size (rather than a literal 32) so the loader always
# agrees with the value later handed to run_epoch.
loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_with_device)

# data size
n_src_vocab = dataset.n_ko_vocab  # source side is Korean
n_tgt_vocab = dataset.n_en_vocab  # target side is English
n_totals = len(dataset)

32


### training

In [13]:
# Index 0 is "<pad>" in both vocabularies, so it is passed as padding_idx.
criterion = LabelSmoothing(size=n_tgt_vocab, padding_idx=0, smoothing=0.0)
# Take the depth from the shared hyperparameter N instead of repeating the
# literal, so changing N in the hyperparameter cell actually takes effect.
model = make_model(n_src_vocab, n_tgt_vocab, N=N)
model = model.to(device)
model_opt = NoamOpt(model.src_embed[0].d_model, 1, 400,
        torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))
# Built once: the arguments are identical for every epoch.
# NOTE(review): assumes SimpleLossCompute keeps no per-epoch state - confirm.
loss_compute = SimpleLossCompute(model.generator, criterion, model_opt)
for epoch in range(1000):
    model.train()
    run_epoch(epoch, loader, model, loss_compute,
              batch_size=batch_size, n_totals=n_totals)


1 loss: 5.564037: 100%|██████████| 1/1 [00:00<00:00,  4.14it/s]                                                                                       
2 loss: 5.579782: 100%|██████████| 1/1 [00:00<00:00, 22.47it/s]                                                                                       
3 loss: 5.345482: 100%|██████████| 1/1 [00:00<00:00, 33.05it/s]                                                                                       
4 loss: 4.992937: 100%|██████████| 1/1 [00:00<00:00, 34.02it/s]                                                                                       
5 loss: 4.799294: 100%|██████████| 1/1 [00:00<00:00, 35.16it/s]                                                                                       
6 loss: 4.466684: 100%|██████████| 1/1 [00:00<00:00, 34.47it/s]                                                                                       
7 loss: 4.096347: 100%|██████████| 1/1 [00:00<00:00, 35.45it/s]                               

### greedy decoding

In [14]:

def greedy_decode(model, src, src_mask, max_len, start_symbol):
    """Greedily decode a single source sequence.

    Starting from ``start_symbol``, the highest-scoring next token is appended
    ``max_len - 1`` times. Decoding does not stop early at an end-of-sequence
    token; callers are expected to truncate the result themselves.

    Args:
        model: object providing ``encode(src, src_mask)``,
            ``decode(memory, src_mask, ys, trg_mask)`` and ``generator(x)``.
        src: source id tensor for one sentence (batch size 1).
        src_mask: mask passed unchanged to ``encode`` and ``decode``.
        max_len: length of the returned sequence, start symbol included.
        start_symbol: id used as the first decoder input.

    Returns:
        Tensor of shape ``(1, max_len)`` with the dtype and device of ``src``.
    """
    # Only integer ids are returned, so no gradient can flow out of this
    # function; disabling autograd avoids building an unused graph.
    with torch.no_grad():
        memory = model.encode(src, src_mask)
        ys = torch.full((1, 1), start_symbol, dtype=src.dtype, device=src.device)
        for _ in range(max_len - 1):
            trg_mask = subsequent_mask(ys.size(1)).type_as(src)
            out = model.decode(memory, src_mask, ys, trg_mask)
            # Score the vocabulary for the most recent position only.
            prob = model.generator(out[:, -1])
            _, next_word = torch.max(prob, dim=1)
            next_id = next_word[0].item()
            step = torch.full((1, 1), next_id, dtype=ys.dtype, device=ys.device)
            ys = torch.cat([ys, step], dim=1)
    return ys

In [15]:
model.eval()

print(f"input : {dataset.src_sentences[0]}")
print(f"output : {dataset.trg_sentences[0]}")

# Build the source as (batch=1, src_len), the same layout the training
# batches use; the previous LongTensor([...]).unsqueeze(0) produced an extra
# leading dimension, i.e. (1, 1, src_len).
src_ids, _ = dataset[0]
src = torch.tensor(src_ids, dtype=torch.long, device=device).unsqueeze(0)
# Every source position is a real token, so nothing is masked out.
src_mask = torch.ones(1, 1, src.size(1), device=device)

# Look the start symbol up instead of hard-coding its index.
result = greedy_decode(model, src, src_mask, max_len=10,
                       start_symbol=dataset.en_token2idx["<sos>"])
res = result.detach().cpu().numpy()[0]
print("predict : ", end=" ")
for r in res.tolist():
    token = dataset.en_idx2token[r]
    if token == '<sos>':
        continue
    if token == '<eos>':
        # Everything after the end marker is padding-like noise.
        break
    print(token, end=" ")

input : 안녕하세요.
output : Hello.
predict :  Hello . 

In [17]:
print("results for simple translator.")
print("-------------------------------------------------------------------------------\n")
# Only the index is needed, so iterate over positions rather than samples.
for i in range(len(dataset)):
    print(f"input : {dataset.src_sentences[i]}")
    print(f"output : {dataset.trg_sentences[i]}")

    # Build the source as (batch=1, src_len), the same layout the training
    # batches use; the previous LongTensor([...]).unsqueeze(0) produced an
    # extra leading dimension, i.e. (1, 1, src_len).
    src_ids, _ = dataset[i]
    src = torch.tensor(src_ids, dtype=torch.long, device=device).unsqueeze(0)
    # Every source position is a real token, so nothing is masked out.
    src_mask = torch.ones(1, 1, src.size(1), device=device)

    # Look the start symbol up instead of hard-coding its index.
    result = greedy_decode(model, src, src_mask, max_len=10,
                           start_symbol=dataset.en_token2idx["<sos>"])
    res = result.detach().cpu().numpy()[0]
    print("predict : ", end=" ")
    for r in res.tolist():
        token = dataset.en_idx2token[r]
        if token == '<sos>':
            continue
        if token == '<eos>':
            # Stop printing at the end-of-sequence marker.
            break
        print(token, end=" ")
    print("")
    print("-------------------------------------------------------------------------------")

results for simple translator.
-------------------------------------------------------------------------------

input : 안녕하세요.
output : Hello.
predict :  Hello . 
-------------------------------------------------------------------------------
input : 잘 지내세요?
output : How are you?
predict :  How are you ? 
-------------------------------------------------------------------------------
input : 저는 학생입니다.
output : I am a student.
predict :  I am a student . 
-------------------------------------------------------------------------------
input : 당신의 이름은 무엇인가요?
output : What is your name?
predict :  What is your name ? 
-------------------------------------------------------------------------------
input : 제 이름은 존입니다.
output : My name is John.
predict :  My name is John . 
-------------------------------------------------------------------------------
input : 만나서 반갑습니다.
output : Nice to meet you.
predict :  Nice to meet you . 
-----------------------------------------------------------------