<h1 id="tocheading">Spring 2018 NLP Class Project: Neural Machine Translation</h1>
<div id="toc"></div>

In [113]:
%%javascript
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')

<IPython.core.display.Javascript object>

In [35]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import pandas as pd
import spacy
import pdb
import os
from underthesea import word_tokenize
import jieba
import numpy as np

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [19]:
# ! pip install spacy && python -m spacy download en

## Part 0: Project Overview

The goal of this project is to build a neural machine translation system and experience how recent advances have made their way. Each team will build the following sequence of neural translation systems for two language pairs, __Vietnamese (Vi)→English (En)__ and __Chinese (Zh)→En__ (prepared corpora is be provided):

1. Recurrent neural network based encoder-decoder without attention
2. Recurrent neural network based encoder-decoder with attention
2. Replace the recurrent encoder with either convolutional or self-attention based encoder.
4. [Optional] Build either or both fully self-attention translation system or/and multilingual translation system.

## Part 1: Data Upload & Preprocessing

In [38]:
# start of sentence
SOS_token = 0
# end of sentence
EOS_token = 1

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [39]:
# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """About "NFC" and "NFD": 
    
    For each character, there are two normal forms: normal form C 
    and normal form D. Normal form D (NFD) is also known as canonical 
    decomposition, and translates each character into its decomposed form. 
    Normal form C (NFC) first applies a canonical decomposition, then composes 
    pre-combined characters again.
    
    About unicodedata.category: 
    
    Returns the general category assigned to the Unicode character 
    unichr as string."""
    
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Trim
def normalizeString(s):
    s = unicodeToAscii(s.strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

In [94]:
def readLangs(lang1, lang2, reverse=False,
             dataset="train"):
    
    """Takes as input;
    - lang1, lang2: either (vi, en) or (zh, en)
    - dataset: one of ("train","dev","test")"""
    print("Reading lines...")
    eos = [".","?","!","\n"]
    # Read the pretokenized lang1 file and split into lines
    lang1_lines = open("../data/tokens_and_preprocessing_em/pretokenized_data/iwslt-%s-%s-processed/%s.tok.%s" % (lang1, lang2, dataset, lang1), encoding="utf-8").\
        read().strip().split("\n")
    # Read the lang2 file and split into lines
    lang2_lines = open("../data/tokens_and_preprocessing_em/pretokenized_data/iwslt-%s-%s-processed/%s.tok.%s" % (lang1, lang2, dataset, lang2), encoding="utf-8").\
        read().strip().split("\n")
    
    # create sentence pairs (lists of length 2 that consist of string pairs)
    # e.g. ["And we &apos;re going to tell you some stories from the sea here in video .",
    #       "我们 将 用 一些 影片 来讲 讲述 一些 深海 海里 的 故事  "]
    # check if there are the same number of sentences in each set
    assert len(lang1_lines) == len(lang2_lines), "Two languages must have the same number of sentences. "+ str(len(lang1_lines)) + " sentences were passed for " + str(lang1) + "." + str(len(lang2_lines)) + " sentences were passed for " + str(lang2)+"."
    # normalize if not Chinese, Chinese normalization is already handeled
    if lang1 == "zh":
        lang1_lines = lang1_lines
    else:
        lang1_lines = [normalizeString(s) for s in lang1_lines]
    lang2_lines = [normalizeString(s) for s in lang2_lines]
    # construct pairs
    pair_ran = range(len(lang1_lines))
    pairs = [[lang1_lines[i]] + [lang2_lines[i]] for i in pair_ran]
    
#     # Split every line into pairs and normalize
#     pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [95]:
def prepareData(lang1, lang2, reverse=False, dataset="train"):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse, dataset=dataset)
    print("Read %s sentence pairs" % len(pairs))
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs

# example
input_lang, output_lang, pairs = prepareData('vi', 'en', False, dataset="train")
print(random.choice(pairs))

Reading lines...
Read 133317 sentence pairs
Trimmed to 133317 sentence pairs
Counting words...
Counted words:
vi 16144
en 47568
['Anh a lam rat tot voi at lien voi mat at .', 'You did a great job with the land the dirt .']


In [96]:
input_lang, output_lang, pairs = prepareData('zh', 'en', False, dataset="train")
print(random.choice(pairs))

Reading lines...
Read 213376 sentence pairs
Trimmed to 213376 sentence pairs
Counting words...
Counted words:
zh 88917
en 59329
['而 它 就 分布 在 细胞 细胞膜 胞膜 中 并且 自身 还 带 了 个 小孔  ', 'And it sits in the membrane of the cell and it apos s got a pore in it .']


### 1.1 Vietnamese to English

In [37]:
# # Please find the original tokenizing code provided by Elman Mansimov in the following link:
# # https://github.com/derincen/neural-machine-translation/tree/master/data/tokens_and_preprocessing_em/preprocess_translation

# def tokenize_vi(f_names, f_out_names):
#     for f_name, f_out_name in zip(f_names, f_out_names):
#         lines = open(f_name, 'r').readlines()
#         tok_lines = open(f_out_name, 'w')
#         for i, sentence in enumerate(lines):
#             if i > 0 and i % 1000 == 0:
#                 print (f_name.split('/')[-1], i, len(lines))
#             tok_lines.write(word_tokenize(sentence, format="text") + '\n')
#         tok_lines.close()

# def tokenize_en(f_names, f_out_names):
#     tokenizer = spacy.load('en_core_web_sm')

#     for f_name, f_out_name in zip(f_names, f_out_names):
#         lines = open(f_name, 'r').readlines()
#         tok_lines = open(f_out_name, 'w')
#         for i, sentence in enumerate(lines):
#             if i > 0 and i % 1000 == 0:
#                 print (f_name.split('/')[-1], i, len(lines))
#             # replaced tokenizer(sentence) with str(tokenizer(sentence)) to avoid 
#             # type error while joining
#             tok_lines.write(' '.join(str(tokenizer(sentence))) + '\n')
#         tok_lines.close()


# if __name__ == "__main__":
#     root = '../data/tokens_and_preprocessing_em/pretokenized_data/iwslt-vi-en-processed/'
#     tokenize_vi([os.path.join(root, 'train.vi'), os.path.join(root, 'dev.vi'), 
#                  os.path.join(root, 'test.vi')],\
#                [os.path.join(root, 'train.tok.vi'), os.path.join(root, 'dev.tok.vi'), 
#                 os.path.join(root, 'test.tok.vi')])

#     tokenize_en([os.path.join(root, 'train.en'), os.path.join(root, 'dev.en'), 
#                  os.path.join(root, 'test.en')],\
#                 [os.path.join(root, 'train.tok.en'), os.path.join(root, 'dev.tok.en'), 
#                  os.path.join(root, 'test.tok.en')])


train.vi 1000 133317
train.vi 2000 133317
train.vi 3000 133317
train.vi 4000 133317
train.vi 5000 133317
train.vi 6000 133317
train.vi 7000 133317
train.vi 8000 133317
train.vi 9000 133317
train.vi 10000 133317
train.vi 11000 133317
train.vi 12000 133317
train.vi 13000 133317
train.vi 14000 133317
train.vi 15000 133317
train.vi 16000 133317
train.vi 17000 133317
train.vi 18000 133317
train.vi 19000 133317
train.vi 20000 133317
train.vi 21000 133317
train.vi 22000 133317
train.vi 23000 133317
train.vi 24000 133317
train.vi 25000 133317
train.vi 26000 133317
train.vi 27000 133317
train.vi 28000 133317
train.vi 29000 133317
train.vi 30000 133317
train.vi 31000 133317
train.vi 32000 133317
train.vi 33000 133317
train.vi 34000 133317
train.vi 35000 133317
train.vi 36000 133317
train.vi 37000 133317
train.vi 38000 133317
train.vi 39000 133317
train.vi 40000 133317
train.vi 41000 133317
train.vi 42000 133317
train.vi 43000 133317
train.vi 44000 133317
train.vi 45000 133317
train.vi 46000 1333

In [45]:
# Format: languagepair_language_dataset
# Train 
vien_vi_train, vien_en_train, vi_en_train_pairs = prepareData('vi', 'en', False, dataset="train")
# Dev 
vien_vi_dev, vien_en_dev, vi_en_dev_pairs = prepareData('vi', 'en', False, dataset="dev")
# Test
vien_vi_test, vien_en_test, vi_en_test_pairs = prepareData('vi', 'en', False, dataset="test")

Reading lines...
Read 133317 sentence pairs
Trimmed to 133317 sentence pairs
Counting words...
Counted words:
vi 16144
en 47568
Reading lines...
Read 1268 sentence pairs
Trimmed to 1268 sentence pairs
Counting words...
Counted words:
vi 1370
en 3816
Reading lines...
Read 1553 sentence pairs
Trimmed to 1553 sentence pairs
Counting words...
Counted words:
vi 1325
en 3619


### 1.2 Chinese to English

In [89]:
# # Please find the original tokenizing code provided by Elman Mansimov in the following link:
# # https://github.com/derincen/neural-machine-translation/tree/master/data/tokens_and_preprocessing_em/preprocess_translation

# def tokenize_zh(f_names, f_out_names):
#     for f_name, f_out_name in zip(f_names, f_out_names):
#         lines = open(f_name, 'r').readlines()
#         tok_lines = open(f_out_name, 'w')
#         for i, sentence in enumerate(lines):
#             if i > 0 and i % 1000 == 0:
#                 print (f_name.split('/')[-1], i, len(lines))
#             tok_lines.write(' '.join(jieba.cut(sentence, cut_all=True)))
#         tok_lines.close()

# def tokenize_en(f_names, f_out_names):
#     tokenizer = spacy.load('en_core_web_sm')

#     for f_name, f_out_name in zip(f_names, f_out_names):
#         lines = open(f_name, 'r').readlines()
#         tok_lines = open(f_out_name, 'w')
#         for i, sentence in enumerate(lines):
#             if i > 0 and i % 1000 == 0:
#                 print (f_name.split('/')[-1], i, len(lines))
#             # replaced tokenizer(sentence) with str(tokenizer(sentence)) to avoid 
#             # type error while joining
#             tok_lines.write(' '.join(str(tokenizer(sentence))) + '\n')
#         tok_lines.close()

# if __name__ == "__main__":
#     root = '../data/tokens_and_preprocessing_em/pretokenized_data/iwslt-zh-en-processed/'
#     tokenize_zh([os.path.join(root, 'dev.zh'), os.path.join(root, 'test.zh'), os.path.join(root, 'train.zh')],\
#                 [os.path.join(root, 'dev.tok.zh'), os.path.join(root, 'test.tok.zh'), os.path.join(root, 'train.tok.zh')])

# #     tokenize_en([os.path.join(root, 'dev.en'), os.path.join(root, 'test.en'), os.path.join(root, 'train.en')],\
# #                [os.path.join(root, 'dev.tok.en'), os.path.join(root, 'test.tok.en'), os.path.join(root, 'train.tok.en')])


dev.zh 1000 1261
test.zh 1000 1397
train.zh 1000 213377
train.zh 2000 213377
train.zh 3000 213377
train.zh 4000 213377
train.zh 5000 213377
train.zh 6000 213377
train.zh 7000 213377
train.zh 8000 213377
train.zh 9000 213377
train.zh 10000 213377
train.zh 11000 213377
train.zh 12000 213377
train.zh 13000 213377
train.zh 14000 213377
train.zh 15000 213377
train.zh 16000 213377
train.zh 17000 213377
train.zh 18000 213377
train.zh 19000 213377
train.zh 20000 213377
train.zh 21000 213377
train.zh 22000 213377
train.zh 23000 213377
train.zh 24000 213377
train.zh 25000 213377
train.zh 26000 213377
train.zh 27000 213377
train.zh 28000 213377
train.zh 29000 213377
train.zh 30000 213377
train.zh 31000 213377
train.zh 32000 213377
train.zh 33000 213377
train.zh 34000 213377
train.zh 35000 213377
train.zh 36000 213377
train.zh 37000 213377
train.zh 38000 213377
train.zh 39000 213377
train.zh 40000 213377
train.zh 41000 213377
train.zh 42000 213377
train.zh 43000 213377
train.zh 44000 213377
train.

In [97]:
# Format: languagepair_language_dataset
# Train 
zhen_zh_train, zhen_en_train, zh_en_train_pairs = prepareData('zh', 'en', False, dataset="train")
# Dev 
zhen_zh_dev, zhen_en_dev, zh_en_dev_pairs = prepareData('zh', 'en', False, dataset="dev")
# Test
zhen_zh_test, zhen_en_test, zh_en_test_pairs = prepareData('zh', 'en', False, dataset="test")

Reading lines...
Read 213376 sentence pairs
Trimmed to 213376 sentence pairs
Counting words...
Counted words:
zh 88917
en 59329
Reading lines...
Read 1261 sentence pairs
Trimmed to 1261 sentence pairs
Counting words...
Counted words:
zh 6132
en 3916
Reading lines...
Read 1397 sentence pairs
Trimmed to 1397 sentence pairs
Counting words...
Counted words:
zh 5214
en 3423


In [112]:
zh_en_train_pairs[3]

['我们 将 用 一些 影片 来讲 讲述 一些 深海 海里 的 故事  ',
 'And we apos re going to tell you some stories from the sea here in video .']

### 1.3: Check Source & Target Vocabs

Since the source and target languages can have very different table lookup layers, it's good practice to have separate vocabularies for each. Thus, we build vocabularies for each language that we will be using. 

In the first class (Lang) of this section, we have already defined vocabularies for all languages. So, there is no need to redefine another function. We chech each vocabulary below.

#### Chinese Vocabulary

In [102]:
print ("The number of words in Chinese training corpus is " + str(zhen_zh_train.n_words))

The number of words in Chinese training corpus is 88917


In [103]:
zhen_zh_train.word2index["格"]

10479

In [104]:
zhen_zh_train.index2word[10479]

'格'

#### Vietnamese Vocabulary

In [67]:
print ("The number of words in Vietnamese training corpus is " + str(vien_vi_train.n_words))

The number of words in Vietnamese training corpus is 16144


In [81]:
vien_vi_train.word2index["Hamburger"]

6752

In [83]:
vien_vi_train.index2word[6752]

'Hamburger'

#### English Vocabulary for Zh-En

In [68]:
print ("The number of words in English training corpus for Zh-En is " + str(zhen_en_train.n_words))

The number of words in English training corpus for Zh-En is 59329


In [106]:
zhen_en_train.word2index["translate"]

1451

In [108]:
zhen_en_train.index2word[1451]

'translate'

#### English Vocabulary for Vi-En

In [109]:
print ("The number of words in English training corpus for Vi-En is " + str(vien_en_train.n_words))

The number of words in English training corpus for Vi-En is 47568


In [110]:
vien_en_train.word2index["machine"]

847

In [111]:
vien_en_train.index2word[847]

'machine'

### 1.4 Prepare Dataloaders

In [136]:
# convert token to id in the dataset
def token2index_dataset(paired_tokens, 
                        lang1_token2id_vocab,
                        lang2_token2id_vocab):
    """Takes as input:
    - paired_tokens: a list of sentence pairs that consist of source & target lang sentences.
    - lang1_token2id_vocab: token2index vocabulary for the first language. 
                            Get by method Lang_dataset.word2index
    - lang2_token2id_vocab: token2index vocabulary for the second language. 
                            Get by method Lang_dataset.word2index
                            
    Returns:
    - indices_data_lang_1, indices_data_lang2: A list of lists where each sub-list holds corresponding indices for each
                                               token in the sentence."""
    indices_data_lang_1, indices_data_lang_2 = [], []
    vocabs = [lang1_token2id_vocab, lang2_token2id_vocab]
    
    # lang1
    for t in range(len(paired_tokens)):
        index_list = [vocabs[0][token] if token in vocabs[0] else UNK_IDX for token in paired_tokens[t][0]]
        indices_data_lang_1.append(index_list)
    # lang2
    for t in range(len(paired_tokens)):
        index_list = [vocabs[1][token] if token in vocabs[1] else UNK_IDX for token in paired_tokens[t][1]]
        indices_data_lang_2.append(index_list)
        
    return indices_data_lang_1, indices_data_lang_2

# train indices
zhen_zh_train_indices, zhen_en_train_indices = token2index_dataset(zh_en_train_pairs,
                                                                   zhen_zh_train.word2index,
                                                                   zhen_en_train.word2index)

vien_vi_train_indices, vien_en_train_indices = token2index_dataset(vi_en_train_pairs,
                                                                   vien_vi_train.word2index,
                                                                   vien_en_train.word2index)

# dev indices
zhen_zh_dev_indices, zhen_en_dev_indices = token2index_dataset(zh_en_dev_pairs,
                                                               zhen_zh_dev.word2index,
                                                               zhen_en_dev.word2index)

vien_vi_dev_indices, vien_en_dev_indices = token2index_dataset(vi_en_dev_pairs,
                                                               vien_vi_dev.word2index,
                                                               vien_en_dev.word2index)

# test indices
zhen_zh_test_indices, zhen_en_test_indices = token2index_dataset(zh_en_test_pairs,
                                                                 zhen_zh_test.word2index,
                                                                 zhen_en_test.word2index)

vien_vi_test_indices, vien_en_test_indices = token2index_dataset(vi_en_test_pairs,
                                                                 vien_vi_test.word2index,
                                                                 vien_en_test.word2index)

In [141]:
# check length
# train
print ("Chinese training set length = "+str(len(zhen_zh_train_indices)))
print ("Chinese-English (En) training set length = "+str(len(zhen_en_train_indices)))
print ("\nVietnamese training set length = "+str(len(vien_vi_train_indices)))
print ("Vietnamese-English (En) training set length = "+str(len(vien_en_train_indices)))
# dev
print ("\nChinese dev set length = "+str(len(zhen_zh_dev_indices)))
print ("Chinese-English (En) dev set length = "+str(len(zhen_en_dev_indices)))
print ("\nVietnamese dev set length = "+str(len(vien_vi_dev_indices)))
print ("Vietnamese-English (En) dev set length = "+str(len(vien_en_dev_indices)))
# test
print ("\nChinese test set length = "+str(len(zhen_zh_test_indices)))
print ("Chinese-English (En) test set length = "+str(len(zhen_en_test_indices)))
print ("\nVietnamese test set length = "+str(len(vien_vi_test_indices)))
print ("Vietnamese-English (En) test set length = "+str(len(vien_en_test_indices)))

Chinese training set length = 213376
Chinese-English (En) training set length = 213376

Vietnamese training set length = 133317
Vietnamese-English (En) training set length = 133317

Chinese dev set length = 1261
Chinese-English (En) dev set length = 1261

Vietnamese dev set length = 1268
Vietnamese-English (En) dev set length = 1268

Chinese test set length = 1397
Chinese-English (En) test set length = 1397

Vietnamese test set length = 1553
Vietnamese-English (En) test set length = 1553


#### Dataloader

In [None]:
## TODO

## Part 2: Evaluation Metric

We use BLEU as the evaluation metric. Specifically, we focus on the corpus-level BLEU function. 

The code for BLEU is taken from https://github.com/mjpost/sacreBLEU/blob/master/sacrebleu.py#L1022-L1080

In [52]:
! pip3 install sacrebleu

[33mYou are using pip version 18.0, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


## Part 3: Beam Search Algorithm

In this section, we implement the Beam Search algorithm in Pytorch.

In [55]:
## python example

from math import log
from numpy import array
from numpy import argmax
 
# beam search
def beam_search_decoder(data, k):
	sequences = [[list(), 1.0]]
	# walk over each step in sequence
	for row in data:
		all_candidates = list()
		# expand each current candidate
		for i in range(len(sequences)):
			seq, score = sequences[i]
			for j in range(len(row)):
				candidate = [seq + [j], score * -log(row[j])]
				all_candidates.append(candidate)
		# order all candidates by score
		ordered = sorted(all_candidates, key=lambda tup:tup[1])
		# select k best
		sequences = ordered[:k]
	return sequences
 
# define a sequence of 10 words over a vocab of 5 words
data = [[0.1, 0.2, 0.3, 0.4, 0.5],
		[0.5, 0.4, 0.3, 0.2, 0.1],
		[0.1, 0.2, 0.3, 0.4, 0.5],
		[0.5, 0.4, 0.3, 0.2, 0.1],
		[0.1, 0.2, 0.3, 0.4, 0.5],
		[0.5, 0.4, 0.3, 0.2, 0.1],
		[0.1, 0.2, 0.3, 0.4, 0.5],
		[0.5, 0.4, 0.3, 0.2, 0.1],
		[0.1, 0.2, 0.3, 0.4, 0.5],
		[0.5, 0.4, 0.3, 0.2, 0.1]]
data = array(data)
# decode sequence
result = beam_search_decoder(data, 3)
# print result
for seq in result:
	print(seq)

[[4, 0, 4, 0, 4, 0, 4, 0, 4, 0], 0.025600863289563108]
[[4, 0, 4, 0, 4, 0, 4, 0, 4, 1], 0.03384250043584397]
[[4, 0, 4, 0, 4, 0, 4, 0, 3, 0], 0.03384250043584397]


In [None]:
# Original code borrowed from 
# 

class Beam(object):
    """
    Class for managing the internals of the beam search process.
    Takes care of beams, back pointers, and scores.
    Args:
       size (int): beam size
       pad, bos, eos (int): indices of padding, beginning, and ending.
       n_best (int): nbest size to use
       cuda (bool): use gpu
       global_scorer (:obj:`GlobalScorer`)
    """

    def __init__(self, size, pad, bos, eos,
                 n_best=1, cuda=False,
                 global_scorer=None,
                 min_length=0,
                 stepwise_penalty=False,
                 block_ngram_repeat=0,
                 exclusion_tokens=set()):

        self.size = size
        self.tt = torch.cuda if cuda else torch

        # The score for each translation on the beam.
        self.scores = self.tt.FloatTensor(size).zero_()
        self.all_scores = []

        # The backpointers at each time-step.
        self.prev_ks = []

        # The outputs at each time-step.
        self.next_ys = [self.tt.LongTensor(size)
                        .fill_(pad)]
        self.next_ys[0][0] = bos

        # Has EOS topped the beam yet.
        self._eos = eos
        self.eos_top = False

        # The attentions (matrix) for each time.
        self.attn = []

        # Time and k pair for finished.
        self.finished = []
        self.n_best = n_best

        # Information for global scoring.
        self.global_scorer = global_scorer
        self.global_state = {}

        # Minimum prediction length
        self.min_length = min_length

        # Apply Penalty at every step
        self.stepwise_penalty = stepwise_penalty
        self.block_ngram_repeat = block_ngram_repeat
        self.exclusion_tokens = exclusion_tokens

    def get_current_state(self):
        "Get the outputs for the current timestep."
        return self.next_ys[-1]

    def get_current_origin(self):
        "Get the backpointers for the current timestep."
        return self.prev_ks[-1]

    def advance(self, word_probs, attn_out):
        """
        Given prob over words for every last beam `wordLk` and attention
        `attn_out`: Compute and update the beam search.
        Parameters:
        * `word_probs`- probs of advancing from the last step (K x words)
        * `attn_out`- attention at the last step
        Returns: True if beam search is complete.
        """
        num_words = word_probs.size(1)
        if self.stepwise_penalty:
            self.global_scorer.update_score(self, attn_out)
        # force the output to be longer than self.min_length
        cur_len = len(self.next_ys)
        if cur_len < self.min_length:
            for k in range(len(word_probs)):
                word_probs[k][self._eos] = -1e20
        # Sum the previous scores.
        if len(self.prev_ks) > 0:
            beam_scores = word_probs + \
                self.scores.unsqueeze(1).expand_as(word_probs)
            # Don't let EOS have children.
            for i in range(self.next_ys[-1].size(0)):
                if self.next_ys[-1][i] == self._eos:
                    beam_scores[i] = -1e20

            # Block ngram repeats
            if self.block_ngram_repeat > 0:
                ngrams = []
                le = len(self.next_ys)
                for j in range(self.next_ys[-1].size(0)):
                    hyp, _ = self.get_hyp(le - 1, j)
                    ngrams = set()
                    fail = False
                    gram = []
                    for i in range(le - 1):
                        # Last n tokens, n = block_ngram_repeat
                        gram = (gram +
                                [hyp[i].item()])[-self.block_ngram_repeat:]
                        # Skip the blocking if it is in the exclusion list
                        if set(gram) & self.exclusion_tokens:
                            continue
                        if tuple(gram) in ngrams:
                            fail = True
                        ngrams.add(tuple(gram))
                    if fail:
                        beam_scores[j] = -10e20
        else:
            beam_scores = word_probs[0]
        flat_beam_scores = beam_scores.view(-1)
        best_scores, best_scores_id = flat_beam_scores.topk(self.size, 0,
                                                            True, True)

        self.all_scores.append(self.scores)
        self.scores = best_scores

        # best_scores_id is flattened beam x word array, so calculate which
        # word and beam each score came from
        prev_k = best_scores_id / num_words
        self.prev_ks.append(prev_k)
        self.next_ys.append((best_scores_id - prev_k * num_words))
        self.attn.append(attn_out.index_select(0, prev_k))
        self.global_scorer.update_global_state(self)

        for i in range(self.next_ys[-1].size(0)):
            if self.next_ys[-1][i] == self._eos:
                global_scores = self.global_scorer.score(self, self.scores)
                s = global_scores[i]
                self.finished.append((s, len(self.next_ys) - 1, i))

        # End condition is when top-of-beam is EOS and no global score.
        if self.next_ys[-1][0] == self._eos:
            self.all_scores.append(self.scores)
            self.eos_top = True

    def done(self):
        return self.eos_top and len(self.finished) >= self.n_best

    def sort_finished(self, minimum=None):
        if minimum is not None:
            i = 0
            # Add from beam until we have minimum outputs.
            while len(self.finished) < minimum:
                global_scores = self.global_scorer.score(self, self.scores)
                s = global_scores[i]
                self.finished.append((s, len(self.next_ys) - 1, i))
                i += 1

        self.finished.sort(key=lambda a: -a[0])
        scores = [sc for sc, _, _ in self.finished]
        ks = [(t, k) for _, t, k in self.finished]
        return scores, ks

    def get_hyp(self, timestep, k):
        """
        Walk back to construct the full hypothesis.
        """
        hyp, attn = [], []
        for j in range(len(self.prev_ks[:timestep]) - 1, -1, -1):
            hyp.append(self.next_ys[j + 1][k])
            attn.append(self.attn[j][k])
            k = self.prev_ks[j][k]
        return hyp[::-1], torch.stack(attn[::-1])

## Part 4: Model

1. Recurrent neural network based encoder-decoder without attention
2. Recurrent neural network based encoder-decoder with attention
2. Replace the recurrent encoder with either convolutional or self-attention based encoder.

### 2.1: RNN-based Encoder-Decoder without Attention

### 2.2 RNN-based Encoder-Decoder with Attention

### 2.3 Encoder Replacement with Eonvolutional or Self-attention-based Encoder

### 2.4 Fully self-attention Translation System

### 2.5 Multilingual Translation System