<h1 id="tocheading">Spring 2018 NLP Class Project: Neural Machine Translation</h1>
<div id="toc"></div>

In [14]:
%%javascript
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')

<IPython.core.display.Javascript object>

In [16]:
import numpy as np
import copy
# from sacreBLEU.sacreBLEU import corpus_bleu

from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import pandas as pd
import spacy
import pdb
import os
from underthesea import word_tokenize
import jieba
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader
import pickle as pkl
import time

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

# running on cpu
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu" if torch.cuda.is_available() else "cpu")

## Part 0: Project Overview

The goal of this project is to build a neural machine translation system and experience how recent advances have made their way. Each team will build the following sequence of neural translation systems for two language pairs, __Vietnamese (Vi)→English (En)__ and __Chinese (Zh)→En__ (prepared corpora is be provided):

1. Recurrent neural network based encoder-decoder without attention
2. Recurrent neural network based encoder-decoder with attention
2. Replace the recurrent encoder with either convolutional or self-attention based encoder.

## Part 1: Data Upload & Preprocessing

In [3]:
UNK_IDX = 2
PAD_IDX = 0
SOS_token = 1
EOS_token = 3

In [92]:
def read_dataset(path):
    f = open(path)
    list_l = []
    for line in f:
        list_l.append(line.strip())
    data = pd.DataFrame()
    data["data"] = list_l
    return data

# vietnamese -> english
vien_en_train = read_dataset("data/iwslt-vi-en/train.tok.en")
vien_en_val = read_dataset("data/iwslt-vi-en/dev.tok.en")

vien_vi_train = read_dataset("data/iwslt-vi-en/train.tok.vi")
vien_vi_val = read_dataset("data/iwslt-vi-en/dev.tok.vi")

# chinese -> english
zhen_en_train = read_dataset("data/iwslt-zh-en/train.tok.en")
zhen_en_val = read_dataset("data/iwslt-zh-en/dev.tok.en")

zhen_zh_train = read_dataset("data/iwslt-zh-en/train.tok.zh")
zhen_zh_val = read_dataset("data/iwslt-zh-en/dev.tok.zh")

# TEST
# vietnamese -> english
vien_en_test = read_dataset("data/iwslt-vi-en/test.tok.en")
vien_vi_test = read_dataset("data/iwslt-vi-en/test.tok.vi")
# chinese -> english
zhen_en_test = read_dataset("data/iwslt-zh-en/test.tok.en")
zhen_zh_test = read_dataset("data/iwslt-zh-en/test.tok.zh")


In [93]:
# chinese -> english
zhen_train = pd.DataFrame()
zhen_train["en_data"] = zhen_en_train["data"]
zhen_train["zh_data"] = zhen_zh_train["data"]

# vietnamese -> english
vien_train = pd.DataFrame()
vien_train["en_data"] = vien_en_train["data"]
vien_train["vi_data"] = vien_vi_train["data"]

# chinese -> english
zhen_val = pd.DataFrame()
zhen_val["en_data"] = zhen_en_val["data"]
zhen_val["zh_data"] = zhen_zh_val["data"]

# vietnamese -> english
vien_val = pd.DataFrame()
vien_val["en_data"] = vien_en_val["data"]
vien_val["vi_data"] = vien_vi_val["data"]

# TEST: vietnamese -> english
vien_test = pd.DataFrame()
vien_test["en_data"] = vien_en_test["data"]
vien_test["vi_data"] = vien_vi_test["data"]

# TEST: chinese -> english
zhen_test = pd.DataFrame()
zhen_test["en_data"] = zhen_en_test["data"]
zhen_test["zh_data"] = zhen_zh_test["data"]

In [6]:
class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "<SOS>", 1: "<EOS>", 2:"<UNK>",3:"<PAD>"}
        self.n_words = 4

    def addSentence(self, sentence):
        for word in sentence.split(" "):
            self.addWord(word.lower())

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [7]:
# Taken from lab notebook
def unicodeToAscii(s):
    """About "NFC" and "NFD": 
    
    For each character, there are two normal forms: normal form C 
    and normal form D. Normal form D (NFD) is also known as canonical 
    decomposition, and translates each character into its decomposed form. 
    Normal form C (NFC) first applies a canonical decomposition, then composes 
    pre-combined characters again.
    
    About unicodedata.category: 
    
    Returns the general category assigned to the Unicode character 
    unichr as string."""
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

def data_tok(data, lang="vi"):
    
    data["en_tokenized"] = data["en_data"].apply(lambda x: x.lower().split( ))
    
    if lang == "vi":
        data["vi_tokenized"] = data["vi_data"].apply(lambda x: x.lower().split( ))
    else:
        data["zh_tokenized"] = data["zh_data"].apply(lambda x: x.lower().split( ))
        
    return data

In [8]:
# vietnamese -> english
vien_en_ = Lang("vien_en")
for s in vien_train["en_data"]:
    vien_en_.addSentence(s)
    
vien_vi_ = Lang("vi")
for s in vien_train["vi_data"]:
    vien_vi_.addSentence(s)
    
# chinese -> english
zhen_en_ = Lang("zhen_en")
for s in zhen_train["en_data"]:
    zhen_en_.addSentence(s)
    
zhen_zh_ = Lang("zh")
for s in zhen_train["zh_data"]:
    zhen_zh_.addSentence(s)

In [10]:
# training

# vietnamese -> english
vien_train = data_tok(vien_train, lang="vi")

# chinese -> english
zhen_train = data_tok(zhen_train, lang="zh")

In [11]:
zhen_train.head(3)

Unnamed: 0,en_data,zh_data,en_tokenized,zh_tokenized
0,Life in the deep oceans,深海 海中 的 生命 大卫 盖罗,"[life, in, the, deep, oceans]","[深海, 海中, 的, 生命, 大卫, 盖罗]"
1,With vibrant video clips captured by submarine...,大卫 盖罗 通过 潜水 潜水艇 拍下 的 影片 把 我们 带到 了 地球 最 黑暗 ...,"[with, vibrant, video, clips, captured, by, su...","[大卫, 盖罗, 通过, 潜水, 潜水艇, 拍下, 的, 影片, 把, 我们, 带到, 了,..."
2,This is Bill Lange . I &apos;m Dave Gallo .,大卫 盖罗 这位 是 比尔 兰格 我 是 大卫 盖罗,"[this, is, bill, lange, ., i, &apos;m, dave, g...","[大卫, 盖罗, 这位, 是, 比尔, 兰格, 我, 是, 大卫, 盖罗]"


In [94]:
# VALIDATION
# vietnamese -> english
vien_val = data_tok(vien_val, lang="vi")
# chinese -> english
zhen_val = data_tok(zhen_val, lang="zh")

# TEST
vien_test = data_tok(vien_test, lang="vi")
zhen_test = data_tok(zhen_test, lang="zh")

In [13]:
def token2index_dataset(data, source_language="zh"):
    
    if source_language == "zh" and "zh_data" not in [*data.columns]:
        raise ValueError, "Source language should be compatible with the data you pass!"
    elif source_language == "vi" and "vi_data" not in [*data.columns]:
        raise ValueError, "Source language should be compatible with the data you pass!"
    else:   
        if source_language == "zh":
            # chinese -> english
            for language in ["en","zh"]:
                indices_data = []
                if language == "en":
                    lang_obj = zhen_en_
                else:
                    lang_obj = zhen_zh_

                for tokens in data[language + "_tokenized"]:

                    index_list = [lang_obj.word2index[token] if \
                                  token in lang_obj.word2index else UNK_IDX \
                                  for token in tokens]
                    index_list.append(EOS_token)
                    indices_data.append(index_list)

                data[language + "_indices"] = indices_data
        else:
            # vietnamese -> english
            for language in ["en","vi"]:
                indices_data = []
                if language == "en":
                    lang_obj = vien_en_
                else:
                    lang_obj = vien_vi_

                for tokens in data[language + "_tokenized"]:

                    index_list = [lang_obj.word2index[token] if \
                                  token in lang_obj.word2index else UNK_IDX \
                                  for token in tokens]
                    index_list.append(EOS_token)
                    indices_data.append(index_list)

                data[language + "_indices"] = indices_data

    return data

In [14]:
# train
# vietnamese -> english
vien_train = token2index_dataset(vien_train, 
                                 source_language="vi")
# chinese -> english
zhen_train = token2index_dataset(zhen_train, 
                                 source_language="zh")

# validation
# vietnamese -> english
vien_val = token2index_dataset(vien_val, 
                               source_language="vi")
# chinese -> english
zhen_val = token2index_dataset(zhen_val, 
                               source_language="zh")

In [95]:
# TEST
# vietnamese -> english
vien_test = token2index_dataset(vien_test, source_language="vi")
# chinese -> english
zhen_test = token2index_dataset(zhen_test, source_language="zh")

In [99]:
# Datasets
from torch.utils.data import Dataset

# vietnamese -> english
class Vietnamese(Dataset):
    
    def __init__(self, data, val = False):
        self.data = data
        self.val = val
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        
        english = self.data.iloc[idx,:]["en_indices"]
        vietnamese = self.data.iloc[idx,:]["vi_indices"]
        en_lengths = self.data.iloc[idx,:]["en_lengths"]
        vi_lengths = self.data.iloc[idx,:]["vi_lengths"]
        
        if self.val:
            en_data = self.data.iloc[idx,:]["en_data"].lower()
            return [vietnamese, english, vi_lengths, en_lengths, en_data]
        else:
            return [vietnamese, english, vi_lengths, en_lengths]
    
# chinese -> english
class Chinese(Dataset):
    def __init__(self, data, val = False):
        self.data = data
        self.val = val
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        
        english = self.data.iloc[idx,:]["en_indices"]
        chinese = self.data.iloc[idx,:]["zh_indices"]
        en_lengths = self.data.iloc[idx,:]["en_lengths"]
        zh_lengths = self.data.iloc[idx,:]["zh_lengths"]
        
        if self.val:
            en_data = self.data.iloc[idx,:]["en_data"].lower()
            return [chinese, english, zh_lengths, en_lengths, en_data]
        else:
            return [chinese, english, zh_lengths, en_lengths]

In [16]:
# get lengths 
max_len_ = 30
min_len_ = 2

# vietnamese -> english
# train
vien_train["en_lengths"] = vien_train["en_indices"].apply(lambda x: len(x))
vien_train["vi_lengths"] = vien_train["vi_indices"].apply(lambda x:len(x))
vien_train = vien_train[np.logical_and(vien_train["en_lengths"]>=min_len_,
                                       vien_train["vi_lengths"]>=min_len_)]
vien_train = vien_train[vien_train["vi_lengths"]<=max_len_]

# val
vien_val["en_lengths"] = vien_val["en_indices"].apply(lambda x: len(x))
vien_val["vi_lengths"] = vien_val["vi_indices"].apply(lambda x:len(x))
vien_val = vien_val[np.logical_and(vien_val["en_lengths"]>=min_len_,
                                   vien_val["vi_lengths"]>=min_len_)]
vien_val = vien_val[vien_val["vi_lengths"]<=max_len_]

# chinese -> english
# train
zhen_train["en_lengths"] = zhen_train["en_indices"].apply(lambda x: len(x))
zhen_train["zh_lengths"] = zhen_train["zh_indices"].apply(lambda x:len(x))
zhen_train = zhen_train[np.logical_and(zhen_train["en_lengths"]>=min_len_,
                                       zhen_train["zh_lengths"]>=min_len_)]
zhen_train = zhen_train[zhen_train["zh_lengths"]<=max_len_]
# val
zhen_val["en_lengths"] = zhen_val["en_indices"].apply(lambda x: len(x))
zhen_val["zh_lengths"] = zhen_val["zh_indices"].apply(lambda x:len(x))
zhen_val = zhen_val[np.logical_and(zhen_val["en_lengths"]>=min_len_,
                                   zhen_val["zh_lengths"]>=min_len_)]
zhen_val = zhen_val[zhen_val["zh_lengths"]<=max_len_]

In [82]:
# TEST
vien_test["en_lengths"] = vien_test["en_indices"].apply(lambda x: len(x))
vien_test["vi_lengths"] = vien_test["vi_indices"].apply(lambda x:len(x))

vien_test = vien_test[np.logical_and(vien_test["en_lengths"]>=min_len_,
                                     vien_test["vi_lengths"]>=min_len_)]
vien_test = vien_test[vien_test["vi_lengths"]<=max_len_]

In [96]:
# TEST
zhen_test["en_lengths"] = zhen_test["en_indices"].apply(lambda x: len(x))
zhen_test["zh_lengths"] = zhen_test["zh_indices"].apply(lambda x:len(x))

zhen_test = zhen_test[np.logical_and(zhen_test["en_lengths"]>=min_len_,
                                     zhen_test["zh_lengths"]>=min_len_)]
zhen_test = zhen_test[zhen_test["zh_lengths"]<=max_len_]

In [17]:
zhen_train.head(3)

Unnamed: 0,en_data,zh_data,en_tokenized,zh_tokenized,en_indices,zh_indices,en_lengths,zh_lengths
0,Life in the deep oceans,深海 海中 的 生命 大卫 盖罗,"[life, in, the, deep, oceans]","[深海, 海中, 的, 生命, 大卫, 盖罗]","[4, 5, 6, 7, 8, 3]","[4, 5, 6, 7, 9, 10, 3]",6,7
2,This is Bill Lange . I &apos;m Dave Gallo .,大卫 盖罗 这位 是 比尔 兰格 我 是 大卫 盖罗,"[this, is, bill, lange, ., i, &apos;m, dave, g...","[大卫, 盖罗, 这位, 是, 比尔, 兰格, 我, 是, 大卫, 盖罗]","[45, 39, 46, 47, 44, 48, 49, 50, 18, 44, 3]","[9, 10, 47, 32, 48, 49, 50, 32, 9, 10, 3]",11,11
3,And we &apos;re going to tell you some stories...,我们 将 用 一些 影片 来讲 讲述 一些 深海 海里 的 故事,"[and, we, &apos;re, going, to, tell, you, some...","[我们, 将, 用, 一些, 影片, 来讲, 讲述, 一些, 深海, 海里, 的, 故事]","[30, 51, 52, 53, 21, 54, 55, 22, 56, 57, 6, 58...","[17, 51, 52, 53, 15, 54, 55, 53, 4, 56, 6, 57, 3]",17,13


In [18]:
# MAX_SENTENCE_LENGTH = 50
# BATCH_SIZE = 32

# def translation_collate_(batch):
    
#     target_data = []
#     source_data = []
#     target_lengths = []
#     source_lengths = []

#     for datum in batch:
#         target_lengths.append(datum[2])
#         source_lengths.append(datum[3])
        
#     # PAD
#     for datum in batch:
#         if datum[2] > MAX_SENTENCE_LENGTH:
#             padded_vec_target = np.array(datum[0])[:MAX_SENTENCE_LENGTH]
#         else:
#             padded_vec_target = np.pad(np.array(datum[0]),
#                                 pad_width=((0,MAX_SENTENCE_LENGTH - datum[2])),
#                                 mode="constant", constant_values=PAD_IDX)
            
#         if datum[3] > MAX_SENTENCE_LENGTH:
#             padded_vec_source = np.array(datum[1])[:MAX_SENTENCE_LENGTH]
#         else:
#             padded_vec_source = np.pad(np.array(datum[1]),
#                                 pad_width=((0,MAX_SENTENCE_LENGTH - datum[3])),
#                                 mode="constant", constant_values=PAD_IDX)
            
#         target_data.append(padded_vec_target)
#         source_data.append(padded_vec_source)
        
#     return [torch.from_numpy(np.array(source_data)), torch.from_numpy(np.array(target_data)),
#             torch.from_numpy(np.array(source_lengths)), torch.from_numpy(np.array(target_lengths))]

In [19]:
# # vietnamese -> english
# vien_dataset = {"train": Vietnamese(vien_train), "val": Vietnamese(vien_val)}

# vien_loader = {x: DataLoader(vien_dataset[x], batch_size=BATCH_SIZE, 
#                             collate_fn=translation_collate_,
#                             shuffle=False, num_workers=0) for x in ["train", "val"]}

# # chinese -> english
# zhen_dataset = {"train": Chinese(zhen_train), "val": Chinese(zhen_val)}

# zhen_loader = {x: DataLoader(zhen_dataset[x], batch_size=BATCH_SIZE,
#                              collate_fn=translation_collate_,
#                              shuffle=False, num_workers=0) for x in ["train", "val"]}

In [20]:
# # vietnamese -> english
# vien_train_data = next(iter(vien_loader["train"]))
# vien_val_data = next(iter(vien_loader["val"]))

# # chinese -> english
# zhen_train_data = next(iter(zhen_loader["train"]))
# zhen_val_data = next(iter(zhen_loader["val"]))

In [21]:
# def out_token_2_string(index_tensor, 
#                        language):
#     sentence = []
#     for i in index_tensor:
#         if i.item() not in [0, 1, 3]: # <PAD>, <SOS>, <EOS>
#             sentence.append(language.index2word[i.item()])
#     return (' ').join(sentence)

## Part 2: Model

In [22]:
from sacreBLEU.sacreBLEU import corpus_bleu

### 2.1: RNN Encoder-Decoder without Attention

#### RNN Encoder

In [23]:

# RNNencoder
class RNNencoder(nn.Module):
    
    def __init__(self, 
                 input_size, 
                 hidden_size, 
                 num_gru_layers=1):
        
        super(RNNencoder, self).__init__()
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_gru_layers

        self.embedding = nn.Embedding(self.input_size, 
                                      self.hidden_size,
                                      padding_idx=0)
        
        self.GRU = nn.GRU(self.hidden_size, 
                          self.hidden_size,
                          batch_first = True,
                          bidirectional = False)
        
        if self.GRU.bidirectional:
            self.num_directions = 2
        else:
            self.num_directions = 1
        
    def init_hidden(self, batch_size):
        
        return torch.zeros(self.num_layers*self.num_directions, 
                           batch_size, self.hidden_size).to(device)

    def forward(self,
                source_sentence,
                source_lengths,
                hidden):
        
        sort_original_source = sorted(range(len(source_lengths)), 
                               key=lambda sentence: -source_lengths[sentence])
        unsort_to_original_source = sorted(range(len(source_lengths)), 
                                    key=lambda sentence: sort_original_source[sentence])
        
        source_sentence = source_sentence[sort_original_source]
        source_lengths = source_lengths[sort_original_source]
        batch_size, seq_len_source = source_sentence.size()
        
        embeds_source = self.embedding(source_sentence)
        
        embeds_source = torch.nn.utils.rnn.pack_padded_sequence(embeds_source, 
                                                                source_lengths, 
                                                                batch_first=True)
        output = embeds_source
        
        output, hidden = self.GRU(output, hidden)
        
        hidden = hidden.view(batch_size, self.hidden_size)
        
        hidden = hidden[unsort_to_original_source] ## back to original indices
        output, _ = torch.nn.utils.rnn.pad_packed_sequence(output, batch_first=True)
        
        hidden = hidden.view(1, batch_size, self.hidden_size)
        
        return hidden, output[unsort_to_original_source]

#### RNN Decoder

In [24]:
# RNNdecoder
class RNNdecoder(nn.Module):
    
    def __init__(self, 
                 hidden_size, 
                 vocab_size):
        
        super(RNNdecoder, self).__init__()
        
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size

        self.embedding = nn.Embedding(self.vocab_size, 
                                      self.hidden_size,
                                      padding_idx=0)
        
        
        self.dropout = nn.Dropout(p=0.1)
        
        self.GRU = nn.GRU(self.hidden_size, 
                          self.hidden_size,
                          batch_first=True)
        

        self.linear_layer = nn.Linear(self.hidden_size, self.vocab_size)
        
        self.log_softmax = nn.LogSoftmax(dim=1)
        
    def init_hidden(self, batch_size):
        return torch.zeros(1, batch_size, self.hidden_size).to(device)

    def forward(self, 
                input_, 
                decoder_hidden,
                encoder_outputs=None):
        
        # seq_len will always be 1 in the decoder at each time step
        batch_size = input_.size(0)
        output = self.embedding(input_)
        output = self.dropout(output)
        
#         cat_out = torch.cat((output, decoder_hidden), 2)

        output, decoder_hidden = self.GRU(output, decoder_hidden)

        output = self.linear_layer(output.squeeze(dim=1))

        output = self.log_softmax(output)

        return output, decoder_hidden


#### Translator for RNN encoder-decoder

In [25]:
def translate_rnn(encoder_model,
                  decoder_model,
                  source_sentence,
                  target_sentence,
                  source_lengths):
    
    use_teacher_forcing = True if random.random() < 0.6 else False
    
    batch_size = source_sentence.size(0)
    encoder_hidden = encoder_model.init_hidden(batch_size)
    
    encoder_hidden, encoder_output = encoder_model(source_sentence,
                                                   source_lengths,
                                                   encoder_hidden)
    
    decoder_hidden = encoder_hidden
    
    decoder_input = torch.FloatTensor([[SOS_token]]*batch_size).to(device)

    if use_teacher_forcing:
        
        decoder_out = []
         
        for time_step in range(MAX_SENTENCE_LENGTH):
            
            decoder_output, decoder_hidden = decoder_model(decoder_input,
                                                           decoder_hidden,
                                                           encoder_outputs=None)
            decoder_out.append(decoder_output.unsqueeze(-1))
            decoder_input = target_sentence[:,time_step].view(-1,1)
            
        decoder_out = torch.cat(decoder_out,
                                dim=-1)
    else:
        
        decoder_out = []
        for time_step in range(MAX_SENTENCE_LENGTH):
            
            decoder_output, decoder_hidden = decoder_model(decoder_input,
                                                           decoder_hidden,
                                                           encoder_output)
            
            decoder_out.append(decoder_output.unsqueeze(-1))
            top_scores, top_indices = decoder_output.topk(1)
            decoder_input = top_indices.squeeze().detach().view(-1,1)
            
        decoder_out = torch.cat(decoder_out,
                                dim=-1)
        
    return decoder_out, decoder_hidden

### Collate Function

In [12]:
MAX_LEN_TARGET = 50 # EN
MAX_LEN_SOURCE = 50 # CHINESE/VIETNAMESE

def translation_collate(batch):
    
    if MAX_LEN_TARGET <= 10:
        raise ValueError("MAX_LEN_TARGET too small")
    elif MAX_LEN_SOURCE <= 10:
        raise ValueError("MAX_LEN_SOURCE too small")
        
    else:
    
        target_sentence = []
        source_sentence = []
        target_lengths = []
        source_lengths = []

        for datum in batch:
            target_lengths.append(datum[3])
            source_lengths.append(datum[2])

        max_target_length = max(target_lengths)
        max_source_len = max(source_lengths)

        if max_target_length < MAX_LEN_TARGET:
            MAX_LEN_TARGET = max_target_length

        if max_source_len < MAX_LEN_SOURCE:
            MAX_LEN_SOURCE = max_source_len

        # padding
        for datum in batch:
            if datum[2] > MAX_LEN_SOURCE:
                padded_vec_source = np.array(datum[0])[:MAX_LEN_SOURCE]
            else:
                padded_vec_source = np.pad(np.array(datum[0]),
                                    pad_width=((0,MAX_LEN_SOURCE - datum[2])),
                                    mode="constant", constant_values=PAD_IDX)
            if datum[3] > MAX_LEN_TARGET:
                padded_vec_target = np.array(datum[1])[:MAX_LEN_TARGET]
            else:
                padded_vec_target = np.pad(np.array(datum[1]),
                                    pad_width=((0,MAX_LEN_TARGET - datum[3])),
                                    mode="constant", constant_values=PAD_IDX)
                
            target_sentence.append(padded_vec_target)
            source_sentence.append(padded_vec_source)

        source_sentence = np.array(source_sentence)
        target_sentence = np.array(target_sentence)
        source_lengths = np.array(source_lengths)
        target_lengths = np.array(target_lengths)

        source_lengths[source_lengths>MAX_LEN_SOURCE] = MAX_LEN_SOURCE
        target_lengths[target_lengths>MAX_LEN_TARGET] = MAX_LEN_TARGET

    return [torch.from_numpy(source_sentence), torch.from_numpy(target_sentence),
            torch.from_numpy(source_lengths), torch.from_numpy(target_lengths)]

def translation_collate_val(batch):
    return [torch.from_numpy(np.array(batch[0][0])).unsqueeze(0), 
            torch.from_numpy(np.array(batch[0][1])).unsqueeze(0),
            torch.from_numpy(np.array(batch[0][2])).unsqueeze(0), 
            torch.from_numpy(np.array(batch[0][3])).unsqueeze(0),batch[0][4]]

In [33]:
# MAX_LEN = 48
# train,val,en_lang,vi_lang = vien_train, vien_val, vien_en_, vien_vi_
train, val, en_lang, zh_lang = zhen_train, zhen_val, zhen_en_, zhen_zh_

In [83]:
# VIEN TEST
test, en_lang, vi_lang = vien_test, vien_en_, vien_vi_

In [97]:
# ZHEN TEST
zh_test, en_lang, zh_lang = zhen_test, zhen_en_, zhen_zh_

In [88]:
# Vietnamese -> English
# TEST 

batch_sizes = {"train":128,"val":1, "train_val":1,"val_train":128}
train_used = train
collate_fn_dict = {"train":translation_collate, "val": translation_collate_val,\
                   "train_val":translation_collate_val,"val_train":translation_collate}
translation_dataset = {"train": Vietnamese(train_used), 
                       "val": Vietnamese(test, val = True), # changing val test with test set
                       "train_val":Vietnamese(train.iloc[:50], val = True),
                       "val_train":Vietnamese(val)
                                               }

dataloader = {x: DataLoader(translation_dataset[x],
                            batch_size=batch_sizes[x], 
                            collate_fn=collate_fn_dict[x],
                            shuffle=True, num_workers=0) for x in ["train", "val", "train_val","val_train"]}

In [11]:
# Chinese -> English
# TEST

batch_sizes = {"test":1, "val_train":128}
# train_used = shuffle_sorted_batches(train_sorted, bs_dict["train"])
# train_used = train.iloc[:50]
# train_used = train
collate_fn_dict = {"test":translation_collate_val, "val_train":translation_collate}

translation_dataset = {"test": Chinese(zh_test, val = True),
                       "val_train":Chinese(val)}

dataloader = {x: DataLoader(translation_dataset[x], 
                            batch_size=batch_sizes[x], 
                            collate_fn=collate_fn_dict[x],
                            shuffle=True, num_workers=0) for x in ["test", "val_train"]}

In [38]:
from sacreBLEU.sacreBLEU import corpus_bleu

### Part 2.2: RNN Encoder-Decoder with Attention

In [17]:
class LSTMencoder(nn.Module):

    def __init__(self,
                 input_size,
                 embed_size,
                 hidden_size,
                 num_lstm_layers):

        super(LSTMencoder, self).__init__()
        self.hidden_size = hidden_size
        self.embed_size = embed_size

        self.embedding = Embedding(input_size,
                                   self.embed_size,
                                   padding_idx=0)

        self.dropout_ = nn.Dropout(p = 0.1)
        self.num_layers = num_lstm_layers

        self.lstm = LSTM(self.embed_size, self.hidden_size,
                         batch_first=True, bidirectional=True,
                         num_layers = self.num_layers,
                         dropout = 0.15)

    def initHidden(self, batch_size):
        return torch.zeros(self.num_layers*2,
                           batch_size,
                           self.hidden_size).to(device),\
               torch.zeros(self.num_layers*2,
                           batch_size,
                           self.hidden_size).to(device)

    def forward(self,
                encoder_inputs,
                source_lengths):

        sort_original_source = torch.sort(source_lengths, descending=True)[1]
        unsort_to_original_source = torch.sort(sort_original_source)[1]

        embeds_source = self.embedding(encoder_inputs)
        
        lstm_out = self.dropout_(embeds_source)

        batch_size, seq_len = embeds_source.size()

        hidden, context = self.initHidden(batch_size)
        sorted_output = lstm_out[sort_original_source]
        sorted_len = source_lengths[sort_original_source]

        packed_output = nn.utils.rnn.pack_padded_sequence(sorted_output, 
                                                          sorted_lengths, 
                                                          batch_first = True)

        packed_outs, (hiddden, context) = self.lstm(packed_output,(hidden, context))
        hidden = hidden[:,unsort_to_original_source,:]
        context = context[:,unsort_to_original_source,:]

        lstm_out, _ = nn.utils.rnn.pad_packed_sequence(packed_outs,
                                                       padding_value=PAD_IDX,
                                                       batch_first = True)
        # UNSORT OUTPUT
        lstm_out = lstm_out[unsort_to_original_source]
        hidden = hidden.view(self.num_layers, 2, batch_size, -1).transpose(1, 2).contiguous().view(self.num_layers, batch_size, -1)
        context = context.view(self.num_layers, 2, batch_size, -1).transpose(1, 2).contiguous().view(self.num_layers, batch_size, -1)

        return output, hidden, context



def initLSTM(input_size,
             hidden_size,
             **kwargs):

    model = nn.LSTM(input_size,
                    hidden_size,
                    **kwargs)

    for name, param in model.named_parameters():

        if ("weight" in name) or ("bias" in name):
            param.data.uniform_(-0.1, 0.1)

    return model


def initLSTMCell(input_size,
                 hidden_size,
                 **kwargs):

    model = nn.LSTMCell(input_size,
                        hidden_size,
                        **kwargs)

    for name, param in model.named_parameters():

        if 'weight' in name or 'bias' in name:
            param.data.uniform_(-0.1, 0.1)

    return model


def initGRUCell(input_size,
                hidden_size,
                **kwargs):

    model = nn.GRUCell(input_size,
                       hidden_size,
                       **kwargs)

    for name, param in model.named_parameters():

        if 'weight' in name or 'bias' in name:
            param.data.uniform_(-0.1, 0.1)

    return model


# Attention: attention module
class Attention(nn.Module):
    def __init__(self,
                 hidden_size,
                 attn_size):

        super(Attention, self).__init__()

        self.hidden_size = hidden_size
        self.attn_size = attn_size

        self.linear_layer1 = nn.Linear(self.hidden_size, self.attn_size)

        self.linear_layer2 = nn.Linear(self.hidden_size + self.attn_size, self.attn_size)
        
    def forward(self,
                hidden,
                encoder_outs,
                source_lengths):

        # hidden_size -> attn_size
        attn_hidden = self.linear_layer1(hidden)

        # get scores
        attn_score = torch.sum((encoder_outs.transpose(0,1) * attn_hidden.unsqueeze(0)),2)

        attn_mask = torch.transpose(seq_mask(source_lengths,
                                             max_len = max(source_lengths).item()),
                                    0,1)

        masked_attn = attn_mask*attn_score
        masked_attn[masked_attn==0] = -1e10

        # softmax over attention to get weights
        attn_scores = F.softmax(masked_attn, dim=0)
        # compute weighted sum according to attention scores
        attn_hidden = torch.sum(attn_scores.unsqueeze(2)*encoder_outs.transpose(0,1), 0)

        attn_hidden = self.linear_layer2(torch.cat((attn_hidden, hidden), dim=1))
        attn_hidden = torch.tanh(attn_hidden)

        return attn_hidden, attn_scores

# AttnDecoderRNN
class AttnDecoderRNN(nn.Module):

    def __init__(self,
                 vocab_size,
                 embed_size,
                 hidden_size,
                 num_rnn_layers = 1,
                 attention = True,
                 dropout_percent=0.1):

        super(AttnDecoderRNN, self).__init__()
        
        self.hidden_size = hidden_size
        self.embed_size = embed_size
        encoder_output_size = self.hidden_size

        self.embedding = nn.Embedding(vocab_size,
                                      embed_size,
                                      PAD_IDX)

        self.dropout_f = nn.Dropout(p=dropout_percent)

        self.num_layers = num_rnn_layers

        if attention:
            self.attention = Attention(self.hidden_size,
                                       encoder_output_size)
        else:
            self.attention = None

        self.layers = nn.ModuleList([initLSTMCell(input_size=self.hidden_size+self.embed_size if ((layer == 0) and attention) \
                                                  else self.embed_size if layer == 0 else self.hidden_size,
                                                  hidden_size=self.hidden_size,)for layer in range(self.num_layers)])

        self.linear_layer = nn.Linear(self.hidden_size, vocab_size)
        self.log_softmax = nn.LogSoftmax(dim=1)
        
    def forward(self, 
                decoder_inputs,
                context, 
                prev_hiddens,
                prev_context,
                encoder_outputs,
                source_lengths):
        
        batch_size = decoder_inputs.size(0)

        # embed
        embed_target = self.embedding(decoder_inputs)
        out = self.dropout_f(embed_target)
        
        if self.attention is not None:
            input_ = torch.cat([out.squeeze(1), context], dim = 1)
        else:
            input_ = out.squeeze(1)

        context_ = []
        decoder_hiddens_ = []

        for layer, rnn in enumerate(self.layers):
            hidden, con = rnn(input_, (prev_hiddens[layer],
                                       prev_context[layer]))
            input_ = self.dropout_f(hidden)
            decoder_hiddens_.append(hidden.unsqueeze(0))
            context_.append(con.unsqueeze(0))

        decoder_hiddens_ = torch.cat(decoder_hiddens_, dim = 0)
        context_ = torch.cat(context_, dim = 0)

        if self.attention is not None:
            out, attn_score = self.attention(hidden,
                                             encoder_outputs,
                                             source_lengths)
        else:
            out = hidden
            attn_score = None

        context_vec = out
        out = self.dropout_f(out)

        # linear: hidden_size -> vocab_size
        deco_out = self.linear_layer(out)
        deco_out = self.log_softmax(deco_out)

        return out_vocab, context_vec, decoder_hiddens_, context_, attn_score


#### Translator Function for Attention
For LSTMencoder and AttnDecoder

In [39]:
def translate_attn(encoder_model, decoder_model,
                   source_sentence, target_sentence,
                   source_lengths, target_lengths,
                   val=False):
    
    if val == False:
        
        teacher_forcing = True if random.random() < 0.6 else False

        batch_size, seq_len_source = source_sentence.size()
        
        encoder_out, encoder_hidden, encoder_context = encoder_model(source_sentence, source_lengths)
        
        max_source_length = max(source_lengths).item()
        max_target_length = max(target_lengths).item()
        
        prev_hiddens = encoder_hidden
        prev_context = encoder_context
        
        prev_ys = torch.zeros((batch_size, encoder_out.size(-1))).to(device)
        
        # decoder should start with SOS tokens at the first timestep
        decoder_input = torch.tensor([[SOS_token]]*batch_size).to(device)
        
        if teacher_forcing:
            
            decoder_out = []
            
            for time_step in range(max_target_length):
                
                out_, prev_ys, prev_hiddens,\ 
                prev_context, attn_score = decoder_model(decoder_input,
                                                         prev_ys,
                                                         prev_hiddens,
                                                         prev_context,
                                                         encoder_out,
                                                         source_lengths)

                decoder_out.append(out_.unsqueeze(-1))
                decoder_input = target_sentence[:,time_step].view(-1,1)
                
            decoder_out = torch.cat(decoder_out,
                                    dim=-1)

        else:
            
            decoder_out = []
            
            for time_step in range(max_target_length):
                
                out_, prev_ys, prev_hiddens,\
                prev_context, attn_score = decoder_model(decoder_input,
                                                         prev_ys,
                                                         prev_hiddens,
                                                         prev_context, 
                                                         encoder_out,
                                                         source_lengths)
                
                decoder_out.append(out_.unsqueeze(-1))
                top_scores, top_indices = out_.topk(1)
                decoder_input = top_indices.squeeze().detach().view(-1,1)

            decoder_out = torch.cat(decoder_out,
                                    dim=-1)
        return decoder_out
    
    else: # Val
        
        encoder_model.eval()
        decoder_model.eval()
        batch_size, seq_len_source = source_sentence.size()
        
        encoder_out, encoder_hidden, encoder_context = encoder_model(source_sentence, 
                                                                     source_lengths)
        max_source_length = max(source_lengths).item()
        max_target_length = max(target_lengths).item()
        
        prev_hiddens = encoder_hidden
        prev_context = encoder_context
        
        prev_ys = torch.zeros((batch_size, encoder_out.size(-1))).to(device)
        
        # SOS
        decoder_input = torch.tensor([[SOS_token]]*batch_size).to(device)
        
        decoder_out = []
        
        for i in range(max_target_length):
            
            out_, prev_ys, prev_hiddens, \
            prev_context, attn_score = decoder_model(decoder_input,
                                                     prev_ys,
                                                     prev_hiddens,
                                                     prev_context, 
                                                     encoder_out,
                                                     source_lengths)
            
            decoder_out.append(out_.unsqueeze(-1))
            top_scores, top_indices = out_.topk(1)
            decoder_input = top_indices.squeeze().detach().view(-1,1)

        decoder_out = torch.cat(decoder_out,dim=-1)
        
        return decoder_out

#### Training & Validation Functions

In [7]:
def train(encoder_optimizer,
          decoder_optimizer, 
          encoder_model, decoder_model, 
          loss_function,
          data_loader, 
          en_lang, # "vien_en_" for vietnamese -> eng, "zhen_en_" for chinese -> eng
          num_epochs=10, val_interval=1, rm = 0.8, 
          enc_scheduler=None, 
          dec_scheduler=None):

    mode_list = ["train","val_train"] # val_train, val every val_interval train epochs
    loss_hist = {"train": [], "val_train": []}
    BLEU_hist = {"train": [], "val": []}

    for epoch in range(num_epochs):
        print ("epoch", epoch)

        for ex, mode in enumerate(mode_list):
            
            start = time.time()
            total = 0
            top1_correct = 0
            running_loss = 0
            running_total = 0
            
            if mode == "train":
                encoder.train()
                decoder.train()
                
            elif mode == "val_train":
                encoder.eval()
                decoder.eval()
            else:
                raise ValueError
                
            for data in data_loader[mode]:
                
                encoder_optimizer.zero_grad()
                decoder_optimizer.zero_grad()

                encoder_input, decoder_input = data[0].to(device), data[1].to(device)
                source_lengths, target_lengths = data[2].to(device), data[3].to(device)

                if mode == "val_train":                
                    output = encode_decode_attn(encoder_model, decoder_model,
                                                encoder_input, decoder_input,
                                                source_lengths, target_lengths,
                                                rand_num=rm, val=True)
                else:
                    output = encode_decode_attn(encoder_model, decoder_model,
                                                encoder_input, decoder_input,
                                                source_lengths, target_lengths,
                                                rand_num=rm, val=False)
                    
                loss = loss_function(output.float(), 
                                     decoder_input[:,:output.size(-1)].long())
                
                batch = decoder_input.size(0)
                
                running_loss += loss.item()*batch
                
                total += batch
                
                if mode == "train":
                    
                    loss.backward()
                    
                    torch.nn.utils.clip_grad_norm_(encoder.parameters(), 0.15)
                    torch.nn.utils.clip_grad_norm_(decoder.parameters(), 0.15)
                    
                    encoder_optimizer.step()
                    decoder_optimizer.step()
                    
            epoch_loss = running_loss / total 
            loss_hist[mode].append(epoch_loss)
            print("epoch {} {} loss = {}, time = {}".format(epoch, mode, epoch_loss,
                                                                           time.time() - start))
        if (enc_scheduler is not None) and (dec_scheduler is not None):
            enc_scheduler.step(epoch_loss)
            dec_scheduler.step(epoch_loss)
            
        if epoch % val_interval == 0:
            val_bleu_score = eval_(encoder_model, decoder_model, data_loader["val"], en_lang)
            BLEU_hist["val"].append(val_bleu_score)
            print("validation BLEU = ", val_bleu_score)

    return encoder_model, decoder_model, loss_hist, BLEU_hist

def eval_(encoder, 
          decoder, 
          val_dataloader, 
          vien_en_, # change with zhen_en_ for chinese -> english
          ):
    
    encoder.eval()
    decoder.eval()
    
    pred_corpus = []
    ref_corpus = []

    for data in val_dataloader:
        
        encoder_input = data[0].to(device)
        source_lengths = data[2].to(device)
        
        batch_size, seq_len = encoder_input.size()[:2]
        
        encoder_out, encoder_hidden, encoder_context = encoder(encoder_input,
                                                               source_lengths)
        max_source_length = max(source_lengths).item()
        
        prev_hiddens = encoder_hidden
        prev_context = encoder_context
        decoder_input = torch.tensor([[SOS_token]]*batch_size).to(device)
        prev_output = torch.zeros((batch_size, encoder_out.size(-1))).to(device)
        
        decoder_out = []
        
        for i in range(seq_len*2):
            
            out_, prev_output, prev_hiddens,\
            prev_context, attention_score = decoder_model(decoder_input,
                                                          prev_output,
                                                          prev_hiddens,
                                                          prev_context, 
                                                          encoder_out,
                                                          source_lengths)
            top_scores, top_indices = out_.topk(1)
            decoder_out.append(top_indices.item())
            decoder_input = top_indices.squeeze().detach().view(-1,1)
            
            if top_indices.item() == EOS_token:
                break
        
        ref_corpus.append(data[-1])
        
        pred_sent = id2text_(decoder_out,vien_en_)
        pred_corpus.append(pred_sent)

    print ("true corpus", ref_corpus[:5])
    print ("pred corpus", pred_corpus[:5])
    
    # import above: from sacreBLEU.sacreBLEU import corpus_bleu
    score = corpus_bleu((" ").join(pred_corpus),
                        (" ").join(ref_corpus))[0]
    return score

### Test Set Performance

__enc7, dec7__ is our best encoder-decoder pair. Please see the __training \& validation__ process __below__.

Please search for transformed_dataset to see the mentioned dataloader.

transformed_dataset = {'train': Vietnamese(train_used), 
                       'val': Vietnamese(__test__, val = True), # changing val test with test set
                       'train_val':Vietnamese(train.iloc[:50], val = True),
                       'val_train':Vietnamese(val)
                                               }

In [89]:
test_data_vi = dataloader["val"] # changed val with test loader in the dict

eval_(enc7, dec7, test_data_vi, vien_en_, "attention")

true corpus ['it would be unconscionable .', 'then finally she said , &quot; the third thing i want you to promise me is that you &apos;ll never drink alcohol . &quot;', 'do you know what they call a 400 baseball hitter ?', 'because the pictures made it feel more real to you .', 'i gave her my whole rap , and when i finished she looked at me and she said , &quot; mmm mmm mmm . &quot;']
pred corpus ['but the more optimistic one is the pro-social', 'and so , i &apos;m not to , , , , , , , , , ,', 'you know , when you go to the theater , you see the people who are in the .', 'and you &apos;re in a wheelchair . you &apos;re in your', 'and she said , &quot; well , i']


23.27469722465982

In [91]:
test_data_vi = dataloader["val"] # changed val with test loader

BeamSearch(enc7, dec7, test_data_vi, vien_en_, 4)

BLEU score calculated on the validation set is  24.65522102888692


24.65522102888692

In [8]:
test_data_zh = dataloader["val"] # changed val with test loader

eval_(enc7, dec7, test_data_zh, zhen_en_, "attention")

BeamSearch(enc7, dec7, test_data_vi, vien_en_, 4)

NameError: name 'dataloader' is not defined

### Training & Validation Started Here

In [60]:
# encoder = RNNencoder_(vien_vi_.n_words,300,300, 2).to(device) # this is the old encoder
# decoder = AttnDecoderRNN(vien_en_.n_words,300,512,n_layers=2, attention = True).to(device)

In [61]:
# encoder_optimizer = optim.Adam(encoder.parameters(), lr=1e-4)
# enc_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(encoder_optimizer, min_lr=1e-4,  patience=0)
# decoder_optimizer = optim.Adam(decoder.parameters(), lr=1e-4)
# dec_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(decoder_optimizer, min_lr=1e-4,  patience=0)

In [5]:
# # vietnamese -> english
# criterion = nn.NLLLoss(ignore_index=0)
# enc, dec, loss_hist, acc_hist = train_model(encoder_optimizer, 
#                                             decoder_optimizer, 
#                                             encoder, decoder, 
#                                             criterion,
#                                             "attention", 
#                                             dataloader,vien_en_, 
#                                             num_epochs = 14, rm = 0.95,\
#                                            enc_scheduler = enc_scheduler, dec_scheduler = dec_scheduler)

In [63]:
# LASTRUN - START

encoder = LSTMencoder(vien_vi_.n_words,512,512, 2).to(device)
decoder = AttnDecoderRNN(vien_en_.n_words,512,1024,n_layers=2, attention=True).to(device)

encoder_optimizer = optim.Adam(encoder.parameters(), lr=1e-3)
enc_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(encoder_optimizer, 
                                                           min_lr=1e-4,  patience=0)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=1e-3)
dec_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(encoder_optimizer, 
                                                           min_lr=1e-4,  patience=0)

# vietnamese -> english
criterion = nn.NLLLoss(ignore_index=0)
enc, dec, loss_hist, acc_hist = train(encoder_optimizer,
                                      decoder_optimizer,
                                      encoder, decoder,
                                      criterion, "attention",
                                      dataloader,vien_en_,
                                      num_epochs = 5, rm = 0.95,
                                      enc_scheduler = enc_scheduler,
                                      dec_scheduler = dec_scheduler)

epoch 0
epoch 0 train loss = 5.554496881626442, time = 1236.9642140865326
epoch 0 val_train loss = 7.429110933809865, time = 4.553696870803833
true corpus ['it &apos;s my way of helping other victims , and it &apos;s my final request of you .', 'so we took a lot of samples from this road and we tested them in the lab .', 'this is a satellite picture showing north korea at night compared to neighbors .', 'you can zoom in and zoom out , you can wind back and fast forward .', 'it &apos;s not to say that our mothers aren &apos;t key in our success .']
pred corpus ['and i said , &quot; well , i &apos;m going to do that . &quot;', 'and we &apos;re not going to be a very important .', 'and the reason that we &apos;re going to do is that we &apos;re going to do that , and we &apos;re not going to be a very good .', 'and we &apos;re not going to be a very good .', 'and the reason that we &apos;re going to do is that we &apos;re going to do that .']
validation BLEU =  6.4868810256491765
epoch 1


In [64]:
encoder_optimizer = optim.Adam(enc.parameters(), lr=1e-3)
enc_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(encoder_optimizer, 
                                                           min_lr=5e-5,  patience=0)
decoder_optimizer = optim.Adam(dec.parameters(), lr=1e-3)
dec_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(encoder_optimizer, 
                                                           min_lr=5e-5,  patience=0)

# vietnamese -> english
criterion = nn.NLLLoss(ignore_index=0)
enc2, dec2, loss_hist, acc_hist = train(encoder_optimizer,
                                        decoder_optimizer, 
                                        enc, dec, # encoder - decoder returned above
                                        criterion,
                                        "attention",
                                        dataloader, vien_en_,
                                        num_epochs = 5, rm = 0.95,
                                        enc_scheduler = enc_scheduler, 
                                        dec_scheduler = dec_scheduler)

epoch 0
epoch 0 train loss = 3.9035900663819625, time = 1227.8427731990814
epoch 0 val_train loss = 6.266674137115478, time = 4.5217108726501465
true corpus ['the waters here have been recorded at reaching over 24 meters in height , and traveled over two miles inland .', 'i had my first apartment , my first little green american express card , and i had a very big secret .', 'this is when we started asking passing tourists to take the picture .', 'according to a quranic verse &quot; salam &quot; -- peace -- &quot; is the word of the all-merciful god , raheem . &quot;', 'they are bird-blending machines .']
pred corpus ['and i was was a the , , the , the the , , the the', 'and i was i i , , ,', 'and i was , the , , , , . the . the . the . the .', 'it &apos;s a shepherd that &apos;s been a the , &quot; the &quot;', 'and they &apos;re the , the ,']
validation BLEU =  12.725447875469124
epoch 1
epoch 1 train loss = 3.6219868814313747, time = 1229.3076164722443
epoch 1 val_train loss = 6.257

In [87]:
torch.save(dec2.state_dict(), "rnn_decoder_vietnamese_good.pth")
torch.save(enc2.state_dict(), "rnn_encoder_vietnamese_good.pth")

In [77]:
BeamSearch(enc2, dec2, dataloader["val"], vien_en_, 4)



BLEU score calculated on the validation set is  3.375153977073988


3.375153977073988

In [89]:
encoder_optimizer = optim.Adam(enc2.parameters(), lr=1e-4)
enc_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(encoder_optimizer, 
                                                           min_lr=5e-6,  patience=0)
decoder_optimizer = optim.Adam(dec2.parameters(), lr=1e-4)
dec_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(encoder_optimizer, 
                                                           min_lr=5e-6,  patience=0)

# vietnamese -> english
criterion = nn.NLLLoss(ignore_index=0)
enc3, dec3, loss_hist, acc_hist = train(encoder_optimizer,
                                        decoder_optimizer,
                                        enc2, dec2,
                                        criterion,
                                        "attention",
                                        dataloader, vien_en_,
                                        num_epochs = 3, rm = 0.95,
                                        enc_scheduler = enc_scheduler, 
                                        dec_scheduler = dec_scheduler)

epoch 0
epoch 0 train loss = 2.6355759009699433, time = 1231.4814236164093
epoch 0 val_train loss = 6.755882358551025, time = 4.598157644271851
true corpus ['i was just three years old when my brother came along , and i was so excited that i had a new being in my life .', 'hopefully less awkward than that one in the middle .', 'one of the smartest things conor did , from the very beginning , was to create the illusion that i was the dominant partner in the relationship .', 'i was able to leave , because of one final , sadistic beating that broke through my denial .', 'thank you so much . thank you .']
pred corpus ['and i was was i i to the , , i', 'that &apos;s the first time that the kid &apos;s . .', 'i &apos;ve been to the to of to the , , , , ,', 'i was i was , , , i to i to i to', 'thank you . thank you .']
validation BLEU =  16.029540231897602
epoch 1
epoch 1 train loss = 2.622434728581503, time = 1231.2793803215027
epoch 1 val_train loss = 6.832007227138597, time = 4.61272525787

In [92]:
torch.save(enc3, "best_attn_encoder_rnn.pth")
torch.save(dec3, "best_attn_decoder_rnn.pth")

In [93]:
BeamSearch(enc3, dec3, dataloader["val"], vien_en_, 4)

BLEU score calculated on the validation set is  19.15777061690079


19.15777061690079

In [94]:
BeamSearch(enc3, dec3, dataloader["val"], vien_en_, 3)

BLEU score calculated on the validation set is  19.721609375445407


19.721609375445407

In [95]:
BeamSearch(enc3, dec3, dataloader["val"], vien_en_, 2)

BLEU score calculated on the validation set is  19.943231471415235


19.943231471415235

In [41]:
enc3 = torch.load("best_attn_encoder_rnn.pth")
dec3 = torch.load("best_attn_decoder_rnn.pth")

In [47]:
encoder_optimizer = optim.Adam(enc3.parameters(), lr=5e-5)
enc_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(encoder_optimizer, 
                                                           min_lr=5e-6,  patience=0)
decoder_optimizer = optim.Adam(dec3.parameters(), lr=5e-5)
dec_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(encoder_optimizer, 
                                                           min_lr=5e-6,  patience=0)

# vietnamese -> english
criterion = nn.NLLLoss(ignore_index=0)

enc4, dec4, loss_hist, acc_hist = train(encoder_optimizer,
                                        decoder_optimizer,
                                        enc3, dec3,
                                        criterion,
                                        "attention",
                                        dataloader,vien_en_,
                                        num_epochs = 2, rm = 0.95,
                                        enc_scheduler = enc_scheduler, 
                                        dec_scheduler = dec_scheduler)

epoch 0




epoch 0 train loss = 2.4629972229044714, time = 1234.3856925964355
epoch 0 val_train loss = 7.045825487253618, time = 4.654397010803223
true corpus ['penn state asked me , a communications teacher , to teach a communications class for engineering students .', 'but nobody helped them , because they were so focused on taking care of themselves and their families .', 'that &apos;s the real zipper .', 'it could take an hour . it could take weeks .', 'here &apos;s me on the soccer team and in v magazine .']
pred corpus ['i &apos;m asking you to read , not just how hard the children are , but also the best way to do it .', 'and i think that to the , , , , , ,', 'and i think it &apos;s a the the .', 'and i &apos;m going to to', 'i &apos;ve been working with projects like this .']
validation BLEU =  17.5791288832173
epoch 1
epoch 1 train loss = 2.4973496287396157, time = 1235.0308747291565
epoch 1 val_train loss = 7.077768590498944, time = 4.568317413330078
true corpus ['extraordinary .', 'tha

In [48]:
BeamSearch(enc4, dec4, dataloader["val"], vien_en_, 3)

BLEU score calculated on the validation set is  20.28189574524616


20.28189574524616

In [49]:
encoder_optimizer = optim.Adam(enc4.parameters(), lr=3e-5)
enc_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(encoder_optimizer, 
                                                           min_lr=5e-6,  patience=0)
decoder_optimizer = optim.Adam(dec4.parameters(), lr=3e-5)
dec_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(encoder_optimizer, 
                                                           min_lr=5e-6,  patience=0)

# vietnamese -> english
criterion = nn.NLLLoss(ignore_index=0)
enc5, dec5, loss_hist, acc_hist = train(encoder_optimizer,
                                        decoder_optimizer,
                                        enc4, dec4,
                                        criterion,
                                        "attention", 
                                        dataloader, vien_en_,
                                        num_epochs = 2, rm = 0.95,
                                        enc_scheduler = enc_scheduler, 
                                        dec_scheduler = dec_scheduler)

epoch 0
epoch 0 train loss = 2.408123344059025, time = 1232.0732686519623
epoch 0 val_train loss = 7.148522694256841, time = 4.555002689361572
true corpus ['so the last question people ask me is , &quot; what is it like to be a model ? &quot;', 'but i will do a trial .', 'he looked at me suspiciously , but luckily he believed me .', 'so i put the specimen in , which i &apos;m now going to take out to see what happened .', 'it &apos;s difficult to witness something so overwhelming .']
pred corpus ['and i said , &quot; well , i', 'so , i &apos;ve been to a to', 'and he said , &quot; well , i', 'it &apos;s a little bit like the one that i &apos;m going to .', 'i was a the the , , ,']
validation BLEU =  18.432938641819742
epoch 1
epoch 1 train loss = 2.423812474525007, time = 1232.1433029174805
epoch 1 val_train loss = 7.110661590342619, time = 4.557958364486694
true corpus ['you have to learn how to get these people to come and talk to you .', 'we are lying on the floor together , and our

In [50]:
# Chinese -> English - do not run with the same dataloader !!!
BeamSearch(enc4, dec4, dataloader["val"], zhen_en_, 3)

BLEU score calculated on the validation set is  20.43469747827706


20.43469747827706

In [51]:
BeamSearch(enc5, dec5, dataloader["val"], vien_en_, 3)

BLEU score calculated on the validation set is  20.43054028818054


20.43054028818054

In [55]:
encoder_optimizer = optim.Adam(enc4.parameters(), lr=5e-5)
enc_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(encoder_optimizer, 
                                                           min_lr=1e-6,  patience=0)
decoder_optimizer = optim.Adam(dec4.parameters(), lr=5e-5)
dec_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(encoder_optimizer, 
                                                           min_lr=1e-6,  patience=0)

# vietnamese -> english
criterion = nn.NLLLoss(ignore_index=0)

enc6, dec6, loss_hist6, acc_hist6 = train(encoder_optimizer,
                                          decoder_optimizer,
                                          enc4, dec4, # 4 was better than 5
                                          criterion,
                                          "attention",
                                          dataloader, vien_en_,
                                          num_epochs = 5, rm = 0.95,
                                          enc_scheduler = enc_scheduler, 
                                          dec_scheduler = dec_scheduler)

epoch 0
epoch 0 train loss = 2.3565232976972266, time = 1231.4803340435028
epoch 0 val_train loss = 7.174542779338603, time = 4.53424859046936
true corpus ['the day of the tsunami , he &apos;d actually been in charge of making sure the tsunami gates were closed .', 'and first , i commend you on your model knowledge . very impressive .', 'so i decided to start a lawsuit against them , because i wanted to have this information .', 'and of course , everything in africa grew beautifully .', 'the journey by bus took one week , and we were almost caught several times .']
pred corpus ['so , we have a . , , , .', 'i want you to just think it &apos;s a great thing to be a , .', 'and i said , &quot; well , i', 'there &apos;s a lot of work in africa .', 'and the next morning , i &apos;d be walking around the table , and i was going to go to the']
validation BLEU =  18.336935850030034
epoch 1
epoch 1 train loss = 2.3440570105882537, time = 1232.2761716842651
epoch 1 val_train loss = 7.227972398485

In [56]:
BeamSearch(enc6, dec6, dataloader["val"], vien_en_, 3)

BLEU score calculated on the validation set is  20.92370830109886


20.92370830109886

In [57]:
encoder_optimizer = optim.Adam(enc6.parameters(), lr=1e-5)
enc_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(encoder_optimizer, 
                                                           min_lr=1e-6,  patience=0)
decoder_optimizer = optim.Adam(dec6.parameters(), lr=1e-5)
dec_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(encoder_optimizer, 
                                                           min_lr=1e-6,  patience=0)

# vietnamese -> english
criterion = nn.NLLLoss(ignore_index=0)

enc7, dec7, loss_hist6, acc_hist6 = train(encoder_optimizer,
                                          decoder_optimizer,
                                          enc6, dec6,
                                          criterion,
                                          "attention",
                                          dataloader, vien_en_,
                                          num_epochs = 5, rm = 0.95,
                                          enc_scheduler = enc_scheduler, 
                                          dec_scheduler = dec_scheduler)

epoch 0
epoch 0 train loss = 2.3299497075632645, time = 1234.203326702118
epoch 0 val_train loss = 7.314455038187456, time = 4.549794673919678
true corpus ['i can feel the brush of sweaty bodies passing me in the darkness , but i can &apos;t see much else .', 'we &apos;re pale , gray creatures .', 'thank you .', 'my councilman even called in and said how they endorse and love what we &apos;re doing .', 'you never arrive in a community with any ideas , and you sit with the local people .']
pred corpus ['the difference between the two in the morning is that , up to a number of meters , i &apos;m wearing a lot of these minus sheep that have to go off the mountain to the nearest sea , and to the ,', 'we &apos;re trying to make them grow alive .', 'thank you .', 'so we started to think , what . ,', 'and we have a lot of . , , . , . . .']
validation BLEU =  19.317700375205195
epoch 1
epoch 1 train loss = 2.27979019665766, time = 1233.5050673484802
epoch 1 val_train loss = 7.293009487463504, 

In [10]:
# encoder_optimizer = optim.Adam(enc7.parameters(), lr=5e-6)
# enc_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(encoder_optimizer, min_lr=9e-7,  patience=0)
# decoder_optimizer = optim.Adam(dec7.parameters(), lr=5e-6)
# dec_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(encoder_optimizer, min_lr=9e-7,  patience=0)

# # vietnamese -> english
# criterion = nn.NLLLoss(ignore_index=0)
# enc8, dec8, loss_hist6, acc_hist6 = train(encoder_optimizer, 
#                                             decoder_optimizer, 
#                                             enc7, dec7, 
#                                             criterion,
#                                             "attention", 
#                                             dataloader,vien_en_, 
#                                             num_epochs = 5, rm = 0.95,\
#                                            enc_scheduler = enc_scheduler, dec_scheduler = dec_scheduler)

In [70]:
encoder_optimizer = optim.Adam(enc7.parameters(), lr=1e-6)
enc_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(encoder_optimizer, 
                                                           min_lr=1e-7,  patience=0)
decoder_optimizer = optim.Adam(dec7.parameters(), lr=1e-6)
dec_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(encoder_optimizer, 
                                                           min_lr=1e-7,  patience=0)

# vietnamese -> english
criterion = nn.NLLLoss(ignore_index=0)
enc8, dec8, loss_hist6, acc_hist6 = train(encoder_optimizer,
                                          decoder_optimizer,
                                          enc7, dec7,
                                          criterion,
                                          "attention",
                                          dataloader, vien_en_,
                                          num_epochs = 5, rm = 0.95,
                                          enc_scheduler = enc_scheduler,
                                          dec_scheduler = dec_scheduler)

epoch 0
epoch 0 train loss = 2.253924560923617, time = 1232.7297248840332
epoch 0 val_train loss = 7.296722560026208, time = 4.623219966888428
true corpus ['it was an isolated incident , and he was never going to hurt me again .', 'i remarried a kind and gentle man , and we have those three kids .', 'so luckily i brought an outfit change .', 'there she is .', 'because you can see where i am , where i sleep at night , what i am doing .']
pred corpus ['and then , of course , they don &apos;t .', 'i am a dad .', 'i &apos;ve been working with a to to', 'she &apos;s a very , , ,', 'and i know that as a kid , i have to do something about it .']
validation BLEU =  19.133692547201704
epoch 1
epoch 1 train loss = 2.2141385778340186, time = 1231.6295711994171
epoch 1 val_train loss = 7.308412345574826, time = 4.6126954555511475
true corpus ['i was very lucky to grow up in a family where education was prized and daughters were treasured .', 'the next question people always ask me is , &quot; do t

In [62]:
# [BeamSearch(enc7, dec7, dataloader["val"], vien_en_, 8) for x in range(10)]
# scores go down as we increase the beam size too much!

In [61]:
[BeamSearch(enc7, dec7, dataloader["val"], vien_en_, 3) for x in range(10)]

BLEU score calculated on the validation set is  20.81742572124765
BLEU score calculated on the validation set is  20.74348936315408
BLEU score calculated on the validation set is  20.78065253900282
BLEU score calculated on the validation set is  20.822702964482854
BLEU score calculated on the validation set is  20.786162523535292
BLEU score calculated on the validation set is  20.77832767049285
BLEU score calculated on the validation set is  20.8010395838305
BLEU score calculated on the validation set is  20.746004956538503
BLEU score calculated on the validation set is  20.846297169599904
BLEU score calculated on the validation set is  20.76642701381443


[20.81742572124765,
 20.74348936315408,
 20.78065253900282,
 20.822702964482854,
 20.786162523535292,
 20.77832767049285,
 20.8010395838305,
 20.746004956538503,
 20.846297169599904,
 20.76642701381443]

In [68]:
[BeamSearch(enc7, dec7, dataloader["val"], vien_en_, 2) for x in range(2)]

BLEU score calculated on the validation set is  20.999334492052036
BLEU score calculated on the validation set is  21.072687856938643


[20.999334492052036, 21.072687856938643]

In [69]:
[BeamSearch(enc7, dec7, dataloader["val"], vien_en_, 2) for x in range(20)]

BLEU score calculated on the validation set is  20.997950236516527
BLEU score calculated on the validation set is  20.967508433357416
BLEU score calculated on the validation set is  20.987145049548253
BLEU score calculated on the validation set is  21.107329578393895
BLEU score calculated on the validation set is  20.92650789232682
BLEU score calculated on the validation set is  21.014706833233507
BLEU score calculated on the validation set is  20.957583094547942
BLEU score calculated on the validation set is  21.05644986944498
BLEU score calculated on the validation set is  20.937051853904997
BLEU score calculated on the validation set is  21.011069857017333
BLEU score calculated on the validation set is  21.015142363469657
BLEU score calculated on the validation set is  20.99320398021949
BLEU score calculated on the validation set is  20.949605129255378
BLEU score calculated on the validation set is  20.99183222094368
BLEU score calculated on the validation set is  21.066599009583822

[20.997950236516527,
 20.967508433357416,
 20.987145049548253,
 21.107329578393895,
 20.92650789232682,
 21.014706833233507,
 20.957583094547942,
 21.05644986944498,
 20.937051853904997,
 21.011069857017333,
 21.015142363469657,
 20.99320398021949,
 20.949605129255378,
 20.99183222094368,
 21.066599009583822,
 21.00828760056704,
 20.991591556416193,
 21.006153810510092,
 21.008781753343847,
 20.98935781975807]

In [74]:
# save best encoder - attn
torch.save(enc7, "best_attn_rnn_totest.pth")

In [75]:
# save best decoder - attn
torch.save(dec7, "best_attn_rnn_DEC_totest.pth")

  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


In [45]:
# BeamSearch
def BeamSearch(encoder_model, 
               decoder_model, 
               val_loader,
               en_, # vien_en_ or zhen_en_
               beam_size, 
               device = "cuda"):
    
    """
    Params
    :encoder_model: Trained RNN or CNN encoder model.
    :decoder_model: Trained RNN or RNN w/attention decoder model.
    :val_loader: Validation dataloader object.
    :en_: The English language (target) object of the passed val_loader language.
    :beam_size: The function selects beam_size-many hypotheses with the highest log prob
                at each timestep.

    Returns
    :4-gram precision BLEU score for the given validation data.
    """

    encoder_model.eval()
    decoder_model.eval()

    model_corpus = []
    reference_corpus = []

    encoder_model = encoder_model.to(device)
    decoder_model = decoder_model.to(device)

    running_loss = 0
    running_total = 0

    # iterate over val_loader until computing the final
    # corpus-level BLEU score
    for sentence_pair in val_loader:
        # encoder input = source sentence
        encoder_input = sentence_pair[0].to(device)
        source_lengths = sentence_pair[2].to(device)

        seqlen_ = torch.max(source_lengths) # max_len
        batch_size, seq_len = encoder_input.size()[:2]
        
        encoder_out, encoder_hidden, encoder_context = encoder_model(encoder_input,
                                                                     source_lengths)
        
        prev_hiddens, prev_context = encoder_hidden, encoder_context

        # first input to the decoder should be SOS tokens (1)
        decoder_input = torch.tensor([[SOS_token]]*batch_size).to(device)

        prev_output = torch.zeros((batch_size, 
                                   encoder_out.size(-1))).to(device)

        decoder_input_list = [None]*beam_size

        end_beam = [False]*beam_size

        # init beam scores - batch_size x beam_size
        beam_scores = torch.zeros((batch_size,beam_size)).to(device)

        decoder_out_list = [[]]*beam_size

        for t in range(seq_len+20):

            if t == 0:

                outs, prev_output, prev_hiddens, \
                prev_context, attn_score = decoder_model(decoder_input,
                                                              prev_output,
                                                              prev_hiddens,
                                                              prev_context, 
                                                              en_out,
                                                              source_lengths)
                
                # get top beam_size-many scores and their indices
                top_scores, top_indices = outs.topk(beam_size)
                out_s, vocab_size = outs.size()

                prev_out_list = [prev_output]*beam_size
                prev_hidden_list = [prev_hiddens]*beam_size
                prev_context_list = [prev_context]*beam_size

                for beam_i in range(beam_size):

                    beam_scores[0][beam_i] = top_scores[0][beam_i].item()
                    decoder_input_list[beam_i] = top_indices[0][beam_i].squeeze().detach().\
                                                                                    view(-1,1)
                    decoder_out_list[beam_i].append(top_indices[0][beam_i].item())

                    if top_indices[0][beam_i].item() == EOS_token:
                        end_beam[beam_i] = True

            else:

                out_t, hidden_t, context_t, hold_beam = beam_size*[None], beam_size*[None], \
                                                        beam_size*[None], beam_size*[None]
                
                prev_ys = copy.deepcopy(decoder_out_list)

                for beam_i in [*range(beam_size)]:

                    if not end_beam[beam_i]:

                        hold_beam[beam_i], out_t[beam_i], hidden_t[beam_i], \
                        context_t[beam_i], attn_score = decoder_model(decoder_input_list[beam_i],
                                                                        prev_out_list[beam_i],
                                                                        prev_hidden_list[beam_i],
                                                                        prev_context_list[beam_i],
                                                                        encoder_out,
                                                                        source_lengths)

                        hold_beam[beam_i] = hold_beam[beam_i] + beam_scores[0][beam_i]

                    if end_beam[beam_i]:

                        hold_beam[beam_i] = torch.zeros(out_s, vocab_size).fill_(-np.inf).to(device)

                hold_beam = torch.cat(hold_beam, dim=1)
                top_scores, top_indices = hold_beam.topk(beam_size)

                hidden_id = top_indices//vocab_size
                top_indices_ = top_indices%vocab_size

                for beam_i in range(beam_size):

                    if not end_beam[beam_i]:

                        beam_scores[0][beam_i] = top_scores[0][beam_i].item()
                        list_decoder_input[beam_i] = top_indices_[0][beam_i].squeeze().detach().view(-1,1)
                        decoder_out_list[beam_i] = copy.deepcopy(prev_ys[hidden_id[0][beam_i]])
                        decoder_out_list[beam_i].append(top_indices_[0][beam_i].item())

                        # <EOS>
                        if top_indices_[0][beam_i].item() == EOS_token:
                            end_beam[beam_i] = True

                        else:
                            prev_out_list[beam_i] = out_t[hidden_id[0][beam_i]]
                            prev_context_list[beam_i] = context_t[hidden_id[0][beam_i]]
                            prev_hidden_list[beam_i] = hidden_t[hidden_id[0][beam_i]]
                            
                # all batch <EOS>
                if all(end_beam):
                    break

        max_score_id = np.argmax(beam_scores)

        decoder_out = decoder_out_list[max_score_id]

        reference_corpus.append(sentence_pair[-1]) # true/reference sentence
        pred_sentence = id2text_(decoder_out, en_) # predicted sentence
        model_corpus.append(pred_sentence)
    
    # import above - from sacreBLEU.sacreBLEU import corpus_bleu
    # WARNING: Do not forget to join the lists before computing BLEU score!
    # Otherwise your BLEU score will be far below the true one.
    bleu_score = corpus_bleu((" ").join(model_corpus),
                             (" ").join(reference_corpus))[0]

    print ("BLEU score calculated on the validation set is ", score)

    return bleu_score

### 2.3 CNN Encoder + RNN Decoder

In [351]:
# CNNencoder
class CNNencoder(nn.Module):

    def __init__(self, 
                 vocab_size, 
                 embed_size, 
                 hidden_size, 
                 kernel_size, 
                 num_layers,
                 percent_dropout=0.3):
        
        super(CNNencoder, self).__init__()

        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.kernel_size = kernel_size
        self.embed_size = embed_size
        self.vocab_size = vocab_size

        self.embedding = nn.Embedding(self.vocab_size, 
                                      self.embed_size, 
                                      padding_idx=0)
        
        self.dropout_f = nn.Dropout(percent_dropout)
        
        in_channels = self.embed_size
        
        self.conv1 = nn.Conv1d(in_channels,
                              self.hidden_size,
                              kernel_size, 
                              padding=kernel_size//2)
        
        # todo
        self.conv2 = nn.Conv1d(60, self.hidden_size, 
                               kernel_size,
                               padding=kernel_size//2)
        
        self.ReLU = nn.ReLU()

    def forward(self, source_sentence):
        
        batch_size, seq_len = source_sentence.size()
        
        embeds_source = self.embedding(source_sentence)
        
        out = self.conv1(embeds_source.transpose(1, 2)).transpose(1,2)
        out = self.ReLU(out)
        out = F.max_pool1d(out, kernel_size=5, stride=5)
        
        out = self.conv2(out.transpose(1, 2)).transpose(1,2)
        out = self.ReLU(out)
        out = torch.mean(out, dim=1).view(1, batch_size, self.hidden_size)
    
        return out

In [352]:
def translate_cnn(encoder_model,
                  decoder_model,
                  source_sentence,
                  target_sentence,
                  source_lengths):
    
    use_teacher_forcing = True if random.random() < 0.6 else False
    
    batch_size = source_sentence.size(0)
    
    encoder_hidden = encoder_model(source_sentence)
    
    decoder_hidden = encoder_hidden
    
    decoder_input = torch.tensor([[SOS_token]]*batch_size).to(device)

    if use_teacher_forcing:
        
        decoder_out = []
         
        for time_step in range(MAX_SENTENCE_LENGTH):
            
            decoder_output, decoder_hidden = decoder_model(decoder_input,
                                                           decoder_hidden,
                                                           encoder_outputs=None)
            decoder_out.append(decoder_output.unsqueeze(-1))
            decoder_input = target_sentence[:,time_step].view(-1,1)
            
        decoder_out = torch.cat(decoder_out,
                                dim=-1)
    else:
        
        decoder_out = []
        for time_step in range(MAX_SENTENCE_LENGTH):
            
            decoder_output, decoder_hidden = decoder_model(decoder_input,
                                                           decoder_hidden,
                                                           encoder_output)
            
            decoder_out.append(decoder_output.unsqueeze(-1))
            top_scores, top_indices = decoder_output.topk(1)
            decoder_input = top_indices.squeeze().detach().view(-1,1)
            
        decoder_out = torch.cat(decoder_out,
                                dim=-1)
        
    return decoder_out, decoder_hidden


def train_cnn(encoder_optimizer,
              decoder_optimizer,
              encoder_model, decoder_model,
              loss_function,
              data_loader,
              en_lang, # "vien_en_" for vietnamese -> eng, "zhen_en_" for chinese -> eng
              num_epochs=10, val_interval=1, rm = 0.8, 
              enc_scheduler=None, 
              dec_scheduler=None):

    mode_list = ["train","val_train"] # val_train, val every val_interval train epochs
    loss_hist = {"train": [], "val_train": []}
    BLEU_hist = {"train": [], "val": []}

    for epoch in range(num_epochs):
        print ("epoch", epoch)

        for ex, mode in enumerate(mode_list):
            
            start = time.time()
            total = 0
            top1_correct = 0
            running_loss = 0
            running_total = 0
            
            if mode == "train":
                encoder.train()
                decoder.train()
                
            elif mode == "val_train":
                encoder.eval()
                decoder.eval()
            else:
                raise ValueError
                
            for data in data_loader[mode]:
                
                encoder_optimizer.zero_grad()
                decoder_optimizer.zero_grad()

                encoder_input, decoder_input = data[0].to(device), data[1].to(device)
                source_lengths, target_lengths = data[2].to(device), data[3].to(device)

                if mode == "val_train":                
                    
                    output = encode_decode_cnn(encoder_model, decoder_model,
                                               encoder_input, decoder_input,
                                               source_lengths)
                else:
                    output = encode_decode_cnn(encoder_model, decoder_model,
                                               encoder_input, decoder_input,
                                               source_lengths)
                    
                loss = loss_function(output.float(), 
                                     decoder_input[:,:output.size(-1)].long())
                
                batch = decoder_input.size(0)
                
                running_loss += loss.item()*batch
                
                total += batch
                
                if mode == "train":
                    
                    loss.backward()
                    
                    torch.nn.utils.clip_grad_norm_(encoder.parameters(), 0.15)
                    torch.nn.utils.clip_grad_norm_(decoder.parameters(), 0.15)
                    
                    encoder_optimizer.step()
                    decoder_optimizer.step()
                    
            epoch_loss = running_loss / total 
            loss_hist[mode].append(epoch_loss)
            print("epoch {} {} loss = {}, time = {}".format(epoch, mode, epoch_loss,
                                                                           time.time() - start))
        if (enc_scheduler is not None) and (dec_scheduler is not None):
            enc_scheduler.step(epoch_loss)
            dec_scheduler.step(epoch_loss)
            
        if epoch % val_interval == 0:
            val_bleu_score = eval_(encoder_model, decoder_model, data_loader["val"], en_lang)
            BLEU_hist["val"].append(val_bleu_score)
            print("validation BLEU = ", val_bleu_score)

    return encoder_model, decoder_model, loss_hist, BLEU_hist

In [354]:
learning_rate = 1e-4
bi=True
# bi=False
encoder_ = CNNencoder(zhen_zh_.n_words,
                      300,300, 5,1,
                      percent_dropout=0.3).to(device)

decoder_ = RNNdecoder(300,zhen_en_.n_words).to(device)
encoder_optimizer = optim.Adam(encoder_.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder_.parameters(), lr=learning_rate)
criterion = nn.NLLLoss(ignore_index=0)

In [None]:
# train with encode decode cnn

In [355]:
enc, dec, loss_hist, bleu_hist = train_cnn(encoder_optimizer, decoder_optimizer,
                                           encoder_, decoder_,
                                           criterion, zhen_loader, zhen_en_,
                                           num_epochs=5)

epoch 0




Validation Loss =  2.0075799999086135
Validation BLEU =  6.015097639602013




Training Loss =  1.9711387605708715
Traning BLEU =  6.245280354192027
epoch 0 train loss = 2.2194282125410028, accurancy = 0 time = 1621.194352388382
epoch 1




Validation Loss =  1.9987231105064898
Validation BLEU =  5.756828060121056


KeyboardInterrupt: 