<h1 id="tocheading">Spring 2018 NLP Class Project: Neural Machine Translation</h1>
<div id="toc"></div>

In [1]:
%%javascript
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')

<IPython.core.display.Javascript object>

In [3]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import pandas as pd
import spacy
import pdb
import os
from underthesea import word_tokenize
import jieba
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader
import pickle as pkl
import time

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

# running on cpu
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu" if torch.cuda.is_available() else "cpu")

## Part 0: Project Overview

The goal of this project is to build a neural machine translation system and experience how recent advances have made their way. Each team will build the following sequence of neural translation systems for two language pairs, __Vietnamese (Vi)→English (En)__ and __Chinese (Zh)→En__ (prepared corpora is be provided):

1. Recurrent neural network based encoder-decoder without attention
2. Recurrent neural network based encoder-decoder with attention
2. Replace the recurrent encoder with either convolutional or self-attention based encoder.
4. [Optional] Build either or both fully self-attention translation system or/and multilingual translation system.

## Part 1: Data Upload & Preprocessing

In [4]:
UNK_IDX = 2
PAD_IDX = 0
SOS_token = 1
EOS_token = 3

In [6]:
def read_dataset(path):
    f = open(path)
    list_l = []
    for line in f:
        list_l.append(line.strip())
    df = pd.DataFrame()
    df["data"] = list_l
    return df

# vietnamese -> english
vien_en_train = read_dataset("data/iwslt-vi-en/train.tok.en")
vien_en_val = read_dataset("data/iwslt-vi-en/dev.tok.en")

vien_vi_train = read_dataset("data/iwslt-vi-en/train.tok.vi")
vien_vi_val = read_dataset("data/iwslt-vi-en/dev.tok.vi")

# chinese -> english
zhen_en_train = read_dataset("data/iwslt-zh-en/train.tok.en")
zhen_en_val = read_dataset("data/iwslt-zh-en/dev.tok.en")

zhen_zh_train = read_dataset("data/iwslt-zh-en/train.tok.zh")
zhen_zh_val = read_dataset("data/iwslt-zh-en/dev.tok.zh")

In [8]:
# chinese -> english
zhen_train = pd.DataFrame()
zhen_train["en_data"] = zhen_en_train["data"]
zhen_train["zh_data"] = zhen_zh_train["data"]

# vietnamese -> english
vien_train = pd.DataFrame()
vien_train["en_data"] = vien_en_train["data"]
vien_train["vi_data"] = vien_vi_train["data"]

# chinese -> english
zhen_val = pd.DataFrame()
zhen_val["en_data"] = zhen_en_val["data"]
zhen_val["zh_data"] = zhen_zh_val["data"]

# vietnamese -> english
vien_val = pd.DataFrame()
vien_val["en_data"] = vien_en_val["data"]
vien_val["vi_data"] = vien_vi_val["data"]

In [9]:
class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "<SOS>", 1: "<EOS>", 2:"<UNK>",3:"<PAD>"}
        self.n_words = 4

    def addSentence(self, sentence):
        for word in sentence.split(" "):
            self.addWord(word.lower())

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [10]:
def unicodeToAscii(s):
    """About "NFC" and "NFD": 
    
    For each character, there are two normal forms: normal form C 
    and normal form D. Normal form D (NFD) is also known as canonical 
    decomposition, and translates each character into its decomposed form. 
    Normal form C (NFC) first applies a canonical decomposition, then composes 
    pre-combined characters again.
    
    About unicodedata.category: 
    
    Returns the general category assigned to the Unicode character 
    unichr as string."""
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

In [11]:
# vietnamese -> english
vien_en_ = Lang("vien_en")
for s in vien_train["en_data"]:
    vien_en_.addSentence(s)
    
vien_vi_ = Lang("vi")
for s in vien_train["vi_data"]:
    vien_vi_.addSentence(s)
    
# chinese -> english
zhen_en_ = Lang("zhen_en")
for s in zhen_train["en_data"]:
    zhen_en_.addSentence(s)
    
zhen_zh_ = Lang("zh")
for s in zhen_train["zh_data"]:
    zhen_zh_.addSentence(s)

In [15]:
def split(data, lang="vi"):
    data["en_tokenized"] = data["en_data"].apply(lambda x: x.lower().split( ))
    
    if lang == "vi":
        data["vi_tokenized"] = data["vi_data"].apply(lambda x: x.lower().split( ))
    else:
        data["zh_tokenized"] = data["zh_data"].apply(lambda x: x.lower().split( ))
    return data

In [16]:
# training

# vietnamese -> english
vien_train = split(vien_train, lang="vi")

# chinese -> english
zhen_train = split(zhen_train, lang="zh")

In [18]:
zhen_train.head(3)

Unnamed: 0,en_data,zh_data,en_tokenized,zh_tokenized
0,Life in the deep oceans,深海 海中 的 生命 大卫 盖罗,"[life, in, the, deep, oceans]","[深海, 海中, 的, 生命, 大卫, 盖罗]"
1,With vibrant video clips captured by submarine...,大卫 盖罗 通过 潜水 潜水艇 拍下 的 影片 把 我们 带到 了 地球 最 黑暗 ...,"[with, vibrant, video, clips, captured, by, su...","[大卫, 盖罗, 通过, 潜水, 潜水艇, 拍下, 的, 影片, 把, 我们, 带到, 了,..."
2,This is Bill Lange . I &apos;m Dave Gallo .,大卫 盖罗 这位 是 比尔 兰格 我 是 大卫 盖罗,"[this, is, bill, lange, ., i, &apos;m, dave, g...","[大卫, 盖罗, 这位, 是, 比尔, 兰格, 我, 是, 大卫, 盖罗]"


In [19]:
# validation

# vietnamese -> english
vien_val = split(vien_val, lang="vi")

# chinese -> english
zhen_val = split(zhen_val, lang="zh")

In [20]:
def token2index_dataset(data, source_language="zh"):
    
    if source_language == "zh":
        # chinese -> english
        for language in ["en","zh"]:
            indices_data = []
            if language == "en":
                lang_obj = zhen_en_
            else:
                lang_obj = zhen_zh_
                
            for tokens in data[language + "_tokenized"]:
                
                index_list = [lang_obj.word2index[token] if token in lang_obj.word2index else UNK_IDX for token in tokens]
                index_list.append(EOS_token)
                indices_data.append(index_list)
                
            data[language + "_indices"] = indices_data
    else:
        # vietnamese -> english
        for language in ["en","vi"]:
            indices_data = []
            if language == "en":
                lang_obj = vien_en_
            else:
                lang_obj = vien_vi_
                
            for tokens in data[language + "_tokenized"]:
                
                index_list = [lang_obj.word2index[token] if token in lang_obj.word2index else UNK_IDX for token in tokens]
                index_list.append(EOS_token)
                indices_data.append(index_list)
                
            data[language + "_indices"] = indices_data
            
    return data

In [22]:
# training

# vietnamese -> english
vien_train = token2index_dataset(vien_train, source_language="vi")

# chinese -> english
zhen_train = token2index_dataset(zhen_train, source_language="zh")

# validation

# vietnamese -> english
vien_val = token2index_dataset(vien_val, source_language="vi")

# chinese -> english
zhen_val = token2index_dataset(zhen_val, source_language="zh")

In [24]:
# Datasets
from torch.utils.data import Dataset

# vietnamese -> english
class Vietnamese(Dataset):
    def __init__(self, data):
        self.data = data
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        
        english = self.data.iloc[idx]["en_indices"]
        vietnamese = self.data.iloc[idx]["vi_indices"]
        en_lengths = self.data.iloc[idx]["en_lengths"]
        vi_lengths = self.data.iloc[idx]["vi_lengths"]
        
        return [english, vietnamese, en_lengths, vi_lengths]
    
# chinese -> english
class Chinese(Dataset):
    def __init__(self, data):
        self.data = data
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        
        english = self.data.iloc[idx]["en_indices"]
        chinese = self.data.iloc[idx]["zh_indices"]
        en_lengths = self.data.iloc[idx]["en_lengths"]
        zh_lengths = self.data.iloc[idx]["zh_lengths"]
        
        return [english, chinese, en_lengths, zh_lengths]

In [28]:
# get lengths 

# vietnamese -> english
# train
vien_train["en_lengths"] = vien_train["en_indices"].apply(lambda x: len(x))
vien_train["vi_lengths"] = vien_train["vi_indices"].apply(lambda x:len(x))

vien_train = vien_train[np.logical_and(vien_train["en_lengths"]>=2,vien_train["vi_lengths"]>=2)]
vien_train = vien_train[vien_train["vi_lengths"]<=30]

# val
vien_val["en_lengths"] = vien_val["en_indices"].apply(lambda x: len(x))
vien_val["vi_lengths"] = vien_val["vi_indices"].apply(lambda x:len(x))

vien_val = vien_val[np.logical_and(vien_val["en_lengths"]>=2,vien_val["vi_lengths"]>=2)]
vien_val = vien_val[vien_val["vi_lengths"]<=30]

# chinese -> english
# train
zhen_train["en_lengths"] = zhen_train["en_indices"].apply(lambda x: len(x))
zhen_train["zh_lengths"] = zhen_train["zh_indices"].apply(lambda x:len(x))
# val
zhen_val["en_lengths"] = zhen_val["en_indices"].apply(lambda x: len(x))
zhen_val["zh_lengths"] = zhen_val["zh_indices"].apply(lambda x:len(x))

zhen_train = zhen_train[np.logical_and(zhen_train["en_lengths"]>=2,zhen_train["zh_lengths"]>=2)]
zhen_train = zhen_train[zhen_train["zh_lengths"]<=30]

zhen_val = zhen_val[np.logical_and(zhen_val["en_lengths"]>=2,zhen_val["zh_lengths"]>=2)]
zhen_val = zhen_val[zhen_val["zh_lengths"]<=30]

In [29]:
zhen_train.head(3)

Unnamed: 0,en_data,zh_data,en_tokenized,zh_tokenized,en_indices,zh_indices,en_lengths,zh_lengths
0,Life in the deep oceans,深海 海中 的 生命 大卫 盖罗,"[life, in, the, deep, oceans]","[深海, 海中, 的, 生命, 大卫, 盖罗]","[4, 5, 6, 7, 8, 3]","[4, 5, 6, 7, 9, 10, 3]",6,7
2,This is Bill Lange . I &apos;m Dave Gallo .,大卫 盖罗 这位 是 比尔 兰格 我 是 大卫 盖罗,"[this, is, bill, lange, ., i, &apos;m, dave, g...","[大卫, 盖罗, 这位, 是, 比尔, 兰格, 我, 是, 大卫, 盖罗]","[45, 39, 46, 47, 44, 48, 49, 50, 18, 44, 3]","[9, 10, 47, 32, 48, 49, 50, 32, 9, 10, 3]",11,11
3,And we &apos;re going to tell you some stories...,我们 将 用 一些 影片 来讲 讲述 一些 深海 海里 的 故事,"[and, we, &apos;re, going, to, tell, you, some...","[我们, 将, 用, 一些, 影片, 来讲, 讲述, 一些, 深海, 海里, 的, 故事]","[30, 51, 52, 53, 21, 54, 55, 22, 56, 57, 6, 58...","[17, 51, 52, 53, 15, 54, 55, 53, 4, 56, 6, 57, 3]",17,13


In [104]:
MAX_SENTENCE_LENGTH = 48
BATCH_SIZE = 32

def translation_collate_(batch):
    
    target_data = []
    source_data = []
    target_lengths = []
    source_lengths = []

    for datum in batch:
        target_lengths.append(datum[2])
        source_lengths.append(datum[3])
        
    # PAD
    for datum in batch:
        if datum[2] > MAX_SENTENCE_LENGTH:
            padded_vec_target = np.array(datum[0])[:MAX_SENTENCE_LENGTH]
        else:
            padded_vec_target = np.pad(np.array(datum[0]),
                                pad_width=((0,MAX_SENTENCE_LENGTH - datum[2])),
                                mode="constant", constant_values=PAD_IDX)
            
        if datum[3] > MAX_SENTENCE_LENGTH:
            padded_vec_source = np.array(datum[1])[:MAX_SENTENCE_LENGTH]
        else:
            padded_vec_source = np.pad(np.array(datum[1]),
                                pad_width=((0,MAX_SENTENCE_LENGTH - datum[3])),
                                mode="constant", constant_values=PAD_IDX)
            
        target_data.append(padded_vec_target)
        source_data.append(padded_vec_source)
        
    return [torch.from_numpy(np.array(source_data)), torch.from_numpy(np.array(target_data)),
            torch.from_numpy(np.array(source_lengths)), torch.from_numpy(np.array(target_lengths))]

In [105]:
# vietnamese -> english
vien_dataset = {"train": Vietnamese(vien_train), "val": Vietnamese(vien_val)}

vien_loader = {x: DataLoader(vien_dataset[x], batch_size=BATCH_SIZE, 
                            collate_fn=translation_collate_,
                            shuffle=False, num_workers=0) for x in ["train", "val"]}

# chinese -> english
zhen_dataset = {"train": Chinese(zhen_train), "val": Chinese(zhen_val)}

zhen_loader = {x: DataLoader(zhen_dataset[x], batch_size=BATCH_SIZE,
                             collate_fn=translation_collate_,
                             shuffle=False, num_workers=0) for x in ["train", "val"]}

In [106]:
# vietnamese -> english
vien_train_data = next(iter(vien_loader["train"]))
vien_val_data = next(iter(vien_loader["val"]))

# chinese -> english
zhen_train_data = next(iter(zhen_loader["train"]))
zhen_val_data = next(iter(zhen_loader["val"]))

In [141]:
def out_token_2_string(index_tensor, 
                       language):
    sentence = []
    for i in index_tensor:
        if i.item() not in [0, 1, 3]: # <PAD>, <SOS>, <EOS>
            sentence.append(language.index2word[i.item()])
    return (' ').join(sentence)

## Part 2: Model

In [108]:
from sacreBLEU.sacreBLEU import corpus_bleu

### 2.1: RNN Encoder-Decoder without Attention

In [109]:
class RNNencoder(nn.Module):
    
    def __init__(self, 
                 input_size, 
                 hidden_size, 
                 num_gru_layers=1):
        
        super(RNNencoder, self).__init__()
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_gru_layers

        self.embedding = nn.Embedding(self.input_size, 
                                      self.hidden_size,
                                      padding_idx=0)
        
        self.GRU = nn.GRU(self.hidden_size, 
                          self.hidden_size,
                          batch_first = True,
                          bidirectional = False)
        
        if self.GRU.bidirectional:
            self.num_directions = 2
        else:
            self.num_directions = 1
        
    def init_hidden(self, batch_size):
        
        return torch.zeros(self.num_layers*self.num_directions, 
                           batch_size, self.hidden_size).to(device)

    def forward(self, source_sentence, source_lengths, hidden):
        
        sort_original_source = sorted(range(len(source_lengths)), 
                             key=lambda sentence: -source_lengths[sentence])
        unsort_to_original_source = sorted(range(len(source_lengths)), 
                             key=lambda sentence: sort_original_source[sentence])
        
        source_sentence = source_sentence[sort_original_source]
        source_lengths = source_lengths[sort_original_source]
        batch_size, seq_len_source = source_sentence.size()
        
        embeds_source = self.embedding(source_sentence)
        
        embeds_source = torch.nn.utils.rnn.pack_padded_sequence(embeds_source, 
                                                                source_lengths, 
                                                                batch_first=True)
        output = embeds_source
        
        output, hidden = self.GRU(output, hidden)
        
        hidden = hidden.view(batch_size, self.hidden_size)
        
        hidden = hidden[unsort_to_original_source] ## back to original indices
        output, _ = torch.nn.utils.rnn.pad_packed_sequence(output, batch_first=True)
        
        hidden = hidden.view(1, batch_size, self.hidden_size)
        
        return hidden, output[unsort_to_original_source]

In [167]:
class RNNdecoder(nn.Module):
    
    def __init__(self, 
                 hidden_size, 
                 vocab_size):
        
        super(RNNdecoder, self).__init__()
        
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size

        self.embedding = nn.Embedding(self.vocab_size, 
                                      self.hidden_size,
                                      padding_idx=0)
        
        
        self.dropout = nn.Dropout(p=0.1)
        
        self.GRU = nn.GRU(self.hidden_size, 
                          self.hidden_size,
                          batch_first=True)
        

        self.linear_layer = nn.Linear(self.hidden_size, self.vocab_size)
        
        self.log_softmax = nn.LogSoftmax(dim=1)
        
    def init_hidden(self, batch_size):
        return torch.zeros(1, batch_size, self.hidden_size).to(device)

    def forward(self, 
                input_, 
                decoder_hidden,
                encoder_outputs=None):
        
        batch_size = input_.size(0)
        output = self.embedding(input_)
        output = self.dropout(output)
        
#         cat_out = torch.cat((output, decoder_hidden), 2)

        output, decoder_hidden = self.GRU(output, decoder_hidden)

        output = self.linear_layer(output.squeeze(dim=1))

        output = self.log_softmax(output)

        return output, decoder_hidden

In [360]:
def encode_decode_rnn(encoder,
                      decoder,
                      data_source,
                      data_target,
                      source_lengths):
    
    use_teacher_forcing = True if random.random() < 0.6 else False
    
    batch_size = data_source.size(0)
    encoder_hidden = encoder.init_hidden(batch_size)
    
    encoder_hidden, encoder_output = encoder(data_source,
                                          source_lengths,
                                          encoder_hidden)
    
    decoder_hidden = encoder_hidden
    
    decoder_input = torch.tensor([[SOS_token]]*batch_size).to(device)

    if use_teacher_forcing:
        
        d_out = []
         
        for i in range(MAX_SENTENCE_LENGTH):
            
            decoder_output, decoder_hidden = decoder(decoder_input,
                                                     decoder_hidden,
                                                     encoder_outputs=encoder_output)
            d_out.append(decoder_output.unsqueeze(-1))
            decoder_input = data_target[:,i].view(-1,1)
            
        d_hidden = decoder_hidden
        d_out = torch.cat(d_out,dim=-1)
    else:
        d_out = []
        for i in range(MAX_SENTENCE_LENGTH):
            
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden,
                                                     encoder_output)
            d_out.append(decoder_output.unsqueeze(-1))
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach().view(-1,1)
            
        d_hidden = decoder_hidden
        d_out = torch.cat(d_out,dim=-1)
        
    return d_out, d_hidden

In [361]:
def validate_model(encoder, decoder, dataloader, loss_fun, lang_en=zhen_en_):
    encoder.train(False)
    decoder.train(False)
    pred_corpus = []
    true_corpus = []
    running_loss = 0
    running_total = 0
    for data in dataloader:
        encoder_i = data[0].to(device)
        decoder_i = data[1].to(device)
        source_lengths = data[2].to(device)
        bs,sl = encoder_i.size()[:2]
        out, hidden = encode_decode_rnn(encoder,decoder,encoder_i,decoder_i, source_lengths)
        loss = loss_fun(out.float(), decoder_i.long())
        running_loss += loss.item()*bs
        running_total += bs
        pred = torch.max(out,dim = 1)[1]
        for t,p in zip(data[1],pred):
            t,p = out_token_2_string(t,lang_en), out_token_2_string(p,lang_en)
            true_corpus.append(t)
            pred_corpus.append(p)
    score = corpus_bleu(pred_corpus,[true_corpus],lowercase=True)[0]
    return running_loss/running_total, score

def train_model(encoder_optimizer,
                decoder_optimizer, 
                encoder, decoder, 
                dataloader,
                loss_function, 
                num_epochs=60,
                lang_en=zhen_en_):
    
    best_score = 0
    best_au = 0
    loss_hist = {"train": [], "val": []}
    acc_hist = {"train": [], "val": []}
    for epoch in range(num_epochs):
        print ("epoch", epoch)
        for ex, phase in enumerate(["train"]):
            start = time.time()
            total = 0
            top1_correct = 0
            running_loss = 0
            running_total = 0
            if phase == "train":
                encoder.train(True)
                decoder.train(True)
            else:
                encoder.train(False)
                decoder.train(False)
                
            for data in dataloader[phase]:
                
                encoder_optimizer.zero_grad()
                decoder_optimizer.zero_grad()

                encoder_i = data[0].to(device)
                decoder_i = data[1].to(device)
                source_lengths = data[2].to(device)
                                
                out, hidden = encode_decode_rnn(encoder, decoder, 
                                                encoder_i, decoder_i, 
                                                source_lengths)
                
                loss = loss_function(out.float(), decoder_i.long())
                N = decoder_i.size(0)

                running_loss += loss.item() * N
                
                total += N

                if phase == "train":
                    loss.backward()
                    encoder_optimizer.step()
                    decoder_optimizer.step()
                    
            loss, score = validate_model(encoder,decoder, dataloader["val"],loss_function, lang_en)
            print("Validation Loss = ", loss)
            print("Validation BLEU = ", score)
            loss, score = validate_model(encoder,decoder, dataloader["train"],loss_function, lang_en)
            print("Training Loss = ", loss)
            print("Traning BLEU = ", score)
#                 running_total += N
            epoch_loss = running_loss / total
#             epoch_acc = top1_correct / total
            epoch_acc = 0
            loss_hist[phase].append(epoch_loss)
            acc_hist[phase].append(epoch_acc)
            print("epoch {} {} loss = {}, accurancy = {} time = {}".format(epoch, phase, epoch_loss, epoch_acc,
                                                                           time.time() - start))
        if phase == "val" and epoch_acc > best_score:
            best_score = epoch_acc
#             torch.save(model, save_dir+save_name+str(n_channel)+str(n_top)+str(vocab_size))
    print("Training completed. Best accuracy is {}".format(best_score))
    return encoder,decoder

In [143]:
# vietnamese -> english
learning_rate = 1e-3
encoder_ = RNNencoder(vien_vi_.n_words,300).to(device)
decoder_ = RNNdecoder(300, vien_en_.n_words).to(device)

encoder_optimizer = optim.Adam(encoder_.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder_.parameters(), lr=learning_rate)

criterion = nn.NLLLoss()

In [144]:
enc, dec = train_model(encoder_optimizer, decoder_optimizer, encoder_, decoder_, 
                       vien_loader, criterion, num_epochs = 3, lang_en=vien_en_)

epoch 0




Validation Loss =  1.9821176363497364
Validation BLEU =  5.650892167679332




Training Loss =  2.4124969797003755
Traning BLEU =  5.4391965413619
epoch 0 train loss = 2.6078720465540552, accurancy = 0 time = 902.2443535327911
epoch 1




Validation Loss =  1.8856912719960115
Validation BLEU =  5.608043534440519




Training Loss =  2.313281160345092
Traning BLEU =  5.952884044515693
epoch 1 train loss = 2.412149650576681, accurancy = 0 time = 901.9006140232086
epoch 2




Validation Loss =  2.008377187106074
Validation BLEU =  5.218871476955363




Training Loss =  2.270766274806683
Traning BLEU =  6.100926628845955
epoch 2 train loss = 2.3334374406396634, accurancy = 0 time = 905.1546671390533
Training completed. Best accuracy is 0


In [148]:
# chinese -> english
learning_rate = 1e-3
encoder_ = RNNencoder(zhen_zh_.n_words,300).to(device)
decoder_ = RNNdecoder(300, zhen_en_.n_words).to(device)

encoder_optimizer = optim.Adam(encoder_.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder_.parameters(), lr=learning_rate)

criterion = nn.NLLLoss()

enc, dec = train_model(encoder_optimizer, decoder_optimizer, encoder_, decoder_, 
                       zhen_loader, criterion, num_epochs = 3, lang_en=zhen_en_)

epoch 0




Validation Loss =  1.9332467986143662
Validation BLEU =  7.622414312900019




Training Loss =  1.697795827667021
Traning BLEU =  9.681527824055898
epoch 0 train loss = 1.9247176439607387, accurancy = 0 time = 1698.2876951694489
epoch 1




Validation Loss =  1.801596927006096
Validation BLEU =  8.128555771711278


KeyboardInterrupt: 

### 2.2: RNN Decoder with Attention

In [427]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, 
                 hidden_size = 300, 
                 output_size = vien_en_.n_words, 
                 bidirectional = False):
        
        super(AttnDecoderRNN, self).__init__()
        
        self.hidden_size = hidden_size
        self.output_size = output_size

        self.embedding = nn.Embedding(self.output_size, # vocab size
                                      self.hidden_size) # embed_size = hidden_size
        
        self.dropout = nn.Dropout(p=0.1)
        
        self.GRU = nn.GRU(self.hidden_size, 
                          self.hidden_size,
                          batch_first=True)
        
        self.attn = nn.Linear(self.hidden_size*2, self.hidden_size)
        self.attn_drop = nn.Dropout(p = 0.5)
        
        self.attn_combine = nn.Linear(self.hidden_size*2, 
                                      self.hidden_size)

        self.out = nn.Linear(self.hidden_size, 
                             self.output_size) # feed into softmax over vocabulary
        
        self.log_softmax = nn.LogSoftmax(dim=1) 
        
    def init_hidden(self, batch_size):
        
        hidden_ = torch.zeros(1, batch_size, self.hidden_size).to(device)
        
        return hidden_

    def forward(self, 
                input_, 
                hidden, 
                encoder_outputs):
        
        # decoder seq-len will always be 1
        batch_size = input_.size(0)
        
        output = self.embedding(input_)
        output = self.dropout(output)
        
        hidden = hidden.view(batch_size, 1, self.hidden_size)
        
        cat = torch.cat((output, hidden),2)
        # 32, 1 , 600: B, S, H*2

        att_out = self.attn_drop(self.attn(cat))
        
        attn_wts = F.softmax(torch.bmm(encoder_outputs,att_out.transpose(1,2)),dim = 1)
        
        attn_applied = torch.sum(encoder_outputs*attn_wts, dim = 1).unsqueeze(1)
        
        attn_cat = torch.cat((output, attn_applied), 2)
        
        attn_comb = self.attn_combine(attn_cat)
        
        output = F.relu(attn_comb)

        output, hidden = self.GRU(output, hidden.view(1, batch_size, 
                                                      self.hidden_size)[0].unsqueeze(0))

        output = self.out(output.squeeze(dim=1))

        output = self.log_softmax(output)

        return output, hidden

In [428]:
learning_rate = 1e-4
bi=True
# bi=False
encoder_ = RNNencoder(zhen_zh_.n_words,300,bi).to(device)
decoder_ = AttnDecoderRNN(300,zhen_en_.n_words,bidirectional=False).to(device)
encoder_optimizer = optim.Adam(encoder_.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder_.parameters(), lr=learning_rate)
criterion = nn.NLLLoss(ignore_index=0)

In [429]:
X = torch.rand(3, 2, 3)
Y = torch.rand(3, 3, 480)

Z = torch.bmm(X, Y)
print ("z size = ",Z.size())

z size =  torch.Size([3, 2, 480])


In [430]:
enc_attn, dec_attn = train_model(encoder_optimizer, decoder_optimizer, 
                                 encoder_, decoder_, 
                                 zhen_loader, criterion, num_epochs = 3, lang_en=zhen_en_)

epoch 0




Validation Loss =  5.521294323556147
Validation BLEU =  3.0026225215058857


KeyboardInterrupt: 

### 2.3 CNN Encoder

In [351]:
class CNNencoder(nn.Module):

    def __init__(self, 
                 vocab_size, 
                 embed_size, 
                 hidden_size, 
                 kernel_size, 
                 num_layers,
                 percent_dropout=0.5):
        
        super(CNNencoder, self).__init__()

        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.kernel_size = kernel_size
        self.embed_size = embed_size
        self.vocab_size = vocab_size

        self.embedding = nn.Embedding(self.vocab_size, 
                                      self.embed_size, 
                                      padding_idx=0)
        
        self.dropout = nn.Dropout(percent_dropout)
        
        in_channels = self.embed_size
        
        self.conv = nn.Conv1d(in_channels, self.hidden_size, kernel_size, 
                              padding=kernel_size//2)
        
        self.conv2 = nn.Conv1d(60, self.hidden_size, kernel_size, 
                              padding=kernel_size//2)
        
        self.relu = nn.ReLU()

    def forward(self, source_sentence):
        
        batch_size, seq_len = source_sentence.size()
        
        embeds_source = self.embedding(source_sentence)
        
        out = self.conv(embeds_source.transpose(1, 2)).transpose(1,2)
        out = self.relu(out)
        out = F.max_pool1d(out, kernel_size=5, stride=5)
        
        out = self.conv2(out.transpose(1, 2)).transpose(1,2)
        out = self.relu(out)
        out = torch.mean(out, dim=1).view(1, batch_size, self.hidden_size)
    
        return out

In [352]:
def encode_decode_cnn(encoder,
                      decoder,
                      data_source,
                      data_target,
                      source_lengths):
    
    use_teacher_forcing = True if random.random() < 0.6 else False
    
    batch_size = data_source.size(0)
    
    encoder_hidden = encoder(data_source)
    
    decoder_hidden = encoder_hidden
    
    decoder_input = torch.tensor([[SOS_token]]*batch_size).to(device)

    if use_teacher_forcing:
        
        d_out = []
         
        for i in range(MAX_SENTENCE_LENGTH):
            
            decoder_output, decoder_hidden = decoder(decoder_input,
                                                     decoder_hidden)
            
            d_out.append(decoder_output.unsqueeze(-1))
            decoder_input = data_target[:,i].view(-1,1)
            
        d_hidden = decoder_hidden
        d_out = torch.cat(d_out,dim=-1)
    else:
        d_out = []
        for i in range(MAX_SENTENCE_LENGTH):
            
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            d_out.append(decoder_output.unsqueeze(-1))
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach().view(-1,1)
            
        d_hidden = decoder_hidden
        d_out = torch.cat(d_out,dim=-1)
        
    return d_out, d_hidden

In [353]:
def validate_model(encoder, decoder, dataloader, loss_fun, lang_en=zhen_en_):
    encoder.train(False)
    decoder.train(False)
    pred_corpus = []
    true_corpus = []
    running_loss = 0
    running_total = 0
    for data in dataloader:
        encoder_i = data[0].to(device)
        decoder_i = data[1].to(device)
        source_lengths = data[2].to(device)
        bs,sl = encoder_i.size()[:2]
        out, hidden = encode_decode_cnn(encoder,decoder,encoder_i,decoder_i, source_lengths)
        loss = loss_fun(out.float(), decoder_i.long())
        running_loss += loss.item()*bs
        running_total += bs
        pred = torch.max(out,dim = 1)[1]
        for t,p in zip(data[1],pred):
            t,p = out_token_2_string(t,lang_en), out_token_2_string(p,lang_en)
            true_corpus.append(t)
            pred_corpus.append(p)
    score = corpus_bleu(pred_corpus,[true_corpus],lowercase=True)[0]
    return running_loss/running_total, score

def train_model(encoder_optimizer,
                decoder_optimizer, 
                encoder, decoder, 
                dataloader,
                loss_function, 
                num_epochs=60,
                lang_en=zhen_en_):
    
    best_score = 0
    best_au = 0
    loss_hist = {"train": [], "val": []}
    acc_hist = {"train": [], "val": []}
    for epoch in range(num_epochs):
        print ("epoch", epoch)
        for ex, phase in enumerate(["train"]):
            start = time.time()
            total = 0
            top1_correct = 0
            running_loss = 0
            running_total = 0
            if phase == "train":
                encoder.train(True)
                decoder.train(True)
            else:
                encoder.train(False)
                decoder.train(False)
                
            for data in dataloader[phase]:
                
                encoder_optimizer.zero_grad()
                decoder_optimizer.zero_grad()

                encoder_i = data[0].to(device)
                decoder_i = data[1].to(device)
                source_lengths = data[2].to(device)
                                
                out, hidden = encode_decode_cnn(encoder, decoder, 
                                                encoder_i, decoder_i, 
                                                source_lengths)
                
                loss = loss_function(out.float(), decoder_i.long())
                N = decoder_i.size(0)

                running_loss += loss.item()*N
                
                total += N

                if phase == "train":
                    loss.backward()
                    encoder_optimizer.step()
                    decoder_optimizer.step()
                    
            loss, score = validate_model(encoder,decoder, dataloader["val"],loss_function, lang_en)
            print("Validation Loss = ", loss)
            print("Validation BLEU = ", score)
            loss, score = validate_model(encoder,decoder, dataloader["train"],loss_function, lang_en)
            print("Training Loss = ", loss)
            print("Traning BLEU = ", score)
#                 running_total += N
            epoch_loss = running_loss / total
#             epoch_acc = top1_correct / total
            epoch_acc = 0
            loss_hist[phase].append(epoch_loss)
            acc_hist[phase].append(epoch_acc)
            print("epoch {} {} loss = {}, accurancy = {} time = {}".format(epoch, phase, epoch_loss, epoch_acc,
                                                                           time.time() - start))
        if phase == "val" and epoch_acc > best_score:
            best_score = epoch_acc
#             torch.save(model, save_dir+save_name+str(n_channel)+str(n_top)+str(vocab_size))
    print("Training completed. Best accuracy is {}".format(best_score))
    return encoder,decoder

In [354]:
learning_rate = 1e-4
bi=True
# bi=False
encoder_ = CNNencoder(zhen_zh_.n_words,
                      300,300, 5,1,
                      percent_dropout=0.3).to(device)

decoder_ = RNNdecoder(300,zhen_en_.n_words).to(device)
encoder_optimizer = optim.Adam(encoder_.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder_.parameters(), lr=learning_rate)
criterion = nn.NLLLoss(ignore_index=0)

In [355]:
enc, dec = train_model(encoder_optimizer, decoder_optimizer, encoder_, decoder_, 
                       zhen_loader, criterion, num_epochs = 3, lang_en=zhen_en_)

epoch 0




Validation Loss =  2.0075799999086135
Validation BLEU =  6.015097639602013




Training Loss =  1.9711387605708715
Traning BLEU =  6.245280354192027
epoch 0 train loss = 2.2194282125410028, accurancy = 0 time = 1621.194352388382
epoch 1




Validation Loss =  1.9987231105064898
Validation BLEU =  5.756828060121056


KeyboardInterrupt: 