<h1 id="tocheading">Spring 2018 NLP Class Project: Neural Machine Translation</h1>
<div id="toc"></div>

In [3]:
%%javascript
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')

<IPython.core.display.Javascript object>

In [4]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import pandas as pd
import spacy
import pdb
import os
from underthesea import word_tokenize
import jieba
import numpy as np

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

# running on cpu
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu" if torch.cuda.is_available() else "cpu")

In [5]:
# ! pip install spacy && python -m spacy download en

## Part 0: Project Overview

The goal of this project is to build a neural machine translation system and experience how recent advances have made their way. Each team will build the following sequence of neural translation systems for two language pairs, __Vietnamese (Vi)→English (En)__ and __Chinese (Zh)→En__ (prepared corpora is be provided):

1. Recurrent neural network based encoder-decoder without attention
2. Recurrent neural network based encoder-decoder with attention
2. Replace the recurrent encoder with either convolutional or self-attention based encoder.
4. [Optional] Build either or both fully self-attention translation system or/and multilingual translation system.

## Part 1: Data Upload & Preprocessing

In [6]:
# start of sentence
SOS_token = 1
# end of sentence
EOS_token = 3

## 2 = unk

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0:"<PAD>",1: "<SOS>", 2:"<UNK>", 3: "<EOS>"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [7]:
# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """About "NFC" and "NFD": 
    
    For each character, there are two normal forms: normal form C 
    and normal form D. Normal form D (NFD) is also known as canonical 
    decomposition, and translates each character into its decomposed form. 
    Normal form C (NFC) first applies a canonical decomposition, then composes 
    pre-combined characters again.
    
    About unicodedata.category: 
    
    Returns the general category assigned to the Unicode character 
    unichr as string."""
    
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Trim
def normalizeString(s):
    s = unicodeToAscii(s.strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

In [8]:
def readLangs(lang1, lang2, reverse=False,
             dataset="train"):
    
    """Takes as input;
    - lang1, lang2: either (vi, en) or (zh, en)
    - dataset: one of ("train","dev","test")"""
    print("Reading lines...")
    eos = [".","?","!","\n"]
    # Read the pretokenized lang1 file and split into lines
    lang1_lines = open("../data/tokens_and_preprocessing_em/pretokenized_data/iwslt-%s-%s-processed/%s.tok.%s" % (lang1, lang2, dataset, lang1), encoding="utf-8").\
        read().strip().split("\n")
    # Read the lang2 file and split into lines
    lang2_lines = open("../data/tokens_and_preprocessing_em/pretokenized_data/iwslt-%s-%s-processed/%s.tok.%s" % (lang1, lang2, dataset, lang2), encoding="utf-8").\
        read().strip().split("\n")
    
    # create sentence pairs (lists of length 2 that consist of string pairs)
    # e.g. ["And we &apos;re going to tell you some stories from the sea here in video .",
    #       "我们 将 用 一些 影片 来讲 讲述 一些 深海 海里 的 故事  "]
    # check if there are the same number of sentences in each set
    assert len(lang1_lines) == len(lang2_lines), "Two languages must have the same number of sentences. "+ str(len(lang1_lines)) + " sentences were passed for " + str(lang1) + "." + str(len(lang2_lines)) + " sentences were passed for " + str(lang2)+"."
    # normalize if not Chinese, Chinese normalization is already handeled
    if lang1 == "zh":
        lang1_lines = [s + "<EOS>" for s in lang1_lines]
    else:
        lang1_lines = [normalizeString(s).replace(".","<EOS>").\
                       replace("?","<EOS>").replace("!","<EOS>").replace("\n","<EOS>") for s in lang1_lines]
    lang2_lines = [normalizeString(s).replace(".","<EOS>").\
                       replace("?","<EOS>").replace("!","<EOS>").replace("\n","<EOS>") for s in lang2_lines]
    # construct pairs
    pair_ran = range(len(lang1_lines))
    pairs = [[lang1_lines[i]] + [lang2_lines[i]] for i in pair_ran]
    
#     # Split every line into pairs and normalize
#     pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [9]:
def prepareData(lang1, lang2, reverse=False, dataset="train"):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse, dataset=dataset)
    print("Read %s sentence pairs" % len(pairs))
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs

# example
input_lang, output_lang, pairs = prepareData('vi', 'en', False, dataset="train")
print(random.choice(pairs))

Reading lines...
Read 133317 sentence pairs
Trimmed to 133317 sentence pairs
Counting words...
Counted words:
vi 16142
en 47566
['co nhieu cach neu o la tam thoi toi thieu hoa anh huong nhung no la mot van e <EOS>', 'There apos s ways if it apos s temporary to minimize the impact but it apos s a problem <EOS>']


In [10]:
input_lang, output_lang, pairs = prepareData('zh', 'en', False, dataset="train")
print(random.choice(pairs))

Reading lines...
Read 213376 sentence pairs
Trimmed to 213376 sentence pairs
Counting words...
Counted words:
zh 89202
en 59327
['德鲁   德纳 维奇    会计 之 夜 的 即兴 表演   <EOS>', 'Drew Dernavich <EOS> quot Accounting night at the improv <EOS> quot ']


### 1.1 Vietnamese to English

In [11]:
# # Please find the original tokenizing code provided by Elman Mansimov in the following link:
# # https://github.com/derincen/neural-machine-translation/tree/master/data/tokens_and_preprocessing_em/preprocess_translation

# def tokenize_vi(f_names, f_out_names):
#     for f_name, f_out_name in zip(f_names, f_out_names):
#         lines = open(f_name, 'r').readlines()
#         tok_lines = open(f_out_name, 'w')
#         for i, sentence in enumerate(lines):
#             if i > 0 and i % 1000 == 0:
#                 print (f_name.split('/')[-1], i, len(lines))
#             tok_lines.write(word_tokenize(sentence, format="text") + '\n')
#         tok_lines.close()

# def tokenize_en(f_names, f_out_names):
#     tokenizer = spacy.load('en_core_web_sm')

#     for f_name, f_out_name in zip(f_names, f_out_names):
#         lines = open(f_name, 'r').readlines()
#         tok_lines = open(f_out_name, 'w')
#         for i, sentence in enumerate(lines):
#             if i > 0 and i % 1000 == 0:
#                 print (f_name.split('/')[-1], i, len(lines))
#             # replaced tokenizer(sentence) with str(tokenizer(sentence)) to avoid 
#             # type error while joining
#             tok_lines.write(' '.join(str(tokenizer(sentence))) + '\n')
#         tok_lines.close()


# if __name__ == "__main__":
#     root = '../data/tokens_and_preprocessing_em/pretokenized_data/iwslt-vi-en-processed/'
#     tokenize_vi([os.path.join(root, 'train.vi'), os.path.join(root, 'dev.vi'), 
#                  os.path.join(root, 'test.vi')],\
#                [os.path.join(root, 'train.tok.vi'), os.path.join(root, 'dev.tok.vi'), 
#                 os.path.join(root, 'test.tok.vi')])

#     tokenize_en([os.path.join(root, 'train.en'), os.path.join(root, 'dev.en'), 
#                  os.path.join(root, 'test.en')],\
#                 [os.path.join(root, 'train.tok.en'), os.path.join(root, 'dev.tok.en'), 
#                  os.path.join(root, 'test.tok.en')])


In [12]:
# Format: languagepair_language_dataset
# Train 
vien_vi_train, vien_en_train, vi_en_train_pairs = prepareData('vi', 'en', False, dataset="train")
# Dev 
vien_vi_dev, vien_en_dev, vi_en_dev_pairs = prepareData('vi', 'en', False, dataset="dev")
# Test
vien_vi_test, vien_en_test, vi_en_test_pairs = prepareData('vi', 'en', False, dataset="test")

Reading lines...
Read 133317 sentence pairs
Trimmed to 133317 sentence pairs
Counting words...
Counted words:
vi 16142
en 47566
Reading lines...
Read 1268 sentence pairs
Trimmed to 1268 sentence pairs
Counting words...
Counted words:
vi 1368
en 3814
Reading lines...
Read 1553 sentence pairs
Trimmed to 1553 sentence pairs
Counting words...
Counted words:
vi 1323
en 3617


### 1.2 Chinese to English

In [13]:
# # Please find the original tokenizing code provided by Elman Mansimov in the following link:
# # https://github.com/derincen/neural-machine-translation/tree/master/data/tokens_and_preprocessing_em/preprocess_translation

# def tokenize_zh(f_names, f_out_names):
#     for f_name, f_out_name in zip(f_names, f_out_names):
#         lines = open(f_name, 'r').readlines()
#         tok_lines = open(f_out_name, 'w')
#         for i, sentence in enumerate(lines):
#             if i > 0 and i % 1000 == 0:
#                 print (f_name.split('/')[-1], i, len(lines))
#             tok_lines.write(' '.join(jieba.cut(sentence, cut_all=True)))
#         tok_lines.close()

# def tokenize_en(f_names, f_out_names):
#     tokenizer = spacy.load('en_core_web_sm')

#     for f_name, f_out_name in zip(f_names, f_out_names):
#         lines = open(f_name, 'r').readlines()
#         tok_lines = open(f_out_name, 'w')
#         for i, sentence in enumerate(lines):
#             if i > 0 and i % 1000 == 0:
#                 print (f_name.split('/')[-1], i, len(lines))
#             # replaced tokenizer(sentence) with str(tokenizer(sentence)) to avoid 
#             # type error while joining
#             tok_lines.write(' '.join(str(tokenizer(sentence))) + '\n')
#         tok_lines.close()

# if __name__ == "__main__":
#     root = '../data/tokens_and_preprocessing_em/pretokenized_data/iwslt-zh-en-processed/'
#     tokenize_zh([os.path.join(root, 'dev.zh'), os.path.join(root, 'test.zh'), os.path.join(root, 'train.zh')],\
#                 [os.path.join(root, 'dev.tok.zh'), os.path.join(root, 'test.tok.zh'), os.path.join(root, 'train.tok.zh')])

# #     tokenize_en([os.path.join(root, 'dev.en'), os.path.join(root, 'test.en'), os.path.join(root, 'train.en')],\
# #                [os.path.join(root, 'dev.tok.en'), os.path.join(root, 'test.tok.en'), os.path.join(root, 'train.tok.en')])


In [14]:
# Format: languagepair_language_dataset
# Train 
zhen_zh_train, zhen_en_train, zh_en_train_pairs = prepareData('zh', 'en', False, dataset="train")
# Dev 
zhen_zh_dev, zhen_en_dev, zh_en_dev_pairs = prepareData('zh', 'en', False, dataset="dev")
# Test
zhen_zh_test, zhen_en_test, zh_en_test_pairs = prepareData('zh', 'en', False, dataset="test")

Reading lines...
Read 213376 sentence pairs
Trimmed to 213376 sentence pairs
Counting words...
Counted words:
zh 89202
en 59327
Reading lines...
Read 1261 sentence pairs
Trimmed to 1261 sentence pairs
Counting words...
Counted words:
zh 6134
en 3914
Reading lines...
Read 1397 sentence pairs
Trimmed to 1397 sentence pairs
Counting words...
Counted words:
zh 5216
en 3421


In [15]:
zh_en_train_pairs[3]

['我们 将 用 一些 影片 来讲 讲述 一些 深海 海里 的 故事  <EOS>',
 'And we apos re going to tell you some stories from the sea here in video <EOS>']

### 1.3: Check Source & Target Vocabs

Since the source and target languages can have very different table lookup layers, it's good practice to have separate vocabularies for each. Thus, we build vocabularies for each language that we will be using. 

In the first class (Lang) of this section, we have already defined vocabularies for all languages. So, there is no need to redefine another function. We chech each vocabulary below.

#### Chinese Vocabulary

In [16]:
print ("The number of words in Chinese training corpus is " + str(zhen_zh_train.n_words))

The number of words in Chinese training corpus is 89202


In [17]:
zhen_zh_train.word2index["格"]

10481

In [18]:
zhen_zh_train.index2word[10479]

'哈利'

#### Vietnamese Vocabulary

In [19]:
print ("The number of words in Vietnamese training corpus is " + str(vien_vi_train.n_words))

The number of words in Vietnamese training corpus is 16142


In [20]:
vien_vi_train.word2index["Hamburger"]

6750

In [21]:
vien_vi_train.index2word[6752]

'Enlightened'

#### English Vocabulary for Zh-En

In [22]:
print ("The number of words in English training corpus for Zh-En is " + str(zhen_en_train.n_words))

The number of words in English training corpus for Zh-En is 59327


In [23]:
zhen_en_train.word2index["translate"]

1449

In [24]:
zhen_en_train.index2word[1451]

'directly'

#### English Vocabulary for Vi-En

In [25]:
print ("The number of words in English training corpus for Vi-En is " + str(vien_en_train.n_words))

The number of words in English training corpus for Vi-En is 47566


In [26]:
vien_en_train.word2index["machine"]

846

In [27]:
vien_en_train.index2word[847]

'force'

### 1.4 Prepare Dataloaders

In [28]:
vien_en_dev.word2index["<EOS>"]

24

In [29]:
vien_en_dev.index2word[24]

'<EOS>'

In [30]:
PAD_IDX = 0
SOS_IDX = 1
UNK_IDX = 2
# EOS_IDX = 3
# convert token to id in the dataset
def token2index_dataset(paired_tokens, 
                        lang1_token2id_vocab,
                        lang2_token2id_vocab):
    """Takes as input:
    - paired_tokens: a list of sentence pairs that consist of source & target lang sentences.
    - lang1_token2id_vocab: token2index vocabulary for the first language. 
                            Get by method Lang_dataset.word2index
    - lang2_token2id_vocab: token2index vocabulary for the second language. 
                            Get by method Lang_dataset.word2index
                            
    Returns:
    - indices_data_lang_1, indices_data_lang2: A list of lists where each sub-list holds corresponding indices for each
                                               token in the sentence."""
    indices_data_lang_1, indices_data_lang_2 = [], []
    vocabs = [lang1_token2id_vocab, lang2_token2id_vocab]
    
    # lang1
    for t in range(len(paired_tokens)):
        index_list = [vocabs[0][token] if token in vocabs[0]\
                                    else UNK_IDX for token in paired_tokens[t][0]] 
        indices_data_lang_1.append(index_list)
    # lang2
    for t in range(len(paired_tokens)):
        index_list =  [vocabs[1][token] if token in vocabs[1] \
                                    else UNK_IDX for token in paired_tokens[t][1]] 
        indices_data_lang_2.append(index_list)
        
    return indices_data_lang_1, indices_data_lang_2

# train indices
zhen_zh_train_indices, zhen_en_train_indices = token2index_dataset(zh_en_train_pairs,
                                                                   zhen_zh_train.word2index,
                                                                   zhen_en_train.word2index)

vien_vi_train_indices, vien_en_train_indices = token2index_dataset(vi_en_train_pairs,
                                                                   vien_vi_train.word2index,
                                                                   vien_en_train.word2index)

# dev indices
zhen_zh_dev_indices, zhen_en_dev_indices = token2index_dataset(zh_en_dev_pairs,
                                                               zhen_zh_dev.word2index,
                                                               zhen_en_dev.word2index)

vien_vi_dev_indices, vien_en_dev_indices = token2index_dataset(vi_en_dev_pairs,
                                                               vien_vi_dev.word2index,
                                                               vien_en_dev.word2index)

# test indices
zhen_zh_test_indices, zhen_en_test_indices = token2index_dataset(zh_en_test_pairs,
                                                                 zhen_zh_test.word2index,
                                                                 zhen_en_test.word2index)

vien_vi_test_indices, vien_en_test_indices = token2index_dataset(vi_en_test_pairs,
                                                                 vien_vi_test.word2index,
                                                                 vien_en_test.word2index)

In [31]:
# check length
# train
print ("Chinese training set length = "+str(len(zhen_zh_train_indices)))
print ("Chinese-English (En) training set length = "+str(len(zhen_en_train_indices)))
print ("\nVietnamese training set length = "+str(len(vien_vi_train_indices)))
print ("Vietnamese-English (En) training set length = "+str(len(vien_en_train_indices)))
# dev
print ("\nChinese dev set length = "+str(len(zhen_zh_dev_indices)))
print ("Chinese-English (En) dev set length = "+str(len(zhen_en_dev_indices)))
print ("\nVietnamese dev set length = "+str(len(vien_vi_dev_indices)))
print ("Vietnamese-English (En) dev set length = "+str(len(vien_en_dev_indices)))
# test
print ("\nChinese test set length = "+str(len(zhen_zh_test_indices)))
print ("Chinese-English (En) test set length = "+str(len(zhen_en_test_indices)))
print ("\nVietnamese test set length = "+str(len(vien_vi_test_indices)))
print ("Vietnamese-English (En) test set length = "+str(len(vien_en_test_indices)))

Chinese training set length = 213376
Chinese-English (En) training set length = 213376

Vietnamese training set length = 133317
Vietnamese-English (En) training set length = 133317

Chinese dev set length = 1261
Chinese-English (En) dev set length = 1261

Vietnamese dev set length = 1268
Vietnamese-English (En) dev set length = 1268

Chinese test set length = 1397
Chinese-English (En) test set length = 1397

Vietnamese test set length = 1553
Vietnamese-English (En) test set length = 1553


#### Dataloader

In [32]:
from torch.utils.data import Dataset

In [33]:
## TODO 

MAX_SENTENCE_LENGTH = 15
BATCH_SIZE = 32

# zhen token2index vocabs
zhen_zh_train_token2id = zhen_zh_train.word2index
zhen_en_train_token2id = zhen_en_train.word2index

# vien token2index vocabs
vien_vi_train_token2id = vien_vi_train.word2index
vien_en_train_token2id = vien_en_train.word2index

class TranslationDataset():
    """
    Class that represents a train/dev/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """

    def __init__(self, 
                 data_source, # training indices data of the source language
                 data_target, # training indices data of the target language
                 token2id_source=None, # token2id dict of the source language
                 token2id_target=None  # token2id dict of the target language
                ):
        """
        @param data_list: list of character
        @param target_list: list of targets

        """
        self.source_sentences, self.target_sentences =  data_source, data_target
        
        self.token2id_source = token2id_source
        self.token2id_target = token2id_target

    def __len__(self):
        return len(self.source_sentences)

    def __getitem__(self, batch_index):

#         source_word_idx, target_word_idx = [], []
        source_mask, target_mask = [], []
        
        for index in self.source_sentences[batch_index][:MAX_SENTENCE_LENGTH]:
            if index != UNK_IDX:
                source_mask.append(0)
            else:
                source_mask.append(1)
                
        for index in self.target_sentences[batch_index][:MAX_SENTENCE_LENGTH]:
            if index != UNK_IDX:
                target_mask.append(0)
            else:
                target_mask.append(1)
        
        source_indices = self.source_sentences[batch_index][:MAX_SENTENCE_LENGTH]
        target_indices = self.target_sentences[batch_index][:MAX_SENTENCE_LENGTH]
        
        source_list = [source_indices, source_mask, len(source_indices)]
        target_list = [target_indices, target_mask, len(target_indices)]
        
        return source_list + target_list

    
def translation_collate(batch, max_sentence_length):
    """
    Customized function for DataLoader that dynamically pads the batch so that all
    data have the same length
    """
    source_data, target_data = [], []
    source_mask, target_mask = [], []
    source_lengths, target_lengths = [], []

    for datum in batch:
        source_lengths.append(datum[2])
        target_lengths.append(datum[5])
        
        # PAD
        source_data_padded = np.pad(np.array(datum[0]), pad_width=((0, MAX_SENTENCE_LENGTH-datum[2])),
                                mode="constant", constant_values=0)
        source_data.append(source_data_padded)
        
        source_mask_padded = np.pad(np.array(datum[1]), pad_width=((0, MAX_SENTENCE_LENGTH-datum[2])),
                                mode="constant", constant_values=0)
        source_mask.append(source_mask_padded)
        
        target_data_padded = np.pad(np.array(datum[3]), pad_width=((0, MAX_SENTENCE_LENGTH-datum[5])),
                                mode="constant", constant_values=0)
        target_data.append(target_data_padded)
        
        target_mask_padded = np.pad(np.array(datum[4]), pad_width=((0, MAX_SENTENCE_LENGTH-datum[5])),
                               mode="constant", constant_values=0)
        target_mask.append(target_mask_padded)
        
    ind_dec_order = np.argsort(source_lengths)[::-1]
    source_data = np.array(source_data)[ind_dec_order]
    target_data = np.array(target_data)[ind_dec_order]
    source_mask = np.array(source_mask)[ind_dec_order].reshape(len(batch), -1, 1)
    target_mask = np.array(target_mask)[ind_dec_order].reshape(len(batch), -1, 1)
    source_lengths = np.array(source_lengths)[ind_dec_order]
    target_lengths = np.array(target_lengths)[ind_dec_order]
    
    source_list = [torch.from_numpy(source_data), 
               torch.from_numpy(source_mask).float(), source_lengths]
    target_list = [torch.from_numpy(target_data), 
               torch.from_numpy(target_mask).float(), target_lengths]
        
    return source_list + target_list


zhen_train_dataset = TranslationDataset(zhen_zh_train_indices,
                                       zhen_en_train_indices,
                                       token2id_source=zhen_zh_train_token2id,
                                       token2id_target=zhen_en_train_token2id)

zhen_train_loader = torch.utils.data.DataLoader(dataset=zhen_train_dataset,
                               batch_size=BATCH_SIZE,
                               collate_fn=lambda x, max_sentence_length=MAX_SENTENCE_LENGTH: translation_collate(x, MAX_SENTENCE_LENGTH),
                               shuffle=False)

zhen_dev_dataset = TranslationDataset(zhen_zh_dev_indices,
                                       zhen_en_dev_indices,
                                       token2id_source=zhen_zh_train_token2id,
                                       token2id_target=zhen_en_train_token2id)

zhen_dev_loader = torch.utils.data.DataLoader(dataset=zhen_dev_dataset,
                             batch_size=BATCH_SIZE,
                             collate_fn=lambda x, max_sentence_length=MAX_SENTENCE_LENGTH: translation_collate(x, MAX_SENTENCE_LENGTH),
                             shuffle=False)

vien_train_dataset = TranslationDataset(vien_vi_train_indices,
                                       vien_en_train_indices,
                                       token2id_source=vien_vi_train_token2id,
                                       token2id_target=vien_en_train_token2id)

vien_train_loader = torch.utils.data.DataLoader(dataset=vien_train_dataset,
                             batch_size=BATCH_SIZE,
                             collate_fn=lambda x, max_sentence_length=MAX_SENTENCE_LENGTH: translation_collate(x, MAX_SENTENCE_LENGTH),
                             shuffle=False)

vien_dev_dataset = TranslationDataset(vien_vi_dev_indices,
                                       vien_en_dev_indices,
                                       token2id_source=vien_vi_train_token2id,
                                       token2id_target=vien_en_train_token2id)

vien_dev_loader = torch.utils.data.DataLoader(dataset=vien_dev_dataset,
                             batch_size=BATCH_SIZE,
                             collate_fn=lambda x, max_sentence_length=MAX_SENTENCE_LENGTH: translation_collate(x, MAX_SENTENCE_LENGTH),
                             shuffle=False)


In [34]:
# [*vien_dev_loader][0]

## Part 2: Evaluation Metric

We use BLEU as the evaluation metric. Specifically, we focus on the corpus-level BLEU function. 

The code for BLEU is taken from https://github.com/mjpost/sacreBLEU/blob/master/sacrebleu.py#L1022-L1080

In [35]:
! pip3 install sacrebleu

[33mYou are using pip version 18.0, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [36]:
# import sacrebleu

In [37]:
# TODO

## Part 3: Beam Search Algorithm

In this section, we implement the Beam Search algorithm in Pytorch.

In [38]:
# initialize k-many score lists
# start only with the whole x
# initialize k-many prev y's lists
# choose top-k for y1 from the whole vocab
# choose top-k for the second time step by expanding the first time step
# compute scores by adding log probabilities

In [39]:
# beam_size_k = 10

# class BeamSearch:
    
#     """RECURSE"""
    
#     def __init__(self,
#                  beam_size=beam_size_k, ## insert num 
#                  softmax_out
#                 ):
#         """
#         Class that holds beam information, and search & score functions
#         - beam_size = beam size
#         - softmax_out = the softmax over the vocabulary at time step t, as computed by the RNN decoder,
#                         given the source sequence X and the previously decoded y_<t tokens.
#         """
        
#         self.beam_size = beam_size
#         self.softmax_out = softmax_out
        
#         # initialize paths
#         self.paths = np.empty((self.beam_size))
        
#         # initialize the dictionary that will hold the path scores 
#         # and update the scores at each time step
#         self.path_score_dict = {}
#         # we will later use each i < k as a key and populate this
#         # dict with scores

        
#     def search():
        
        
        
    
#     def score(prev_ys = None):
#         """- prev_ys = previously decoded tokens (previously generated target language tokens)
#         """
        
    

## Part 4: Model

1. Recurrent neural network based encoder-decoder without attention
2. Recurrent neural network based encoder-decoder with attention
2. Replace the recurrent encoder with either convolutional or self-attention based encoder.

#### Loss Function & Evaluation

In [37]:
# reconstruction loss = binary cross entropy between two (vocab_size x 1) vectors
# used during training, since we can compare the real Y and and the generated Y
# still at each time step of the decoder, we compare up to and including
# the real t-th token and the generated t-th, then optimize

def loss_function(y_hat, y):
    
    """Takes as input;
    - y: correct "log-softmax"(binary vector) that represents the correct t-th token in the target sentence,
                 (vocab_size x 1) vector
    - y_hat: predicted LogSoftmax for the predicted t-th token in the target sentence.
             (vocab_size x 1) vector
    Returns;
    - NLL Loss in training time"""
#     y_hat = torch.log(y_hat) # log softmax
    loss = nn.functional.binary_cross_entropy(y_hat,y)
    
    return loss
    

# generation/inference time - validation loss = BLEU

def compute_BLEU(corpus_hat,corpus):
    ## TODO
    return None


#### Beam Search

In [41]:
# MAX_PATH_LENGTH = 400 # make changeable later !!!

# class TargetOut:
#     def __init__(self,
#                  beam_size=5,
#                  source_sentence_length=400,
#                  time_step=0):
#         """
#         - beam: the tensor that will be populated with beam_size-many paths in each timestep
#         - beam_size: the width of the beam, top k tokens to include in the beam search,
#         """
        
#         # initialized again for each timestep
#         self.beam = torch.empty(beam_size)
#         self.beam_size = beam_size
#         self.beam_seq = beam_seq
#         self.time_step = time_step
        
#         self.max_target_length = source_sentence_length*(1.5)
#         # path is kept by hold_path
#         self.path = torch.empty(beam_size, max_target_length)
    
#     def _add_and_score_paths(self, 
#              top_k_tokens):
        
#         """top_k_tokens: torch.FloatTensor of indices according to logSoftmax 
#         (not embeddings - embedding matrix indices or vocab indices)"""
        
#         time_step = self.time_step
#         self.path[:,time_step] = top_k_tokens
        
#         return self
            
#     def _score_paths(self,gru_out):
        
#         """For each path, computes log(P(Y_i|Y_i-1,..,Y_i-n,X)) + log(P(Y_i-1|Y_i-2,..,Y_i-n,X)) + ...
#         -gru_out is a softmax over the vocabulary for each timestep, so 
#         we need to take its log to obtain the scores"""
#         if self.time_step = 0:
            
        
    
#     def _hold_path_score(self):
        
        





### Beam Search

In [42]:
torch.FloatTensor([3,4,2,7,5,3,2]).topk(3)

(tensor([7., 5., 4.]), tensor([3, 4, 1]))

In [43]:
BATCH_SIZE = 32

class BeamSearch(nn.Module):
    
    """network that conducts beam search over the outputs of
     any translator network. The translator networks that can 
     be passed are:
     
     - Translate (for RNN-enc-dec),
     - AttnTranslate (for RNN-enc-dec with attention),
     - CNNtranslate (for CNN-encoder based translation).
     
     The translation networks take care of the encoder-decoder
     choices specific to each task. Please see in below sections."""

    def __init__(self, translator_network, beam_size):
        super().__init__()
        # translator network that returns the logsoftmax
        # over vocabulary size:(vocab_size, 1)
        self.translator_network = translator_network
        self.beam_size = beam_size
        
    def init_search_tree(self, batch_size):
        beam_size = self.beam_size
        self.search_tree = torch.empty(batch_size, beam_size, 1)
        return self
    
    def init_score_tree(self, batch_size):
        beam_size = self.beam_size
        search_tree = self.search_tree
        self.score_tree = torch.zeros(search_tree.size())
        return self
    
    def forward(source_sentence, source_mask, source_lengths,
                target_sentence, target_mask, target_lengths):
        
        self.init_search_tree(BATCH_SIZE)
        self.init_score_tree(BATCH_SIZE)
        
        # at each time step the decoder will give us the logsoftmax
        # of one token (batch_size, vocab_size). 
        output = model(source_sentence, target_sentence,source_mask, 
                       target_mask, source_lengths,target_lengths)
        
        # for each sentence in the batch we get the top k predictions
        # for each token and append it to the search and score trees. 
        for i in range(BATCH_SIZE):
            beam = output[i].topk(beam_size) # (token scores, token indices)
            # cat instead
            self.search_tree[i] = self.search_tree.cat(beam[1]) # cat the indices to the search tree
            self.score_tree[i,:] = beam[0] # append the scores to the score tree 
        
        # we will sum the logs 
        
        

### 2.1: RNN-based Encoder-Decoder without Attention

In [44]:
batch_size = 32

In [57]:
# from https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html
# same as 1st model's RNN encoder
# the different part is the attention decoder in model 2

class RNNencoder(nn.Module):
    def __init__(self,
                 vocab_size=len(zhen_zh_train_token2id), # for chinese
                 embedding_size=300,
                 percent_dropout=0.3, 
                 hidden_size=256,
                 num_gru_layers=16,
                 max_sentence_len=15):
        
        super(RNNencoder, self).__init__()
        
        self.hidden_size = hidden_size
        self.num_layers = num_gru_layers
        
        self.vocab_size = vocab_size
        self.embed_size = embedding_size
        self.dropout = percent_dropout
        self.embed_source = nn.Embedding(self.vocab_size,
                                         self.embed_size,
                                         padding_idx=0
                                        )
        
        self.max_sentence_len = max_sentence_len
        
        self.GRU = nn.GRU(self.embed_size, 
                          self.hidden_size, 
                          self.num_layers, 
                          batch_first=True, 
                          bidirectional=False)
        
        self.drop_out_function = nn.Dropout(self.dropout)
        
    def init_hidden(self, batch_size):
        
        hidden_ = torch.zeros(self.num_layers*self.num_directions, 
                             batch_size, self.hidden_size).to(device)
        return hidden_

    def forward(self, source_sentence, source_mask, source_lengths):
        """Returns source lengths to feed into the decoder, since we do not want
        the translation length to be above/below a certain treshold*source sentence length."""
        
        sort_original_source = sorted(range(len(source_lengths)), 
                             key=lambda sentence: -source_lengths[sentence])
        unsort_to_original_source = sorted(range(len(source_lengths)), 
                             key=lambda sentence: sort_original_source[sentence])
        
        source_sentence = source_sentence[sort_original_source]
        _source_mask = source_mask[sort_original_source]
        source_lengths = source_lengths[sort_original_source]
        batch_size, seq_len_source = source_sentence.size()
        
        # init hidden
        if self.GRU.bidirectional:
            self.num_directions = 2
        else:
            self.num_directions = 1
        
        self.hidden_source = self.init_hidden(batch_size)
        # (self.num_layers*self.num_directions, batch_size, self.hidden_size)
        # (1, 32, 256)
        # https://pytorch.org/docs/stable/nn.html
#         print ("self hidden size. = "+str(self.hidden_source.size()))
        
        # If batch_first == True, then the input and output tensors are provided as 
        # (batch_size, seq_len, feature)
        # https://pytorch.org/docs/stable/nn.html
#         print ("seq len source = "+str(seq_len_source))
        embeds_source = self.embed_source(source_sentence).view(batch_size, seq_len_source,
                                                               self.embed_size)
        
#         print ("embeds source size = "+str(embeds_source.size()))
        
        embeds_source = source_mask*embeds_source + (1-_source_mask)*embeds_source.clone().detach()
        
#         print ("embeds source after mask size = "+str(embeds_source.size()))
        
        embeds_source = torch.nn.utils.rnn.pack_padded_sequence(embeds_source, 
                                                                source_lengths, 
                                                                batch_first=True)
        
        gru_out_source, self.hidden_source = self.GRU(embeds_source, self.hidden_source)
        
#         print ("hidden source size = "+str(self.hidden_source.size()))
        
        
        # ref: pytorch documentation
        # hidden source : h_n of shape 
        # (num_layers * num_directions, batch_size, hidden_size)
#         print ("hidden source size = "+str(self.hidden_source.size()))
        
        # ref: pytorch documentation
        # Like output, the layers can be separated using 
        # h_n.view(num_layers, num_directions, batch_size, hidden_size)
        hidden_source = self.hidden_source.view(self.num_layers, self.num_directions, 
                                                batch_size, self.hidden_size)
        # the following should print (1, 1, 32, 256) for this config
#         print ("hidden source size after view = "+str(hidden_source.size()))
        
        # get the mean along 0th axis (over layers)
        hidden_source = torch.mean(hidden_source, dim=0) ## mean instead of sum for source representation as suggested in the class
        # the following should print (1, 32, 256)
#         print ("hidden source size after mean = "+str(hidden_source.size()))
        
        if self.GRU.bidirectional:
            hidden_source = torch.cat([hidden_source[:,i,:] for i in range(self.num_directions)], dim=1)
            gru_out_source = gru_out_source
        else:
            hidden_source = hidden_source
            gru_out_source = gru_out_source
            
        # view before unsort
        hidden_source = hidden_source.view(batch_size, self.hidden_size)
        
        # the following should print (32, 256)
        # print("hidden source size before unsort = "+str(hidden_source.size()))
        # UNSORT HIDDEN
        hidden_source = hidden_source[unsort_to_original_source] ## back to original indices
        
        gru_out_source, _ = torch.nn.utils.rnn.pad_packed_sequence(gru_out_source,
                                                                  batch_first=True)
        
#         ### UNSORT GRU OUT
#         # get the mean for the GRU output (batch_size, output size, hidden_size)
#         gru_out_source = torch.mean(gru_out_source, dim=1).view(batch_size, 1, self.hidden_size)
#         gru_out_source = gru_out_source[unsort_to_original_source]
# #         print ("gru_out_source size = "+str(gru_out_source.size()))
        
        source_lengths = source_lengths[unsort_to_original_source]
        
        # here we return both hidden and out since we will pass both to
        # the attention decoder
        return hidden_source, source_lengths

In [58]:
class RNNdecoder(nn.Module):
    def __init__(self,
                 vocab_size=len(zhen_en_train_token2id), # for chinese-english's english
                 embedding_size=300,
                 percent_dropout=0.3, 
                 hidden_size=256,
                 num_gru_layers=1,
                 max_sentence_len=15):
        
        super(RNNdecoder, self).__init__()
        self.vocab_size = vocab_size
        self.embed_size = embedding_size
        self.dropout = percent_dropout
        self.max_sentence_len = max_sentence_len

        self.hidden_size = hidden_size
        self.num_layers = num_gru_layers
        
        self.GRU = nn.GRU(self.embed_size, 
                          self.hidden_size, 
                          self.num_layers, 
                          batch_first=True, 
                          bidirectional=False)
        
        self.GRUcell = nn.GRUCell(self.embed_size, 
                          self.hidden_size)
        
        self.ReLU = nn.ReLU
        
        self.drop_out_function = nn.Dropout(self.dropout)
        
        self.embed_target = nn.Embedding(self.vocab_size,
                                         self.embed_size, padding_idx=0)
        
        self.sigmoid = nn.Sigmoid()
        
        # *2 because we are concating hidden with embedding plus context
        self.linear_layer = nn.Linear(self.hidden_size*2, self.vocab_size)
        self.log_softmax = nn.LogSoftmax(dim=0)
        self.softmax = nn.Softmax(dim=0)
        
    def init_hidden(self, batch_size):
        hidden = torch.zeros(self.num_layers, 
                             batch_size, self.hidden_size).to(device)
        
        return hidden

    def forward(self,
                decoder_hidden, ## decoder_hidden = encoder_hidden at first time_step
                input_, # input
                target_lengths,
                target_mask,
                time_step):
        
        # input (batch_size, seq_len_target = 1)
        # hidden (self.num_layers*self.num_directions, batch_size, self.hidden_size)
        
        self.input = input_
#         print ("self.input size = "+str(self.input.size()))
        
        sort_original_target = sorted(range(len(target_lengths)), 
                             key=lambda sentence: -target_lengths[sentence])
        unsort_to_original_target = sorted(range(len(target_lengths)), 
                             key=lambda sentence: sort_original_target[sentence])
        
        self.input = self.input[sort_original_target]
        _target_mask = target_mask[sort_original_target]
        target_lengths = target_lengths[sort_original_target]
        
        # seq_len_target is always 1 in the decoder since we are 
        # passing the tokens for only 1 time_step at a time
        batch_size, seq_len_target = self.input.size()
        
        if self.GRU.bidirectional:
            self.num_directions = 2
        else:
            self.num_directions = 1
        
        # hidden => initial hidden will be the same as the context
        # vector, which is the hidden_source tensor
        # then as we update the hidden state at each time step, this will be 
        # updated as well
        self.hidden = decoder_hidden.view(self.num_layers*self.num_directions,
                                          batch_size, self.hidden_size)
        
        # the following should print (1, 32, 256) for this config
#         print ("self.hidden size = "+str(self.hidden.size()))
        
        self.input = self.input.unsqueeze(1)
        
        embeds_target = self.drop_out_function(self.embed_target(self.input.long())).view(batch_size,
                                                                                   seq_len_target,
                                                                                   -1)
    
#         embeds_target = target_mask*embeds_target + (1-_target_mask)*embeds_target.clone().detach()
        embeds_target = target_mask[:,time_step,:].unsqueeze(1)*embeds_target + \
                        (1-_target_mask[:,time_step,:].unsqueeze(1))*embeds_target.clone().detach()

#         print ("embeds_target size = "+str(embeds_target.size()))    
        
#         embeds_target = torch.nn.utils.rnn.pack_padded_sequence(embeds_target,
#                                                         target_lengths,
#                                                         batch_first=True)
        
#         print ("type embeds target = "+str(type(embeds_target)))

        gru_out_target, self.hidden = self.GRU(embeds_target.data.view(batch_size, 1, self.embed_size),
                                               self.hidden)
        
        # ref: pytorch documentation
        # hidden source : h_n of shape 
        # (num_layers * num_directions, batch_size, hidden_size)
        # the following should print (1, 32, 256) for this config
#         print ("hidden size after GRU = "+str(self.hidden.size()))
        
        # undo packing 
#         gru_out_target, _ = torch.nn.utils.rnn.pad_packed_sequence(gru_out_target,
#                                                                    batch_first=True)
        
#         print ("out size after GRU = "+str(gru_out_target.size()))


        hidden = self.hidden.view(self.num_layers, self.num_directions,
                                  batch_size, self.hidden_size)
        hidden = torch.sum(hidden, dim=0) # we don't divide here, just sum
    
#         print ("hidden size = "+str(hidden.size()))
        
        if self.GRU.bidirectional:
            # separate layers
            gru_out_target = gru_out_target.contiguous().view(seq_len_target,
                                                              batch_size,
                                                              self.num_directions,
                                                              self.hidden_size)
        else:
            gru_out_target = gru_out_target
        
#         print ("gru out size = "+str(gru_out_target.size()))
        
        # sum along sequence
        gru_out_target = torch.sum(gru_out_target, dim=1) # we don't divide here, just sum
        
        if self.GRU.bidirectional:
            hidden = torch.cat([hidden[:,i,:] for i in range(self.num_directions)], 
                               dim=0)
            gru_out_target = torch.cat([gru_out_target[:,i,:] for i in range(self.num_directions)], 
                                       dim=1)
        else:
            hidden = hidden.view(batch_size, 
                                 self.num_directions, self.hidden_size)
            gru_out_target = gru_out_target.view(batch_size,
                                                 self.num_directions, self.hidden_size)
        
        hidden = hidden[unsort_to_original_target] ## back to original indices
        gru_out_target = gru_out_target[unsort_to_original_target] ## back to original indices

        gru_out_target = self.sigmoid(gru_out_target)
        # concating embedding + context = gru_out_target with hidden
        out = torch.cat([gru_out_target,hidden], dim=2)
        
#         print ("out size after concat = "+str(out.size()))
        
        out = self.linear_layer(out)
        
        # softmax over vocabulary
        pred = self.log_softmax(out)

        return pred, hidden


In [59]:
torch.ones(32,1).size()

torch.Size([32, 1])

In [38]:
def convert_to_softmax(tensor_of_indices,
                       batch_size,
                       vocab_size = len(zhen_en_train_token2id)):
    """
    - takes as input a time_step vector of the batch (t-th token of each sentence in the batch)
      size: (batch_size, 1)
    - converts it to softmax of (batch_size, vocab_size)
    """
    index_tensor_ = tensor_of_indices.view(-1,1).long()
        
    one_hot = torch.FloatTensor(batch_size, vocab_size).zero_()
    one_hot.scatter_(1, index_tensor_, 1)
    
    return one_hot

In [39]:
convert_to_softmax(torch.FloatTensor([2,3,4]), 3).size()

torch.Size([3, 59325])

In [40]:
convert_to_softmax(torch.ones(32), 32)

tensor([[0., 1., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.]])

In [41]:
torch.FloatTensor([[2,3,4,4],[4,5,6,6]]).topk(1)

(tensor([[4.],
         [6.]]), tensor([[3],
         [3]]))

In [42]:
torch.cat((torch.empty(32,1), torch.ones(32,1)),1).shape

torch.Size([32, 2])

In [65]:
# chinese -> english
enc = RNNencoder(vocab_size=len(zhen_zh_train_token2id), # for chinese
                 embedding_size=300,
                 percent_dropout=0.3, 
                 hidden_size=256,
                 num_gru_layers=16)

dec = RNNdecoder(vocab_size=len(zhen_en_train_token2id), # for chinese-english's english
                 embedding_size=300,
                 percent_dropout=0.3, 
                 hidden_size=256,
                 num_gru_layers=1)

# model = Translate(enc, dec).to(device)


# train

BATCH_SIZE = 32
def train(encoder, decoder, loader=zhen_train_loader,
          optimizer = torch.optim.Adam([*enc.parameters()] + [*dec.parameters()], lr=1e-4),
#           encoder_optimizer = torch.optim.Adam(enc.parameters(), lr=1e-4),
#           decoder_optimizer = torch.optim.Adam(dec.parameters(), lr=1e-4),
          epoch=None):
    
#     encoder_optimizer.zero_grad()
#     decoder_optimizer.zero_grad()

    optimizer.zero_grad()
    
    loss = 0
    
    for batch_idx, (source_sentence, source_mask, source_lengths, 
                    target_sentence, target_mask, target_lengths)\
                    in enumerate(loader):
        
        source_sentence, source_mask = source_sentence.to(device), source_mask.to(device) 
        target_sentence, target_mask = target_sentence.to(device), target_mask.to(device)
        
        encoder_hidden, source_lengths = encoder(source_sentence,
                                               source_mask,
                                               source_lengths)
        
        decoder_hidden = encoder_hidden
        
        # decoder should start with SOS tokens 
        # ref: https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html
        input_ = SOS_token*torch.ones(BATCH_SIZE,1).view(-1,1).to(device)
        
        for t in range(0, target_sentence.size(1)):
            
            decoder_out, decoder_hidden = decoder(decoder_hidden, # = gru_out_source - instead of encoded_source[0]
                                                 input_, # instead of target sentence up to t 
                                                 target_lengths,  # target lengths
                                                 target_mask,
                                                 t)
            
#             print ("decoder out size = "+str(decoder_out.size()))
            target_tokens = convert_to_softmax(target_sentence[:,t], BATCH_SIZE)
            
            loss += F.binary_cross_entropy(F.sigmoid(decoder_out), target_tokens)
        print ("loss = "+str(loss))
        loss.backward(retain_graph = True)
        
        optimizer.step()
            
#             epoch_loss.backward(retain_graph = True) # if necessary call retain_graph = True
            
            
#             encoder_optimizer.step()
#             decoder_optimizer.step()
            
    return epoch_loss/BATCH_SIZE
    

In [None]:
num_epochs = 10
lr = 1e-4
# batch_

loss_train = []

for epoch in range(num_epochs):
    print ("epoch = "+str(epoch))

    loss = train(enc, dec,
                 loader = zhen_train_loader,
                 optimizer = torch.optim.Adam([*enc.parameters()] + [*dec.parameters()], lr=1e-4),
                 epoch = epoch)
    
    loss_train.append(loss)
    
    print (loss_train)

epoch = 0


  "Please ensure they have the same size.".format(target.size(), input.size()))


loss = tensor(0.4624, grad_fn=<ThAddBackward>)
loss = tensor(0.9248, grad_fn=<ThAddBackward>)
loss = tensor(1.3872, grad_fn=<ThAddBackward>)
loss = tensor(1.8496, grad_fn=<ThAddBackward>)
loss = tensor(2.3120, grad_fn=<ThAddBackward>)
loss = tensor(2.7744, grad_fn=<ThAddBackward>)
loss = tensor(3.2368, grad_fn=<ThAddBackward>)
loss = tensor(3.6992, grad_fn=<ThAddBackward>)
loss = tensor(4.1616, grad_fn=<ThAddBackward>)
loss = tensor(4.6240, grad_fn=<ThAddBackward>)
loss = tensor(5.0864, grad_fn=<ThAddBackward>)
loss = tensor(5.5488, grad_fn=<ThAddBackward>)
loss = tensor(6.0112, grad_fn=<ThAddBackward>)
loss = tensor(6.4736, grad_fn=<ThAddBackward>)
loss = tensor(6.9360, grad_fn=<ThAddBackward>)
loss = tensor(7.3984, grad_fn=<ThAddBackward>)
loss = tensor(7.8608, grad_fn=<ThAddBackward>)
loss = tensor(8.3231, grad_fn=<ThAddBackward>)
loss = tensor(8.7855, grad_fn=<ThAddBackward>)
loss = tensor(9.2479, grad_fn=<ThAddBackward>)
loss = tensor(9.7103, grad_fn=<ThAddBackward>)
loss = tensor

### 2.2 RNN-based Encoder-Decoder with Attention

#### 2.2.1 RNN Encoder

In [None]:
# from https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html
# same as 1st model's RNN encoder except that works on one token at a time
# the different part is the attention decoder in model 2

class attnRNNencoder(nn.Module):
    def __init__(self,
                 vocab_size=len(zhen_zh_train_token2id), # for chinese
                 embedding_size=300,
                 percent_dropout=0.3, 
                 hidden_size=256,
                 num_gru_layers=4,
                 max_sentence_len=50):
        
        super(attnRNNencoder, self).__init__()
        
        self.hidden_size = hidden_size
        self.num_layers = num_gru_layers
        
        self.vocab_size = vocab_size
        self.embed_size = embedding_size
        self.dropout = percent_dropout
        self.embed_source = nn.Embedding(self.vocab_size,
                                         self.embed_size,
                                         padding_idx=0
                                        )
        
        self.max_sentence_len = max_sentence_len
        
        self.GRU = nn.GRU(self.embed_size, 
                          self.hidden_size, 
                          self.num_layers, 
                          batch_first=True, 
                          bidirectional=False)
        
        self.drop_out_function = nn.Dropout(self.dropout)
        
    def init_hidden(self, batch_size):
        
        hidden_ = torch.zeros(self.num_layers*self.num_directions, 
                             batch_size, self.hidden_size).to(device)
        return hidden_

    def forward(self, source_sentence, source_mask, source_lengths,
                time_step):
        """Returns source lengths to feed into the decoder, since we do not want
        the translation length to be above/below a certain treshold*source sentence length."""
        
        source_sentence = source_sentence.view(-1,1)
        # print ("source size = "+str(source_sentence.size()))
        # (batch_size, 1)
        
        sort_original_source = sorted(range(len(source_lengths)), 
                             key=lambda sentence: -source_lengths[sentence])
        unsort_to_original_source = sorted(range(len(source_lengths)), 
                             key=lambda sentence: sort_original_source[sentence])
        
        source_sentence = source_sentence[sort_original_source]
        _source_mask = source_mask[sort_original_source]
        source_lengths = source_lengths[sort_original_source]
        batch_size, seq_len_source = source_sentence.size()
        
        if self.GRU.bidirectional:
            self.num_directions = 2
        else:
            self.num_directions = 1
            
        self.hidden_source = self.init_hidden(batch_size)
        # (self.num_layers*self.num_directions, batch_size, self.hidden_size)
        # (1, 32, 256)
        # https://pytorch.org/docs/stable/nn.html
        # print ("self hidden size. = "+str(self.hidden_source.size()))
        
        # If batch_first == True, then the input and output tensors are provided as 
        # (batch_size, seq_len, feature)
        # https://pytorch.org/docs/stable/nn.html
        # print ("seq len source = "+str(seq_len_source))
        
        source_sentence = source_sentence.unsqueeze(1)
        
        embeds_source = self.embed_source(source_sentence).view(batch_size, seq_len_source,
                                                               self.embed_size)
        
        # print ("embeds source size = "+str(embeds_source.size()))
        
        embeds_source = source_mask[:,time_step,:].unsqueeze(1)*embeds_source + \
                        (1-_source_mask[:,time_step,:].unsqueeze(1))*embeds_source.clone().detach()
        
        # print ("embeds source after mask size = "+str(embeds_source.size()))
        
        
#         embeds_source = torch.nn.utils.rnn.pack_padded_sequence(embeds_source, 
#                                                                 source_lengths, 
#                                                                 batch_first=True)
        
        gru_out_source, self.hidden_source = self.GRU(embeds_source, self.hidden_source)
        
        # print ("gru out source size = "+str(gru_out_source.size()))
        
        # print ("hidden source size = "+str(self.hidden_source.size()))
        # print ("gru out source size = "+str(gru_out_source.size()))
        
        # hidden source size = torch.Size([1, 32, 256])
        # gru out source size = torch.Size([32, 350, 256])
        
        # ref: pytorch documentation
        # hidden source : h_n of shape 
        # (num_layers * num_directions, batch_size, hidden_size)
        # print ("hidden source size = "+str(self.hidden_source.size()))
        
        # ref: pytorch documentation
        # Like output, the layers can be separated using 
        # h_n.view(num_layers, num_directions, batch_size, hidden_size)
        hidden_source = self.hidden_source.view(self.num_layers, self.num_directions, 
                                                batch_size, self.hidden_size)
        
        # print ("hidden source size = "+str(hidden_source.size()))
        # hidden source size = torch.Size([1, 1, 32, 256])
        
        # the following should print (1, 1, 32, 256) for this config
        # print ("hidden source size after view = "+str(hidden_source.size()))
        
        # get the mean along 0th axis (over layers)
        hidden_source = torch.mean(hidden_source, dim=0) ## mean instead of sum for source representation as suggested in the class
        # the following should print (1, 32, 256)
        # print ("hidden source size after mean = "+str(hidden_source.size()))
        
        if self.GRU.bidirectional:
            hidden_source = torch.cat([hidden_source[:,i,:] for i in range(self.num_directions)], dim=1)
            gru_out_source = gru_out_source
        else:
            hidden_source = hidden_source
            gru_out_source = gru_out_source
            
        # view before unsort
        hidden_source = hidden_source.view(batch_size, self.hidden_size)
        
        # the following should print (32, 256)
        # print("hidden source size before unsort = "+str(hidden_source.size()))
        # UNSORT HIDDEN
        hidden_source = hidden_source[unsort_to_original_source] ## back to original indices
        
#         gru_out_source, _ = torch.nn.utils.rnn.pad_packed_sequence(gru_out_source,
#                                                                   batch_first=True)
        
        ### UNSORT GRU OUT
        # get the mean for the GRU output (batch_size, output size, hidden_size)
        gru_out_source = gru_out_source.view(batch_size, seq_len_source, self.hidden_size)
        # gru_out_source = torch.mean(gru_out_source, dim=1).view(batch_size, 1, self.hidden_size)
        gru_out_source = gru_out_source[unsort_to_original_source]
        # print ("gru_out_source size = "+str(gru_out_source.size()))
        
        source_lengths = source_lengths[unsort_to_original_source]
        
        # here we return both hidden and out since we will pass both to
        # the attention decoder
        return hidden_source, gru_out_source, source_lengths

#### 2.2.2 Attention Decoder

In [None]:
# from https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html

class AttnDecoderRNN(nn.Module):
    def __init__(self,
                 vocab_size=len(zhen_zh_train_token2id), 
                 embedding_size=300,
                 percent_dropout=0.3, 
                 hidden_size=256,
                 max_sentence_len=50, 
                 num_gru_layers=1):

        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.dropout = percent_dropout
        self.max_sentence_len = max_sentence_len
        self.num_layers = num_gru_layers
        self.embed_size = embedding_size
        
        self.embed_target = nn.Embedding(self.vocab_size,
                                         self.hidden_size,
                                         padding_idx=0
                                        )
        
        self.GRU = nn.GRU(self.hidden_size, 
                          self.hidden_size,
                          self.num_layers, 
                          batch_first=True, 
                          bidirectional=False)
        
        # we concat embeds with hidden before attention, thus the input size
        # of the linear attn layer is embed + hidden, and the output is hidden.
        self.attn = nn.Linear(self.hidden_size*2, 
                              self.max_sentence_len)
        
        # we combine embeds with attention applied (self.attn out) before attn_combine
        # so the input size of the linear attn_combine layer is embed_size + hidden_size 
        # 
        self.attn_combine = nn.Linear(self.hidden_size*2, 
                                      self.hidden_size)
        
        self.dropout = nn.Dropout(self.dropout)
        
        self.out = nn.Linear(self.hidden_size, self.vocab_size)
        
        self.log_softmax = nn.LogSoftmax()
        
    def forward(self,
                hidden, ## decoder_hidden = encoder_hidden at first time_step
                input_, # input (batch_size, seq_len = 1)
                encoder_outputs, # (encoder hidden and encoder out)
                target_lengths,
                target_mask,
                time_step):
        
        # input (batch_size, seq_len = 1)
        self.input = input_
        print ("input size. ="+str(self.input.size()))
        
        sort_original_target = sorted(range(len(target_lengths)), 
                             key=lambda sentence: -target_lengths[sentence])
        unsort_to_original_target = sorted(range(len(target_lengths)), 
                             key=lambda sentence: sort_original_target[sentence])
        
        self.input = self.input[sort_original_target]
        _target_mask = target_mask[sort_original_target]
        target_lengths = target_lengths[sort_original_target]
        
        # seq_len_target is always 1 in the decoder since we are 
        # passing the tokens for only 1 time_step at a time
        batch_size, seq_len_target = self.input.size()
        
        if self.GRU.bidirectional:
            self.num_directions = 2
        else:
            self.num_directions = 1
            
        self.hidden = hidden.view(batch_size, 
                                  self.num_layers*self.num_directions, 
                                  self.hidden_size)
        
        self.input = self.input.unsqueeze(1)
        
        
        embeds_target = self.dropout(self.embed_target(self.input.long()))\
                                                                .view(batch_size,
                                                                      seq_len_target, -1)
        
        embeds_target = target_mask[:,time_step,:].unsqueeze(1)*embeds_target + \
                        (1-_target_mask[:,time_step,:].unsqueeze(1))*embeds_target.clone().detach()
        
        # print ("embeds target size = "+str(embeds_target.size()))

        attn_weights = F.softmax(self.attn(torch.cat((embeds_target, self.hidden), 2)), dim=2)
#         print ("attn_weights size = "+str(attn_weights.size()))
        
        # try for loop and bmm and see if these are the same 
        # print ("enc out size = "+str(encoder_outputs.size()))
        attn_applied = torch.zeros(batch_size, self.max_sentence_len, self.hidden_size)
        
        for i in range(batch_size):
#             print ("attn_weights[i] = "+str(attn_weights[i]))
#             print ("encoder_outputs[i] = "+str(encoder_outputs[i]))
            apply = torch.bmm(attn_weights[i].unsqueeze(0),
                              encoder_outputs[i].unsqueeze(0))
            
            attn_applied[i] = apply
        
        print ("attn_applied size = "+str(attn_applied.size()))
        print ("embeds target size = "+ str(embeds_target.size()))
#         print ("encoder outputs = "+str(encoder_outputs))
        print ("attn_applied[:,time_step,:] size = "+str(attn_applied[:,time_step,:].view(batch_size,
                                                                                          1, self.hidden_size).size()))

        output = torch.cat((embeds_target,
                            attn_applied[:,time_step,:].view(batch_size,1,
                                                             self.hidden_size)),2)
        
        output = self.attn_combine(output)
        
        output = F.relu(output)
        
        self.hidden = self.hidden.view(self.num_layers*self.num_directions,
                                       batch_size,
                                       self.hidden_size)

        output, self.hidden = self.GRU(output, self.hidden)
        
        self.hidden = self.hidden.view(batch_size,
                                       self.num_layers*self.num_directions,
                                       self.hidden_size)
        
        output = output[unsort_to_original_target]
        self.hidden = self.hidden[unsort_to_original_target]
        
        print ("output size = "+str(output.size()))
        print ("hidden size = "+str(self.hidden.size()))
        
        output = self.out(output)
        print ("out after linear size = "+str(output.size()))
        
        output = F.log_softmax(output, dim=2)
        print ("logsoft size = "+str(output.size()))
        
        return output, hidden, attn_weights
    
    

In [None]:
BATCH_SIZE = 32

class AttnTranslate(nn.Module):
    def __init__(self, encoder, decoder, use_teacher_forcing=False):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.use_teacher_forcing = use_teacher_forcing
        self.max_length = self.encoder.max_sentence_len
        
    def forward(self, source_sentence, target_sentence, 
                source_mask, target_mask, source_lengths,
                target_lengths):
        
        # following should print (batch_size, max_sentence_len) = (32, 350)
        # print ("target_sentence size = "+str(target_sentence.size()))
        
        # to hold previously decoded ys
        y_outputs = torch.zeros(batch_size, 
                                target_sentence.size(1), 
                                len(zhen_en_train_token2id)).to(device)
        
        encoder_outputs = torch.zeros(BATCH_SIZE,
                                      self.max_length, 
                                      self.encoder.hidden_size, 
                                      device=device)
        
        for i in range(self.max_length):
            #last hidden state of the encoder is the context
            encoder_hidden, encoder_output, source_lengths = self.encoder(source_sentence[:,i],
                                                                          source_mask,
                                                                          source_lengths,
                                                                          i) # i as time_step
            # doing what we want, uncomment the prints below to check
            # i-th time_step token of each sentence in batch is filled with the corresponding
            # encoder output
            encoder_outputs[:,i,:] = encoder_output.unsqueeze(1)[:,0,0]
            
            # print ("encoder_outputs[:,i,:] size = "+str(encoder_outputs[:,i,:].size()))
            # print ("encoder outputs size = "+str(encoder_outputs.size()))
            
            # print ("encoder outputs = "+str(encoder_outputs))
        
#         print ("encoder outputs size = "+str(encoder_outputs.size()))
#         print ("enc outs = "+str(encoder_outputs))

        # encoder hidden also used as the initial hidden state of the decoder
        decoder_hidden = encoder_hidden

        # decoder should start with SOS tokens 
        # ref: https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html
        input_ = SOS_token*torch.ones(BATCH_SIZE,1).view(-1,1)
        
        # TODO
        # Obtain target tensor using convert_to_softmax (debug function first)  
        # target tensor -> batch_size, max_sent_len, vocab_size = 32, 350, vocab_size
#         target_tensor = torch.zeros()
        # append it to y_outputs

        target_length = target_sentence.size(1)

        if self.use_teacher_forcing:
            # Teacher forcing: Feed the target as the next input
            for di in range(target_length):
                # target tensor -> (batch_size, vocab_size) of t-th time step tokens 
                # from each sentence, converted to softmax (binary)
                target_tensor = convert_to_softmax(target_sentence[:,di],32)
#                 print ("target_tensor = "+str(target_tensor))
                print ("target tensor size = "+str(target_tensor.size()))
                
                # take ith token from each sentence in the batch, and convert it to 
                # softmax
                decoder_out, decoder_hidden, decoder_attention = self.decoder(
                    decoder_hidden, input_, encoder_outputs,
                    target_lengths, target_mask, di) # di as time_step
                
                # decoder out should be size (32, 1, vocab_size)
                
                loss += loss_function(decoder_out, target_tensor[:,di,:]) # slicing (whole batch, 
                                                                          #          token_index, vocab_size)
                decoder_input = target_sentence[:,di] # Teacher forcing
            
        else:
            # Without teacher forcing: use its own predictions as the next input
            # just like we did in the RNN encoder-decoder above
            for di in range(target_length):
                # target tensor -> (batch_size, vocab_size) of t-th time step tokens 
                # from each sentence, converted to softmax (binary)
                target_tensor = convert_to_softmax(target_sentence[:,di],32)
#                 print ("target_tensor = "+str(target_tensor))
                print ("target tensor size = "+str(target_tensor.size()))
                
                decoder_out, decoder_hidden, decoder_attention = self.decoder(
                    decoder_hidden, input_, encoder_outputs, target_lengths,
                    target_mask, di)
                
                token_out = torch.max(decoder_out.view(BATCH_SIZE,self.decoder.vocab_size),1)[1]
                input_ = token_out.view(-1,1)
                print ("decoder input size = "+str(input_.size()))

                loss = nn.functional.binary_cross_entropy(decoder_out, target_tensor[di])
            
        for t in range(0, target_sentence.size(1)):
            
            decoder_out, decoder_hidden = self.decoder(decoder_hidden, # = gru_out_source - instead of encoded_source[0]
                                                 input_, # instead of target sentence up to t 
                                                 target_lengths,  # target lengths
                                                 target_mask,
                                                 t)
            
#             print ("decoder out size = "+str(decoder_out.size()))
            for s in range(batch_size):
                y_outputs[s,t] = decoder_out[s,0]
            
        return y_outputs

In [175]:
BATCH_SIZE = 32
def train(model, loader=zhen_train_loader,criterion=loss_function,
          encoder_optimizer=None, decoder_optimizer = None, 
          epoch=None):
    
    model.train()
    
    epoch_loss = 0
    
    for batch_idx, (source_sentence, source_mask, source_lengths, 
                    target_sentence, target_mask, target_lengths)\
    in enumerate(loader):
        
        source_sentence, source_mask = source_sentence.to(device), source_mask.to(device),  
        target_sentence, target_mask = target_sentence.to(device), target_mask.to(device),
        
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()
        
        # output softmax as generated by decoder 
        output = model(source_sentence, target_sentence, 
                source_mask, target_mask, source_lengths,
                target_lengths)
        
        print ("output size = "+str(output.size()))
        
        batch_target = torch.zeros(batch_size,model.decoder.embed_size)
        for i in range(target_sentence.size(0)):
            one_hot_ = convert_to_softmax(target_sentence[i])
            batch_target[i] = one_hot_
            
        print ("batch_target[ix] sum = "+str(torch.sum(batch_target[0],1)))

        loss = criterion(out, target)
        print ("loss = "+str(loss))
        
        loss.backward()
        
        print ("backprop done.")
        
#         torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        encoder_optimizer.step()
        decoder_optimizer.step()
        
        print ("steps done")
        epoch_loss += loss.item()
        
    return epoch_loss/BATCH_SIZE

In [176]:
# chinese -> english
enc = attnRNNencoder(vocab_size=len(zhen_zh_train_token2id), # for chinese
                 embedding_size=300,
                 percent_dropout=0.3, 
                 hidden_size=256,
                 num_gru_layers=1)

dec = AttnDecoderRNN(vocab_size=len(zhen_en_train_token2id), # for chinese-english's english
                 embedding_size=300,
                 percent_dropout=0.3, 
                 hidden_size=256,
                 num_gru_layers=1)

model = AttnTranslate(enc, dec).to(device)

In [177]:
num_epochs = 3
lr = 1e-4
# batch_

loss_train = []

for epoch in range(num_epochs):
    print ("epoch = "+str(epoch))

    loss = train(model,
                       loader = zhen_train_loader,
                       encoder_optimizer = torch.optim.Adam(model.encoder.parameters(), 
                                                   lr=lr),
                       decoder_optimizer = torch.optim.Adam(model.decoder.parameters(),
                                                    lr = lr),
                      epoch = epoch)
    
    loss_train.append(loss)
    
    print (loss_train)

epoch = 0
target tensor size = torch.Size([32, 59325])
input size. =torch.Size([32, 1])
attn_applied size = torch.Size([32, 50, 256])
embeds target size = torch.Size([32, 1, 256])
attn_applied[:,time_step,:] size = torch.Size([32, 1, 256])
output size = torch.Size([32, 1, 256])
hidden size = torch.Size([32, 1, 256])
out after linear size = torch.Size([32, 1, 59325])
logsoft size = torch.Size([32, 1, 59325])
decoder input size = torch.Size([32, 1])


UnboundLocalError: local variable 'loss' referenced before assignment

### 2.3 Encoder Replacement with Convolutional or Self-attention-based Encoder

In [529]:
m = nn.Conv1d(300, 256, 1, stride=2)
input = torch.randn(32, 1, 300)
output = m(input.transpose(1,2)).transpose(1,2)

In [530]:
output.size()

torch.Size([32, 1, 256])

In [49]:
# modified from https://github.com/yanwii/seq2seq/blob/master/seq2seq.py

# ENCODER

class CNNencoder(nn.Module):
    def __init__(self, 
                 embedding_size, # in channels
                 hidden_size, 
                 kernel_size, 
                 padding = 1,
                 stride = 2,
                 percent_dropout = 0.3,
                 vocab_size = len(zhen_zh_train.index2word),
                 max_sentence_len=50):
        
        super(CNNencoder, self).__init__()

        self.hidden_size = hidden_size
        self.embedding_size = embedding_size
        self.kernel_size = kernel_size
        self.padding = padding
        self.vocab_size = vocab_size
        self.stride = stride
        self.dropout = nn.Dropout(percent_dropout)
        self.max_sentence_len = max_sentence_len
        
        self.embedding = nn.Embedding(self.vocab_size, 
                                      self.embedding_size)
        
        self.conv1 = nn.Conv1d(self.embedding_size, self.hidden_size, 
                               kernel_size=self.kernel_size, padding=self.padding,
                               stride=self.stride)

        self.conv2 = nn.Conv1d(self.hidden_size, self.hidden_size, 
                               kernel_size=self.kernel_size, padding=self.padding,
                               stride = self.stride)
        
        self.relu = nn.ReLU()
        self.maxpool_1 = nn.MaxPool1d(3, 1)
        self.maxpool_2 = nn.MaxPool1d(5, 2)
        
        self.sigmoid = nn.Sigmoid()
 

    def forward(self, input_):
        
        # input size = 1'e uydurmaya calis
        
        batch_size, seq_len = input_.size()
        
        embed = self.dropout(self.embedding(input_))
        # print ("embed size = "+str(embed.size()))
        # 32, 350, 300 check
        
        hidden = self.conv1(embed.transpose(1,2)).transpose(1,2)
        hidden = self.relu(hidden)
        hidden = self.maxpool_1(hidden.transpose(1,2)).transpose(1,2)
        
#         # second conv layer
#         hidden = self.conv2(hidden.transpose(1,2)).transpose(1,2)
#         hidden = self.relu(hidden)
#         hidden = self.maxpool_2(hidden.transpose(1,2)).transpose(1,2)

        # print ("hidden size = "+str(hidden.size()))
        hidden = nn.functional.glu(hidden)
        
        # sum 
        hidden = torch.mean(hidden, 1).view(batch_size, 1, hidden.size(-1))
        # sigmoid
        hidden = self.sigmoid(hidden)
        
        return hidden


In [50]:
class RNNdecoder_CNN(nn.Module):
    def __init__(self,
                 vocab_size=len(zhen_en_train_token2id), # for chinese-english's english
                 embedding_size=300,
                 percent_dropout=0.3, 
                 hidden_size=512,
                 num_gru_layers=1,
                 max_sentence_len=15):
        
        super(RNNdecoder_CNN, self).__init__()
        self.vocab_size = vocab_size
        self.embed_size = embedding_size
        self.dropout = percent_dropout
        self.max_sentence_len = max_sentence_len

        self.hidden_size = hidden_size
        self.num_layers = num_gru_layers
        
        self.GRU = nn.GRU(self.embed_size, 
                          self.hidden_size, 
                          self.num_layers, 
                          batch_first=True, 
                          bidirectional=False)
        
        self.ReLU = nn.ReLU
        
        self.drop_out_function = nn.Dropout(self.dropout)
        
        self.embed_target = nn.Embedding(self.vocab_size,
                                         self.embed_size, padding_idx=0)
        
        self.sigmoid = nn.Sigmoid()
        
        # *2 because we are concating hidden with embedding plus context
        self.linear_layer = nn.Linear(self.hidden_size*2, self.vocab_size)
        self.log_softmax = nn.LogSoftmax(dim=0)
        self.softmax = nn.Softmax(dim=0)
        
    def init_hidden(self, batch_size):
        hidden = torch.zeros(self.num_layers, 
                             batch_size, self.hidden_size).to(device)
        
        return hidden

    def forward(self,
                decoder_hidden, ## decoder_hidden = encoder_hidden at first time_step
                input_, # input
                target_lengths,
                target_mask,
                time_step):
        
        self.input = input_
#         print ("input size = "+str(self.input.size()))
        
        sort_original_target = sorted(range(len(target_lengths)), 
                             key=lambda sentence: -target_lengths[sentence])
        unsort_to_original_target = sorted(range(len(target_lengths)), 
                             key=lambda sentence: sort_original_target[sentence])
        
        self.input = self.input[sort_original_target]
        _target_mask = target_mask[sort_original_target]
        target_lengths = target_lengths[sort_original_target]
        
        # seq_len_target is always 1 in the decoder since we are 
        # passing the tokens for only 1 time_step at a time
        batch_size, seq_len_target = self.input.size()
        
        if self.GRU.bidirectional:
            self.num_directions = 2
        else:
            self.num_directions = 1
        
        # hidden => initial hidden will be the same as the context
        # vector, which is the hidden_source tensor
        # then as we update the hidden state at each time step, this will be 
        # updated as well
        self.hidden = decoder_hidden.view(self.num_layers*self.num_directions,
                                          batch_size, self.hidden_size)
        
        # the following should print (1, 32, 256) for this config
        # print ("self.hidden size = "+str(self.hidden.size()))
        
        self.input = self.input.unsqueeze(1)
        
        embeds_target = self.drop_out_function(self.embed_target(self.input.long())).view(batch_size,
                                                                                   seq_len_target,
                                                                                   -1)
    
        embeds_target = target_mask[:,time_step,:].unsqueeze(1)*embeds_target + \
                        (1-_target_mask[:,time_step,:].unsqueeze(1))*embeds_target.clone().detach()


        gru_out_target, self.hidden = self.GRU(embeds_target.data.view(batch_size, 1, self.embed_size),
                                               self.hidden)
        
        # ref: pytorch documentation
        # hidden source : h_n of shape 
        # (num_layers * num_directions, batch_size, hidden_size)
        # the following should print (1, 32, 256) for this config
        # print ("hidden size after GRU = "+str(self.hidden.size()))


        hidden = self.hidden.view(self.num_layers, self.num_directions,
                                  batch_size, self.hidden_size)
        hidden = torch.sum(hidden, dim=0) # we don't divide here, just sum
        
        if self.GRU.bidirectional:
            # separate layers
            gru_out_target = gru_out_target.contiguous().view(seq_len_target,
                                                              batch_size,
                                                              self.num_directions,
                                                              self.hidden_size)
        else:
            gru_out_target = gru_out_target
        
#         print ("gru out size = "+str(gru_out_target.size()))
        
        # sum along sequence
        gru_out_target = torch.sum(gru_out_target, dim=1) # we don't divide here, just sum
        
        if self.GRU.bidirectional:
            hidden = torch.cat([hidden[:,i,:] for i in range(self.num_directions)], 
                               dim=0)
            gru_out_target = torch.cat([gru_out_target[:,i,:] for i in range(self.num_directions)], 
                                       dim=1)
        else:
            hidden = hidden.view(batch_size, 
                                 self.num_directions, self.hidden_size)
            gru_out_target = gru_out_target.view(batch_size,
                                                 self.num_directions, self.hidden_size)
        
        hidden = hidden[unsort_to_original_target] ## back to original indices
        gru_out_target = gru_out_target[unsort_to_original_target] ## back to original indices

        gru_out_target = self.sigmoid(gru_out_target)
        # concating embedding + context = gru_out_target with hidden
        out = torch.cat([gru_out_target,hidden], dim=2)
        
#         print ("out size after concat = "+str(out.size()))
        
        out = self.linear_layer(out)
        
        # softmax over vocabulary
        pred = self.log_softmax(out)

        return pred, hidden



In [58]:
# chinese -> english
enc = CNNencoder(300, # embed size
                 1024, # hidden size
                 3, # kernel size
                 padding = 1,
                 stride = 2,
                 percent_dropout = 0.3,
                 vocab_size = len(zhen_zh_train.index2word),
                 max_sentence_len=15)
    
dec = RNNdecoder_CNN()

# train

BATCH_SIZE = 32
def train(encoder, decoder, loader=zhen_train_loader,
          optimizer = torch.optim.Adam([*enc.parameters()] + [*dec.parameters()], lr=1e-4),
#           encoder_optimizer = torch.optim.Adam(enc.parameters(), lr=1e-4),
#           decoder_optimizer = torch.optim.Adam(dec.parameters(), lr=1e-4),
          epoch=None):
    
#     encoder_optimizer.zero_grad()
#     decoder_optimizer.zero_grad()

    optimizer.zero_grad()
    
    loss = 0
    
    for batch_idx, (source_sentence, source_mask, source_lengths, 
                    target_sentence, target_mask, target_lengths)\
                    in enumerate(loader):
        
        source_sentence, source_mask = source_sentence.to(device), source_mask.to(device) 
        target_sentence, target_mask = target_sentence.to(device), target_mask.to(device)
        
        encoder_hidden = encoder(source_sentence)
        
        decoder_hidden = encoder_hidden
        
        # decoder should start with SOS tokens 
        # ref: https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html
        input_ = SOS_token*torch.ones(BATCH_SIZE,1).view(-1,1).to(device)
        
        for t in range(0, target_sentence.size(1)):
            
            decoder_out, decoder_hidden = decoder(decoder_hidden, # = gru_out_source - instead of encoded_source[0]
                                                 input_, # instead of target sentence up to t 
                                                 target_lengths,  # target lengths
                                                 target_mask,
                                                 t)
            
#             print ("decoder out size = "+str(decoder_out.size()))
            target_tokens = convert_to_softmax(target_sentence[:,t], BATCH_SIZE)
            
            loss += F.binary_cross_entropy(F.sigmoid(decoder_out), target_tokens)
            
        print ("loss = "+str(loss))
        loss.backward(retain_graph = True)
        torch.nn.utils.clip_grad_norm_(encoder.parameters(), 50)
        torch.nn.utils.clip_grad_norm_(decoder.parameters(), 50)
        optimizer.step()
        
    torch.save(encoder.state_dict(), "encoder_state_dict")
    torch.save(decoder.state_dict(), "decoder_state_dict")
            
#             epoch_loss.backward(retain_graph = True) # if necessary call retain_graph = True
            
            
#             encoder_optimizer.step()
#             decoder_optimizer.step()
            
    return epoch_loss/BATCH_SIZE
    

In [None]:
num_epochs = 10
lr = 1e-4
# batch_

loss_train = []

for epoch in range(num_epochs):
    print ("epoch = "+str(epoch))

    loss = train(enc, dec,
                 loader = zhen_train_loader,
                 optimizer = torch.optim.Adam([*enc.parameters()] + [*dec.parameters()], lr=1e-4),
                 epoch = epoch)
    
    loss_train.append(loss)
    
    print (loss_train)

epoch = 0


  "Please ensure they have the same size.".format(target.size(), input.size()))


loss = tensor(0.4626, grad_fn=<ThAddBackward>)
loss = tensor(0.9252, grad_fn=<ThAddBackward>)
loss = tensor(1.3878, grad_fn=<ThAddBackward>)
loss = tensor(1.8504, grad_fn=<ThAddBackward>)
loss = tensor(2.3130, grad_fn=<ThAddBackward>)
loss = tensor(2.7756, grad_fn=<ThAddBackward>)
loss = tensor(3.2382, grad_fn=<ThAddBackward>)
loss = tensor(3.7007, grad_fn=<ThAddBackward>)
loss = tensor(4.1632, grad_fn=<ThAddBackward>)
loss = tensor(4.6258, grad_fn=<ThAddBackward>)
loss = tensor(5.0883, grad_fn=<ThAddBackward>)
loss = tensor(5.5508, grad_fn=<ThAddBackward>)
loss = tensor(6.0133, grad_fn=<ThAddBackward>)
loss = tensor(6.4758, grad_fn=<ThAddBackward>)


### 2.4 Fully self-attention Translation System

### 2.5 Multilingual Translation System