# Translation (VI - EN, ZH - EN)

In [130]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import torch
import torch.nn as nn 
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from io import open
from collections import Counter
from functools import partial
import unicodedata
import re
from torch.autograd import Variable
from gensim.models import KeyedVectors
from gensim.models.wrappers import FastText
import sacrebleu
import random
import time
from datetime import datetime
import pickle as pkl
import string
import os
from os import listdir 
from ast import literal_eval
from sklearn.metrics import confusion_matrix
import matplotlib.style
import matplotlib as mpl
import pickle
import math
from collections import OrderedDict


# Notebook-wide display and device configuration.
# Limit the rendered width of pandas columns to 100 characters.
pd.set_option('max_colwidth',100)
# Apply matplotlib's 'bmh' style sheet to all plots.
mpl.style.use('bmh')
# IPython magic (only valid inside IPython/Jupyter): draw figures inline in the notebook.
%matplotlib inline
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Process Raw Data

In [2]:
# Vocabulary indices reserved for the special tokens: start-of-sentence, end-of-sentence,
# padding, and unknown word. These occupy the first four slots of every vocabulary.
RESERVED_TOKENS = {'<SOS>': 0, '<EOS>': 1, '<PAD>': 2, '<UNK>': 3}

In [56]:
def text2tokens(raw_text_fp, src_or_targ): 
    """ 
    Read a raw text file and return it as a list of tokenized sentences.

    Each line is lower-cased and split on whitespace. Depending on src_or_targ:
      - 'source': '<EOS>' is appended to every sentence
      - 'target': every sentence is wrapped as '<SOS>' ... '<EOS>'
      - anything else: the sentences are returned without special tokens

    raw_text_fp: path of the text file, one sentence per line (read as UTF-8)
    src_or_targ: 'source' or 'target'
    Returns a list of lists of string tokens.
    """
    # Decode explicitly as UTF-8 so non-ASCII text (e.g. Chinese, Vietnamese) is read
    # the same way regardless of the platform's default locale encoding.
    with open(raw_text_fp, encoding='utf-8') as f:
        tokens_data = [line.lower().split() for line in f]

    if src_or_targ == 'source': 
        tokens_data = [datum + ['<EOS>'] for datum in tokens_data]
    elif src_or_targ == 'target': 
        tokens_data = [['<SOS>'] + datum + ['<EOS>'] for datum in tokens_data]
    return tokens_data 


def load_word2vec(lang_type): 
    """ 
    Load the pretrained word vectors for one language code (e.g. 'zh', 'vi', 'en').

    The vectors are read from the file "wiki.<lang_type>.vec" in the working directory
    via KeyedVectors.load_word2vec_format, and the loaded model is returned.
    """
    vec_path = "wiki.{}.vec".format(lang_type)
    return KeyedVectors.load_word2vec_format(vec_path)

def build_vocab(tokens_data, max_vocab_size, word2vec): 
    """ 
    Build a vocabulary from tokenized sentences.

    tokens_data:    list of tokenized sentences (lists of string tokens)
    max_vocab_size: total vocabulary size, including the reserved tokens
    word2vec:       any container supporting `token in word2vec`; used only to report
                    how much of the vocabulary/corpus has no pretrained vector

    Returns (token2id, id2token):
      token2id: dictionary mapping each token to its index
      id2token: list of tokens, where id2token[i] is the token with index i
    The reserved tokens come first (ordered by their reserved index), followed by the
    most frequent corpus tokens, up to max_vocab_size entries in total. Tokens are NOT
    filtered by word2vec membership; coverage is only printed.
    """
    num_vocab = max(max_vocab_size - len(RESERVED_TOKENS), 0)
    token_counter = Counter(token for sentence in tokens_data for token in sentence)
    num_tokens = sum(token_counter.values())

    # Order reserved tokens by their declared index instead of relying on dict order.
    reserved = sorted(RESERVED_TOKENS, key=RESERVED_TOKENS.get)
    # Skip corpus tokens that collide with a reserved token: a duplicate entry would
    # silently overwrite the reserved index in token2id.
    vocab = [token for token, _ in token_counter.most_common()
             if token not in RESERVED_TOKENS][:num_vocab]
    id2token = reserved + vocab
    token2id = {token: idx for idx, token in enumerate(id2token)}
    
    # check out how many words are in word2vec vs. not 
    num_not_in_word2vec = sum(1 for token in token2id if token not in word2vec)
    missing_count = sum(count for token, count in token_counter.items() if token not in word2vec)
    # An empty corpus has no tokens to cover; report 0% rather than dividing by zero.
    pct_of_corpus = 100 * missing_count / num_tokens if num_tokens else 0.0
    
    print("A vocabulary of {} is generated from a set of {} unique tokens.".format(len(token2id), len(token_counter)))
    print("{} vocab tokens are not in word2vec, comprising {:.1f}% of entire corpus.".format(num_not_in_word2vec, pct_of_corpus))
    
    return token2id, id2token 

def tokens2indices(tokens_data, token2id): 
    """ 
    Convert tokenized sentences into lists of vocabulary indices.

    Every token found in token2id is replaced by its index; any token missing from
    token2id is replaced by the reserved '<UNK>' index.
    """
    unk_id = RESERVED_TOKENS['<UNK>']
    return [[token2id.get(tok, unk_id) for tok in sentence] for sentence in tokens_data]

def get_filepath(split, src_lang, targ_lang, src_or_targ): 
    """ Locates data filepath given data split type (train/dev/test), translation pairs (src_lang -> targ_lang), 
        and the language type (source or target) 

        Returns "iwslt-<src_lang>-<targ_lang>/<split>.tok.<lang>", where <lang> is src_lang when
        src_or_targ == 'source' and targ_lang when src_or_targ == 'target'.
        Raises ValueError for any other value of src_or_targ.
    """
    if src_or_targ == 'source':
        lang = src_lang
    elif src_or_targ == 'target': 
        lang = targ_lang
    else:
        # Fail with a clear message instead of an UnboundLocalError further down.
        raise ValueError("src_or_targ must be 'source' or 'target', got {!r}".format(src_or_targ))
    folder_name = "iwslt-{}-{}/".format(src_lang, targ_lang)
    file_name = "{}.tok.{}".format(split, lang)
    return folder_name + file_name 


def get_filepaths(src_lang, targ_lang): 
    """ 
    Build the nested dictionary of data filepaths for a translation pair.

    The result stores the language codes under 'languages' and, for each split
    (train/dev/test) and each side (source/target), a dict holding its 'filepath'.
    ex. get_filepaths('zh', 'en') gives
    {'languages': {'source': 'zh', 'target': 'en'},
     'train': {'source': {'filepath': 'iwslt-zh-en/train.tok.zh'},
      'target': {'filepath': 'iwslt-zh-en/train.tok.en'}},
     'dev': {'source': {'filepath': 'iwslt-zh-en/dev.tok.zh'},
      'target': {'filepath': 'iwslt-zh-en/dev.tok.en'}},
     'test': {'source': {'filepath': 'iwslt-zh-en/test.tok.zh'},
      'target': {'filepath': 'iwslt-zh-en/test.tok.en'}}}
    """
    # language names first, so 'languages' stays the first key
    fps = {'languages': {'source': src_lang, 'target': targ_lang}}

    # one {'filepath': ...} entry per split and side
    for split in ('train', 'dev', 'test'):
        fps[split] = {
            side: {'filepath': get_filepath(split, src_lang, targ_lang, side)}
            for side in ('source', 'target')
        }

    return fps

# Generate the Vocabulary of Train data using word2vec

In [20]:
def generate_vocab(src_lang, targ_lang, src_vocab_size, targ_vocab_size):
    """ 
    Input source and target language names and vocab sizes
    Output a nested dictionary vocab containing token2id, id2token, and word2vec for both source and target languages. 
    The first level of keys is language name (e.x 'zh'), and that of nested dictionary are token2id, id2token, and reduced word2vec.

    The vocabulary of each language is built from the training split of the
    (src_lang -> targ_lang) pair; the reduced word2vec keeps only the vectors of tokens
    that ended up in that language's vocabulary.
    """
    
    vocab = {} 
    # Pair each language with its own side of the corpus. The side is decided by position
    # (source vs. target), not by comparing the language code with 'en', so pairs where
    # English is the source (or is not involved at all) read the correct file.
    sides = zip(['source', 'target'], [src_lang, targ_lang], [src_vocab_size, targ_vocab_size])
    for src_or_targ, lang, vocab_size in sides: 
        
        # load train data (decoded explicitly as UTF-8, independent of the locale)
        train_data_fp = get_filepath(split='train', src_lang=src_lang, targ_lang=targ_lang, 
                                     src_or_targ=src_or_targ)
        with open(train_data_fp, encoding='utf-8') as f:
            train_tokens = [line.lower().split() for line in f]
        
        # load word embeddings, generate token2id and id2token 
        word2vec_full = load_word2vec(lang)
        token2id, id2token = build_vocab(train_tokens, vocab_size, word2vec_full) 
        word2vec_reduced = {word: word2vec_full[word] for word in token2id if word in word2vec_full} 
        
        # store token2id, id2token, and word embeddings as a dict in nested dict lang 
        vocab[lang] = {'token2id': token2id, 'id2token': id2token, 'word2vec': word2vec_reduced}
        
    return vocab

### Chinese to English

In [21]:
# Translation direction and vocabulary sizes for the Chinese -> English experiment.
# The source language must be bound to SRC_LANG: the following cells pass SRC_LANG to
# generate_vocab and process_data.
SRC_LANG = 'zh'
TARG_LANG = 'en'
SRC_VOCAB_SIZE = 30000  # Maybe enlarge our vocabulary size
TARG_VOCAB_SIZE = 30000

In [22]:
# Build the vocabulary (token2id, id2token, reduced word2vec) of both languages
# from the training data.
vocab = generate_vocab(SRC_LANG, TARG_LANG, SRC_VOCAB_SIZE, TARG_VOCAB_SIZE)
# Access pattern: vocab['zh']['id2token'], vocab['en']['token2id'], vocab['zh']['word2vec']
# i.e. the first key is the language code, the second selects the lookup structure.

A vocabulary of 30000 is generated from a set of 88421 unique tokens.
11854 vocab tokens are not in word2vec, comprising 11.0% of entire corpus.
A vocabulary of 30000 is generated from a set of 60694 unique tokens.
1792 vocab tokens are not in word2vec, comprising 3.6% of entire corpus.


#### Pickle the zh and en Data

In [94]:
# To (re)create the cache file from a freshly generated vocab:
#     with open("zh_en_vocab.p", "wb") as vocab_file: pickle.dump(vocab, vocab_file)
# NOTE: unpickling can execute arbitrary code; only load a vocab file you created yourself.
# The context manager makes sure the file handle is closed after loading.
with open("zh_en_vocab.p", "rb") as vocab_file:
    vocab_zh_en = pickle.load(vocab_file)

#### Get the tokenized data and its indices in our vocabulary

In [48]:
def process_data(src_lang, targ_lang, src_max_sentence_len, targ_max_sentence_len, vocab, sample_limit=None, filter_long=True): 
    """ 
    Input source and target language names and maximum sentence length, vocab returned from generate_vocab, 
    and an optional sample_limit representing the number of sentences to subset if necessary. 
    Output data as a nested dictionary containing the indices and tokens of train/dev/test data for both source and target languages. 
    Note the hierarchy of data dict is: data[split][lang_type]['tokens' or 'indices'], 
    e.x. to access indices of source training data, use data['train']['source']['indices']

    filter_long: when True, training pairs whose source or target sentence (including the
                 special tokens added by text2tokens) exceeds its maximum length are dropped.
                 Dev and test data are never filtered.
    sample_limit: when given, every split is truncated to its first sample_limit sentences.
    Raises ValueError if the training source and target files differ in number of sentences
    (only checked when filter_long is True).
    """ 
    splits = ('train', 'dev', 'test')
    sides = ('source', 'target')

    # start from the filepath dictionary; tokens and indices are added next to each filepath 
    data = get_filepaths(src_lang, targ_lang)
    
    # loop through each file, read in text and convert to tokens 
    for split in splits: 
        for src_or_targ in sides: 
            data[split][src_or_targ]['tokens'] = text2tokens(data[split][src_or_targ]['filepath'], src_or_targ)
    
    # for training data, keep only pairs with both source and target sentences within max_sent_len 
    if filter_long:                
        src_tokens = data['train']['source']['tokens']
        targ_tokens = data['train']['target']['tokens']
        if len(src_tokens) != len(targ_tokens):
            raise ValueError("training source and target have different numbers of sentences: {} vs {}".format(
                len(src_tokens), len(targ_tokens)))
        original_train_size = len(src_tokens)
        # Filter with plain lists: converting ragged token lists to a numpy array fails on
        # recent numpy versions and would turn equal-length sentences into arrays.
        kept_pairs = [(src, targ) for src, targ in zip(src_tokens, targ_tokens)
                      if len(src) <= src_max_sentence_len and len(targ) <= targ_max_sentence_len]
        data['train']['source']['tokens'] = [src for src, _ in kept_pairs]
        data['train']['target']['tokens'] = [targ for _, targ in kept_pairs]
        new_train_size = len(kept_pairs)
        # report the number removed as a positive count (original minus remaining)
        print("{} data points are removed from training data after filtering out long sentences: {} remain.".format(
            original_train_size - new_train_size, new_train_size))

    # further limit number of samples if applicable 
    if sample_limit is not None: 
        for split in splits: 
            for lang_type in sides: 
                data[split][lang_type]['tokens'] = data[split][lang_type]['tokens'][:sample_limit]

    # convert tokens to indices using the vocabulary of the language on that side 
    for split in splits: 
        for lang_type in sides: 
            data[split][lang_type]['indices'] = tokens2indices(tokens_data=data[split][lang_type]['tokens'],  
                                                        token2id = vocab[data['languages'][lang_type]]['token2id'])

    return data

In [100]:
# Maximum number of tokens kept per source / target sentence.
SRC_MAX_SENTENCE_LEN = 10
TARG_MAX_SENTENCE_LEN = 10
# Read, tokenize and index all splits; filter_long=False keeps long training pairs
# instead of dropping them here.
data = process_data(SRC_LANG, TARG_LANG, SRC_MAX_SENTENCE_LEN, TARG_MAX_SENTENCE_LEN, vocab_zh_en, filter_long=False)
# Access pattern: data[split][side]['indices' | 'tokens'], e.g. data['train']['source']['indices']
# with split in train/dev/test and side in source/target.

In [120]:
# Display the tokenized source sentences of the training split (notebook cell output follows).
data['train']['source']['tokens']

[['深海', '海中', '的', '生命', '大卫', '盖罗', '<EOS>'],
 ['大卫',
  '盖罗',
  '通过',
  '潜水',
  '潜水艇',
  '拍下',
  '的',
  '影片',
  '把',
  '我们',
  '带到',
  '了',
  '地球',
  '最',
  '黑暗',
  '最',
  '险恶',
  '同时',
  '也',
  '最美',
  '美丽',
  '的',
  '生物',
  '栖息',
  '栖息地',
  '这里',
  '是',
  '海洋',
  '深处',
  '的',
  '峡谷',
  '和',
  '火山',
  '山脊',
  '这里',
  '怪诞',
  '适应',
  '适应力',
  '应力',
  '强',
  '而且',
  '数量',
  '惊人',
  '的',
  '生命',
  '<EOS>'],
 ['大卫', '盖罗', '这位', '是', '比尔', '兰格', '我', '是', '大卫', '盖罗', '<EOS>'],
 ['我们',
  '将',
  '用',
  '一些',
  '影片',
  '来讲',
  '讲述',
  '一些',
  '深海',
  '海里',
  '的',
  '故事',
  '<EOS>'],
 ['我们',
  '这',
  '有',
  '不少',
  '精彩',
  '的',
  '泰坦',
  '泰坦尼克',
  '坦尼',
  '尼克',
  '的',
  '影片',
  '可惜',
  '您',
  '今天',
  '看不到',
  '不到',
  '<EOS>'],
 ['泰坦',
  '泰坦尼克',
  '泰坦尼克号',
  '坦尼',
  '尼克',
  '号',
  '是',
  '拿',
  '了',
  '不少',
  '票房',
  '冠军',
  '但',
  '事实',
  '事实上',
  '它',
  '并',
  '不是',
  '关于',
  '于海洋',
  '海洋',
  '的',
  '最',
  '刺激',
  '的',
  '故事',
  '<EOS>'],
 ['原因', '在于', '我们', '一直', '没', '把', '海洋', '当回事', '回事'

### Data Loader

In [104]:
class TranslationDataset(Dataset):   # inherits torch.utils.data.Dataset
    """ 
    PyTorch-readable dataset of parallel (source, target) index sequences,
    usable for the train, validation or test split.
    """
    def __init__(self, src_indices, targ_indices, src_max_sentence_len, targ_max_sentence_len):
        """ 
        Store the source/target index lists together with the maximum length each side
        is truncated to. Both lists must contain the same number of sentences.
        """
        assert (len(src_indices) == len(targ_indices))
        self.src_indices = src_indices
        self.targ_indices = targ_indices
        self.src_max_sentence_len = src_max_sentence_len
        self.targ_max_sentence_len = targ_max_sentence_len

    def __len__(self): 
        return len(self.src_indices)

    def __getitem__(self, key): 
        """ 
        Called for dataset[key]: returns [source indices, target indices, source length,
        target length], where both sequences are truncated to their maximum length and
        the lengths are measured after truncation.
        """
        source = self.src_indices[key][:self.src_max_sentence_len]
        target = self.targ_indices[key][:self.targ_max_sentence_len]
        return [source, target, len(source), len(target)]
    
    

def collate_func(src_max_sentence_len, targ_max_sentence_len, batch): 
    """ 
    Collate function for DataLoader: right-pads every source sequence to
    src_max_sentence_len and every target sequence to targ_max_sentence_len with the
    reserved '<PAD>' index, so all sequences in the batch have the same length.

    Each element of batch is [src_idx, targ_idx, src_len, targ_len].
    Returns [padded source tensor, padded target tensor, source lengths, target lengths];
    the lengths are the original (unpadded) lengths as LongTensors.
    """
    pad_id = RESERVED_TOKENS['<PAD>']

    def _pad_to(seq, seq_len, max_len):
        # append pad_id on the right until the sequence reaches max_len
        return np.pad(array=np.array(seq), pad_width=((0, max_len - seq_len)),
                      mode='constant', constant_values=pad_id)

    # original lengths of the sequences
    src_lens = [datum[2] for datum in batch]
    targ_lens = [datum[3] for datum in batch]

    # padded sequences
    src_idxs = [_pad_to(datum[0], datum[2], src_max_sentence_len) for datum in batch]
    targ_idxs = [_pad_to(datum[1], datum[3], targ_max_sentence_len) for datum in batch]

    src_tensor = torch.from_numpy(np.array(src_idxs))
    targ_tensor = torch.from_numpy(np.array(targ_idxs))
    return [src_tensor, targ_tensor, torch.LongTensor(src_lens), torch.LongTensor(targ_lens)]


def create_dataloaders(processed_data, src_max_sentence_len, targ_max_sentence_len, batch_size): 
    """ 
    Build one DataLoader per split from the dictionary returned by process_data.

    Returns a dictionary 'loaders' keyed by 'train', 'dev' and 'test',
    e.x. loaders['dev'] holds the data loader for dev/validation set. 
    Every loader iterates in the stored order (shuffle=False) and pads its batches to
    the given maximum sentence lengths.
    """
    # the padding lengths are the same for every split, so bind them once
    collate = partial(collate_func, src_max_sentence_len, targ_max_sentence_len)

    loaders = {}
    for split in ('train', 'dev', 'test'):
        split_data = processed_data[split]
        dataset = TranslationDataset(split_data['source']['indices'], split_data['target']['indices'],
                                     src_max_sentence_len, targ_max_sentence_len)
        loaders[split] = DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=collate)
    return loaders

In [105]:
# Number of sentence pairs per batch.
BATCH_SIZE = 32
# Train/dev/test data loaders over the full processed data.
loaders_full = create_dataloaders(data, SRC_MAX_SENTENCE_LEN, TARG_MAX_SENTENCE_LEN, BATCH_SIZE)

### Model Architecture

#### Get Pretrained Word Embedding

In [109]:
def get_pretrained_emb(word2vec, token2id, emb_dim=300): 
    """ 
    Given word2vec model and vocab's token2id, generate pretrained word embeddings for all tokens in vocab. 
    For tokens not in the word2vec model, initialize with random vectors using normal distribution 

    word2vec: mapping from token to its vector (indexing a missing token must raise KeyError)
    token2id: dictionary mapping each vocabulary token to its row index
    emb_dim:  dimensionality of the word vectors (defaults to 300)
    Returns a float32 tensor of shape (len(token2id), emb_dim) on the global `device`.
    """

    pretrained_emb = np.zeros((len(token2id), emb_dim)) 
    for token, idx in token2id.items(): 
        try: 
            pretrained_emb[idx] = word2vec[token]
        except KeyError:
            # Only a missing token falls back to a random vector; any other failure
            # (e.g. a vector of the wrong size) is a real error and must surface.
            pretrained_emb[idx] = np.random.normal(size=(emb_dim,))
    return torch.from_numpy(pretrained_emb.astype(np.float32)).to(device)

#### RNN Encoder Decoder

In [129]:
class EncoderRNN(nn.Module): # previously EncoderSimpleRNN_Test

    """ 
    Bidirectional GRU/LSTM encoder over frozen pretrained embeddings. 
    Concats bidirectional hidden/output. 

    rnn_cell_type:        'gru' or 'lstm' (anything else raises ValueError)
    enc_hidden_dim:       hidden size of each direction
    num_layers:           number of stacked RNN layers
    enc_dropout:          dropout between RNN layers
    src_max_sentence_len: length every output sequence is padded to
    pretrained_word2vec:  2-D float tensor of pretrained embeddings (vocab x embedding dim)
    """ 
    def __init__(self, rnn_cell_type, enc_hidden_dim, num_layers, enc_dropout, src_max_sentence_len, pretrained_word2vec):
        super(EncoderRNN, self).__init__()
        if rnn_cell_type not in ('gru', 'lstm'):
            # Fail at construction instead of leaving self.rnn undefined until forward().
            raise ValueError("rnn_cell_type must be 'gru' or 'lstm', got {!r}".format(rnn_cell_type))
        # Take the embedding size from the pretrained matrix (300 for the wiki vectors).
        self.enc_embed_dim = pretrained_word2vec.size(1)
        self.enc_hidden_dim = enc_hidden_dim 
        self.enc_dropout = enc_dropout 
        self.src_max_sentence_len = src_max_sentence_len
        self.num_layers = num_layers
        self.embedding = nn.Embedding.from_pretrained(pretrained_word2vec, freeze=True) 
        self.rnn_cell_type = rnn_cell_type 
        rnn_class = nn.GRU if self.rnn_cell_type == 'gru' else nn.LSTM
        self.rnn = rnn_class(input_size=self.enc_embed_dim, hidden_size=self.enc_hidden_dim, 
                             num_layers=self.num_layers, dropout=enc_dropout, batch_first=True, bidirectional=True) 

    def forward(self, enc_input, enc_input_lens):
        """ 
        enc_input:      [B, T] padded source indices, with T <= src_max_sentence_len
        enc_input_lens: [B] true (unpadded, non-zero) length of each sequence
        Returns (output, hidden):
          output: [B, src_max_sentence_len, 2 * enc_hidden_dim], zero at padded positions
          hidden: [num_layers, B, 2 * enc_hidden_dim], final forward and backward states
                  of every layer concatenated (for LSTM only the hidden state is
                  returned, not the cell state)
        """
        batch_size = enc_input.size(0)

        # pack_padded_sequence expects the batch sorted by decreasing length; remember the
        # inverse permutation so the original batch order can be restored afterwards
        _, idx_sort = torch.sort(enc_input_lens, dim=0, descending=True)
        _, idx_unsort = torch.sort(idx_sort, dim=0)
        sorted_input = enc_input.index_select(0, idx_sort.to(enc_input.device))
        sorted_lens = enc_input_lens.index_select(0, idx_sort)

        embedded = self.embedding(sorted_input)
        # recent PyTorch versions require the lengths to live on the CPU
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, sorted_lens.cpu(), batch_first=True)

        hidden = self.initHidden(batch_size)
        if self.rnn_cell_type == 'gru': 
            output, hidden = self.rnn(packed, hidden)
        else: 
            memory = self.initHidden(batch_size)
            output, (hidden, _) = self.rnn(packed, (hidden, memory)) 

        # Pad the encoder outputs with zeros. They are hidden activations, not token ids,
        # so the '<PAD>' vocabulary index is not a meaningful fill value here.
        output, _ = torch.nn.utils.rnn.pad_packed_sequence(output, batch_first=True, 
                                                            total_length=self.src_max_sentence_len,
                                                            padding_value=0.0)

        # restore the original batch order
        unsort = idx_unsort.to(output.device)
        output = output.index_select(0, unsort)
        hidden = hidden.index_select(1, unsort)

        # [num_layers * 2, B, H] -> [num_layers, 2, B, H] -> concat both directions -> [num_layers, B, 2H]
        hidden = hidden.view(self.num_layers, 2, batch_size, self.enc_hidden_dim)
        hidden = torch.cat([hidden[:, 0], hidden[:, 1]], dim=2)

        return output, hidden

    def initHidden(self, batch_size):
        """ Zero initial state of shape [2 * num_layers, batch_size, enc_hidden_dim]. """
        # Create the state on the device of the RNN weights so it follows the module
        # wherever it has been moved, instead of depending on a global device.
        rnn_device = next(self.rnn.parameters()).device
        return torch.zeros(2*self.num_layers, batch_size, self.enc_hidden_dim, device=rnn_device)


class DecoderRNN(nn.Module): 

    """ 
    GRU decoder without attention. At every time step the same fixed context, taken
    from the encoder outputs, is concatenated to the embedded input token. 
    Expects encoder outputs whose last dimension holds the forward direction in the
    first enc_hidden_dim features and the backward direction in the remaining ones. 
    """ 

    def __init__(self, dec_hidden_dim, enc_hidden_dim, num_layers, targ_vocab_size, targ_max_sentence_len, pretrained_word2vec):
        super().__init__()
        self.dec_embed_dim = 300
        self.dec_hidden_dim = dec_hidden_dim 
        self.enc_hidden_dim = enc_hidden_dim
        self.targ_vocab_size = targ_vocab_size
        self.targ_max_sentence_len = targ_max_sentence_len
        self.num_layers = num_layers
        self.embedding = nn.Embedding.from_pretrained(pretrained_word2vec, freeze=True) 
        # the GRU input is the embedded token concatenated with the 2 * enc_hidden_dim context
        rnn_input_dim = self.dec_embed_dim + 2 * self.enc_hidden_dim
        self.gru = nn.GRU(rnn_input_dim, self.dec_hidden_dim, num_layers=self.num_layers) 
        self.out = nn.Linear(dec_hidden_dim, self.targ_vocab_size) 
        self.softmax = nn.LogSoftmax(dim=1) 

    def forward(self, dec_input, dec_hidden, enc_outputs): 
        """ 
        Run one decoding step.
        Returns (log-probabilities over the target vocabulary, new decoder hidden state).
        """
        batch_size = dec_input.size(0)
        token_emb = self.embedding(dec_input).view(1, batch_size, -1)

        # context: forward features of the last time step + backward features of the first
        forward_last = enc_outputs[:, -1, :self.enc_hidden_dim]
        backward_first = enc_outputs[:, 0, self.enc_hidden_dim:]
        context = torch.cat([forward_last, backward_first], dim=1).unsqueeze(0)

        rnn_input = torch.cat([token_emb, context], 2) 
        rnn_output, hidden = self.gru(rnn_input, dec_hidden)
        log_probs = self.softmax(self.out(rnn_output[0]))  
        return log_probs, hidden


class EncoderDecoder(nn.Module): 

    """ 
    Encoder-Decoder without attention 

    encoder: module called as encoder(src_idx, src_lens) -> (enc_outputs, enc_hidden);
             must expose src_max_sentence_len
    decoder: module called as decoder(dec_input, dec_hidden, enc_outputs) -> (log_probs, dec_hidden);
             must expose targ_vocab_size and targ_max_sentence_len
    decoder_token2id: accepted for interface compatibility; not used by this class
    """

    def __init__(self, encoder, decoder, decoder_token2id): 
        super(EncoderDecoder, self).__init__() 
        self.encoder = encoder 
        self.decoder = decoder 
        self.targ_vocab_size = self.decoder.targ_vocab_size
        self.src_max_sentence_len = self.encoder.src_max_sentence_len 
        self.targ_max_sentence_len = self.decoder.targ_max_sentence_len 

    def forward(self, src_idx, targ_idx, src_lens, targ_lens, teacher_forcing_ratio): 
        """ 
        Encode the source batch, then decode step by step.

        At every step the next decoder input is the ground-truth token with probability
        teacher_forcing_ratio (one draw per step for the whole batch), otherwise the
        decoder's own greedy prediction.

        Returns (dec_outputs, hypotheses, attn_placeholder):
          dec_outputs:      [targ_max_sentence_len, B, targ_vocab_size] decoder outputs; row 0 stays zero
          hypotheses:       [B, targ_max_sentence_len] greedy predictions (float tensor); column 0 stays zero
          attn_placeholder: [B, targ_max_sentence_len, src_max_sentence_len] zeros, so the
                            return signature matches the attention model
        """
        batch_size = src_idx.size()[0]
        enc_outputs, enc_hidden = self.encoder(src_idx, src_lens)
        dec_hidden = enc_hidden 
        # NOTE(review): these buffers are created on the default device, as before;
        # confirm whether callers expect them on the model's device when running on GPU.
        dec_outputs = torch.zeros(self.targ_max_sentence_len, batch_size, self.targ_vocab_size)
        hypotheses = torch.zeros(self.targ_max_sentence_len, batch_size)
        dec_output = targ_idx[:, 0]  # first decoder input: token at target position 0

        for di in range(1, self.targ_max_sentence_len): 
            dec_output, dec_hidden = self.decoder(dec_output, dec_hidden, enc_outputs)
            dec_outputs[di] = dec_output 
            # The decoder has just predicted position di, so the ground-truth input for the
            # next step is the target token at position di (not di-1, which would feed the
            # previous token a second time).
            teacher_labels = targ_idx[:, di] 
            greedy_labels = dec_output.data.max(1)[1]
            dec_output = teacher_labels if random.random() < teacher_forcing_ratio else greedy_labels 
            hypotheses[di] = greedy_labels

        attn_placeholder = torch.zeros(batch_size, self.targ_max_sentence_len, self.src_max_sentence_len)

        return dec_outputs, hypotheses.transpose(0,1), attn_placeholder 

#### Attention 

In [128]:
class Attention(nn.Module): 
    
    """ 
    Additive attention: scores each encoder time step against the decoder's
    top-layer hidden state with a learned projection and a learned vector v.
    """ 
    
    def __init__(self, enc_hidden_dim, dec_hidden_dim, num_annotations, num_layers): 
        super().__init__() 
        self.dec_hidden_dim = dec_hidden_dim
        # each scored feature vector is [encoder output (2 * enc_hidden_dim); decoder hidden]
        self.input_dim = 2 * enc_hidden_dim + dec_hidden_dim
        self.attn = nn.Linear(self.input_dim, dec_hidden_dim) 
        self.v = nn.Parameter(torch.rand(dec_hidden_dim))
        self.num_layers = num_layers 
        nn.init.normal_(self.v, mean=0, std=1. / math.sqrt(dec_hidden_dim))

    def forward(self, encoder_outputs, last_dec_hidden, src_idx): 
        """ 
        Returns the attention weights over the encoder time steps (softmax over dim 1);
        positions where src_idx equals the '<PAD>' index receive zero weight.
        Assumes encoder_outputs is [B, T, 2 * enc_hidden_dim], last_dec_hidden is
        [num_layers, B, dec_hidden_dim] and src_idx is [B, T] (TODO confirm against callers).
        """
        batch_size, time_steps = encoder_outputs.size()[:2]

        # decoder hidden state of the last layer, repeated for every encoder time step
        top_hidden = last_dec_hidden.transpose(0, 1)[:, -1, :].unsqueeze(1)
        tiled_hidden = top_hidden.repeat(1, time_steps, 1)

        features = torch.cat([encoder_outputs, tiled_hidden], dim=2)
        projected = torch.tanh(self.attn(features)).transpose(1, 2)

        # dot every projected time step with v -> one score per time step
        tiled_v = self.v.repeat(batch_size, 1, 1)
        scores = torch.bmm(tiled_v, projected).squeeze(1)

        # padded source positions must not receive any attention
        pad_mask = src_idx == RESERVED_TOKENS['<PAD>']
        scores.data.masked_fill_(pad_mask, -float('inf'))

        return F.softmax(scores, dim=1)


class DotAttention(nn.Module): 

    """ 
    Multiplicative (dot-product) attention: scores each encoder time step by its
    dot product with the decoder's top-layer hidden state. Has no learned parameters.
    """

    def __init__(self, enc_hidden_dim, dec_hidden_dim, num_annotations, num_layers): 
        super().__init__() 
        self.dec_hidden_dim = dec_hidden_dim
        self.num_layers = num_layers 

    def forward(self, encoder_outputs, last_dec_hidden, src_idx): 
        """ 
        Returns the attention weights over the encoder time steps (softmax over dim 1);
        positions where src_idx equals the '<PAD>' index receive zero weight.
        The feature size of encoder_outputs must equal that of last_dec_hidden for the
        batched matrix product to be defined.
        """
        batch_size = encoder_outputs.size(0)

        # decoder hidden state of the last layer as a [B, 1, H] query
        query = last_dec_hidden.transpose(0, 1)[:, -1, :].view(batch_size, 1, -1)

        # [B, T, H] x [B, H, 1] -> [B, T, 1] -> [B, T]
        scores = torch.bmm(encoder_outputs, query.transpose(1, 2)).squeeze(-1)

        # padded source positions must not receive any attention
        pad_mask = src_idx == RESERVED_TOKENS['<PAD>']
        scores.data.masked_fill_(pad_mask, -float('inf'))

        return F.softmax(scores, dim=1)

#### Encoder Decoder with Attention

In [127]:
class DecoderAttnRNN(nn.Module):

    """ 
    Decoder with attention

    rnn_cell_type:  'gru' or 'lstm' (anything else raises ValueError)
    attention_type: 'additive' or 'multiplicative' (anything else raises ValueError)
    The RNN input at each step is the embedded input token concatenated with the
    attention-weighted sum of the encoder outputs (2 * enc_hidden_dim features).
    """ 

    def __init__(self, rnn_cell_type, dec_hidden_dim, enc_hidden_dim, num_layers, dec_dropout, targ_vocab_size, 
                    src_max_sentence_len, targ_max_sentence_len, attention_type, pretrained_word2vec):

        super(DecoderAttnRNN, self).__init__()
        # Validate the configuration up front: an unknown value used to leave self.attn or
        # self.rnn undefined and only failed later, inside forward().
        if attention_type not in ('additive', 'multiplicative'):
            raise ValueError("attention_type must be 'additive' or 'multiplicative', got {!r}".format(attention_type))
        if rnn_cell_type not in ('gru', 'lstm'):
            raise ValueError("rnn_cell_type must be 'gru' or 'lstm', got {!r}".format(rnn_cell_type))

        self.dec_embed_dim = 300
        self.dec_hidden_dim = dec_hidden_dim 
        self.enc_hidden_dim = enc_hidden_dim
        self.src_max_sentence_len = src_max_sentence_len
        self.targ_max_sentence_len = targ_max_sentence_len
        self.targ_vocab_size = targ_vocab_size
        self.num_layers = num_layers 
        self.rnn_cell_type = rnn_cell_type 
        self.embedding = nn.Embedding.from_pretrained(pretrained_word2vec, freeze=True) 

        # choose attention type 
        attn_class = Attention if attention_type == 'additive' else DotAttention
        self.attn = attn_class(self.enc_hidden_dim, self.dec_hidden_dim, 
                               num_annotations=self.src_max_sentence_len, num_layers=self.num_layers) 

        # choose RNN cell type 
        rnn_class = nn.GRU if self.rnn_cell_type == 'gru' else nn.LSTM
        self.rnn = rnn_class(self.dec_embed_dim + 2 * self.enc_hidden_dim, self.dec_hidden_dim, 
                             num_layers=self.num_layers, dropout=dec_dropout) 

        self.out = nn.Linear(self.dec_hidden_dim, self.targ_vocab_size) 
        self.softmax = nn.LogSoftmax(dim=1) 

    def forward(self, dec_input, dec_hidden, enc_outputs, src_idx):
        """ 
        Run one decoding step.

        src_idx is only passed on to the attention module, which uses it to mask padding.
        Returns (log-probabilities over the target vocabulary [B, V], new hidden state,
        attention weights [B, 1, T]).
        """
        batch_size = dec_input.size()[0]
        embedded = self.embedding(dec_input).view(1, batch_size, -1) # [1, B, E]
        attn_weights = self.attn(encoder_outputs=enc_outputs, last_dec_hidden=dec_hidden, src_idx=src_idx).unsqueeze(1) # [B, 1, T]
        context = attn_weights.bmm(enc_outputs).transpose(0, 1) # [B, 1, T] * [B, T, 2H_enc] -> [B, 1, 2H_enc] -> [1, B, 2H_enc]
        concat = torch.cat([embedded, context], 2) # [1, B, E + 2H_enc] 
        if self.rnn_cell_type == 'gru':
            output, hidden = self.rnn(concat, dec_hidden)
        else:
            # NOTE(review): the LSTM cell state is re-initialised from the hidden state on
            # every step and the new cell state is discarded, because only `hidden` is part
            # of this method's interface. Confirm whether that is intended.
            output, (hidden, _) = self.rnn(concat, (dec_hidden, dec_hidden))
        output = self.softmax(self.out(output[0])) # [B, H] -> [B, V] 

        return output, hidden, attn_weights 


class EncoderDecoderAttn(nn.Module): 

    """ 
    Encoder Decoder with Attention 

    encoder: module called as encoder(src_idx, src_lens) -> (enc_outputs, enc_hidden);
             must expose src_max_sentence_len
    decoder: module called as decoder(dec_input, dec_hidden, enc_outputs, src_idx)
             -> (log_probs, dec_hidden, attn_weights); must expose targ_vocab_size and
             targ_max_sentence_len
    decoder_token2id: accepted for interface compatibility; not used by this class
    """

    def __init__(self, encoder, decoder, decoder_token2id): 
        super(EncoderDecoderAttn, self).__init__() 
        self.encoder = encoder 
        self.decoder = decoder 
        self.targ_vocab_size = self.decoder.targ_vocab_size
        self.src_max_sentence_len = self.encoder.src_max_sentence_len 
        self.targ_max_sentence_len = self.decoder.targ_max_sentence_len 

    def forward(self, src_idx, targ_idx, src_lens, targ_lens, teacher_forcing_ratio): 
        """ 
        Encode the source batch, then decode step by step with attention.

        At every step the next decoder input is the ground-truth token with probability
        teacher_forcing_ratio (one draw per step for the whole batch), otherwise the
        decoder's own greedy prediction.

        Returns (dec_outputs, hypotheses, attn_weights_all):
          dec_outputs:      [targ_max_sentence_len, B, targ_vocab_size] decoder outputs; row 0 stays zero
          hypotheses:       [B, targ_max_sentence_len] greedy predictions (float tensor); column 0 stays zero
          attn_weights_all: [B, targ_max_sentence_len, src_max_sentence_len] attention
                            weights per decoding step; step 0 stays zero
        """
        batch_size = src_idx.size()[0]
        enc_outputs, enc_hidden = self.encoder(src_idx, src_lens)
        dec_hidden = enc_hidden 
        # NOTE(review): these buffers are created on the default device, as before;
        # confirm whether callers expect them on the model's device when running on GPU.
        dec_outputs = torch.zeros(self.targ_max_sentence_len, batch_size, self.targ_vocab_size)
        hypotheses = torch.zeros(self.targ_max_sentence_len, batch_size)
        # Attention weights range over SOURCE positions, so the last dimension is the
        # source length; sizing it with the target length breaks when the two differ.
        attn_weights_all = torch.zeros(self.targ_max_sentence_len, batch_size, self.src_max_sentence_len)
        dec_output = targ_idx[:, 0]  # first decoder input: token at target position 0

        for di in range(1, self.targ_max_sentence_len): 
            dec_output, dec_hidden, attn_weights = self.decoder(dec_output, dec_hidden, enc_outputs, src_idx) # src_idx for masking 
            dec_outputs[di] = dec_output 
            # The decoder has just predicted position di, so the ground-truth input for the
            # next step is the target token at position di (not di-1, which would feed the
            # previous token a second time).
            teacher_labels = targ_idx[:, di] 
            greedy_labels = dec_output.data.max(1)[1]
            dec_output = teacher_labels if random.random() < teacher_forcing_ratio else greedy_labels 
            hypotheses[di] = greedy_labels
            attn_weights_all[di] = attn_weights.squeeze(1)

        return dec_outputs, hypotheses.transpose(0,1), attn_weights_all.transpose(0,1)


class EncoderCNN(nn.Module):
    """
    CNN Encoder

    Two independent stacks of two 1-D convolutions (kernel size 3, padding 1, so the
    sequence length is preserved) over frozen pretrained embeddings. Stack "a" yields the
    per-position outputs; stack "b" is reduced to the tensor returned as the hidden state.

    pretrained_word2vec:  2-D float tensor of pretrained embeddings (300 features per token)
    src_max_sentence_len: stored on the module; not used inside this class
    enc_hidden_dim:       number of output channels of every convolution
    dropout:              dropout probability applied to the embeddings while training
    """
    def __init__(self, pretrained_word2vec, src_max_sentence_len=10, enc_hidden_dim=512, dropout=0.1):
        super(EncoderCNN, self).__init__()
        self.enc_embed_dim = 300
        self.enc_hidden_dim = enc_hidden_dim
        self.embedding = nn.Embedding.from_pretrained(pretrained_word2vec, freeze=True).to(device)
        self.conv1_a = nn.Conv1d(300, enc_hidden_dim, kernel_size=3, padding=1).to(device)
        self.conv2_a = nn.Conv1d(enc_hidden_dim, enc_hidden_dim, kernel_size=3, padding=1).to(device)
        self.conv1_b = nn.Conv1d(300, enc_hidden_dim, kernel_size=3, padding=1).to(device)
        self.conv2_b = nn.Conv1d(enc_hidden_dim, enc_hidden_dim, kernel_size=3, padding=1).to(device)
        self.dropout_val = dropout
        self.src_max_sentence_len = src_max_sentence_len

    # forward must be a method of the class; when it was nested inside __init__ the module
    # had no forward of its own and could not be called.
    def forward(self, enc_input, enc_input_lens):
        """
        enc_input:      [B, T] source indices
        enc_input_lens: accepted for interface compatibility with EncoderRNN; not used
        Returns (outputs, hidden):
          outputs: [B, T, enc_hidden_dim] activations of stack "a"
          hidden:  [2, B, enc_hidden_dim] reduction of stack "b" (see the note below)
        """
        enc_input = enc_input.to(device)
        batch_size = enc_input.size()[0]
        embedded = self.embedding(enc_input)
        # apply dropout only in training mode so evaluation is deterministic
        embedded = F.dropout(embedded, self.dropout_val, training=self.training)

        # Conv1d expects channels first: [B, T, E] -> [B, E, T]
        conv_input = embedded.transpose(1, 2)

        # 1st net (leaky_relu is element-wise, so no reshaping is needed around it)
        hidden_1_a = F.leaky_relu(self.conv1_a(conv_input))
        hidden_2_a = F.leaky_relu(self.conv2_a(hidden_1_a)).transpose(1, 2).contiguous()  # [B, T, H]

        # 2nd net: uses its own convolutions (conv1_b / conv2_b), not those of the 1st net
        hidden_1_b = F.leaky_relu(self.conv1_b(conv_input))
        hidden_2_b = F.leaky_relu(self.conv2_b(hidden_1_b)).transpose(1, 2).contiguous()  # [B, T, H]

        # NOTE(review): the reduction below is kept exactly as before. It reinterprets the
        # [B, T, H] memory as [T/2, 2, B, H] (so T must be even), which does not keep the
        # batch and time axes separate, and then adds the first two slices. Confirm the
        # intended way to derive a [2, B, H] hidden state from the CNN features.
        hidden_2_b = hidden_2_b.view(-1, 2, batch_size, self.enc_hidden_dim)
        hidden_2_b = hidden_2_b.transpose(0, 1)
        hidden_2_b = hidden_2_b[:, 0, :, :] + hidden_2_b[:, 1, :, :]

        return hidden_2_a, hidden_2_b.view(2, batch_size, -1)