# CS 287, Homework 3: Neural Machine Translation

In [1]:
import torch
from torch.nn.utils import clip_grad_norm_
torch.__version__
from common import *
## Setup
import torch.nn.functional as F

#!pip install --upgrade pip
#!pip install -q numpy

#!pip install -q torch torchtext spacy opt_einsum
#!pip install -qU git+https://github.com/harvardnlp/namedtensor
#!python -m spacy download en
#!python -m spacy download de

# Torch
import torch.nn as nn
import torch
# Text text processing library and methods for pretrained word embeddings
from torchtext import data, datasets
# Named Tensor wrappers
from namedtensor import ntorch, NamedTensor
from namedtensor.text import NamedField
import numpy as np
%reload_ext autoreload
%autoreload 2

In [2]:
# split raw data into tokens
import spacy
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

def tokenize_de(text):
    return [tok.text for tok in spacy_de.tokenizer(text)]

def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

# add beginning-of-sentence and end-of-sentence tokens to target
BOS_WORD = '<s>'
EOS_WORD = '</s>'
DE = NamedField(names=('srcSeqlen',), tokenize=tokenize_de)
EN = NamedField(names=('trgSeqlen',), tokenize=tokenize_en,
                init_token = BOS_WORD, eos_token = EOS_WORD) # only target needs BOS/EOS

# download dataset of 200K pairs of sentences
# start with MAXLEN = 20
MAX_LEN = 20
train, val, test = datasets.IWSLT.splits(exts=('.de', '.en'), fields=(DE, EN), 
                                         filter_pred=lambda x: len(vars(x)['src']) <= MAX_LEN and 
                                         len(vars(x)['trg']) <= MAX_LEN)
print(train.fields)
print(len(train))
print(vars(train[0]))

# WHAT DOES THIS DO?
'''src = open("valid.src", "w")
trg = open("valid.trg", "w")
for example in val:
    print(" ".join(example.src), file=src)
    print(" ".join(example.trg), file=trg)
src.close()
trg.close()'''

# build vocab, convert words to indices
MIN_FREQ = 5
DE.build_vocab(train.src, min_freq=MIN_FREQ)
EN.build_vocab(train.trg, min_freq=MIN_FREQ)
print(DE.vocab.freqs.most_common(10))
print("Size of German vocab", len(DE.vocab))
print(EN.vocab.freqs.most_common(10))
print("Size of English vocab", len(EN.vocab))
print(EN.vocab.stoi["<s>"], EN.vocab.stoi["</s>"])

print(EN.vocab.stoi["<pad>"], EN.vocab.stoi["<unk>"])

{'src': <namedtensor.text.torch_text.NamedField object at 0x7fbcae599c88>, 'trg': <namedtensor.text.torch_text.NamedField object at 0x7fbca7448780>}
119076
{'src': ['David', 'Gallo', ':', 'Das', 'ist', 'Bill', 'Lange', '.', 'Ich', 'bin', 'Dave', 'Gallo', '.'], 'trg': ['David', 'Gallo', ':', 'This', 'is', 'Bill', 'Lange', '.', 'I', "'m", 'Dave', 'Gallo', '.']}
[('.', 113253), (',', 67237), ('ist', 24189), ('die', 23778), ('das', 17102), ('der', 15727), ('und', 15622), ('Sie', 15085), ('es', 13197), ('ich', 12946)]
Size of German vocab 13353
[('.', 113433), (',', 59512), ('the', 46029), ('to', 29177), ('a', 27548), ('of', 26794), ('I', 24887), ('is', 21775), ("'s", 20630), ('that', 19814)]
Size of English vocab 11560
2 3
1 0


In [3]:
# split data into batches
BATCH_SIZE = 32
device = torch.device('cuda:0')
train_iter, val_iter = data.BucketIterator.splits((train, val), batch_size=BATCH_SIZE, device=device,
                                                  repeat=False, sort_key=lambda x: len(x.src))

## Sequence to Sequence Learning with Neural Networks

- English to French translation, $p \left( y_1, \dots, y_{T'} \ | \ x_1, \dots, x_T \right) = \prod_{t = 1}^{T'} p \left( y_t \ | \ v, y_1, \dots, y_{t-1} \right)$
- Each sentence ends in '<EOS\>', out-of-vocab words denoted '<UNK\>'
- Model specs: 
    * Input vocabulary of 160,000 and output vocabulary of 80,000
    * Deep LSTM to map (encode) input sequence to fixed-len vector
    * Another deep LSTM to translate (decode) fixed-len vector to output sequence
    * 4 layers per LSTM, 1000 cells per layer, 1000-dimensional word embeddings, softmax over 80,000 words
    * Reversing order of words in source (but not target) improved performance
        * Each word in the source is far from its corresponding word in the target (large minimal time lag); reversing the source reduces the minimal time lag, thereby allowing backprop to establish communication between source and target more easily
- Training specs:
    * Initialize all LSTM params $\sim Unif[-0.08,0.08]$
    * SGD w/o momentum, lr = 0.7
        * After 5 epochs, halve the lr every half-epoch
        * Train for 7.5 epochs
    * Batch size = 128; divide gradient by batch size (denoted $g$)
    * Hard constraint gradient norm; if $s = ||g||_2 > 5$, set $s = 5$
    * Make sure all sentences within a minibatch are roughly the same length
- Objective: $max \frac{1}{|S|} \sum_{(T,S) \in \mathcal{S}} log \ p(T \ | \ S)$, where $\mathcal{S}$ is the training set
- Prediction: $\hat{T} = argmax \ p(T \ | \ S)$ via beam search, where beam size $B \in {1,2}$

In [58]:
context_size = 500
num_layers = 2
seq2context = SequenceModel(len(DE.vocab),context_size,num_layers=2)
context2trg = RNNet(input_size=len(EN.vocab),hidden_size=context_size,num_layers=2,weight_tie=True)

seq2context,context2trg = seq2context.cuda(),context2trg.cuda()
seq2context_optimizer = torch.optim.SGD(seq2context.parameters(), lr=1)
context2trg_optimizer = torch.optim.SGD(context2trg.parameters(), lr=1)

lr_lambda = lambda t: 1 / (1.2**max(t-6,0))
scheduler_c2t = torch.optim.lr_scheduler.LambdaLR(context2trg_optimizer, lr_lambda, last_epoch=-1)
scheduler_s2c = torch.optim.lr_scheduler.LambdaLR(seq2context_optimizer, lr_lambda, last_epoch=-1)



In [122]:
seq2context.train()
context2trg.train()
context_size = seq2context.context_size

In [123]:
for ix,batch in enumerate(train_iter):
    seq2context_optimizer.zero_grad()
    context2trg_optimizer.zero_grad()
        
    src = batch.src.values.transpose(0,1)
    src = reverse_sequence(src)
    trg = batch.trg.values.transpose(0,1)
    context, hidden_s2c = seq2context(src)
    output, hidden_lm = context2trg(trg[:,:-1],hidden_s2c)
    
    

In [140]:
for e in range(5):
    training_loop(e,train_iter,seq2context,context2trg,seq2context_optimizer,context2trg_optimizer,BATCH_SIZE)
    validation_loop(e,val_iter,seq2context,context2trg,scheduler_s2c,scheduler_c2t,BATCH_SIZE)

Epoch: 0, Batch: 0, Loss: 30.344459533691406, Variance: 166542.078125
Epoch: 0, Batch: 100, Loss: 38.58367919921875, Variance: 152495.078125
Epoch: 0, Batch: 200, Loss: 32.62057876586914, Variance: 70155.3125
Epoch: 0, Batch: 300, Loss: 33.22350311279297, Variance: 77269.9453125
Epoch: 0, Batch: 400, Loss: 30.61174964904785, Variance: 58268.31640625
Epoch: 0, Batch: 500, Loss: 30.48539161682129, Variance: 179236.0625
Epoch: 0, Batch: 600, Loss: 30.524948120117188, Variance: 20881.13671875
Epoch: 0, Batch: 700, Loss: 36.902435302734375, Variance: 36496.84765625
Epoch: 0, Batch: 800, Loss: 31.251785278320312, Variance: 22397.3046875
Epoch: 0, Batch: 900, Loss: 36.22175598144531, Variance: 49594.46484375
Epoch: 0, Batch: 1000, Loss: 38.293609619140625, Variance: 32203.990234375
Epoch: 0, Batch: 1100, Loss: 32.65779113769531, Variance: 28263.1875
Epoch: 0, Batch: 1200, Loss: 30.370241165161133, Variance: 76748.1875
Epoch: 0, Batch: 1300, Loss: 28.27395248413086, Variance: 140897.828125
Epo

Epoch: 2, Batch: 3700, Loss: 30.963459014892578, Variance: 155103.625
Epoch: 2, Validation loss: 30.977779388427734, Validation ppl: 13.748322486877441
Epoch: 3, Batch: 0, Loss: 29.794776916503906, Variance: 126414.0390625
Epoch: 3, Batch: 100, Loss: 34.436279296875, Variance: 56234.109375
Epoch: 3, Batch: 200, Loss: 28.13518714904785, Variance: 163698.875
Epoch: 3, Batch: 300, Loss: 30.396564483642578, Variance: 38462.11328125
Epoch: 3, Batch: 400, Loss: 33.30223083496094, Variance: 92151.859375
Epoch: 3, Batch: 500, Loss: 35.09160614013672, Variance: 58381.34375
Epoch: 3, Batch: 600, Loss: 39.76861572265625, Variance: 91850.5
Epoch: 3, Batch: 700, Loss: 32.66163635253906, Variance: 18709.67578125
Epoch: 3, Batch: 800, Loss: 32.45072937011719, Variance: 53173.19140625
Epoch: 3, Batch: 900, Loss: 37.27729415893555, Variance: 47641.8828125
Epoch: 3, Batch: 1000, Loss: 29.915870666503906, Variance: 50089.875
Epoch: 3, Batch: 1100, Loss: 30.188358306884766, Variance: 141873.59375
Epoch: 3

In [9]:
for ix,batch in enumerate(train_iter):
    src = batch.src.values.transpose(0,1)
    src = reverse_sequence(src)
    trg = batch.trg.values.transpose(0,1)
    break

h0=None
context, hidden_s2c = seq2context(src,h0)
output, hidden_lm = context2trg(trg[:,:-1],hidden_s2c)


print([EN.vocab.itos[i] for i in torch.argmax(lsm(output),2)[3,:]])
print([EN.vocab.itos[i] for i in trg[3,:]])


['transfer', 'effective', 'formula', 'collide', 'open', 'breathes', 'demanding', 'Also', 'vanished', 'party', 'condom', 'Things', 'daf-2', 'formula', 'party', 'James', 'party', 'outsiders', 'Bonnet', 'party', 'averse']
['<s>', 'We', 'also', 'use', '[', 'an', ']', 'electronic', 'medical', 'record', 'system', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']


In [43]:
#word_input, last_context, last_hidden, encoder_outputs

encoder_outputs, encoder_hidden = seq2context(src)
encoder_outputs = encoder_outputs[0,:,:]
word_input = torch.tensor([DE.vocab.stoi['<s>']], device='cuda')
last_context = torch.zeros([1, context_size], device='cuda') # 1 x 500
last_hidden = encoder_hidden
last_hidden = tuple([last_hidden[0][:,0,:].view(2,1,500).contiguous(),last_hidden[1][:,0,:].view(2,1,500).contiguous()])
word_embedded = context2trg.emb(word_input).view(1, 1, -1) # 1 x 1 x 500
rnn_input = torch.cat((word_embedded, last_context.unsqueeze(0)), 2) # 1 x 1 x 1000
rnn_output, hidden = attn_context2trg.rnn(rnn_input, last_hidden)

In [28]:
def attn_dot(rnn_output,encoder_outputs):
    return F.softmax(torch.matmul(rnn_output.squeeze(0),encoder_outputs.transpose(0,1)).squeeze(),dim=0).unsqueeze(0).unsqueeze(0)

In [29]:
attn_weights = attn_dot(rnn_output,encoder_outputs)
context = attn_weights.bmm(encoder_outputs.unsqueeze(1).transpose(0, 1))
rnn_output = rnn_output.squeeze(0)
context = context.squeeze(1)
output = F.log_softmax(attn_context2trg.lnr(torch.cat((rnn_output, context), 1)),0)

In [104]:
encoder_outputs, encoder_hidden = seq2context(src)
#encoder_outputs = encoder_outputs[0,:,:]
word_input = torch.tensor([DE.vocab.stoi['<s>']], device='cuda')
last_context = torch.zeros(1, context_size, device='cuda') # 1 x 500
last_hidden = encoder_hidden
#last_hidden = tuple([last_hidden[0][:,0,:].view(2,1,500).contiguous(),last_hidden[1][:,0,:].view(2,1,500).contiguous()])

In [38]:
decoder_output, decoder_context, decoder_hidden, decoder_attention = attn_context2trg(word_input, last_context, last_hidden, encoder_outputs)

In [48]:
 decoder_context.shape #decoder_hidden.shape

torch.Size([1, 500])

In [52]:
word_input = trg[1,1]
decoder_output, decoder_context, decoder_hidden, decoder_attention = attn_context2trg(word_input, decoder_context, decoder_hidden, encoder_outputs)


torch.Size([1, 1, 1000])


In [53]:
def compare_sentence(trg,ix=32):
    outputs = []
    for j in range(trg.shape[1] - 1):
        

torch.Size([1, 1, 1000])


Epoch: 2, Batch: 0, Loss: 161.0377197265625
['Solar', 'the', 'we', 'we', 'slow', 'for', 'body', 'like', 'we', 'for', '<pad>', 'get', 'structures', 'the', 'incredible', '<pad>', '<pad>', 'times', '<pad>', 'the', 'off']
['<s>', 'It', 'looks', 'like', 'it', "'s", 'kind', 'of', 'been', 'there', ',', 'and', 'then', 'crashed', 'all', 'these', 'simpler', 'forms', 'into', 'it', '.', '</s>']
Epoch: 2, Batch: 100, Loss: 195.60531616210938
['To', 'I', 'be', 'comes', '?', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['<s>', 'Just', 'does', "n't", 'make', 'sense', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
Epoch: 2, Batch: 200, Loss: 258.91851806640625
['It', 'car', 'will', 'things', 'of', '<pad>', 'about', '<pad>', 'of', '<pad>', '?', 'book', '.', 'U', '.', 'enhance', '<pad>', '<pad>', '.', '<pad>', '.']
['

KeyboardInterrupt: 

In [80]:
class attn_RNNet(torch.nn.Module):

    def __init__(self, input_size, hidden_size, num_layers, dropout=0.5, weight_tie=False, weight_init=0.05):
        super(attn_RNNet, self).__init__()
        self.emb = torch.nn.Sequential(torch.nn.Embedding(input_size, hidden_size), torch.nn.Dropout(dropout))
        self.rnn = torch.nn.LSTM(input_size=2*hidden_size, hidden_size=hidden_size, num_layers=num_layers, bias=True, batch_first=True, dropout=dropout)
        self.lnr = torch.nn.Sequential(torch.nn.Dropout(dropout), torch.nn.Linear(2*hidden_size, input_size))
    
        for f in self.parameters():
            torch.nn.init.uniform_(f, a=-weight_init, b=weight_init)
      
    def attn_dot(self,rnn_output,encoder_outputs):
        return F.softmax(torch.matmul(rnn_output.squeeze(0),encoder_outputs.transpose(0,1)).squeeze(),dim=0).unsqueeze(0).unsqueeze(0)

    def forward(self, word_input, last_context, last_hidden, encoder_outputs):
        word_embedded = self.emb(word_input).view(1, 1, -1) # batch x 1 x hidden
        rnn_input = torch.cat((word_embedded, last_context.unsqueeze(0)), 2) # batch x 1 x hiddenx2
        rnn_output, hidden = self.rnn(rnn_input, last_hidden) # batch x 1 x hidden
        attn_weights = self.attn_dot(rnn_output,encoder_outputs) # encoder_outputs: batch x src_seqlen x hidden # batch x src_seqlen x 1
        context = attn_weights.bmm(encoder_outputs.unsqueeze(1).transpose(0, 1)) # encoder_outputs: 1 x batch x src_seqlen x hidden # batch x hidden
        rnn_output = rnn_output.squeeze(0)
        context = context.squeeze(1)
        output = self.lnr(torch.cat((rnn_output, context), 1)) # cat: batch x hiddenx2 # batch x vocab
        # prediction, last_context, last_hidden, weights for vis
        return output, context, hidden, attn_weights 

In [5]:
class attn_RNNet_batched(torch.nn.Module):

    def __init__(self, input_size, hidden_size, num_layers, dropout=0.5, weight_init=0.05):
        super(attn_RNNet_batched, self).__init__()
        self.emb = torch.nn.Sequential(torch.nn.Embedding(input_size, hidden_size), torch.nn.Dropout(dropout))
        self.rnn = torch.nn.LSTM(input_size=2*hidden_size, hidden_size=hidden_size, num_layers=num_layers, bias=True, batch_first=True, dropout=dropout)
        self.lnr = torch.nn.Sequential(torch.nn.Dropout(dropout), torch.nn.Linear(2*hidden_size, input_size))
    
        for f in self.parameters():
            torch.nn.init.uniform_(f, a=-weight_init, b=weight_init)
      
    def attn_dot(self,rnn_output,encoder_outputs):
        return F.softmax(torch.matmul(rnn_output.squeeze(0),encoder_outputs.transpose(0,1)).squeeze(),dim=0).unsqueeze(0).unsqueeze(0)

    def forward(self, word_input, last_context, last_hidden, encoder_outputs):
        word_embedded = self.emb(word_input)
        rnn_input = torch.cat([word_embedded, last_context], 1).unsqueeze(1) # batch x 1 x hiddenx2
        rnn_output, hidden = self.rnn(rnn_input, last_hidden)
        attn_weights = rnn_output.bmm(encoder_outputs.transpose(1,2))# batch x src_seqlen x 1
        context = attn_weights.bmm(encoder_outputs)
        rnn_output = rnn_output.squeeze(1)
        context = context.squeeze(1)
        output = self.lnr(torch.cat((rnn_output, context), 1))
        # prediction, last_context, last_hidden, weights for vis
        return output, context, hidden, attn_weights 

In [6]:
context_size = 500
num_layers = 2
attn_context2trg = attn_RNNet_batched(input_size=len(EN.vocab),hidden_size=context_size,num_layers=num_layers)
attn_context2trg = attn_context2trg.cuda()
attn_context2trg_optimizer = torch.optim.Adam(attn_context2trg.parameters(), lr=1e-3)

seq2context = SequenceModel(len(DE.vocab),context_size,num_layers=num_layers)
seq2context_optimizer = torch.optim.Adam(seq2context.parameters(), lr=1e-3)
seq2context = seq2context.cuda()

In [7]:
criterion_train = nn.CrossEntropyLoss(reduction='sum')
lsm2 = nn.LogSoftmax(dim=1)
def attn_training_loop(e=0):
    for ix,batch in enumerate(train_iter):
        src = batch.src.values.transpose(0,1)
        src = reverse_sequence(src)
        trg = batch.trg.values.transpose(0,1)
        if trg.shape[0] == BATCH_SIZE:
        
            seq2context_optimizer.zero_grad()
            attn_context2trg_optimizer.zero_grad()
        
            encoder_outputs, encoder_hidden = seq2context(src)
            loss = 0
            decoder_context = torch.zeros(BATCH_SIZE, context_size, device='cuda') # 32 x 500
            decoder_hidden = encoder_hidden
            sentence = []
            for j in range(trg.shape[1] - 1):
                word_input = trg[:,j]
                decoder_output, decoder_context, decoder_hidden, decoder_attention = attn_context2trg(word_input, decoder_context, decoder_hidden, encoder_outputs)
                #print(decoder_output.shape, trg[i,j+1].view(-1).shape)
                loss += criterion_train(decoder_output, trg[:,j+1])
                
                if np.mod(ix,100) == 0:
                    sentence.extend([torch.argmax(decoder_output[0,:],dim=0)])
                
            loss.backward()
            seq2context_optimizer.step()
            attn_context2trg_optimizer.step()
        
            if np.mod(ix,100) == 0:
                print('Epoch: {}, Batch: {}, Loss: {}'.format(e, ix, loss.cpu().detach()/BATCH_SIZE))
                print([EN.vocab.itos[i] for i in sentence])
                print([EN.vocab.itos[i] for i in trg[0,:]])

In [10]:
for e in range(2,8):
    attn_training_loop(e)

Epoch: 2, Batch: 0, Loss: 42.48237228393555
['Child', ':', '<unk>', '!', 'Look', 'at', 'the', '!', '!', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['<s>', 'Child', ':', 'Whoa', '!', 'Look', 'at', 'that', 'snake', '!', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
Epoch: 2, Batch: 100, Loss: 30.164155960083008
['What', 'do', 'we', 'do', 'with', 'the', '<unk>', ',', ',', 'do', 'longer', 'have', 'on', 'lot', 'on', 'this', 'planet', '?', '</s>', '<pad>', '<pad>']
['<s>', 'What', 'do', 'we', 'do', 'with', 'displaced', 'fellow', '<unk>', 'who', 'no', 'longer', 'have', 'a', 'home', 'on', 'the', 'planet', '?', '</s>', '<pad>', '<pad>']
Epoch: 2, Batch: 200, Loss: 30.560657501220703
['Can', 'imagine', 'imagine', 'that', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['

Epoch: 2, Batch: 2200, Loss: 35.744720458984375
['We', 'heard', 'yesterday', 'about', 'the', 'importance', 'of', 'new', 'materials', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['<s>', 'We', 'heard', 'yesterday', 'about', 'the', 'importance', 'of', 'new', 'materials', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
Epoch: 2, Batch: 2300, Loss: 34.328792572021484
['What', 'what', 'we', 'what', 'we', 'found', 'was', 'completely', 'completely', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['<s>', 'And', 'here', ',', 'what', 'we', 'found', 'was', 'completely', 'unexpected', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
Epoch: 2, Batch: 2400, Loss: 35.16185760498047
['But', 'there', "'s", 'a', 'of', 'things', 'I', 'do', 'been', 'to', '.', '</s>', '<pad>', '<pad>', '

Epoch: 3, Batch: 500, Loss: 28.589311599731445
['How', 'do', 'you', 'be', 'that', 'that', 'to', 'to', 'be', 'anything', '?', 'you', 'is', 'you', '?', '</s>', '<pad>', '<pad>', '<pad>', '<pad>']
['<s>', 'How', 'should', 'one', 'live', 'in', 'order', 'not', 'to', 'feel', 'regret', 'when', 'one', 'is', 'dying', '?', '</s>', '<pad>', '<pad>', '<pad>', '<pad>']
Epoch: 3, Batch: 600, Loss: 28.84320640563965
['So', 'that', 'is', 'what', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['<s>', 'So', 'this', 'is', 'true', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
Epoch: 3, Batch: 700, Loss: 40.946128845214844
['So', 'I', 'proposed', 'to', 'to', 'it', '.', 'Everybody', 'can', 'write', '.', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['<s>', 'So', 'I', 'c

Epoch: 3, Batch: 2700, Loss: 28.75575828552246
['It', "'s", 'us', 'see', 'see', 'differently', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['<s>', 'It', 'allows', 'us', 'to', 'see', 'different', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
Epoch: 3, Batch: 2800, Loss: 29.434473037719727
['We', 'have', 'to', 'have', 'a', 'new', 'paradigm', 'of', '<unk>', 'and', 'respect', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['<s>', 'We', 'need', 'to', 'get', 'the', 'new', 'paradigm', 'of', '<unk>', 'and', 'respect', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
Epoch: 3, Batch: 2900, Loss: 35.623172760009766
['We', "'re", 'building', 'our', 'way', 'into', 'a', 'new', 'economy', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad

Epoch: 4, Batch: 1100, Loss: 28.795141220092773
['They', "'re", 'all', 'members', 'tribes', 'in', 'tribes', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['<s>', 'You', "'re", 'all', 'a', 'member', 'of', 'tribes', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
Epoch: 4, Batch: 1200, Loss: 35.05225372314453
['Yet', ',', 'both', 'are', 'states', 'have', 'both', '<unk>', '<unk>', '.', '.', 'that', 'is', 'the', '<unk>', '.', 'the', '.', '.', '</s>', '<pad>']
['<s>', 'However', ',', 'these', 'two', 'countries', 'have', 'the', 'identical', '<unk>', '<unk>', ',', 'which', 'is', 'a', 'measure', 'of', 'income', 'equality', '.', '</s>', '<pad>']
Epoch: 4, Batch: 1300, Loss: 28.294179916381836
['They', "'s", '<unk>', '.', 'They', 'could', "n't", 'understand', 'why', 'I', 'was', 'so', 'miserable', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['<s>', 'She', 'was', '

Epoch: 4, Batch: 3300, Loss: 29.91533660888672
['That', 'can', 'be', 'able', '.', '.', '.', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['<s>', 'This', 'can', 'be', 'integrated', 'with', 'decisions', 'support', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
Epoch: 4, Batch: 3400, Loss: 31.14093017578125
['<unk>', 'fact', '<unk>', ',', '<unk>', '<unk>', ',', 'the', '<unk>', 'as', 'the', '<unk>', 'of', 'hope', 'about', 'the', '.', '</s>', '</s>', '<pad>', '<pad>']
['<s>', 'In', 'the', 'words', 'of', 'Samuel', 'Johnson', ',', '"', '<unk>', 'is', 'the', 'triumph', 'of', 'hope', 'over', 'experience', '.', '"', '</s>', '<pad>', '<pad>']
Epoch: 4, Batch: 3500, Loss: 29.63248062133789
['CA', ':', 'So', 'on', 'the', 'road', ',', 'are', '<unk>', '<unk>', '?', '</s>', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['<s>', 'CA', ':', 

Epoch: 5, Batch: 1700, Loss: 30.445003509521484
['So', 'again', ',', 'I', "'m", 'a', '<unk>', 'in', 'the', '<unk>', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['<s>', 'So', 'again', ',', 'I', 'am', 'a', 'believer', 'in', 'the', 'expressive', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
Epoch: 5, Batch: 1800, Loss: 31.858572006225586
['Imagine', 'imagine', ',', 'white', ',', 'white', 'paper', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['<s>', 'So', 'imagine', 'white', 'guys', 'pasting', 'white', 'papers', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
Epoch: 5, Batch: 1900, Loss: 25.317251205444336
['I', 'worked', 'working', 'in', 'a', 'London', 'London', 'London', '.', 'a', '<unk>', 'office', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>

Epoch: 6, Batch: 0, Loss: 33.66862869262695
['They', 'also', 'to', 'a', 'be', '<unk>', '<unk>', '<unk>', 'in', 'a', '.', '.', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['<s>', 'They', 'go', 'on', 'to', 'jam', 'innocent', 'GPS', '<unk>', 'for', 'miles', 'around', 'you', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
Epoch: 6, Batch: 100, Loss: 26.362197875976562
['In', 'China', ',', 'we', "'s", 'not', 'women', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['<s>', 'In', 'China', ',', 'it', "'s", 'all', 'women', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
Epoch: 6, Batch: 200, Loss: 30.999656677246094
['We', 'we', 'we', "'ve", 'heard', 'the', 'the', '<unk>', 'of', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['<s>', 'And', 'so', 'we', 'ju

Epoch: 6, Batch: 2200, Loss: 26.453594207763672
['And', 'were', 'been', '<unk>', '<unk>', '<unk>', 'strong', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['<s>', 'They', 'had', 'this', 'old', 'brass', 'and', 'copper', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
Epoch: 6, Batch: 2300, Loss: 32.25895690917969
['The', 'the', 'world', 'is', '<unk>', 'of', '<unk>', '-', 'ray', '.', '<unk>', '.', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['<s>', 'And', 'the', 'world', 'is', 'full', 'of', 'non', '-', 'zero', '-', 'sum', 'dynamics', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
Epoch: 6, Batch: 2400, Loss: 24.92096710205078
['Learning', 'will', 'be', 'the', 'production', 'of', 'new', 'new', 'neurons', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['<s

Epoch: 7, Batch: 500, Loss: 30.639307022094727
['I', 'I', 'was', 'chosen', 'to', 'build', 'a', '<unk>', 'from', 'of', '<unk>', '-', ',', '<unk>', 'paper', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>']
['<s>', 'So', 'I', 'was', 'chosen', 'to', 'build', 'the', 'pavilion', 'out', 'of', 'paper', 'tubes', ',', 'recyclable', 'paper', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>']
Epoch: 7, Batch: 600, Loss: 27.066415786743164
['So', 'I', 'decided', 'to', 'give', 'to', 'myself', 'give', '.', 'well', 'job', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['<s>', 'So', 'I', 'decided', 'to', 'go', 'there', 'and', 'act', 'as', 'a', 'seller', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
Epoch: 7, Batch: 700, Loss: 22.629634857177734
['I', 'can', "n't", 'see', ',', 'and', 'I', 'need', 'help', '.', '</s>', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['<s>', 'I', 'ca', "n't", 'see', ',', 'and'

Epoch: 7, Batch: 2600, Loss: 29.65309715270996
['It', "'s", '<unk>', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['<s>', 'It', "'s", 'competitive', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
Epoch: 7, Batch: 2700, Loss: 31.731855392456055
['On', 'this', '<unk>', '<unk>', ',', 'a', 'kind', 'of', '<unk>', ',', 'based', ',', ',', 'we', "'re", 'in', 'in', '</s>', '<pad>', '<pad>', '<pad>']
['<s>', 'On', 'that', 'schematic', 'picture', ',', 'a', 'sort', 'of', 'time', '-', 'lapse', 'picture', ',', 'we', "'re", 'halfway', '.', '</s>', '<pad>', '<pad>', '<pad>']
Epoch: 7, Batch: 2800, Loss: 22.341169357299805
['<unk>', 'is', 'is', 'a', 'great', 'example', 'of', 'this', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['<s>', '

In [21]:
F.softmax(torch.matmul(rnn_output.squeeze(0),encoder_outputs.transpose(0,1)).squeeze(),dim=0).unsqueeze(0).unsqueeze(0)

tensor([[ 2.6539e-02,  4.2958e-02, -3.3869e-02,  ...,  3.6437e-02,
          1.6895e-02, -6.5814e-03]], device='cuda:0', grad_fn=<AddmmBackward>)

In [None]:
encoder_hidden = encoder.init_hidden()
    encoder_outputs, encoder_hidden = encoder(input_variable, encoder_hidden)
    
    # Prepare input and output variables
    decoder_input = Variable(torch.LongTensor([[SOS_token]]))
    decoder_context = Variable(torch.zeros(1, decoder.hidden_size))
    decoder_hidden = encoder_hidden # Use last hidden state from encoder to start decoder
    if USE_CUDA:
        decoder_input = decoder_input.cuda()
        decoder_context = decoder_context.cuda()

    # Choose whether to use teacher forcing
    use_teacher_forcing = random.random() < teacher_forcing_ratio
    if use_teacher_forcing:
        
        # Teacher forcing: Use the ground-truth target as the next input
        for di in range(target_length):
            decoder_output, decoder_context, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_context, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_variable[di])
            decoder_input = target_variable[di]

## Submission

In [None]:
# load test set
sentences = []
for i, l in enumerate(open("source_test.txt"), 1):
  sentences.append(re.split(' ', l))