# CS 287, Homework 3: Neural Machine Translation

In [12]:
import torch
from torch.nn.utils import clip_grad_norm_
torch.__version__
from common import *
## Setup
import torch.nn.functional as F

#!pip install --upgrade pip
#!pip install -q numpy

#!pip install -q torch torchtext spacy opt_einsum
#!pip install -qU git+https://github.com/harvardnlp/namedtensor
#!python -m spacy download en
#!python -m spacy download de

# Torch
import torch.nn as nn
import torch
# Text processing library and methods for pretrained word embeddings
from torchtext import data, datasets
# Named Tensor wrappers
from namedtensor import ntorch, NamedTensor
from namedtensor.text import NamedField
import numpy as np
%reload_ext autoreload
%autoreload 2

In [3]:
# Split raw text into tokens with spaCy.
import spacy
# Load the German and English spaCy pipelines via their 'de' / 'en' names;
# tokenize_de / tokenize_en below read these two module-level objects.
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

def tokenize_de(text):
    """Return the token strings spaCy's German tokenizer produces for ``text``."""
    words = []
    for token in spacy_de.tokenizer(text):
        words.append(token.text)
    return words

def tokenize_en(text):
    """Return the token strings spaCy's English tokenizer produces for ``text``."""
    words = []
    for token in spacy_en.tokenizer(text):
        words.append(token.text)
    return words

# Markers placed at the start and end of every target sentence.
BOS_WORD = '<s>'
EOS_WORD = '</s>'

# Source side (German): tokenized only, no sentence markers.
DE = NamedField(names=('srcSeqlen',), tokenize=tokenize_de)
# Target side (English): tokenized and wrapped in BOS/EOS.
EN = NamedField(
    names=('trgSeqlen',),
    tokenize=tokenize_en,
    init_token=BOS_WORD,
    eos_token=EOS_WORD,
)

# Longest sentence (in tokens) kept on either side of a pair.
MAX_LEN = 20


def _fits_max_len(example):
    """Keep a sentence pair only if both 'src' and 'trg' have at most MAX_LEN tokens."""
    pair = vars(example)
    return len(pair['src']) <= MAX_LEN and len(pair['trg']) <= MAX_LEN


# Fetch the IWSLT German->English splits, dropping over-long pairs.
train, val, test = datasets.IWSLT.splits(
    exts=('.de', '.en'),
    fields=(DE, EN),
    filter_pred=_fits_max_len,
)

# Quick sanity check: field mapping, number of training pairs, first example.
print(train.fields)
print(len(train))
print(vars(train[0]))

# Disabled snippet: it sits inside a string literal, so it never executes.
# If run, it would write the validation split to "valid.src" / "valid.trg",
# one sentence per line with tokens joined by single spaces.
'''src = open("valid.src", "w")
trg = open("valid.trg", "w")
for example in val:
    print(" ".join(example.src), file=src)
    print(" ".join(example.trg), file=trg)
src.close()
trg.close()'''

# Build one vocabulary per language from the training split only.
# MIN_FREQ is handed to build_vocab as its min_freq threshold.
MIN_FREQ = 5
DE.build_vocab(train.src, min_freq=MIN_FREQ)
EN.build_vocab(train.trg, min_freq=MIN_FREQ)

# For each language: the ten most frequent tokens, then the vocabulary size.
for field, size_label in ((DE, "Size of German vocab"), (EN, "Size of English vocab")):
    print(field.vocab.freqs.most_common(10))
    print(size_label, len(field.vocab))

# Indices of the special tokens in the English vocabulary.
en_stoi = EN.vocab.stoi
print(en_stoi["<s>"], en_stoi["</s>"])

print(en_stoi["<pad>"], en_stoi["<unk>"])

{'src': <namedtensor.text.torch_text.NamedField object at 0x7f4d721a20b8>, 'trg': <namedtensor.text.torch_text.NamedField object at 0x7f4d6a7b3ac8>}
119076
{'src': ['David', 'Gallo', ':', 'Das', 'ist', 'Bill', 'Lange', '.', 'Ich', 'bin', 'Dave', 'Gallo', '.'], 'trg': ['David', 'Gallo', ':', 'This', 'is', 'Bill', 'Lange', '.', 'I', "'m", 'Dave', 'Gallo', '.']}
[('.', 113253), (',', 67237), ('ist', 24189), ('die', 23778), ('das', 17102), ('der', 15727), ('und', 15622), ('Sie', 15085), ('es', 13197), ('ich', 12946)]
Size of German vocab 13353
[('.', 113433), (',', 59512), ('the', 46029), ('to', 29177), ('a', 27548), ('of', 26794), ('I', 24887), ('is', 21775), ("'s", 20630), ('that', 19814)]
Size of English vocab 11560
2 3
1 0


In [4]:
# Split data into batches.
BATCH_SIZE = 32
# Use the first GPU when one is present; otherwise fall back to the CPU so this
# cell does not fail on CPU-only machines. (Later cells in this notebook still
# call .cuda() directly and need a GPU.)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# sort_key is the source length, which BucketIterator uses to group examples
# into batches; repeat=False makes each pass over an iterator one epoch.
train_iter, val_iter = data.BucketIterator.splits(
    (train, val),
    batch_size=BATCH_SIZE,
    device=device,
    repeat=False,
    sort_key=lambda x: len(x.src),
)

## Sequence to Sequence Learning with Neural Networks

- English to French translation, $p \left( y_1, \dots, y_{T'} \ | \ x_1, \dots, x_T \right) = \prod_{t = 1}^{T'} p \left( y_t \ | \ v, y_1, \dots, y_{t-1} \right)$
- Each sentence ends in '<EOS\>', out-of-vocab words denoted '<UNK\>'
- Model specs: 
    * Input vocabulary of 160,000 and output vocabulary of 80,000
    * Deep LSTM to map (encode) input sequence to fixed-len vector
    * Another deep LSTM to translate (decode) fixed-len vector to output sequence
    * 4 layers per LSTM, 1000 cells per layer, 1000-dimensional word embeddings, softmax over 80,000 words
    * Reversing order of words in source (but not target) improved performance
        * Each word in the source is far from its corresponding word in the target (large minimal time lag); reversing the source reduces the minimal time lag, thereby allowing backprop to establish communication between source and target more easily
- Training specs:
    * Initialize all LSTM params $\sim Unif[-0.08,0.08]$
    * SGD w/o momentum, lr = 0.7
        * After 5 epochs, halve the lr every half-epoch
        * Train for 7.5 epochs
    * Batch size = 128; divide gradient by batch size (denoted $g$)
    * Hard constraint on the gradient norm: compute $s = \|g\|_2$; if $s > 5$, rescale $g \leftarrow \frac{5g}{s}$ so that the norm equals 5
    * Make sure all sentences within a minibatch are roughly the same length
- Objective: $\max \frac{1}{|\mathcal{S}|} \sum_{(T,S) \in \mathcal{S}} \log p(T \mid S)$, where $\mathcal{S}$ is the training set of (target $T$, source $S$) sentence pairs
- Prediction: $\hat{T} = \arg\max_T \ p(T \mid S)$ via beam search, where beam size $B \in \{1,2\}$

In [58]:
# Hyper-parameters shared by the encoder and the decoder.
context_size = 500
num_layers = 2

# Encoder over the German vocabulary and decoder over the English vocabulary.
# Both now take the layer count from `num_layers` instead of repeating the
# literal 2, so changing the constant above actually changes the models.
seq2context = SequenceModel(len(DE.vocab),context_size,num_layers=num_layers)
context2trg = RNNet(input_size=len(EN.vocab),hidden_size=context_size,num_layers=num_layers,weight_tie=True)

# Move both models to the GPU before their parameters are handed to the optimizers.
seq2context,context2trg = seq2context.cuda(),context2trg.cuda()
seq2context_optimizer = torch.optim.SGD(seq2context.parameters(), lr=1)
context2trg_optimizer = torch.optim.SGD(context2trg.parameters(), lr=1)


def lr_lambda(t):
    """Learning-rate multiplier for LambdaLR.

    Returns 1 while the scheduler's step counter ``t`` is at most 6, then
    divides by 1.2 for every further step (1 / 1.2**(t - 6)).
    """
    return 1 / (1.2**max(t-6,0))


scheduler_c2t = torch.optim.lr_scheduler.LambdaLR(context2trg_optimizer, lr_lambda, last_epoch=-1)
scheduler_s2c = torch.optim.lr_scheduler.LambdaLR(seq2context_optimizer, lr_lambda, last_epoch=-1)



In [122]:
# Switch the encoder and the decoder to training mode.
for network in (seq2context, context2trg):
    network.train()
# Re-read the context size from the encoder so later cells agree with the model.
context_size = seq2context.context_size

In [123]:
# Scratch loop: runs an encoder + decoder forward pass on every training batch.
# NOTE(review): there is no loss, backward() or optimizer.step() here, so the
# loop only zeroes gradients and leaves the variables of the last batch
# (src, trg, context, hidden_s2c, output, hidden_lm) in the kernel.
for ix,batch in enumerate(train_iter):
    seq2context_optimizer.zero_grad()
    context2trg_optimizer.zero_grad()
        
    # Swap the first two axes of the raw tensors (presumably to batch-first; confirm).
    src = batch.src.values.transpose(0,1)
    # reverse_sequence is not defined in this notebook (presumably from `common`);
    # by its name it reverses the source token order, as in the notes above.
    src = reverse_sequence(src)
    trg = batch.trg.values.transpose(0,1)
    context, hidden_s2c = seq2context(src)
    # The decoder gets every target position except the last, starting from the
    # encoder's final hidden state.
    output, hidden_lm = context2trg(trg[:,:-1],hidden_s2c)
    
    

In [140]:
# Train for 5 epochs; after each training pass run the validation loop, which
# also receives both LR schedulers (presumably to step them; confirm in the
# definition of validation_loop, which is not in this notebook).
for e in range(5):
    training_loop(e,train_iter,seq2context,context2trg,seq2context_optimizer,context2trg_optimizer,BATCH_SIZE)
    validation_loop(e,val_iter,seq2context,context2trg,scheduler_s2c,scheduler_c2t,BATCH_SIZE)

Epoch: 0, Batch: 0, Loss: 30.344459533691406, Variance: 166542.078125
Epoch: 0, Batch: 100, Loss: 38.58367919921875, Variance: 152495.078125
Epoch: 0, Batch: 200, Loss: 32.62057876586914, Variance: 70155.3125
Epoch: 0, Batch: 300, Loss: 33.22350311279297, Variance: 77269.9453125
Epoch: 0, Batch: 400, Loss: 30.61174964904785, Variance: 58268.31640625
Epoch: 0, Batch: 500, Loss: 30.48539161682129, Variance: 179236.0625
Epoch: 0, Batch: 600, Loss: 30.524948120117188, Variance: 20881.13671875
Epoch: 0, Batch: 700, Loss: 36.902435302734375, Variance: 36496.84765625
Epoch: 0, Batch: 800, Loss: 31.251785278320312, Variance: 22397.3046875
Epoch: 0, Batch: 900, Loss: 36.22175598144531, Variance: 49594.46484375
Epoch: 0, Batch: 1000, Loss: 38.293609619140625, Variance: 32203.990234375
Epoch: 0, Batch: 1100, Loss: 32.65779113769531, Variance: 28263.1875
Epoch: 0, Batch: 1200, Loss: 30.370241165161133, Variance: 76748.1875
Epoch: 0, Batch: 1300, Loss: 28.27395248413086, Variance: 140897.828125
Epo

Epoch: 2, Batch: 3700, Loss: 30.963459014892578, Variance: 155103.625
Epoch: 2, Validation loss: 30.977779388427734, Validation ppl: 13.748322486877441
Epoch: 3, Batch: 0, Loss: 29.794776916503906, Variance: 126414.0390625
Epoch: 3, Batch: 100, Loss: 34.436279296875, Variance: 56234.109375
Epoch: 3, Batch: 200, Loss: 28.13518714904785, Variance: 163698.875
Epoch: 3, Batch: 300, Loss: 30.396564483642578, Variance: 38462.11328125
Epoch: 3, Batch: 400, Loss: 33.30223083496094, Variance: 92151.859375
Epoch: 3, Batch: 500, Loss: 35.09160614013672, Variance: 58381.34375
Epoch: 3, Batch: 600, Loss: 39.76861572265625, Variance: 91850.5
Epoch: 3, Batch: 700, Loss: 32.66163635253906, Variance: 18709.67578125
Epoch: 3, Batch: 800, Loss: 32.45072937011719, Variance: 53173.19140625
Epoch: 3, Batch: 900, Loss: 37.27729415893555, Variance: 47641.8828125
Epoch: 3, Batch: 1000, Loss: 29.915870666503906, Variance: 50089.875
Epoch: 3, Batch: 1100, Loss: 30.188358306884766, Variance: 141873.59375
Epoch: 3

In [9]:
# Take only the first training batch: the loop breaks after one iteration.
for ix,batch in enumerate(train_iter):
    src = batch.src.values.transpose(0,1)
    src = reverse_sequence(src)
    trg = batch.trg.values.transpose(0,1)
    break

# Encode the batch with no initial hidden state, then decode from the
# encoder's final state using all target positions except the last as input.
h0=None
context, hidden_s2c = seq2context(src,h0)
output, hidden_lm = context2trg(trg[:,:-1],hidden_s2c)


# Show the highest-scoring token at each position for row 3 of the batch,
# followed by that row's full target sentence.
# NOTE(review): the decoder input drops the last target position, so the
# predictions appear to line up with trg[3,1:] rather than trg[3,:] (the
# printed target is one token longer) -- confirm before comparing position-wise.
# `lsm` is not defined in this notebook (presumably a log-softmax from `common`).
print([EN.vocab.itos[i] for i in torch.argmax(lsm(output),2)[3,:]])
print([EN.vocab.itos[i] for i in trg[3,:]])


['transfer', 'effective', 'formula', 'collide', 'open', 'breathes', 'demanding', 'Also', 'vanished', 'party', 'condom', 'Things', 'daf-2', 'formula', 'party', 'James', 'party', 'outsiders', 'Bonnet', 'party', 'averse']
['<s>', 'We', 'also', 'use', '[', 'an', ']', 'electronic', 'medical', 'record', 'system', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']


In [43]:
#word_input, last_context, last_hidden, encoder_outputs
# Manual walk-through of one attention-decoder step for row 0 of the batch.

encoder_outputs, encoder_hidden = seq2context(src)
encoder_outputs = encoder_outputs[0,:,:]  # keep row 0 only (presumably the first sentence)
# The decoder embeds target-side (English) tokens, so the start symbol has to
# come from EN.vocab. DE was built without an init_token, so looking '<s>' up
# in DE.vocab does not return the index of a begin-of-sentence token.
word_input = torch.tensor([EN.vocab.stoi[BOS_WORD]], device='cuda')
last_context = torch.zeros([1, context_size], device='cuda') # 1 x context_size
last_hidden = encoder_hidden
# Keep only batch element 0 of each state tensor while preserving the batch
# axis (size 1). Slicing with 0:1 avoids hard-coding the layer count and the
# hidden size, which the previous view(2,1,500) did.
last_hidden = tuple(state[:, 0:1, :].contiguous() for state in last_hidden)
word_embedded = context2trg.emb(word_input).view(1, 1, -1) # 1 x 1 x emb
# NOTE(review): the embedding comes from context2trg while the RNN below belongs
# to attn_context2trg -- confirm that mixing the two models is intended.
rnn_input = torch.cat((word_embedded, last_context.unsqueeze(0)), 2) # 1 x 1 x (emb + context)
rnn_output, hidden = attn_context2trg.rnn(rnn_input, last_hidden)

In [28]:
def attn_dot(rnn_output,encoder_outputs):
    """Dot-product attention weights of one decoder state over the encoder states.

    As used in this notebook, ``rnn_output`` is a single decoder output of
    shape (1, 1, H) and ``encoder_outputs`` holds the S encoder states of one
    sentence as (S, H). The scores are the dot products between the decoder
    state and every encoder state, normalised with a softmax.

    Returns:
        Tensor of shape (1, 1, S) whose entries sum to 1.
    """
    scores = torch.matmul(rnn_output.squeeze(0), encoder_outputs.transpose(0,1)).squeeze()
    if scores.dim() == 0:
        # A source with a single position makes the bare squeeze() collapse the
        # scores to a scalar; restore the length-1 axis so the result is still
        # (1, 1, S) and can be fed to bmm().
        scores = scores.unsqueeze(0)
    return F.softmax(scores, dim=0).unsqueeze(0).unsqueeze(0)

In [29]:
# Attention weights over the encoder states for the current decoder output.
attn_weights = attn_dot(rnn_output,encoder_outputs)
# Weighted sum of the encoder states: (1, 1, S) x (1, S, H) -> (1, 1, H).
context = attn_weights.bmm(encoder_outputs.unsqueeze(1).transpose(0, 1))
# Drop the singleton axes so both tensors are (1, H) before concatenation.
# NOTE(review): this rebinds rnn_output, so re-running the cell without
# re-running the one that produced rnn_output changes its shape.
rnn_output = rnn_output.squeeze(0)
context = context.squeeze(1)
# Normalise over dim 1, the axis the projection is applied along. The previous
# dim=0 ran the log-softmax over the size-1 leading axis, which turns every
# entry into log(1) = 0.
output = F.log_softmax(attn_context2trg.lnr(torch.cat((rnn_output, context), 1)), 1)

In [104]:
# Prepare the inputs of the first decoder step, keeping the whole batch of
# encoder outputs and hidden states.
encoder_outputs, encoder_hidden = seq2context(src)
#encoder_outputs = encoder_outputs[0,:,:]
# The decoder consumes target-side (English) tokens, so the start symbol is
# looked up in EN.vocab. DE was built without an init_token, so '<s>' is not a
# begin-of-sentence entry in DE.vocab.
word_input = torch.tensor([EN.vocab.stoi[BOS_WORD]], device='cuda')
last_context = torch.zeros(1, context_size, device='cuda') # 1 x context_size
last_hidden = encoder_hidden
#last_hidden = tuple([last_hidden[0][:,0,:].view(2,1,500).contiguous(),last_hidden[1][:,0,:].view(2,1,500).contiguous()])

In [38]:
# First decoder step: pass the start token, the zero context vector, the
# encoder's final hidden state and the encoder outputs to the attention decoder,
# which returns four values (unpacked here as output, context, hidden, attention).
decoder_output, decoder_context, decoder_hidden, decoder_attention = attn_context2trg(word_input, last_context, last_hidden, encoder_outputs)

In [48]:
 # Display the shape of the context vector returned by the decoder step.
 decoder_context.shape #decoder_hidden.shape

torch.Size([1, 500])

In [52]:
# Next decoder step: feed the target token at row 1, position 1 together with
# the context and hidden state returned by the previous decoder call.
# NOTE(review): trg[1,1] is a 0-dim tensor, whereas the first step passed a
# 1-element tensor -- confirm attn_context2trg accepts both.
word_input = trg[1,1]
decoder_output, decoder_context, decoder_hidden, decoder_attention = attn_context2trg(word_input, decoder_context, decoder_hidden, encoder_outputs)


torch.Size([1, 1, 1000])


In [53]:
def compare_sentence(trg,ix=32):
    outputs = []
    for j in range(trg.shape[1] - 1):
        

torch.Size([1, 1, 1000])


Epoch: 2, Batch: 0, Loss: 161.0377197265625
['Solar', 'the', 'we', 'we', 'slow', 'for', 'body', 'like', 'we', 'for', '<pad>', 'get', 'structures', 'the', 'incredible', '<pad>', '<pad>', 'times', '<pad>', 'the', 'off']
['<s>', 'It', 'looks', 'like', 'it', "'s", 'kind', 'of', 'been', 'there', ',', 'and', 'then', 'crashed', 'all', 'these', 'simpler', 'forms', 'into', 'it', '.', '</s>']
Epoch: 2, Batch: 100, Loss: 195.60531616210938
['To', 'I', 'be', 'comes', '?', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['<s>', 'Just', 'does', "n't", 'make', 'sense', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
Epoch: 2, Batch: 200, Loss: 258.91851806640625
['It', 'car', 'will', 'things', 'of', '<pad>', 'about', '<pad>', 'of', '<pad>', '?', 'book', '.', 'U', '.', 'enhance', '<pad>', '<pad>', '.', '<pad>', '.']
['

KeyboardInterrupt: 

In [14]:
# Hyper-parameters shared by the encoder and the attention decoder.
context_size = 500
num_layers = 2

# Attention decoder over the English vocabulary; moved to the GPU before its
# parameters are handed to the optimizer.
attn_context2trg = attn_RNNet_batched(input_size=len(EN.vocab),hidden_size=context_size,num_layers=num_layers)
attn_context2trg = attn_context2trg.cuda()
attn_context2trg_optimizer = torch.optim.Adam(attn_context2trg.parameters(), lr=1e-3)

# Encoder over the German vocabulary. The .cuda() call now comes before the
# optimizer is built, matching the decoder above and the torch.optim guidance
# to move a model to its device before constructing optimizers for it.
seq2context = SequenceModel(len(DE.vocab),context_size,num_layers=num_layers)
seq2context = seq2context.cuda()
seq2context_optimizer = torch.optim.Adam(seq2context.parameters(), lr=1e-3)

# Schedulers that watch a metric being minimised, with patience=4.
scheduler_c2t = torch.optim.lr_scheduler.ReduceLROnPlateau(attn_context2trg_optimizer, mode="min", patience=4)
scheduler_s2c = torch.optim.lr_scheduler.ReduceLROnPlateau(seq2context_optimizer, mode="min", patience=4)



In [15]:
# Train the attention model and checkpoint both networks whenever the
# validation perplexity improves.
# Start from +inf so the first epoch always writes a checkpoint, however large
# its perplexity is (the previous 1e8 sentinel could block every save).
best_ppl = float('inf')
for e in range(0,300):
    attn_training_loop(e,train_iter,seq2context,attn_context2trg,seq2context_optimizer,attn_context2trg_optimizer)
    # Pass the notebook-level constants instead of repeating the literals 32 and
    # 500, so the validation loop stays in sync with the iterators and models.
    ppl = attn_validation_loop(e,val_iter,seq2context,attn_context2trg,scheduler_c2t,scheduler_s2c,BATCH_SIZE=BATCH_SIZE,context_size=context_size)
    if ppl < best_ppl:
        torch.save(seq2context.state_dict(),'best_seq2seq_withattn_seq2context.pt')
        torch.save(attn_context2trg.state_dict(),'best_seq2seq_withattn_context2trg.pt')
        best_ppl = ppl
        print('Wrote model!')

Epoch: 0, Validation PPL: 7016.48095703125
Wrote model!


In [21]:
# Attention weights of the current decoder output over the encoder states;
# this is the same expression that attn_dot wraps, so call the helper.
attn_dot(rnn_output, encoder_outputs)

tensor([[ 2.6539e-02,  4.2958e-02, -3.3869e-02,  ...,  3.6437e-02,
          1.6895e-02, -6.5814e-03]], device='cuda:0', grad_fn=<AddmmBackward>)

In [None]:
# NOTE(review): pasted fragment of a training routine, not runnable as a cell.
# The second line is indented under an unindented statement (IndentationError),
# and encoder, decoder, input_variable, target_variable, target_length, Variable,
# SOS_token, USE_CUDA, random, teacher_forcing_ratio, criterion and loss are not
# defined anywhere in this notebook.
# What the fragment shows: encode the input, start the decoder from a start
# token, a zero context vector and the encoder's last hidden state, then, when
# teacher forcing is chosen, feed the ground-truth target token back in at each
# step while accumulating the loss.
encoder_hidden = encoder.init_hidden()
    encoder_outputs, encoder_hidden = encoder(input_variable, encoder_hidden)
    
    # Prepare input and output variables
    decoder_input = Variable(torch.LongTensor([[SOS_token]]))
    decoder_context = Variable(torch.zeros(1, decoder.hidden_size))
    decoder_hidden = encoder_hidden # Use last hidden state from encoder to start decoder
    if USE_CUDA:
        decoder_input = decoder_input.cuda()
        decoder_context = decoder_context.cuda()

    # Choose whether to use teacher forcing
    use_teacher_forcing = random.random() < teacher_forcing_ratio
    if use_teacher_forcing:
        
        # Teacher forcing: Use the ground-truth target as the next input
        for di in range(target_length):
            decoder_output, decoder_context, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_context, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_variable[di])
            decoder_input = target_variable[di]

## Submission

In [None]:
# load test set
sentences = []
# Read the file inside a context manager so the handle is closed as soon as the
# lines are in memory (the bare open() in the loop header was never closed).
with open("source_test.txt") as test_file:
  test_lines = test_file.readlines()
# Split each line on single spaces. str.split(' ') yields exactly the pieces
# re.split(' ', l) did, without needing the `re` module, which this notebook
# never imports explicitly. The trailing newline stays attached to the last
# token, as before.
for i, l in enumerate(test_lines, 1):
  sentences.append(l.split(' '))