# CS 287, Homework 3: Neural Machine Translation

In [119]:
import torch
from torch.nn.utils import clip_grad_norm_
torch.__version__
from common import *
## Setup

#!pip install --upgrade pip
#!pip install -q numpy

#!pip install -q torch torchtext spacy opt_einsum
#!pip install -qU git+https://github.com/harvardnlp/namedtensor
#!python -m spacy download en
#!python -m spacy download de

# Torch
import torch.nn as nn
import torch
# Text processing library and methods for pretrained word embeddings
from torchtext import data, datasets
# Named Tensor wrappers
from namedtensor import ntorch, NamedTensor
from namedtensor.text import NamedField
import numpy as np
%reload_ext autoreload
%autoreload 2

In [2]:
# split raw data into tokens
import spacy
# Load the German and English spaCy pipelines; the tokenizer helpers below use
# their `.tokenizer` attribute.
# NOTE(review): the 'de' / 'en' shortcut names match the `spacy download` commands in
# the setup cell; newer spaCy releases may require full model names — confirm.
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

def tokenize_de(text):
    """Tokenize German `text` with the `spacy_de` tokenizer and return the token strings as a list."""
    pieces = []
    for token in spacy_de.tokenizer(text):
        pieces.append(token.text)
    return pieces

def tokenize_en(text):
    """Tokenize English `text` with the `spacy_en` tokenizer and return the token strings as a list."""
    pieces = []
    for token in spacy_en.tokenizer(text):
        pieces.append(token.text)
    return pieces

# add beginning-of-sentence and end-of-sentence tokens to target
BOS_WORD = '<s>'
EOS_WORD = '</s>'
# Source (German) field: tokenized with tokenize_de, no sentence-boundary markers.
# `names` is presumably the name given to the sequence dimension of the resulting
# named tensor — confirm against namedtensor's NamedField.
DE = NamedField(names=('srcSeqlen',), tokenize=tokenize_de)
# Target (English) field: tokenized with tokenize_en and wrapped in BOS_WORD ... EOS_WORD.
EN = NamedField(names=('trgSeqlen',), tokenize=tokenize_en,
                init_token = BOS_WORD, eos_token = EOS_WORD) # only target needs BOS/EOS

# download dataset of 200K pairs of sentences, keeping only short pairs
# start with MAX_LEN = 20
MAX_LEN = 20  # maximum number of tokens allowed on either side of a sentence pair


def _within_max_len(example):
    """Return True when both the source and target token lists have at most MAX_LEN tokens.

    Uses plain attribute access (`example.src`), the same way the batching
    sort key reads examples, instead of going through `vars(example)`.
    """
    return len(example.src) <= MAX_LEN and len(example.trg) <= MAX_LEN


train, val, test = datasets.IWSLT.splits(exts=('.de', '.en'), fields=(DE, EN),
                                         filter_pred=_within_max_len)
# Quick sanity output: field mapping, number of training pairs kept, and one raw example.
print(train.fields)
print(len(train))
print(vars(train[0]))

# Disabled snippet: it is kept as a bare string literal, so none of it runs.
# If enabled, it would write the tokenized validation pairs to `valid.src` and
# `valid.trg`, one space-joined sentence per line.
'''src = open("valid.src", "w")
trg = open("valid.trg", "w")
for example in val:
    print(" ".join(example.src), file=src)
    print(" ".join(example.trg), file=trg)
src.close()
trg.close()'''

# build vocab, convert words to indices
MIN_FREQ = 5  # minimum training-set frequency passed to build_vocab for a word to get its own index
DE.build_vocab(train.src, min_freq=MIN_FREQ)
EN.build_vocab(train.trg, min_freq=MIN_FREQ)
print(DE.vocab.freqs.most_common(10))
print("Size of German vocab", len(DE.vocab))
print(EN.vocab.freqs.most_common(10))
print("Size of English vocab", len(EN.vocab))
# Indices of the special tokens, looked up through the same constants / field
# attributes that defined them rather than repeating the literal strings.
print(EN.vocab.stoi[BOS_WORD], EN.vocab.stoi[EOS_WORD])

print(EN.vocab.stoi[EN.pad_token], EN.vocab.stoi[EN.unk_token])

{'src': <namedtensor.text.torch_text.NamedField object at 0x7f7656851b70>, 'trg': <namedtensor.text.torch_text.NamedField object at 0x7f764f700710>}
119076
{'src': ['David', 'Gallo', ':', 'Das', 'ist', 'Bill', 'Lange', '.', 'Ich', 'bin', 'Dave', 'Gallo', '.'], 'trg': ['David', 'Gallo', ':', 'This', 'is', 'Bill', 'Lange', '.', 'I', "'m", 'Dave', 'Gallo', '.']}
[('.', 113253), (',', 67237), ('ist', 24189), ('die', 23778), ('das', 17102), ('der', 15727), ('und', 15622), ('Sie', 15085), ('es', 13197), ('ich', 12946)]
Size of German vocab 13353
[('.', 113433), (',', 59512), ('the', 46029), ('to', 29177), ('a', 27548), ('of', 26794), ('I', 24887), ('is', 21775), ("'s", 20630), ('that', 19814)]
Size of English vocab 11560
2 3
1 0


In [3]:
# split data into batches
BATCH_SIZE = 32
# Use the first GPU when one is available; otherwise fall back to the CPU so this
# cell still runs on a machine without CUDA.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# BucketIterator is given the source length as sort key and does not repeat
# (one pass over the data per iteration of the iterator).
train_iter, val_iter = data.BucketIterator.splits((train, val), batch_size=BATCH_SIZE, device=device,
                                                  repeat=False, sort_key=lambda x: len(x.src))

## Sequence to Sequence Learning with Neural Networks

- English to French translation, $p \left( y_1, \dots, y_{T'} \ | \ x_1, \dots, x_T \right) = \prod_{t = 1}^{T'} p \left( y_t \ | \ v, y_1, \dots, y_{t-1} \right)$
- Each sentence ends in '<EOS\>', out-of-vocab words denoted '<UNK\>'
- Model specs: 
    * Input vocabulary of 160,000 and output vocabulary of 80,000
    * Deep LSTM to map (encode) input sequence to fixed-len vector
    * Another deep LSTM to translate (decode) fixed-len vector to output sequence
    * 4 layers per LSTM, 1000 cells per layer, 1000-dimensional word embeddings, softmax over 80,000 words
    * Reversing order of words in source (but not target) improved performance
        * Each word in the source is far from its corresponding word in the target (large minimal time lag); reversing the source reduces the minimal time lag, thereby allowing backprop to establish communication between source and target more easily
- Training specs:
    * Initialize all LSTM params $\sim Unif[-0.08,0.08]$
    * SGD w/o momentum, lr = 0.7
        * After 5 epochs, halve the lr every half-epoch
        * Train for 7.5 epochs
    * Batch size = 128; divide gradient by batch size (denoted $g$)
    * Hard constraint on the gradient norm: if $s = \|g\|_2 > 5$, rescale $g \leftarrow \frac{5g}{s}$
    * Make sure all sentences within a minibatch are roughly the same length
- Objective: $\max \frac{1}{|\mathcal{S}|} \sum_{(T,S) \in \mathcal{S}} \log p(T \mid S)$, where $\mathcal{S}$ is the training set
- Prediction: $\hat{T} = \arg\max_T \ p(T \mid S)$ via beam search, where beam size $B \in \{1,2\}$

In [121]:
# Encoder (source sequence -> context) and decoder (context -> target) networks.
# SequenceModel and RNNet are not defined in this notebook; presumably they come
# from `common` and are torch modules — confirm.
context_size = 500
seq2context = SequenceModel(len(DE.vocab),context_size,num_layers=2)
context2trg = RNNet(input_size=len(EN.vocab),hidden_size=context_size,num_layers=2,weight_tie=True)

# Place both networks on the same `device` the batch iterators were built with,
# instead of hard-coding `.cuda()`.
seq2context,context2trg = seq2context.to(device),context2trg.to(device)
# Base lr is 1 so that the effective learning rate is exactly what lr_lambda returns
# (LambdaLR multiplies the base lr by the lambda's value).
seq2context_optimizer = torch.optim.SGD(seq2context.parameters(), lr=1)
context2trg_optimizer = torch.optim.SGD(context2trg.parameters(), lr=1)


def lr_lambda(t):
    """Learning-rate factor for scheduler step `t`.

    Returns `learning_rate` unchanged while t <= 6, then divides it by 1.2 for
    every further step.
    NOTE(review): `learning_rate` is not defined in the visible cells; presumably it
    arrives via `from common import *` or an earlier kernel session — confirm.
    """
    return learning_rate / (1.2**max(t-6,0))


scheduler_c2t = torch.optim.lr_scheduler.LambdaLR(context2trg_optimizer, lr_lambda, last_epoch=-1)
scheduler_s2c = torch.optim.lr_scheduler.LambdaLR(seq2context_optimizer, lr_lambda, last_epoch=-1)



In [122]:
# Call .train() on both networks (for torch modules this switches on training-mode
# behaviour; assumes SequenceModel / RNNet are torch modules — confirm).
seq2context.train()
context2trg.train()
# Re-read the context size from the encoder object, overwriting the notebook-level value.
context_size = seq2context.context_size

In [123]:
# Forward-pass walk over the whole training iterator. No loss is computed and no
# optimizer step is taken in this cell, so it does not update any parameters.
# NOTE(review): looks like a shape/sanity check — a single batch would probably be
# enough, and `ix`, `context`, `output`, `hidden_lm` are not used here; confirm intent.
for ix,batch in enumerate(train_iter):
    # Reset gradients tracked by the two optimizers.
    seq2context_optimizer.zero_grad()
    context2trg_optimizer.zero_grad()
        
    # Swap the first two axes of the raw tensors; later cells index the result as
    # [sentence, position], so this presumably yields (batch, seqlen).
    src = batch.src.values.transpose(0,1)
    # reverse_sequence is not defined in this notebook (presumably from `common`).
    src = reverse_sequence(src)
    trg = batch.trg.values.transpose(0,1)
    # Encode the source, then run the decoder on the target without its last
    # position, starting from the encoder's hidden state.
    context, hidden_s2c = seq2context(src)
    output, hidden_lm = context2trg(trg[:,:-1],hidden_s2c)
    
    

In [125]:
# Run 10 epochs; after each epoch advance both learning-rate schedulers by one step.
# training_loop is not defined in this notebook (presumably from `common`).
for e in range(10):
    training_loop(e,train_iter,seq2context,context2trg,seq2context_optimizer,context2trg_optimizer,BATCH_SIZE)
    scheduler_c2t.step()
    scheduler_s2c.step()
    # NOTE(review): validation is disabled; the commented call also refers to
    # `seq2context_sch` / `context2trg_sch`, which are not defined in the visible cells.
    #validation_loop(e,val_iter,seq2context,context2trg,seq2context_sch,context2trg_sch,BATCH_SIZE)

Epoch: 0, Batch: 0, Loss: 52.18206024169922, Variance: 1337.692626953125
Epoch: 0, Batch: 100, Loss: 54.820556640625, Variance: 1533.808349609375
Epoch: 0, Batch: 200, Loss: 45.864437103271484, Variance: 1175.11474609375
Epoch: 0, Batch: 300, Loss: 58.96554183959961, Variance: 4183.87158203125
Epoch: 0, Batch: 400, Loss: 56.798648834228516, Variance: 1819.83203125
Epoch: 0, Batch: 500, Loss: 54.7036018371582, Variance: 1071.6414794921875
Epoch: 0, Batch: 600, Loss: 61.153053283691406, Variance: 3251.94580078125
Epoch: 0, Batch: 700, Loss: 54.923439025878906, Variance: 1505.384521484375
Epoch: 0, Batch: 800, Loss: 46.069190979003906, Variance: 4835.55810546875
Epoch: 0, Batch: 900, Loss: 59.25823974609375, Variance: 2074.23779296875
Epoch: 0, Batch: 1000, Loss: 45.67032241821289, Variance: 2824.869873046875
Epoch: 0, Batch: 1100, Loss: 57.12811279296875, Variance: 4684.85546875
Epoch: 0, Batch: 1200, Loss: 49.58352279663086, Variance: 915.0695190429688
Epoch: 0, Batch: 1300, Loss: 52.87

Epoch: 2, Batch: 3500, Loss: 41.845130920410156, Variance: 5340.4267578125
Epoch: 2, Batch: 3600, Loss: 51.503623962402344, Variance: 4922.91845703125
Epoch: 2, Batch: 3700, Loss: 45.78595733642578, Variance: 6115.80224609375
Epoch: 3, Batch: 0, Loss: 46.17428207397461, Variance: 3764.007568359375
Epoch: 3, Batch: 100, Loss: 47.73191452026367, Variance: 14665.396484375
Epoch: 3, Batch: 200, Loss: 43.12935256958008, Variance: 5597.5595703125
Epoch: 3, Batch: 300, Loss: 47.79056167602539, Variance: 9760.158203125
Epoch: 3, Batch: 400, Loss: 46.635135650634766, Variance: 21501.12109375
Epoch: 3, Batch: 500, Loss: 47.73276138305664, Variance: 9774.1220703125
Epoch: 3, Batch: 600, Loss: 45.85759353637695, Variance: 9467.9189453125
Epoch: 3, Batch: 700, Loss: 46.8093376159668, Variance: 6838.7412109375
Epoch: 3, Batch: 800, Loss: 39.19867706298828, Variance: 7082.6884765625
Epoch: 3, Batch: 900, Loss: 39.12876892089844, Variance: 30252.23046875
Epoch: 3, Batch: 1000, Loss: 41.857208251953125

Epoch: 5, Batch: 3300, Loss: 34.462039947509766, Variance: 8221.2626953125
Epoch: 5, Batch: 3400, Loss: 40.93482971191406, Variance: 8114.609375
Epoch: 5, Batch: 3500, Loss: 42.83749771118164, Variance: 30532.6953125
Epoch: 5, Batch: 3600, Loss: 43.540283203125, Variance: 43135.3046875
Epoch: 5, Batch: 3700, Loss: 46.27184295654297, Variance: 6681.1279296875
Epoch: 6, Batch: 0, Loss: 37.7484245300293, Variance: 46228.7890625
Epoch: 6, Batch: 100, Loss: 46.45466232299805, Variance: 44198.33984375
Epoch: 6, Batch: 200, Loss: 41.95960235595703, Variance: 17436.609375
Epoch: 6, Batch: 300, Loss: 43.59656524658203, Variance: 84472.2421875
Epoch: 6, Batch: 400, Loss: 43.830257415771484, Variance: 8190.99951171875
Epoch: 6, Batch: 500, Loss: 40.59212875366211, Variance: 31837.552734375
Epoch: 6, Batch: 600, Loss: 40.56708908081055, Variance: 8624.8916015625
Epoch: 6, Batch: 700, Loss: 37.19648361206055, Variance: 5548.4462890625
Epoch: 6, Batch: 800, Loss: 39.65617370605469, Variance: 26506.1

Epoch: 8, Batch: 3200, Loss: 36.66256332397461, Variance: 74106.6171875
Epoch: 8, Batch: 3300, Loss: 37.974395751953125, Variance: 25507.521484375
Epoch: 8, Batch: 3400, Loss: 40.10917663574219, Variance: 11226.9541015625
Epoch: 8, Batch: 3500, Loss: 37.86681365966797, Variance: 61479.40234375
Epoch: 8, Batch: 3600, Loss: 33.697078704833984, Variance: 27347.599609375
Epoch: 8, Batch: 3700, Loss: 43.85010528564453, Variance: 43441.5234375
Epoch: 9, Batch: 0, Loss: 42.134159088134766, Variance: 40197.92578125
Epoch: 9, Batch: 100, Loss: 32.69289016723633, Variance: 37285.796875
Epoch: 9, Batch: 200, Loss: 35.8642692565918, Variance: 14448.7099609375
Epoch: 9, Batch: 300, Loss: 38.35967254638672, Variance: 110047.7109375
Epoch: 9, Batch: 400, Loss: 37.76131820678711, Variance: 11352.5830078125
Epoch: 9, Batch: 500, Loss: 41.570919036865234, Variance: 141680.046875
Epoch: 9, Batch: 600, Loss: 41.1958122253418, Variance: 15427.5673828125
Epoch: 9, Batch: 700, Loss: 38.54166030883789, Varian

In [132]:
# Inspect the model on a single training batch: print the per-position argmax
# prediction next to the reference target for one sentence.
batch = next(iter(train_iter))  # first batch only (replaces a loop that broke after one pass)
src = batch.src.values.transpose(0,1)
src = reverse_sequence(src)
trg = batch.trg.values.transpose(0,1)

h0=None
# Nothing is trained in this cell, so skip autograd bookkeeping for the forward pass.
# NOTE(review): the networks stay in whatever mode they are currently in; if they use
# dropout, consider calling .eval() first so the printed predictions are deterministic.
with torch.no_grad():
    context, hidden_s2c = seq2context(src,h0)
    output, hidden_lm = context2trg(trg[:,:-1],hidden_s2c)


# Sentence at index 3 of the batch. The decoder input is trg[:, :-1], so the prediction
# at position t lines up with the reference token at position t+1 (the reference line
# below starts with the <s> token). `lsm` is not defined in this notebook
# (presumably from `common`).
print([EN.vocab.itos[i] for i in torch.argmax(lsm(output),2)[3,:]])
print([EN.vocab.itos[i] for i in trg[3,:]])


['Well', ',', 'a', 'of', 'to', 'this', 'with', 'this', 'problem', 'is', '<unk>', '<unk>', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
['<s>', 'Now', ',', 'one', 'way', 'of', 'dealing', 'with', 'this', 'problem', 'is', 'by', '<unk>', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']


## Submission

In [None]:
# load test set
# Each line of source_test.txt is split on single spaces into a list of tokens.
# NOTE(review): the file handle is never closed explicitly; lines are not stripped,
# so a trailing newline stays attached to the last token; and `re` is not imported
# in the visible cells (presumably it arrives via `from common import *`) — confirm.
sentences = []
for i, l in enumerate(open("source_test.txt"), 1):
  sentences.append(re.split(' ', l))