# CS 287, Homework 3: Neural Machine Translation

In [190]:
import torch
from torch.nn.utils import clip_grad_norm_
torch.__version__

## Setup

#!pip install --upgrade pip
#!pip install -q numpy

#!pip install -q torch torchtext spacy opt_einsum
#!pip install -qU git+https://github.com/harvardnlp/namedtensor
#!python -m spacy download en
#!python -m spacy download de

# Torch
import torch.nn as nn
import torch
# Text text processing library and methods for pretrained word embeddings
from torchtext import data, datasets
# Named Tensor wrappers
from namedtensor import ntorch, NamedTensor
from namedtensor.text import NamedField
import numpy as np

In [144]:
# split raw data into tokens
import spacy
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

def tokenize_de(text):
    return [tok.text for tok in spacy_de.tokenizer(text)]

def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

# add beginning-of-sentence and end-of-sentence tokens to target
BOS_WORD = '<s>'
EOS_WORD = '</s>'
DE = NamedField(names=('srcSeqlen',), tokenize=tokenize_de)
EN = NamedField(names=('trgSeqlen',), tokenize=tokenize_en,
                init_token = BOS_WORD, eos_token = EOS_WORD) # only target needs BOS/EOS

# download dataset of 200K pairs of sentences
# start with MAXLEN = 20
MAX_LEN = 20
train, val, test = datasets.IWSLT.splits(exts=('.de', '.en'), fields=(DE, EN), 
                                         filter_pred=lambda x: len(vars(x)['src']) <= MAX_LEN and 
                                         len(vars(x)['trg']) <= MAX_LEN)
print(train.fields)
print(len(train))
print(vars(train[0]))

# WHAT DOES THIS DO?
'''src = open("valid.src", "w")
trg = open("valid.trg", "w")
for example in val:
    print(" ".join(example.src), file=src)
    print(" ".join(example.trg), file=trg)
src.close()
trg.close()'''

# build vocab, convert words to indices
MIN_FREQ = 5
DE.build_vocab(train.src, min_freq=MIN_FREQ)
EN.build_vocab(train.trg, min_freq=MIN_FREQ)
print(DE.vocab.freqs.most_common(10))
print("Size of German vocab", len(DE.vocab))
print(EN.vocab.freqs.most_common(10))
print("Size of English vocab", len(EN.vocab))
print(EN.vocab.stoi["<s>"], EN.vocab.stoi["</s>"])

print(EN.vocab.stoi["<pad>"], EN.vocab.stoi["<unk>"])

{'src': <namedtensor.text.torch_text.NamedField object at 0x7f77635ff5c0>, 'trg': <namedtensor.text.torch_text.NamedField object at 0x7f776088fbe0>}
119076
{'src': ['David', 'Gallo', ':', 'Das', 'ist', 'Bill', 'Lange', '.', 'Ich', 'bin', 'Dave', 'Gallo', '.'], 'trg': ['David', 'Gallo', ':', 'This', 'is', 'Bill', 'Lange', '.', 'I', "'m", 'Dave', 'Gallo', '.']}
[('.', 113253), (',', 67237), ('ist', 24189), ('die', 23778), ('das', 17102), ('der', 15727), ('und', 15622), ('Sie', 15085), ('es', 13197), ('ich', 12946)]
Size of German vocab 13353
[('.', 113433), (',', 59512), ('the', 46029), ('to', 29177), ('a', 27548), ('of', 26794), ('I', 24887), ('is', 21775), ("'s", 20630), ('that', 19814)]
Size of English vocab 11560
2 3
1 0


In [245]:
# split data into batches
BATCH_SIZE = 128
device = torch.device('cuda:0')
train_iter, val_iter = data.BucketIterator.splits((train, val), batch_size=BATCH_SIZE, device=device,
                                                  repeat=False, sort_key=lambda x: len(x.src))

## Sequence to Sequence Learning with Neural Networks

- English to French translation, $p \left( y_1, \dots, y_{T'} \ | \ x_1, \dots, x_T \right) = \prod_{t = 1}^{T'} p \left( y_t \ | \ v, y_1, \dots, y_{t-1} \right)$
- Each sentence ends in '<EOS\>', out-of-vocab words denoted '<UNK\>'
- Model specs: 
    * Input vocabulary of 160,000 and output vocabulary of 80,000
    * Deep LSTM to map (encode) input sequence to fixed-len vector
    * Another deep LSTM to translate (decode) fixed-len vector to output sequence
    * 4 layers per LSTM, 1000 cells per layer, 1000-dimensional word embeddings, softmax over 80,000 words
    * Reversing order of words in source (but not target) improved performance
        * Each word in the source is far from its corresponding word in the target (large minimal time lag); reversing the source reduces the minimal time lag, thereby allowing backprop to establish communication between source and target more easily
- Training specs:
    * Initialize all LSTM params $\sim Unif[-0.08,0.08]$
    * SGD w/o momentum, lr = 0.7
        * After 5 epochs, halve the lr every half-epoch
        * Train for 7.5 epochs
    * Batch size = 128; divide gradient by batch size (denoted $g$)
    * Hard constraint gradient norm; if $s = ||g||_2 > 5$, set $s = 5$
    * Make sure all sentences within a minibatch are roughly the same length
- Objective: $max \frac{1}{|S|} \sum_{(T,S) \in \mathcal{S}} log \ p(T \ | \ S)$, where $\mathcal{S}$ is the training set
- Prediction: $\hat{T} = argmax \ p(T \ | \ S)$ via beam search, where beam size $B \in {1,2}$

In [274]:
class SequenceModel(nn.Module):
    def __init__(self, src_vocab_size, context_size,weight_init = 0.08):
        super(SequenceModel, self).__init__()
        # embedding
        self.embedding = nn.Embedding(src_vocab_size, context_size)
        # langauge summarization
        self.lstm = nn.LSTM(input_size=context_size, hidden_size=context_size, num_layers=6, batch_first=True)
        for p in self.lstm.parameters():
            torch.nn.init.uniform_(p, a=weight_init, b=weight_init)

    def forward(self, inputs, h0=None):
        # embed the words 
        embedded = self.embedding(inputs)
        # summarize context
        context, hidden = self.lstm(embedded,h0)
        return context, hidden
    
class LanguageModel(nn.Module):
    def __init__(self, target_vocab_size, hidden_size, context_size, weight_init = 0.08 ):
        super(LanguageModel, self).__init__()
        # context is batch_size x seq_len x context_size
        # context to hidden
        self.embedding = nn.Embedding(target_vocab_size, hidden_size)
        # hidden to hidden 
        self.lstm = nn.LSTM(input_size=hidden_size, hidden_size=hidden_size, num_layers=1, batch_first=True)
        # decode hidden state for y_t
        for p in self.lstm.parameters():
            torch.nn.init.uniform_(p, a=weight_init, b=weight_init)
            
        self.translate = nn.Linear(hidden_size, target_vocab_size)

    def forward(self, inputs, h0=None):
        # embed the trg words
        embedded = self.embedding(inputs)
        # setting hidden state to context at t=0
        # otherwise context = prev hidden state
        output, hidden = self.lstm(embedded, h0)
        output = self.translate(output)
        return output,hidden


In [308]:
context_size = 1500
seq2context = SequenceModel(len(DE.vocab),context_size)
context2trg = LanguageModel(len(EN.vocab),hidden_size=1500,context_size=context_size)
seq2context,context2trg = seq2context.cuda(),context2trg.cuda()
seq2context_optimizer = torch.optim.Adam(seq2context.parameters(), lr=1e-5)
context2trg_optimizer = torch.optim.Adam(context2trg.parameters(), lr=1e-5)

In [309]:
criterion = nn.CrossEntropyLoss(reduction='none')

In [310]:
def repackage_hidden(h):
    return tuple(v.detach() for v in h)
def repackage_layer(hidden_s2c,hidden=100):
    return tuple([hidden_s2c[0][-1].detach().view(1,BATCH_SIZE,hidden),hidden_s2c[1][-1].detach().view(1,BATCH_SIZE,hidden)])
    

In [311]:
#src = batch.src.values.transpose(0,1)
#trg = batch.trg.values.transpose(0,1)
#trg_str = torch.tensor(EN.vocab.stoi["<s>"],device='cuda').repeat(BATCH_SIZE).view(-1,1)
## example forward pass
#context, hidden_s2c = seq2context(src)
#hidden = repackage_layer(hidden_s2c)
#output, hidden_lm = context2trg(trg_str,hidden)
#hidden = repackage_hidden(hidden_lm)

In [312]:
def training_loop(e=0):
    seq2context.train()
    context2trg.train()
    h0 = None
    for ix,batch in enumerate(train_iter):
        seq2context_optimizer.zero_grad()
        context2trg_optimizer.zero_grad()
        
        src = batch.src.values.transpose(0,1)
        trg = batch.trg.values.transpose(0,1)
        if src.shape[0]!=BATCH_SIZE:
            break
        else:
            # generate hidden state for decoder
            context, hidden_s2c = seq2context(src,h0)
            hidden = repackage_layer(hidden_s2c,context_size)
            output, hidden_lm = context2trg(trg[:,:-1],hidden)
            loss = criterion(output.transpose(2,1),trg[:,1:])
            mask = trg[:,1:]!=1
            loss = loss[mask].sum()
            #clip_grad_norm_(seq2context.parameters(), max_norm=5)
            #clip_grad_norm_(context2trg.parameters(), max_norm=5)
            loss.backward()
            seq2context_optimizer.step()
            context2trg_optimizer.step()
        if np.mod(ix,100) == 0:
            print('Epoch: {}, Batch: {}, loss: {}'.format(e, ix, loss.cpu().detach()))
            
            

In [None]:
for e in range(3):
    training_loop(e)

Epoch: 0, Batch: 0, loss: 16886.04296875
Epoch: 0, Batch: 100, loss: 13038.07421875
Epoch: 0, Batch: 200, loss: 11312.76953125
Epoch: 0, Batch: 300, loss: 10550.916015625
Epoch: 0, Batch: 400, loss: 9419.884765625
Epoch: 0, Batch: 500, loss: 9613.4921875
Epoch: 0, Batch: 600, loss: 10366.25


In [282]:
[EN.vocab.itos[i] for i in trg[3,:]],[DE.vocab.itos[i] for i in src[3,:]]

(['<s>',
  'The',
  'oxygen',
  'tanks',
  'did',
  "n't",
  'quite',
  'work',
  'right',
  '.',
  '</s>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>'],
 ['Die',
  '<unk>',
  'funktionierten',
  'nicht',
  'richtig',
  '.',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>',
  '<pad>'])

In [300]:
h0=None
context, hidden_s2c = seq2context(src,h0)
hidden = repackage_layer(hidden_s2c,context_size)
output, hidden_lm = context2trg(trg[:,:-1],hidden)
lsm = torch.nn.LogSoftmax(dim=2)

In [307]:
output

tensor([[[  5.8818, -14.0210, -13.9399,  ...,  -5.9548,  -8.9341,  -9.0062],
         [  5.8818, -14.0210, -13.9399,  ...,  -5.9548,  -8.9341,  -9.0062],
         [  5.8818, -14.0210, -13.9399,  ...,  -5.9548,  -8.9341,  -9.0062],
         ...,
         [  5.8818, -14.0210, -13.9399,  ...,  -5.9548,  -8.9341,  -9.0062],
         [  5.8818, -14.0210, -13.9399,  ...,  -5.9548,  -8.9341,  -9.0062],
         [  5.8818, -14.0210, -13.9399,  ...,  -5.9548,  -8.9341,  -9.0062]],

        [[  5.8818, -14.0210, -13.9399,  ...,  -5.9548,  -8.9341,  -9.0062],
         [  5.8818, -14.0210, -13.9399,  ...,  -5.9548,  -8.9341,  -9.0062],
         [  5.8818, -14.0210, -13.9399,  ...,  -5.9548,  -8.9341,  -9.0062],
         ...,
         [  5.8818, -14.0210, -13.9399,  ...,  -5.9548,  -8.9341,  -9.0062],
         [  5.8818, -14.0210, -13.9399,  ...,  -5.9548,  -8.9341,  -9.0062],
         [  5.8818, -14.0210, -13.9399,  ...,  -5.9548,  -8.9341,  -9.0062]],

        [[  5.8818, -14.0210, -13.9399,  ...

## Submission

In [None]:
# load test set
sentences = []
for i, l in enumerate(open("source_test.txt"), 1):
  sentences.append(re.split(' ', l))