# CS 287, Homework 3: Neural Machine Translation

In [2]:
from common import *

import torch
from torch.nn.utils import clip_grad_norm_
import torch.nn as nn
from torchtext import data, datasets
from namedtensor import ntorch, NamedTensor
from namedtensor.text import NamedField
import numpy as np

%reload_ext autoreload
%autoreload 2

In [3]:
# split raw data into tokens
import spacy
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

def tokenize_de(text):
    return [tok.text for tok in spacy_de.tokenizer(text)]

def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

# add beginning-of-sentence and end-of-sentence tokens to target
BOS_WORD = '<s>'
EOS_WORD = '</s>'
DE = NamedField(names=('srcSeqlen',), tokenize=tokenize_de)
EN = NamedField(names=('trgSeqlen',), tokenize=tokenize_en,
                init_token = BOS_WORD, eos_token = EOS_WORD) # only target needs BOS/EOS

# download dataset of 200K pairs of sentences
# start with MAXLEN = 20
MAX_LEN = 20
train, val, test = datasets.IWSLT.splits(exts=('.de', '.en'), fields=(DE, EN), 
                                         filter_pred=lambda x: len(vars(x)['src']) <= MAX_LEN and 
                                         len(vars(x)['trg']) <= MAX_LEN)
#print(train.fields)
#print(len(train))
#print(vars(train[0]))

# build vocab, convert words to indices
MIN_FREQ = 5
DE.build_vocab(train.src, min_freq=MIN_FREQ)
EN.build_vocab(train.trg, min_freq=MIN_FREQ)
#print(DE.vocab.freqs.most_common(10))
#print("Size of German vocab", len(DE.vocab))
#print(EN.vocab.freqs.most_common(10))
#print("Size of English vocab", len(EN.vocab))
print(EN.vocab.stoi["<s>"], EN.vocab.stoi["</s>"])
print(EN.vocab.stoi["<pad>"], EN.vocab.stoi["<unk>"])

2 3
1 0


In [4]:
# split data into batches
BATCH_SIZE = 32
device = torch.device('cuda:0')
train_iter, val_iter = data.BucketIterator.splits((train, val), batch_size=BATCH_SIZE, device=device,
                                                  repeat=False, sort_key=lambda x: len(x.src))

## Sequence to Sequence Learning with Neural Networks

- English to French translation, $p \left( y_1, \dots, y_{T'} \ | \ x_1, \dots, x_T \right) = \prod_{t = 1}^{T'} p \left( y_t \ | \ v, y_1, \dots, y_{t-1} \right)$
- Each sentence ends in '<EOS\>', out-of-vocab words denoted '<UNK\>'
- Model specs: 
    * Input vocabulary of 160,000 and output vocabulary of 80,000
    * Deep LSTM to map (encode) input sequence to fixed-len vector
    * Another deep LSTM to translate (decode) fixed-len vector to output sequence
    * 4 layers per LSTM, 1000 cells per layer, 1000-dimensional word embeddings, softmax over 80,000 words
    * Reversing order of words in source (but not target) improved performance
        * Each word in the source is far from its corresponding word in the target (large minimal time lag); reversing the source reduces the minimal time lag, thereby allowing backprop to establish communication between source and target more easily
- Training specs:
    * Initialize all LSTM params $\sim Unif[-0.08,0.08]$
    * SGD w/o momentum, lr = 0.7
        * After 5 epochs, halve the lr every half-epoch
        * Train for 7.5 epochs
    * Batch size = 128; divide gradient by batch size (denoted $g$)
    * Hard constraint gradient norm; if $s = ||g||_2 > 5$, set $s = 5$
    * Make sure all sentences within a minibatch are roughly the same length
- Objective: $max \frac{1}{|S|} \sum_{(T,S) \in \mathcal{S}} log \ p(T \ | \ S)$, where $\mathcal{S}$ is the training set
- Prediction: $\hat{T} = argmax \ p(T \ | \ S)$ via beam search, where beam size $B \in {1,2}$

In [5]:
'''class SequenceModel(nn.Module):
    def __init__(self, src_vocab_size, context_size,weight_init = 0.08):
        super(SequenceModel, self).__init__()
        # embedding
        self.embedding = nn.Embedding(src_vocab_size, context_size)
        # langauge summarization
        self.lstm = nn.LSTM(input_size=context_size, hidden_size=context_size, num_layers=2, batch_first=True)
        for p in self.lstm.parameters():
            torch.nn.init.uniform_(p, a=weight_init, b=weight_init)

    def forward(self, inputs, h0=None):
        # embed the words 
        embedded = self.embedding(inputs)
        # summarize context
        context, hidden = self.lstm(embedded,h0)
        return context, hidden
    
class LanguageModel(nn.Module):
    def __init__(self, target_vocab_size, hidden_size, context_size, weight_init = 0.08):
        super(LanguageModel, self).__init__()
        # context is batch_size x seq_len x context_size
        # context to hidden
        self.embedding = nn.Embedding(target_vocab_size, hidden_size)
        # hidden to hidden 
        self.lstm = nn.LSTM(input_size=hidden_size, hidden_size=hidden_size, num_layers=1, batch_first=True)
        # decode hidden state for y_t
        for p in self.lstm.parameters():
            torch.nn.init.uniform_(p, a=weight_init, b=weight_init)
            
        self.translate = nn.Linear(hidden_size, target_vocab_size)

    def forward(self, inputs, h0=None):
        # embed the trg words
        embedded = self.embedding(inputs)
        # setting hidden state to context at t=0
        # otherwise context = prev hidden state
        output, hidden = self.lstm(embedded, h0)
        output = self.translate(output)
        return output,hidden'''

'class SequenceModel(nn.Module):\n    def __init__(self, src_vocab_size, context_size,weight_init = 0.08):\n        super(SequenceModel, self).__init__()\n        # embedding\n        self.embedding = nn.Embedding(src_vocab_size, context_size)\n        # langauge summarization\n        self.lstm = nn.LSTM(input_size=context_size, hidden_size=context_size, num_layers=2, batch_first=True)\n        for p in self.lstm.parameters():\n            torch.nn.init.uniform_(p, a=weight_init, b=weight_init)\n\n    def forward(self, inputs, h0=None):\n        # embed the words \n        embedded = self.embedding(inputs)\n        # summarize context\n        context, hidden = self.lstm(embedded,h0)\n        return context, hidden\n    \nclass LanguageModel(nn.Module):\n    def __init__(self, target_vocab_size, hidden_size, context_size, weight_init = 0.08):\n        super(LanguageModel, self).__init__()\n        # context is batch_size x seq_len x context_size\n        # context to hidden\n        s

In [6]:
'''lsm = nn.LogSoftmax(dim=2)
criterion = nn.CrossEntropyLoss(reduction='none')'''

"lsm = nn.LogSoftmax(dim=2)\ncriterion = nn.CrossEntropyLoss(reduction='none')"

In [7]:
'''def repackage_hidden(h):
    return tuple(v.detach() for v in h)
def repackage_layer(hidden_s2c,hidden=100):
    return tuple([hidden_s2c[0][-1].detach().view(1,BATCH_SIZE,hidden),hidden_s2c[1][-1].detach().view(1,BATCH_SIZE,hidden)])
def reverse_sequence(src):
    length = list(src.shape)[1]
    idx = torch.linspace(length-1, 0, steps=length).long()
    rev_src = src[:,idx]
    return rev_src'''

'def repackage_hidden(h):\n    return tuple(v.detach() for v in h)\ndef repackage_layer(hidden_s2c,hidden=100):\n    return tuple([hidden_s2c[0][-1].detach().view(1,BATCH_SIZE,hidden),hidden_s2c[1][-1].detach().view(1,BATCH_SIZE,hidden)])\ndef reverse_sequence(src):\n    length = list(src.shape)[1]\n    idx = torch.linspace(length-1, 0, steps=length).long()\n    rev_src = src[:,idx]\n    return rev_src'

In [8]:
context_size = 500
num_layers = 2
seq2context = SequenceModel(len(DE.vocab),context_size,num_layers)
context2trg = LanguageModel(len(EN.vocab),hidden_size=context_size,context_size=context_size)
seq2context,context2trg = seq2context.cuda(),context2trg.cuda()
seq2context_optimizer = torch.optim.Adam(seq2context.parameters(), lr=1e-5)
context2trg_optimizer = torch.optim.Adam(context2trg.parameters(), lr=1e-5)

In [9]:
'''def training_loop(e=0):
    seq2context.train()
    context2trg.train()
    h0 = None
    for ix,batch in enumerate(train_iter):
        seq2context_optimizer.zero_grad()
        context2trg_optimizer.zero_grad()
        
        src = batch.src.values.transpose(0,1)
        src = reverse_sequence(src)
        trg = batch.trg.values.transpose(0,1)
        if src.shape[0]!=BATCH_SIZE:
            break
        else:
            # generate hidden state for decoder
            context, hidden_s2c = seq2context(src,h0)
            hidden = repackage_layer(hidden_s2c,context_size)
            output, hidden_lm = context2trg(trg[:,:-1],hidden)
            loss = criterion(output.transpose(2,1),trg[:,1:])
            mask = trg[:,1:]!=1
            loss = loss[mask].sum()
            #clip_grad_norm_(seq2context.parameters(), max_norm=5)
            #clip_grad_norm_(context2trg.parameters(), max_norm=5)
            loss.backward()
            seq2context_optimizer.step()
            context2trg_optimizer.step()
        if np.mod(ix,100) == 0:
            var = torch.var(torch.argmax(lsm(output).cpu().detach(),2).float())
            print('Epoch: {}, Batch: {}, loss: {}, var: {},'.format(e, ix, loss.cpu().detach()/BATCH_SIZE, var))
    loss = 0
    for b in iter(val_iter):
        src = b.src.values.transpose(0,1)
        src = reverse_sequence(src)
        trg = b.trg.values.transpose(0,1)
        if src.shape[0]!=BATCH_SIZE:
            break
        else:
            # generate hidden state for decoder
            context, hidden_s2c = seq2context(src,h0)
            hidden = repackage_layer(hidden_s2c,context_size)
            output, hidden_lm = context2trg(trg[:,:-1],hidden)
            bloss = criterion(output.transpose(2,1),trg[:,1:])
            mask = trg[:,1:]!=1
            loss += bloss[mask].sum()
    print('Epoch: {}, loss: {}, var: {},'.format(e, loss.cpu().detach()/(BATCH_SIZE*len(val_iter))))'''

"def training_loop(e=0):\n    seq2context.train()\n    context2trg.train()\n    h0 = None\n    for ix,batch in enumerate(train_iter):\n        seq2context_optimizer.zero_grad()\n        context2trg_optimizer.zero_grad()\n        \n        src = batch.src.values.transpose(0,1)\n        src = reverse_sequence(src)\n        trg = batch.trg.values.transpose(0,1)\n        if src.shape[0]!=BATCH_SIZE:\n            break\n        else:\n            # generate hidden state for decoder\n            context, hidden_s2c = seq2context(src,h0)\n            hidden = repackage_layer(hidden_s2c,context_size)\n            output, hidden_lm = context2trg(trg[:,:-1],hidden)\n            loss = criterion(output.transpose(2,1),trg[:,1:])\n            mask = trg[:,1:]!=1\n            loss = loss[mask].sum()\n            #clip_grad_norm_(seq2context.parameters(), max_norm=5)\n            #clip_grad_norm_(context2trg.parameters(), max_norm=5)\n            loss.backward()\n            seq2context_optimizer.ste

In [10]:
'''for e in range(2):
    training_loop(e,train_iter,seq2context,context2trg,seq2context_optimizer,context2trg_optimizer,BATCH_SIZE)
    validation_loop(e,val_iter,seq2context,context2trg,BATCH_SIZE)'''

'for e in range(2):\n    training_loop(e,train_iter,seq2context,context2trg,seq2context_optimizer,context2trg_optimizer,BATCH_SIZE)\n    validation_loop(e,val_iter,seq2context,context2trg,BATCH_SIZE)'

In [11]:
'''for ix,batch in enumerate(train_iter):
    src = batch.src.values.transpose(0,1)
    trg = batch.trg.values.transpose(0,1)
    break

h0 = None
context, hidden_s2c = seq2context(reverse_sequence(src),h0)
hidden = repackage_layer(hidden_s2c,context_size)
output, hidden_lm = context2trg(trg[:,:-1],hidden)

[EN.vocab.itos[i] for i in torch.argmax(lsm(output),2)[30,:]]'''

'for ix,batch in enumerate(train_iter):\n    src = batch.src.values.transpose(0,1)\n    trg = batch.trg.values.transpose(0,1)\n    break\n\nh0 = None\ncontext, hidden_s2c = seq2context(reverse_sequence(src),h0)\nhidden = repackage_layer(hidden_s2c,context_size)\noutput, hidden_lm = context2trg(trg[:,:-1],hidden)\n\n[EN.vocab.itos[i] for i in torch.argmax(lsm(output),2)[30,:]]'

## Beam Search

In [12]:
START_IDX = EN.vocab.stoi["<s>"]
END_IDX = EN.vocab.stoi["</s>"]
BEAM_WIDTH = 3
lsm = torch.nn.LogSoftmax(dim=2)

In [13]:
it = iter(val_iter)
b = next(it)
ger = torch.unsqueeze(b.src.values.transpose(0,1)[0,:],0)
#eng = b.trg.values.transpose(0,1)[0,:]
ger.shape

torch.Size([1, 4])

In [14]:
h0 = None
context, hidden_s2c = seq2context(ger,h0)
hidden = repackage_layer(hidden_s2c,context_size,BATCH_SIZE=1)

In [15]:
x = torch.tensor([START_IDX], device='cuda')
x = torch.unsqueeze(x.repeat(BEAM_WIDTH),-1)
x.shape

torch.Size([3, 1])

In [16]:
a = hidden[0]
a_size = a.shape
b = hidden[1]
b_size = b.shape
hidden = tuple((a.repeat(1,BEAM_WIDTH,1),b.repeat(1,BEAM_WIDTH,1)))
hidden[0].shape, hidden[1].shape

(torch.Size([1, 3, 500]), torch.Size([1, 3, 500]))

In [17]:
output, hidden_lm = context2trg(x,hidden)
temp = lsm(output).squeeze()

In [18]:
top = torch.topk(temp.view(-1), BEAM_WIDTH, sorted=False)
top[1].view(BEAM_WIDTH,-1).shape

torch.Size([3, 1])

In [19]:
x = torch.cat((x,top[1].view(BEAM_WIDTH,-1)),dim=1)
x.shape

torch.Size([3, 2])

In [20]:
hidden = hidden_lm
output, hidden_lm = context2trg(x,hidden)
temp = lsm(output)
temp.shape

torch.Size([3, 2, 11560])

In [21]:
#torch.argmax(temp[:,-1,:])

RuntimeError: cuda runtime error (59) : device-side assert triggered at /home/sager/pytorch/aten/src/THC/THCTensorCopy.cu:205

In [None]:
[EN.vocab.itos[i] for i in eng],[DE.vocab.itos[i] for i in ger]

## Submission

In [None]:
'''# load test set
sentences = []
for i, l in enumerate(open("source_test.txt"), 1):
  sentences.append(re.split(' ', l))'''