# CS 287, Homework 3: Neural Machine Translation

In [1]:
import torch
from torch.nn.utils import clip_grad_norm_
torch.__version__
from common import *
## Setup
import torch.nn.functional as F

#!pip install --upgrade pip
#!pip install -q numpy

#!pip install -q torch torchtext spacy opt_einsum
#!pip install -qU git+https://github.com/harvardnlp/namedtensor
#!python -m spacy download en
#!python -m spacy download de

# Torch
import torch.nn as nn
import torch
# Text processing library and methods for pretrained word embeddings
from torchtext import data, datasets
# Named Tensor wrappers
from namedtensor import ntorch, NamedTensor
from namedtensor.text import NamedField
import numpy as np
%reload_ext autoreload
%autoreload 2

In [2]:
# split raw data into tokens
# Load the spaCy pipelines once; their tokenizers are used by tokenize_de / tokenize_en below.
import spacy
spacy_de = spacy.load('de')  # pipeline behind tokenize_de
spacy_en = spacy.load('en')  # pipeline behind tokenize_en

def tokenize_de(text):
    """Return the token strings that spacy_de's tokenizer produces for `text`."""
    pieces = []
    for token in spacy_de.tokenizer(text):
        pieces.append(token.text)
    return pieces

def tokenize_en(text):
    """Return the token strings that spacy_en's tokenizer produces for `text`."""
    doc = spacy_en.tokenizer(text)
    return [word.text for word in doc]

# Sentinel tokens added around every target sentence.
BOS_WORD = '<s>'
EOS_WORD = '</s>'

# Source field: tokenized German, no sentinels.
DE = NamedField(names=('srcSeqlen',), tokenize=tokenize_de)
# Target field: tokenized English; only the target needs BOS/EOS.
EN = NamedField(
    names=('trgSeqlen',),
    tokenize=tokenize_en,
    init_token=BOS_WORD,
    eos_token=EOS_WORD,
)

# Download the dataset of 200K sentence pairs and keep only the pairs whose
# source and target both have at most MAX_LEN tokens (start with MAX_LEN = 20).
MAX_LEN = 20


def _fits_max_len(example):
    """True when both the 'src' and the 'trg' entry of `example` have at most MAX_LEN items."""
    entries = vars(example)
    return len(entries['src']) <= MAX_LEN and len(entries['trg']) <= MAX_LEN


train, val, test = datasets.IWSLT.splits(
    exts=('.de', '.en'),
    fields=(DE, EN),
    filter_pred=_fits_max_len,
)

# Quick sanity check: field mapping, number of training pairs, first example.
print(train.fields)
print(len(train))
print(vars(train[0]))

# Disabled snippet: it is wrapped in a bare string literal, so none of it runs.
# If executed, it would write every validation example to two text files,
# "valid.src" and "valid.trg", one example per line with its tokens joined by
# single spaces, and then close both files.
'''src = open("valid.src", "w")
trg = open("valid.trg", "w")
for example in val:
    print(" ".join(example.src), file=src)
    print(" ".join(example.trg), file=trg)
src.close()
trg.close()'''

# Build both vocabularies (word -> index), passing MIN_FREQ as the minimum frequency.
MIN_FREQ = 5
DE.build_vocab(train.src, min_freq=MIN_FREQ)
EN.build_vocab(train.trg, min_freq=MIN_FREQ)

# Ten most frequent words and vocabulary size, German first, then English.
for vocab, caption in ((DE.vocab, "Size of German vocab"), (EN.vocab, "Size of English vocab")):
    print(vocab.freqs.most_common(10))
    print(caption, len(vocab))

# Indices of the special tokens in the English vocabulary.
en_index = EN.vocab.stoi
print(en_index["<s>"], en_index["</s>"])

print(en_index["<pad>"], en_index["<unk>"])

{'src': <namedtensor.text.torch_text.NamedField object at 0x7fca278930b8>, 'trg': <namedtensor.text.torch_text.NamedField object at 0x7fca27893080>}
119076
{'src': ['David', 'Gallo', ':', 'Das', 'ist', 'Bill', 'Lange', '.', 'Ich', 'bin', 'Dave', 'Gallo', '.'], 'trg': ['David', 'Gallo', ':', 'This', 'is', 'Bill', 'Lange', '.', 'I', "'m", 'Dave', 'Gallo', '.']}
[('.', 113253), (',', 67237), ('ist', 24189), ('die', 23778), ('das', 17102), ('der', 15727), ('und', 15622), ('Sie', 15085), ('es', 13197), ('ich', 12946)]
Size of German vocab 13353
[('.', 113433), (',', 59512), ('the', 46029), ('to', 29177), ('a', 27548), ('of', 26794), ('I', 24887), ('is', 21775), ("'s", 20630), ('that', 19814)]
Size of English vocab 11560
2 3
1 0


In [3]:
# Split the data into batches of BATCH_SIZE examples placed on `device`.
BATCH_SIZE = 100
device = torch.device('cuda:0')


def _source_length(example):
    """Sort key handed to the iterator: number of source tokens in `example`."""
    return len(example.src)


train_iter, val_iter = data.BucketIterator.splits(
    (train, val),
    batch_size=BATCH_SIZE,
    device=device,
    repeat=False,
    sort_key=_source_length,
)

In [26]:
# Show column 0 of batch `b`'s target tensor after swapping its first two axes
# (presumably the first token of every sentence; the recorded all-2 output
# matches the index printed for "<s>" above).
# NOTE(review): `b` is not assigned in any cell visible here, so this relies on leftover kernel state.
b.trg.values.transpose(0,1)[:,0]

tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2], device='cuda:0')

## Sequence to Sequence Learning with Neural Networks

- English to French translation, $p \left( y_1, \dots, y_{T'} \ | \ x_1, \dots, x_T \right) = \prod_{t = 1}^{T'} p \left( y_t \ | \ v, y_1, \dots, y_{t-1} \right)$
- Each sentence ends in '<EOS\>', out-of-vocab words denoted '<UNK\>'
- Model specs: 
    * Input vocabulary of 160,000 and output vocabulary of 80,000
    * Deep LSTM to map (encode) input sequence to fixed-len vector
    * Another deep LSTM to translate (decode) fixed-len vector to output sequence
    * 4 layers per LSTM, 1000 cells per layer, 1000-dimensional word embeddings, softmax over 80,000 words
    * Reversing order of words in source (but not target) improved performance
        * Each word in the source is far from its corresponding word in the target (large minimal time lag); reversing the source reduces the minimal time lag, thereby allowing backprop to establish communication between source and target more easily
- Training specs:
    * Initialize all LSTM params $\sim Unif[-0.08,0.08]$
    * SGD w/o momentum, lr = 0.7
        * After 5 epochs, halve the lr every half-epoch
        * Train for 7.5 epochs
    * Batch size = 128; divide gradient by batch size (denoted $g$)
    * Hard constraint on the gradient norm: compute $s = \|g\|_2$; if $s > 5$, rescale $g \leftarrow \frac{5g}{s}$ so that the norm equals 5
    * Make sure all sentences within a minibatch are roughly the same length
- Objective: $\max \frac{1}{|\mathcal{S}|} \sum_{(T,S) \in \mathcal{S}} \log \ p(T \ | \ S)$, where $\mathcal{S}$ is the training set
- Prediction: $\hat{T} = \arg\max_T \ p(T \ | \ S)$ via beam search, where beam size $B \in \{1, 2\}$

In [4]:

# Hyperparameters shared by the encoder and the decoder.
context_size = 1000
num_layers = 4

# Decoder with attention: build it, move it to the GPU, then create its optimizer.
attn_context2trg = attn_RNNet_batched(input_size=len(EN.vocab), hidden_size=context_size, num_layers=num_layers)
attn_context2trg = attn_context2trg.cuda()
attn_context2trg_optimizer = torch.optim.Adam(attn_context2trg.parameters(), lr=1e-3)

# Encoder: same order as the decoder. The model is moved to the GPU *before*
# the optimizer is constructed, as the torch.optim documentation recommends,
# so the optimizer is guaranteed to hold the parameters that are trained.
seq2context = SequenceModel(len(DE.vocab), context_size, num_layers=num_layers)
seq2context = seq2context.cuda()
seq2context_optimizer = torch.optim.Adam(seq2context.parameters(), lr=1e-3)

# One plateau scheduler per optimizer; both are handed to attn_validation_loop.
scheduler_c2t = torch.optim.lr_scheduler.ReduceLROnPlateau(attn_context2trg_optimizer, mode="min", patience=4)
scheduler_s2c = torch.optim.lr_scheduler.ReduceLROnPlateau(seq2context_optimizer, mode="min", patience=4)



In [None]:
# Train for up to 299 epochs, checkpointing both models whenever the value
# returned by attn_validation_loop drops below the best one seen so far.
best_loss = 1e8

for e in range(1, 300):
    attn_training_loop(
        e, train_iter, seq2context, attn_context2trg,
        seq2context_optimizer, attn_context2trg_optimizer,
        BATCH_SIZE=BATCH_SIZE, context_size=context_size, EN=EN,
    )
    loss = attn_validation_loop(
        e, val_iter, seq2context, attn_context2trg,
        scheduler_c2t, scheduler_s2c,
        BATCH_SIZE=BATCH_SIZE, context_size=context_size, EN=EN,
    )
    if not (loss < best_loss):
        continue  # no improvement this epoch: keep the previous checkpoint
    torch.save(seq2context.state_dict(), 'best_seq2seq_seq2context_attn_late.pt')
    torch.save(attn_context2trg.state_dict(), 'best_seq2seq_attn_context2trg_late.pt')
    best_loss = loss
    print('Wrote model!')

Epoch: 1, Batch: 0, Loss: 108.34652709960938
Epoch: 1, Batch: 100, Loss: 68.9512710571289
Epoch: 1, Batch: 200, Loss: 61.18465805053711
Epoch: 1, Batch: 300, Loss: 64.8404312133789
Epoch: 1, Batch: 400, Loss: 59.554649353027344
Epoch: 1, Batch: 500, Loss: 52.91864776611328
Epoch: 1, Batch: 600, Loss: 52.738067626953125
Epoch: 1, Batch: 700, Loss: 53.7872200012207
Epoch: 1, Batch: 800, Loss: 49.517024993896484
Epoch: 1, Batch: 900, Loss: 52.06391143798828
Epoch: 1, Batch: 1000, Loss: 51.15459442138672
Epoch: 1, Batch: 1100, Loss: 51.567867279052734
Epoch: 1, Validation PPL: 42.08247756958008, Validation Loss: 21730.998046875
Wrote model!
Epoch: 2, Batch: 0, Loss: 52.247047424316406
Epoch: 2, Batch: 100, Loss: 47.6236686706543
Epoch: 2, Batch: 200, Loss: 47.986671447753906
Epoch: 2, Batch: 300, Loss: 49.60618209838867
Epoch: 2, Batch: 400, Loss: 48.80210494995117
Epoch: 2, Batch: 500, Loss: 47.934139251708984
Epoch: 2, Batch: 600, Loss: 46.388580322265625
Epoch: 2, Batch: 700, Loss: 43.1

In [27]:
# Display the encoder optimizer's current settings (see the recorded Adam parameter group below).
seq2context_optimizer

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.001
    weight_decay: 0
)

In [None]:
def _fresh_adam_and_scheduler(model):
    """Create a new Adam optimizer (lr=1e-3) for `model` plus a ReduceLROnPlateau scheduler on it."""
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", patience=4)
    return optimizer, scheduler


# Start over with fresh optimizer state and schedulers for both models.
seq2context_optimizer, scheduler_s2c = _fresh_adam_and_scheduler(seq2context)
attn_context2trg_optimizer, scheduler_c2t = _fresh_adam_and_scheduler(attn_context2trg)


In [34]:
# Split-training run: train with attn_training_split_loop, validate, and
# checkpoint both models whenever the value returned by attn_validation_loop
# improves on the best one seen so far.
#
# context_size is taken from the notebook-level setting instead of a literal
# 500, so it always matches the size the models were built with and agrees with
# the other training cells. (The output recorded below came from a session in
# which the models used a context size of 500.)
best_ppl = 1e8
for e in range(0,300):
    attn_training_split_loop(e,train_iter,seq2context,attn_context2trg,seq2context_optimizer,attn_context2trg_optimizer,EN=EN,context_size=context_size,BATCH_SIZE=BATCH_SIZE)
    ppl = attn_validation_loop(e,val_iter,seq2context,attn_context2trg,scheduler_c2t,scheduler_s2c,BATCH_SIZE=BATCH_SIZE,context_size=context_size)
    if ppl < best_ppl:
        torch.save(seq2context.state_dict(),'best_seq2seq_withattn_seq2context_splittrain_top3.pt')
        torch.save(attn_context2trg.state_dict(),'best_seq2seq_withattn_context2trg_splittrain_top3.pt')
        best_ppl = ppl
        print('Wrote model!')

Epoch: 0, Batch: 0, Loss: 27.17890167236328
Epoch: 0, Batch: 500, Loss: 28.94061279296875
Epoch: 0, Validation PPL: 9.539901733398438, Validation Loss: 9696.322265625
Wrote model!
Epoch: 1, Batch: 0, Loss: 28.223543167114258
Epoch: 1, Batch: 500, Loss: 47.28041076660156
Epoch: 1, Validation PPL: 8.965592384338379, Validation Loss: 9429.4013671875
Wrote model!
Epoch: 2, Batch: 0, Loss: 27.1455135345459
Epoch: 2, Batch: 500, Loss: 26.679922103881836
Epoch: 2, Validation PPL: 8.621665954589844, Validation Loss: 9261.2421875
Wrote model!
Epoch: 3, Batch: 0, Loss: 43.625587463378906
Epoch: 3, Batch: 500, Loss: 41.136505126953125
Epoch: 3, Validation PPL: 9.451292037963867, Validation Loss: 9656.205078125
Epoch: 4, Batch: 0, Loss: 42.23749923706055
Epoch: 4, Batch: 500, Loss: 40.912757873535156
Epoch: 4, Validation PPL: 9.303613662719727, Validation Loss: 9588.501953125
Epoch: 5, Batch: 0, Loss: 41.40532684326172
Epoch: 5, Batch: 500, Loss: 25.934093475341797
Epoch: 5, Validation PPL: 9.8296

KeyboardInterrupt: 

In [17]:
# Continue training with the top-3 split loop, numbering epochs from 20, and
# keep the checkpoint with the lowest value returned by attn_validation_loop.
best_ppl = 1e8
for e in range(20, 300):
    attn_training_split_loop_top_3(
        e,
        train_iter,
        seq2context,
        attn_context2trg,
        seq2context_optimizer,
        attn_context2trg_optimizer,
        BATCH_SIZE=BATCH_SIZE,
        context_size=context_size,
        EN=EN,
    )
    ppl = attn_validation_loop(
        e,
        val_iter,
        seq2context,
        attn_context2trg,
        scheduler_c2t,
        scheduler_s2c,
        BATCH_SIZE=BATCH_SIZE,
        context_size=context_size,
    )
    if ppl < best_ppl:
        # Improvement: overwrite both checkpoints, then record the new best.
        torch.save(seq2context.state_dict(), 'best_seq2seq_withattn_seq2context_splittrain_3.6.pt')
        torch.save(attn_context2trg.state_dict(), 'best_seq2seq_withattn_context2trg_splittrain_3.6.pt')
        best_ppl = ppl
        print('Wrote model!')




Epoch: 20, Batch: 0, Loss: 26.266517639160156
Epoch: 20, Batch: 100, Loss: 27.572736740112305
Epoch: 20, Batch: 200, Loss: 24.789466857910156
Epoch: 20, Batch: 300, Loss: 4.048263072967529
Epoch: 20, Batch: 400, Loss: 26.88125228881836
Epoch: 20, Batch: 500, Loss: 3.5436458587646484
Epoch: 20, Batch: 600, Loss: 26.24700355529785
Epoch: 20, Batch: 700, Loss: 4.560873508453369
Epoch: 20, Batch: 800, Loss: 30.419139862060547
Epoch: 20, Batch: 900, Loss: 4.189573287963867
Epoch: 20, Batch: 1000, Loss: 3.6460113525390625
Epoch: 20, Batch: 1100, Loss: 3.338818311691284
Epoch: 20, Validation PPL: 11.902926445007324, Validation Loss: 14870.6123046875
Epoch: 21, Batch: 0, Loss: 3.9499988555908203
Epoch: 21, Batch: 100, Loss: 3.9464218616485596
Epoch: 21, Batch: 200, Loss: 2.8606319427490234
Epoch: 21, Batch: 300, Loss: 3.382117986679077
Epoch: 21, Batch: 400, Loss: 25.091960906982422


KeyboardInterrupt: 

In [13]:
# Top-3 split training from epoch 0; checkpoints go to the "full_str" files.
best_ppl = 1e8
for e in range(300):
    attn_training_split_loop_top_3(
        e, train_iter, seq2context, attn_context2trg,
        seq2context_optimizer, attn_context2trg_optimizer,
        BATCH_SIZE=BATCH_SIZE, context_size=context_size, EN=EN,
    )
    ppl = attn_validation_loop(
        e, val_iter, seq2context, attn_context2trg,
        scheduler_c2t, scheduler_s2c,
        BATCH_SIZE=BATCH_SIZE, context_size=context_size,
    )
    if not (ppl < best_ppl):
        continue  # validation value did not improve
    torch.save(seq2context.state_dict(), 'best_seq2seq_withattn_seq2context_splittrain_full_str.pt')
    torch.save(attn_context2trg.state_dict(), 'best_seq2seq_withattn_context2trg_splittrain_full_str.pt')
    best_ppl = ppl
    print('Wrote model!')

Epoch: 0, Batch: 0, Loss: 60.514434814453125
Epoch: 0, Batch: 100, Loss: 59.80680847167969


KeyboardInterrupt: 

In [9]:
# Debug cell: put both models in training mode and pull the first batch out of
# train_iter so that `src` and `trg` can be inspected afterwards.
# NOTE(review): the unconditional `break` below leaves the loop right after the
# batch has been unpacked, so the whole training step that follows it never
# runs. Confirm whether the break is a deliberate debugging aid.
seq2context.train()
attn_context2trg.train()
for ix,batch in enumerate(train_iter):
        # swap the first two axes; the code below treats axis 0 as the batch
        # and axis 1 as the position in the sentence
        src = batch.src.values.transpose(0,1)
        src = reverse_sequence(src)
        trg = batch.trg.values.transpose(0,1)
        break
        # ---- unreachable from here on (see note at the top of the cell) ----
        # only batches with exactly BATCH_SIZE rows are processed
        if trg.shape[0] == BATCH_SIZE:
        
            seq2context_optimizer.zero_grad()
            attn_context2trg_optimizer.zero_grad()
        
            encoder_outputs, encoder_hidden = seq2context(src)
            loss = 0
            decoder_context = torch.zeros(BATCH_SIZE, context_size, device='cuda') # (BATCH_SIZE, context_size)
            # the decoder starts from the encoder's final hidden state
            decoder_hidden = encoder_hidden
            sentence = []
            # the token at position j is the input, the token at j+1 the target
            for j in range(trg.shape[1] - 1):
                word_input = trg[:,j]
                decoder_output, decoder_context, decoder_hidden, decoder_attention = attn_context2trg(word_input, decoder_context, decoder_hidden, encoder_outputs)
                #print(decoder_output.shape, trg[i,j+1].view(-1).shape)
                # criterion_train is not defined in the visible cells (presumably from `common`)
                loss += criterion_train(decoder_output, trg[:,j+1])
                
                # every 100th batch, record the argmax prediction for row 0
                if np.mod(ix,100) == 0:
                    sentence.extend([torch.argmax(decoder_output[0,:],dim=0)])
                
            loss.backward()
            seq2context_optimizer.step()
            attn_context2trg_optimizer.step()
        
            # every 500th batch, report the summed loss divided by BATCH_SIZE
            # NOTE(review): `e` is not assigned in this cell; it would have to
            # be left over from an earlier training-loop cell.
            if np.mod(ix,500) == 0:
                print('Epoch: {}, Batch: {}, Loss: {}'.format(e, ix, loss.cpu().detach()/BATCH_SIZE))
                #print([EN.vocab.itos[i] for i in sentence])
                #print([EN.vocab.itos[i] for i in trg[0,:]])
    

In [35]:
# Snapshot the current weights of both models, encoder first.
for module, checkpoint_path in (
    (seq2context, 'best_seq2seq_withattn_seq2context_splittrain_latest.pt'),
    (attn_context2trg, 'best_seq2seq_withattn_context2trg_splittrain_latest.pt'),
):
    torch.save(module.state_dict(), checkpoint_path)
        

In [231]:
# Check the shape of `next_words`.
# NOTE(review): relies on kernel state; the only visible assignment of `next_words` is in the cell labelled In [116].
next_words.shape
        

torch.Size([64])

In [304]:
# Re-check the shape of `next_words` (same expression as the previous cell, run later in the session).
next_words.shape

torch.Size([64])

In [291]:
# For every entry p of `words`, build BEAM_WIDTH tensors, each stacking p[b]
# with the matching candidate next_words[ix, b], and collect them per entry.
update = list()
for ix, p in enumerate(words):
    grown = [torch.stack([p[b], next_words[ix, b]]) for b in range(BEAM_WIDTH)]
    update.append(grown)

In [307]:
# Display the extended hypotheses built in the cell labelled In [291].
update

[[tensor([2, 0], device='cuda:0'), tensor([ 2, 14], device='cuda:0')],
 [tensor([2, 0], device='cuda:0'), tensor([ 2, 24], device='cuda:0')],
 [tensor([2, 0], device='cuda:0'), tensor([ 2, 27], device='cuda:0')],
 [tensor([2, 0], device='cuda:0'), tensor([ 2, 14], device='cuda:0')],
 [tensor([2, 0], device='cuda:0'), tensor([ 2, 14], device='cuda:0')],
 [tensor([2, 0], device='cuda:0'), tensor([ 2, 27], device='cuda:0')],
 [tensor([2, 0], device='cuda:0'), tensor([ 2, 42], device='cuda:0')],
 [tensor([ 2, 27], device='cuda:0'), tensor([ 2, 24], device='cuda:0')],
 [tensor([2, 0], device='cuda:0'), tensor([ 2, 24], device='cuda:0')],
 [tensor([2, 0], device='cuda:0'), tensor([ 2, 24], device='cuda:0')],
 [tensor([2, 0], device='cuda:0'), tensor([ 2, 52], device='cuda:0')],
 [tensor([ 2, 14], device='cuda:0'), tensor([ 2, 27], device='cuda:0')],
 [tensor([2, 0], device='cuda:0'), tensor([ 2, 27], device='cuda:0')],
 [tensor([2, 0], device='cuda:0'), tensor([ 2, 24], device='cuda:0')],
 [

tensor([0, 5], device='cuda:0')

In [184]:
# For each row, take the BEAM_WIDTH highest-scoring columns of p_words_running
# (descending argsort) and flag those whose column index is >= 2.
# NOTE(review): both the threshold 2 and the second dimension of `indexs` are
# hard-coded; presumably they stand for BEAM_WIDTH — confirm before changing BEAM_WIDTH.
# NOTE(review): `p_words_running` and `BEAM_WIDTH` are not assigned in any visible cell.
beam_indicator = torch.argsort(p_words_running,dim=1,descending=True)[:,:BEAM_WIDTH]>=2
indexs = torch.zeros(BATCH_SIZE,2,device='cuda')
for i in range(BATCH_SIZE):
    # row i holds i, shifted by BATCH_SIZE wherever the flag is set
    indexs[i,:] += i+(BATCH_SIZE*beam_indicator[i,:].float())

In [190]:
# Convert the indices to integers and flatten them column by column (all of
# column 0 first, then column 1), then use them to gather entries of the decoder state:
# along dim 1 for every tensor in decoder_hidden, along dim 0 for decoder_context.
# NOTE(review): this cell overwrites `indexs`, `decoder_hidden` and
# `decoder_context` in place, so re-running it without re-running the previous
# cell operates on the already-flattened indices.
indexs = indexs.long()
indexs = indexs.transpose(0,1).flatten()
decoder_hidden = tuple([torch.index_select(h,1,indexs) for h in decoder_hidden])
decoder_context = torch.index_select(decoder_context,0,indexs)

torch.Size([64, 500])

In [116]:
# Cut `args` into BEAM_WIDTH consecutive chunks of BATCH_SIZE rows each and
# place the chunks side by side along dim 1.
# NOTE(review): `args` is not assigned in any visible cell.
next_words = torch.cat([args[BATCH_SIZE*(b):BATCH_SIZE*(b+1),:] for b in range(BEAM_WIDTH)],dim=1)

In [117]:
# For each of the first BATCH_SIZE rows i, pick from decoder_output[i, :] the
# entries at the positions listed in next_words[i, :], and stack the per-row results.
p_words = torch.stack([torch.index_select(decoder_output[i,:],-1,next_words[i,:]) for i in range(BATCH_SIZE)])

In [225]:
# Empty scratch dictionary; it is filled by the cell labelled In [228].
dc = dict()

In [228]:
# Scratch experiment: seed dc with an empty list for each key 0..9, then
# overwrite every key with a random number in one update call.
items = list(range(10))
for i in items:
    dc[i] = []
x = np.random.rand(10)
dc.update(zip(items, x))
dc

{0: 0.12301008936547375,
 1: 0.1743470876801082,
 2: 0.024813375393648696,
 3: 0.4313473508338772,
 4: 0.7169799087697334,
 5: 0.2156059142994965,
 6: 0.07298992879668831,
 7: 0.8894616281344004,
 8: 0.3090598195280607,
 9: 0.6069338103608437}

## Submission

In [None]:
# load test set: one list of space-separated tokens per line of source_test.txt
sentences = []
for i, l in enumerate(open("source_test.txt"), 1):
    # Drop the line terminator before splitting; otherwise the last token of
    # every sentence keeps a trailing "\n". str.split(' ') splits on single
    # spaces exactly as re.split(' ', ...) did, without relying on `re`,
    # which no visible cell imports.
    sentences.append(l.rstrip('\n').split(' '))