# CS 287, Homework 3: Neural Machine Translation

In [147]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils import clip_grad_norm_

from common import *

import re
from torchtext import data, datasets
from namedtensor import ntorch, NamedTensor
from namedtensor.text import NamedField
import numpy as np

%reload_ext autoreload
%autoreload 2

In [2]:
# split raw data into tokens
import spacy
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

def tokenize_de(text):
    return [tok.text for tok in spacy_de.tokenizer(text)]

def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

# add beginning-of-sentence and end-of-sentence tokens to target
BOS_WORD = '<s>'
EOS_WORD = '</s>'
DE = NamedField(names=('srcSeqlen',), tokenize=tokenize_de)
EN = NamedField(names=('trgSeqlen',), tokenize=tokenize_en,
                init_token = BOS_WORD, eos_token = EOS_WORD) # only target needs BOS/EOS

# download dataset of 200K pairs of sentences
# start with MAXLEN = 20
MAX_LEN = 20
train, val, test = datasets.IWSLT.splits(exts=('.de', '.en'), fields=(DE, EN), 
                                         filter_pred=lambda x: len(vars(x)['src']) <= MAX_LEN and 
                                         len(vars(x)['trg']) <= MAX_LEN)

# build vocab, convert words to indices
MIN_FREQ = 5
DE.build_vocab(train.src, min_freq=MIN_FREQ)
EN.build_vocab(train.trg, min_freq=MIN_FREQ)
print("Size of German vocab", len(DE.vocab))
print("Size of English vocab", len(EN.vocab))
print(EN.vocab.stoi["<s>"], EN.vocab.stoi["</s>"])
print(EN.vocab.stoi["<pad>"], EN.vocab.stoi["<unk>"])

Size of German vocab 13353
Size of English vocab 11560
2 3
1 0


In [3]:
# split data into batches
BATCH_SIZE = 32
device = torch.device('cuda:0')
train_iter, val_iter = data.BucketIterator.splits((train, val), batch_size=BATCH_SIZE, device=device,
                                                  repeat=False, sort_key=lambda x: len(x.src))

In [148]:
context_size = 1000
num_layers = 4

#BEAM_WIDTH = 100
#max_len = 3

attn_seq2context = SequenceModel(len(DE.vocab),context_size,num_layers=num_layers)
state_dict = torch.load('best_seq2seq_withattn_seq2context_big_network_no_unk.pt')
attn_seq2context.load_state_dict(state_dict)
attn_seq2context = attn_seq2context.cuda()

attn_context2trg = attn_RNNet_batched(input_size=len(EN.vocab),hidden_size=context_size,num_layers=num_layers)
state_dict = torch.load('best_seq2seq_withattn_context2trg_big_network_no_unk.pt') 
attn_context2trg.load_state_dict(state_dict)
attn_context2trg = attn_context2trg.cuda()

In [13]:
# load test set
sentences = []
for i, l in enumerate(open("source_test.txt")):
    sentences.append(re.split(' ', l))

max_sent_len = 0
for i in range(len(sentences)):
    sentences[i][-1] = '.'
    if len(sentences[i]) > max_sent_len:
        max_sent_len = len(sentences[i])

batch = torch.tensor([], device='cuda')
for b in range(len(sentences)):
    m = nn.ConstantPad1d((0, max_sent_len - len(sentences[b])), EN.vocab.stoi['<pad>'])
    src = m(torch.tensor([DE.vocab.stoi[i] for i in sentences[b]], device='cuda').unsqueeze(0)).float()
    batch = torch.cat((batch,src), dim=0)
batch_rev = reverse_sequence(batch)

In [27]:
sentences[10][-1]

'.'

In [17]:
#[DE.vocab.itos[i] for i in .long()]
batch_rev[10].long()

tensor([   1,    1,    1,    1,    1,    2,  343,   15,  182,   14,    0,  339,
        6466,    3,    0,  146,   62,    0, 1131,   40], device='cuda:0')

In [18]:
batch_rev_data = torch.utils.data.TensorDataset(batch_rev)
#for i in batch_rev_data_loader:
#    src = i[0]
#    break

In [19]:
BATCH_SIZE = 32
batch_rev_data_loader = torch.utils.data.DataLoader(batch_rev_data, batch_size=BATCH_SIZE, shuffle=False)
#BATCH_SIZE = src.shape[0]

In [18]:
p_words_running.shape

torch.Size([50, 2500])

## Submission

In [51]:
outputs = beam_search(src, attn_seq2context, attn_context2trg, BEAM_WIDTH = 8, BATCH_SIZE=BATCH_SIZE, max_len=3,EN=EN)
preds = torch.stack([i[:BATCH_SIZE,:] for i in outputs]).transpose(0,1).transpose(1,2)

In [45]:
' '.join(ints_to_sentences(list_of_phrases))

"I|was|an I|was|a I|was|going I|was|having I|was|talking <unk>|'ve|watching I|was|in I|was|looking I|was|running I|was|writing I|do|responsible I|miss|what I|was|wearing I|see|on A|'s|there I|was|supposed I|did|showing I|'m|doing I|was|willing I|is|taking I|was|holding And|want|putting I|think|entering I|just|teaching I|was|saying I|'m|the I|'ll|making I|miss|<unk> I|'m|able I|guess|wondering I|was|flying I|always|giving I|<unk>|losing I|was|just I|first|so I|have|trying I|was|creating I|was|reading I|would|like I|'d|standing I|get|seeing I|wrote|curious I|was|using I|call|playing I|live|building And|feel|leaving I|was|experiencing I|'m|thinking And|want|several I|was|here I|do|one I|was|hanging <unk>|'ve|then I|am|eating I|always|launching I|'m|, I|read|asking I|was|leading One|felt|to I|was|somebody And|feel|listening Well|go|this I|<unk>|sending I|negotiate|actually I|was|following <unk>|'ve|calling <unk>|like|being Voice|kind|offering I|do|glad I|wrote|that I|was|my I|look|some I|'

In [23]:
#preds = " ".join("|".join([EN.vocab.itos[j] for j in top_s[0][0][1:]]) for )
#preds
EN.vocab.stoi['thought']

207

In [28]:
# for kaggle
line_counter = 0
with open("pred_kaggle_best_model_no-split_big.txt", "w+") as f:
    f.write("Id,Predicted\n")
    for b in batch_rev_data_loader:
        src = b[0].long()
        top_s = beam_search(src, attn_seq2context, attn_context2trg, BEAM_WIDTH=BEAM_WIDTH, BATCH_SIZE=BATCH_SIZE, max_len=max_len, context_size=context_size, EN=EN)
        lists_per_sample = [torch.stack(top_s[i])[:,1:4] for i in range(BATCH_SIZE)] 
        for i in range(BATCH_SIZE):
            #temp = pd.DataFrame(torch.stack(top_s[i])[:,1:4].cpu().numpy())
            #list_of_phrases = torch.tensor(temp.drop_duplicates().values)[:100,]
            list_of_phrases = lists_per_sample[i]
            f.write("%d,%s"%(line_counter,escape(' '.join(ints_to_sentences(list_of_phrases,EN)))))
            f.write('\n')
            line_counter+=1

In [30]:
import pandas as pd
temp = pd.DataFrame(torch.stack(top_s[0])[:,1:4].cpu().numpy())
#temp.shape
torch.tensor(temp.drop_duplicates().values)[:100,].shape

torch.Size([87, 3])

In [None]:
# for bleu
for i, l in enumerate(open("source_test.txt")):
    stop_idx = torch.tensor([max_len], device='cuda') if (top_s[i][0] == 3).nonzero().size()[0]==0 else (top_s[i][0] == 3).nonzero()
    f.write("%s\n"%(" ".join([EN.vocab.itos[j] for j in top_s[i][0][1:stop_idx]])))

In [60]:
p_words_running = torch.stack([update_p[:,b].repeat(1,BEAM_WIDTH) for b in range(BEAM_WIDTH)]).view(BEAM_WIDTH**2,BATCH_SIZE).transpose(0,1)

In [74]:
#max_len=20
line_counter = 0
with open("pred_bleu_update.txt", "w+") as f:
    for b in batch_rev_data_loader:     
        src = b[0].long()
        top_s = beam_search(src, attn_seq2context, attn_context2trg, BEAM_WIDTH = 8, BATCH_SIZE=BATCH_SIZE, max_len=max_len,context_size=context_size,EN=EN)
        for i in range(BATCH_SIZE):
            stop_idx = torch.min(torch.tensor([max_len], device='cuda') if (top_s[i][0] == 3).nonzero().size()[0]==0 else (top_s[i][0] == 3).nonzero())
            f.write("%s\n"%(" ".join([EN.vocab.itos[j] for j in top_s[i][0][1:stop_idx]])))

In [94]:
top_s[0][0]

torch.Size([21])

In [98]:
decoder_output

torch.Size([500, 11560])

In [29]:
batch_rev.shape

torch.Size([800, 20])

In [50]:
BATCH_SIZE = 1
context_size = 500
#src = batch_rev.long()

In [98]:
for i in range(batch_rev.shape[0]):
    src = batch_rev[i].long().unsqueeze(0)
    decoder_context = torch.zeros(BATCH_SIZE, context_size, device='cuda') # 32 x 500
    encoder_outputs, encoder_hidden = attn_seq2context(src)    
    decoder_hidden = encoder_hidden

    word_input_0 = torch.zeros(BATCH_SIZE, device='cuda') + EN.vocab.stoi['<s>']
    decoder_output, decoder_context, decoder_hidden, decoder_attention = attn_context2trg(word_input_0.long(), decoder_context, decoder_hidden, encoder_outputs)

    word_input_0 = torch.argsort(decoder_output, dim=1, descending=True)[:,:10]
    probs = torch.stack([torch.index_select(decoder_output[b,:], 0, word_input_0[b,:]) for b in range(BATCH_SIZE)])
    break

In [59]:
[EN.vocab.itos[i] for i in word_input.squeeze().long()]

['When', 'And', 'I', 'As', 'If', 'In', 'So', 'Now', 'when', 'Because']

In [99]:
decoder_hidden = tuple([h.repeat(1,10,1) for h in decoder_hidden])
decoder_context = decoder_context.repeat(10,1)

In [100]:
encoder_outputs = encoder_outputs.repeat(10,1,1)

In [101]:
decoder_output, decoder_context, decoder_hidden, decoder_attention = attn_context2trg(word_input_0.squeeze(0).long(), decoder_context, decoder_hidden, encoder_outputs)

In [103]:
word_input_1 = torch.argsort(decoder_output, dim=1, descending=True)[:,:10] # BEAM1 x BEAM2
temp = torch.stack([torch.index_select(decoder_output[b,:], 0, word_input_1[b,:]) for b in range(10)]) # BEAM1 x BEAM2
probs = probs.transpose(0,1).repeat(1,10) + temp # BEAM1 x BEAM2

In [106]:
EN.vocab.itos[word_input_0.squeeze()[0].long()], [EN.vocab.itos[word_input_1[0,i].long()] for i in range(10)]

('When', ['I', 'my', 'me', 'he', 'in', 'you', 'most', 'we', 'it', ','])

In [113]:
word_input_2 = torch.zeros((10,10), device='cuda')
prob_2 = torch.zeros((10,10), device='cuda')
for i in range(10):
    word_input = word_input_1[i,:]
    decoder_output, decoder_context_temp, decoder_hidden_temp, decoder_attention_temp = attn_context2trg(word_input.long(), decoder_context, decoder_hidden, encoder_outputs)
    word_input_2[:,i] = torch.argsort(decoder_output, dim=1, descending=True)[:,0]
    prob_2[:,i], _ = torch.max(decoder_output, dim=1)

In [136]:
final_prob = probs + prob_2
final_prob[1,1]

tensor(15.2501, device='cuda:0', grad_fn=<SelectBackward>)

In [118]:
word = word_input_0.transpose(0,1).repeat(1,10)

In [119]:
word.shape, word_input_0.shape, word_input_1.shape, word_input_2.shape

(torch.Size([10, 10]),
 torch.Size([1, 10]),
 torch.Size([10, 10]),
 torch.Size([10, 10]))

In [126]:
final_words = torch.stack((word.long(),word_input_1.long(),word_input_2.long())).view(3,100).transpose(0,1)

In [135]:
[EN.vocab.itos[i] for i in final_words[11]]

['And', 'I', 'was']

In [133]:
EN.vocab.itos[word[1,1].long()],EN.vocab.itos[word_input_1[1,1].long()],EN.vocab.itos[word_input_2[1,1].long()]

('And', 'I', 'was')

In [138]:
fp = final_prob.view(1,100)
fp[0,11]

tensor(15.2501, device='cuda:0', grad_fn=<SelectBackward>)

In [144]:
with open("pred_kaggle_best_model_no-split_big.txt", "w+") as f:
    list_of_phrases = final_words[torch.argsort(fp.squeeze(), descending=True),:]
    f.write("%d,%s"%(line_counter,escape(' '.join(ints_to_sentences(list_of_phrases,EN)))))
    f.write('\n')

In [143]:
[EN.vocab.itos[i] for i in final_words[2,:]]

['When', 'me', 'in']

In [149]:
BATCH_SIZE = 1
context_size = 1000

with open("pred_kaggle_best_model_no-split_large.txt", "w+") as f:
    f.write("Id,Predicted\n")
    for i in range(batch_rev.shape[0]):
        src = batch_rev[i].long().unsqueeze(0)
        decoder_context = torch.zeros(BATCH_SIZE, context_size, device='cuda') # 32 x 500
        encoder_outputs, encoder_hidden = attn_seq2context(src)    
        decoder_hidden = encoder_hidden

        word_input_0 = torch.zeros(BATCH_SIZE, device='cuda') + EN.vocab.stoi['<s>']
        decoder_output, decoder_context, decoder_hidden, decoder_attention = attn_context2trg(word_input_0.long(), decoder_context, decoder_hidden, encoder_outputs)

        word_input_0 = torch.argsort(decoder_output, dim=1, descending=True)[:,:10]
        probs = torch.stack([torch.index_select(decoder_output[b,:], 0, word_input_0[b,:]) for b in range(BATCH_SIZE)])

        decoder_hidden = tuple([h.repeat(1,10,1) for h in decoder_hidden])
        decoder_context = decoder_context.repeat(10,1)
        encoder_outputs = encoder_outputs.repeat(10,1,1)
        decoder_output, decoder_context, decoder_hidden, decoder_attention = attn_context2trg(word_input_0.squeeze(0).long(), decoder_context, decoder_hidden, encoder_outputs)

        word_input_1 = torch.argsort(decoder_output, dim=1, descending=True)[:,:10] # BEAM1 x BEAM2
        temp = torch.stack([torch.index_select(decoder_output[b,:], 0, word_input_1[b,:]) for b in range(10)]) # BEAM1 x BEAM2
        probs = probs.transpose(0,1).repeat(1,10) + temp # BEAM1 x BEAM2

        word_input_2 = torch.zeros((10,10), device='cuda')
        prob_2 = torch.zeros((10,10), device='cuda')
        for j in range(10):
            word_input = word_input_1[j,:]
            decoder_output, decoder_context_temp, decoder_hidden_temp, decoder_attention_temp = attn_context2trg(word_input.long(), decoder_context, decoder_hidden, encoder_outputs)
            word_input_2[:,j] = torch.argsort(decoder_output, dim=1, descending=True)[:,0]
            prob_2[:,j], _ = torch.max(decoder_output, dim=1)

        final_prob = probs + prob_2
        fp = final_prob.view(1,100)

        word = word_input_0.transpose(0,1).repeat(1,10)
        final_words = torch.stack((word.long(),word_input_1.long(),word_input_2.long())).view(3,100).transpose(0,1)

        list_of_phrases = final_words[torch.argsort(fp.squeeze(), descending=True),:]
        f.write("%d,%s"%(i,escape(' '.join(ints_to_sentences(list_of_phrases,EN)))))
        f.write('\n')