# CS 287, Homework 3: Neural Machine Translation

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils import clip_grad_norm_

from common import *

import re
from torchtext import data, datasets
from namedtensor import ntorch, NamedTensor
from namedtensor.text import NamedField
import numpy as np

%reload_ext autoreload
%autoreload 2

In [2]:
# split raw data into tokens
# Load one spaCy pipeline per language; the tokenize_de / tokenize_en helpers
# below only use the `.tokenizer` attribute of each pipeline.
# NOTE(review): 'de' and 'en' look like spaCy shortcut model names -- presumably
# the matching models must already be installed/linked; confirm for the spaCy
# version in use.
import spacy
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

def tokenize_de(text):
    """Tokenize German `text` with the spaCy German tokenizer.

    Returns the list of token strings (the `.text` of every token), in order.
    """
    pieces = []
    for token in spacy_de.tokenizer(text):
        pieces.append(token.text)
    return pieces

def tokenize_en(text):
    """Tokenize English `text` with the spaCy English tokenizer.

    Returns the list of token strings (the `.text` of every token), in order.
    """
    pieces = []
    for token in spacy_en.tokenizer(text):
        pieces.append(token.text)
    return pieces

# Sentence-boundary markers. Only the target (English) field gets them.
BOS_WORD = '<s>'
EOS_WORD = '</s>'
DE = NamedField(names=('srcSeqlen',), tokenize=tokenize_de)
EN = NamedField(
    names=('trgSeqlen',),
    tokenize=tokenize_en,
    init_token=BOS_WORD,
    eos_token=EOS_WORD,
)

# Download the IWSLT German/English data (about 200K sentence pairs) and keep
# only the pairs where neither side is longer than MAX_LEN tokens.
MAX_LEN = 20
train, val, test = datasets.IWSLT.splits(
    exts=('.de', '.en'),
    fields=(DE, EN),
    filter_pred=lambda ex: not (len(ex.src) > MAX_LEN or len(ex.trg) > MAX_LEN),
)

# Build both vocabularies from the training split, dropping words seen fewer
# than MIN_FREQ times, so that words can be mapped to integer indices.
MIN_FREQ = 5
DE.build_vocab(train.src, min_freq=MIN_FREQ)
EN.build_vocab(train.trg, min_freq=MIN_FREQ)

# Report vocabulary sizes and the indices of the special English tokens.
print("Size of German vocab", len(DE.vocab))
print("Size of English vocab", len(EN.vocab))
print(*(EN.vocab.stoi[tok] for tok in ("<s>", "</s>")))
print(*(EN.vocab.stoi[tok] for tok in ("<pad>", "<unk>")))

Size of German vocab 13353
Size of English vocab 11560
2 3
1 0


In [3]:
# Bucketed mini-batch iterators over the training and validation splits;
# examples are sorted by source length.
BATCH_SIZE = 32
device = torch.device('cuda:0')
train_iter, val_iter = data.BucketIterator.splits(
    (train, val),
    batch_size=BATCH_SIZE,
    device=device,
    repeat=False,
    sort_key=lambda example: len(example.src),
)

In [40]:
# Rebuild the encoder/decoder pair and restore their saved weights.
# context_size is passed as the encoder's second positional argument and as the
# decoder's hidden_size; num_layers is shared by both networks.
context_size = 1000
num_layers = 4

# NOTE(review): the beam_search calls in later cells pass their own literal
# BEAM_WIDTH values rather than this constant -- confirm which one is intended.
BEAM_WIDTH = 10
max_len = 3

# Encoder over the German vocabulary.
# NOTE(review): SequenceModel is not defined in the visible cells -- presumably
# it comes from `from common import *`.
attn_seq2context = SequenceModel(len(DE.vocab),context_size,num_layers=num_layers)
state_dict = torch.load('best_seq2seq_withattn_seq2context_big_network_no_unk.pt')
attn_seq2context.load_state_dict(state_dict)
attn_seq2context = attn_seq2context.cuda()

# Decoder over the English vocabulary.
# NOTE(review): attn_RNNet_batched is likewise presumably provided by `common`.
attn_context2trg = attn_RNNet_batched(input_size=len(EN.vocab),hidden_size=context_size,num_layers=num_layers)
state_dict = torch.load('best_seq2seq_withattn_context2trg_big_network_no_unk.pt')
attn_context2trg.load_state_dict(state_dict)
attn_context2trg = attn_context2trg.cuda()

In [5]:
# load test set
# Each line of source_test.txt is one source sentence; tokens are separated by
# single spaces.
with open("source_test.txt") as test_file:
    # Strip the line terminator before splitting. Otherwise the last token of
    # every sentence keeps its "\n" and can never match a vocabulary entry.
    sentences = [line.rstrip("\r\n").split(' ') for line in test_file]

# Length of the longest sentence (0 when the file is empty).
max_sent_len = max((len(sentence) for sentence in sentences), default=0)

# Map tokens to German vocabulary indices and right-pad every sentence to
# max_sent_len with the *source* vocabulary's pad index.
src_pad_idx = DE.vocab.stoi['<pad>']
padded_ids = [
    [DE.vocab.stoi[token] for token in sentence]
    + [src_pad_idx] * (max_sent_len - len(sentence))
    for sentence in sentences
]

# Build the whole (num_sentences, max_sent_len) batch in one go instead of
# growing it with torch.cat per sentence. It stays float, as before; the
# decoding cells cast each mini-batch with .long() before use.
batch = torch.tensor(padded_ids, device='cuda').float()
batch_rev = reverse_sequence(batch)

In [10]:
# Wrap the reversed test batch in a DataLoader so it can be decoded BATCH_SIZE
# sentences at a time. The loader is created *before* it is iterated, so this
# cell also works on a fresh top-to-bottom run.
BATCH_SIZE = 50
batch_rev_data = torch.utils.data.TensorDataset(batch_rev)
# shuffle=False keeps sentences in file order, which the prediction writers
# rely on when they number / emit rows sequentially.
batch_rev_data_loader = torch.utils.data.DataLoader(batch_rev_data, batch_size=BATCH_SIZE, shuffle=False)

# Keep the first mini-batch around for interactive experiments. Each loader
# item is a 1-tuple because the dataset wraps a single tensor.
src = next(iter(batch_rev_data_loader))[0]

In [18]:
# Shape inspection left over from debugging.
# NOTE(review): on a top-to-bottom run p_words_running is not assigned yet at
# this point -- the only visible assignment is in a later cell. Confirm whether
# this cell is still needed.
p_words_running.shape

torch.Size([50, 2500])

## Submission

In [51]:
# Run beam search on the single batch held in `src` (beam width 8, max_len 3).
outputs = beam_search(
    src,
    attn_seq2context,
    attn_context2trg,
    BEAM_WIDTH=8,
    BATCH_SIZE=BATCH_SIZE,
    max_len=3,
    EN=EN,
)
# Keep the first BATCH_SIZE rows of every element of `outputs`, stack them on a
# new leading axis, then move that axis to the end: (a, b, c) -> (b, c, a).
preds = torch.stack([step[:BATCH_SIZE, :] for step in outputs]).permute(1, 2, 0)

torch.Size([100, 11560, 3])

In [45]:
# Preview the space-joined string produced by ints_to_sentences for one set of
# phrases.
# NOTE(review): list_of_phrases is only assigned in a later cell (the Kaggle
# writer), and that cell passes EN as a second argument to ints_to_sentences --
# confirm this one-argument call still matches the helper's signature.
' '.join(ints_to_sentences(list_of_phrases))

"I|was|an I|was|a I|was|going I|was|having I|was|talking <unk>|'ve|watching I|was|in I|was|looking I|was|running I|was|writing I|do|responsible I|miss|what I|was|wearing I|see|on A|'s|there I|was|supposed I|did|showing I|'m|doing I|was|willing I|is|taking I|was|holding And|want|putting I|think|entering I|just|teaching I|was|saying I|'m|the I|'ll|making I|miss|<unk> I|'m|able I|guess|wondering I|was|flying I|always|giving I|<unk>|losing I|was|just I|first|so I|have|trying I|was|creating I|was|reading I|would|like I|'d|standing I|get|seeing I|wrote|curious I|was|using I|call|playing I|live|building And|feel|leaving I|was|experiencing I|'m|thinking And|want|several I|was|here I|do|one I|was|hanging <unk>|'ve|then I|am|eating I|always|launching I|'m|, I|read|asking I|was|leading One|felt|to I|was|somebody And|feel|listening Well|go|this I|<unk>|sending I|negotiate|actually I|was|following <unk>|'ve|calling <unk>|like|being Voice|kind|offering I|do|glad I|wrote|that I|was|my I|look|some I|'

In [24]:
# Render the first hypothesis of the first sentence as "w1|w2|...", skipping
# the entry at index 0 (presumably the <s> start token -- confirm).
# The previous version ended in an incomplete generator clause (`for )`) and
# did not parse.
preds = "|".join(EN.vocab.itos[j] for j in top_s[0][0][1:])
preds

'<unk>|I|<unk>'

In [67]:
# for kaggle
def escape(l):
    """Make `l` safe for one CSV cell of the submission file.

    Every double quote becomes the literal text <quote> and every comma becomes
    <comma>; all other characters are left untouched.
    """
    replacements = {ord("\""): "<quote>", ord(","): "<comma>"}
    return l.translate(replacements)

# for kaggle
# Write the submission CSV: a header row, then one "<row id>,<phrases>" row per
# test sentence, in loader order.
line_counter = 0
with open("pred_kaggle_best_model_no-split_large_no_unk_fix_bsearch.txt", "w") as f:
    f.write("Id,Predicted\n")
    for b in batch_rev_data_loader:
        src = b[0].long()
        # The final mini-batch can hold fewer than BATCH_SIZE rows (DataLoader
        # keeps it unless drop_last=True), so size everything by the rows that
        # are actually present. For full batches this equals BATCH_SIZE.
        n_rows = src.shape[0]
        top_s = beam_search(src, attn_seq2context, attn_context2trg, BEAM_WIDTH = 100, BATCH_SIZE=n_rows, max_len=3,context_size=context_size,EN=EN)
        for i in range(n_rows):
            # Stack the hypotheses of sentence i and drop column 0 (the first
            # entry of every hypothesis).
            list_of_phrases = torch.stack(top_s[i])[:,1:]
            f.write("%d,%s\n" % (line_counter, escape(' '.join(ints_to_sentences(list_of_phrases,EN)))))
            line_counter += 1

In [None]:
# for bleu
# NOTE(review): unexecuted draft that cannot run as written. The loop is
# indented as though it sat inside an enclosing block that is not part of this
# cell, and `f` is only bound inside the `with open(...)` blocks of other
# cells. It also indexes top_s by line number of the whole file, while top_s is
# produced per mini-batch elsewhere. Confirm whether this cell should be removed.
    for i, l in enumerate(open("source_test.txt")):
        # Position(s) where the first hypothesis equals 3 (the index printed for
        # "</s>" when the vocabulary is built); falls back to max_len when there
        # is no such position.
        stop_idx = torch.tensor([max_len], device='cuda') if (top_s[i][0] == 3).nonzero().size()[0]==0 else (top_s[i][0] == 3).nonzero()
        # Emit the words from index 1 up to (not including) stop_idx.
        f.write("%s\n"%(" ".join([EN.vocab.itos[j] for j in top_s[i][0][1:stop_idx]])))

In [60]:
# Scratch computation: for every beam index b, tile column b of update_p
# BEAM_WIDTH times, stack the results, view them as (BEAM_WIDTH**2, BATCH_SIZE)
# and transpose, giving a (BATCH_SIZE, BEAM_WIDTH**2) tensor.
# NOTE(review): update_p is not assigned in any visible cell -- presumably this
# line was copied out of beam_search for debugging; confirm whether it is still
# needed.
p_words_running = torch.stack([update_p[:,b].repeat(1,BEAM_WIDTH) for b in range(BEAM_WIDTH)]).view(BEAM_WIDTH**2,BATCH_SIZE).transpose(0,1)


In [100]:
# Decode the whole test set with beam width 1 and write one plain-text
# translation per line to pred_bleu_update.txt.
max_len=20
# Look the end-of-sentence index up instead of hard-coding it.
eos_idx = EN.vocab.stoi[EOS_WORD]
with open("pred_bleu_update.txt", "w") as f:
    for b in batch_rev_data_loader:
        src = b[0].long()
        # The final mini-batch can hold fewer than BATCH_SIZE rows, so size the
        # search and the loop by the rows actually present.
        n_rows = src.shape[0]
        top_s = beam_search(src, attn_seq2context, attn_context2trg, BEAM_WIDTH = 1, BATCH_SIZE=n_rows, max_len=max_len,context_size=context_size,EN=EN)
        for i in range(n_rows):
            best = top_s[i][0]
            eos_positions = (best == eos_idx).nonzero()
            # Stop at the first </s>. Without one, keep every generated token
            # (the old fallback of max_len cut the last token whenever the
            # sequence holds max_len + 1 entries).
            stop_idx = int(eos_positions.min()) if eos_positions.numel() > 0 else len(best)
            # Index 0 is skipped, as in the other output cells.
            f.write("%s\n" % " ".join(EN.vocab.itos[j] for j in best[1:stop_idx].tolist()))

In [94]:
# Inspect entry [0][0] of the most recent beam_search result.
# NOTE(review): the recorded output is a torch.Size, so this cell was
# presumably last run as `top_s[0][0].shape` -- confirm and keep code and output
# in sync.
top_s[0][0]

torch.Size([21])

In [98]:
# Debug inspection.
# NOTE(review): decoder_output is not assigned in any visible cell -- presumably
# a variable copied out of beam_search while debugging; this cell would fail on
# a fresh kernel. Confirm whether it can be removed.
decoder_output

torch.Size([500, 11560])

In [84]:
# Debug inspection of a tensor shape.
# NOTE(review): next_words is not assigned in any visible cell -- presumably a
# variable copied out of beam_search while debugging; this cell would fail on a
# fresh kernel. Confirm whether it can be removed.
next_words.shape

torch.Size([50, 100])

In [99]:
# NOTE(review): leftover debug cell. `mask` is never assigned in the visible
# notebook and the recorded output is a NameError, so this cell breaks
# Restart & Run All -- candidate for removal.
mask

NameError: name 'mask' is not defined