## Just do two things:

1. Look at the dataset and make sure it's not crazy
2. make sure you can train on a single target, eg nothing's broken

In [25]:
import torch
import torchtext
import spacy
import time

In [2]:
from model import Seq2SeqA

In [3]:
from torchtext.datasets import Multi30k, TranslationDataset
from torchtext.data import Field, BucketIterator, Iterator

Usage cribbed from [ben trevett](https://github.com/bentrevett/pytorch-seq2seq/blob/master/3%20-%20Neural%20Machine%20Translation%20by%20Jointly%20Learning%20to%20Align%20and%20Translate.ipynb)

Defines tokenizers for each language

In [4]:
spacy_en = spacy.load('en')
spacy_de = spacy.load('de')

In [5]:
def tokenize_en(sent):
    return [tok.text for tok in spacy_en.tokenizer(sent)]
def tokenize_de(sent):
    return [tok.text for tok in spacy_de.tokenizer(sent)]

This specifies how the torchtext datasets are loaded

In [6]:
init_token = '<sos>'
eos_token = '<eos>'
pad_token = '<pad>'
unk_token = '<unk>'

src = Field(tokenize = tokenize_de, 
            init_token=init_token,
            eos_token=eos_token,
            pad_token=pad_token,
            unk_token=unk_token,
            include_lengths=True,
            lower = True)

trg = Field(tokenize = tokenize_en, 
            init_token=init_token,
            eos_token=eos_token,
            pad_token=pad_token,
            unk_token=unk_token,
            lower = True)

In [7]:
train_data, val_data, test_data = Multi30k.splits(exts = ('.de', '.en'), 
                                                    fields = (src, trg))

In [8]:
src.build_vocab(train_data, min_freq = 2)
trg.build_vocab(train_data, min_freq = 2)

In [9]:
print(len(src.vocab), len(trg.vocab))

7855 5893


In [10]:
print(trg.unk_token, trg.pad_token, trg.init_token, trg.eos_token)

<unk> <pad> <sos> <eos>


In [11]:
pad_idx = trg.vocab.stoi[trg.pad_token]
unk_idx = trg.vocab.stoi[trg.unk_token]
sos_idx = trg.vocab.stoi[trg.init_token]
eos_idx = trg.vocab.stoi[trg.eos_token]
print(f"Pad index: {pad_idx}, unk index: {unk_idx}")
print(f"sos index: {sos_idx}, eos index: {eos_idx}")

Pad index: 1, unk index: 0
sos index: 2, eos index: 3


In [12]:
device = torch.device('cpu')

The iterators provide source and target long-tensors in the .src and .trg attributes

In [13]:
batch_size = 5

train_iterator, val_iterator, test_iterator = Iterator.splits(
    (train_data, val_data, test_data), 
    sort_within_batch = True, 
    sort_key = lambda x : len(x.src),
    batch_size = batch_size,
    device = device)

### Building the model

In [26]:
# number of tokens in the input vocabulary
src_vocab_size = len(src.vocab)
# and the ouput
trg_vocab_size = len(trg.vocab)
# hidden dimension, shared
hidden_dim = 256
# word embedding dimension
embedding_dim = 200
# dimension of the attention vector
attention_dim = hidden_dim
# dimension of the single output hidden layer
output_hidden_dim = hidden_dim

model_params = {'src_vocab_size': src_vocab_size, 'trg_vocab_size': trg_vocab_size, 
                'hidden_dim': hidden_dim, 'embedding_dim': embedding_dim, 
                 'attention_dim': attention_dim, 'output_hidden_dim': output_hidden_dim, 
                  'pad_token': pad_idx}

In [30]:
model = Seq2SeqA(model_params)

In [31]:
nparam = sum([p.numel() for p in model.parameters() if p.requires_grad])
print(f"{nparam} trainable parameters")

6462624 trainable parameters


In [32]:
def decode(tokens, which='src'):
    v = src.vocab if which == 'src' else trg.vocab
    return [v.itos[t] for t in tokens]

#### check that padding makes sense
Note - if you use the bucket iterator, batches are chosen to have the same size when possible

In [33]:
batch = next(iter(train_iterator))
print(batch.src)

(tensor([[   2,    2,    2,    2,    2],
        [   5,   26,   18,   43,    8],
        [  13,   73,   80,   30,   16],
        [  20,   53,    7,   11,  404],
        [ 255,   10,  237,  410,   17],
        [ 183,  185, 2799,  377, 2063],
        [  28,   23,   57,  956,    4],
        [  14,    7,  215,  499,    3],
        [  16,   14,    4,    4,    1],
        [   9,    0,    3,    3,    1],
        [  17,    4,    1,    1,    1],
        [  31,    3,    1,    1,    1],
        [   4,    1,    1,    1,    1],
        [   3,    1,    1,    1,    1]]), tensor([14, 12, 10, 10,  8]))


In [34]:
print(' '.join(decode(batch.src[0][:, 0], which='src')))
print(' '.join(decode(batch.trg[:, 0], which='trg')))

<sos> ein mann im anzug spricht zu einer frau , die sitzt . <eos>
<sos> a man in a suit speaking to a seated woman . <eos>


In [35]:
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)

In [36]:
N = 1000
losses = []
for i in range(N):
    t0 = time.time()
    loss = model.loss(batch.src[0], batch.src[1], batch.trg)
    dt = time.time() - t0
    model.zero_grad()
    loss.backward()
    optimizer.step()
    losses.append(loss.detach().item())
    if i % 10 == 0:
        print(f"step {i} in {dt:.3f} sec")

step 0 in 1.691 sec
step 10 in 0.684 sec
step 20 in 0.609 sec


KeyboardInterrupt: 