In [38]:
import random

import spacy
import torch
from torchtext.data import Field, BucketIterator
from torchtext.datasets import TranslationDataset, Multi30k

In [17]:
# Download the Multi30k dataset under the local '.data' directory; the call
# returns the dataset path, which is what the cell output shows.
Multi30k.download('.data')

'.data/multi30k/'

In [18]:
# Load the spaCy pipelines registered under the names 'de' and 'en'; only
# their tokenizers are used (see tokenize_de / tokenize_en).
# NOTE(review): these short names presumably resolve to the German and English
# models; newer spaCy releases may need the full package names -- confirm
# against the installed version.
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

In [19]:
def tokenize_de(text):
    """Tokenize ``text`` with the ``spacy_de`` tokenizer.

    Returns the token strings (each token's ``.text``) as a list, in order.
    """
    pieces = []
    for token in spacy_de.tokenizer(text):
        pieces.append(token.text)
    return pieces

def tokenize_en(text):
    """Tokenize ``text`` with the ``spacy_en`` tokenizer.

    Returns the token strings (each token's ``.text``) as a list, in order.
    """
    pieces = []
    for token in spacy_en.tokenizer(text):
        pieces.append(token.text)
    return pieces

In [44]:
# Source and target fields: each uses its own tokenizer function and is
# configured with '<sos>' / '<eos>' as the start and end marker tokens.
SRC = Field(tokenize=tokenize_de, init_token='<sos>', eos_token='<eos>')
TRG = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>')

In [45]:
# Build the train / validation / test datasets from the files under
# '.data/multi30k': the '.de' file is bound to the `src` attribute (SRC field)
# and the '.en' file to the `trg` attribute (TRG field).
train, valid, test = TranslationDataset.splits(
    path='.data/multi30k',
    exts=['.de', '.en'],
    fields=[('src', SRC), ('trg', TRG)],
    train='train',
    validation='val',
    test='test2016',
)

In [46]:
# Build each field's vocabulary from the training split only, so validation
# and test sentences do not contribute entries.
SRC.build_vocab(train.src)
TRG.build_vocab(train.trg)

In [47]:
# Number of sentence pairs per mini-batch.
BATCH_SIZE = 32

In [48]:
# One batch iterator per split, all using BATCH_SIZE; repeat=False is passed so
# that an iterator is not restarted automatically once exhausted.
# NOTE(review): no `device` is passed here, so batches are created on the
# iterator's default device -- confirm it matches the model's device.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train, valid, test), batch_size=BATCH_SIZE, repeat=False)

In [49]:
import torch.nn as nn

class Encoder(nn.Module):
    """GRU encoder: embeds source token ids and runs them through a GRU.

    Args:
        input_dim: size of the source vocabulary (number of embedding rows).
        emb_dim: dimensionality of the token embeddings.
        hid_dim: hidden size of the GRU.
        n_layers: number of stacked GRU layers.
        bidirectional: if True the GRU runs in both directions and ``forward``
            sums the two directions' outputs so the feature size stays
            ``hid_dim``.
        dropout: dropout probability, applied to the embeddings and passed to
            ``nn.GRU`` (which applies it between stacked layers).
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, bidirectional, dropout):
        super().__init__()

        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.bidirectional = bidirectional

        self.embedding = nn.Embedding(input_dim, emb_dim)

        self.rnn = nn.GRU(emb_dim, hid_dim, n_layers, bidirectional=bidirectional, dropout=dropout)

        # `self.dropout` is the dropout layer itself, not the float probability.
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, hidden=None):
        """Encode a batch of source sequences.

        Args:
            src: token ids, ``[sent len, batch size]``.
            hidden: optional initial GRU state; ``None`` lets the GRU start
                from zeros.

        Returns:
            ``(outputs, hidden)`` where ``outputs`` is
            ``[sent len, batch size, hid dim]`` (directions summed when
            bidirectional) and ``hidden`` is the GRU's final state,
            ``[n layers * num directions, batch size, hid dim]``.
        """

        #src = [sent len, batch size]

        embedded = self.dropout(self.embedding(src))

        #embedded = [sent len, batch size, emb dim]

        outputs, hidden = self.rnn(embedded, hidden)

        #outputs = [sent len, batch size, hid dim * num directions]
        #hidden = [n layers * num directions, batch size, hid dim]

        if self.bidirectional:
            # Sum the forward and backward halves of the feature axis. The
            # attribute is `hid_dim`; `hidden_size` is never defined on this
            # class and raised AttributeError here.
            outputs = (outputs[:, :, :self.hid_dim] +
                       outputs[:, :, self.hid_dim:])

        return outputs, hidden

In [50]:
# Encoder hyperparameters.
INPUT_DIM = len(SRC.vocab)  # source vocabulary size -> encoder embedding rows
EMB_DIM = 256  # token embedding size
HID_DIM = 512  # GRU hidden size
ENC_N_LAYERS = 2  # number of stacked GRU layers in the encoder
BIDIRECTIONAL = True  # run the encoder GRU in both directions
DROPOUT = 0.5  # dropout probability handed to the encoder

enc = Encoder(INPUT_DIM, EMB_DIM, HID_DIM, ENC_N_LAYERS, BIDIRECTIONAL, DROPOUT)

In [51]:
class Decoder(nn.Module):
    """Single-step GRU decoder: embeds one target token per sequence, advances
    the GRU by one step and projects the result to vocabulary logits.

    Args:
        output_dim: size of the target vocabulary (embedding rows and logits).
        emb_dim: dimensionality of the token embeddings.
        hid_dim: hidden size of the GRU.
        n_layers: number of stacked GRU layers.
        dropout: dropout probability applied to the embeddings.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.output_dim = output_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(output_dim, emb_dim)

        # forward() feeds only the embedded token to the GRU, so its input size
        # must be emb_dim; a larger input size makes every call fail with a
        # feature-size mismatch.
        self.rnn = nn.GRU(emb_dim, hid_dim, n_layers)

        self.out = nn.Linear(hid_dim, output_dim)

        # `self.dropout` is the dropout layer itself, not the float probability.
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs=None):
        """Run one decoding step.

        Args:
            input: current token ids, ``[1, bsz]`` (a 1-D ``[bsz]`` tensor is
                also accepted and treated as a single time step).
            hidden: previous GRU state, ``[n layers, bsz, hid dim]``.
            encoder_outputs: accepted so callers may pass the encoder outputs;
                not used by this decoder.

        Returns:
            ``(prediction, hidden)``: logits of shape ``[bsz, output dim]`` and
            the updated GRU state, which the caller feeds into the next step.
        """

        if input.dim() == 1:
            input = input.unsqueeze(0)

        #input = [1, bsz]

        embedded = self.dropout(self.embedding(input))

        #embedded = [1, bsz, emb dim]

        output, hidden = self.rnn(embedded, hidden)

        #output = [1, bsz, hid dim]
        #hidden = [n layers, bsz, hid dim]

        output = output.squeeze(0)

        # Return the new state as well: without it the caller cannot carry the
        # recurrence from one decoding step to the next.
        return self.out(output), hidden

In [52]:
# Decoder hyperparameters.
OUTPUT_DIM = len(TRG.vocab)  # target vocabulary size
EMB_DIM = 256
HID_DIM = 512
DEC_N_LAYERS = 1
DROPOUT = 0.5

# The decoder embeds and predicts *target* tokens, so it is sized by the target
# vocabulary (OUTPUT_DIM), not the source vocabulary.
dec = Decoder(OUTPUT_DIM, EMB_DIM, HID_DIM, DEC_N_LAYERS, DROPOUT)

In [53]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target one step at a time.

    Args:
        encoder: module called as ``encoder(src)`` returning
            ``(encoder_output, hidden)``.
        decoder: module called as ``decoder(input, hidden, encoder_output)``
            returning ``(output, hidden)``; must expose ``output_dim`` and
            ``n_layers``.
        device: device on which the output buffer and decoder inputs are placed.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, tf):
        """Encode ``src`` and decode step by step against ``trg``.

        Args:
            src: source token ids, ``[src len, batch size]``.
            trg: target token ids, ``[trg len, batch size]``; row 0 holds the
                ``<sos>`` tokens.
            tf: teacher-forcing ratio -- at each step the ground-truth token is
                fed as the next input with this probability, otherwise the
                decoder's own top prediction is fed.

        Returns:
            Tensor ``[trg len, batch size, vocab size]`` of decoder outputs;
            row 0 is never written and stays zero.
        """

        batch_size = src.shape[1]
        max_len = trg.shape[0]
        # Read the size from this model's own decoder, not from a global name.
        vocab_size = self.decoder.output_dim

        outputs = torch.zeros(max_len, batch_size, vocab_size).to(self.device)

        encoder_output, hidden = self.encoder(src)

        # Initial decoder state: the first `decoder.n_layers` entries of the
        # encoder's final state.
        # NOTE(review): for a bidirectional encoder these are the lowest layer's
        # per-direction states -- confirm this is the intended initialisation.
        hidden = hidden[:self.decoder.n_layers]
        input = trg[0, :]  # <sos> tokens

        for t in range(1, max_len):
            # The decoder consumes a single time step shaped [1, batch size].
            output, hidden = self.decoder(input.unsqueeze(0), hidden, encoder_output)
            outputs[t] = output
            is_teacher = random.random() < tf
            top1 = output.argmax(1)
            input = (trg[t] if is_teacher else top1).to(self.device)
        return outputs

In [54]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seq2Seq places its output buffer and decoder inputs on `device`, so the
# model's parameters must live there too; without `.to(device)` they stay on
# the CPU. Input batches must be on the same device as well.
model = Seq2Seq(enc, dec, device).to(device)

In [56]:
# Peek at a single training batch to sanity-check the tensor shapes; fetching
# it with next(iter(...)) avoids a loop that breaks after one pass and an
# unused enumerate index.
batch = next(iter(train_iterator))
src = batch.src
trg = batch.trg
print(src.shape)
print(trg.shape)

torch.Size([27, 32])
torch.Size([26, 32])
