In [0]:
# https://github.com/pytorch/text/blob/master/test/translation.py
# https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html
# https://graviraja.github.io/seqtoseqimp/#

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext import data
from torchtext import datasets
import re
import spacy

In [0]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
# Echo the selected device so the notebook records which backend was used.
device

device(type='cuda')

In [4]:
# Download spaCy's small German pipeline; its tokenizer is used by tokenize_de.
!python -m spacy download de_core_news_sm

Collecting de_core_news_sm==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz (14.9MB)
[K     |████████████████████████████████| 14.9MB 1.7MB/s 
Building wheels for collected packages: de-core-news-sm
  Building wheel for de-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for de-core-news-sm: filename=de_core_news_sm-2.2.5-cp36-none-any.whl size=14907056 sha256=be78d6a2e6d57df26fcc7e447f4db83a61231b3496324f7be22e4170437f78ca
  Stored in directory: /tmp/pip-ephem-wheel-cache-h3kpx_tx/wheels/ba/3f/ed/d4aa8e45e7191b7f32db4bfad565e7da1edbf05c916ca7a1ca
Successfully built de-core-news-sm
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')


In [5]:
# https://stackoverflow.com/questions/56927602/unable-to-load-the-spacy-model-en-core-web-lg-on-google-colab
# https://spacy.io/usage/models

# NOTE(review): presumably the Colab workaround from the first link above, so that the
# model downloaded in the previous cell can be imported as a package — confirm it is still needed.
!pip install de_core_news_sm



In [0]:
# Load the German pipeline through its own package; tokenize_de uses spacy_de.tokenizer.
import de_core_news_sm
spacy_de = de_core_news_sm.load()

In [7]:
# NOTE(review): no `spacy download en_core_web_sm` step is visible in this notebook, so this
# presumably relies on the English model already being present in the environment — confirm.
!pip install en_core_web_sm



In [0]:
# Load the English pipeline through its own package; tokenize_en uses spacy_en.tokenizer.
import en_core_web_sm
spacy_en = en_core_web_sm.load()

In [0]:
# Matches a single <url>...</url> span. The lazy `.*?` stops at the first closing tag,
# so two URLs on one line are replaced separately instead of swallowing the text between them.
url = re.compile(r'(<url>.*?</url>)')

In [0]:
def tokenize_de(text):
    """Split German text into token strings, masking <url>...</url> spans as '@URL@' first."""
    masked = url.sub('@URL@', text)
    return [token.text for token in spacy_de.tokenizer(masked)]


def tokenize_en(text):
    """Split English text into token strings, masking <url>...</url> spans as '@URL@' first."""
    masked = url.sub('@URL@', text)
    return [token.text for token in spacy_en.tokenizer(masked)]

In [0]:
from torchtext.data.utils import get_tokenizer

# One Field per language. Each tokenizes with the matching spaCy tokenizer and wraps
# every sentence in '<sos>' ... '<eos>' markers.
DE = data.Field(
    tokenize=tokenize_de,
    init_token='<sos>',
    eos_token='<eos>',
)
EN = data.Field(
    tokenize=tokenize_en,
    init_token='<sos>',
    eos_token='<eos>',
)

In [12]:
# Mount Google Drive inside the Colab VM (prompts for an authorization code).
from google.colab import drive
drive.mount('/content/drive')
# NOTE(review): `path` is not referenced by any cell visible in this notebook — confirm
# whether it is still needed.
path = '/content/drive/My Drive/data/data/translation/'

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [15]:

# Load the Multi30k train/validation/test splits; the '.de' side is processed by DE
# (stored as `src`) and the '.en' side by EN (stored as `trg`).
train_data, valid_data, test_data = datasets.Multi30k.splits(
    exts=('.de', '.en'),
    fields=(DE, EN),
)
print('Loaded data...')

downloading training.tar.gz


training.tar.gz: 100%|██████████| 1.21M/1.21M [00:01<00:00, 741kB/s] 


downloading validation.tar.gz


validation.tar.gz: 100%|██████████| 46.3k/46.3k [00:00<00:00, 219kB/s]


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|██████████| 66.2k/66.2k [00:00<00:00, 211kB/s]


Loaded data...


In [16]:
# Quick look at the dataset: field mapping, split sizes, and two tokenized examples.
print(train_data.fields)
for split in (train_data, valid_data):
    print(len(split))
for example_idx in (0, 100):
    print(vars(train_data[example_idx]))

{'src': <torchtext.data.field.Field object at 0x7f9b9c4932b0>, 'trg': <torchtext.data.field.Field object at 0x7f9b9c493320>}
29000
1014
{'src': ['Zwei', 'junge', 'weiße', 'Männer', 'sind', 'im', 'Freien', 'in', 'der', 'Nähe', 'vieler', 'Büsche', '.'], 'trg': ['Two', 'young', ',', 'White', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']}
{'src': ['Männliches', 'Kleinkind', 'in', 'einem', 'roten', 'Hut', ',', 'das', 'sich', 'an', 'einem', 'Geländer', 'festhält', '.'], 'trg': ['Toddler', 'boy', 'in', 'a', 'red', 'hat', 'holding', 'on', 'to', 'some', 'railings', '.']}


In [0]:
# Build the German (source) and English (target) vocabularies from the training split,
# passing min_freq=3 to each.
for lang_field, column in ((DE, train_data.src), (EN, train_data.trg)):
    lang_field.build_vocab(column, min_freq=3)

In [20]:
# One bucketed iterator per split, 32 examples per batch, tensors created on `device`.
train_iter, valid_iter, test_iter = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=32,
    device=device,
)

# For each language: the ten most frequent tokens, then the vocabulary size.
for lang_field in (DE, EN):
    print(lang_field.vocab.freqs.most_common(10))
    print(len(lang_field.vocab))

[('.', 28821), ('Ein', 13904), ('einem', 13697), ('in', 11830), (',', 8938), ('und', 8925), ('mit', 8838), ('auf', 8686), ('Mann', 7805), ('einer', 6750)]
5500
[('a', 31707), ('.', 27623), ('A', 17458), ('in', 14847), ('the', 9923), ('on', 8019), ('is', 7524), ('and', 7378), ('man', 7359), ('of', 6871)]
4727


In [21]:
# Pull one training batch to sanity-check the tensor layout
# (the recorded output is [26, 32] with batch_size=32, i.e. the batch is the second dimension).
batch = next(iter(train_iter))
print(batch.src.shape)

torch.Size([26, 32])


In [0]:
class EncoderRNN(nn.Module):
    """Single-layer GRU encoder over embedded source tokens.

    Args:
        vocab_size: number of entries in the embedding table.
        embedding_size: dimensionality of each token embedding.
        hidden_size: size of the GRU hidden state.
        device: device the module is moved to; inputs and the initial hidden
            state are placed on it as well.
    """

    def __init__(self, vocab_size, embedding_size, hidden_size, device):
        super(EncoderRNN, self).__init__()

        self.hidden_size = hidden_size

        self.embed = nn.Embedding(vocab_size, embedding_size)
        self.gru = nn.GRU(embedding_size, hidden_size)

        self.device = device
        self.to(self.device)

    def forward(self, input, hidden):
        """Encode a batch of token-index sequences.

        Args:
            input: token indices of shape [sentence_length, batch_size].
            hidden: initial GRU state of shape [num_layers, batch_size, hidden_dim].

        Returns:
            (output, hidden): output has shape [sentence_length, batch_size, hidden_dim],
            hidden has shape [num_layers, batch_size, hidden_dim].
        """
        input = input.to(self.device)

        # embedded is of shape [sentence_length, batch_size, embedding_size]
        embedded = self.embed(input)
        output, hidden = self.gru(embedded, hidden)

        return output, hidden

    def init_hidden(self, batch_size):
        """Return an all-zero initial hidden state of shape [1, batch_size, hidden_size]."""
        # Use the module's own device rather than a notebook-level global, so the state
        # always lives where the GRU weights do.
        return torch.zeros(1, batch_size, self.hidden_size, device=self.device)


In [0]:
class DecoderRNN(nn.Module):
    """Single-step GRU decoder that returns log-probabilities over the target vocabulary.

    Args:
        vocab_size: number of entries in the embedding table and size of the output layer.
        embedding_size: dimensionality of each token embedding.
        hidden_size: size of the GRU hidden state.
        device: device the module is moved to; inputs are placed on it as well.
    """

    def __init__(self, vocab_size, embedding_size, hidden_size, device):
        super().__init__()
        self.hidden_size = hidden_size
        self.device = device

        self.embed = nn.Embedding(vocab_size, embedding_size)
        self.gru = nn.GRU(embedding_size, hidden_size)
        self.out = nn.Linear(hidden_size, vocab_size)
        self.softmax = nn.LogSoftmax(dim=1)

        self.to(device)

    def forward(self, input, hidden):
        """Run one decoding step.

        Args:
            input: token indices of shape [batch_size] for the current step.
            hidden: GRU state carried over from the previous step (or the encoder).

        Returns:
            (log_probs, hidden): log_probs is the log-softmax (over dim 1) of the output
            layer applied to the GRU output; hidden is the updated GRU state.
        """
        # [batch_size] -> [1, batch_size]: a sequence of length one.
        step_tokens = input.unsqueeze(0).to(self.device)

        # [1, batch_size, embedding_dim], passed through ReLU before the GRU.
        activated = F.relu(self.embed(step_tokens))
        gru_out, hidden = self.gru(activated, hidden)

        # Drop the length-one sequence dimension before projecting.
        log_probs = self.softmax(self.out(gru_out.squeeze(0)))
        return log_probs, hidden


In [24]:
vocab_size_src = len(DE.vocab)
vocab_size_trg = len(EN.vocab)

embedding_size = 300
hidden_size = 128

enc = EncoderRNN(vocab_size_src, embedding_size, hidden_size, device)
dec = DecoderRNN(vocab_size_trg, embedding_size, hidden_size, device)
enc_optim = optim.Adam(enc.parameters())
dec_optim = optim.Adam(dec.parameters())

# Target positions holding '<pad>' are excluded from the loss.
pad_idx = EN.vocab.stoi['<pad>']
print('Pad index: ', pad_idx)
criterion = nn.NLLLoss(ignore_index=pad_idx)

epochs = 10
clip = 10  # max gradient norm, applied to the encoder and the decoder separately

epoch_losses = []  # mean per-batch training loss, one entry per epoch

for epoch in range(epochs):
    epoch_loss = 0

    for step, batch in enumerate(train_iter, start=1):
        # Clear gradients from the previous step before running the new forward pass.
        enc_optim.zero_grad()
        dec_optim.zero_grad()

        hidden = enc.init_hidden(batch.src.size(1))

        # Only the encoder's final hidden state is handed to the decoder.
        enc_out, hidden = enc(batch.src, hidden)

        max_len = batch.trg.size(0)
        batch_size = batch.trg.size(1)

        # store outputs; row 0 is never written and is skipped by the loss below
        outputs = torch.zeros(max_len, batch_size, vocab_size_trg, device=device)

        # The first target row is the '<sos>' token added by the EN field. After that,
        # the ground-truth token is always fed as the next decoder input (teacher forcing).
        dec_input = batch.trg[0, :]
        for i in range(1, max_len):
            output, hidden = dec(dec_input, hidden)
            outputs[i] = output
            dec_input = batch.trg[i]

        # Slice the existing tensor instead of re-wrapping it with torch.tensor(...),
        # which copy-constructs and triggers a PyTorch UserWarning.
        target = batch.trg[1:].to(device)
        loss = criterion(outputs[1:].reshape(-1, outputs.shape[2]), target.reshape(-1))
        loss.backward()

        nn.utils.clip_grad_norm_(enc.parameters(), clip)
        nn.utils.clip_grad_norm_(dec.parameters(), clip)

        enc_optim.step()
        dec_optim.step()

        epoch_loss += loss.item()
        # Running mean over the batches seen so far in this epoch.
        print('\rEpisode {} : Loss {:.3f}'.format(epoch, epoch_loss / step), end="")

    print('\rEpisode {} : Loss {:.3f}'.format(epoch, epoch_loss / len(train_iter)))
    print()

    epoch_losses.append(epoch_loss / len(train_iter))

Pad index:  1




Episode 0 : Loss 4.123

Episode 1 : Loss 3.342

Episode 2 : Loss 3.040

Episode 3 : Loss 2.852

Episode 4 : Loss 2.706

Episode 5 : Loss 2.586

Episode 6 : Loss 2.484

Episode 7 : Loss 2.395

Episode 8 : Loss 2.314

Episode 9 : Loss 2.242

