In [5]:
import spacy
from torchtext.legacy.datasets import Multi30k
from torchtext.legacy.data import Field, BucketIterator

In [10]:
# Load the small English and German spaCy pipelines; the tokenize_* helpers
# rely on the tokenizer attached to each of them.
spacy_eng = spacy.load("en_core_web_sm")
spacy_ger = spacy.load("de_core_news_sm")

In [11]:
def tokenize_eng(text):
    """Tokenize ``text`` with the English spaCy tokenizer.

    Returns a list holding the ``.text`` string of every token, in order.
    """
    token_strings = []
    for token in spacy_eng.tokenizer(text):
        token_strings.append(token.text)
    return token_strings

def tokenize_ger(text):
    """Tokenize ``text`` with the German spaCy tokenizer.

    Returns a list holding the ``.text`` string of every token, in order.
    """
    doc = spacy_ger.tokenizer(text)
    return list(map(lambda token: token.text, doc))

In [12]:
# One Field per language: each tokenizes with its own spaCy-backed helper,
# lower-cases the text and maps tokens through a vocabulary.
english = Field(
    sequential=True,
    use_vocab=True,
    tokenize=tokenize_eng,
    lower=True,
)
german = Field(
    sequential=True,
    use_vocab=True,
    tokenize=tokenize_ger,
    lower=True,
)

In [14]:
# The file extensions identify the source and target languages, and `fields`
# is given in the same (source, target) order, i.e. German --> English.
train_data, validation_data, test_data = Multi30k.splits(
    exts=(".de", ".en"),
    fields=(german, english),
)

downloading training.tar.gz


training.tar.gz: 100%|██████████████████████| 1.21M/1.21M [00:05<00:00, 219kB/s]


downloading validation.tar.gz


validation.tar.gz: 100%|███████████████████| 46.3k/46.3k [00:00<00:00, 88.0kB/s]


downloading mmt_task1_test2016.tar.gz


mmt_task1_test2016.tar.gz: 100%|███████████| 66.2k/66.2k [00:00<00:00, 89.2kB/s]


In [15]:
# Build each vocabulary from the training split only, keeping at most 10,000
# tokens and dropping any token seen fewer than 2 times.
english.build_vocab(
    train_data,
    max_size=10000,
    min_freq=2,
)
german.build_vocab(
    train_data,
    max_size=10000,
    min_freq=2,
)

In [16]:
# One iterator per split, all yielding batches of 64 examples on the CPU.
# BucketIterator groups examples of similar length to reduce padding.
train_iterator, validation_iterator, test_iterator = BucketIterator.splits(
    (train_data, validation_data, test_data),
    batch_size=64,
    device="cpu",
)

In [19]:
# Inspect only the first training batch: the Batch summary, followed by the
# source (.src) and target (.trg) tensors, then stop iterating.
for batch in train_iterator:
    print(batch, batch.src, batch.trg, sep="\n")
    break


[torchtext.legacy.data.batch.Batch of size 64 from MULTI30K]
	[.src]:[torch.LongTensor of size 25x64]
	[.trg]:[torch.LongTensor of size 24x64]
tensor([[   6,    3,    3,  ...,   16,   74,   16],
        [  14,   11, 3283,  ...,   28, 1633,   23],
        [   5,    8, 5312,  ...,    5,   43,    7],
        ...,
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1]])
tensor([[   2,    2,    2,  ...,   14,  111,   14],
        [  12,    7, 1464,  ...,   28,   48,  102],
        [   4,    9, 2176,  ...,    4,   11, 1407],
        ...,
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1]])


In [20]:
# stoi ("string to index"): look up the integer id assigned to a token.
english.vocab.stoi["the"]

5

In [21]:
# itos ("index to string"): the reverse lookup, from an integer id to its token.
english.vocab.itos[5]

'the'