In [1]:
import sys

# make sure this points to the correct environment
sys.executable

'/usr/local/anaconda3/envs/torch-nlp/bin/python3'

In [13]:
import os

# make the parent of the current working directory importable so that the
# local `translators` package can be found
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

In [235]:
import io

from collections import Counter
import random

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

from torchtext.vocab import vocab
from torchtext.utils import download_from_url, extract_archive
from torchtext.data.utils import get_tokenizer

In [15]:
from translators.utils.constants import Constants
from translators.utils.path_utils import downloads_path, dataset_filepaths

Constants.DATASET_BASE_URL, Constants.DATASET_DOWNLOADS_PATH, Constants.DATASET_SPLITS

('https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/',
 './data/',
 {'train': {'source': 'train.de', 'target': 'train.en'},
  'val': {'source': 'val.de', 'target': 'val.en'},
  'test': {'source': 'test_2016_flickr.de', 'target': 'test_2016_flickr.en'}})

In [16]:
# file paths of the training split; the result of `dataset_filepaths` is
# indexed by "source" and "target" (presumably the German/English files listed
# under 'train' in Constants.DATASET_SPLITS -- verify against path_utils)
train_filepaths = dataset_filepaths("train")
source_filepath = train_filepaths["source"]
target_filepath = train_filepaths["target"]

In [17]:
# Load both spaCy pipelines by their full model names. The bare "de" shortcut
# is not a loadable model name under spaCy 3, so spell out the German model
# the same way the English one already is.
source_tokenizer = get_tokenizer("spacy", language="de_core_news_sm")
target_tokenizer = get_tokenizer("spacy", language="en_core_web_sm")

In [18]:
source_tokenizer

functools.partial(<function _spacy_tokenize at 0x138dfc160>, spacy=<spacy.lang.de.German object at 0x110f9e8c0>)

In [19]:
def build_vocab(filepath: str, tokenizer: callable):
    """Build a torchtext vocabulary from every token found in a text file.

    Args:
        filepath: Path of a UTF-8 encoded text file; each line is tokenized
            on its own.
        tokenizer: Callable that turns one line of text into an iterable of
            tokens.

    Returns:
        A torchtext ``Vocab`` containing the four special tokens (unknown,
        pad, BOS, EOS) and every token produced by ``tokenizer``. Looking up
        a token that is not in the vocabulary returns the index of the
        unknown token.
    """
    counter = Counter()

    # NOTE(review): lines are handed to the tokenizer with their trailing
    # newline intact -- confirm that this is intended.
    with io.open(filepath, encoding="utf8") as f:
        for line in f:
            counter.update(tokenizer(line))

    token_vocab = vocab(
        counter,
        specials=[
            Constants.SPECIAL_TOKEN_UNKNOWN,
            Constants.SPECIAL_TOKEN_PAD,
            Constants.SPECIAL_TOKEN_BOS,
            Constants.SPECIAL_TOKEN_EOS,
        ],
    )

    # Without a default index, looking up an out-of-vocabulary token (e.g. a
    # word that only occurs in the validation or test split) raises an error.
    token_vocab.set_default_index(token_vocab[Constants.SPECIAL_TOKEN_UNKNOWN])

    return token_vocab

In [20]:
# one vocabulary per language, each built from the training-split file only
source_vocab = build_vocab(source_filepath, source_tokenizer)
target_vocab = build_vocab(target_filepath, target_tokenizer)

In [21]:
list(source_vocab.get_stoi().items())[:10]

[('verschnörkelten', 19213),
 ('majestätisch', 19211),
 ('Turnanzug', 19209),
 ('einzutreten', 19208),
 ('Cardigan', 19205),
 ('Sushi-Essen', 19204),
 ('Thora', 19198),
 ('Jüdische', 19197),
 ('Schutzgeländer', 19196),
 ('aufzeichnet', 19195)]

In [22]:
list(target_vocab.get_stoi().items())[:10]

[('majestically', 10836),
 ('Torah', 10831),
 ('coupling', 10829),
 ('joining', 10828),
 ('fitters', 10825),
 ('Rays', 10821),
 ('handwork', 10818),
 ('resembling', 10817),
 ('armbands', 10815),
 ('Similarly', 10814)]

In [23]:
# build dataset
def build_dataset(source_filepath: str, target_filepath: str) -> list:
    """Convert two line-aligned text files into pairs of token-id tensors.

    Uses the notebook-level ``source_tokenizer`` / ``target_tokenizer`` and
    ``source_vocab`` / ``target_vocab`` to tokenize and numericalize each line.

    Args:
        source_filepath: Path of the UTF-8 encoded source-language file.
        target_filepath: Path of the UTF-8 encoded target-language file.

    Returns:
        A list with one ``(source_ids, target_ids)`` tuple per line pair, each
        element a 1-D ``torch.long`` tensor. No BOS/EOS tokens or padding are
        added here. If the files differ in length, the surplus lines of the
        longer file are ignored.
    """
    # build dataset (input, output) pairs
    dataset = list()

    # the context manager closes both files, even if tokenization fails midway
    with io.open(source_filepath, encoding="utf8") as source_file, \
            io.open(target_filepath, encoding="utf8") as target_file:
        for source_raw, target_raw in zip(source_file, target_file):
            source_ids = torch.tensor(
                [source_vocab[token] for token in source_tokenizer(source_raw)],
                dtype=torch.long,
            )
            target_ids = torch.tensor(
                [target_vocab[token] for token in target_tokenizer(target_raw)],
                dtype=torch.long,
            )
            dataset.append((source_ids, target_ids))

    return dataset

In [24]:
# numericalize the whole training split up front; the result is an in-memory
# list of (source_ids, target_ids) tensor pairs
train_dataset = build_dataset(source_filepath, target_filepath)

In [82]:
train_dataset[0]

(tensor([ 4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17]),
 tensor([ 4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15]))

In [124]:
# ids of the special tokens, looked up in the source vocabulary
padding_token_id = source_vocab[Constants.SPECIAL_TOKEN_PAD]
bos_token_id = source_vocab[Constants.SPECIAL_TOKEN_BOS]
eos_token_id = source_vocab[Constants.SPECIAL_TOKEN_EOS]

# collate_minibatch applies these ids to target sequences as well, so both
# vocabularies have to agree on them
assert all(
    target_vocab[token] == source_vocab[token]
    for token in (
        Constants.SPECIAL_TOKEN_PAD,
        Constants.SPECIAL_TOKEN_BOS,
        Constants.SPECIAL_TOKEN_EOS,
    )
), "source and target vocabularies disagree on the special token ids"

# the vocabulary lists the specials first
source_vocab.get_itos()[:4]

['<unk>', '<pad>', '<bos>', '<eos>']

In [207]:
def collate_minibatch(minibatch):
    """Turn a list of (source_ids, target_ids) pairs into two padded tensors.

    Every sequence is wrapped in the BOS and EOS ids, then the source and the
    target sequences are each padded with the padding id to the length of the
    longest sequence on their side.

    Args:
        minibatch: Iterable of ``(source_item, target_item)`` tensor pairs.

    Returns:
        ``(source_minibatch, target_minibatch)``, both batch-first tensors.
    """
    def _with_boundaries(token_ids):
        # prepend BOS and append EOS to a single sequence
        return torch.cat(
            (torch.tensor([bos_token_id]), token_ids, torch.tensor([eos_token_id]))
        )

    source_sequences = []
    target_sequences = []
    for source_item, target_item in minibatch:
        source_sequences.append(_with_boundaries(source_item))
        target_sequences.append(_with_boundaries(target_item))

    padded_sources = pad_sequence(
        source_sequences, batch_first=True, padding_value=padding_token_id
    )
    padded_targets = pad_sequence(
        target_sequences, batch_first=True, padding_value=padding_token_id
    )
    return padded_sources, padded_targets

In [216]:
# sampled minibatch: draw `minibatch_size` distinct examples and collate them
# by hand to check the padded shapes
minibatch_size = 128
minibatch_idx = random.sample(range(len(train_dataset)), minibatch_size)
temp_minibatch = [train_dataset[idx] for idx in minibatch_idx]
source_minibatch, target_minibatch = collate_minibatch(temp_minibatch)
source_minibatch.shape, target_minibatch.shape

(torch.Size([128, 36]), torch.Size([128, 36]))

In [220]:
# shuffled loader over the training pairs; each batch is wrapped in BOS/EOS
# and padded by collate_minibatch
train_loader = DataLoader(train_dataset, collate_fn=collate_minibatch, batch_size=minibatch_size, shuffle=True)

In [232]:
# test the loader: run through one full pass; afterwards `source` and `target`
# hold the final batch
for source, target in train_loader:
    pass

In [233]:
source.shape, target.shape

(torch.Size([72, 24]), torch.Size([72, 27]))

In [238]:
# one 256-dimensional vector per source token; `padding_idx` keeps the vector
# of the pad token at zero and excludes it from gradient updates
embedding_layer = nn.Embedding(len(source_vocab), embedding_dim=256, padding_idx=padding_token_id)

In [248]:
# (vocabulary size, embedding dimension), read through the public attribute
embedding_layer.weight.shape

torch.Size([19215, 256])