In [8]:
# !conda install -y -c conda-forge spacy
# !python -m spacy download en_core_web_sm
# !python -m spacy download de_core_news_sm

In [9]:
import torchdata.datapipes as dp
import torchtext.transforms as T

In [10]:
import spacy
from torchtext.vocab import build_vocab_from_iterator

In [11]:
# spaCy pipelines; in the cells visible here only their `.tokenizer` is used.
eng = spacy.load("en_core_web_sm") # Load the English model to tokenize English text
de = spacy.load("de_core_news_sm") # Load the German model to tokenize German text

In [14]:
# Sanity check: tokenize a sample English sentence and list the text of each token.
[token.text for token in eng.tokenizer("Hi how are you")]

['Hi', 'how', 'are', 'you']

In [15]:
# Relative path of the sentence-pair file; it is parsed below as tab-separated rows.
FILE_PATH = 'deu-eng/deu.txt'

In [16]:
# Stage 1: a pipe that yields the path of the file to read.
path_pipe = dp.iter.IterableWrapper([FILE_PATH])
# Stage 2: open every yielded path in binary mode.
stream_pipe = dp.iter.FileOpener(path_pipe, mode='rb')
# Stage 3: parse the stream as tab-delimited rows, each row emitted as a tuple.
data_pipe = stream_pipe.parse_csv(skip_lines=0, delimiter='\t', as_tuple=True)

In [23]:
# Print the first 20 elements of the pipe, then stop.
i = 0  # stays 0 when the pipe yields nothing
for i, sample in enumerate(data_pipe, start=1):
    print(sample)
    if i == 20:
        break

('Go.', 'Geh.')
('Hi.', 'Hallo!')
('Hi.', 'Grüß Gott!')
('Run!', 'Lauf!')
('Run.', 'Lauf!')
('Wow!', 'Potzdonner!')
('Wow!', 'Donnerwetter!')
('Duck!', 'Kopf runter!')
('Fire!', 'Feuer!')
('Help!', 'Hilfe!')
('Help!', 'Zu Hülf!')
('Stay.', 'Bleib!')
('Stop!', 'Stopp!')
('Stop!', 'Anhalten!')
('Wait!', 'Warte!')
('Wait.', 'Warte.')
('Begin.', 'Fang an.')
('Do it.', 'Mache es!')
('Do it.', 'Tue es.')
('Go on.', 'Mach weiter.')


In [19]:
def removeAttribution(row):
    """
    Keep only the first two fields of a row and drop everything after them.

    Judging by the name, the dropped trailing field is presumably the
    attribution column of the data file (not shown here - confirm against the file).
    Rows with two or fewer fields come back unchanged in content.
    """
    fields_to_keep = 2
    return row[:fields_to_keep]

In [20]:
# Reduce every row of the pipe to its first two fields via `removeAttribution`.
data_pipe=data_pipe.map(removeAttribution)

In [21]:
# Show only the first element of the pipe: print it and leave the loop right away.
for sample in data_pipe:
    print(sample)
    break

('Go.', 'Geh.')


In [22]:
def engTokenize(text):
    """
    Tokenize `text` with the tokenizer of the English spaCy pipeline (`eng`).

    Returns a list with the text of every token, in the order the tokenizer
    produced them.
    """
    token_texts = []
    for token in eng.tokenizer(text):
        token_texts.append(token.text)
    return token_texts

def deTokenize(text):
    """
    Tokenize `text` with the tokenizer of the German spaCy pipeline (`de`).

    Returns a list with the text of every token, in the order the tokenizer
    produced them.
    """
    token_texts = []
    for token in de.tokenizer(text):
        token_texts.append(token.text)
    return token_texts

In [24]:
def getTokens(data_iter, place):
    """
    Generator that yields one token list per sentence pair in `data_iter`.

    Every element of `data_iter` must unpack into exactly two sentences
    (english, german). `place` selects which side is tokenized:
    `place == 0` yields the English tokens (via `engTokenize`); any other value
    yields the German tokens (via `deTokenize`).
    """
    for english, german in data_iter:
        # Only the selected side is tokenized; the other sentence is ignored.
        yield engTokenize(english) if place == 0 else deTokenize(german)

In [25]:
# Vocabulary of the source (English) side, built from the token lists that
# `getTokens(..., 0)` yields.
source_vocab = build_vocab_from_iterator(
    getTokens(data_pipe, 0),
    # Special tokens are placed first, so they occupy the leading indices
    # in the order listed here.
    specials=['<pad>', '<sos>', '<eos>', '<unk>'],
    special_first=True,
    min_freq=2,
)
# Lookups of tokens that are not in the vocabulary resolve to the `<unk>` index.
source_vocab.set_default_index(source_vocab['<unk>'])

In [26]:
# Vocabulary of the target (German) side, built from the token lists that
# `getTokens(..., 1)` yields.
target_vocab = build_vocab_from_iterator(
    getTokens(data_pipe, 1),
    # Special tokens are placed first, so they occupy the leading indices
    # in the order listed here.
    specials=['<pad>', '<sos>', '<eos>', '<unk>'],
    special_first=True,
    min_freq=2,
)
# Lookups of tokens that are not in the vocabulary resolve to the `<unk>` index.
target_vocab.set_default_index(target_vocab['<unk>'])

In [27]:
def getTransform(vocab):
    """
    Create the transform that is applied to a sequence of tokens for the given vocabulary.

    The returned transform performs three steps in order:
      1. convert every token to its index in `vocab`,
      2. add the `<sos>` index at the beginning of the sequence,
      3. add the `<eos>` index at the end of the sequence.

    The `<sos>` / `<eos>` indices are read from `vocab` itself instead of being
    hard-coded, so the transform stays correct if the order of the special tokens
    changes. When a marker is not part of `vocab`, the previously hard-coded
    positions (1 for `<sos>`, 2 for `<eos>`) are used so existing callers keep working.

    Args:
        vocab: vocabulary used for the token -> index conversion.

    Returns:
        A `T.Sequential` transform to be called with a list of tokens.
    """
    # 1 and 2 are the positions the markers get when the vocabulary is built with
    # specials=['<pad>', '<sos>', '<eos>', '<unk>'] and special_first=True.
    sos_index = vocab['<sos>'] if '<sos>' in vocab else 1
    eos_index = vocab['<eos>'] if '<eos>' in vocab else 2

    text_transform = T.Sequential(
        ## converts the sentences to indices based on given vocabulary
        T.VocabTransform(vocab=vocab),
        ## Add <sos> at the beginning of each sentence
        T.AddToken(sos_index, begin=True),
        ## Add <eos> at the end of each sentence (begin=False appends)
        T.AddToken(eos_index, begin=False),
    )
    return text_transform

In [28]:
# Build each transform once. Rebuilding them inside `applyTransform` would construct
# two new `T.Sequential` objects for every single sentence pair in the dataset.
# Re-run this cell if `source_vocab` / `target_vocab` are rebuilt.
_source_transform = getTransform(source_vocab)
_target_transform = getTransform(target_vocab)


def applyTransform(sequence_pair):
    """
    Apply transforms to sequence of tokens in a sequence pair.

    Args:
        sequence_pair: indexable pair whose element 0 is the source (English)
            sentence and element 1 the target (German) sentence, both raw text.

    Returns:
        A tuple `(source_indices, target_indices)`: each sentence is tokenized and
        then passed through the transform built by `getTransform` for its vocabulary.
    """
    return (
        _source_transform(engTokenize(sequence_pair[0])),
        _target_transform(deTokenize(sequence_pair[1])),
    )
data_pipe = data_pipe.map(applyTransform) ## Apply the function to each element in the iterator
# NOTE: `list(data_pipe)` pulls every element through the pipe, so all sentence pairs
# are tokenized and transformed here even though only the first one is printed.
temp_list = list(data_pipe)
print(temp_list[0])

([1, 616, 4, 2], [1, 739, 4, 2])


In [29]:
def sortBucket(bucket):
    """
    Return the pairs of `bucket` as a new list ordered by sequence length.

    Pairs are ordered by the length of the source sequence first and by the
    length of the target sequence second. Pairs with equal lengths keep their
    relative order, and `bucket` itself is left untouched.
    """
    def pair_lengths(pair):
        # (source length, target length) is the ordering criterion
        return (len(pair[0]), len(pair[1]))

    ordered = list(bucket)
    ordered.sort(key=pair_lengths)
    return ordered

In [30]:
# Group the transformed pairs into batches of 4. `sortBucket` is passed as `sort_key`,
# so pairs are ordered by (source length, target length) before batches are cut.
# NOTE(review): how many elements are pooled before sorting is governed by
# batch_num / bucket_num - confirm against the torchdata documentation.
data_pipe = data_pipe.bucketbatch(
    batch_size = 4, batch_num=5,  bucket_num=1,
    use_in_batch_shuffle=False, sort_key=sortBucket
)

In [32]:
# Show the first batch (building the list iterates over the whole pipe).
print(list(data_pipe)[0])

[([1, 11105, 17, 4, 2], [1, 507, 29, 24, 2]), ([1, 11105, 17, 4, 2], [1, 7994, 1487, 24, 2]), ([1, 5335, 21, 4, 2], [1, 6956, 32, 24, 2]), ([1, 5335, 21, 4, 2], [1, 16003, 32, 24, 2])]


In [31]:
# convert from  [(X_1,y_1), (X_2,y_2), (X_3,y_3), (X_4,y_4)] to ((X_1,X_2,X_3,X_4), (y_1,y_2,y_3,y_4))

In [33]:
def separateSourceTarget(sequence_pairs):
    """
    Regroup a batch of (source, target) pairs into one tuple per side.

    input of form: `[(X_1,y_1), (X_2,y_2), (X_3,y_3), (X_4,y_4)]`
    output of form: `((X_1,X_2,X_3,X_4), (y_1,y_2,y_3,y_4))`
    """
    # Transposing with zip yields exactly two tuples: all sources, then all targets.
    transposed = zip(*sequence_pairs)
    sources, targets = transposed
    return (sources, targets)

## Apply the function to each element in the iterator
data_pipe = data_pipe.map(separateSourceTarget)
# Show the first regrouped batch (building the list iterates over the whole pipe).
print(list(data_pipe)[0])

(([1, 1066, 4, 2], [1, 2989, 4, 2], [1, 3, 194, 2], [1, 1670, 194, 2]), ([1, 1383, 4, 2], [1, 6030, 1616, 24, 2], [1, 740, 2445, 24, 2], [1, 1210, 3, 24, 2]))


In [34]:
def applyPadding(pair_of_sequences):
    """
    Convert both sides of a batch to tensors and apply padding.

    `pair_of_sequences[0]` holds the source sequences of the batch and
    `pair_of_sequences[1]` the target sequences. Each side is converted by its
    own call, so the two sides are padded independently of each other.
    Returns the tuple `(source_tensor, target_tensor)`.
    """
    # 0 is the padding value handed to `T.ToTensor`
    to_padded_tensor = T.ToTensor(0)
    padded_sources = to_padded_tensor(list(pair_of_sequences[0]))
    padded_targets = to_padded_tensor(list(pair_of_sequences[1]))
    return (padded_sources, padded_targets)
## `T.ToTensor(0)` builds a transform that converts the sequences to a `torch.tensor` and pads
# them to a common length. The `0` passed to the constructor is the padding value, i.e. the index
# of the `<pad>` token, which is the first entry of `specials` in both vocabularies.
data_pipe = data_pipe.map(applyPadding)

In [35]:
# Index -> token lookup lists; used below to print tokens instead of their indices.
source_index_to_string = source_vocab.get_itos()
target_index_to_string = target_vocab.get_itos()

def showSomeTransformedSentences(data_pipe):
    """
    Print one batch of `data_pipe` as tokens instead of indices.

    Batches are skipped until one is found whose first source row ends with the
    padding index 0, so that the effect of padding is visible. Every row of that
    batch is then decoded with `source_index_to_string` / `target_index_to_string`
    and printed, after which the function stops. Nothing is printed when no batch
    qualifies.

    Args:
        data_pipe: iterable yielding `(sources, targets)` batches, where each side
            is a sequence of index rows (assumed to be the padded tensors produced
            by `applyPadding`).
    """
    for sources, targets in data_pipe:
        if sources[0][-1] != 0:
            continue # Just to visualize padding of shorter sentences
        # Walk over the rows the batch actually has instead of assuming 4 of them:
        # a shorter final batch would otherwise raise IndexError.
        for source_row, target_row in zip(sources, targets):
            # Each token is prefixed with a space, so the text starts with one too.
            source = "".join(" " + source_index_to_string[token] for token in source_row)
            target = "".join(" " + target_index_to_string[token] for token in target_row)
            print(f"Source: {source}")
            print(f"Traget: {target}")
        break

# Print one decoded batch whose first source sentence ends in padding.
showSomeTransformedSentences(data_pipe)

Source:  <sos> Relax . <eos> <pad>
Traget:  <sos> Entspann dich . <eos>
Source:  <sos> I see . <eos>
Traget:  <sos> Aha . <eos> <pad>
Source:  <sos> I ran . <eos>
Traget:  <sos> Ich rannte . <eos>
Source:  <sos> I see . <eos>
Traget:  <sos> Ich verstehe . <eos>


In [36]:
# Show the first batch of the final pipe: a (source tensor, target tensor) pair.
for sample in data_pipe:
    print(sample)
    break

(tensor([[   1, 6860,   23,  194,    2],
        [   1, 6860,   23,   10,    2],
        [   1, 1042,   21,    4,    2],
        [   1, 1042,   21,    4,    2]]), tensor([[    1, 17926,    24,     2,     0],
        [    1,     3,     8,     2,     0],
        [    1,  1578,    32,    24,     2],
        [    1,  3817,    32,    24,     2]]))
