In [1]:
import sentencepiece as sp

In [11]:
import os

def concat_files(data_dir='data/multi30k/', out_path='full.txt'):
    """Concatenate every regular file in ``data_dir`` into one text file.

    Files are appended in sorted name order so the result is reproducible
    (``os.listdir`` returns entries in arbitrary order). Sub-directories and
    the output file itself are skipped. A newline is inserted after any file
    that does not end with one, so the last line of one file is never glued to
    the first line of the next.

    Args:
        data_dir: Directory whose files are concatenated.
        out_path: Destination file; it is overwritten.

    Raises:
        UnicodeDecodeError: If a file in ``data_dir`` is not valid UTF-8
            (for example a binary file stored next to the text data).
    """
    out_abs = os.path.abspath(out_path)
    with open(out_path, 'w', encoding='utf8') as out_file:
        for name in sorted(os.listdir(data_dir)):
            path = os.path.join(data_dir, name)
            # Skip directories and never read the file that is being written.
            if not os.path.isfile(path) or os.path.abspath(path) == out_abs:
                continue
            with open(path, encoding='utf8') as in_file:
                text = in_file.read()
            out_file.write(text)
            if text and not text.endswith('\n'):
                out_file.write('\n')

# Build the combined corpus: reads the files under data/multi30k/ and writes full.txt
# into the current working directory.
concat_files()

In [10]:
# Train a BPE tokenizer with a 10,000-piece vocabulary on the German training file.
# Special-token ids are fixed to unk=0, bos=1, eos=2, pad=3.
sp.SentencePieceTrainer.train(
    input='data/multi30k/train.de',
    model_prefix='bpe_de',
    vocab_size=10000,
    model_type='bpe',
    character_coverage=1.0,
    input_sentence_size=10000,
    shuffle_input_sentence=True,
    max_sentence_length=1000,
    num_threads=16,
    unk_id=0,
    bos_id=1,
    eos_id=2,
    pad_id=3,
)

In [11]:
# Train a BPE tokenizer with a 10,000-piece vocabulary on the English training file.
# Special-token ids are fixed to unk=0, bos=1, eos=2, pad=3.
sp.SentencePieceTrainer.train(
    input='data/multi30k/train.en',
    model_prefix='bpe_en',
    vocab_size=10000,
    model_type='bpe',
    character_coverage=1.0,
    input_sentence_size=10000,
    shuffle_input_sentence=True,
    max_sentence_length=1000,
    num_threads=16,
    unk_id=0,
    bos_id=1,
    eos_id=2,
    pad_id=3,
)

In [None]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import multi30k, Multi30k
from typing import Iterable, List


# The original Multi30k download links are broken, so point torchtext at a mirror.
# Background: https://github.com/pytorch/text/issues/1756#issuecomment-1163664163
multi30k.URL["train"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/training.tar.gz"
multi30k.URL["valid"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/validation.tar.gz"

# Translation direction: German source, English target.
SRC_LANGUAGE, TGT_LANGUAGE = 'de', 'en'

# Per-language vocabularies; populated further down once the tokenizers exist.
vocab_transform = {}

# Per-language spaCy tokenizers. Required setup:
#   pip install -U torchdata
#   pip install -U spacy
#   python -m spacy download en_core_web_sm
#   python -m spacy download de_core_news_sm
token_transform = {
    SRC_LANGUAGE: get_tokenizer('spacy', language='de_core_news_sm'),
    TGT_LANGUAGE: get_tokenizer('spacy', language='en_core_web_sm'),
}


# helper function to yield list of tokens
def yield_tokens(data_iter: Iterable, language: str) -> Iterable[List[str]]:
    """Lazily tokenize one side of every sample in ``data_iter``.

    Args:
        data_iter: Iterable of samples indexable as ``sample[0]`` (source text)
            and ``sample[1]`` (target text).
        language: ``SRC_LANGUAGE`` or ``TGT_LANGUAGE``; selects both the sample
            position and the tokenizer in ``token_transform``.

    Yields:
        The token list produced by the language's tokenizer for each sample.

    Raises:
        KeyError: On first iteration, if ``language`` is not one of the two
            configured languages.
    """
    language_index = {SRC_LANGUAGE: 0, TGT_LANGUAGE: 1}
    # Both lookups are loop-invariant, so resolve them once up front.
    position = language_index[language]
    tokenize = token_transform[language]

    for data_sample in data_iter:
        yield tokenize(data_sample[position])

# Special symbols and the vocabulary indices reserved for them
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = range(4)
# Listed in index order so that they land on exactly those indices in the vocab
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

for ln in (SRC_LANGUAGE, TGT_LANGUAGE):
    # The training iterator is recreated for each language
    # (presumably because one pass over it exhausts it).
    train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
    # Build torchtext's Vocab object from the tokenized training sentences
    vocab_transform[ln] = build_vocab_from_iterator(
        yield_tokens(train_iter, ln),
        min_freq=1,
        specials=special_symbols,
        special_first=True,
    )

# Unknown tokens map to UNK_IDX. Without a default index the vocab raises
# RuntimeError when a queried token is missing.
for ln in (SRC_LANGUAGE, TGT_LANGUAGE):
    vocab_transform[ln].set_default_index(UNK_IDX)

In [None]:
# Display the per-language vocabulary mapping built above.
vocab_transform

In [None]:
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import Transformer
import math
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# helper Module that adds positional encoding to the token embedding to introduce a notion of word order.
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to token embeddings, then apply dropout.

    Args:
        emb_size: Embedding width of the tokens (odd values are supported).
        dropout: Dropout probability applied to the sum.
        maxlen: Longest sequence length for which encodings are precomputed.
        batch_first: Layout of the tensor passed to ``forward``. ``True``
            (default) means ``(batch, seq, emb)``, which is the layout used by
            the ``batch_first=True`` transformer in this notebook; ``False``
            means ``(seq, batch, emb)``.
    """
    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 5000,
                 batch_first: bool = True):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        angles = pos * den
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(angles)
        # With an odd emb_size there is one fewer odd column than even column.
        pos_embedding[:, 1::2] = torch.cos(angles[:, :emb_size // 2])
        # Kept as (maxlen, 1, emb_size) so previously saved state dicts still load.
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.batch_first = batch_first
        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        """Return ``dropout(token_embedding + positional encoding)``."""
        if self.batch_first:
            # Slice by sequence length (dim 1), then move the position axis to
            # dim 1 so the table broadcasts over the batch as (1, seq, emb).
            # Slicing by dim 0 here would index the table by batch position.
            seq_len = token_embedding.size(1)
            pos = self.pos_embedding[:seq_len].transpose(0, 1)
        else:
            pos = self.pos_embedding[:token_embedding.size(0)]
        return self.dropout(token_embedding + pos)

# helper Module to convert tensor of input indices into corresponding tensor of token embeddings
class TokenEmbedding(nn.Module):
    """Embedding lookup whose output is multiplied by ``sqrt(emb_size)``."""

    def __init__(self, vocab_size: int, emb_size):
        super().__init__()
        self.emb_size = emb_size
        self.embedding = nn.Embedding(vocab_size, emb_size)

    def forward(self, tokens: Tensor):
        """Look up ``tokens`` (cast to int64) and scale the embeddings."""
        scale = math.sqrt(self.emb_size)
        indices = tokens.long()
        return scale * self.embedding(indices)

# Seq2Seq Network
class Seq2SeqTransformer(nn.Module):
    """Encoder-decoder translation model built around ``torch.nn.Transformer``.

    Token ids are embedded, combined with positional encodings, passed through
    a batch-first transformer, and projected to target-vocabulary logits.

    Args:
        num_encoder_layers: Number of encoder layers.
        num_decoder_layers: Number of decoder layers.
        emb_size: Model width (``d_model``) and embedding size.
        nhead: Number of attention heads.
        src_vocab_size: Size of the source vocabulary.
        tgt_vocab_size: Size of the target vocabulary (output logits).
        dim_feedforward: Hidden size of the feed-forward sub-layers.
        dropout: Dropout probability used in the transformer and positional encoding.
    """
    def __init__(self,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 emb_size: int,
                 nhead: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1):
        super(Seq2SeqTransformer, self).__init__()
        # Referenced as nn.Transformer on purpose: a later cell in this notebook
        # rebinds the bare name `Transformer` to a project class, which would
        # silently change what gets constructed here.
        self.transformer = nn.Transformer(d_model=emb_size,
                                          nhead=nhead,
                                          num_encoder_layers=num_encoder_layers,
                                          num_decoder_layers=num_decoder_layers,
                                          dim_feedforward=dim_feedforward,
                                          dropout=dropout,
                                          batch_first=True)
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(
            emb_size, dropout=dropout)

    def forward(self,
                src: Tensor,
                trg: Tensor,
                src_mask: Tensor,
                tgt_mask: Tensor,
                src_padding_mask: Tensor,
                tgt_padding_mask: Tensor,
                memory_key_padding_mask: Tensor):
        """Return target-vocabulary logits for every position of ``trg``.

        ``src`` and ``trg`` are token-id tensors in batch-first layout. The
        mask arguments are forwarded unchanged to ``nn.Transformer``; no
        memory mask is used.
        """
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        # Keyword arguments keep each mask tied to its parameter name.
        outs = self.transformer(src_emb, tgt_emb,
                                src_mask=src_mask,
                                tgt_mask=tgt_mask,
                                memory_mask=None,
                                src_key_padding_mask=src_padding_mask,
                                tgt_key_padding_mask=tgt_padding_mask,
                                memory_key_padding_mask=memory_key_padding_mask)
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        """Run only the encoder on ``src`` and return its output (the memory)."""
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        return self.transformer.encoder(src_emb, mask=src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        """Run only the decoder on ``tgt`` against ``memory``; logits are not computed here."""
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(tgt))
        return self.transformer.decoder(tgt_emb, memory, tgt_mask=tgt_mask)

In [None]:
def generate_square_subsequent_mask(sz):
    """Return a (sz, sz) float mask on DEVICE: 0.0 on and below the diagonal, -inf above it."""
    blocked = torch.full((sz, sz), float('-inf'), dtype=torch.float, device=DEVICE)
    # triu(diagonal=1) keeps the strictly-upper triangle and zeroes the rest.
    return torch.triu(blocked, diagonal=1)


def create_mask(src, tgt):
    """Build the attention and padding masks for one batch-first batch.

    Returns a 4-tuple ``(src_mask, tgt_mask, src_padding_mask, tgt_padding_mask)``:
    an all-False boolean square mask for the source, the causal float mask for
    the target, and boolean masks that are True where a token equals PAD_IDX.
    """
    src_len, tgt_len = src.shape[1], tgt.shape[1]

    # All-False boolean mask: no source position is hidden from any other.
    src_mask = torch.zeros((src_len, src_len), dtype=torch.bool, device=DEVICE)
    tgt_mask = generate_square_subsequent_mask(tgt_len)

    return src_mask, tgt_mask, src == PAD_IDX, tgt == PAD_IDX

In [12]:
# NOTE(review): this import rebinds the name `Transformer`, which an earlier cell
# imported from torch.nn; every cell run after this one sees the project class instead.
from models.transformer import Transformer

# Load a model instance from the saved checkpoint file.
model = Transformer.load_from_checkpoint('experiments/sep_vocab/checkpoints/model-epoch=144-val_loss=3.24.ckpt')

In [13]:
import sentencepiece as sp

# Tokenizers used around `translate`: the source model encodes the input text,
# the target model decodes the generated ids.
# NOTE(review): the source side loads the English model and the target side the German
# one, while the sample inputs further down are German — confirm the intended direction.
# NOTE(review): the training cells use model_prefix='bpe_de' / 'bpe_en' without a directory,
# but the models are read from data/multi30k/ here — presumably moved by hand; verify.
src_sp_model = sp.SentencePieceProcessor('data/multi30k/bpe_en.model')
tgt_sp_model = sp.SentencePieceProcessor('data/multi30k/bpe_de.model')

In [14]:
import torch

In [15]:
def translate(model, src: str, device=None):
    """Encode ``src`` with the source tokenizer and let ``model`` generate a translation.

    Args:
        model: Object exposing ``translate(x, bos_idx, eos_idx, max_new_tokens)``
            and, when ``device`` is not given, ``parameters()``.
        src: Raw source sentence.
        device: Device for the input tensor. Defaults to the device of the
            model's first parameter (CPU if the model has no parameters).

    Returns:
        Whatever ``model.translate`` returns for the encoded sentence.
    """
    if device is None:
        # A hard-coded 'cuda' fails on CPU-only machines and whenever the model
        # lives on a different device, so follow the model instead.
        first_param = next(iter(model.parameters()), None)
        device = first_param.device if first_param is not None else 'cpu'
    x = torch.tensor(src_sp_model.encode_as_ids(src), device=device)
    # bos/eos ids match the ids the SentencePiece models were trained with.
    y = model.translate(x, bos_idx=1, eos_idx=2, max_new_tokens=50)
    return y

# Materialize the items produced by translate() and decode them with the target tokenizer.
tokens = list(translate(model, "ein"))

print(tgt_sp_model.decode(tokens))

, essen essen essen essen essen............................................


In [79]:
# NOTE(review): `transformer` is not defined in any cell visible here, and the recorded
# output is decoded text rather than the raw return value of the current `translate` —
# this cell appears to stem from an earlier kernel session; confirm before relying on it.
print(translate(transformer, "Eine Gruppe von Menschen steht vor einem Iglu ."))

 cart cart cart watching ready covered watching covered covered covered covered working covered working working


In [None]:
# Another sample input for the same call; this cell has no execution count.
# NOTE(review): depends on `transformer`, which no visible cell defines.
print(translate(transformer, "Ich liebe flache Brüste"))

In [None]:
# English input passed to the same call; this cell has no execution count.
# NOTE(review): depends on `transformer`, which no visible cell defines.
print(translate(transformer, "I love"))

In [None]:
from alt_dataloader import text_transform

In [None]:
# Run one sample sentence through the 'de' entry of text_transform from alt_dataloader.
# NOTE(review): the result is bound to `input`, which shadows the built-in input()
# for the rest of the kernel session.
input = text_transform['de']('Eine Gruppe von Menschen steht vor einem Iglu .')

In [1]:
import torch

In [42]:
# Toy sizes for the mask experiment (presumably batch size N and sequence length T).
N, T = 2, 3

# One boolean flag per token, shaped (N, 1, T); everything starts out True.
tok_mask = torch.ones(N, 1, T).bool()
# Lower-triangular (T, T) boolean matrix.
causal_mask = torch.ones(T, T).tril().bool()

In [43]:
# Set the flags of tokens 1.. in the first sequence to False, then copy the flag row
# T times so the mask becomes (N, T, T).
# NOTE(review): this cell mutates and rebinds tok_mask, so it is not idempotent —
# running it a second time repeats the already-expanded tensor again.
tok_mask[0, 0, 1:] = False
tok_mask = tok_mask.repeat(1, T, 1)
tok_mask

tensor([[[ True, False, False],
         [ True, False, False],
         [ True, False, False]],

        [[ True,  True,  True],
         [ True,  True,  True],
         [ True,  True,  True]]])

In [44]:
# Swap the last two axes so the same flags run down the rows instead of across the columns.
tok_mask2 = tok_mask.transpose(-1, -2)
tok_mask2

tensor([[[ True,  True,  True],
         [False, False, False],
         [False, False, False]],

        [[ True,  True,  True],
         [ True,  True,  True],
         [ True,  True,  True]]])

In [45]:
# Display the lower-triangular mask for reference.
causal_mask

tensor([[ True, False, False],
        [ True,  True, False],
        [ True,  True,  True]])

In [46]:
# The (T, T) causal mask broadcasts against the two (N, T, T) token masks.
(tok_mask & tok_mask2 & causal_mask).shape

torch.Size([2, 3, 3])

In [47]:
# Fill -inf wherever any of the three boolean masks is False; other entries stay 1.
# NOTE(review): in the recorded output, rows 1 and 2 of the first sequence are entirely
# -inf; a softmax over such a row produces NaN — check how fully masked rows are handled.
torch.ones((N, T, T)).masked_fill(~(tok_mask & tok_mask2 & causal_mask), float('-inf'))

tensor([[[1., -inf, -inf],
         [-inf, -inf, -inf],
         [-inf, -inf, -inf]],

        [[1., -inf, -inf],
         [1., 1., -inf],
         [1., 1., 1.]]])

In [21]:
# Same toy sizes as before, now with an extra axis of size 1 at position 1.
N, T = 2, 3

# (N, 1, 1, T) all-True flags, expanded along the third axis to (N, 1, T, T).
# NOTE(review): the outputs recorded for the following cells show tok_mask as (N, 1, 1, T),
# i.e. they were produced without this repeat — the execution counts are out of order.
tok_mask = torch.ones(N, 1, 1, T).bool().repeat(1, 1, T, 1)
# Lower-triangular pattern in the last two axes, shaped (1, 1, T, T).
causal_mask = torch.ones(1, 1, T, T).tril().bool()

In [23]:
# Clear the flag of token 1 in the first sequence and display the mask.
# NOTE(review): if tok_mask has already been expanded to (N, 1, T, T), this index
# only touches row 0 of that sequence.
tok_mask[0, 0, 0, 1] = False
tok_mask

tensor([[[[ True, False,  True]]],


        [[[ True,  True,  True]]]])

In [26]:
# AND with an all-True (1, 1, T, T) tensor to see the token mask broadcast to (N, 1, T, T).
tok_mask & torch.ones((1, 1, T, T)).bool()

tensor([[[[ True, False,  True],
          [ True, False,  True],
          [ True, False,  True]]],


        [[[ True,  True,  True],
          [ True,  True,  True],
          [ True,  True,  True]]]])

In [24]:
# Display the (1, 1, T, T) lower-triangular mask.
causal_mask

tensor([[[[ True, False, False],
          [ True,  True, False],
          [ True,  True,  True]]]])

In [25]:
# Combine the per-token flags with the lower-triangular mask via broadcasting.
(tok_mask & causal_mask)

tensor([[[[ True, False, False],
          [ True, False, False],
          [ True, False,  True]]],


        [[[ True, False, False],
          [ True,  True, False],
          [ True,  True,  True]]]])

In [52]:
# Start from an all-True (N, T) flag matrix.
mask = torch.ones((N, T)).bool()
mask

tensor([[True, True, True],
        [True, True, True]])

In [53]:
# Set every flag after the first one in sequence 0 to False (modifies `mask` in place).
mask[0, 1:] = False
mask

tensor([[ True, False, False],
        [ True,  True,  True]])

In [54]:
# Broadcasted AND of the flags with themselves: entry (i, j) of each (T, T) slice is True
# only when the flags of both position i and position j are True.
mask.view(N, 1, T) & mask.view(N, T, 1)

tensor([[[ True, False, False],
         [False, False, False],
         [False, False, False]],

        [[ True,  True,  True],
         [ True,  True,  True],
         [ True,  True,  True]]])

In [1]:
from alt_dataloader import train_dataloader

In [2]:
# Print only the first batch of the training dataloader, then stop iterating.
for batch in train_dataloader:
    print(batch)
    break

TranslationBatch(x_src=tensor([[  1,  19,  25,  ...,   3,   3,   3],
        [  1, 160,  37,  ...,   3,   3,   3],
        [  1,   6,  60,  ...,   3,   3,   3],
        ...,
        [  1,  48, 130,  ...,   3,   3,   3],
        [  1,  19,  22,  ...,   3,   3,   3],
        [  1, 160,  22,  ...,   3,   3,   3]]), x_tgt=tensor([[ 1, 21, 87,  ...,  3,  3,  3],
        [ 1, 86, 33,  ...,  3,  3,  3],
        [ 1,  5, 71,  ...,  3,  3,  3],
        ...,
        [ 1, 90, 37,  ...,  3,  3,  3],
        [ 1, 21, 78,  ...,  3,  3,  3],
        [ 1, 86, 78,  ...,  3,  3,  3]]), x_src_mask=tensor([[ True,  True,  True,  ..., False, False, False],
        [ True,  True,  True,  ..., False, False, False],
        [ True,  True,  True,  ..., False, False, False],
        ...,
        [ True,  True,  True,  ..., False, False, False],
        [ True,  True,  True,  ..., False, False, False],
        [ True,  True,  True,  ..., False, False, False]]), x_tgt_mask=tensor([[ True,  True,  True,  ..., Fals