In [17]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy
import datasets
#import torchtext
import tqdm
import evaluate

In [18]:
# Seed every random number generator the notebook uses so runs are repeatable.
seed = 1234

random.seed(seed)
# The import cell binds the module as `numpy` (no `np` alias), so use the full name here.
numpy.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
# Ask cuDNN to use deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [20]:
# Load the `bentrevett/multi30k` dataset through the `datasets` library.
dataset = datasets.load_dataset(path="bentrevett/multi30k")

In [22]:
# Pull the three named splits out of the loaded dataset, in train/validation/test order.
train_data, valid_data, test_data = [
    dataset[split_name] for split_name in ("train", "validation", "test")
]

In [24]:
# spaCy is used here but is absent from the notebook's import cell, so import it
# before first use to keep the cell runnable on a fresh kernel.
import spacy

# Load the English and German spaCy pipelines; only their tokenizers are used later.
en_nlp = spacy.load("en_core_web_sm")
de_nlp = spacy.load("de_core_news_sm")

In [26]:
def tokenize_example(example, en_nlp, de_nlp, max_length, lower, sos_token, eos_token):
    """Tokenize the English and German text of one example.

    Args:
        example: Mapping with the raw sentences under the keys "en" and "de".
        en_nlp: Object whose ``tokenizer(text)`` yields items with a ``.text`` attribute (English).
        de_nlp: Same as ``en_nlp`` but for German.
        max_length: Maximum number of tokens kept per sentence (applied before
            the start/end markers are added).
        lower: When truthy, every kept token is lower-cased.
        sos_token: Marker placed in front of each token list.
        eos_token: Marker placed at the end of each token list.

    Returns:
        dict with the keys "en_tokens" and "de_tokens", each a list of strings.
    """
    languages = (
        ("en", "en_tokens", en_nlp),
        ("de", "de_tokens", de_nlp),
    )
    result = {}
    for source_key, target_key, nlp in languages:
        pieces = [piece.text for piece in nlp.tokenizer(example[source_key])]
        pieces = pieces[:max_length]
        if lower:
            pieces = [piece.lower() for piece in pieces]
        # Wrap the (possibly truncated) sentence in the start/end markers.
        result[target_key] = [sos_token, *pieces, eos_token]
    return result

In [29]:
# Tokenisation settings shared by all three splits.
max_length = 1000
lower = True
sos_token = "<sos>"
eos_token = "<eos>"

# Keyword arguments forwarded to tokenize_example for every example.
fn_kwargs = dict(
    en_nlp=en_nlp,
    de_nlp=de_nlp,
    max_length=max_length,
    lower=lower,
    sos_token=sos_token,
    eos_token=eos_token,
)

# Apply tokenize_example to each split via its `.map` method.
train_data = train_data.map(
    tokenize_example,
    fn_kwargs=fn_kwargs,
)
valid_data = valid_data.map(
    tokenize_example,
    fn_kwargs=fn_kwargs,
)
test_data = test_data.map(
    tokenize_example,
    fn_kwargs=fn_kwargs,
)

Map:   0%|          | 0/29000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1014 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [31]:
# Vocabulary settings.
min_freq = 2
unk_token = "<unk>"
pad_token = "<pad>"

# The unknown token comes first, followed by padding and the sentence markers.
special_tokens = [unk_token, pad_token, sos_token, eos_token]

# NOTE(review): this cell originally ended with a dangling `en_voa` expression,
# which raises NameError when executed. It was presumably the start of the
# vocabulary construction. TODO: build the vocabularies once
# build_vocab_from_iterator (defined in a later cell) is available.

In [None]:
# Define the Vocab class and the build_vocab_from_iterator function
# (which builds a Vocab from an iterable of token lists).
from collections import Counter, defaultdict

class Vocab:
    """Two-way mapping between tokens and integer indices.

    Tokens missing from the mapping resolve to ``unk_index``; indices missing
    from the mapping resolve to ``unk_token``.
    """

    def __init__(self, token_to_index, unk_token="<unk>"):
        """Store the forward mapping and derive the reverse one.

        Args:
            token_to_index (dict): Maps each token to its index; must contain ``unk_token``.
            unk_token (str): Token whose index is used for unknown tokens.

        Raises:
            KeyError: If ``unk_token`` is not a key of ``token_to_index``.
        """
        self.token_to_index = token_to_index
        reverse = {}
        for tok, position in token_to_index.items():
            reverse[position] = tok
        self.index_to_token = reverse
        self.unk_token = unk_token
        self.unk_index = token_to_index[unk_token]

    def __len__(self):
        """Number of entries in the token-to-index mapping."""
        return len(self.token_to_index)

    def __getitem__(self, token):
        """Index of ``token``, or ``unk_index`` when the token is not in the mapping."""
        fallback = self.unk_index
        return self.token_to_index.get(token, fallback)

    def token_to_idx(self, token):
        """Named alias for ``vocab[token]``."""
        return self[token]

    def idx_to_token(self, idx):
        """Token stored at ``idx``, or ``unk_token`` when the index is not in the mapping."""
        fallback = self.unk_token
        return self.index_to_token.get(idx, fallback)

    def get_itos(self):
        """Return the tokens as a list whose positions are the token indices.

        Positions with no token assigned are filled with ``unk_token``.
        """
        highest = max(self.index_to_token)
        ordered = []
        for position in range(highest + 1):
            ordered.append(self.index_to_token.get(position, self.unk_token))
        return ordered

    def get_stoi(self):
        """Return the token-to-index dictionary (the stored object, not a copy)."""
        return self.token_to_index

    def set_default_index(self, index):
        """Change the index returned for tokens that are not in the mapping."""
        self.unk_index = index

    def lookup_indices(self, tokens):
        """Convert a list of tokens into the list of their indices."""
        return list(map(self.token_to_idx, tokens))

    def lookup_tokens(self, indices):
        """Convert a list (or tensor) of indices into the list of their tokens."""
        # Tensors are turned into plain Python lists before the lookup.
        as_list = indices.tolist() if torch.is_tensor(indices) else indices
        return list(map(self.idx_to_token, as_list))


def build_vocab_from_iterator(iterator, min_freq=1, specials=None):
    """
    Builds a vocabulary from an iterator.

    Special tokens receive the lowest indices, in the order given. Regular
    tokens follow in first-seen order, provided they occur at least
    ``min_freq`` times. Indices are always contiguous, starting at 0.

    Args:
    - iterator (iterable): An iterable yielding lists of tokens.
    - min_freq (int): The minimum frequency a token must have to be included in the vocabulary.
    - specials (list): A list of special tokens (e.g., ['<unk>', '<pad>', '<sos>', '<eos>']).
      The first entry is treated as the unknown token; when no specials are
      given, "<unk>" is appended to the vocabulary and used instead.

    Returns:
    - vocab (Vocab): A custom Vocab object containing the token-to-index mapping.
    """
    # Count how often each token occurs across the whole iterator.
    counter = Counter()
    for tokens in iterator:
        counter.update(tokens)

    token_to_index = {}

    # Assign specials by current size rather than by enumerate position:
    # a duplicated special would otherwise leave an index gap, and the
    # size-based ids handed out to regular tokens would then collide with
    # an existing special.
    for token in specials or ():
        if token not in token_to_index:
            token_to_index[token] = len(token_to_index)

    # Add regular tokens that meet the frequency threshold.
    for token, freq in counter.items():
        if freq >= min_freq and token not in token_to_index:
            token_to_index[token] = len(token_to_index)

    # Guarantee the unknown token is present so Vocab can resolve its index.
    unk_token = specials[0] if specials else "<unk>"
    if unk_token not in token_to_index:
        token_to_index[unk_token] = len(token_to_index)

    return Vocab(token_to_index, unk_token=unk_token)

# Example usage: build a toy vocabulary from two short token lists and inspect it.
tokens = [
    ["i", "love", "pizza"],
    ["i", "hate", "music", "videos"],
]
specials = [
    "<unk>",
    "<pad>",
    "<sos>",
    "<eos>",
]
vocab = build_vocab_from_iterator(iterator=tokens, min_freq=1, specials=specials)
print(vocab.token_to_index)
# The four specials take indices 0-3, so index 6 is the third regular token ("pizza").
print(vocab.idx_to_token(6))

