## Dataset Load

In [1]:
from datasets import load_dataset


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the English-German ("en-de") configuration of the IWSLT 2017 dataset.
dataset = load_dataset("iwslt2017", "iwslt2017-en-de")

# Keep a handle on each split of the dataset.
train_data = dataset["train"]
valid_data = dataset["validation"]
test_data = dataset["test"]

# A single training record; its English text lives under ["translation"]["en"].
example = train_data[0]

# Number of leading training sentences used as the toy corpus for this notebook.
NUM_CORPUS_SENTENCES = 5

# The corpus is the English side of the first few training examples.
corpus = [
    train_data[i]["translation"]["en"] for i in range(NUM_CORPUS_SENTENCES)
]

print(corpus)



['Thank you so much, Chris.', "And it's truly a great honor to have the opportunity to come to this stage twice; I'm extremely grateful.", 'I have been blown away by this conference, and I want to thank all of you for the many nice comments about what I had to say the other night.', 'And I say that sincerely, partly because  I need that.', 'Put yourselves in my position.']


## PreTokenizer
We use the GPT-2 tokenizer to pre-tokenize the corpus into words.

In [3]:
from transformers import AutoTokenizer

# Load the pretrained "gpt2" tokenizer; it is used below to tokenize each corpus sentence.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [4]:
from collections import defaultdict

# Count how often each pre-token (word) occurs in the corpus.
word_freqs = defaultdict(int)

for text in corpus:
    # Run ONLY the pre-tokenization step. `tokenizer.tokenize` would apply the
    # full GPT-2 BPE model and break rarer words into sub-word pieces, which
    # is not what we want as the starting point for training our own merges.
    # `pre_tokenize_str` returns a list of (word, (start, end)) tuples.
    words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    for word, _offsets in words_with_offsets:
        word_freqs[word] += 1

print(word_freqs)

defaultdict(<class 'int'>, {'Thank': 1, 'Ġyou': 2, 'Ġso': 1, 'Ġmuch': 1, ',': 3, 'ĠChris': 1, '.': 5, 'And': 2, 'Ġit': 1, "'s": 1, 'Ġtruly': 1, 'Ġa': 1, 'Ġgreat': 1, 'Ġhonor': 1, 'Ġto': 5, 'Ġhave': 2, 'Ġthe': 3, 'Ġopportunity': 1, 'Ġcome': 1, 'Ġthis': 2, 'Ġstage': 1, 'Ġtwice': 1, ';': 1, 'ĠI': 5, "'m": 1, 'Ġextremely': 1, 'Ġgrateful': 1, 'I': 1, 'Ġbeen': 1, 'Ġblown': 1, 'Ġaway': 1, 'Ġby': 1, 'Ġconference': 1, 'Ġand': 1, 'Ġwant': 1, 'Ġthank': 1, 'Ġall': 1, 'Ġof': 1, 'Ġfor': 1, 'Ġmany': 1, 'Ġnice': 1, 'Ġcomments': 1, 'Ġabout': 1, 'Ġwhat': 1, 'Ġhad': 1, 'Ġsay': 2, 'Ġother': 1, 'Ġnight': 1, 'Ġthat': 2, 'Ġsincerely': 1, 'Ġpartly': 1, 'Ġbecause': 1, 'Ġ': 1, 'Ġneed': 1, 'Put': 1, 'Ġyourselves': 1, 'Ġin': 1, 'Ġmy': 1, 'Ġposition': 1})


In [5]:
# Every distinct character appearing in any counted word, as a sorted list.
alphabet = sorted({char for token in word_freqs for char in token})

print(alphabet)

["'", ',', '.', ';', 'A', 'C', 'I', 'P', 'T', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'Ġ']


In [6]:
# Initial vocabulary: the special end-of-text token followed by the base alphabet.
vocab = ["<|endoftext|>", *alphabet]

In [7]:
# Start with every word split into its individual characters.
splits = {token: list(token) for token in word_freqs}
splits

{'Thank': ['T', 'h', 'a', 'n', 'k'],
 'Ġyou': ['Ġ', 'y', 'o', 'u'],
 'Ġso': ['Ġ', 's', 'o'],
 'Ġmuch': ['Ġ', 'm', 'u', 'c', 'h'],
 ',': [','],
 'ĠChris': ['Ġ', 'C', 'h', 'r', 'i', 's'],
 '.': ['.'],
 'And': ['A', 'n', 'd'],
 'Ġit': ['Ġ', 'i', 't'],
 "'s": ["'", 's'],
 'Ġtruly': ['Ġ', 't', 'r', 'u', 'l', 'y'],
 'Ġa': ['Ġ', 'a'],
 'Ġgreat': ['Ġ', 'g', 'r', 'e', 'a', 't'],
 'Ġhonor': ['Ġ', 'h', 'o', 'n', 'o', 'r'],
 'Ġto': ['Ġ', 't', 'o'],
 'Ġhave': ['Ġ', 'h', 'a', 'v', 'e'],
 'Ġthe': ['Ġ', 't', 'h', 'e'],
 'Ġopportunity': ['Ġ', 'o', 'p', 'p', 'o', 'r', 't', 'u', 'n', 'i', 't', 'y'],
 'Ġcome': ['Ġ', 'c', 'o', 'm', 'e'],
 'Ġthis': ['Ġ', 't', 'h', 'i', 's'],
 'Ġstage': ['Ġ', 's', 't', 'a', 'g', 'e'],
 'Ġtwice': ['Ġ', 't', 'w', 'i', 'c', 'e'],
 ';': [';'],
 'ĠI': ['Ġ', 'I'],
 "'m": ["'", 'm'],
 'Ġextremely': ['Ġ', 'e', 'x', 't', 'r', 'e', 'm', 'e', 'l', 'y'],
 'Ġgrateful': ['Ġ', 'g', 'r', 'a', 't', 'e', 'f', 'u', 'l'],
 'I': ['I'],
 'Ġbeen': ['Ġ', 'b', 'e', 'e', 'n'],
 'Ġblown': ['Ġ', 'b', '

In [8]:
def compute_pair_frequs(splits):
    """Count adjacent symbol pairs across all words, weighted by word frequency.

    Iterates over the module-level ``word_freqs`` mapping (word -> count) and,
    for each word, walks the consecutive symbols in ``splits[word]``.

    Args:
        splits: Mapping from each word in ``word_freqs`` to its current
            sequence of symbols.

    Returns:
        A ``defaultdict(int)`` mapping each ``(left, right)`` pair of adjacent
        symbols to the summed frequency of the words containing it (a pair
        that occurs several times in one word is counted once per occurrence).
    """
    totals = defaultdict(int)
    for token, count in word_freqs.items():
        symbols = splits[token]
        # zip() yields nothing for a word made of a single symbol, so such
        # words contribute no pairs.
        for left, right in zip(symbols, symbols[1:]):
            totals[(left, right)] += count
    return totals


