## Loading the Dataset

In [2]:
# Read the complete text file into a single string; `raw_text` is the
# input for the tokenization cells that follow.
with open("data/the-verdict.txt", mode='r', encoding='utf-8') as source_file:
    raw_text = source_file.read()

# Quick sanity check: total length, then a preview of the first 100 characters.
print(f"num characters: {len(raw_text)}")
raw_text[:100]

num characters: 20479


'I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no g'

## Tokenizing

In [52]:
import re

# Split on whitespace, "--" and single punctuation characters. The capturing
# group makes re.split keep the delimiters, so punctuation becomes tokens too.
# The pattern is a raw string (not an f-string) so that "\s" is handed to the
# regex engine as-is instead of being parsed as an invalid string escape.
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
# Drop the empty and whitespace-only pieces the split leaves behind.
preprocessed = [word for word in preprocessed if word.strip()]

# Vocabulary: the unique tokens in sorted order, followed by the two special
# tokens so that they receive the highest ids.
all_words = sorted(set(preprocessed))
all_words.extend(['<|endoftext|>', '<|unk|>'])
print(all_words[:10])
print(f"Vocab size: {len(all_words)}")

# Lookup tables: token string -> id and id -> token string.
stoi = {word: i for i, word in enumerate(all_words)}
itos = {i: word for i, word in enumerate(all_words)}



['!', '"', "'", '(', ')', ',', '--', '.', ':', ';']
Vocab size: 1132


In [53]:
class Tokenizer:
    """Word-level tokenizer backed by a fixed vocabulary.

    Text is split on whitespace, on ``--`` and on the punctuation characters
    ``, . : ; ? _ ! " ( ) '``. Every piece is mapped to its position in
    ``vocab``; pieces that are not in the vocabulary are encoded as the
    ``<|unk|>`` token.
    """

    # Split rule for encode(). It has to match the rule used to build the
    # vocabulary, which also splits on "_"; without "_" here, input such as
    # "_word_" stays one piece and is never found in the vocabulary.
    _SPLIT_PATTERN = re.compile(r"([,.:;?_!\"()']|--|\s)")
    # Matches the whitespace that ' '.join() puts in front of punctuation.
    _SPACE_BEFORE_PUNCT = re.compile(r'\s+([,.?;:\'"!()]|--)')
    _UNKNOWN_TOKEN = "<|unk|>"

    def __init__(self, vocab):
        """Build both lookup tables from ``vocab``.

        Args:
            vocab: iterable of token strings; a token's id is its position.
        """
        self.stoi = {word: i for i, word in enumerate(vocab)}
        self.itos = {i: word for i, word in enumerate(vocab)}

    def encode(self, text):
        """Convert ``text`` into a list of token ids.

        Args:
            text: the string to tokenize.

        Returns:
            A list of integer ids, one per token; empty for empty or
            whitespace-only input.

        Raises:
            KeyError: if a piece is not in the vocabulary and the vocabulary
                has no ``<|unk|>`` entry to fall back to.
        """
        stripped = (piece.strip() for piece in self._SPLIT_PATTERN.split(text))
        tokens = [piece for piece in stripped if piece]
        return [
            self.stoi[token if token in self.stoi else self._UNKNOWN_TOKEN]
            for token in tokens
        ]

    def decode(self, ids):
        """Convert a sequence of token ids back into a string.

        Tokens are joined with single spaces, then the space in front of
        punctuation tokens (and ``--``) is removed.

        Args:
            ids: iterable of integer token ids.

        Returns:
            The reconstructed text.

        Raises:
            KeyError: if an id is not present in the vocabulary.
        """
        text = ' '.join(self.itos[idx] for idx in ids)
        return self._SPACE_BEFORE_PUNCT.sub(r'\1', text)
    

# Round-trip demo: join two unrelated snippets with the end-of-text marker,
# encode the result to ids and decode it back. Pieces that are not in the
# vocabulary come back as "<|unk|>".
tokenizer = Tokenizer(all_words)
text1 = "Hello, do you like football"
text2 = "In the sunlut terraces of the palace."
text = " <|endoftext|> ".join([text1, text2])
token_ids = tokenizer.encode(text)
tokenizer.decode(token_ids)



'<|unk|>, do you like <|unk|> <|endoftext|> In the <|unk|> terraces of the <|unk|>.'

## Byte-Pair Encoding (BPE)