In [48]:
import urllib.request as request
import os
import re
import string

In [7]:
# Download the sample text once and reuse the local copy on later runs.
# The existence check must test the same path the download writes to (and that
# the notebook reads from afterwards); checking a different path would either
# re-download on every run or skip the download while the file is still missing.
file_path = 'theverdict.txt'
if not os.path.exists(file_path):
    url = ("https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/refs/heads/main/ch02/01_main-chapter-code/the-verdict.txt")
    request.urlretrieve(url, file_path)

In [29]:
# Read the whole file into a single string, decoding it as UTF-8.
with open('theverdict.txt','r', encoding='utf-8') as file:
    text = file.read()

In [95]:
# Lower-case the text so that tokens differing only in case collapse into one.
text = text.lower()

In [96]:
# Show the regex-escaped form of the ASCII punctuation characters; this string
# is what gets placed inside the character class of the tokenizer pattern.
re.escape(string.punctuation)

'!"\\#\\$%\\&\'\\(\\)\\*\\+,\\-\\./:;<=>\\?@\\[\\\\\\]\\^_`\\{\\|\\}\\~'

In [97]:
# Pattern with two alternatives: a single punctuation character (captured, so
# re.split keeps it in the result) or a single whitespace character (not
# captured, so it is dropped from the result).
escaped_punct = re.escape(string.punctuation)
tokenizer = re.compile(rf"([{escaped_punct}])|(?:\s)")

In [98]:
# Split the text on punctuation and whitespace. Punctuation characters are kept
# as their own items (capturing group); a whitespace match contributes a None
# item because the capturing group did not take part in that match, and
# adjacent delimiters leave '' items between them.
tokens = tokenizer.split(text)

In [99]:
# Number of items returned by the split (duplicates, None and '' still included).
len(tokens)

9505

In [100]:
# Remove duplicates. A set has no defined order, so the order of this list is
# arbitrary until the tokens are sorted.
tokens = list(set(tokens))

In [101]:
# Drop the None placeholder left by whitespace matches, then order the
# remaining tokens so that the ids derived from positions are deterministic.
tokens = sorted(tok for tok in tokens if tok is not None)


In [102]:
# Number of distinct tokens after dropping None (the empty string '' is still counted).
len(tokens)

1086

In [103]:
# Map every token to its position in the sorted token list (token -> integer id).
vocab = dict(zip(tokens, range(len(tokens))))

In [138]:
# NOTE(review): this stores an int key (-1) with a str value, the reverse of the
# token -> id entries that `vocab` is built with; it looks like leftover
# experimentation. No visible cell adds a '<unk>' key, although
# SimpleTokenizer.encode looks one up for unknown tokens — confirm how the
# unknown token is meant to be registered.
vocab[-1] = 'ha'

In [140]:
# Read back the value stored under the integer key -1.
vocab[-1]

'ha'

In [154]:
class SimpleTokenizer:
    """Word- and punctuation-level tokenizer backed by a fixed vocabulary.

    Text is lower-cased and split on whitespace and on single ASCII
    punctuation characters (each punctuation character becomes its own
    token). Tokens that are not in the vocabulary are encoded as the id of
    ``UNK_TOKEN``; ids that are not in the vocabulary are decoded as
    ``UNK_TOKEN``.
    """

    UNK_TOKEN = '<unk>'

    def __init__(self, vocab):
        """Build the token<->id lookup tables.

        Args:
            vocab: Mapping of token string -> integer id. Entries whose key
                is not a string are ignored. The mapping is copied, never
                modified or aliased.

        If ``vocab`` has no ``'<unk>'`` entry, one is added to the tokenizer's
        private copy using the first unused id counting down from -1, so it
        cannot collide with an existing token id.
        """
        # Private copy restricted to real token entries; stray non-token keys
        # would otherwise leak into id_to_token as bogus "ids".
        self.token_to_id = {
            token: token_id
            for token, token_id in vocab.items()
            if isinstance(token, str)
        }

        if self.UNK_TOKEN not in self.token_to_id:
            used_ids = set(self.token_to_id.values())
            unk_id = -1
            while unk_id in used_ids:
                unk_id -= 1
            self.token_to_id[self.UNK_TOKEN] = unk_id
        self.unk_id = self.token_to_id[self.UNK_TOKEN]

        self.id_to_token = {
            token_id: token for token, token_id in self.token_to_id.items()
        }
        self.tokenizer = re.compile(rf"([{re.escape(string.punctuation)}])|(?:\s)")

    def encode(self, input_text):
        """Convert text into a list of token ids.

        Args:
            input_text: The string to tokenize.

        Returns:
            One id per token, in order; out-of-vocabulary tokens map to the
            id of ``UNK_TOKEN``. Empty input yields an empty list.
        """
        pieces = self.tokenizer.split(input_text.lower().strip())
        # re.split yields None for whitespace matches (the capturing group did
        # not participate) and '' between adjacent delimiters; neither is a token.
        return [self.token_to_id.get(piece, self.unk_id) for piece in pieces if piece]

    def decode(self, input_token_ids: list[int]):
        """Convert token ids back into a space-joined string.

        Args:
            input_token_ids: Ids to decode.

        Returns:
            The tokens joined by single spaces; ids that are not in the
            vocabulary are rendered as ``UNK_TOKEN``.
        """
        return ' '.join(
            self.id_to_token.get(token_id, self.UNK_TOKEN)
            for token_id in input_token_ids
        )
        

In [155]:
# Display every key currently in the vocabulary (the first key shown in the
# recorded output is the empty string '').
vocab.keys()

dict_keys(['', '!', '"', "'", '(', ')', ',', '-', '.', ':', ';', '?', '_', 'a', 'abdication', 'able', 'about', 'above', 'abruptly', 'absolute', 'absorbed', 'absurdity', 'academic', 'accuse', 'accustomed', 'across', 'activity', 'add', 'added', 'admirers', 'adopted', 'adulation', 'advance', 'aesthetic', 'affect', 'afraid', 'after', 'afterward', 'again', 'ago', 'ah', 'air', 'alive', 'all', 'almost', 'alone', 'along', 'always', 'am', 'amazement', 'amid', 'among', 'amplest', 'amusing', 'an', 'and', 'another', 'answer', 'answered', 'any', 'anything', 'anywhere', 'apparent', 'apparently', 'appearance', 'appeared', 'appointed', 'are', 'arm', 'arms', 'arrt', 'art', 'articles', 'artist', 'as', 'aside', 'asked', 'at', 'atmosphere', 'atom', 'attack', 'attention', 'attitude', 'audacities', 'away', 'awful', 'axioms', 'azaleas', 'back', 'background', 'balance', 'balancing', 'balustraded', 'basking', 'bath', 'be', 'beaming', 'bean', 'bear', 'beard', 'beauty', 'became', 'because', 'becoming', 'bed', 'b

In [156]:
# Instantiate the tokenizer class over the vocabulary.
# NOTE(review): this rebinds `tokenizer`, which earlier held the compiled regex
# used by `tokens = tokenizer.split(text)`; re-running that cell after this one
# would fail because SimpleTokenizer defines no `split` method.
tokenizer = SimpleTokenizer(vocab=vocab)

In [157]:
# Encode a short sentence into token ids; words missing from the vocabulary
# fall back to the '<unk>' entry.
tokenizer.encode('what is happening here')

[1045, 507, -1, 460]

In [158]:
# Decode a list of ids back into text; the tokens are joined with single spaces.
tokenizer.decode([1045, 507, -1, 460])

'what is <unk> here'