In [4]:
import os
import urllib.request

# Single source of truth for the local copy of the corpus.
file_path = "the-verdict.txt"

# Download the short story once; later runs reuse the file already on disk.
if not os.path.exists(file_path):
    url = ("https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/refs/heads/main/"
           "ch02/01_main-chapter-code/the-verdict.txt")
    urllib.request.urlretrieve(url, file_path)

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default text encoding.
with open(file_path, "r", encoding="utf-8") as f:
    raw_text = f.read()

# Report the size of the corpus in characters
print("The total number of characters in the text is:", len(raw_text))

The total number of characters in the text is: 20479


In [5]:
import re

# Tokenize by splitting on punctuation, double dashes, and whitespace.
# The capture group keeps the separators in the result; whitespace-only and
# empty pieces are then dropped.
text = "Hello, world.is this --a test?"
result = [
    piece.strip()
    for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text)
    if piece.strip()
]
print(result)

# Apply the same tokenization to the whole corpus.
preprocessed = [
    piece.strip()
    for piece in re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
    if piece.strip()
]
print("The total number of tokens in the text is:", len(preprocessed))

['Hello', ',', 'world', '.', 'is', 'this', '--', 'a', 'test', '?']
The total number of tokens in the text is: 4690


In [6]:
# Deduplicate the tokens and sort them so the ID assignment is deterministic.
all_words = sorted(set(preprocessed))
print("The total number of unique tokens in the text is:", len(all_words))

# Map every token to its position in the sorted list.
vocab = {word: i for i, word in enumerate(all_words)}

# Preview the first ten (token, id) pairs.
for item in list(vocab.items())[:10]:
    print(item)

The total number of unique tokens in the text is: 1130
('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)


In [7]:
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed token-to-ID vocabulary.

    Text is split on punctuation, double dashes, and whitespace, and every
    resulting token is looked up in the vocabulary. A token that is not in
    the vocabulary raises ``KeyError``.
    """

    def __init__(self, vocab):
        """Keep ``vocab`` (token -> ID) and build the inverse (ID -> token) map."""
        self.str_to_int = vocab
        self.int_to_str = dict((idx, token) for token, idx in vocab.items())

    def encode(self, text):
        """Convert ``text`` into a list of token IDs.

        Raises:
            KeyError: if a token is not present in the vocabulary.
        """
        pieces = re.split(r'([,.:;?_!"()\']|--|\s)', text)

        # Drop empty and whitespace-only pieces left over from the split.
        tokens = []
        for piece in pieces:
            stripped = piece.strip()
            if stripped:
                tokens.append(stripped)

        return [self.str_to_int[token] for token in tokens]

    def decode(self, ids):
        """Convert a sequence of token IDs back into a single string.

        Tokens are joined with single spaces, then the whitespace in front of
        , . ? ! " ( ) ' is removed.
        """
        joined = " ".join(self.int_to_str[token_id] for token_id in ids)
        return re.sub(r'\s+([,.?!"()\'])', r'\1', joined)

In [8]:
# Build a tokenizer from the corpus vocabulary.
tokenizer = SimpleTokenizerV1(vocab)

# Sample passage made up of words that occur in the corpus.
text = """"It's the last he painted, you know," 
           Mrs. Gisburn said with pardonable pride."""

# Encode the passage into token IDs and show them.
ids = tokenizer.encode(text)
print(ids)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [9]:
# Map the token IDs back to text. The tokens are re-joined with spaces, so the
# original spacing around quotes and apostrophes is not restored exactly.
tex = tokenizer.decode(ids)
print(tex)

" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


## Adding special context tokens

In [13]:
text = "Hello, world. Is this --a test?"

# This text contains a word the tokenizer's vocabulary has no ID for, so
# encode() raises KeyError. Catch and display the error so the demonstration
# does not stop a top-to-bottom run of the notebook.
try:
    ids = tokenizer.encode(text)
    print(ids)
except KeyError as err:
    print("KeyError:", err)

KeyError: 'Hello'

## What Caused This Error?

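The error occurred because the token "Hello" does not appear in our training corpus, so it is missing from the vocabulary and the tokenizer has no token ID to map it to during encoding. (Tokens are stripped of surrounding whitespace before the lookup, so the missing key is "Hello" itself, as the `KeyError` shows.)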

## 🛠️ How to Address It

As a simple workaround, we can extend the tokenizer into a new `SimpleTokenizerV2` class that maps out-of-vocabulary words to a special `<unk>` token.

However, a more robust and scalable solution is to use the `tiktoken` package, which implements Byte Pair Encoding (BPE). BPE tokenizers can effectively handle unseen words by breaking them down into known subword units, avoiding the need for hardcoded fallback tokens like `<unk>`.

In [36]:
# Start from the sorted unique corpus tokens and append the two special
# tokens at the end, so they receive the highest IDs.
all_tokens = sorted(set(preprocessed))
all_tokens += ["<unk>", "<endoftext>"]

# Assign each token its position in the list as its ID.
vocab = {token: token_id for token_id, token in enumerate(all_tokens)}
len_vocab = len(vocab)
print("The total number of tokens in the vocabulary is:", len_vocab)

The total number of tokens in the vocabulary is: 1132


In [37]:
# Show the last five vocabulary entries, which include the special tokens.
for item in list(vocab.items())[-5:]:
    print(item)
    

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<unk>', 1130)
('<endoftext>', 1131)


In [38]:
class SimpleTokenizerV2:
    """Word-level tokenizer that maps unknown tokens to ``"<unk>"``.

    Text is split on punctuation, double dashes, and whitespace. Tokens found
    in the vocabulary are encoded with their own ID; every other token is
    encoded with the ID of ``"<unk>"``.
    """

    def __init__(self, vocab):
        """Keep ``vocab`` (token -> ID) and build the inverse (ID -> token) map."""
        self.str_to_int = vocab
        self.int_to_str = dict((idx, token) for token, idx in vocab.items())

    def encode(self, text):
        """Convert ``text`` into a list of token IDs.

        Raises:
            KeyError: if an unknown token is encountered and ``"<unk>"`` itself
                is not in the vocabulary.
        """
        ids = []
        for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text):
            token = piece.strip()
            if not token:
                # Empty or whitespace-only piece left over from the split.
                continue
            if token not in self.str_to_int:
                token = "<unk>"
            ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids):
        """Convert a sequence of token IDs back into a single string.

        Tokens are joined with single spaces, then the whitespace in front of
        , . ? ! " ( ) ' is removed.
        """
        joined = " ".join(self.int_to_str[token_id] for token_id in ids)
        return re.sub(r'\s+([,.?!"()\'])', r'\1', joined)

In [39]:
# Rebuild the tokenizer on the extended vocabulary, which includes "<unk>".
tokenizer = SimpleTokenizerV2(vocab)
text = "Hello, It's the last he painted, you know,Mrs. Gisburn said with pardonable pride."
# Words missing from the vocabulary are encoded with the "<unk>" ID instead of
# raising KeyError.
ids = tokenizer.encode(text)
print(ids)
# Decoding shows "<unk>" wherever an unknown word was encoded.
tex = tokenizer.decode(ids)
print(tex)

[1130, 5, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 67, 7, 38, 851, 1108, 754, 793, 7]
<unk>, It' s the last he painted, you know, Mrs. Gisburn said with pardonable pride.


In [None]:
import tiktoken

# Display the installed tiktoken version as the cell's output.
tiktoken.__version__


'0.9.0'