## Alternative Vocabulary Options

Instead of using only tokens from "the-verdict.txt", you can use more comprehensive English vocabularies:

### Option 1: NLTK Words Corpus
```python
import nltk
# Request the 'words' resource from NLTK before loading it below
nltk.download('words')
from nltk.corpus import words
# Collect the corpus entries into a set (duplicates removed, fast membership tests)
english_words = set(words.words())
```

### Option 2: Pre-trained Tokenizers
- **tiktoken** (OpenAI's tokenizer) - already in requirements.txt
- **Hugging Face tokenizers** - BPE, WordPiece, SentencePiece
- **spaCy** tokenizers

### Option 3: Large Text Datasets
- Common Crawl
- Wikipedia dumps
- Project Gutenberg
- Google Books N-grams

### Option 4: Word Frequency Lists
- Google's 10,000 most common English words
- Brown Corpus vocabulary
- Oxford English Dictionary word lists

In [105]:
# Read the whole training text into memory as a single UTF-8 decoded string;
# the context manager closes the file as soon as the read finishes.
with open("./files/the-verdict.txt", "r", encoding="utf-8") as file:
    raw_text = file.read()

In [106]:
# Length of the loaded text in characters
len(raw_text)

20479

**Tokenizing text**

In [107]:
import re

# Demonstration of punctuation-aware splitting on a short sample sentence.
text = "Hello world! This is a test."

# Split on whitespace and on the punctuation marks . , !
# The capturing group makes re.split keep the delimiters in the output,
# so punctuation survives as separate tokens. (An earlier whitespace-only
# split was removed: its result was overwritten before being used.)
result = re.split(r'(\s|[.,!])', text)

# Drop the empty strings and whitespace-only pieces produced by the split
result = [token for token in result if token.strip()]
result

['Hello', 'world', '!', 'This', 'is', 'a', 'test', '.']

In [108]:
# NOTE: this cell is a bare string literal, not executable code. The NLTK
# snippet inside it is NOT run; the notebook only echoes the string back.
'''import nltk
nltk.download('words')
from nltk.corpus import words
english_words = set(words.words())'''

"import nltk\nnltk.download('words')\nfrom nltk.corpus import words\nenglish_words = set(words.words())"

In [109]:
# Tokenize the full text: split on "--", punctuation, whitespace and single
# hyphens (delimiters are kept thanks to the capturing group), then strip
# every piece and discard the ones that end up empty.
result = [
    stripped
    for stripped in map(str.strip, re.split(r'(--|[,.;:?_!"()\'\s-])', raw_text))
    if stripped
]
result


['I',
 'HAD',
 'always',
 'thought',
 'Jack',
 'Gisburn',
 'rather',
 'a',
 'cheap',
 'genius',
 '--',
 'though',
 'a',
 'good',
 'fellow',
 'enough',
 '--',
 'so',
 'it',
 'was',
 'no',
 'great',
 'surprise',
 'to',
 'me',
 'to',
 'hear',
 'that',
 ',',
 'in',
 'the',
 'height',
 'of',
 'his',
 'glory',
 ',',
 'he',
 'had',
 'dropped',
 'his',
 'painting',
 ',',
 'married',
 'a',
 'rich',
 'widow',
 ',',
 'and',
 'established',
 'himself',
 'in',
 'a',
 'villa',
 'on',
 'the',
 'Riviera',
 '.',
 '(',
 'Though',
 'I',
 'rather',
 'thought',
 'it',
 'would',
 'have',
 'been',
 'Rome',
 'or',
 'Florence',
 '.',
 ')',
 '"',
 'The',
 'height',
 'of',
 'his',
 'glory',
 '"',
 '--',
 'that',
 'was',
 'what',
 'the',
 'women',
 'called',
 'it',
 '.',
 'I',
 'can',
 'hear',
 'Mrs',
 '.',
 'Gideon',
 'Thwing',
 '--',
 'his',
 'last',
 'Chicago',
 'sitter',
 '--',
 'deploring',
 'his',
 'unaccountable',
 'abdication',
 '.',
 '"',
 'Of',
 'course',
 'it',
 "'",
 's',
 'going',
 'to',
 'send',
 't

In [110]:
# Keep the token list under its own name; `result` is a generic scratch name.
# NOTE(review): `preprossed` is a misspelling of "preprocessed"; it is kept
# because later cells reference this exact name.
preprossed = result
# Total number of tokens (with repeats)
len(preprossed)

4766

## **CONVERTING THE TOKENS INTO TOKEN IDs**

In [111]:
# Unique tokens in sorted order; a token's position in this list is its ID
all_tokens = sorted(set(preprossed))
vocab_size = len(all_tokens)

# token -> ID and ID -> token lookup tables
vocab = dict(zip(all_tokens, range(vocab_size)))
inverse_vocab = {idx: tok for tok, idx in vocab.items()}

# The whole tokenized text expressed as IDs
token_ids = [vocab[tok] for tok in preprossed]
len(vocab)

1140

In [112]:
class SimpleTokenizerV1:
    """
    A simple tokenizer to convert text to a sequence of integer IDs and back.

    Text is split on "--", punctuation, whitespace and single hyphens, using
    the same delimiter set as the split that produced the vocabulary tokens,
    so any text made of vocabulary tokens can be encoded.
    """

    # Delimiters must match the split used when the vocabulary was built.
    # Without ';', ':' and '-' here, input such as "word;" would stay glued
    # together and fail the vocabulary lookup.
    _SPLIT_PATTERN = re.compile(r'(--|[,.;:?_!"()\'\s-])')

    # Whitespace that precedes a punctuation mark which should hug the
    # previous word when text is reassembled.
    _SPACE_BEFORE_PUNCT = re.compile(r'\s+([,.;:?!"()\'])')

    def __init__(self, vocab: dict):
        """
        Initializes the tokenizer with a vocabulary.

        Args:
            vocab (dict): A dictionary mapping strings (tokens) to integer IDs.
        """
        self.str_to_int = vocab
        # Reverse lookup used by decode()
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text: str) -> list[int]:
        """
        Encodes a string of text into a list of integer IDs.

        Args:
            text (str): The input text to encode.

        Returns:
            list[int]: A list of integer IDs representing the text.

        Raises:
            KeyError: If the text contains a token that is not in the vocabulary.
        """
        # The capturing group keeps the delimiters, so punctuation becomes
        # its own token; whitespace-only and empty pieces are discarded.
        pieces = self._SPLIT_PATTERN.split(text)
        tokens = [piece.strip() for piece in pieces if piece.strip()]

        return [self.str_to_int[token] for token in tokens]

    def decode(self, ids: list[int]) -> str:
        """
        Decodes a list of integer IDs back into a string of text.

        Args:
            ids (list[int]): The list of integer IDs to decode.

        Returns:
            str: The decoded text.

        Raises:
            KeyError: If an ID is not present in the vocabulary.
        """
        # Join tokens with single spaces, then remove the space that this
        # puts in front of punctuation marks.
        text = " ".join(self.int_to_str[i] for i in ids)
        return self._SPACE_BEFORE_PUNCT.sub(r'\1', text)

In [113]:
# Tokenizer backed by the vocabulary built from the text above
tokenizer = SimpleTokenizerV1(vocab)

In [114]:
# Encode a short sample phrase into token IDs
text = "the last, painted"
ids = tokenizer.encode(text)
ids

[998, 606, 5, 751]

In [115]:
# Round-trip check: decode the IDs back into text
value = tokenizer.decode(ids)
value

'the last, painted'

## Adding special context tokens for unknown words

In [116]:
# Append the special tokens only when they are missing, so re-running this
# cell does not add duplicates (duplicates would shift the special-token IDs
# and make len(all_tokens) disagree with len(vocab)).
all_tokens.extend(
    [special for special in ('<|endoftext|>', '|unk|') if special not in all_tokens]
)

In [117]:
# Rebuild the lookup tables from the extended token list
vocab_size = len(all_tokens)
vocab = dict(zip(all_tokens, range(vocab_size)))
inverse_vocab = {idx: tok for tok, idx in vocab.items()}

# Re-express the tokenized text as IDs using the rebuilt vocabulary
token_ids = [vocab[tok] for tok in preprossed]

In [118]:
# Vocabulary size after rebuilding it from the extended token list
len(vocab)

1142

In [121]:
class SimpleTokenizerV2:
    """
    An improved tokenizer that handles unknown words with special tokens.

    Text is split on "--", punctuation, whitespace and single hyphens, using
    the same delimiter set as the split that produced the vocabulary tokens.
    Tokens missing from the vocabulary are mapped to the unknown-token ID
    instead of raising.
    """

    # Delimiters must match the split used when the vocabulary was built.
    # Without ';', ':' and '-' here, input such as "word;" would stay glued
    # together and be needlessly reported as unknown.
    _SPLIT_PATTERN = re.compile(r'(--|[,.;:?_!"()\'\s-])')

    # Whitespace that precedes a punctuation mark which should hug the
    # previous word when text is reassembled.
    _SPACE_BEFORE_PUNCT = re.compile(r'\s+([,.;:?!"()\'])')

    def __init__(self, vocab: dict, unk_token: str = "|unk|"):
        """
        Initializes the tokenizer with a vocabulary.

        Args:
            vocab (dict): A dictionary mapping strings (tokens) to integer IDs.
            unk_token (str): Token that stands in for out-of-vocabulary
                tokens. Defaults to "|unk|".

        Raises:
            ValueError: If the unknown token is not in the vocabulary.
        """
        # Validate first so a bad vocabulary fails before any state is set
        if unk_token not in vocab:
            raise ValueError(f"Unknown token '{unk_token}' not found in vocabulary")

        self.str_to_int = vocab
        # Reverse lookup used by decode()
        self.int_to_str = {i: s for s, i in vocab.items()}
        self.unk_token = unk_token
        self.unk_id = vocab[unk_token]

    def encode(self, text: str) -> list[int]:
        """
        Encodes a string of text into a list of integer IDs.
        Unknown tokens are replaced with the unknown-token ID, and a message
        is printed for each replacement.

        Args:
            text (str): The input text to encode.

        Returns:
            list[int]: A list of integer IDs representing the text.
        """
        # The capturing group keeps the delimiters, so punctuation becomes
        # its own token; whitespace-only and empty pieces are discarded.
        pieces = self._SPLIT_PATTERN.split(text)
        tokens = [piece.strip() for piece in pieces if piece.strip()]

        ids = []
        for token in tokens:
            if token in self.str_to_int:
                ids.append(self.str_to_int[token])
            else:
                # Out-of-vocabulary: substitute the unknown ID and report it
                ids.append(self.unk_id)
                print(f"Unknown token '{token}' replaced with '{self.unk_token}'")

        return ids

    def decode(self, ids: list[int]) -> str:
        """
        Decodes a list of integer IDs back into a string of text.

        Args:
            ids (list[int]): The list of integer IDs to decode.

        Returns:
            str: The decoded text.

        Raises:
            KeyError: If an ID is not present in the vocabulary.
        """
        # Join tokens with single spaces, then remove the space that this
        # puts in front of punctuation marks.
        text = " ".join(self.int_to_str[i] for i in ids)
        return self._SPACE_BEFORE_PUNCT.sub(r'\1', text)

In [122]:
# Build the V2 tokenizer on the vocabulary that includes the special tokens
tokenizer2 = SimpleTokenizerV2(vocab)
# Sample input for the unknown-token path; encode() prints a message for
# each token it replaces with |unk|
text = "hii, painted"
ids = tokenizer2.encode(text)
ids

Unknown token 'hii' replaced with '|unk|'


[1141, 5, 751]

## Byte pair encoding for unknown words