In [4]:
import os
import urllib.request

if not os.path.exists("the-verdict.txt"):
    url = ("https://raw.githubusercontent.com/rasbt/"
           "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
           "the-verdict.txt")
    file_path = "the-verdict.txt"
    urllib.request.urlretrieve(url, file_path)

text_file = open('the-verdict.txt', 'r')
raw_text = text_file.read()
text_file.close()

In [2]:
import re

class Tokenizer:
    def __init__(self):
        self.vocab = {}
        self.vocab_size = 0

    def preprocess(self, text):
        # Split text into tokens using regex
        tokens = re.split(r'([,.:;?_!"()\'\-\-\s])', text)
        # Remove empty tokens and strip whitespace
        tokens = [token.strip() for token in tokens if token.strip()]
        return tokens

    def build_vocab(self, tokens):
        # Create a sorted set of unique tokens
        unique_tokens = sorted(set(tokens))
        # Assign an ID to each token
        self.vocab = {token: idx for idx, token in enumerate(unique_tokens)}
        self.vocab_size = len(self.vocab)

    def tokenize(self, text):
        # Preprocess the text
        tokens = self.preprocess(text)
        # Convert tokens to their corresponding IDs
        token_ids = [self.vocab[token] for token in tokens if token in self.vocab]
        return token_ids

In [5]:
tokenizer = Tokenizer()
# Preprocess and build vocabulary
tokens = tokenizer.preprocess(raw_text)
tokenizer.build_vocab(tokens)

# Tokenize the text
token_ids = tokenizer.tokenize(raw_text)

token_ids

[53,
 44,
 149,
 1012,
 57,
 38,
 825,
 115,
 258,
 492,
 6,
 6,
 1011,
 115,
 506,
 440,
 397,
 6,
 6,
 916,
 588,
 1087,
 713,
 512,
 970,
 1025,
 666,
 1025,
 537,
 996,
 5,
 571,
 997,
 540,
 726,
 551,
 502,
 5,
 535,
 518,
 375,
 551,
 752,
 5,
 664,
 115,
 848,
 1111,
 5,
 157,
 402,
 549,
 571,
 115,
 1076,
 731,
 997,
 84,
 7,
 3,
 99,
 53,
 825,
 1012,
 588,
 1129,
 532,
 206,
 85,
 738,
 34,
 7,
 4,
 1,
 93,
 540,
 726,
 551,
 502,
 1,
 6,
 6,
 996,
 1087,
 1099,
 997,
 1121,
 242,
 588,
 7,
 53,
 244,
 537,
 67,
 7,
 37,
 100,
 6,
 6,
 551,
 605,
 25,
 905,
 6,
 6,
 332,
 551,
 1052,
 116,
 7,
 1,
 73,
 302,
 588,
 2,
 858,
 504,
 1025,
 874,
 997,
 1068,
 726,
 701,
 774,
 2,
 1093,
 1061,
 9,
 239,
 53,
 365,
 2,
 979,
 1007,
 726,
 996,
 5,
 66,
 7,
 83,
 6,
 6,
 997,
 649,
 1025,
 16,
 587,
 145,
 53,
 1007,
 726,
 7,
 1,
 93,
 1125,
 5,
 731,
 67,
 7,
 100,
 2,
 858,
 636,
 5,
 697,
 589,
 114,
 855,
 114,
 175,
 1011,
 1003,
 1098,
 834,
 571,
 156,
 394,
 1079,
 726,