Step 1: Creating tokens

In [None]:
# Read the whole text file into memory as a single string.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Sanity check: overall size and the opening characters of the text.
print("Total number of characters: ", len(raw_text))
print(raw_text[:199])

In [None]:
import re

text = "Hello, world. This is a test."
# The capture group makes split() keep every whitespace separator as its own
# list element instead of discarding it.
result = re.compile(r'(\s)').split(text)
print(result)

In [None]:
text = "Hello, world. This is a test."
# Commas and periods are now separators as well, so they come out as
# standalone elements (with empty strings where two separators are adjacent).
result = re.split(pattern=r'([.,]|\s)', string=text)
print(result)

In [None]:
# Discard the empty and whitespace-only pieces left over from the split.
result = [piece for piece in result if piece.strip() != ""]
print(result)

In [None]:
# Delete every character that is not an ASCII letter, a digit, or whitespace.
result = re.sub(pattern=r'[^a-zA-Z0-9\s]', repl="", string=text)
print(result)


Basic tokenizer

In [None]:
text = "Hello, world. This is _-- a test?"
# Use the same separator set that is applied to raw_text and inside the
# tokenizer classes (including '!' and '"'), so this demo shows exactly the
# pattern that is used for real. "--" is treated as a single token.
result = re.split(r'([.,:;?_()!"\']|--|\s)', text)
# Drop the empty and whitespace-only pieces; only real tokens remain.
result = [item for item in result if item.strip()]
print(result)

In [None]:

# Tokenize the full text: split on punctuation, "--" and whitespace, keeping
# the separators, and drop the empty / whitespace-only pieces in the same pass.
preprocessed = [
    piece
    for piece in re.split(r'([.,:;?_()!"\']|--|\s)', raw_text)
    if piece.strip()
]
print(preprocessed[:30])

In [None]:
# Total number of tokens kept after splitting raw_text.
len(preprocessed)

Step 2: Creating token IDs

In [None]:
# Deduplicate the tokens, then order them so that id assignment is
# deterministic.
all_words = list(set(preprocessed))
all_words.sort()
vocab_size = len(all_words)

print(vocab_size)

In [None]:
# Map each token to its position in the sorted token list.
vocab = dict(zip(all_words, range(len(all_words))))

In [None]:
# Show the full token -> integer id mapping.
print(vocab)

In [None]:
class SimpleTokenizerV1:
    """Tokenizer that maps text to integer ids using a fixed vocabulary.

    ``vocab`` is a mapping from token string to integer id. Tokens that are
    not present in the vocabulary are not handled: ``encode`` raises
    ``KeyError`` for them.
    """

    def __init__(self, vocab):
        """Store the token -> id mapping and build the inverse id -> token one."""
        self.str_to_int = vocab
        self.int_to_str = dict(zip(vocab.values(), vocab.keys()))

    def encode(self, text):
        """Split ``text`` into tokens and return the list of their ids.

        Punctuation characters and "--" become tokens of their own;
        whitespace only separates tokens and is discarded.

        Raises:
            KeyError: if a token is missing from the vocabulary.
        """
        ids = []
        for piece in re.split(r'([.,:;?_()!"\']|--|\s)', text):
            # Skip the empty / whitespace-only fragments produced by the split.
            if not piece.strip():
                continue
            ids.append(self.str_to_int[piece])
        return ids

    def decode(self, ids):
        """Turn a sequence of ids back into a single string.

        Tokens are joined with single spaces, then the whitespace in front of
        the punctuation characters listed in the pattern is removed.

        Raises:
            KeyError: if an id is not part of the vocabulary.
        """
        tokens = [self.int_to_str[i] for i in ids]
        spaced = " ".join(tokens)
        return re.sub(r'\s+([.,:;?_()!"\'])', r'\1', spaced)

In [None]:
tokenizer = SimpleTokenizerV1(vocab)
# Sample passage, spelled out with explicit escapes so the leading/trailing
# whitespace of every line (including the space after "know,") is visible.
text = (
    "\n"
    "    It's the last he painted, you know, \n"
    "    Mrs. Gisburn said with pardonable pride\n"
    "    "
)

In [None]:
# Convert the sample text into a list of vocabulary ids.
ids = tokenizer.encode(text)
print(ids)

In [None]:
# Round trip: convert the ids back into a string.
tokenizer.decode(ids)

In [None]:
tokenizer = SimpleTokenizerV1(vocab)
text = """
    Hello, do you like tea?
    """
# SimpleTokenizerV1 looks every token up directly in the vocabulary, so a
# token that is missing from it raises KeyError. Catch the error so the
# limitation is demonstrated without aborting a top-to-bottom run.
try:
    print(tokenizer.encode(text))
except KeyError as err:
    print("KeyError - token not in vocabulary:", err)

Adding Special Context Tokens

In [None]:
# Sorted unique tokens followed by the two special tokens, so the special
# tokens receive the two highest ids.
all_tokens = sorted(set(preprocessed)) + ["<|endoftext|>", "<|unk|>"]
vocab = dict((token, integer) for integer, token in enumerate(all_tokens))

In [None]:
# Look at the last five entries; the special tokens were appended at the end.
all_tokens[-5:]

In [None]:
# Number of entries in the extended vocabulary.
len(vocab)

In [None]:
class SimpleTokenizerV2:
    """Vocabulary-based tokenizer with a fallback for unknown tokens.

    ``vocab`` is a mapping from token string to integer id. During encoding,
    any token that is not a key of the vocabulary is replaced by the
    ``"<|unk|>"`` token, which the vocabulary is expected to contain.
    """

    def __init__(self, vocab):
        """Store the token -> id mapping and build the inverse id -> token one."""
        self.str_to_int = vocab
        self.int_to_str = dict(zip(vocab.values(), vocab.keys()))

    def encode(self, text):
        """Split ``text`` into tokens and return the list of their ids.

        Punctuation characters and "--" become tokens of their own;
        whitespace only separates tokens and is discarded.

        Raises:
            KeyError: if an unknown token is met and ``"<|unk|>"`` itself is
                not in the vocabulary.
        """
        table = self.str_to_int
        ids = []
        for piece in re.split(r'([.,:;?_()!"\']|--|\s)', text):
            # Skip the empty / whitespace-only fragments produced by the split.
            if not piece.strip():
                continue
            # Out-of-vocabulary tokens fall back to the unknown-token entry.
            key = piece if piece in table else "<|unk|>"
            ids.append(table[key])
        return ids

    def decode(self, ids):
        """Turn a sequence of ids back into a single string.

        Tokens are joined with single spaces, then the whitespace in front of
        the punctuation characters listed in the pattern is removed.

        Raises:
            KeyError: if an id is not part of the vocabulary.
        """
        tokens = [self.int_to_str[i] for i in ids]
        spaced = " ".join(tokens)
        return re.sub(r'\s+([.,:;?_()!"\'])', r'\1', spaced)

In [None]:
tokenizer = SimpleTokenizerV2(vocab)
text1 = """
    Hello, do you like tea?
    """
text2 = "In the sunlit terraces of the palace."
# Put the end-of-text marker between the two independent text samples.
text = text1 + " <|endoftext|> " + text2
print(text)

In [None]:
# Ids for the combined text; SimpleTokenizerV2 substitutes "<|unk|>" for any
# token that is not in the vocabulary.
print(tokenizer.encode(text))

In [None]:
# Round trip: encode, then decode, to see which tokens came back as "<|unk|>".
print(tokenizer.decode(tokenizer.encode(text)))

BYTE PAIR ENCODING (BPE)