### Load Source Data

In [3]:
# Load the complete short story into memory as one string.
with open("verdicttext/the-verdict.txt", mode="r", encoding="utf-8") as story_file:
    raw_text = story_file.read()

# Sanity check: overall length plus the opening passage.
print("Total number of character:", len(raw_text))
print(raw_text[:1000])

Total number of character: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would have been Rome or Florence.)

"The height of his glory"--that was what the women called it. I can hear Mrs. Gideon Thwing--his last Chicago sitter--deploring his unaccountable abdication. "Of course it's going to send the value of my picture 'way up; but I don't think of that, Mr. Rickham--the loss to Arrt is all I think of." The word, on Mrs. Thwing's lips, multiplied its _rs_ as though they were reflected in an endless vista of mirrors. And it was not only the Mrs. Thwings who mourned. Had not the exquisite Hermia Croft, at the last Grafton Gallery show, stopped me before Gisburn's "Moon-dancers" to say, with tears in her eyes: "We shall not look upon its lik

### Test Tokenization

In [7]:
import re

# Split on every whitespace character. The capture group keeps each separator
# in the resulting list instead of discarding it.
text = "Hello, world. This, is a tokenization test."
whitespace_splitter = re.compile(r'(\s)')
result = whitespace_splitter.split(text)
print(result)

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'tokenization', ' ', 'test.']


In [8]:
# Additionally split on a comma or period that is directly followed by a space,
# so "Hello," becomes "Hello" and ", ".
result = re.split(pattern=r'([,.] |\s)', string=text)
print(result)

['Hello', ', ', 'world', '. ', 'This', ', ', 'is', ' ', 'a', ' ', 'tokenization', ' ', 'test.']


In [9]:
# Keep only the items that are non-empty once surrounding whitespace is removed
# (the items themselves are left unstripped).
result = list(filter(str.strip, result))
print(result)

['Hello', ', ', 'world', '. ', 'This', ', ', 'is', 'a', 'tokenization', 'test.']


In [11]:
# Modify the pattern further so that it can also handle other types of punctuation,
# such as question marks, quotation marks, and double dashes.

In [12]:
# Each punctuation mark, the double dash and each whitespace character is a
# separator; separators are kept (capture group) and blanks are dropped afterwards.
text = "Hello, world. Is this-- a tokenization test?"
pieces = re.split(r'([,.:;?_!"()\']|--|\s)', text)
result = [token for token in map(str.strip, pieces) if token]
print(result)

['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'tokenization', 'test', '?']


### Apply the tokenization to our sample data

In [13]:
# Tokenize the whole story with the pattern from the demo above, then strip
# every piece and discard the ones that end up empty.
raw_pieces = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = [token for token in map(str.strip, raw_pieces) if token]
print(len(preprocessed))

4690


In [14]:
# Inspect the first 100 tokens to confirm that punctuation and "--" became separate tokens.
print(preprocessed[:100])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in', 'the', 'height', 'of', 'his', 'glory', ',', 'he', 'had', 'dropped', 'his', 'painting', ',', 'married', 'a', 'rich', 'widow', ',', 'and', 'established', 'himself', 'in', 'a', 'villa', 'on', 'the', 'Riviera', '.', '(', 'Though', 'I', 'rather', 'thought', 'it', 'would', 'have', 'been', 'Rome', 'or', 'Florence', '.', ')', '"', 'The', 'height', 'of', 'his', 'glory', '"', '--', 'that', 'was', 'what', 'the', 'women', 'called', 'it', '.', 'I', 'can', 'hear', 'Mrs', '.', 'Gideon', 'Thwing', '--', 'his', 'last', 'Chicago', 'sitter', '--']


### Convert tokens into token IDs


In [21]:
# Next, we convert these tokens from Python strings to an integer representation to
# produce the token IDs. This conversion is an intermediate step before converting the
# token IDs into embedding vectors.

In [16]:
# Deduplicate the tokens and order them, so that every unique token has a
# stable position in the list.
all_words = list(set(preprocessed))
all_words.sort()
vocab_size = len(all_words)
print(vocab_size)

1130


In [19]:
# Map every unique token to its position in the sorted list; that position is the token ID.
vocab = dict(zip(all_words, range(len(all_words))))

# Show the first 51 (token, id) pairs as a spot check.
for entry in list(vocab.items())[:51]:
    print(entry)

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)
('Claude', 26)
('Come', 27)
('Croft', 28)
('Destroyed', 29)
('Devonshire', 30)
('Don', 31)
('Dubarry', 32)
('Emperors', 33)
('Florence', 34)
('For', 35)
('Gallery', 36)
('Gideon', 37)
('Gisburn', 38)
('Gisburns', 39)
('Grafton', 40)
('Greek', 41)
('Grindle', 42)
('Grindles', 43)
('HAD', 44)
('Had', 45)
('Hang', 46)
('Has', 47)
('He', 48)
('Her', 49)
('Hermia', 50)


In [23]:
# implement a complete tokenizer class in Python with an encode method that
# splits text into tokens and carries out the string-to-integer mapping to produce token
# IDs via the vocabulary. In addition, we’ll implement a decode method that carries out
# the reverse integer-to-string mapping to convert the token IDs back into text.

In [22]:
class SimpleTokenizerV1:
    """Word-level tokenizer that converts text to token IDs through a fixed vocabulary.

    Parameters
    ----------
    vocab : dict
        Mapping from token string to integer token ID. The inverse mapping
        (ID -> token) is derived from it once, at construction time.
    """

    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}

    def encode(self, text):
        """Split ``text`` into tokens and return the list of their IDs.

        Raises
        ------
        KeyError
            If a token is not present in the vocabulary.
        """
        # The split pattern must be the same one that was used to build the
        # vocabulary, including ':' and ';'. Otherwise a word followed by a
        # colon or semicolon stays glued to it ("said:") and fails the lookup
        # even though both "said" and ":" are in the vocabulary.
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self, ids):
        """Convert a sequence of token IDs back into a single string.

        Tokens are joined with single spaces; the space in front of a
        punctuation mark is then removed. Spaces *after* punctuation are kept,
        so "It's" round-trips as "It' s".

        Raises
        ------
        KeyError
            If an ID is not present in the vocabulary.
        """
        text = " ".join([self.int_to_str[i] for i in ids])
        # ':' and ';' are included so they are re-attached like the other marks.
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text
    

In [25]:
# Using the SimpleTokenizerV1 Python class, we can now instantiate new tokenizer
# objects via an existing vocabulary, which we can then use to encode and decode text

In [31]:
# Build a tokenizer from the vocabulary and encode a sentence whose tokens are all in it.
tokenizer = SimpleTokenizerV1(vocab)
text = """It's the last he painted, you know," Mrs. Gisburn said with pardonable pride."""
# encode() returns one integer ID per token (words and punctuation marks alike).
ids = tokenizer.encode(text)
print(ids)

[56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [27]:
# let’s see whether we can turn these token IDs back into text using the decode method

In [32]:
# Round-trip check: decode() joins tokens with spaces and only removes the space *before*
# punctuation, so the apostrophe in "It's" comes back as "It' s".
print(tokenizer.decode(ids))

It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


In [None]:
# Let’s now apply it to a new text sample not contained in the training set.
# "Hello" does not occur in the-verdict.txt, so the tokenizer has no ID for it and
# encode() raises KeyError. The exception is the point of this cell, so it is caught
# and displayed instead of aborting a "Restart & Run All".

text = "Hello, do you like tea?"
try:
    print(tokenizer.encode(text))
except KeyError as err:
    print("KeyError:", err)

In [34]:
# The previous cell hits a KeyError because the word “Hello” does not occur in our sample training text, the-verdict.txt.
# This is why we need large and diverse training sets to extend the vocabulary when working on LLMs.

In [35]:
# We need to modify the tokenizer to handle unknown words. We also need to address
# the usage and addition of special context tokens that can enhance a model’s understanding
# of context or other relevant information in the text.

In [36]:
# These special tokens can include markers for unknown words and document boundaries
# we will modify the vocabulary and tokenizer, SimpleTokenizerV2, to support
# two new tokens, <|unk|> and <|endoftext|>
# the <|unk|> token will be used if it encounters a word that is not part of the vocabulary

In [37]:
# when training GPT-like LLMs on multiple independent documents or books
# it is common to insert a <|endoftext|> before each document or book that follows a previous text source
# This helps the LLM understand that although these text sources are concatenated for training, 
# they are, in fact, unrelated

In [38]:
# Let’s now extend the vocabulary with the two special tokens, <|unk|> and
# <|endoftext|>, by appending them after the sorted list of all unique words.

all_tokens = sorted(set(preprocessed)) + ["<|endoftext|>", "<|unk|>"]
vocab = dict((token, index) for index, token in enumerate(all_tokens))

print(len(vocab))

1132


In [39]:
# Quick check: the last five vocabulary entries should end with the two special tokens.
for entry in list(vocab.items())[-5:]:
    print(entry)

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)


In [None]:
# Tokenizer V2

In [45]:
class SimpleTokenizerV2:
    """Word-level tokenizer that falls back to ``<|unk|>`` for unknown tokens.

    ``vocab`` maps token strings to integer IDs. It has to contain the
    ``"<|unk|>"`` entry for out-of-vocabulary tokens to be encodable.
    """

    def __init__(self, vocab):
        self.str_to_int = vocab
        # Inverse lookup (ID -> token) used by decode().
        self.int_to_str = dict((idx, token) for token, idx in vocab.items())

    def encode(self, text):
        """Return the token IDs for ``text``; unknown tokens get the ``<|unk|>`` ID."""
        pieces = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        # Strip each piece and drop those that are empty afterwards.
        tokens = [piece.strip() for piece in pieces if piece.strip()]

        ids = []
        for token in tokens:
            if token not in self.str_to_int:
                token = "<|unk|>"
            ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids):
        """Join the tokens for ``ids`` with spaces, then drop the space before punctuation."""
        joined = " ".join(self.int_to_str[i] for i in ids)
        return re.sub(r'\s+([,.:;?!"()\'])', r'\1', joined)
            

In [None]:
# Compared to the SimpleTokenizerV1 we implemented earlier, the new
# SimpleTokenizerV2 replaces unknown words with <|unk|> tokens.
# Let’s now try this new tokenizer out in practice. For this, we will use a simple text
# sample that we concatenate from two independent and unrelated sentences

In [46]:
# Two unrelated sentences, glued together with the document-boundary token.
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
separator = " <|endoftext|> "
text = separator.join([text1, text2])
print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [None]:
# Next, let’s tokenize the sample text using the SimpleTokenizerV2 on the vocab we previously created

In [47]:
# Rebind `tokenizer` to a V2 instance. `vocab` is expected to be the extended vocabulary
# (the one that includes <|unk|> and <|endoftext|>).
tokenizer = SimpleTokenizerV2(vocab)
# Words missing from the vocabulary are encoded as the ID of <|unk|>.
print(tokenizer.encode(text))

[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]


In [48]:
# Decode the IDs back to text; words that were not in the vocabulary appear as <|unk|>.
print(tokenizer.decode(tokenizer.encode(text)))

<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.


In [None]:
# Comparing the detokenized text above with the original input text, we can see that
# the training dataset, the-verdict.txt, does not contain the
# words “Hello” and “palace”.

In [None]:
# Depending on the LLM, some researchers also consider additional special tokens
# such as the following:

# [BOS] (beginning of sequence)—This token marks the start of a text. It signifies to the LLM where a piece of content begins.

# [EOS] (end of sequence)—This token is positioned at the end of a text and is especially useful when concatenating multiple unrelated texts, similar to
# <|endoftext|>. For instance, when combining two different Wikipedia articles or books, the [EOS] token indicates where one ends and the next begins.

# [PAD] (padding)—When training LLMs with batch sizes larger than one, the batch might contain texts of varying lengths. 
# To ensure all texts have the same length, the shorter texts are extended or “padded” using the [PAD] token, up to
# the length of the longest text in the batch.

In [None]:
# The tokenizer used for GPT models does not need any of these tokens; it only uses an
# <|endoftext|> token for simplicity. <|endoftext|> is analogous to the [EOS] token.
# <|endoftext|> is also used for padding.

### Byte pair encoding

In [50]:
pip install tiktoken

Looking in indexes: https://<redacted-user>:****@<redacted-internal-index>/simple
Collecting tiktoken
  Downloading https://<redacted-internal-index>/packages/tiktoken/0.9.0/tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.2 MB[0m [31m?[0m eta [36m-:--:--[0m
[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.2/1.2 MB[0m [31m47.2 MB/s[0m eta [36m0:00:01[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m26.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.9.0

DONE.


In [51]:
from importlib.metadata import version
import tiktoken
# Print the installed tiktoken version so the environment is recorded next to the results.
print("tiktoken version:", version("tiktoken"))

tiktoken version: 0.9.0


In [55]:
# Load the BPE tokenizer by its registered encoding name.
# NOTE(review): the saved error output below names a different argument
# ("tiktoken_cache/9b5a..."), so it appears to come from an earlier version of this
# cell — re-run and confirm. get_encoding() is given an encoding name here, not a file path.
# Presumably an offline setup should point the TIKTOKEN_CACHE_DIR environment variable
# at the local cache directory instead — verify against the tiktoken documentation.
tokenizer = tiktoken.get_encoding("gpt2")

ValueError: Unknown encoding tiktoken_cache/9b5ad71b2ce5302211f9c61530b329a4922fc6a4.
Plugins found: ['tiktoken_ext.openai_public']
tiktoken version: 0.9.0 (are you on latest?)