In [2]:
# Load the complete text file into a single string
with open("the-verdict.txt", encoding="utf-8") as f:  # text read mode is the default
    raw_text = f.read()

num_chars = len(raw_text)
print("Total number of chars: ", num_chars)

Total number of chars:  20479


In [3]:
# Goal: tokenize the text so that it can be fed into an embedding model.
# First, naive approach: split on whitespace with a regex. Because the pattern
# is a capturing group, re.split keeps the whitespace separators in the result.
import re

text = "Hello, world. This, is a test."
whitespace_pattern = r'(\s)'
res = re.split(whitespace_pattern, text)
print(res)

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']


In [4]:
# Commas and periods should become tokens of their own, so split on them too
punct_or_space_pattern = r'([,.]|\s)'
res = re.split(punct_or_space_pattern, text)
print(res)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']


In [5]:
# Problem: the result still contains whitespace-only and empty strings.
# Keep only the entries that are non-empty after stripping.
res = list(filter(str.strip, res))
print(res)

['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']


In [6]:
# Extend the pattern to further punctuation characters and the double dash
text = "Hello, world. Is this-- a test?"

res = []
for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text):
    piece = piece.strip()
    if piece:  # skip empty strings and pure whitespace
        res.append(piece)
print(res)

['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']


In [7]:
# Apply the same splitting rules to the full text loaded from the file
preprocessed = [
    piece
    for piece in map(str.strip, re.split(r'([,.:;?_!"()\']|--|\s)', raw_text))
    if piece  # drop empty strings and pure whitespace
]
print(len(preprocessed))

4690


In [8]:
# Before the embedding step, every token has to be mapped to a token ID.
# That requires a vocabulary: the unique tokens, each assigned a distinct integer.
# The unique tokens are put into Python's default string sort order.

all_words = list(set(preprocessed))  # set() removes duplicate tokens
all_words.sort()
vocab_size = len(all_words)
print(vocab_size)

1130


In [9]:
# Create the vocabulary dict: token -> token ID (its position in the sorted list)
vocab = dict(zip(all_words, range(len(all_words))))

# Preview the first 52 entries
for item in list(vocab.items())[:52]:
    print(item)

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)
('Claude', 26)
('Come', 27)
('Croft', 28)
('Destroyed', 29)
('Devonshire', 30)
('Don', 31)
('Dubarry', 32)
('Emperors', 33)
('Florence', 34)
('For', 35)
('Gallery', 36)
('Gideon', 37)
('Gisburn', 38)
('Gisburns', 39)
('Grafton', 40)
('Greek', 41)
('Grindle', 42)
('Grindles', 43)
('HAD', 44)
('Had', 45)
('Hang', 46)
('Has', 47)
('He', 48)
('Her', 49)
('Hermia', 50)
('His', 51)


In [10]:
# create a tokenizer class for encoding text into token IDs and decoding token IDs back into text

class SimpleTokenizerV1:
    """Minimal word-level tokenizer backed by a fixed vocabulary.

    Text is split on whitespace, on the punctuation characters
    , . : ; ? _ ! " ( ) ' and on the double dash "--". Every remaining
    non-empty piece must be a key of the vocabulary.
    """

    # Separators used by encode(); the capturing group keeps them as tokens.
    _SPLIT_PATTERN = r'([,.:;?_!"()\']|--|\s)'
    # Whitespace directly in front of a punctuation mark, removed by decode().
    # Includes ':' and ';' so that every single-character punctuation token
    # produced by encode() (except '_') is re-attached to the preceding word.
    _SPACE_BEFORE_PUNCT = r'\s+([,.:;?!"()\'])'

    def __init__(self, vocab):
        """Store the vocabulary and build the reverse lookup.

        Args:
            vocab: dict mapping token string -> integer token ID.
        """
        self.str_to_int = vocab
        self.int_to_str = {integer: token for token, integer in vocab.items()}  # reverse mapping

    def encode(self, text):
        """Convert text into a list of token IDs using the vocabulary.

        Args:
            text: the string to tokenize.

        Returns:
            List of integer token IDs, one per token.

        Raises:
            KeyError: if a token of the text is not in the vocabulary.
        """
        pieces = re.split(self._SPLIT_PATTERN, text)
        # Drop empty strings and whitespace-only separators.
        tokens = [piece.strip() for piece in pieces if piece.strip()]
        return [self.str_to_int[token] for token in tokens]

    def decode(self, ids):
        """Convert a list of token IDs back into a string.

        Tokens are joined with single spaces and the space in front of
        punctuation marks is removed again. The result is not guaranteed to
        be identical to the text that was encoded (original whitespace is lost).

        Args:
            ids: iterable of integer token IDs.

        Returns:
            The decoded string.

        Raises:
            KeyError: if an ID is not part of the vocabulary.
        """
        text = " ".join(self.int_to_str[i] for i in ids)
        # Remove the spaces before punctuation marks.
        return re.sub(self._SPACE_BEFORE_PUNCT, r'\1', text)


In [11]:
# Encode a two-line sample whose tokens are all contained in the vocabulary
text = """"It's the last he painted, you know,"
Mrs. Gisburn said with pardonable pride."""

tokenizer = SimpleTokenizerV1(vocab)
ids = tokenizer.encode(text)
print(ids)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [12]:
# Turn the token IDs back into text. The result is close to the original
# input but not identical: the original whitespace is not restored exactly.
decoded_text = tokenizer.decode(ids)
print(decoded_text)

" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


In [13]:
# Problem: text containing words that are not in the vocabulary cannot be
# encoded -- the tokenizer raises a KeyError for the first unknown token.
# The error is caught and printed so that the notebook still runs top to
# bottom (Restart & Run All) while the failure remains visible.

text = "Hello World."
try:
    print(tokenizer.encode(text))
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: 'Hello'

In [14]:
# To handle unknown words, extend the vocabulary with two special tokens:
#   "<|unk|>"       - stands in for any token that is not in the vocabulary
#   "<|endoftext|>" - delimiter between texts, so that the LLM can tell that
#                     the documents are not related to each other

all_tokens = sorted(set(preprocessed)) + ["<|endoftext|>", "<|unk|>"]
vocab = dict(zip(all_tokens, range(len(all_tokens))))



In [15]:
# update our tokenizer to incorporate the two new special tokens
class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to "<|unk|>".

    Text is split on whitespace, on the punctuation characters
    , . : ; ? _ ! " ( ) ' and on the double dash "--". Pieces that are not in
    the vocabulary are encoded with the ID of the "<|unk|>" token, so the
    vocabulary is expected to contain that token.
    """

    # Token substituted for anything that is not in the vocabulary.
    _UNKNOWN_TOKEN = "<|unk|>"
    # Separators used by encode(); the capturing group keeps them as tokens.
    _SPLIT_PATTERN = r'([,.:;?_!"()\']|--|\s)'
    # Whitespace directly in front of a punctuation mark, removed by decode().
    # Includes ':' and ';' so that every single-character punctuation token
    # produced by encode() (except '_') is re-attached to the preceding word.
    _SPACE_BEFORE_PUNCT = r'\s+([,.:;?!"()\'])'

    def __init__(self, vocab):
        """Store the vocabulary and build the reverse lookup.

        Args:
            vocab: dict mapping token string -> integer token ID.
        """
        self.str_to_int = vocab
        self.int_to_str = {integer: token for token, integer in vocab.items()}  # reverse mapping

    def encode(self, text):
        """Convert text into a list of token IDs using the vocabulary.

        Args:
            text: the string to tokenize.

        Returns:
            List of integer token IDs; unknown tokens get the "<|unk|>" ID.

        Raises:
            KeyError: if an unknown token is encountered and the vocabulary
                has no "<|unk|>" entry.
        """
        pieces = re.split(self._SPLIT_PATTERN, text)
        # Drop empty strings and whitespace-only separators.
        tokens = [piece.strip() for piece in pieces if piece.strip()]
        # Look up each token, falling back to the unknown token in one pass.
        return [
            self.str_to_int[token if token in self.str_to_int else self._UNKNOWN_TOKEN]
            for token in tokens
        ]

    def decode(self, ids):
        """Convert a list of token IDs back into a string.

        Tokens are joined with single spaces and the space in front of
        punctuation marks is removed again. The result is not guaranteed to
        be identical to the text that was encoded (original whitespace is
        lost and unknown words come back as "<|unk|>").

        Args:
            ids: iterable of integer token IDs.

        Returns:
            The decoded string.

        Raises:
            KeyError: if an ID is not part of the vocabulary.
        """
        text = " ".join(self.int_to_str[i] for i in ids)
        # Remove the spaces before punctuation marks.
        return re.sub(self._SPACE_BEFORE_PUNCT, r'\1', text)

In [16]:
# Two unrelated texts, joined by the end-of-text marker
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
separator = " <|endoftext|> "
text = separator.join([text1, text2])

# Round trip through the tokenizer that supports the special tokens
tokenizer = SimpleTokenizerV2(vocab)
round_trip = tokenizer.decode(tokenizer.encode(text))
print(round_trip)


<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.


In [17]:
# Using byte-pair encoding (BPE) with the GPT-2 encoding from tiktoken
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")

# Exercise 2.1: encode a made-up word, then decode every token ID on its own
# to see the pieces the text was split into
text = "Akwirw ier"
token_ids = tokenizer.encode(text)
for token_id in token_ids:  # named token_id so the builtin id() is not shadowed
    print(f" {tokenizer.decode([token_id])} -> {token_id}")

print()
print(tokenizer.decode(token_ids))

 Ak -> 33901
 w -> 86
 ir -> 343
 w -> 86
   -> 220
 ier -> 959

Akwirw ier


In [None]:
### Sliding-window approach: create the input-target pairs the LLM needs for pretraining ###
# Step 1: tokenize the text file with the BPE tokenizer

with open("the-verdict.txt", encoding="utf-8") as f:  # text read mode is the default
    raw_text = f.read()

# The BPE tokenizer splits the text and converts it to token IDs in one call
encoded_text = tokenizer.encode(raw_text)
token_count = len(encoded_text)
print(token_count)

5145


In [21]:
# Skip the first 50 tokens and work with the remainder of the encoded text
encoded_sample = encoded_text[50:]

# Number of tokens in the longest input shown below
context_size = 4
for split_at in range(1, context_size + 1):
    context = encoded_sample[:split_at]  # the input: first `split_at` tokens
    desired = encoded_sample[split_at]   # the target: the token that follows
    print(context, "----->", desired)


[290] -----> 4920
[290, 4920] -----> 2241
[290, 4920, 2241] -----> 287
[290, 4920, 2241, 287] -----> 257


In [23]:
# Same input-target pairs as before, decoded back into text for readability
for split_at in range(1, context_size + 1):
    context = encoded_sample[:split_at]
    desired = encoded_sample[split_at]
    context_text = tokenizer.decode(context)
    desired_text = tokenizer.decode([desired])  # decode() takes a list of IDs
    print(context_text, "----->", desired_text)

 and ----->  established
 and established ----->  himself
 and established himself ----->  in
 and established himself in ----->  a


In [None]:
# now we need an actual data loader that iterates over the input dataset and returns inputs and targets as PyTorch tensors
