In [8]:
with open("C:\\AB_Personal\\building_llm_from_scratch\\data\\the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
print("Total number of charadcters in the text: ", len(raw_text))
print("First 100 characters of the text: ", raw_text[:100])

Total number of charadcters in the text:  20479
First 100 characters of the text:  I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no g


In [9]:
import re
# Split the text where while space is found
text = "Hello, world. This, is a  test."
result = re.split(r'(\s)', text)
print(result)

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', '', ' ', 'test.']


<div style="background-color: darkblue; padding: 10px; border-radius: 5px;">
  The reult we got is a list of individual words, white spaces and punctuations!
</div>

In [10]:
result = re.split(r'([, .]|\s)', text) # no space in the pipe part
print(result)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', '', ' ', 'test', '.', '']


In [11]:
# now we will remove the empty strings from the list
result = [item for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']


<div style="background-color: darkblue; padding: 10px; border-radius: 5px;">
  When developing a tokenizer whetehr we should keep or delete whote spaces depends on the application that we will be using it for.

  Removing white spaces reduces memory and computing requirements. But white spaces can be sensitive to tasks like generationg code.
   
  But in this case we will remove white spaces.  
</div>

In [12]:
# We will now try to keep all spcial characters as tokens as well
text = "Hello, world. is this-- a test?"
result = re.split(r'([?: , . : ; ? "() \']|--|\s )', text)
result = [item.strip() for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '.', 'is', 'this', '--', 'a', 'test', '?']


In [13]:
# now we apply the above scheme to the entire raw text to get the tokens

preprocessed = re.split(r'([?: , . : ; "( ) \']|--|\s )', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print("Total number of tokens in the text: ", len(preprocessed))
print("First 100 tokens of the text: ", preprocessed[:100]) 

Total number of tokens in the text:  4629
First 100 tokens of the text:  ['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in', 'the', 'height', 'of', 'his', 'glory', ',', 'he', 'had', 'dropped', 'his', 'painting', ',', 'married', 'a', 'rich', 'widow', ',', 'and', 'established', 'himself', 'in', 'a', 'villa', 'on', 'the', 'Riviera', '.', '(', 'Though', 'I', 'rather', 'thought', 'it', 'would', 'have', 'been', 'Rome', 'or', 'Florence', '.', ')', '"', 'The', 'height', 'of', 'his', 'glory', '"', '--', 'that', 'was', 'what', 'the', 'women', 'called', 'it', '.', 'I', 'can', 'hear', 'Mrs', '.', 'Gideon', 'Thwing', '--', 'his', 'last', 'Chicago', 'sitter', '--']


# Converting Tokens to Token IDS

In [14]:
# Creating the vocabulary : Select the unique tokens and sort them alphabetically
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
print("Total number of unique tokens in the text: ", len(all_words))


Total number of unique tokens in the text:  1157


In [15]:
# After determining the vocabulary soze we create the vocabulary dictionary.
# The keys of the dictionary are the words and the values are the indices of the words in the vocabulary
# unique integer for each word
vocab = {word: i for i, word in enumerate(all_words)}

In [16]:
for i, item in enumerate(vocab.items()):
    print(item)
    if i > 50 :
        break

('"', 0)
("'", 1)
('(', 2)
(')', 3)
(',', 4)
('--', 5)
('.', 6)
(':', 7)
(';', 8)
('?', 9)
('A', 10)
('Ah', 11)
('Among', 12)
('And', 13)
('Are', 14)
('Arrt', 15)
('As', 16)
('At', 17)
('Be', 18)
('Begin', 19)
('Burlington', 20)
('But', 21)
('By', 22)
('Carlo', 23)
('Chicago', 24)
('Claude', 25)
('Come', 26)
('Croft', 27)
('Destroyed', 28)
('Devonshire', 29)
('Don', 30)
('Dubarry_', 31)
('Emperors', 32)
('Florence', 33)
('For', 34)
('Gallery', 35)
('Gideon', 36)
('Gisburn', 37)
('Gisburn!', 38)
('Gisburns', 39)
('Grafton', 40)
('Greek', 41)
('Grindle', 42)
('Grindles', 43)
('HAD', 44)
('Had', 45)
('Hang', 46)
('Has', 47)
('He', 48)
('Her', 49)
('Hermia', 50)
('His', 51)


# Creating the Tokeinzer Class

In [17]:
class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i : s for s, i in vocab.items()}
    
    def encode(self, text):
        preprocessed = re.split(r'([?: , . : ; "( ) \']|--|\s )', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        ids = [self.str_to_int[word] for word in preprocessed]
        return ids
    
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        # Replace spaces before the specified punctuation marks
        text = re.sub(r'\s([, . : ; ? !])', r'\1', text)
        return text

In [18]:
tokenizer = SimpleTokenizerV1(vocab) # the vocabulary that we created above
text = """"It's the last he painted, you know," 
    	    Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)


[0, 56, 1, 870, 1010, 619, 549, 766, 4, 1153, 613, 4, 0, 69, 6, 37, 871, 1132, 774, 814, 6]


In [19]:
regenrated_te = tokenizer.decode(ids)
print(regenrated_te)

" It ' s the last he painted, you know, " Mrs. Gisburn said with pardonable pride.


<div style="border: 2px solid green; padding: 10px; background-color:rgb(62, 202, 167); border-radius: 5px; color: black;">
    Let's say we now have a senstecne say <strong>"Hello, how was your tea?"</strong>.
    In this case we will get an error when we try to encode because the word <strong>"Hello"</strong> is not present in the Vocabulary.
    In order to handle that <strong>Special Context Tokens</strong> are used.
</div>

# Adding Special Context Tokens

1. We will add two new tokens to our vocabulary namely <**|unk|**> and <**|endoftext|**>.
2. When we encouter any unknown word (word not in the vocabulary) then  <**|unk|**> is used.
3. When we have multiple sources then we need the <**|endoftext|**>.
4. The <**|endoftext|**> tokens act as markers, signalling the start or end of a particular segment.

In [20]:
# let's include the two special tokens
all_tokens = sorted(set(preprocessed))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])
vocab = {word: i for i, word in enumerate(all_tokens)}
print("Total number of unique tokens in the text: ", len(vocab))

Total number of unique tokens in the text:  1159


In [21]:
for i, item in enumerate(list(vocab.items())[-5:]):
    print(item)

('younger', 1154)
('your', 1155)
('yourself', 1156)
('<|endoftext|>', 1157)
('<|unk|>', 1158)


In [22]:
class SimpleTokenizerV2:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i : s for s, i in vocab.items()}
    
    def encode(self, text):
        preprocessed = re.split(r'([?: , . : ; "( ) \']|--|\s )', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        preprocessed = [
            word if word in self.str_to_int else "<|unk|>"
            for word in preprocessed
        ]
        
        ids = [self.str_to_int[word] for word in preprocessed]
        return ids
    
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        # Replace spaces before the specified punctuation marks
        text = re.sub(r'\s([, . : ; ? !])', r'\1', text)
        return text

In [23]:
tokenizer2 = SimpleTokenizerV2(vocab) # the vocabulary that we created above

text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."

"""
When there are multiple sources, then each ssource is joined by this kind of <|endoftext|> token.
"""
text = " <|endoftext|> ".join((text1, text2))

print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [24]:
ids = tokenizer2.encode(text)
print(ids)

[1158, 4, 373, 1153, 645, 997, 9, 1157, 55, 1010, 978, 1006, 741, 1010, 1158, 6]


In [25]:
tokenizer2.decode(ids) 

'<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.'

### There are some other tokens that researches use

1. **[BOS]** : Beginning of Sequence. this token marks the beginning of text.
2. **[EOS]** : End of Sequence. This token is placed at the end of a text, and is useful when concatenating multiple unrelated texts.
3. **[PAD]** : Padding. 

Study Byte Pair Encoding Tokenizer

# BYTE PAIR ENCODING

In [6]:
import importlib
import tiktoken

In [27]:
# We initiate the BPE tokenizer from tiktokens
tokenizer = tiktoken.get_encoding('gpt2')

In [28]:
text = ("Hello, do you like tea? <|endoftext|> In the sunlit terraces"
        "of someunknownplace.")

integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 5372, 13]


<div style="background-color:rgb(96, 194, 201); padding:10px; border-radius:5px;color: black">
  
  **Observations from the above tokenizer**  

  1. The `<|endoftext|>` has a relatively large token `50256`.  
  2. The BPE tokenizer used to train models like GPT-2 and GPT-3 has a total vocabulary size of `50257`.  
  3. `<|endoftext|>` is the last token in the vocabulary.  

</div>


In [29]:
text = tokenizer.decode(integers)

In [30]:
print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownplace.


# Creating Input Target Pairs

In [32]:
with open("C:\\AB_Personal\\building_llm_from_scratch\\data\\the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

5145


In [38]:
enc_sample = enc_text[50:]

In [39]:
context_size = 4

x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]
print(f"x: {x}")
print(f"y:      {y}")

x: [290, 4920, 2241, 287]
y:      [4920, 2241, 287, 257]


In [40]:
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]

    print(context, "---->", desired)

[290] ----> 4920
[290, 4920] ----> 2241
[290, 4920, 2241] ----> 287
[290, 4920, 2241, 287] ----> 257


In [41]:
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]

    print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))

 and ---->  established
 and established ---->  himself
 and established himself ---->  in
 and established himself in ---->  a


# Implementing a Dataloader

Step 1 : Tokeinze the entire text

Step 2 : Use sliding Window to chuk the book into overlapping sequenes of ma_length

Step 3 : Return the total numbe of rows in the dataset

Step 4 : Return a single row from the dataset 

In [13]:
from torch.utils.data import Dataset, DataChunk, DataLoader
import torch

class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        """
        txt: the dataset
        tokenizer: the tokenizer (BPE)
        max_length: the maximum length of the context (context_size)
        stride: the stride of the context
        """
        self.input_ids = []
        self.output_ids = []

        # Tokeinze the entire text
        token_ids = tokenizer.encode(txt, allowed_special = {"<|endoftext|>"})

        # Use a sliding winodw to chunk the book into ovelapping sequences of ma-length
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i+max_length]
            output_chunk = token_ids[i+1:i+max_length+1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.output_ids.append(torch.tensor(output_chunk))

    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, index):
        return self.input_ids[index], self.output_ids[index]

1. The GPTDatasetV1 class is based on the Pytorch Dataset class

2. It defines how individual rows are fetched from the dataset.

3. Each row consists of a number of token IDs (based on a max_length) assigned to an input-chunk tensor.

4. The target chunk tensor contains the corresponding targets


### Creating the Data Loader

1. Initialize the tokenizer
2. Create dataset
3. drop_last = True. Drops the last batch if it is shorter than the specified batch_size to prevent loss spilkes during training.
4. The number of CPU processes to use for preprocessing


In [14]:
def create_dataloader_v1(txt, batch_size = 4, max_length=256, 
                         stride = 128, shuffle = True, drop_last = True,
                         num_workers = 0):
    
    # Initialize tokeinzer
    tokenizer = tiktoken.get_encoding('gpt2')

    # create dataset
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    # create dataloader
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers

    )

    return dataloader

In [15]:
with open("C:\\AB_Personal\\building_llm_from_scratch\\data\\the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
dataloader = create_dataloader_v1(raw_text, batch_size = 1, max_length=4, stride= 1,shuffle=True)

data_iter = iter(dataloader)
firstbatch = next(data_iter)
print(firstbatch)

[tensor([[ 3619,   338, 10568,   550]]), tensor([[  338, 10568,   550,   587]])]


# Vector Embeddings

In [1]:
import torch

In [2]:
input_ids = torch.tensor([2, 3, 5, 1])
# We are using vocabulay of only 6 words.

In [3]:
vocab_size = 6
output_dim = 3

torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [4]:
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [5]:
print(embedding_layer(input_ids))

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)


# Positional Embdeddings

In [17]:
vocab_size = 50257
output_dim = 256

embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [19]:
# data loader
max_length = 4
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=max_length, stride=max_length, shuffle=True)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

print("Token IDs : \n", inputs)
print("\nInput shape: ", inputs.shape)

Token IDs : 
 tensor([[  284,  1064,   503,  1521],
        [  284,   766,   340,   438],
        [   12, 12239,    13,   198],
        [ 5986,    13, 23676,  2415],
        [  520,  5493,   438,   292],
        [ 1364,  2157,    13,  2750],
        [  351,   884,  2784,  9830],
        [  293,   553,   373,   465]])

Input shape:  torch.Size([8, 4])


In [20]:
# now we want to convert each token ID to a vector (256 D) using the embedding layer
# One embedding vector of 256 length is generated for each token in input
token_embeddings = embedding_layer(inputs)
print("\nToken embeddings shape: ", token_embeddings.shape)


Token embeddings shape:  torch.Size([8, 4, 256])


In [23]:
context_length = max_length
positional_embeddings_layer = torch.nn.Embedding(context_length, output_dim)

In [24]:
pos_emddings = positional_embeddings_layer(torch.arange(context_length))
print("\nPositional embeddings shape: ", pos_emddings.shape)


Positional embeddings shape:  torch.Size([4, 256])


In [25]:
input_embeddings = token_embeddings + pos_emddings
print("\nInput embeddings shape: ", input_embeddings.shape)


Input embeddings shape:  torch.Size([8, 4, 256])
