There are 4 steps involved in data preprocessing for LLMs:
1. converting text into tokens
2. converting tokens into token IDs
3. loading token IDs into a dataloader (token ID loader)
4. converting token IDs (from the loader) into embedding vectors

**converting text into tokens**

In [1]:

# Load the short story that serves as the corpus for the rest of the notebook.
# The encoding is passed explicitly so the file decodes the same way on every
# platform instead of depending on the locale's default encoding.
with open('the-verdict.txt', 'r', encoding='utf-8') as f:
    raw_text = f.read()
print(raw_text[:99])  # preview: first 99 characters
print(len(raw_text))  # total number of characters


I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 
20479


When building an LLM, tokens play a crucial role in data preprocessing.
In this example, we'll use Python's regular expression (`re`) module to split the text into tokens.

In real-life usage, there are many pre-built tokenizers for this purpose.

In [2]:
import re
# Split on whitespace. Because the pattern is a capturing group, every
# whitespace separator is kept as its own item in the resulting list.
text = "Hello, World. This, is a test."
result = re.compile(r'(\s)').split(text)
print(result)


['Hello,', ' ', 'World.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']


In [3]:
# Also split on commas and periods so punctuation becomes separate items.
# Empty strings appear wherever two separators are adjacent.
result = re.compile(r'([,.]|\s)').split(text)
print(result)


['Hello', ',', '', ' ', 'World', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']


In [4]:
# Keep only items that still have content after stripping, which removes
# both the empty strings and the whitespace-only separators.
result = list(filter(str.strip, result))
print(result)


['Hello', ',', 'World', '.', 'This', ',', 'is', 'a', 'test', '.']


tokenization level 2

In [5]:
# A more general pattern: question marks and double dashes are separators too.
text = "Hello, world. Is this-- a test?"
result = re.compile(r'([,.?]|--|\s)').split(text)
print(result)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'Is', ' ', 'this', '--', '', ' ', 'a', ' ', 'test', '?', '']


In [6]:
# Discard empty and whitespace-only items from the split result.
result = list(filter(str.strip, result))
print(result)

['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']


In [7]:
# Tokenize the whole story with the full punctuation pattern and drop the
# empty / whitespace-only pieces in the same pass.
preprocessed = [
    piece
    for piece in re.split(r'([.,:;?_!"()\']|--|\s)', raw_text)
    if piece.strip()
]
print(len(preprocessed))  # number of tokens in the text (whitespace excluded)
print(preprocessed[:30])


4690
['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


**converting tokens into token IDs**

In [8]:
print(len(preprocessed))
# Deduplicate the tokens, then put them in sorted order.
all_words = list(set(preprocessed))
all_words.sort()
vocab_size = len(all_words)
print(vocab_size)

4690
1130


In [9]:
# Lookup tables over the sorted unique tokens.
char_to_indx = dict(zip(all_words, range(len(all_words))))  # token -> index
indx_to_char = dict(enumerate(all_words))                   # index -> token

# The vocabulary has to exist before a tokenizer can be constructed from it.
vocab = {all_words[indx]: indx for indx in range(len(all_words))}
print(vocab)  # a plain dict mapping token string -> integer id

{'!': 0, '"': 1, "'": 2, '(': 3, ')': 4, ',': 5, '--': 6, '.': 7, ':': 8, ';': 9, '?': 10, 'A': 11, 'Ah': 12, 'Among': 13, 'And': 14, 'Are': 15, 'Arrt': 16, 'As': 17, 'At': 18, 'Be': 19, 'Begin': 20, 'Burlington': 21, 'But': 22, 'By': 23, 'Carlo': 24, 'Chicago': 25, 'Claude': 26, 'Come': 27, 'Croft': 28, 'Destroyed': 29, 'Devonshire': 30, 'Don': 31, 'Dubarry': 32, 'Emperors': 33, 'Florence': 34, 'For': 35, 'Gallery': 36, 'Gideon': 37, 'Gisburn': 38, 'Gisburns': 39, 'Grafton': 40, 'Greek': 41, 'Grindle': 42, 'Grindles': 43, 'HAD': 44, 'Had': 45, 'Hang': 46, 'Has': 47, 'He': 48, 'Her': 49, 'Hermia': 50, 'His': 51, 'How': 52, 'I': 53, 'If': 54, 'In': 55, 'It': 56, 'Jack': 57, 'Jove': 58, 'Just': 59, 'Lord': 60, 'Made': 61, 'Miss': 62, 'Money': 63, 'Monte': 64, 'Moon-dancers': 65, 'Mr': 66, 'Mrs': 67, 'My': 68, 'Never': 69, 'No': 70, 'Now': 71, 'Nutley': 72, 'Of': 73, 'Oh': 74, 'On': 75, 'Once': 76, 'Only': 77, 'Or': 78, 'Perhaps': 79, 'Poor': 80, 'Professional': 81, 'Renaissance': 82, 'Ri

*Implementing a simple text tokenizer* Type 1

In [10]:
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed vocabulary.

    Tokens that are missing from the vocabulary are not handled: ``encode``
    raises ``KeyError`` for them.
    """

    def __init__(self, vocab):
        # vocab maps token string -> integer id; the reverse map is derived from it.
        self.str_to_int = vocab
        self.int_to_str = dict((idx, tok) for tok, idx in vocab.items())

    def encode(self, text):
        """Split ``text`` into tokens and return the list of their ids."""
        ids = []
        for piece in re.split(r'([.,:;?_!"()\']|--|\s)', text):
            token = piece.strip()
            if token:  # skip empty and whitespace-only pieces
                ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids):
        """Turn ids back into a string, joining tokens with single spaces."""
        joined = " ".join(self.int_to_str[i] for i in ids)
        # Remove the space that the join put in front of punctuation marks.
        return re.sub(r'\s+([,.?!"()\'])', r'\1', joined)
       


In [11]:
tokenizer = SimpleTokenizerV1(vocab)
# Same two-line passage as before; the trailing space and the line break are
# written out explicitly so they cannot be lost by an editor.
text = (
    "It's the last he painted, you know,\" Mrs. Gisburn said with pardonable \n"
    "pride."
)

ids = tokenizer.encode(text)
print(ids)  # token ids looked up in the vocab

print(tokenizer.decode(ids))  # ids mapped back to a string




[56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]
It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


In [12]:

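# What happens if we apply the simple tokenizer to unseen text?
# Uncommenting the two lines below raises a KeyError, because "Hello" is not in the vocab.

# text='Hello, do you like tea?'
# print(tokenizer.encode(text=text))

# That's why we need a large amount of data when working with LLMs.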



In [None]:
# Rebuild the sorted list of unique tokens and append the two special tokens
# at the end: "<|endoftext|>" and "<|unk|>".
# The extend call has to run: SampleTokenizerV2 looks up "<|unk|>" in the vocab
# built from this list and raises KeyError when it is missing. The recorded
# vocab size of 1132 (1130 words + 2) also includes these two entries.
all_tokens = sorted(set(preprocessed))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])

In [19]:
# Map every token to its position in all_tokens.
vocab = dict(zip(all_tokens, range(len(all_tokens))))
print(len(vocab))

1132


Adding special context tokens 

*simple tokenizer for handling unknown words*


In [20]:
class SampleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

    The vocabulary passed in must contain the ``"<|unk|>"`` key; otherwise
    encoding text that contains an unknown token raises ``KeyError``.
    """

    def __init__(self, vocab):
        # vocab maps token string -> integer id; the reverse map is derived from it.
        self.str_to_int = vocab
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encoder(self, text):
        """Split ``text`` into tokens and return their ids.

        Tokens that are not in the vocabulary are replaced by ``"<|unk|>"``
        before the id lookup.
        """
        preprocessed = re.split(r'([.,:;?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        preprocessed = [item if item in self.str_to_int else "<|unk|>" for item in preprocessed]

        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decoder(self, ids):
        """Turn ids back into a string, joining tokens with single spaces."""
        text = " ".join([self.int_to_str[i] for i in ids])
        # Remove the space that the join put in front of punctuation marks.
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

    # Aliases with the same names SimpleTokenizerV1 uses, so this tokenizer
    # also works with code that calls .encode() / .decode() (GPTDatasetV1 does).
    encode = encoder
    decode = decoder



In [21]:
# Mimic how GPT training data is assembled: two unrelated texts are
# concatenated with an <|endoftext|> marker between them.
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = f"{text1} <|endoftext|> {text2}"
print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [22]:
tokenizer2 = SampleTokenizerV2(vocab)
# No KeyError this time: unknown words fall back to <|unk|>.
# In the recorded output, id 1130 is <|endoftext|> and id 1131 is <|unk|>.
print(tokenizer2.encoder(text))


[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]


In [23]:
# Round trip: encode, then decode. Words missing from the vocab come back as <|unk|>.
# Debug helper (disabled):
# print("Has <|unk|> in vocab:", "<|unk|>" in vocab, "id=", vocab.get("<|unk|>"))
print(tokenizer2.decoder(tokenizer2.encoder(text=text)))


<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.


Byte Pair Encoding

*GPT models use byte pair encoding (BPE) to convert text into token IDs*

In [24]:
# Alternative way to read the installed version (disabled):
# from importlib.metadata import version
# import tiktoken
# print(version("tiktoken"))

import tiktoken
# Show which tiktoken release is installed (0.12.0 in the recorded run).
print(tiktoken.__version__)



0.12.0


In [25]:

# Load the encoding used by GPT-2.
tokenizer = tiktoken.get_encoding("gpt2")

text = "Hello, do you like tea ? <|endoftext|> In the sunlit of terraces of someunknownPlace."
# allowed_special permits the literal "<|endoftext|>" marker inside the text.
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)


[15496, 11, 466, 345, 588, 8887, 5633, 220, 50256, 554, 262, 4252, 18250, 286, 8812, 2114, 286, 617, 34680, 27271, 13]


In [26]:
# Decode the token IDs back into text.
strings=tokenizer.decode(integers)
print(strings)


Hello, do you like tea ? <|endoftext|> In the sunlit of terraces of someunknownPlace.


In [27]:

# Made-up words: the recorded output shows them split into several smaller ids.
exe_text = "Akwirw ier"
ids = tokenizer.encode(exe_text, allowed_special={"<|endoftext|>"})
print(ids)




[33901, 86, 343, 86, 220, 959]


In [28]:
# Decode the ids back to text to check that the made-up words survive the round trip.
ids_to_str=tokenizer.decode(ids)
print(ids_to_str)

Akwirw ier


*data sampling*

In [29]:
import tiktoken
# Encode the whole story with the GPT-2 encoding.
tokenizer = tiktoken.get_encoding("gpt2")

with open("the-verdict.txt", mode='r', encoding='utf-8') as f:
    raw_text = f.read()

enc_text = tokenizer.encode(raw_text)
print(len(enc_text))  # number of token ids in the corpus



5145


In [30]:
# For a small exercise, skip the first 50 token ids and work with the remainder.
enc_samples=enc_text[50:]


In [31]:
# context_size sets how many tokens make up one input.
context_size = 4
x = enc_samples[0:context_size]      # input window
y = enc_samples[1:1 + context_size]  # same window shifted one token to the right
print(x)
print(y)



[290, 4920, 2241, 287]
[4920, 2241, 287, 257]


In [32]:
# Sliding-window view of the prediction task:
#   context -> the tokens seen so far (input)
#   desired -> the token that follows them (target)
for i in range(1, context_size + 1):
    context, desired = enc_samples[:i], enc_samples[i]
    print(tokenizer.decode(context), "--->", tokenizer.decode([desired]))



 and --->  established
 and established --->  himself
 and established himself --->  in
 and established himself in --->  a


**converting token IDs into dataloader**

*Before turning tokens into embeddings, we need to build an efficient data loader that iterates over the input dataset and returns the inputs and targets as PyTorch tensors, which can be thought of as multidimensional arrays.*

a dataset for batched inputs and targets

In [33]:
import torch
from torch.utils.data import Dataset

class GPTDatasetV1(Dataset):
    """Dataset of (input, target) token-id windows cut from a single text.

    Each target window is its input window shifted one token to the right.
    """

    def __init__(self, text, tokenizer, max_length, stride):
        # Tokenize the entire text once.
        token_ids = tokenizer.encode(text)

        # Sliding window: a window starts every `stride` tokens. Start indices
        # stop before len(token_ids) - max_length, so every target window has
        # the full max_length tokens.
        window_starts = range(0, len(token_ids) - max_length, stride)
        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Number of windows in the dataset."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return the (input, target) tensor pair at ``index``."""
        return self.input_ids[index], self.target_ids[index]
    


creating a dataloader to generate batches of input-target pairs

In [34]:
from torch.utils.data import DataLoader

def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
    """Build a DataLoader of (input, target) windows over ``txt``.

    The text is tokenized with tiktoken's "gpt2" encoding and cut into windows
    by GPTDatasetV1 using ``max_length`` and ``stride``. The remaining
    arguments are forwarded to ``DataLoader`` unchanged.
    """
    windows = GPTDatasetV1(txt, tiktoken.get_encoding("gpt2"), max_length, stride)
    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )



In [35]:
# batch_size=1 and stride=1: each batch holds a single window and consecutive
# windows start one token apart. max_length keeps its default of 256.
dataloader=create_dataloader_v1(txt=raw_text,batch_size=1,stride=1,shuffle=False)

# Turn the dataloader into a Python iterator so batches can be pulled with next().
data_iter=iter(dataloader)
first_batch=next(data_iter)
print(first_batch)



[tensor([[   40,   367,  2885,  1464,  1807,  3619,   402,   271, 10899,  2138,
           257,  7026, 15632,   438,  2016,   257,   922,  5891,  1576,   438,
           568,   340,   373,   645,  1049,  5975,   284,   502,   284,  3285,
           326,    11,   287,   262,  6001,   286,   465, 13476,    11,   339,
           550,  5710,   465, 12036,    11,  6405,   257,  5527, 27075,    11,
           290,  4920,  2241,   287,   257,  4489,    64,   319,   262, 34686,
         41976,    13,   357, 10915,   314,  2138,  1807,   340,   561,   423,
           587, 10598,   393, 28537,  2014,   198,   198,     1,   464,  6001,
           286,   465, 13476,     1,   438,  5562,   373,   644,   262,  1466,
          1444,   340,    13,   314,   460,  3285,  9074,    13, 46606,   536,
          5469,   438, 14363,   938,  4842,  1650,   353,   438,  2934,   489,
          3255,   465, 48422,   540,   450,    67,  3299,    13,   366,  5189,
          1781,   340,   338,  1016,   284,  3758, 

In [36]:

# Next batch from the same iterator; with stride=1 its input window starts
# one token after the previous batch's.
second=next(data_iter)
print(second)


[tensor([[  367,  2885,  1464,  1807,  3619,   402,   271, 10899,  2138,   257,
          7026, 15632,   438,  2016,   257,   922,  5891,  1576,   438,   568,
           340,   373,   645,  1049,  5975,   284,   502,   284,  3285,   326,
            11,   287,   262,  6001,   286,   465, 13476,    11,   339,   550,
          5710,   465, 12036,    11,  6405,   257,  5527, 27075,    11,   290,
          4920,  2241,   287,   257,  4489,    64,   319,   262, 34686, 41976,
            13,   357, 10915,   314,  2138,  1807,   340,   561,   423,   587,
         10598,   393, 28537,  2014,   198,   198,     1,   464,  6001,   286,
           465, 13476,     1,   438,  5562,   373,   644,   262,  1466,  1444,
           340,    13,   314,   460,  3285,  9074,    13, 46606,   536,  5469,
           438, 14363,   938,  4842,  1650,   353,   438,  2934,   489,  3255,
           465, 48422,   540,   450,    67,  3299,    13,   366,  5189,  1781,
           340,   338,  1016,   284,  3758,   262, 

In [37]:
# Pull one more batch and unpack it into its input and target tensors.
inputs, targets=next(data_iter)
print(inputs)

tensor([[ 2885,  1464,  1807,  3619,   402,   271, 10899,  2138,   257,  7026,
         15632,   438,  2016,   257,   922,  5891,  1576,   438,   568,   340,
           373,   645,  1049,  5975,   284,   502,   284,  3285,   326,    11,
           287,   262,  6001,   286,   465, 13476,    11,   339,   550,  5710,
           465, 12036,    11,  6405,   257,  5527, 27075,    11,   290,  4920,
          2241,   287,   257,  4489,    64,   319,   262, 34686, 41976,    13,
           357, 10915,   314,  2138,  1807,   340,   561,   423,   587, 10598,
           393, 28537,  2014,   198,   198,     1,   464,  6001,   286,   465,
         13476,     1,   438,  5562,   373,   644,   262,  1466,  1444,   340,
            13,   314,   460,  3285,  9074,    13, 46606,   536,  5469,   438,
         14363,   938,  4842,  1650,   353,   438,  2934,   489,  3255,   465,
         48422,   540,   450,    67,  3299,    13,   366,  5189,  1781,   340,
           338,  1016,   284,  3758,   262,  1988,  

In [38]:
# Targets are the inputs shifted one token to the right (see GPTDatasetV1).
print(targets)

tensor([[ 1464,  1807,  3619,   402,   271, 10899,  2138,   257,  7026, 15632,
           438,  2016,   257,   922,  5891,  1576,   438,   568,   340,   373,
           645,  1049,  5975,   284,   502,   284,  3285,   326,    11,   287,
           262,  6001,   286,   465, 13476,    11,   339,   550,  5710,   465,
         12036,    11,  6405,   257,  5527, 27075,    11,   290,  4920,  2241,
           287,   257,  4489,    64,   319,   262, 34686, 41976,    13,   357,
         10915,   314,  2138,  1807,   340,   561,   423,   587, 10598,   393,
         28537,  2014,   198,   198,     1,   464,  6001,   286,   465, 13476,
             1,   438,  5562,   373,   644,   262,  1466,  1444,   340,    13,
           314,   460,  3285,  9074,    13, 46606,   536,  5469,   438, 14363,
           938,  4842,  1650,   353,   438,  2934,   489,  3255,   465, 48422,
           540,   450,    67,  3299,    13,   366,  5189,  1781,   340,   338,
          1016,   284,  3758,   262,  1988,   286,  

**converting token IDs (loader) into embeddings** 

*To start, let's take a small example to understand embeddings.*

In [39]:
# Suppose we have four input token ids.
input_ids = torch.tensor([2, 3, 5, 1])

# Tiny sizes for illustration (GPT-3 uses an output dimension of 12,288).
vocab_size = 6
output_dim = 3


In [40]:
# Seed first so the randomly initialised weights are reproducible.
torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=output_dim)
print(embedding_layer.weight)  # one row per token id, output_dim columns



Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [41]:
# Apply a single token id to get its embedding vector: the result is the
# row of embedding_layer.weight at index 3.
print(embedding_layer(torch.tensor([3])))

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)


In [42]:
# Look up all four input ids at once: one weight row per id, in input order.
print(embedding_layer(input_ids))

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)


*As we are using GPT-2's BPE tokenizer to create token IDs, the vocabulary size is 50,257.*

*For GPT-3, OpenAI used an output (embedding) dimension of 12,288.*

In [59]:
# Vocabulary size of the GPT-2 BPE tokenizer; 256 dimensions keeps the demo small.
vocab_size = 50257
output_dim = 256
# Seed right before creating the layer so that re-running this cell, in any
# order, always produces the same initial weights.
torch.manual_seed(123)
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
print(token_embedding_layer.weight)

Parameter containing:
tensor([[ 0.6403, -1.4213, -2.3757,  ...,  1.3287, -0.4587, -0.6813],
        [ 2.1007, -0.7478,  1.1531,  ..., -1.8840,  0.6308,  1.3845],
        [-1.5949, -0.4704,  0.2339,  ...,  0.2753,  2.6710,  0.1407],
        ...,
        [-1.5469, -0.4470, -0.5156,  ...,  0.7707,  0.9065, -1.7708],
        [-0.4246,  1.3170,  0.9862,  ...,  0.8581,  0.7659, -1.7296],
        [-0.1311,  0.0434, -1.5763,  ..., -2.2314, -0.5979,  0.5495]],
       requires_grad=True)


In [56]:
# Reuse the dataloader helper with a context size (max_length) of 4.
# stride equals max_length, so consecutive windows do not overlap.
max_lenth = 4
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=8,
    max_length=max_lenth,
    stride=max_lenth,
    shuffle=False,
)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

print(inputs)
print(inputs.shape)
print(targets)



tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
torch.Size([8, 4])
tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


In [None]:
# Embed every token id of the 8x4 inputs batch; each id is replaced by its
# output_dim-sized vector, which is why the printed shape gains a third axis.
token_embeddings=token_embedding_layer(inputs)
print(token_embeddings.shape)
print(token_embeddings)


torch.Size([8, 4, 256])
tensor([[[-1.2006e+00,  6.9948e-01,  1.3829e+00,  ...,  5.4554e-02,
           7.9838e-01, -1.7708e-01],
         [-4.9637e-01,  1.9622e+00,  1.2301e+00,  ..., -2.9119e+00,
          -7.9404e-01, -1.3448e+00],
         [ 1.4023e+00, -1.4087e+00,  7.2441e-02,  ...,  1.2578e+00,
          -1.9913e+00,  5.7780e-01],
         [-1.3574e-01,  2.2363e+00, -4.8063e-01,  ...,  7.6079e-01,
          -2.3937e+00,  2.5137e-01]],

        [[ 3.0991e-01,  1.6669e+00,  4.0496e-01,  ..., -1.6744e+00,
           1.2830e+00,  9.5530e-01],
         [-1.2835e+00, -2.3158e-01, -1.1369e-01,  ...,  3.4810e-01,
           2.9918e-01,  1.6525e+00],
         [-2.0264e+00,  9.7831e-01, -8.8639e-01,  ...,  8.2856e-01,
          -4.9504e-01, -1.3272e+00],
         [-1.2288e-01, -1.7676e+00,  7.7311e-01,  ..., -2.4732e+00,
          -4.9612e-01, -4.1105e-01]],

        [[-4.6817e-01, -1.3394e+00, -2.7724e-01,  ..., -8.5541e-01,
           1.0628e+00, -6.3077e-01],
         [ 1.2375e+00, -1.2

In [63]:
# GPT models use absolute positional embeddings: one vector per position index.
context_lenght = max_lenth
pos_embedding_layer = torch.nn.Embedding(num_embeddings=context_lenght, embedding_dim=output_dim)
# Look up the vectors for positions 0 .. context_lenght - 1.
pos_embedding = pos_embedding_layer(torch.arange(context_lenght))
print(pos_embedding.shape)



torch.Size([4, 256])


In [66]:
# We now have both the token embeddings and the positional embeddings.
# Adding them gives the input embeddings used to train an LLM.
# Per the printed shapes, the [4, 256] positional tensor is added to each of
# the 8 samples in the [8, 4, 256] token tensor.

input_embedding=token_embeddings+pos_embedding
print(input_embedding.shape)



torch.Size([8, 4, 256])
