In [1]:
from importlib.metadata import version

print("torch version:", version("torch"))
print("tiktoken version:", version("tiktoken"))

torch version: 2.5.1
tiktoken version: 0.8.0


In [2]:
with open("data.txt","r",encoding="utf-8") as f:
    raw_text = f.read()

print(f"total number of characters:{len(raw_text)}")
print(raw_text[:50])

total number of characters:20479
I HAD always thought Jack Gisburn rather a cheap g


In [3]:
import re
text = "hello world, how are you doing?"
# Split on commas, periods and whitespace. The capture group keeps the
# separators in the output, so punctuation survives as its own token;
# empty and whitespace-only pieces are then filtered out.
result = [
    piece
    for piece in re.split(r'([,.]|\s)',text)
    if piece.strip()
]
print(result)

['hello', 'world', ',', 'how', 'are', 'you', 'doing?']


#### Tokenize the texts
- Convert the raw text into individual tokens (words and punctuation marks)

In [4]:
# Tokenise the whole text: split on punctuation, double dashes and whitespace
# (separators are kept by the capture group), then strip every piece and
# discard the ones that end up empty.
preprocessed = [
    piece.strip()
    for piece in re.split(r'([,.:;?_!"()\']|--|\s)',raw_text)
    if piece.strip()
]
print(len(preprocessed))

4690


In [5]:
print(preprocessed[:25])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me']


#### Creating vocabulary

In [6]:
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
print(vocab_size)

1130


In [7]:
vocab = {token:integer for integer,token in enumerate(all_words)}

In [8]:
# Peek at the first 11 (token, id) pairs of the vocabulary.
for i,item in enumerate(list(vocab.items())[:11]):
    print(item)

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)


## Creating a simple tokenizer

In [9]:
class SimpleTokenizer:
    """Word-level tokenizer backed by a fixed vocabulary.

    Text is split on punctuation, double dashes and whitespace; every
    resulting token must be present in the vocabulary.

    Args:
        vocab: Mapping from token string to integer id.
    """

    # Splits on punctuation, "--" and whitespace. The capture group makes
    # re.split keep the separators so punctuation becomes its own token.
    _SPLIT_RE = re.compile(r'([,.:;?_!"()\']|--|\s)')
    # Whitespace in front of these marks is removed when decoding. ':' and ';'
    # are included because encode() emits them as standalone tokens too.
    _SPACE_BEFORE_PUNCT_RE = re.compile(r'\s+([,.:;?!"()\'])')

    def __init__(self,vocab):
        self.stoi = vocab
        self.itos = {i:s for s,i in vocab.items()} #maps token ids to tokens

    def encode(self,text):
        """Convert ``text`` into a list of token ids.

        Raises:
            KeyError: If a token of ``text`` is not in the vocabulary.
        """
        tokens = [
            piece.strip() for piece in self._SPLIT_RE.split(text) if piece.strip()
        ]
        return [self.stoi[token] for token in tokens]

    def decode(self,ids):
        """Convert a sequence of token ids back into a string.

        Tokens are joined with single spaces, then the space in front of
        punctuation marks is removed.

        Raises:
            KeyError: If an id is not in the vocabulary.
        """
        text = " ".join(self.itos[i] for i in ids)
        return self._SPACE_BEFORE_PUNCT_RE.sub(r'\1',text)

In [10]:
tokenizer = SimpleTokenizer(vocab)

sample_text = """"It's the last he painted, you know," 
           Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(sample_text)
print(ids)


[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [11]:
print(tokenizer.decode(ids))

" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


In [12]:
sample_text2 = "The sky is blue in colour"
# "sky" is not part of the vocabulary, so encode() raises KeyError. The error
# is the point of this cell, so it is caught and shown instead of aborting a
# Restart & Run All.
try:
    print(tokenizer.decode(tokenizer.encode(sample_text2)))
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: 'sky'

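- The above error occurs because the word "sky" does not appear in the text the vocabulary was built from, so the tokenizer has no id for it
- Avoid this issue by adding special tokens to the vocabulary, such as `<|unk|>` for unknown words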

In [13]:
# Rebuild the vocabulary with two special tokens appended after the sorted
# words, so they receive the two highest ids.
all_tokens = sorted(set(preprocessed))
all_tokens += ["<|endoftext|>","<|unk|>"]
vocab = {token: idx for idx, token in enumerate(all_tokens)}
print(len(vocab))

1132


In [14]:
for i , item in enumerate(list(vocab.items())[-4:]):
    print(item)

('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)


## Updated tokenizer:

In [15]:
class SimpleTokenizerV2:
    """Word-level tokenizer that maps unknown tokens to ``<|unk|>``.

    Text is split on punctuation, double dashes and whitespace. Tokens that
    are missing from the vocabulary are replaced by the ``<|unk|>`` token
    before being converted to ids.

    Args:
        vocab: Mapping from token string to integer id. It needs an
            ``<|unk|>`` entry for out-of-vocabulary tokens to be encodable.
    """

    # Splits on punctuation, "--" and whitespace. The capture group makes
    # re.split keep the separators so punctuation becomes its own token.
    _SPLIT_RE = re.compile(r'([,.:;?_!"()\']|--|\s)')
    # Whitespace in front of these marks is removed when decoding. ':' and ';'
    # are included because encode() emits them as standalone tokens too.
    _SPACE_BEFORE_PUNCT_RE = re.compile(r'\s+([,.:;?!"()\'])')
    _UNK_TOKEN = "<|unk|>"

    def __init__(self,vocab):
        self.stoi = vocab
        self.itos = {i:s for s,i in vocab.items()} #maps token ids to tokens

    def encode(self,text):
        """Convert ``text`` into a list of token ids.

        Raises:
            KeyError: If an unknown token is met and the vocabulary has no
                ``<|unk|>`` entry.
        """
        tokens = [
            piece.strip() for piece in self._SPLIT_RE.split(text) if piece.strip()
        ]
        # The fallback is looked up only when needed, so a vocabulary without
        # "<|unk|>" still works for fully known text.
        return [
            self.stoi[token if token in self.stoi else self._UNK_TOKEN]
            for token in tokens
        ]

    def decode(self,ids):
        """Convert a sequence of token ids back into a string.

        Tokens are joined with single spaces, then the space in front of
        punctuation marks is removed.

        Raises:
            KeyError: If an id is not in the vocabulary.
        """
        text = " ".join(self.itos[i] for i in ids)
        return self._SPACE_BEFORE_PUNCT_RE.sub(r'\1',text)

In [16]:
text1 = "hello what are you upto"
text2 = "come out of the room"
# Mark the boundary between the two independent texts with the special token.
text = f"{text1} <|endoftext|> {text2}"
print(text)

hello what are you upto <|endoftext|> come out of the room


In [17]:
tokenizer = SimpleTokenizerV2(vocab)
print(tokenizer.encode(text))

[1131, 1089, 169, 1126, 1131, 1130, 277, 738, 722, 988, 845]


In [18]:
print(tokenizer.decode(tokenizer.encode(text)))

<|unk|> what are you <|unk|> <|endoftext|> come out of the room


## Using Efficient Tokenizer:

In [19]:
import tiktoken

In [20]:
tokenizer  = tiktoken.get_encoding("gpt2")

In [21]:
text = (
    "what is life all about if not for exploring the unknown!"
    "wish i could do that everyday"
)

In [22]:
sample_ids = tokenizer.encode(text,allowed_special={"<|endoftext|>"})
print(sample_ids)
sample_strings = tokenizer.decode(sample_ids)
print(sample_strings)

[10919, 318, 1204, 477, 546, 611, 407, 329, 13504, 262, 6439, 0, 86, 680, 1312, 714, 466, 326, 10908]
what is life all about if not for exploring the unknown!wish i could do that everyday


### Sampling data from the dataset:

In [23]:
with open("data.txt","r",encoding="utf-8") as file:
    dataset = file.read()

enc_text = tokenizer.encode(dataset)
print(len(enc_text))

5145


In [24]:
enc_sample = enc_text[25:]

In [25]:
#creating input target pairs
context_size = 4
x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]
print(f"x: {x}")
print(f"y:      {y}")

x: [5975, 284, 502, 284]
y:      [284, 502, 284, 3285]


In the output below, the ids to the left of the arrow (`-->`) represent the input the LLM receives. The id to the right of the arrow is the next token, which the LLM is trained to predict.

In [26]:
for i in range(1,context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(context,"-->",desired)

[5975] --> 284
[5975, 284] --> 502
[5975, 284, 502] --> 284
[5975, 284, 502, 284] --> 3285


In [27]:
for i in range(1,context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(tokenizer.decode(context),"-->",tokenizer.decode([desired]))


 surprise -->  to
 surprise to -->  me
 surprise to me -->  to
 surprise to me to -->  hear


### Loading data:

In [28]:
import torch
from torch.utils.data import Dataset, DataLoader

In [29]:
class GPTDataset(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is tokenised once. A window of ``max_length`` tokens is moved
    over the ids in steps of ``stride``; each window is an input and the
    same window shifted one token to the right is its target.

    Args:
        txt: Text to tokenise.
        tokenizer: Object whose ``encode(txt)`` returns a list of token ids.
        max_length: Number of tokens per input/target chunk.
        stride: Step between the starts of consecutive windows.
    """

    def __init__(self,txt,tokenizer,max_length,stride):
        token_ids = tokenizer.encode(txt)

        # Stop early enough that every target chunk (shifted by one token)
        # still has max_length tokens.
        window_starts = range(0,len(token_ids)-max_length,stride)

        self.input_ids = [
            torch.tensor(token_ids[start:start+max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start+1:start+max_length+1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of (input, target) pairs."""
        return len(self.input_ids)

    def __getitem__(self,idx):
        """Return the ``(input, target)`` tensor pair at position ``idx``."""
        return self.input_ids[idx],self.target_ids[idx]

In [32]:
def create_dataloaderv1(txt,batch_size=4,max_length=256,
                      stride=128,shuffle=True,drop_last=True,
                      num_workers = 0):
    """Build a DataLoader of sliding-window (input, target) batches from text.

    The text is tokenised with the tiktoken "gpt2" encoding and wrapped in a
    ``GPTDataset``.

    Args:
        txt: Text to tokenise.
        batch_size: Number of windows per batch.
        max_length: Number of tokens per window.
        stride: Step between the starts of consecutive windows.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        The configured ``DataLoader``.
    """
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windows = GPTDataset(txt,gpt2_encoding,max_length,stride)
    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [36]:
with open("data.txt","r",encoding="utf-8") as file:
    raw_text = file.read()

dataloader = create_dataloaderv1(
    raw_text,batch_size=1,max_length=6,stride=4,shuffle=False
)

data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464, 1807, 3619]]), tensor([[ 367, 2885, 1464, 1807, 3619,  402]])]


In [37]:
second_batch = next(data_iter)
print(second_batch)

[tensor([[ 1807,  3619,   402,   271, 10899,  2138]]), tensor([[ 3619,   402,   271, 10899,  2138,   257]])]


### How does the embedding layer function ?

In [38]:
indexes = torch.tensor([2,3,4]) # arbitrary example token ids
# Rows in the embedding matrix = largest token id + 1. Convert to a plain int:
# the built-in max() over a tensor yields a 0-dim tensor, whereas layer sizes
# (num_embeddings / in_features) are documented as Python ints.
num_indexes = int(indexes.max()) + 1
out_dim = 4 # embedding dimension (hyperparameter)

In [40]:
torch.manual_seed(42)

embedding = torch.nn.Embedding(num_indexes,out_dim)

In [41]:
embedding.weight

Parameter containing:
tensor([[ 1.9269,  1.4873,  0.9007, -2.1055],
        [-0.7581,  1.0783,  0.8008,  1.6806],
        [ 0.3559, -0.6866, -0.4934,  0.2415],
        [-0.2316,  0.0418, -0.2516,  0.8599],
        [-0.3097, -0.3957,  0.8034, -0.6216]], requires_grad=True)

In [44]:
embedding(torch.tensor([2])) #lookup values by index

tensor([[ 0.3559, -0.6866, -0.4934,  0.2415]], grad_fn=<EmbeddingBackward0>)

#### Building the embedding layer
The steps to create the embedding layer
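- The one-hot representation of the token ids is matrix-multiplied with a weight matrix; this gives the same result as the embedding layer's lookup, as the comparison below shows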

In [45]:
one_hot_vec = torch.nn.functional.one_hot(indexes)
one_hot_vec

tensor([[0, 0, 1, 0, 0],
        [0, 0, 0, 1, 0],
        [0, 0, 0, 0, 1]])

In [47]:
torch.manual_seed(42)

linear = torch.nn.Linear(num_indexes,out_dim,bias=False)
linear.weight

Parameter containing:
tensor([[ 0.3419,  0.3712, -0.1048,  0.4108, -0.0980],
        [ 0.0902, -0.2177,  0.2626,  0.3942, -0.3281],
        [ 0.3887,  0.0837,  0.3304,  0.0606,  0.2156],
        [-0.0631,  0.3448,  0.0661, -0.2088,  0.1140]], requires_grad=True)

In [48]:
# reassigning weights of linear layer to compare 
linear.weight = torch.nn.Parameter(embedding.weight.T) 

In [52]:
linear(one_hot_vec.float())

tensor([[ 0.3559, -0.6866, -0.4934,  0.2415],
        [-0.2316,  0.0418, -0.2516,  0.8599],
        [-0.3097, -0.3957,  0.8034, -0.6216]], grad_fn=<MmBackward0>)

In [53]:
embedding(indexes)

tensor([[ 0.3559, -0.6866, -0.4934,  0.2415],
        [-0.2316,  0.0418, -0.2516,  0.8599],
        [-0.3097, -0.3957,  0.8034, -0.6216]], grad_fn=<EmbeddingBackward0>)

### Embedding word positions:

- An embedding layer performs a lookup operation in its weight matrix
- Thus, if positional information isn't added, the same token ID is always mapped to the same vector representation, regardless of where it appears in the sequence

In [54]:
vocab_size = 50257
output_dim = 256

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [56]:
max_length = 4
dataloader = create_dataloaderv1(
    raw_text,batch_size=8,max_length=max_length,
    stride = max_length,shuffle= False
)

data_iter = iter(dataloader)
inputs , targets = next(data_iter)
print("Token IDs:\n",inputs)
print("\nInput shape:\n",inputs.shape)

Token IDs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Input shape:
 torch.Size([8, 4])


In [58]:
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape) # batch size , no of tokens , embedding vector

torch.Size([8, 4, 256])


In [59]:
# One position embedding is created for every slot in the input window, so the
# context length is set to the number of tokens per input row.
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
pos_embeddings = pos_embedding_layer(torch.arange(context_length)) # position ids 0 .. context_length - 1
print(pos_embeddings.shape)

torch.Size([4, 256])


In [60]:
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])
