## Positional Embeddings (Encoding Word Positions)

In [18]:
! pip3 install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.12.0-cp311-cp311-macosx_10_12_x86_64.whl.metadata (6.7 kB)
Collecting regex>=2022.1.18 (from tiktoken)
  Downloading regex-2025.9.18-cp311-cp311-macosx_10_9_x86_64.whl.metadata (40 kB)
Downloading tiktoken-0.12.0-cp311-cp311-macosx_10_12_x86_64.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m5.4 MB/s[0m  [33m0:00:00[0m
[?25hDownloading regex-2025.9.18-cp311-cp311-macosx_10_9_x86_64.whl (288 kB)
Installing collected packages: regex, tiktoken
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [tiktoken]1/2[0m [tiktoken]
[1A[2KSuccessfully installed regex-2025.9.18 tiktoken-0.12.0


In [2]:
import torch
import importlib
import tiktoken

In [6]:
# Read the whole training text into memory as a single string.
with open("Data/the-verdict.txt", mode="r", encoding="utf-8") as f:
    raw_text = f.read()

In [13]:
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    """Fixed-length next-token-prediction samples cut from one text.

    The text is tokenized once, then a window of ``max_length`` token ids is
    moved over the ids in steps of ``stride``. Each window is stored as an
    input tensor, paired with a target tensor holding the same window shifted
    one token to the right.

    Args:
        txt: Text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a sliceable sequence of token ids.
        max_length: Number of token ids per sample.
        stride: Step between the starts of consecutive windows.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Tokenize the entire text in one pass.
        ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Window starts stop before len(ids) - max_length so that every target
        # window (shifted by one) still lies fully inside ``ids``. When there
        # are no more than max_length ids, the range is empty.
        starts = range(0, len(ids) - max_length, stride)

        self.input_ids = [torch.tensor(ids[s:s + max_length]) for s in starts]
        self.target_ids = [torch.tensor(ids[s + 1:s + max_length + 1]) for s in starts]

    def __len__(self):
        """Return the number of stored (input, target) pairs."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensors at position ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [14]:
def create_dataloader_v1(text, batch_size=4, max_length=256, stride=128, shuffle=True,
                         drop_last=True, num_workers=0):
    """Build a ``DataLoader`` of (input, target) token-id windows for ``text``.

    The text is tokenized with the tiktoken ``"gpt2"`` encoding and chunked by
    ``GPTDatasetV1``.

    Args:
        text: Text to turn into training samples.
        batch_size: Forwarded to ``DataLoader``.
        max_length: Number of token ids per sample (forwarded to ``GPTDatasetV1``).
        stride: Step between consecutive windows (forwarded to ``GPTDatasetV1``).
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        A ``DataLoader`` wrapping the ``GPTDatasetV1`` built from ``text``.
    """
    gpt2_encoding = tiktoken.get_encoding("gpt2")

    return DataLoader(
        GPTDatasetV1(text, gpt2_encoding, max_length, stride),
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [15]:
# Number of rows in the token embedding table. It must cover every token id the
# tokenizer can emit (presumably the size of the "gpt2" vocabulary; verify
# against the tokenizer's reported vocabulary size).
vocab_size = 50257
# Length of each embedding vector.
output_dim = 256

# Embedding weights are initialised randomly. Seed the global RNG first so the
# values are identical on every Restart & Run All.
torch.manual_seed(123)
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [4]:
# Create a data loader.
# A batch size of 8 means each batch contains 8 rows (samples).
# Each row holds 4 input tokens, because the context size used to predict the next token is 4.
# The embedding dimension will be 256.
# A stride of 4 means consecutive chunks start 4 tokens apart, so they do not overlap.
# The embedded batch therefore has shape 8 x 4 x 256.


In [20]:
# Context size: number of token ids in every sample.
max_Length = 4

# stride == max_length, so consecutive windows do not overlap; shuffling is
# disabled so the first batch is always the start of the text.
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=8,
    max_length=max_Length,
    stride=max_Length,
    shuffle=False,
)

# Pull the first batch of (inputs, targets) from the loader.
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

In [21]:
# Show the token ids of the first batch (one context window per row) and the batch shape.
print("Token Ids: \n ", inputs)
print("\nInputs shape:\n", inputs.shape)

Token Ids: 
  tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Inputs shape:
 torch.Size([8, 4])


In [None]:
# For each token id in this input, one embedding vector of length 256 is generated.
# For example, token id 40 maps to a 256-dimensional vector, and so do 367, 2885 and 1464:
# every token id has its own 256-dimensional embedding vector.

In [24]:
# Look up the embedding vector of every token id in the batch; the result gains
# a trailing dimension of size output_dim.
# NOTE: the name `toke_embeddings` (sic) is kept as-is because a later cell reads it.
toke_embeddings= token_embedding_layer(inputs)
print(toke_embeddings.shape)

torch.Size([8, 4, 256])


In [None]:
# Add a positional embedding to each token embedding.
# There are only 4 positions, because the context size of each row in the batch is 4,
# but each positional vector still has 256 dimensions, matching the token embedding size.

In [26]:
# One learnable vector per position in the context window, with the same width
# as the token embeddings so the two can be added element-wise.
context_length = max_Length
pos_embedding_layer = torch.nn.Embedding(
    num_embeddings=context_length,
    embedding_dim=output_dim,
)

In [28]:
# Look up one vector for each position 0 .. context_length - 1. Index with
# context_length (the size the table was created with) rather than max_Length,
# so the lookup always matches the table's number of rows.
pos_embeddings = pos_embedding_layer(torch.arange(context_length))
print(pos_embeddings.shape)
                            

torch.Size([4, 256])


In [30]:
# Add the positional embeddings to the token embeddings.
# pos_embeddings (4, 256) is broadcast over the batch dimension of
# toke_embeddings (8, 4, 256), so every row receives the same positional vectors.
input_embeddings= toke_embeddings+ pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])
