In [35]:
import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors for next-token prediction.

    The text is tokenized once. Depending on how the number of tokens compares
    with ``max_length``, samples are built as follows:

    * more tokens than ``max_length``: windows of ``max_length`` tokens are
      taken every ``stride`` tokens; each target is its input shifted right
      by one token.
    * exactly ``max_length`` tokens: a single sample whose input is the whole
      sequence. There is no "next" token for the final position, so the target
      repeats the last token there.
    * fewer tokens than ``max_length``: a single sample, right-padded with
      ``pad_token_id``; the target is the padded input shifted by one with one
      more ``pad_token_id`` appended.

    Args:
        txt: Raw text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a sequence of integer token ids.
        max_length: Number of tokens per sample; must be positive.
        stride: Step between the starts of consecutive windows; must be positive.
        pad_token_id: Token id used for padding short texts. Defaults to ``0``
            (the value that was previously hard-coded).

    Raises:
        ValueError: If ``max_length`` or ``stride`` is not positive.
    """

    def __init__(self, txt, tokenizer, max_length, stride, pad_token_id=0):
        # Fail loudly on a bad configuration: a negative stride would otherwise
        # yield a silently empty dataset, and max_length == 0 with empty text
        # would crash on token_ids[-1].
        if max_length <= 0:
            raise ValueError(f"max_length must be positive, got {max_length}")
        if stride <= 0:
            raise ValueError(f"stride must be positive, got {stride}")

        self.input_ids = []
        self.target_ids = []

        # list(...) so the slicing/concatenation below also works when the
        # tokenizer returns a tuple or another sequence type.
        token_ids = list(tokenizer.encode(txt, allowed_special={"<|endoftext|>"}))
        num_tokens = len(token_ids)

        if num_tokens > max_length:
            # The range stops at num_tokens - max_length so that every target
            # window (shifted by one) still lies inside token_ids.
            for start in range(0, num_tokens - max_length, stride):
                stop = start + max_length
                self._append_pair(token_ids[start:stop],
                                  token_ids[start + 1:stop + 1])
        elif num_tokens == max_length:
            # No token follows the last position; repeat the final token so the
            # target keeps the same length as the input.
            self._append_pair(token_ids, token_ids[1:] + [token_ids[-1]])
        else:
            padded = token_ids + [pad_token_id] * (max_length - num_tokens)
            self._append_pair(padded, padded[1:] + [pad_token_id])

    def _append_pair(self, input_chunk, target_chunk):
        """Store one (input, target) sample as a pair of tensors."""
        self.input_ids.append(torch.tensor(input_chunk))
        self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensors of sample ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]


def create_dataloader_v1(txt, batch_size=2, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Tokenize ``txt`` with the GPT-2 encoding and wrap it in a ``DataLoader``.

    Args:
        txt: Raw text to turn into training samples.
        batch_size: Number of samples per batch.
        max_length: Tokens per sample, forwarded to ``GPTDatasetV1``.
        stride: Window step, forwarded to ``GPTDatasetV1``.
        shuffle: Whether the loader shuffles samples.
        drop_last: Whether the loader drops a final incomplete batch.
        num_workers: Number of loader worker processes.

    Returns:
        A ``DataLoader`` over a ``GPTDatasetV1`` built from ``txt``.
    """
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    return DataLoader(
        GPTDatasetV1(txt, gpt2_encoding, max_length, stride),
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )



# Build the input embeddings (token + positional) for one short sentence; the
# attention cells below operate on `input_embeddings`.
raw_text = "Yours journey start with one steps"
tokenizer = tiktoken.get_encoding("gpt2")
token_ids = tokenizer.encode(raw_text, allowed_special={"<|endoftext|>"})

# Context length == number of tokens in the sentence, so GPTDatasetV1 takes its
# "len(token_ids) == max_length" branch and produces exactly one sample.
max_length = len(token_ids)
vocab_size = 50257  # presumably the GPT-2 BPE vocabulary size — confirm against the tokenizer
output_dim = 3      # tiny embedding width so the printed tensors stay readable

# NOTE(review): no torch.manual_seed call is visible in this cell, so the
# randomly initialised embedding weights (and every number printed below)
# presumably change between kernel restarts — confirm whether an earlier cell seeds.
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
pos_embedding_layer = torch.nn.Embedding(max_length, output_dim)

dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=max_length,
    stride=max_length, shuffle=False
)

# Single batch of size 1: `inputs` and `targets` each hold one row of max_length ids.
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

token_embeddings = token_embedding_layer(inputs)
# One positional vector per position 0..max_length-1.
pos_embeddings = pos_embedding_layer(torch.arange(max_length))

# pos_embeddings has no batch dimension; the addition broadcasts it over the batch.
input_embeddings = token_embeddings + pos_embeddings
# Display the embedding of the first token of the first (only) sample.
input_embeddings[0][0]


tensor([-0.8529, -1.3223, -2.7716], grad_fn=<SelectBackward0>)

## A simple self-attention mechanism without trainable weights

In [36]:
# Unnormalised attention scores: the dot product of every token embedding with
# every other token embedding of the first sample, filled in one entry at a time.
sample = input_embeddings[0]
seq_len = input_embeddings.shape[1]
attn_scores = torch.empty(seq_len, seq_len)

for i in range(seq_len):
    for j in range(seq_len):
        attn_scores[i, j] = torch.dot(sample[i], sample[j])

attn_scores

tensor([[ 10.1579,   2.4870,   3.5425,  -6.1913,  -8.3178,   0.1233,  -7.5881],
        [  2.4870,   1.5492,  -0.2341,  -0.7940,  -0.8809,   0.6988,  -0.5017],
        [  3.5425,  -0.2341,  14.8219,  -2.0039, -12.1672,  -4.5268,  -3.6738],
        [ -6.1913,  -0.7940,  -2.0039,   4.4094,   5.3130,   0.1299,   5.7119],
        [ -8.3178,  -0.8809, -12.1672,   5.3130,  13.3233,   3.1575,   7.5193],
        [  0.1233,   0.6988,  -4.5268,   0.1299,   3.1575,   1.6430,   0.6994],
        [ -7.5881,  -0.5017,  -3.6738,   5.7119,   7.5193,   0.6994,   7.6501]],
       grad_fn=<CopySlices>)

In [37]:
# All pairwise dot products of the first sample's embeddings in one matrix product.
attn_scores = torch.matmul(input_embeddings[0], input_embeddings[0].T)
attn_scores

tensor([[ 10.1579,   2.4870,   3.5425,  -6.1913,  -8.3178,   0.1233,  -7.5881],
        [  2.4870,   1.5492,  -0.2341,  -0.7940,  -0.8809,   0.6988,  -0.5017],
        [  3.5425,  -0.2341,  14.8219,  -2.0039, -12.1672,  -4.5268,  -3.6738],
        [ -6.1913,  -0.7940,  -2.0039,   4.4094,   5.3130,   0.1299,   5.7119],
        [ -8.3178,  -0.8809, -12.1672,   5.3130,  13.3233,   3.1575,   7.5193],
        [  0.1233,   0.6988,  -4.5268,   0.1299,   3.1575,   1.6430,   0.6994],
        [ -7.5881,  -0.5017,  -3.6738,   5.7119,   7.5193,   0.6994,   7.6501]],
       grad_fn=<MmBackward0>)

In [38]:
# Normalise each row of scores into attention weights that sum to 1.
attn_weights = attn_scores.softmax(dim=1)
attn_weights

tensor([[9.9815e-01, 4.6537e-04, 1.3371e-03, 7.9220e-08, 9.4478e-09, 4.3779e-05,
         1.9599e-08],
        [5.7244e-01, 2.2409e-01, 3.7667e-02, 2.1517e-02, 1.9727e-02, 9.5741e-02,
         2.8823e-02],
        [1.2630e-05, 2.8926e-07, 9.9999e-01, 4.9277e-08, 1.9002e-12, 3.9533e-09,
         9.2774e-09],
        [3.4736e-06, 7.6702e-04, 2.2874e-04, 1.3951e-01, 3.4437e-01, 1.9322e-03,
         5.1319e-01],
        [3.9806e-10, 6.7569e-07, 8.4753e-12, 3.3090e-04, 9.9662e-01, 3.8334e-05,
         3.0053e-03],
        [3.2334e-02, 5.7487e-02, 3.0911e-04, 3.2547e-02, 6.7201e-01, 1.4779e-01,
         5.7523e-02],
        [1.1918e-07, 1.4250e-04, 5.9727e-06, 7.1175e-02, 4.3378e-01, 4.7362e-04,
         4.9442e-01]], grad_fn=<SoftmaxBackward0>)

In [39]:
# Each context vector is the attention-weighted sum of the first sample's embeddings.
context_vecs = torch.matmul(attn_weights, input_embeddings[0])
context_vecs

tensor([[-0.8540, -1.3241, -2.7658],
        [-0.7305, -0.5577, -1.6744],
        [-1.7375, -3.3299,  0.8452],
        [ 0.1766,  2.2317,  1.6300],
        [ 1.2886,  3.2411,  1.0571],
        [ 0.7843,  2.4641,  0.6529],
        [ 0.3153,  2.4075,  1.5686]], grad_fn=<MmBackward0>)