In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import tiktoken   # GPT-2 BPE tokenizer

# -----------------------------
# 1. PARAMETERS
# -----------------------------
# NOTE(review): the GPT-2 figures quoted below are the original author's notes;
# they cannot be verified from this file — confirm against the GPT-2 paper.
batch_size = 512   # sequences per batch (author's note: GPT-2 small batch size is 512 sequences)
max_length = 1024  # tokens per sequence (author's note: GPT-2 small context window size)
stride = 2         # step, in tokens, between the starts of consecutive windows. A stride equal to max_length would give non-overlapping windows; with 2, neighbouring windows overlap almost entirely.
embedding_dim = 768  # width of each embedding vector (author's note: GPT-2 small hidden size)

# -----------------------------
# 2. READ RAW TEXT
# -----------------------------
# The path is relative to the current working directory; the whole file is
# read into memory as a single UTF-8 decoded string.
with open("../../_data/the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# -----------------------------
# 3. TOKENIZER (GPT-2 BPE)
# -----------------------------
tokenizer = tiktoken.get_encoding("gpt2")
# Token IDs for the full text. In the code shown here this list is only used
# for the count printed below; create_dataloader_v1 encodes raw_text again.
tokens = tokenizer.encode(raw_text)
# Size of the tokenizer's vocabulary; used as the row count of the token
# embedding table.
vocab_size = tokenizer.n_vocab
print("Total tokens in text:", len(tokens))

# -----------------------------
# 4. SLIDING WINDOW DATASET
# -----------------------------
class SlidingWindowDataset(Dataset):
    """Next-token-prediction samples cut from a token stream with a sliding window.

    Every sample is a pair ``(input, target)`` of equal length ``max_length``.
    ``target`` is the same window shifted one token to the right, so
    ``target[k]`` is the token that follows ``input[k]`` in the stream.
    Consecutive windows start ``stride`` tokens apart. A window is only kept
    when a complete target exists for it, so no sample is ever shorter than
    ``max_length``; a stream with ``len(tokens) <= max_length`` yields an
    empty dataset.

    Args:
        tokens: Sliceable sequence of integer token IDs (e.g. a list).
        max_length: Number of tokens per sample; must be at least 1.
        stride: Offset between the starts of consecutive windows; must be at
            least 1.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is smaller than 1.

    Attributes:
        inputs: List of input windows (slices of ``tokens``).
        targets: List of target windows, aligned index-for-index with
            ``inputs``.
    """

    def __init__(self, tokens, max_length, stride):
        # A non-positive stride would never advance the window (endless loop
        # and unbounded memory growth); a non-positive max_length would yield
        # empty or mismatched samples. Reject both up front.
        if max_length < 1:
            raise ValueError(f"max_length must be >= 1, got {max_length!r}")
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride!r}")

        self.tokens = tokens
        self.max_length = max_length
        self.stride = stride

        # Valid starts satisfy start + max_length < len(tokens): the target
        # window needs one extra token after the end of the input window.
        # range() is empty when the stream is too short for even one sample.
        window_starts = range(0, len(tokens) - max_length, stride)

        self.inputs = [tokens[start : start + max_length] for start in window_starts]
        self.targets = [
            tokens[start + 1 : start + max_length + 1] for start in window_starts
        ]

    def __len__(self):
        """Return the number of (input, target) samples."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a pair of freshly built tensors."""
        return torch.tensor(self.inputs[idx]), torch.tensor(self.targets[idx])


def create_dataloader_v1(raw_text, batch_size, max_length, stride, shuffle=False):
    """Tokenize ``raw_text`` and wrap its sliding-window samples in a DataLoader.

    The text is encoded with the module-level ``tokenizer``; the resulting
    token IDs are cut into (input, target) windows of ``max_length`` tokens
    whose starts are ``stride`` tokens apart, and served in batches of
    ``batch_size``. Samples are shuffled only when ``shuffle`` is true.
    """
    token_ids = tokenizer.encode(raw_text)
    return DataLoader(
        SlidingWindowDataset(token_ids, max_length, stride),
        batch_size=batch_size,
        shuffle=shuffle,
    )


# Build the batched loader over the whole text. shuffle=False keeps the
# windows, and therefore the batches, in text order.
dataloader = create_dataloader_v1(raw_text, batch_size, max_length, stride, shuffle=False)

# -----------------------------
# 5. EMBEDDING LAYERS
# -----------------------------
# Token embedding: one embedding_dim-wide vector per vocabulary entry.
token_embedding_layer = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

# Positional embedding: one embedding_dim-wide vector per position
# 0 .. max_length-1 within a sequence.
position_embedding_layer = torch.nn.Embedding(num_embeddings=max_length, embedding_dim=embedding_dim)

# -----------------------------
# 6. GENERATE FINAL EMBEDDINGS
# -----------------------------
# NOTE(review): every batch's result is kept in this list. One batch holds
# batch_size * seq_len * embedding_dim values (512 * 1024 * 768 with the
# parameters above — roughly 1.5 GiB if the layers use float32, the PyTorch
# default), and the tensors presumably still reference autograd history because
# they come from embedding layers with trainable weights. Confirm this memory
# cost is intended before running on larger inputs.
all_embeddings = []
for batch_index, (input_tensor, target_tensor) in enumerate(dataloader):
    # input_tensor shape: (batch_size, seq_len); the last batch may contain
    # fewer sequences than batch_size.
    batch_size_current, seq_len = input_tensor.shape

    # batch_index is zero-based; the printed batch number is one-based.
    print(f"\n=== Batch {batch_index+1} ===")

    # Position indices 0 .. seq_len-1, rebuilt for every batch and shared by
    # all sequences in it.
    positions = torch.arange(seq_len, dtype=torch.long)  # (seq_len,)
    pos_emb = position_embedding_layer(positions)        # (seq_len, embedding_dim)

    # Compute embeddings for the entire batch at once
    token_emb = token_embedding_layer(input_tensor)      # (batch_size, seq_len, embedding_dim)
    # unsqueeze(0) adds a leading batch axis of size 1 so the same positional
    # vectors are added to every sequence in the batch.
    final_emb = token_emb + pos_emb.unsqueeze(0)         # broadcast positional embedding

    print("Final embeddings shape:", final_emb.shape)    # (batch_size, seq_len, embedding_dim)

    # Optional: decode the first sequence of the batch and show its first
    # 200 characters as a sanity check.
    decoded_text = tokenizer.decode(input_tensor[0].tolist())
    print("Decoded snippet of first sequence:", decoded_text[:200], "...\n")

    all_embeddings.append(final_emb)     # store for later use (see memory note above the loop)


Total tokens in text: 5145

=== Batch 1 ===
Final embeddings shape: torch.Size([512, 1024, 768])
Decoded snippet of first sequence: I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a ...


=== Batch 2 ===
Final embeddings shape: torch.Size([512, 1024, 768])
Decoded snippet of first sequence:  to put beauty into circulation," was one of the axioms he laid down across the Sevres and silver of an exquisitely appointed luncheon-table, when, on a later day, I had again run over from Monte Carl ...


=== Batch 3 ===
Final embeddings shape: torch.Size([512, 1024, 768])
Decoded snippet of first sequence: , presenting a neutral surface to work on--forming, as it were, so inevitably the background of her own picture--had lent herself in an unusual degree to the display of this false virtuosity. The pict ...


=== Batch 4 ===
Final embeddings shape: