# Read Raw Text

In [8]:
import os

In [9]:
# Build the path to the book text from a (relative) directory name and a file name.
RESOURCE_DIR = "Resources"
HARRY_POTTER_SS_FILE = "Harry_Potter_and_Sorcerer's_Stone.txt"
FILE_PATH = os.path.join(RESOURCE_DIR, HARRY_POTTER_SS_FILE)

In [10]:
# Read the whole book into a single string, decoding the file as Windows-1252.
with open(FILE_PATH, 'r', encoding='windows-1252') as file:
    raw_text = file.read()

In [11]:
print(f"First 500 characters of the book:\n\n{raw_text[:500]}")

First 500 characters of the book:

Harry Potter and the Sorcerer's Stone 

CHAPTER ONE 

THE BOY WHO LIVED 

Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you'd expect to be involved in anything strange or mysterious, because they just didn't hold with such nonsense. 

Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he did have a very large musta


# Tokenizer

In [12]:
import tiktoken

In [13]:
# Load tiktoken's "gpt2" encoding; it is used below to turn the raw text into token ids.
tokenizer = tiktoken.get_encoding("gpt2")

# Dataset Class

In [14]:
import torch
from torch.utils.data import Dataset

In [15]:
class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id pairs.

    The text is tokenized once. A window of ``context_size`` tokens is taken
    every ``stride`` tokens; the matching target is the same window shifted
    one token to the right. A text that tokenizes to ``context_size`` tokens
    or fewer produces an empty dataset.

    Args:
        raw_text: Text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` and
            returning a sequence of integer token ids.
        context_size: Number of tokens per window; must be >= 1.
        stride: Distance between the starts of consecutive windows; must be >= 1.

    Raises:
        ValueError: If ``context_size`` or ``stride`` is smaller than 1.
    """

    def __init__(self, raw_text, tokenizer, context_size, stride):
        # Fail loudly instead of silently building empty chunks or an empty dataset.
        if context_size < 1:
            raise ValueError(f"context_size must be >= 1, got {context_size}")
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")

        self.input_token_ids = []
        self.target_token_ids = []

        all_token_ids = tokenizer.encode(raw_text, allowed_special={"<|endoftext|>"})

        # Stop at len - context_size so the shifted target window always has
        # a full context_size tokens available.
        for i in range(0, len(all_token_ids) - context_size, stride):
            input_chunk = all_token_ids[i : i + context_size]
            target_chunk = all_token_ids[i + 1 : i + context_size + 1]
            self.input_token_ids.append(torch.tensor(input_chunk))
            self.target_token_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_token_ids)

    def __getitem__(self, idx):
        """Return the ``idx``-th (input, target) pair of token-id tensors."""
        x = self.input_token_ids[idx]
        y = self.target_token_ids[idx]
        return x, y

# Data Loader

In [16]:
from torch.utils.data import DataLoader

In [17]:
def create_dataloader_v1(raw_text, tokenizer, context_size=256, stride=256, batch_size=8,
                         shuffle=True, num_workers=0, drop_last=True):
    """Wrap ``raw_text`` in a ``GPTDatasetV1`` and return a ``DataLoader`` over it.

    ``context_size`` and ``stride`` are forwarded to the dataset; the remaining
    keyword arguments are forwarded unchanged to ``DataLoader``.
    """
    return DataLoader(
        dataset=GPTDatasetV1(raw_text, tokenizer, context_size, stride),
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        drop_last=drop_last,
    )

In [18]:
# Window length and step used to cut the token stream into samples.
# STRIDE equals CONTEXT_SIZE, so consecutive input windows do not overlap.
MAX_LENGTH = 4
CONTEXT_SIZE = MAX_LENGTH
STRIDE = MAX_LENGTH
BATCH_SIZE = 8

In [19]:
# shuffle=False so batches come out in the order the windows were built.
data_loader = create_dataloader_v1(raw_text, tokenizer, context_size=CONTEXT_SIZE,
                                   stride=STRIDE, batch_size=BATCH_SIZE, shuffle=False)

In [20]:
# Use a dedicated name: binding the result to `iter` would replace the builtin,
# so re-running this cell (or calling iter() anywhere later) would raise TypeError.
batch_iterator = iter(data_loader)

input_token_ids, target_token_ids = next(batch_iterator)

print(f"Input token IDs shape: {input_token_ids.shape}\n")

print(f"Input token IDs:\n{input_token_ids}\n")

Input token IDs shape: torch.Size([8, 4])

Input token IDs:
tensor([[18308, 14179,   290,   262],
        [30467,   338,  8026,   220],
        [  198,   198, 41481, 16329],
        [  220,   198,   198, 10970],
        [16494,    56, 19494,   406],
        [ 3824,  1961,   220,   198],
        [  198,  5246,    13,   290],
        [ 9074,    13,   360,  1834]])



# Token Embedding

In [21]:
VOCAB_SIZE = 50257  # GPT-2 vocabulary size
EMBEDDING_DIM = 768  # length of each embedding vector

In [22]:
# Lookup table with one EMBEDDING_DIM-sized row per token id in the vocabulary.
# NOTE(review): the name has a doubled "g" ("embeddingg"); rename it together with every use.
token_embeddingg_layer = torch.nn.Embedding(num_embeddings=VOCAB_SIZE, embedding_dim=EMBEDDING_DIM)

In [23]:
# Replace every token id in the batch with its embedding row:
# (batch, context) ids -> (batch, context, EMBEDDING_DIM) vectors.
input_token_embedding = token_embeddingg_layer(input_token_ids)

print(f"Input token embeddings shape: {input_token_embedding.shape}")

Input token embeddings shape: torch.Size([8, 4, 768])


In [27]:
print(input_token_embedding)

tensor([[[-4.9818e-01, -4.6472e-01,  3.9764e-01,  ...,  7.9246e-01,
           1.1571e+00,  5.3370e-01],
         [ 2.2044e+00,  2.2840e-01,  1.6137e+00,  ..., -7.6135e-01,
           2.2072e-03,  1.8693e+00],
         [-7.9827e-01, -5.4600e-02,  9.6870e-01,  ..., -1.5319e+00,
          -2.2185e+00, -1.5200e+00],
         [-2.4486e-01,  2.7383e-01, -1.1582e+00,  ..., -1.5464e+00,
          -1.1343e+00,  5.9144e-01]],

        [[-1.2179e-01,  4.7988e-01, -9.8560e-01,  ..., -8.1018e-02,
          -1.0450e+00, -1.1603e+00],
         [-3.6685e-02, -1.3806e-01, -1.1217e+00,  ..., -4.6414e-01,
           6.6517e-01, -1.8198e-01],
         [-1.8753e+00,  9.9881e-01,  4.3583e-01,  ..., -8.4714e-01,
          -4.8133e-01,  9.1154e-01],
         [ 6.2563e-01,  1.0582e+00, -1.1504e-01,  ..., -6.9983e-01,
          -7.9212e-01,  6.8030e-01]],

        [[ 1.2907e-01, -4.0606e-01,  5.1460e-01,  ...,  4.4782e-01,
          -5.3980e-01, -4.0373e-01],
         [ 1.2907e-01, -4.0606e-01,  5.1460e-01,  .

# Position Embedding

In [24]:
# One embedding row per position in the context window (CONTEXT_SIZE rows in total).
position_embedding_layer = torch.nn.Embedding(num_embeddings=CONTEXT_SIZE, embedding_dim=EMBEDDING_DIM)

In [25]:
# torch.arange(CONTEXT_SIZE) gives the indices 0..CONTEXT_SIZE-1, so this fetches
# every position row in order.
pos_embedding = position_embedding_layer(torch.arange(CONTEXT_SIZE))

print(f"Position embeddings shape: {pos_embedding.shape}")

Position embeddings shape: torch.Size([4, 768])


In [28]:
print(pos_embedding)

tensor([[ 0.3439,  0.8972,  0.4373,  ..., -1.1304, -0.4956, -0.3342],
        [ 0.3613,  2.0763,  1.6107,  ..., -0.7132,  0.1108,  0.8191],
        [-0.0317,  0.3340,  0.1873,  ..., -0.4138,  2.2604, -2.4380],
        [ 2.0926, -0.4425, -0.5229,  ...,  1.0921, -0.6074, -0.7991]],
       grad_fn=<EmbeddingBackward0>)


# Input Embedding

**Input Embedding = Token Embedding + Position Embedding**

In [26]:
# pos_embedding has shape (context, dim) and input_token_embedding has shape
# (batch, context, dim), so the same position vectors are added to every sequence.
input_embeddings = input_token_embedding + pos_embedding # Broadcasting happens here

print(f"Input embeddings after adding position embeddings shape: {input_embeddings.shape}")

Input embeddings after adding position embeddings shape: torch.Size([8, 4, 768])


In [29]:
print(input_embeddings)

tensor([[[-1.5433e-01,  4.3246e-01,  8.3499e-01,  ..., -3.3799e-01,
           6.6150e-01,  1.9951e-01],
         [ 2.5657e+00,  2.3047e+00,  3.2244e+00,  ..., -1.4745e+00,
           1.1302e-01,  2.6885e+00],
         [-8.3002e-01,  2.7938e-01,  1.1560e+00,  ..., -1.9457e+00,
           4.1915e-02, -3.9580e+00],
         [ 1.8478e+00, -1.6865e-01, -1.6810e+00,  ..., -4.5438e-01,
          -1.7417e+00, -2.0771e-01]],

        [[ 2.2207e-01,  1.3771e+00, -5.4825e-01,  ..., -1.2115e+00,
          -1.5406e+00, -1.4945e+00],
         [ 3.2461e-01,  1.9382e+00,  4.8895e-01,  ..., -1.1773e+00,
           7.7599e-01,  6.3715e-01],
         [-1.9070e+00,  1.3328e+00,  6.2315e-01,  ..., -1.2609e+00,
           1.7791e+00, -1.5264e+00],
         [ 2.7182e+00,  6.1571e-01, -6.3791e-01,  ...,  3.9223e-01,
          -1.3995e+00, -1.1885e-01]],

        [[ 4.7293e-01,  4.9112e-01,  9.5195e-01,  ..., -6.8263e-01,
          -1.0354e+00, -7.3791e-01],
         [ 4.9037e-01,  1.6702e+00,  2.1253e+00,  .