In [10]:
# Import relevant packages

import matplotlib
import torch
import tiktoken

In [11]:
# Report the installed version of every package this notebook depends on
from importlib.metadata import version

for pkg in ("matplotlib", "torch", "tiktoken"):
    print(f"{pkg} version:", version(pkg))

matplotlib version: 3.9.2
torch version: 2.5.1
tiktoken version: 0.11.0


In [12]:
# Hyperparameters for the 124-million-parameter GPT-2 configuration.
# Every model class in this notebook receives this dict as `cfg`.

GPT_CONFIG_124M = {
    "vocab_size": 50257,     # rows in the token embedding / width of the output logits
    "context_length": 1024,  # rows in the positional embedding table
    "emb_dim": 768,          # width of every token and position embedding vector
    "n_heads": 12,           # attention heads (not read by the placeholder model)
    "n_layers": 12,          # number of stacked transformer blocks
    "drop_rate": 0.1,        # dropout probability applied to the summed embeddings
    "qkv_bias": False,       # query/key/value bias flag (not read by the placeholder model)
}


## Building a Minimal GPT-like Model with PyTorch

This code demonstrates how to construct a simplified version of a GPT (Generative Pretrained Transformer) model using PyTorch. The script is structured to show the main architectural components of a transformer-based language model, using placeholder classes for the transformer block and layer normalization to focus on the overall structure. Here’s a summary of what each part does:

1. **Importing Libraries:**  
   - `torch` and `torch.nn` are imported to provide tensor operations and neural network layers, which are essential for building and training deep learning models.

2. **Defining the DummyGPTModel Class:**  
   - Inherits from `nn.Module`, making it compatible with PyTorch’s model API.
   - The `__init__` method initializes the model’s layers:
     - `self.tok_emb`: A token embedding layer that converts input token indices into dense vectors of size `emb_dim`.
     - `self.pos_emb`: A positional embedding layer that encodes the position of each token in the sequence, also as vectors of size `emb_dim`.
     - `self.drop_emb`: A dropout layer for regularization, randomly zeroing some elements to prevent overfitting.
     - `self.trf_blocks`: A stack of placeholder transformer blocks, repeated `n_layers` times, using `nn.Sequential` to apply them in order.
     - `self.final_norm`: A placeholder for the final layer normalization, which would typically normalize the output before the final prediction.
     - `self.out_head`: A linear layer that projects the final hidden states back to the vocabulary size, producing logits for each token.
   - The `forward` method defines the data flow:
     - Takes input token indices, computes token and positional embeddings, sums them, applies dropout, passes through the transformer blocks, applies final normalization, and projects to output logits.

3. **Defining DummyTransformerBlock:**  
   - A placeholder class that mimics a transformer block’s interface but simply returns its input unchanged. In a full model, this would contain attention and feedforward sublayers.

4. **Defining DummyLayerNorm:**  
   - Another placeholder that mimics layer normalization but returns its input unchanged. In a real model, this would normalize activations to stabilize training.

**Key Steps in the Model:**
- Embedding input tokens and their positions.
- Applying dropout for regularization.
- Passing data through a stack of transformer blocks (here, placeholders).
- Applying layer normalization (placeholder).
- Projecting to vocabulary logits for language modeling.

**Inputs and Parameters:**
- The model expects a configuration dictionary (`cfg`) specifying vocabulary size, embedding dimension, context length, number of layers, dropout rate, etc.
- The `forward` method expects a tensor of token indices as input.

This structure provides a clear template for building a transformer-based language model in PyTorch, with placeholders that can be replaced by full implementations for attention, feedforward, and normalization layers.

In [3]:
import torch
import torch.nn as nn

class DummyGPTModel(nn.Module):
    """Skeleton GPT model whose transformer blocks and final norm are placeholders.

    The data path is: token embedding + positional embedding -> dropout ->
    stack of ``DummyTransformerBlock`` -> ``DummyLayerNorm`` -> linear output head.

    Args:
        cfg: Mapping that provides ``"vocab_size"``, ``"emb_dim"``,
            ``"context_length"``, ``"drop_rate"`` and ``"n_layers"``.
    """

    def __init__(self, cfg):
        super().__init__()
        vocab_size = cfg["vocab_size"]
        emb_dim = cfg["emb_dim"]

        # Lookup tables: one vector per token id and one per sequence position.
        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(cfg["context_length"], emb_dim)
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        # Placeholder transformer stack: n_layers blocks applied one after another.
        blocks = [DummyTransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        self.trf_blocks = nn.Sequential(*blocks)

        # Placeholder final normalization, then the bias-free projection to the vocabulary.
        self.final_norm = DummyLayerNorm(emb_dim)
        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, in_idx):
        """Map token indices to logits.

        Args:
            in_idx: 2-D tensor of token indices, unpacked as (batch, seq_len).

        Returns:
            Tensor of logits with a trailing dimension of ``vocab_size``.
        """
        # Unpacking into two names also enforces that the input is 2-D.
        _, seq_len = in_idx.shape

        token_vectors = self.tok_emb(in_idx)
        # Position ids are created on the input's device so both lookups agree.
        positions = torch.arange(seq_len, device=in_idx.device)
        position_vectors = self.pos_emb(positions)

        hidden = self.drop_emb(token_vectors + position_vectors)
        hidden = self.trf_blocks(hidden)
        hidden = self.final_norm(hidden)
        return self.out_head(hidden)

class DummyTransformerBlock(nn.Module):
    """Stand-in for a transformer block: accepts ``cfg`` but creates no layers."""

    def __init__(self, cfg):
        """Initialize the module; ``cfg`` is accepted and ignored."""
        super().__init__()

    def forward(self, x):
        """Return ``x`` untouched (identity pass-through)."""
        return x

class DummyLayerNorm(nn.Module):
    """Stand-in for layer normalization that performs no normalization."""

    def __init__(self, normalized_shape, eps=1e-5):
        """Initialize the module.

        ``normalized_shape`` and ``eps`` exist only so the constructor can be
        called like a LayerNorm; neither value is stored or used.
        """
        super().__init__()

    def forward(self, x):
        """Return ``x`` untouched (identity pass-through)."""
        return x




In [7]:
import tiktoken

# GPT-2 byte-pair-encoding tokenizer from tiktoken
tokenizer = tiktoken.get_encoding("gpt2")

txt1 = "Every effort moves you"
txt2 = "Every day holds a"

# Encode each prompt into a 1-D tensor of token ids and stack them along a new
# leading batch dimension. torch.stack needs both prompts to encode to the
# same number of tokens.
batch = torch.stack(
    [torch.tensor(tokenizer.encode(text)) for text in (txt1, txt2)],
    dim=0,
)
print(batch)


tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])



## Running a Dummy GPT Model on Tokenized Input Using PyTorch

This code block demonstrates how to initialize and run a minimal GPT-like model using PyTorch, with placeholder components for the transformer and normalization layers. The purpose is to verify the data flow and output shape of the model using a batch of tokenized sentences. The code uses the PyTorch library for model definition and tensor operations. Here’s a breakdown of each step:

1. `torch.manual_seed(123)` sets the random seed for PyTorch to ensure reproducibility, so that model weights and any random operations yield the same results each time the code is run.
2. `model = DummyGPTModel(GPT_CONFIG_124M)` creates an instance of the dummy GPT model using a configuration dictionary that specifies model parameters such as vocabulary size, embedding dimension, number of layers, and dropout rate. The model uses placeholder transformer and normalization blocks, so it does not perform real attention or normalization.
3. `logits = model(batch)` passes a batch of tokenized input sentences (prepared earlier) through the model. The model processes the input through embedding, dropout, and placeholder layers, then outputs logits, i.e. a tensor of raw, unnormalized scores with one value per token position and vocabulary entry.
4. `print("Output shape:", logits.shape)` prints the shape of the output tensor, which should match `[batch_size, sequence_length, vocab_size]`, confirming that the model produces outputs of the expected dimensions.
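5. `print(logits)` prints the actual logits tensor, showing the raw prediction values for each token in the input batch. Since the model is untrained and uses dummy components, these values carry no meaning: they are determined solely by the randomly initialized weights (and the dropout mask), which the fixed seed makes reproducible.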

**Summary of Key Steps:**
- Set a random seed for reproducibility.
- Instantiate a dummy GPT model with specified architecture.
- Run a batch of tokenized sentences through the model to obtain output logits.
- Print the shape and values of the output tensor to verify correct data flow and output structure.

This block is useful for confirming that the model architecture and data pipeline are functioning as intended before implementing more complex transformer logic.

In [None]:
# Seed PyTorch's RNG before building the model so the randomly initialized
# weights, and therefore the printed logits, are the same on every run.
torch.manual_seed(123)
model = DummyGPTModel(GPT_CONFIG_124M)

# NOTE(review): model.eval() is not called here, so the nn.Dropout layer inside
# DummyGPTModel is active during this forward pass — confirm that is intended.
logits = model(batch)
# The last dimension of the logits holds one score per vocabulary entry.
print("Output shape:", logits.shape)
print(logits)

Output shape: torch.Size([2, 4, 50257])
tensor([[[-1.2034,  0.3201, -0.7130,  ..., -1.5548, -0.2390, -0.4667],
         [-0.1192,  0.4539, -0.4432,  ...,  0.2392,  1.3469,  1.2430],
         [ 0.5307,  1.6720, -0.4695,  ...,  1.1966,  0.0111,  0.5835],
         [ 0.0139,  1.6754, -0.3388,  ...,  1.1586, -0.0435, -1.0400]],

        [[-1.0908,  0.1798, -0.9484,  ..., -1.6047,  0.2439, -0.4530],
         [-0.7860,  0.5581, -0.0610,  ...,  0.4835, -0.0077,  1.6621],
         [ 0.3567,  1.2698, -0.6398,  ..., -0.0162, -0.1296,  0.3717],
         [-0.2407, -0.7349, -0.5102,  ...,  2.0057, -0.3694,  0.1814]]],
       grad_fn=<UnsafeViewBackward0>)


In [25]:
torch.manual_seed(123)

# Two samples, each with five features, drawn from a standard normal distribution
batch_example = torch.randn(2, 5)

# Linear projection from 5 to 6 features followed by a ReLU non-linearity
layer = nn.Sequential(
    nn.Linear(in_features=5, out_features=6),
    nn.ReLU(),
)
out = layer(batch_example)
print(out)
print("Output shape", out.shape)

tensor([[0.2260, 0.3470, 0.0000, 0.2216, 0.0000, 0.0000],
        [0.2133, 0.2394, 0.0000, 0.5198, 0.3297, 0.0000]],
       grad_fn=<ReluBackward0>)
Output shape torch.Size([2, 6])
