configuration of the small GPT-2 model (dummy class)

In [28]:
# Hyperparameters of the small ("124M") GPT-2 configuration.
GPT_CONFIG_124M = {
    "vocab_size": 50257,     # size of the token vocabulary (rows of the token embedding)
    "context_length": 1024,  # maximum number of input tokens (rows of the positional embedding)
    "emb_dim": 768,          # each token is represented by a 768-dimensional vector
    "n_heads": 12,           # attention heads in the multi-head attention mechanism
    "n_layers": 12,          # number of transformer blocks stacked in the model
    "drop_rate": 0.1,        # dropout probability used to reduce overfitting
    "qkv_bias": False,       # whether the Q/K/V Linear layers of the MHA include a bias vector
}


 placeholder GPT model architecture class

 -- An LLM consists of transformer blocks that contain masked multi-head attention (MHA) modules. Token IDs are first transformed into vector representations (embeddings); after passing through the transformer blocks, these vectors are projected back to vocabulary size so that the LLM can generate text


In [None]:
import torch
import torch.nn as nn

class DummyGPTModel(nn.Module):
    """Skeleton GPT model wired together from placeholder building blocks.

    The embeddings, dropout and output head are real PyTorch layers; the
    transformer blocks and the final normalization are dummy modules, so this
    class mainly demonstrates how data flows from token IDs to logits.

    Args:
        cfg: Mapping providing "vocab_size", "context_length", "emb_dim",
            "drop_rate" and "n_layers" (the whole mapping is also handed to
            every DummyTransformerBlock).
    """

    def __init__(self, cfg):
        super().__init__()
        vocab_size = cfg["vocab_size"]
        emb_dim = cfg["emb_dim"]

        # Token IDs -> vectors, plus one learned vector per position.
        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(cfg["context_length"], emb_dim)
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        # A stack of n_layers placeholder transformer blocks applied in order.
        blocks = [DummyTransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        self.trf_blocks = nn.Sequential(*blocks)

        self.final_norm = DummyLayerNorm(emb_dim)

        # Projects each processed vector to one score per vocabulary entry
        # (the next-token prediction).
        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, in_idx):
        """Map a batch of token IDs to logits.

        Embeds the tokens and their positions, applies dropout, runs the
        transformer blocks and the final normalization, and projects the
        result with the linear output layer.

        Args:
            in_idx: 2-D tensor of token IDs laid out as (batch, seq_len).

        Returns:
            Tensor of logits whose last dimension has vocab_size entries.
        """
        # Unpacking doubles as a check that the input is 2-D.
        _, seq_len = in_idx.shape
        positions = torch.arange(seq_len, device=in_idx.device)

        # The positional embeddings broadcast across the batch dimension.
        x = self.tok_emb(in_idx) + self.pos_emb(positions)
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        return self.out_head(self.final_norm(x))
    
class DummyTransformerBlock(nn.Module):
    """Stand-in for a transformer block.

    It has no layers of its own; it exists so the surrounding model can be
    assembled and its data flow checked before a real block is written.
    """

    def __init__(self, cfg):
        # cfg is accepted to match the intended constructor signature but is
        # not read by this placeholder.
        super().__init__()

    def forward(self, x):
        """Return the input unchanged."""
        return x
    
class DummyLayerNorm(nn.Module):
    """Stand-in for a layer-normalization module.

    A real implementation would normalize a layer's outputs before they are
    passed on; this placeholder performs no normalization at all.
    """

    def __init__(self, normalized_shape, eps=1e-5):
        # Both arguments are accepted only to mirror a LayerNorm-style
        # constructor; neither is stored or used.
        super().__init__()

    def forward(self, x):
        """Return the input unchanged."""
        return x


tokenize a batch consisting of two text
inputs for the GPT model using the tiktoken tokenizer

In [30]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")

txt1 = "Every effort moves you"
txt2 = "Every day holds a"

# Encode each text into a 1-D tensor of token IDs and stack the results along
# a new leading (batch) dimension. Stacking requires both encodings to have
# the same length.
batch = torch.stack(
    [torch.tensor(tokenizer.encode(text)) for text in (txt1, txt2)],
    dim=0,
)
print(batch)

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])


initialize a new 124 million parameter DummyGPTModel instance and feed it the
tokenized batch

In [31]:
# Seed first: everything random below (weight initialization, and dropout
# while the model is still in its default training mode) depends on it.
torch.manual_seed(123)
model = DummyGPTModel(GPT_CONFIG_124M) # embedding and output-head weights are randomly initialized
logits = model(batch) # forward pass on the tokenized batch
print("Output shape:", logits.shape)
print(logits) # logits: the output of the final linear layer (out_head), one score per vocabulary entry


Output shape: torch.Size([2, 4, 50257])
tensor([[[-1.2034,  0.3201, -0.7130,  ..., -1.5548, -0.2390, -0.4667],
         [-0.1192,  0.4539, -0.4432,  ...,  0.2392,  1.3469,  1.2430],
         [ 0.5307,  1.6720, -0.4695,  ...,  1.1966,  0.0111,  0.5835],
         [ 0.0139,  1.6754, -0.3388,  ...,  1.1586, -0.0435, -1.0400]],

        [[-1.0908,  0.1798, -0.9484,  ..., -1.6047,  0.2439, -0.4530],
         [-0.7860,  0.5581, -0.0610,  ...,  0.4835, -0.0077,  1.6621],
         [ 0.3567,  1.2698, -0.6398,  ..., -0.0162, -0.1296,  0.3717],
         [-0.2407, -0.7349, -0.5102,  ...,  2.0057, -0.3694,  0.1814]]],
       grad_fn=<UnsafeViewBackward0>)


implement a neural network layer with 5 inputs and 6 outputs that we apply to two input
examples

In [None]:
torch.manual_seed(123)

# Two random input examples with five features each.
batch_example = torch.randn(2, 5)

# Linear layer (5 inputs -> 6 outputs) followed by ReLU, a non-linear
# activation that clamps negative values to zero.
layer = nn.Sequential(
    nn.Linear(in_features=5, out_features=6),
    nn.ReLU(),
)
out = layer(batch_example)
print(out)

# The first row lists the layer outputs for the first input example,
# and the second row lists the layer outputs for the second input example.


tensor([[0.2260, 0.3470, 0.0000, 0.2216, 0.0000, 0.0000],
        [0.2133, 0.2394, 0.0000, 0.5198, 0.3297, 0.0000]],
       grad_fn=<ReluBackward0>)


before applying layer normalization to these outputs, let's examine their mean and
variance

In [38]:
# Reduce over the last dimension (the features of each example).
# keepdim=True retains the reduced dimension with size 1, so the results have
# one row per input and broadcast cleanly against `out` later on.
mean = torch.mean(out, dim=-1, keepdim=True)
var = torch.var(out, dim=-1, keepdim=True)
print("Mean:\n", mean)
print("Variance:\n", var)

# The first row of each printed tensor belongs to the first input row,
# and the second row belongs to the second input row.


Mean:
 tensor([[0.1324],
        [0.2170]], grad_fn=<MeanBackward1>)
Variance:
 tensor([[0.0231],
        [0.0398]], grad_fn=<VarBackward0>)


apply layer normalization to the layer outputs we obtained earlier. The
operation consists of subtracting the mean and dividing by the square root of the variance
(also known as standard deviation)

In [39]:
# Standardize each row: subtract its mean, then divide by its standard
# deviation (the square root of the variance).
out_norm = (out - mean) / var.sqrt()

# Recompute the statistics on the normalized values to verify the result.
mean = torch.mean(out_norm, dim=-1, keepdim=True)
var = torch.var(out_norm, dim=-1, keepdim=True)
print("Normalized layer outputs:\n", out_norm)
print("Mean:\n", mean)
print("Variance:\n", var)

Normalized layer outputs:
 tensor([[ 0.6159,  1.4126, -0.8719,  0.5872, -0.8719, -0.8719],
        [-0.0189,  0.1121, -1.0876,  1.5173,  0.5647, -1.0876]],
       grad_fn=<DivBackward0>)
Mean:
 tensor([[    0.0000],
        [    0.0000]], grad_fn=<MeanBackward1>)
Variance:
 tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


turn off the scientific notation when printing tensor
values by setting sci_mode to False for readability

In [40]:
# Turn off scientific notation in printed tensors for readability.
torch.set_printoptions(sci_mode=False)
print("Mean:\n", mean)
print("Variance:\n", var)

Mean:
 tensor([[    0.0000],
        [    0.0000]], grad_fn=<MeanBackward1>)
Variance:
 tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)


Layer normalization class

In [43]:
class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with learnable scale and shift.

    Each vector along the last dimension is normalized to zero mean and unit
    variance, then multiplied element-wise by `scale` and offset by `shift`.

    Args:
        emb_dim: Size of the last dimension of the inputs; `scale` and `shift`
            each hold this many trainable values.
        eps: Small constant added to the variance before taking the square
            root, preventing division by zero. Defaults to 1e-5, the value
            that was previously hard-coded.
    """

    def __init__(self, emb_dim: int, eps: float = 1e-5):
        super().__init__()
        self.eps = eps
        # Trainable per-feature gain and offset. Initialized to ones and
        # zeros, so the module starts out as a pure normalization.
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Normalize `x` along its last dimension and apply scale and shift."""
        mean = x.mean(dim=-1, keepdim=True)
        # unbiased=False divides by n rather than n - 1 (no Bessel correction).
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        # Subtract the mean and divide by the standard deviation; eps keeps
        # the denominator strictly positive.
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift

below normalizes the values of each of the two inputs such that they have a mean of 0 and a
variance of 1

In [None]:
ln = LayerNorm(emb_dim=5)
out_ln = ln(batch_example)  # normalized (then scaled and shifted) values

# unbiased=False divides by n instead of n - 1 (the biased estimator), which
# matches the variance computed inside LayerNorm.forward.
mean = torch.mean(out_ln, dim=-1, keepdim=True)
var = torch.var(out_ln, dim=-1, unbiased=False, keepdim=True)
print("Mean:\n", mean)
print("Variance:\n", var)


Mean:
 tensor([[    -0.0000],
        [     0.0000]], grad_fn=<MeanBackward1>)
Variance:
 tensor([[1.0000],
        [1.0000]], grad_fn=<VarBackward0>)
