This is a small notebook that follows the structure of the code and defines examples of different functions and concepts

In [2]:
import torch
import torch.nn as nn

### **1. Input Tokens, Input Embeddings, Positional Embeddings:**

#### **Broadcasting**

In [162]:
# Broadcasting demo (rules: https://pytorch.org/docs/stable/notes/broadcasting.html)
#
# Shapes are aligned starting from the trailing dimension. Each aligned pair of sizes
# must be equal, or one of them must be 1 (or missing altogether).
# Here (B,T,C) + (T,C) -> (B,T,C), i.e. (2,3,4) + (3,4) -> (2,3,4): the (T,C) operand
# is reused for every element of the batch dimension.
tensor1 = torch.ones(2, 3, 4)            # all ones, shape (2, 3, 4)
tensor2 = torch.arange(12).view(3, 4)    # integers 0..11, shape (3, 4)
result = torch.add(tensor1, tensor2)     # broadcast addition; the result is floating point
for template, shown in (
    ("Tensor 1 (shape: {}):\n{}", tensor1),
    ("Tensor 2 (shape: {}):\n{}", tensor2),
    ("Result (shape: {}):\n{}", result),
):
    print(template.format(shown.shape, shown))

Tensor 1 (shape: torch.Size([2, 3, 4])):
tensor([[[1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.]],

        [[1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.]]])
Tensor 2 (shape: torch.Size([3, 4])):
tensor([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11]])
Result (shape: torch.Size([2, 3, 4])):
tensor([[[ 1.,  2.,  3.,  4.],
         [ 5.,  6.,  7.,  8.],
         [ 9., 10., 11., 12.]],

        [[ 1.,  2.,  3.,  4.],
         [ 5.,  6.,  7.,  8.],
         [ 9., 10., 11., 12.]]])


#### **nn.Embedding**

In [163]:
# https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html
# nn.Embedding is a lookup table: row i of `embedding.weight` is the vector returned
# for token id i. The same table is applied to a (B,T) = (1,4) batch and to a
# (B,T) = (2,4) batch; the output shape is (B, T, embedding_dim) in both cases.

# Lookup table with 4 rows (valid token ids 0..3), each row a 5-dimensional vector
embedding = torch.nn.Embedding(num_embeddings=4, embedding_dim=5)

# Case 1: B,T = 1,4
# NOTE: `input` shadows the Python builtin; the name is kept because it is a
# notebook-level variable.
input = torch.tensor([[1, 2, 3, 0]])
output = embedding(input)
print("Output tensor (shape: {}):\n{}".format(output.shape, output))
# Token id 0 is the last id of the sequence, so weight[0] equals output[0][3]
print("Firt Element of Embedding Table:\n{}".format(embedding.weight[0]))
print("-")

# Case 2: B,T = 2,4
input = torch.tensor([[1, 2, 3, 0],[1, 3, 2, 1]], dtype=torch.long)
print("Input tensor (shape: {}):\n{}".format(input.shape, input))
# Lookup embeddings (performed once; the earlier duplicate lookup was redundant)
output = embedding(input)
print("Output tensor (shape: {}):\n{}".format(output.shape, output))
# output[0][0] is the embedding of the first token of the first sequence (token id 1)
print("Embedding for first element:\n{}".format(output[0][0]))

Output tensor (shape: torch.Size([1, 4, 5])):
tensor([[[ 0.2523,  1.9565, -0.0244,  0.1356,  1.4870],
         [ 0.2593, -0.2504, -0.7831, -1.6453,  1.7529],
         [ 0.6701,  0.3365, -0.1757,  1.5942,  0.3505],
         [ 0.5761, -0.7605, -0.5727, -0.9205,  0.1978]]],
       grad_fn=<EmbeddingBackward0>)
Firt Element of Embedding Table:
tensor([ 0.5761, -0.7605, -0.5727, -0.9205,  0.1978],
       grad_fn=<SelectBackward0>)
-
Input tensor (shape: torch.Size([2, 4])):
tensor([[1, 2, 3, 0],
        [1, 3, 2, 1]])
Output tensor (shape: torch.Size([2, 4, 5])):
tensor([[[ 0.2523,  1.9565, -0.0244,  0.1356,  1.4870],
         [ 0.2593, -0.2504, -0.7831, -1.6453,  1.7529],
         [ 0.6701,  0.3365, -0.1757,  1.5942,  0.3505],
         [ 0.5761, -0.7605, -0.5727, -0.9205,  0.1978]],

        [[ 0.2523,  1.9565, -0.0244,  0.1356,  1.4870],
         [ 0.6701,  0.3365, -0.1757,  1.5942,  0.3505],
         [ 0.2593, -0.2504, -0.7831, -1.6453,  1.7529],
         [ 0.2523,  1.9565, -0.0244,  0.1

In [164]:
# Other small operations

# torch.arange: build the indices 0..len-1 for a sequence of the same length
tensor = torch.tensor([3,2,1,0])
print("Tensor:\n{}".format(tensor))
print("Tensor shape: {}".format(tensor.shape))
tensor_2 = torch.arange(len(tensor))
print("Tensor 2:\n{}".format(tensor_2))
print("-")

# Initialize from normal distribution (fills the empty 3x3 tensor in place and returns it)
tensor = nn.init.normal_(torch.empty(3, 3),mean=0.0,std=0.01)
print("Tensor:\n{}".format(tensor))
print("-")

# Sample from multinomial distribution (draw 10 times to see different values)
# The weights do not change between draws, so they are built once outside the loop.
tensor = torch.tensor([0.75,0.20,0.05])
selected_values = []
for i in range(10):
    new_val = torch.multinomial(tensor, 1)  # one index drawn according to the weights
    selected_values.append(new_val.item())
print("Selected Values:\n{}".format(selected_values))

# Compute softmax
tensor_unormalized = torch.tensor([1.0, 2.0, 3.0])
# 1-D input: dim=-1 and dim=0 refer to the same (only) dimension
tensor_softmax = torch.nn.functional.softmax(tensor_unormalized, dim=-1)
print("Tensor Softmax:\n{}".format(tensor_softmax))
print("-")
tensor_2D_unormalized = torch.tensor([[-1.5,4,0.2],[1.0, 2.0, 3.0]])
# 2-D input: dim=-1 is dim=1 here, so each row sums to 1 (dim=0 would normalize each column)
tensor_2D_softmax = torch.nn.functional.softmax(tensor_2D_unormalized, dim=-1)
print("Tensor 2D Softmax:\n{}".format(tensor_2D_softmax))
print("-")

# Logits of last token (B, T, C) = (1, 4, 6)
# Define a tensor of shape (1, 4, 6)
tensor = torch.tensor([[
    [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],  # Token 1
    [7.0, 8.0, 9.0, 10.0, 11.0, 12.0],  # Token 2
    [13.0, 14.0, 15.0, 16.0, 17.0, 18.0],  # Token 3
    [19.0, 20.0, 21.0, 22.0, 23.0, 24.0]   # Token 4 (last token)
]])

print("Tensor shape:\n{}".format(tensor.shape))
print("Tensor:\n{}".format(tensor))
# Logits of the last token: indexing T with -1 drops that dimension, (B, T, C) -> (B, C)
last_token_logits = tensor[:, -1, :]
print("Last token logits:\n{}".format(last_token_logits))
print("shaoe: {}".format(last_token_logits.shape))
# Softmax of the last token (over the C dimension)
last_token_softmax = torch.nn.functional.softmax(last_token_logits, dim=-1)
print("Last token softmax:\n{}".format(last_token_softmax))

Tensor:
tensor([3, 2, 1, 0])
Tensor shape: torch.Size([4])
Tensor 2:
tensor([0, 1, 2, 3])
-
Tensor:
tensor([[ 1.2848e-02,  1.1640e-02, -8.8220e-03],
        [-1.7866e-02, -4.3383e-03, -4.9922e-03],
        [-1.1544e-03,  1.4105e-02,  8.3314e-06]])
-
Selected Values:
[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]
Tensor Softmax:
tensor([0.0900, 0.2447, 0.6652])
-
Tensor 2D Softmax:
tensor([[0.0040, 0.9742, 0.0218],
        [0.0900, 0.2447, 0.6652]])
-
Tensor shape:
torch.Size([1, 4, 6])
Tensor:
tensor([[[ 1.,  2.,  3.,  4.,  5.,  6.],
         [ 7.,  8.,  9., 10., 11., 12.],
         [13., 14., 15., 16., 17., 18.],
         [19., 20., 21., 22., 23., 24.]]])
Last token logits:
tensor([[19., 20., 21., 22., 23., 24.]])
shaoe: torch.Size([1, 6])
Last token softmax:
tensor([[0.0043, 0.0116, 0.0315, 0.0858, 0.2331, 0.6337]])


In [165]:

# One embedding table, two batches: the same 4 x 5 lookup table serves any (B, T) input
embedding = nn.Embedding(4, 5)  # positional form of num_embeddings=4, embedding_dim=5

# Test 1 uses a single sequence of shape (1, 4); Test 2 uses two sequences, shape (2, 4)
input1 = torch.tensor([[1, 2, 3, 0]], dtype=torch.int64)
input2 = torch.tensor([[1, 1, 2, 3], [1, 2, 3, 0]], dtype=torch.int64)
output1 = embedding(input1)
output2 = embedding(input2)

for template, shown in (
    ("Embedding Table:\n{}", embedding.weight),
    ("First Element of Embedding Table:\n{}", embedding.weight[0]),
    ("Output Tensor (Test 1):\n{}", output1),
):
    print(template.format(shown))

print("-" * 50)

# output2[0][0] is the vector looked up for the first token id of the first sequence
for template, shown in (
    ("Input Tensor (Test 2):\n{}", input2),
    ("Output Tensor (Test 2):\n{}", output2),
    ("Embedding for First Element in Output Tensor (Test 2):\n{}", output2[0][0]),
):
    print(template.format(shown))


Embedding Table:
Parameter containing:
tensor([[-0.3997,  0.6892, -0.9752,  0.1737, -0.1289],
        [-0.2019,  2.3380, -2.0946, -0.5255, -1.4160],
        [-1.1896, -0.4828,  0.9657, -0.3822,  0.0784],
        [ 0.6606, -2.8634, -0.0859, -1.1614,  0.1609]], requires_grad=True)
First Element of Embedding Table:
tensor([-0.3997,  0.6892, -0.9752,  0.1737, -0.1289],
       grad_fn=<SelectBackward0>)
Output Tensor (Test 1):
tensor([[[-0.2019,  2.3380, -2.0946, -0.5255, -1.4160],
         [-1.1896, -0.4828,  0.9657, -0.3822,  0.0784],
         [ 0.6606, -2.8634, -0.0859, -1.1614,  0.1609],
         [-0.3997,  0.6892, -0.9752,  0.1737, -0.1289]]],
       grad_fn=<EmbeddingBackward0>)
--------------------------------------------------
Input Tensor (Test 2):
tensor([[1, 1, 2, 3],
        [1, 2, 3, 0]])
Output Tensor (Test 2):
tensor([[[-0.2019,  2.3380, -2.0946, -0.5255, -1.4160],
         [-0.2019,  2.3380, -2.0946, -0.5255, -1.4160],
         [-1.1896, -0.4828,  0.9657, -0.3822,  0.0784],


In [166]:
# Token-embedding lookup on a small vocabulary
vocab_size, embedding_dim = 7, 6  # vocabulary size, width of each embedding vector
embedding = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
print(embedding.weight)

# Token ids with shape (batch size, sequence length) = (2, 4)
tokenized_text = torch.tensor([
    [1, 2, 3, 4],
    [1, 2, 3, 5],
])
print(tokenized_text)
# One embedding_dim-wide vector per token id -> shape (2, 4, 6)
embeddings = embedding(tokenized_text)
print(embeddings)
print(embeddings.shape)

Parameter containing:
tensor([[ 1.4902,  0.7087, -0.7820, -0.1957, -1.3458,  2.9447],
        [-0.6739, -0.2622,  1.8307, -0.6616, -0.1938,  0.0837],
        [-0.0206,  0.0453, -1.2263, -0.2533,  0.5648,  0.5291],
        [-0.6192,  1.9398, -1.5451,  1.5133, -1.5949, -1.0607],
        [-0.9507,  0.6232,  0.3598,  0.7308,  0.2165, -0.0649],
        [ 1.0993, -0.0993,  0.4505,  1.1188, -0.6635, -0.9021],
        [ 1.4747, -0.4545, -0.7924, -0.8058,  1.5112, -0.1523]],
       requires_grad=True)
tensor([[1, 2, 3, 4],
        [1, 2, 3, 5]])
tensor([[[-0.6739, -0.2622,  1.8307, -0.6616, -0.1938,  0.0837],
         [-0.0206,  0.0453, -1.2263, -0.2533,  0.5648,  0.5291],
         [-0.6192,  1.9398, -1.5451,  1.5133, -1.5949, -1.0607],
         [-0.9507,  0.6232,  0.3598,  0.7308,  0.2165, -0.0649]],

        [[-0.6739, -0.2622,  1.8307, -0.6616, -0.1938,  0.0837],
         [-0.0206,  0.0453, -1.2263, -0.2533,  0.5648,  0.5291],
         [-0.6192,  1.9398, -1.5451,  1.5133, -1.5949, -1.0607],


In [167]:
# Broadcasting simplified example: token embeddings (B,T,C) + positional embeddings (T,C)
vocab_size, embedding_dim = 4, 3
block_size, batch_size = 3, 2

embedding_table = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
print(embedding_table.weight)
print("--")
positional_embedding_table = torch.nn.Embedding(num_embeddings=block_size, embedding_dim=embedding_dim)
print(positional_embedding_table.weight)

# (B,T) token ids, shape (batch_size, block_size)
input_ids = torch.tensor([[0, 2, 1],[0, 1, 2]])
print('Inputs \n',input_ids)
print("--")
# (B,T,C) token embeddings, shape (batch_size, block_size, embedding_dim)
input_embeddings = embedding_table(input_ids)
print('Input Embeddings \n', input_embeddings)
print("--")
# (T,C) positional embeddings, shape (block_size, embedding_dim): one row per position 0..T-1
positions = torch.arange(block_size)
print(positions)
positional_embeddings = positional_embedding_table(positions)
print('Positional embeddings \n',positional_embeddings)
print("--")
# (B,T,C) + (T,C) -> (B,T,C): the positional rows are broadcast over the batch dimension
embeddings = input_embeddings + positional_embeddings
print('Final embeddings \n',embeddings)

Parameter containing:
tensor([[-0.1124,  1.3414, -0.0276],
        [ 0.9247,  0.1645, -1.0012],
        [ 0.3607,  0.7374, -0.5141],
        [-0.3547, -0.0061, -1.5211]], requires_grad=True)
--
Parameter containing:
tensor([[-1.2013,  0.9507, -1.5531],
        [ 1.0079,  1.0736,  0.6594],
        [ 0.5344,  0.2102,  0.3869]], requires_grad=True)
Inputs 
 tensor([[0, 2, 1],
        [0, 1, 2]])
--
Input Embeddings 
 tensor([[[-0.1124,  1.3414, -0.0276],
         [ 0.3607,  0.7374, -0.5141],
         [ 0.9247,  0.1645, -1.0012]],

        [[-0.1124,  1.3414, -0.0276],
         [ 0.9247,  0.1645, -1.0012],
         [ 0.3607,  0.7374, -0.5141]]], grad_fn=<EmbeddingBackward0>)
--
tensor([0, 1, 2])
Positional embeddings 
 tensor([[-1.2013,  0.9507, -1.5531],
        [ 1.0079,  1.0736,  0.6594],
        [ 0.5344,  0.2102,  0.3869]], grad_fn=<EmbeddingBackward0>)
--
Final embeddings 
 tensor([[[-1.3137,  2.2921, -1.5807],
         [ 1.3685,  1.8110,  0.1453],
         [ 1.4591,  0.3748, -0.6143

### **2. Decoder**

#### **2.1. Masked Multi-Head Attention**

##### **Linear Layer**

In [173]:
# Simple Linear Layer Example # https://pytorch.org/docs/stable/generated/torch.nn.Linear.html
n_embed, head_size = 4, 2
# nn.Linear stores its weight with shape (out_features, in_features) = (head_size, n_embed)
# and computes y = x @ W^T (no bias here), so the transpose happens inside the layer.
linear_layer = torch.nn.Linear(n_embed, head_size, bias=False)
print(linear_layer.weight)
print("--")
# (B,T,C) input of shape (batch_size, block_size, n_embed) = (1,3,4)
input_embeddings = torch.tensor([[
    [1.0, 2.0, 3.0, 4.0],
    [5.0, 6.0, 7.0, 8.0],
    [9.0, 10.0, 11.0, 12.0],
]])
print('Input Embeddings \n', input_embeddings)
print("--")
output = linear_layer(input_embeddings)
print('Output \n', output, output.shape)

# The same result written out by hand: output = input @ weight^T
output = input_embeddings @ linear_layer.weight.t()
print('Output \n', output, output.shape)

# Both prints show the same values: the layer applies the transposed weight itself

Parameter containing:
tensor([[-0.3037,  0.0434,  0.0251, -0.1345],
        [ 0.4486,  0.2598,  0.3083, -0.3898]], requires_grad=True)
--
Input Embeddings 
 tensor([[[ 1.,  2.,  3.,  4.],
         [ 5.,  6.,  7.,  8.],
         [ 9., 10., 11., 12.]]])
--
Output 
 tensor([[[-0.6792,  0.3340],
         [-2.1575,  2.8416],
         [-3.6358,  5.3492]]], grad_fn=<UnsafeViewBackward0>) torch.Size([1, 3, 2])
Output 
 tensor([[[-0.6792,  0.3340],
         [-2.1575,  2.8416],
         [-3.6358,  5.3492]]], grad_fn=<UnsafeViewBackward0>) torch.Size([1, 3, 2])


##### **Attention Score Formula**

In [197]:
# Transposing the last two dimensions of a tensor
a = torch.tensor([[1, 2], [3, 4], [5, 6]])  # shape (3, 2)
print(a)
print(torch.transpose(a, -2, -1)) # dims -2 and -1 are the last two; result has shape (2, 3)

tensor([[1, 2],
        [3, 4],
        [5, 6]])
tensor([[1, 3, 5],
        [2, 4, 6]])


In [181]:
# Attention scores: q @ k^T
q = torch.ones(1, 2, 3) # Shape (1, 2, 3)
print(q)
k = torch.rand(1, 2, 3).transpose(-2,-1) # (1, 2, 3) -> (1, 3, 2): swap the last two dimensions
print(k)
wei = torch.matmul(q, k)
print(wei)
# Shapes: (1, 2, 3) @ (1, 3, 2) -> (1, 2, 2)
# wei[0][i][j] = sum over c of q[0][i][c] * k[0][c][j]   (k is already transposed here)
# e.g. wei[0][0][0] = q[0][0][0] * k[0][0][0] + q[0][0][1] * k[0][1][0] + q[0][0][2] * k[0][2][0]

# Divide by the square root of the size of the last dimension
k = torch.ones(1, 2, 4) # Shape (1, 2, 4)
print(k)
last_dim = k.shape[-1]
print(last_dim)
k = k / last_dim ** 0.5
print(k)

tensor([[[1., 1., 1.],
         [1., 1., 1.]]])
tensor([[[0.4630, 0.1789],
         [0.1553, 0.8568],
         [0.8870, 0.0808]]])
tensor([[[1.5053, 1.1166],
         [1.5053, 1.1166]]])
tensor([[[1., 1., 1., 1.],
         [1., 1., 1., 1.]]])
4
tensor([[[0.5000, 0.5000, 0.5000, 0.5000],
         [0.5000, 0.5000, 0.5000, 0.5000]]])


##### **Masking**


In [206]:
# Lower triangular mask: row t keeps columns 0..t and hides the rest
tensor = torch.ones(3, 3)
print("Tensor:\n{}".format(tensor))
# tril keeps the lower triangle (including the diagonal) and zeroes everything above it
mask = torch.ones(3, 3).tril()
print("Mask:\n{}".format(mask))
# Positions where the mask is 0 are set to -inf ...
masked_tensor = tensor.masked_fill(mask == 0, float('-inf'))
print("Masked Tensor:\n{}".format(masked_tensor))
# ... so that the row-wise softmax (over the last dimension) gives them probability 0
softmax_tensor = torch.softmax(masked_tensor, dim=-1)
print("Softmax Tensor:\n{}".format(softmax_tensor))

# Upper triangular mask, for comparison: row t keeps columns t..end
tensor = torch.ones(3, 3)
mask = torch.ones(3, 3).triu()
masked_tensor = tensor.masked_fill(mask == 0, float('-inf'))
print(masked_tensor)

Tensor:
tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])
Mask:
tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])
Masked Tensor:
tensor([[1., -inf, -inf],
        [1., 1., -inf],
        [1., 1., 1.]])
Softmax Tensor:
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
tensor([[1., 1., 1.],
        [-inf, 1., 1.],
        [-inf, -inf, 1.]])


#### **2.2. Norm and Add (Pre-Norm)**

In [240]:
# https://pytorch.org/docs/stable/generated/torch.nn.LayerNorm.html
# Example with B,T,C = 1,3,4: each token (one row of C values) is normalized on its own
tensor = torch.arange(1.0, 13.0).reshape(1, 3, 4)  # values 1..12 laid out as (1, 3, 4)
print("Tensor shape:\n{}".format(tensor.shape))
print("Tensor:\n{}".format(tensor))
# normalized_shape=4 -> statistics are computed over the last dimension (size 4)
layer_norm = nn.LayerNorm(4)
output = layer_norm(tensor)
for template, shown in (("Output shape:\n{}", output.shape), ("Output:\n{}", output)):
    print(template.format(shown))

Tensor shape:
torch.Size([1, 3, 4])
Tensor:
tensor([[[ 1.,  2.,  3.,  4.],
         [ 5.,  6.,  7.,  8.],
         [ 9., 10., 11., 12.]]])
Output shape:
torch.Size([1, 3, 4])
Output:
tensor([[[-1.3416, -0.4472,  0.4472,  1.3416],
         [-1.3416, -0.4472,  0.4472,  1.3416],
         [-1.3416, -0.4472,  0.4472,  1.3416]]],
       grad_fn=<NativeLayerNormBackward0>)


In [243]:
# LayerNorm is applied to each token of the sequence independently.
# Print the normalized first token of the first sequence
# (`output` and `tensor` come from the LayerNorm cells of this section)
print("Normalized token 1:\n{}".format(output[0][0]))
# Recompute the normalization of that token by hand
single_token = tensor[0][0]
print(single_token)
# PyTorch's LayerNorm computes (x - mean) / sqrt(var + eps) over the last dimension,
# using the biased variance (unbiased=False). eps is added to the variance *inside*
# the square root, which is not the same as adding it to the standard deviation.
eps = 1e-5
mean = single_token.mean(dim=-1, keepdim=True)
var = single_token.var(dim=-1, unbiased=False, keepdim=True)
# Normalize the single token
normalized_token = (single_token - mean) / torch.sqrt(var + eps)
print("Original token:", single_token)
print("Normalized token:", normalized_token)

Normalized token 1:
tensor([-1.3416, -0.4472,  0.4472,  1.3416])
tensor([1., 2., 3., 4.])
Original token: tensor([1., 2., 3., 4.])
Normalized token: tensor([-1.3416, -0.4472,  0.4472,  1.3416])


In [242]:
# Manually writing the formula of Pytorch LayerNorm
def layer_norm(tensor, eps=1e-5):
    """Normalize `tensor` over its last dimension the way torch.nn.LayerNorm does.

    Computes (x - mean) / sqrt(var + eps), where mean and the biased variance
    (unbiased=False) are taken over the last dimension. No learnable scale or
    shift is applied.

    Args:
        tensor: floating-point tensor; every slice along the last dimension is
            normalized independently.
        eps: small constant added to the variance, inside the square root, for
            numerical stability.

    Returns:
        A tensor with the same shape as `tensor`.
    """
    mean = tensor.mean(dim=-1, keepdim=True)
    var = tensor.var(dim=-1, unbiased=False, keepdim=True)
    # eps goes inside the square root; `std + eps` gives different values when the
    # variance is small compared to eps.
    return (tensor - mean) / torch.sqrt(var + eps)

# Run the hand-written layer_norm on the notebook-level `tensor` and show the result
output = layer_norm(tensor)
for template, shown in (("Output shape:\n{}", output.shape), ("Output:\n{}", output)):
    print(template.format(shown))

Output shape:
torch.Size([1, 3, 4])
Output:
tensor([[[-1.3416, -0.4472,  0.4472,  1.3416],
         [-1.3416, -0.4472,  0.4472,  1.3416],
         [-1.3416, -0.4472,  0.4472,  1.3416]]])


In [244]:
# Compare batch normalization and layer normalization on the same (B,T,C) = (1,3,4) input
tensor = torch.tensor([[
    [1.0, 2.0, 3.0, 4.0],
    [5.0, 6.0, 7.0, 8.0],
    [9.0, 10.0, 11.0, 12.0],
]])
print("Tensor shape:\n{}".format(tensor.shape))
print("Tensor:\n{}".format(tensor))

# Layer normalization: statistics per token, taken across its 4 embedding values (rows)
layer_norm = nn.LayerNorm(4)
output_layer_norm = layer_norm(tensor)
print("Output shape (LayerNorm):\n{}".format(output_layer_norm.shape))
print("Output (LayerNorm):\n{}".format(output_layer_norm))

# Batch normalization: statistics per embedding dimension, taken across all tokens (columns).
# BatchNorm1d is fed a (N, C) input here, so tokens are flattened to (B*T, 4) and restored after.
batch_norm = nn.BatchNorm1d(4)
flat_tokens = tensor.view(-1, 4)
output_batch_norm = batch_norm(flat_tokens).view(tensor.shape)
print("Output shape (BatchNorm):\n{}".format(output_batch_norm.shape))
print("Output (BatchNorm):\n{}".format(output_batch_norm))

Tensor shape:
torch.Size([1, 3, 4])
Tensor:
tensor([[[ 1.,  2.,  3.,  4.],
         [ 5.,  6.,  7.,  8.],
         [ 9., 10., 11., 12.]]])
Output shape (LayerNorm):
torch.Size([1, 3, 4])
Output (LayerNorm):
tensor([[[-1.3416, -0.4472,  0.4472,  1.3416],
         [-1.3416, -0.4472,  0.4472,  1.3416],
         [-1.3416, -0.4472,  0.4472,  1.3416]]],
       grad_fn=<NativeLayerNormBackward0>)
Output shape (BatchNorm):
torch.Size([1, 3, 4])
Output (BatchNorm):
tensor([[[-1.2247, -1.2247, -1.2247, -1.2247],
         [ 0.0000,  0.0000,  0.0000,  0.0000],
         [ 1.2247,  1.2247,  1.2247,  1.2247]]], grad_fn=<ViewBackward0>)


#### **2.3. Feed-Forward Neural-Network**

In [245]:
# ReLU activation: negative entries become 0, non-negative entries pass through unchanged
tensor = torch.tensor([
    [-1.0, 0.0, 1.0],
    [2.0, -2.0, 0.0],
])
print("Tensor:\n{}".format(tensor))
relu = nn.ReLU()
output = relu(tensor)
print("Output:\n{}".format(output))

Tensor:
tensor([[-1.,  0.,  1.],
        [ 2., -2.,  0.]])
Output:
tensor([[0., 0., 1.],
        [2., 0., 0.]])


In [401]:
# Function to compute mean gradients with optional ReLU
def compute_gradients(with_relu, runs=10):
    """Average, over `runs` freshly initialised networks, the mean input gradient.

    Each run builds Linear(3, 128) -> [ReLU] -> Linear(128, 3) with default
    initialisation, feeds the fixed input [[1, 2, 3]], uses the sum of the
    outputs as the loss, and records the mean of d(loss)/d(input).

    Args:
        with_relu: if truthy, a ReLU is placed between the two linear layers.
        runs: number of independently initialised networks to average over.

    Returns:
        A Python float: the sum of the per-run means divided by `runs`.
    """
    def input_grad_mean():
        # Fresh leaf tensor each run so .grad starts empty
        x = torch.tensor([[1.0, 2.0, 3.0]], requires_grad=True)
        activation = [torch.nn.ReLU()] if with_relu else []
        network = torch.nn.Sequential(
            torch.nn.Linear(in_features=3, out_features=128),   # first layer
            *activation,
            torch.nn.Linear(in_features=128, out_features=3),   # second layer
        )
        # Simple scalar loss: sum of all outputs, then backpropagate to the input
        network(x).sum().backward()
        return x.grad.mean().item()

    per_run_means = [input_grad_mean() for _ in range(runs)]
    return sum(per_run_means) / runs

# Experiment: average input gradient over many random networks, without and then with ReLU
num_runs = 10000
mean_grad_without_relu, mean_grad_with_relu = (
    compute_gradients(with_relu=use_relu, runs=num_runs) for use_relu in (False, True)
)

# Print results
for template, shown in (
    ("Average mean of gradients without ReLU: {:.6f}", mean_grad_without_relu),
    ("Average mean of gradients with ReLU: {:.6f}", mean_grad_with_relu),
):
    print(template.format(shown))


Average mean of gradients without ReLU: -0.002167
Average mean of gradients with ReLU: 0.001124


### **3. Linear Layer**

In [405]:
# Last linear layer: projects every token vector (C values) to vocab_size logits
B, T, C = 1, 3, 4
vocab_size = 5
# Token embeddings: (B,T) ids -> (B,T,C); every id is 1 here, so all rows are identical
embedding = nn.Embedding(vocab_size, C)
input = torch.ones(B, T, dtype=torch.long)
output = embedding(input)
print("Output tensor (shape: {}):\n{}".format(output.shape, output))

# (B,T,C) -> (B,T,vocab_size)
linear = nn.Linear(C, vocab_size)
logits = linear(output)
print("Logits tensor (shape: {}):\n{}".format(logits.shape, logits))


Output tensor (shape: torch.Size([1, 3, 4])):
tensor([[[-0.6495,  0.2708,  0.9177, -1.7845],
         [-0.6495,  0.2708,  0.9177, -1.7845],
         [-0.6495,  0.2708,  0.9177, -1.7845]]], grad_fn=<EmbeddingBackward0>)
Logits tensor (shape: torch.Size([1, 3, 5])):
tensor([[[-0.4887,  1.1743,  0.5417,  0.0886,  0.7914],
         [-0.4887,  1.1743,  0.5417,  0.0886,  0.7914],
         [-0.4887,  1.1743,  0.5417,  0.0886,  0.7914]]],
       grad_fn=<ViewBackward0>)


### **4. Training After Linear**

In [427]:
# Small cross-entropy example
# https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html

# One sample, three classes: logits has shape (1, 3), target holds a single class index
logits = torch.tensor([[2.0, 1.0, 0.1]])
target = torch.tensor([2])
loss_fn = nn.CrossEntropyLoss()
loss = loss_fn(logits, target)
print("Loss:\n{}".format(loss))

# The same value by hand: softmax over the classes, then -log(probability of the target class)
softmax = torch.softmax(logits, dim=-1)
print("Softmax:\n{}".format(softmax))
nll = -softmax[0][target.item()].log()
print("Negative Log Likelihood:\n{}".format(nll))


Loss:
2.3170299530029297
Softmax:
tensor([[0.6590, 0.2424, 0.0986]])
Negative Log Likelihood:
2.3170299530029297


In [410]:
# Flattening (B,T,C) -> (B*T,C), the reshape applied before a per-token loss
B,T,vocab_size = 2,3,5
# Embedding width. Defined here so the cell does not depend on a `C` left over from
# an earlier cell (it has the same value, 4, as before).
C = 4
# Embedding table of shape (vocab_size, C); looking up (B,T) ids gives a (B,T,C) tensor
embedding = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=C)
input = torch.ones((B, T), dtype=torch.long)
output = embedding(input)
print("Output tensor (shape: {}):\n{}".format(output.shape, output))

# Flatten the output tensor to shape (B*T, C): batch and time are merged and the
# last dimension is left untouched (-1 lets view infer it)
output_flattened = output.view(B*T, -1)
print("Flattened output tensor (shape: {}):\n{}".format(output_flattened.shape, output_flattened))

Output tensor (shape: torch.Size([2, 3, 4])):
tensor([[[2.1440, 0.4149, 0.5095, 0.1742],
         [2.1440, 0.4149, 0.5095, 0.1742],
         [2.1440, 0.4149, 0.5095, 0.1742]],

        [[2.1440, 0.4149, 0.5095, 0.1742],
         [2.1440, 0.4149, 0.5095, 0.1742],
         [2.1440, 0.4149, 0.5095, 0.1742]]], grad_fn=<EmbeddingBackward0>)
Flattened output tensor (shape: torch.Size([6, 4])):
tensor([[2.1440, 0.4149, 0.5095, 0.1742],
        [2.1440, 0.4149, 0.5095, 0.1742],
        [2.1440, 0.4149, 0.5095, 0.1742],
        [2.1440, 0.4149, 0.5095, 0.1742],
        [2.1440, 0.4149, 0.5095, 0.1742],
        [2.1440, 0.4149, 0.5095, 0.1742]], grad_fn=<ViewBackward0>)


In [417]:
# Step 1: parameters
vocab_size = 7     # number of classes (vocabulary size)
B, T = 2, 4        # batch size, sequence length

# Step 2: random logits (unnormalized scores)
# torch.rand draws from [0, 1); scaling by 4 and shifting by -2 maps that to [-2, 2)
logits = torch.rand(B * T, vocab_size) * 4 - 2
print("Logits shape (B*T, vocab_size):\n{}".format(logits.shape))
print("Logits:\n{}".format(logits))

# Step 3: random targets
# One class index per row, drawn from 0 .. vocab_size - 1 (the upper bound is exclusive)
targets = torch.randint(0, vocab_size, (B * T,))
print("Targets shape (B*T):\n{}".format(targets.shape))
print("Targets:\n{}".format(targets))

# Step 4: cross-entropy loss
# cross_entropy applies log-softmax and negative log-likelihood in one call and
# averages over the B*T rows
loss = torch.nn.functional.cross_entropy(logits, targets)
print("Cross-entropy Loss:\n{}".format(loss))


Logits shape (B*T, vocab_size):
torch.Size([8, 7])
Logits:
tensor([[ 0.7507, -1.6571, -1.8844,  1.9095,  1.3594,  1.9325,  0.0358],
        [ 0.0491, -0.3235,  0.8121,  0.0802, -0.6272, -1.6151,  0.6099],
        [-0.2383,  1.4527, -0.7418, -0.5115, -1.0993, -1.2096,  1.7127],
        [ 1.4735,  0.7973,  0.1058, -0.7465,  0.6852,  1.5404, -1.0098],
        [ 0.5300, -1.8523,  1.7380, -1.2606,  0.5971,  0.1594,  0.2876],
        [ 1.6882,  0.6892, -1.5379,  1.9792,  1.0326, -0.5124,  0.1044],
        [-1.1462,  1.6649, -1.9827,  1.8334,  1.1945, -1.5641, -1.5750],
        [ 1.4218, -0.3487,  1.7386,  0.0817, -0.4494,  1.5103, -1.8102]])
Targets shape (B*T):
torch.Size([8])
Targets:
tensor([4, 0, 4, 1, 4, 2, 5, 3])
Cross-entropy Loss:
2.834946870803833


In [424]:
# Per-token cross-entropy by hand for the first row of the notebook-level `logits` / `targets`
first_logits = logits[0]  # one row of scores; (vocab_size,) assuming logits is (B*T, vocab_size)
print("Logits for the first token:\n{}".format(first_logits))
probabilities = first_logits.softmax(dim=0)
print("Probabilities for the first token:\n{}".format(probabilities))
correct_class = targets[0]  # target class index for that row
print("Correct class index for the first token:\n{}".format(correct_class))
correct_prob = probabilities[correct_class]
print("Probability of the correct class:\n{}".format(correct_prob))

print(correct_prob)
# Loss for this token alone: -log(probability assigned to the correct class)
first_loss = -correct_prob.log()
print("Cross-entropy loss for the first token:\n{}".format(first_loss))

Logits for the first token:
tensor([ 0.7507, -1.6571, -1.8844,  1.9095,  1.3594,  1.9325,  0.0358])
Probabilities for the first token:
tensor([0.1007, 0.0091, 0.0072, 0.3207, 0.1850, 0.3281, 0.0492])
Correct class index for the first token:
4
Probability of the correct class:
0.18499410152435303
tensor(0.1850)
Cross-entropy loss for the first token:
1.6874313354492188


In [434]:
# Minimal forward / backward / optimizer-step cycle on a single linear layer
linear = nn.Linear(2, 1)                                     # 2 inputs -> 1 output
optimizer = torch.optim.SGD(linear.parameters(), lr=0.01)    # plain SGD on the layer's parameters
loss_fn = nn.MSELoss()

# One training example: input of shape (1, 2) and target of shape (1, 1)
input = torch.tensor([[1.0, 2.0]])
target = torch.tensor([[0.0]])

# Forward pass
output = linear(input)
loss = loss_fn(output, target)
print("Loss:\n{}".format(loss))

# Backward pass: clear stale gradients, then populate .grad on the parameters
optimizer.zero_grad()
loss.backward()
print("Gradient of the weights:\n{}".format(linear.weight.grad))

# backward() only fills .grad, so the weights printed here are still the initial ones
print("Initial weights:\n{}".format(linear.weight))

# The optimizer step is what actually changes the weights
optimizer.step()
print("Updated weights:\n{}".format(linear.weight))

Loss:
0.20015186071395874
Gradient of the weights:
tensor([[-0.8948, -1.7895]])
Initial weights:
Parameter containing:
tensor([[ 0.6876, -0.6148]], requires_grad=True)
Updated weights:
Parameter containing:
tensor([[ 0.6966, -0.5969]], requires_grad=True)


### **5. Inference After Linear**

In [441]:
# unsqueeze inserts a new dimension of size 1 at the given position

# Start from a tensor of shape (3,)
tensor = torch.tensor([1.0, 2.0, 3.0])
print("Original tensor:\n{}".format(tensor))
# Insert the new dimension at position 0: (3,) -> (1, 3)
tensor_unsqueezed = torch.unsqueeze(tensor, dim=0)
for template, shown in (
    ("Unsqueezed tensor:\n{}", tensor_unsqueezed),
    ("Unsqueezed tensor shape:\n{}", tensor_unsqueezed.shape),
):
    print(template.format(shown))


Original tensor:
tensor([1., 2., 3.])
Unsqueezed tensor:
tensor([[1., 2., 3.]])
Unsqueezed tensor shape:
torch.Size([1, 3])


In [446]:
# Generation: keep only the last `block_size` positions of a longer sequence
tensor = torch.tensor([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]])
block_size = 4
index = torch.arange(tensor.shape[1])[None, :]   # positions 0..9 with a leading batch dim
index_cond = index[:, -block_size:]              # the last block_size positions
print(index_cond)

# Take the last time step from a 3D (B,T,C) tensor
B, T, C = 1, 3, 4
tensor = torch.tensor([[
    [1.0, 2.0, 3.0, 4.0],
    [5.0, 6.0, 7.0, 8.0],
    [9.0, 10.0, 11.0, 12.0],
]])
print("Tensor shape:\n{}".format(tensor.shape))
print("Tensor:\n{}".format(tensor))
# Indexing T with -1 selects the last token and drops that dimension: (B,T,C) -> (B,C)
last_token = tensor[:, -1]
print("Last token shape:\n{}".format(last_token.shape))
print("Last token:\n{}".format(last_token))


tensor([[6, 7, 8, 9]])
Tensor shape:
torch.Size([1, 3, 4])
Tensor:
tensor([[[ 1.,  2.,  3.,  4.],
         [ 5.,  6.,  7.,  8.],
         [ 9., 10., 11., 12.]]])
Last token shape:
torch.Size([1, 4])
Last token:
tensor([[ 9., 10., 11., 12.]])


In [453]:
# Three ways of choosing the next token from a probability vector
probs = torch.tensor([0.6, 0.2, 0.1, 0.05, 0.05])
print(sum(probs))

# 1. Greedy selection: always the index with the highest probability
greedy_token = torch.argmax(probs)
print("Greedy token:\n{}".format(greedy_token))

# 2. Sampling: draw one index according to the probabilities
sampled_token = torch.multinomial(probs, 1)
print("Sampled token:\n{}".format(sampled_token))


# 3. Temperature sampling
def _rescale_with_temperature(distribution, temperature):
    """Raise each probability to the power 1/temperature and renormalize to sum 1."""
    sharpened = distribution ** (1 / temperature)
    return sharpened / sharpened.sum()


# A temperature below 1 concentrates the mass on the most likely token;
# the smaller the temperature, the closer the result is to greedy selection.
for temperature in (0.1, 0.7):
    scaled_probs = _rescale_with_temperature(probs, temperature)
    print("Scaled probs:\n{}".format(scaled_probs))
    sampled_token = torch.multinomial(scaled_probs, 1)
    print("Sampled token with temperature:\n{}".format(sampled_token))



tensor(1.)
Greedy token:
0
Sampled token:
tensor([1])
Scaled probs:
tensor([9.9998e-01, 1.6935e-05, 1.6538e-08, 1.6150e-11, 1.6150e-11])
Sampled token with temperature:
tensor([0])
Scaled probs:
tensor([0.7446, 0.1550, 0.0576, 0.0214, 0.0214])
Sampled token with temperature:
tensor([2])


### **Additionals (exploding and vanishing gradients):**


In [461]:
# Example: Demonstrating exploding gradients with and without gradient clipping
class DeepNetwork(nn.Module):
    """Stack of square linear layers applied one after another with no activation.

    NOTE: a later cell in this notebook defines another class named
    ``DeepNetwork``; once that cell has run it shadows this one.

    Args:
        num_layers: number of linear layers to stack (default 10).
        dim: input and output size of every layer (default 10).
    """

    def __init__(self, num_layers=10, dim=10):
        super().__init__()
        self.layers = nn.ModuleList(
            [nn.Linear(dim, dim) for _ in range(num_layers)]
        )

    def forward(self, x):
        """Pass ``x`` through every layer in order and return the result."""
        for layer in self.layers:
            x = layer(x)  # No activation to amplify gradient issues
        return x

# Build the network with its default configuration
model = DeepNetwork()

# Overwrite every weight matrix with large-variance values to induce exploding gradients
for layer in model.layers:
    nn.init.normal_(layer.weight, mean=0.0, std=10.0)

# Single random sample: batch size = 1, input size = 10
input_tensor = torch.rand(1, 10, requires_grad=True)

# Forward pass, reduce the output to a scalar loss, then backpropagate
output = model(input_tensor)
loss = output.sum()
loss.backward()


def _print_edge_gradients(header):
    """Print `header`, then the weight gradients of the first and last layer of `model`."""
    print(header)
    for caption, position in (
        ("First layer gradients:\n", 0),
        ("Last layer gradients:\n", -1),
    ):
        print(caption, model.layers[position].weight.grad)


# Gradients exactly as produced by backward()
_print_edge_gradients("Gradients without clipping:")

# Apply gradient clipping with a maximum norm of 1.0
max_norm = 1.0
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)

# Same two layers again, now after clipping
_print_edge_gradients("\nGradients after clipping:")


Gradients without clipping:
First layer gradients:
 tensor([[ 1.6520e+12,  5.4668e+11,  2.8144e+11,  6.0873e+11,  6.7473e+11,
          1.7612e+12,  8.4165e+11,  1.7635e+12,  1.2098e+12,  1.7397e+10],
        [-1.4486e+13, -4.7937e+12, -2.4678e+12, -5.3378e+12, -5.9165e+12,
         -1.5443e+13, -7.3801e+12, -1.5463e+13, -1.0608e+13, -1.5255e+11],
        [-6.1994e+12, -2.0516e+12, -1.0562e+12, -2.2844e+12, -2.5321e+12,
         -6.6092e+12, -3.1585e+12, -6.6178e+12, -4.5401e+12, -6.5286e+10],
        [-2.0655e+12, -6.8351e+11, -3.5188e+11, -7.6110e+11, -8.4362e+11,
         -2.2020e+12, -1.0523e+12, -2.2049e+12, -1.5126e+12, -2.1751e+10],
        [-5.5334e+12, -1.8312e+12, -9.4271e+11, -2.0390e+12, -2.2601e+12,
         -5.8992e+12, -2.8192e+12, -5.9069e+12, -4.0524e+12, -5.8273e+10],
        [ 3.2435e+12,  1.0734e+12,  5.5258e+11,  1.1952e+12,  1.3248e+12,
          3.4579e+12,  1.6525e+12,  3.4624e+12,  2.3754e+12,  3.4157e+10],
        [ 6.9659e+12,  2.3052e+12,  1.1868e+12,  2.566

In [464]:
# Example: Deep network to demonstrate vanishing gradients
class DeepNetwork(nn.Module):
    """Deep stack of 10-to-10 linear layers, each followed by the same activation.

    Args:
        num_layers: number of linear layers to stack (default 20).
        use_relu: use ReLU after every layer when True, Sigmoid otherwise.
    """

    def __init__(self, num_layers=20, use_relu=False):
        super().__init__()
        # Build the layers one by one, then register them as a ModuleList
        stack = []
        for _ in range(num_layers):
            stack.append(nn.Linear(10, 10))
        self.layers = nn.ModuleList(stack)
        # Pick the activation shared by all layers
        if use_relu:
            self.activation = nn.ReLU()
        else:
            self.activation = nn.Sigmoid()

    def forward(self, x):
        """Apply every linear layer followed by the activation, in order."""
        hidden = x
        for linear in self.layers:
            hidden = self.activation(linear(hidden))
        return hidden

# Function to test gradients
def test_gradients(num_layers=20, use_relu=False):
    """Print the mean absolute weight gradient of every layer of a fresh DeepNetwork.

    Builds a network, runs one forward and backward pass on a random input
    with ``output.sum()`` as the loss, and prints one line per layer so the
    gradient magnitudes can be compared across depth.

    Args:
        num_layers: number of layers passed to ``DeepNetwork``.
        use_relu: passed to ``DeepNetwork``; selects ReLU instead of Sigmoid.

    Returns:
        None. Results are printed.
    """
    print("\nTesting with ReLU" if use_relu else "\nTesting with Sigmoid")

    # Instantiate the network
    model = DeepNetwork(num_layers=num_layers, use_relu=use_relu)

    # Input in range [-5, 5) for sigmoid saturation. Scale first and only then
    # mark the result as requiring grad, so input_tensor is a leaf tensor.
    # Writing torch.rand(..., requires_grad=True) * 10 - 5 would bind the name
    # to a non-leaf result whose .grad is never populated.
    input_tensor = (torch.rand(1, 10) * 10 - 5).requires_grad_()

    # Forward pass
    output = model(input_tensor)

    # Define a simple scalar loss
    loss = output.sum()

    # Backward pass to compute gradients
    loss.backward()

    # Print gradient magnitudes across all layers (1-based layer numbers)
    for i, layer in enumerate(model.layers, start=1):
        grad_magnitude = layer.weight.grad.abs().mean().item()
        print(f"Layer {i} gradient magnitude: {grad_magnitude}")

# Run the same 20-layer experiment twice: Sigmoid activation first, then ReLU
for relu_flag in (False, True):
    test_gradients(num_layers=20, use_relu=relu_flag)



Testing with Sigmoid
Layer 1 gradient magnitude: 1.7341980508517134e-17
Layer 2 gradient magnitude: 8.603757675814747e-17
Layer 3 gradient magnitude: 3.944868016401732e-16
Layer 4 gradient magnitude: 2.8540074237969797e-15
Layer 5 gradient magnitude: 1.3276803532177427e-14
Layer 6 gradient magnitude: 6.872605783081465e-14
Layer 7 gradient magnitude: 5.469681240051483e-13
Layer 8 gradient magnitude: 4.498576424566414e-12
Layer 9 gradient magnitude: 3.3529137799526154e-11
Layer 10 gradient magnitude: 2.3103577384553375e-10
Layer 11 gradient magnitude: 2.09278572249616e-09
Layer 12 gradient magnitude: 2.1537672978411138e-08
Layer 13 gradient magnitude: 1.6754657394812966e-07
Layer 14 gradient magnitude: 8.938436053540499e-07
Layer 15 gradient magnitude: 6.043735993443988e-06
Layer 16 gradient magnitude: 3.8616293750237674e-05
Layer 17 gradient magnitude: 0.00024742557434365153
Layer 18 gradient magnitude: 0.0024572371039539576
Layer 19 gradient magnitude: 0.015643732622265816
Layer 20 gr