<a href="https://colab.research.google.com/github/bubuloMallone/NeuralNetworksLM/blob/main/backprop_tensor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Understanding the Backpropagation Algorithm

The aim of this notebook is to implement the backpropagation algorithm from scratch, this time at the tensor level (like PyTorch's `.backward()` method), to understand how it works under the hood. This also helps in understanding how to use it optimally in later training.

In [5]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline
import random

In [2]:
# read all the words dataset

!wget https://raw.githubusercontent.com/bubuloMallone/NeuralNetworksLM/refs/heads/main/datasets/names.txt

words = open('names.txt', 'r').read().splitlines()

words[:10]

--2025-07-15 09:33:09--  https://raw.githubusercontent.com/bubuloMallone/NeuralNetworksLM/refs/heads/main/datasets/names.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 228145 (223K) [text/plain]
Saving to: ‘names.txt’


2025-07-15 09:33:09 (6.19 MB/s) - ‘names.txt’ saved [228145/228145]



['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [4]:
# character vocabulary and the two lookup tables: char -> index (stoi) and index -> char (itos)
chars = sorted(set(''.join(words)))
stoi = dict(zip(chars, range(1, len(chars) + 1)))  # the sorted characters take indices 1..len(chars)
stoi['.'] = 0  # index 0 is reserved for the special '.' token
itos = {idx: ch for ch, idx in stoi.items()}
vocab_size = len(itos)
print(itos)
print(vocab_size)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}
27


In [6]:
# build the dataset

# context length: how many preceding characters are used to predict the next one
block_size = 3

def build_dataset(words):
  """Turn a list of words into (context, next-character) training examples.

  Every word is terminated with '.', and a window of `block_size` character
  indices slides over it. The window starts as all zeros (the index of '.'),
  so the first example of each word predicts its first character from padding.

  Args:
    words: iterable of strings whose characters are all keys of `stoi`.

  Returns:
    X: int64 tensor of shape (num_samples, block_size) with the context indices.
    Y: int64 tensor of shape (num_samples,) with the index of the next character.
    num_samples: number of examples, i.e. X.shape[0].
  """
  X, Y = [], []
  for word in words:
    context = [0] * block_size
    for ch in word + '.':
      idx = stoi[ch]
      X.append(context)
      Y.append(idx)
      context = context[1:] + [idx]  # slide the window: drop the oldest index, append the newest
  # Explicit dtype and shape: an empty word list would otherwise give a float32
  # tensor of shape (0,) instead of an int64 tensor of shape (0, block_size).
  X = torch.tensor(X, dtype=torch.long).reshape(len(X), block_size)
  Y = torch.tensor(Y, dtype=torch.long)
  print('Data:', X.shape, X.dtype)
  print('Labels:', Y.shape, Y.dtype)
  num_samples = X.shape[0]

  return X, Y, num_samples

# Shuffle a copy rather than `words` itself: an in-place shuffle would reshuffle the
# already-shuffled list every time this cell is re-run and silently change the splits.
words_shuffled = list(words)
random.seed(42)
random.shuffle(words_shuffled)
n1 = int(0.8*len(words_shuffled))  # end of the training split (first 80% of the words)
n2 = int(0.9*len(words_shuffled))  # end of the validation split (next 10%); the rest is the test split

Xtr, Ytr, num_samples_tr = build_dataset(words_shuffled[:n1])
Xval, Yval, num_samples_val = build_dataset(words_shuffled[n1:n2])
Xte, Yte, num_samples_te = build_dataset(words_shuffled[n2:])

Data: torch.Size([182625, 3]) torch.int64
Labels: torch.Size([182625]) torch.int64
Data: torch.Size([22655, 3]) torch.int64
Labels: torch.Size([22655]) torch.int64
Data: torch.Size([22866, 3]) torch.int64
Labels: torch.Size([22866]) torch.int64


First, let us define a utility function that compares the manually computed gradients with the gradients computed by PyTorch.

In [7]:
# helper that reports how closely a hand-derived gradient matches the one autograd produced
def cmp(s, dt, t):
  """Print a one-line comparison between `dt` and `t.grad`.

  Args:
    s: label printed at the start of the line.
    dt: manually computed gradient tensor.
    t: tensor whose `.grad` attribute holds the reference gradient.

  The line reports whether the two match element for element, whether they
  match within `torch.allclose` tolerances, and the largest absolute difference.
  """
  reference = t.grad
  is_exact = torch.all(dt == reference).item()
  is_close = torch.allclose(dt, reference)
  largest_gap = torch.max(torch.abs(dt - reference)).item()
  print(f'{s:15s} | exact: {str(is_exact):5s} | approximate: {str(is_close):5s} | maxdiff: {largest_gap}')


Now let us initialize the neural network (MLP) as usual.

In [8]:
# MLP architecture
# One seeded generator feeds every random draw below, so a fresh kernel always
# produces the same initial parameters.
g = torch.Generator().manual_seed(2147483647)

emb_dim = 10  # the dimensionality of the character embedding vectors
hidden_dim = 200  # the number of neurons in the hidden layer of the MLP

# embedding
C = torch.randn((vocab_size, emb_dim), generator=g)
# Layer 1
# Kaiming init for tanh: gain (5/3) divided by sqrt(fan_in), with fan_in = block_size * emb_dim
W1 = torch.randn((block_size * emb_dim, hidden_dim), generator=g) * (5/3)/((block_size * emb_dim)**0.5)
b1 = torch.randn(hidden_dim, generator=g) * 0.1  # keep b1 just for fun (check the grads) even though it is actually useless
# Layer 2
W2 = torch.randn((hidden_dim, vocab_size), generator=g) * 0.1  # to keep low initial logits
b2 = torch.randn(vocab_size, generator=g) * 0.1   # to keep low initial logits
# BatchNorm parameters
bn_gain = torch.randn((1, hidden_dim), generator=g) * 0.1 + 1.0
bn_bias = torch.randn((1, hidden_dim), generator=g) * 0.1

# Note: many of these parameters are initialized in non-standard ways because the correct
# initializations sometimes might mask some incorrect implementations of the backward pass


parameters = [C, W1, b1, W2, b2, bn_gain, bn_bias]

# require gradients
for p in parameters:
  p.requires_grad = True

tot_parameters = sum(p.nelement() for p in parameters)
print(f'Total number of parameters: {tot_parameters}')


Total number of parameters: 12297


In [9]:
batch_size = 32
# sample `batch_size` random row indices into the training set using the seeded generator
idxs = torch.randint(low=0, high=Xtr.size(0), size=(batch_size,), generator=g)  # (batch_size,)
# gather the matching contexts and labels
Xb = Xtr[idxs]  # (batch_size, block_size)
Yb = Ytr[idxs]  # (batch_size,)

Implement the forward pass in small steps, so that it is possible to backpropagate through every step.

In [None]:
# Forward pass on the minibatch (Xb, Yb), one small step at a time.
# This cell is a single forward/backward pass used for gradient checking, so it
# performs no parameter update and keeps no running batch-norm statistics.
emb = C[Xb]   # (batch_size, block_size, emb_dim)
emb_cat = emb.view(emb.shape[0], -1)  # (batch_size, block_size * emb_dim)
# linear layer
h_prebn = emb_cat @ W1 + b1   # (batch_size, hidden_dim)
# batch norm layer
bn_mean_i = h_prebn.mean(0, keepdim=True)  # (1, hidden_dim)
bn_std_i = h_prebn.std(0, keepdim=True)    # (1, hidden_dim)
# normalize the batch to unit gaussian, then scale/offset it with the learned bn_gain/bn_bias
# (a small eps could be added to the std to guard against division by zero)
h_preact = bn_gain * ((h_prebn - bn_mean_i) / bn_std_i) + bn_bias   # (batch_size, hidden_dim)
# non-linear activation
h = torch.tanh(h_preact)   # (batch_size, hidden_dim)
# output layer
logits = h @ W2 + b2    # (batch_size, vocab_size)
loss = F.cross_entropy(logits, Yb)

# Backward pass
for p in parameters:
  p.grad = None  # drop gradients left over from a previous run of this cell
# Intermediate (non-leaf) tensors only keep their .grad when retain_grad() is called
# before backward(); without it they cannot be checked against manual gradients.
for t in [emb, emb_cat, h_prebn, bn_mean_i, bn_std_i, h_preact, h, logits]:
  t.retain_grad()
loss.backward()

loss.item()
