In [2]:
# Importing Libraries
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

In [3]:
import json
import os

from google.colab import files
files.upload()

!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


In [4]:
# Downloading the Dataset
!kaggle datasets download meemr5/indian-names-boys-girls
!unzip indian-names-boys-girls.zip

Dataset URL: https://www.kaggle.com/datasets/meemr5/indian-names-boys-girls
License(s): CC0-1.0
Downloading indian-names-boys-girls.zip to /content
  0% 0.00/161k [00:00<?, ?B/s]
100% 161k/161k [00:00<00:00, 400MB/s]
Archive:  indian-names-boys-girls.zip
  inflating: Names.txt               


In [5]:
# Reading all the names
names = open('Names.txt', 'r').read().lower().splitlines()
print(len(names)) # Number of names in the Dataset
print(max(len(n) for n in names)) # Max number of character in a name
print(min(len(n) for n in names)) # Min Number of character in a name
print(names[:5]) # First 5 names in the Dataset

55691
25
2
['aaban', 'aabharan', 'aabhas', 'aabhat', 'aabheer']


In [6]:
# Removing Unneccessary Names from the Dataset
names = [name for name in names if '-' not in name and '.' not in name and ' ' not in name]
print(len(names))

54784


In [7]:
# Building the Vocabulary of Characters
chars = sorted(set(list(''.join(names)))) # Unique characters in the Dataset
print(len(chars)) # Length of those characters list
char2idx = {s : i for i,s in enumerate(chars)} # mapping character to a particular index value
char2idx['.'] = 0 # Unique Character which will behave as <START> and <END> TOKEN in the Dataset
idx2char = {i : s for s,i in char2idx.items()} # Mapping each index to a particular character
vocal_size = len(char2idx) # Vocabulary size i.e number of unique characters in the Dataset
print(idx2char)
print(vocal_size)

26
{0: '.', 1: 'b', 2: 'c', 3: 'd', 4: 'e', 5: 'f', 6: 'g', 7: 'h', 8: 'i', 9: 'j', 10: 'k', 11: 'l', 12: 'm', 13: 'n', 14: 'o', 15: 'p', 16: 'q', 17: 'r', 18: 's', 19: 't', 20: 'u', 21: 'v', 22: 'w', 23: 'x', 24: 'y', 25: 'z'}
27


In [70]:
# Building the Dataset
block_size = 3 # Context Length i.e the number of characters it will remember and use in predicting the next character

def dataset(names):

  X, y = [], [] # X stores values of the Context and y stores the character which should come next based on that context

  for name in names:
    context = [0] * block_size # Setting Initial Context as Zero Padded Vector
    for char in name + '.' :
      idx = char2idx[char]
      X.append(context) # Storing context to X
      y.append(idx) # Storing Char to y
      context = context[1:] + [idx] # New Context

  X = torch.tensor(X)
  y = torch.tensor(y)
  print(X.shape, y.shape)
  return X, y

import random
random.seed(42)
random.shuffle(names)
n1 = int(0.8 * len(names))
n2 = int(0.9 * len(names))

Xtrain, ytrain = dataset(names[ : n1]) # 80% training data
Xval, yval = dataset(names[n1 : n2]) # 10% validation data
Xtest, ytest = dataset(names[n2 : ]) # 10% testing data

torch.Size([395805, 3]) torch.Size([395805])
torch.Size([49490, 3]) torch.Size([49490])
torch.Size([49352, 3]) torch.Size([49352])


In [46]:
# Utility Function For Comparing Manual Gradients with PyTorch Gradients
def compare(s, dt, t):
  exact = torch.all(dt == t.grad).item() # Comparing if both are exactly equal or not
  approx = torch.allclose(dt, t.grad) # Comparing if both are approximatelty equal or not
  maxdiff = (dt - t.grad).abs().max().item() # finding the max difference in both
  print(f'{s:15s} | exact: {str(exact):5s} | approximate: {str(approx):5s} | maxdiff: {maxdiff}')

In [36]:
n_embd = 10 # Dimentionality of the character embedding Vector
n_hidden = 64 # Number of Neurons in the hidden layer of MLP

g = torch.Generator().manual_seed(42) # For Reproducibility
C = torch.randn((vocal_size, n_embd), generator=g)

# Layer 1
W1 = torch.randn((n_embd * block_size, n_hidden), generator=g) * (5/3) / ((n_embd * block_size) ** 0.5) # It is a weight initialization technique know as Gaussian Init
b1 = torch.randn(n_hidden, generator=g) * 0.1 # It is of no use because of Batch Normalization but is used just for fun

# Layer 2
W2 = torch.randn((n_hidden, vocal_size), generator=g) * 0.1
b2 = torch.randn(vocal_size, generator=g) * 0.1

# BatchNorm Parameters
bngain = torch.randn((1, n_hidden)) * 0.1 + 1.0
bnbias = torch.randn((1, n_hidden)) * 0.1

# Parameters
parameters = [C, W1, b1, W2, b2, bngain, bnbias]
print(sum(p.nelement() for p in parameters)) # Number of Parameters in Total
for p in parameters:
  p.requires_grad = True

4137


In [37]:
# Constructing a mini-batch
batch_size = n = 32
idx = torch.randint(0, Xtrain.shape[0], (batch_size,), generator=g)
Xbatch, ybatch = Xtrain[idx], ytrain[idx]

In [42]:
# Forward Pass

emb = C[Xbatch] # Embedded the characters into vectors
embcat = emb.view(emb.shape[0], -1) # Concatenate the vectors

# Linear Layer 1
hprebn = embcat @ W1 + b1 # Hidden Layer Before Batch Norm i.e pre-batchnorm

# BatchNorm Layer
bnmeani = 1/n * hprebn.sum(0, keepdim=True) # Xbar
bndiff = hprebn - bnmeani # X - Xbar
bndiff2 = bndiff ** 2 # (X -Xbar) ^ 2
bnvar = 1/(n-1) * bndiff2.sum(0, keepdim=True) # Bessel's Correction Dividing by (n-1) not n for better variance i.e 1/(n-1) * sigma((X - Xbar) ^ 2)
bnvar_inv = (bnvar + 1e-5) ** -0.5
bnraw = bndiff * bnvar_inv
hpreact = bngain * bnraw + bnbias

# adding Non-Linearity by using Activation Function
h = torch.tanh(hpreact)

# Linear Layer 2
logits = h @ W2 + b2

# Cross Entropy Loss (Same as F.cross_entropy(logits, ybatch))
logit_maxes = logits.max(1, keepdim=True).values
norm_logits = logits - logit_maxes # Substracting max logits value from all logits to provide stability
counts = norm_logits.exp()
counts_sum = counts.sum(1, keepdims=True)
counts_sum_inv = counts_sum ** -1
probs = counts * counts_sum_inv
logprobs = probs.log()
loss = -logprobs[range(n), ybatch].mean()

# PyTorch Backward Pass
for p in parameters:
  p.grad = None
for t in [logprobs, probs, counts, counts_sum, counts_sum_inv, # afaik there is no cleaner way
          norm_logits, logit_maxes, logits, h, hpreact, bnraw,
         bnvar_inv, bnvar, bndiff2, bndiff, hprebn, bnmeani,
         embcat, emb]:
  t.retain_grad()
loss.backward()
loss

tensor(3.3946, grad_fn=<NegBackward0>)

##BackPropogating Through the Whole Thing Manually

In [43]:
varlist = [
    ('logprobs', logprobs),
    ('probs', probs),
    ('counts_sum_inv', counts_sum_inv),
    ('counts_sum', counts_sum),
    ('counts', counts),
    ('norm_logits', norm_logits),
    ('logit_maxes', logit_maxes),
    ('logits', logits),
    ('h', h),
    ('W2', W2),
    ('b2', b2),
    ('hpreact', hpreact),
    ('bngain', bngain),
    ('bnbias', bnbias),
    ('bnraw', bnraw),
    ('bnvar_inv', bnvar_inv),
    ('bnvar', bnvar),
    ('bndiff2', bndiff2),
    ('bndiff', bndiff),
    ('bnmeani', bnmeani),
    ('hprebn', hprebn),
    ('embcat', embcat),
    ('W1', W1),
    ('b1', b1),
    ('emb', emb),
    ('C', C),
]

for name, tensor in varlist:
    print(f'{name}.shape = {tensor.shape}')

logprobs.shape = torch.Size([32, 27])
probs.shape = torch.Size([32, 27])
counts_sum_inv.shape = torch.Size([32, 1])
counts_sum.shape = torch.Size([32, 1])
counts.shape = torch.Size([32, 27])
norm_logits.shape = torch.Size([32, 27])
logit_maxes.shape = torch.Size([32, 1])
logits.shape = torch.Size([32, 27])
h.shape = torch.Size([32, 64])
W2.shape = torch.Size([64, 27])
b2.shape = torch.Size([27])
hpreact.shape = torch.Size([32, 64])
bngain.shape = torch.Size([1, 64])
bnbias.shape = torch.Size([1, 64])
bnraw.shape = torch.Size([32, 64])
bnvar_inv.shape = torch.Size([1, 64])
bnvar.shape = torch.Size([1, 64])
bndiff2.shape = torch.Size([32, 64])
bndiff.shape = torch.Size([32, 64])
bnmeani.shape = torch.Size([1, 64])
hprebn.shape = torch.Size([32, 64])
embcat.shape = torch.Size([32, 30])
W1.shape = torch.Size([30, 64])
b1.shape = torch.Size([64])
emb.shape = torch.Size([32, 3, 10])
C.shape = torch.Size([27, 10])


In [47]:
# Cross Entropy Loss

# 1. logit_maxes = logits.max(1, keepdims=True).values
# 2. norm_logits = logits - logit_maxes # Substracting max logits value from all logits to provide stability
# 3. counts = norm_logits.exp()
# 4. counts_sum = counts.sum(1, keepdims=True)
# 5. counts_sum_inv = counts_sum ** -1
# 6. probs = counts * counts_sum_inv
# 7. logprobs = probs.log()
# 8. loss = -logprobs[range(n), ybatch].mean()

dlogprobs = torch.zeros_like(logprobs) # Making matrics of same shape as logprobs
dlogprobs[range(n), ybatch] = -1.0/n # Diff logprobs in 8
dprobs = (1.0 / probs) * dlogprobs # diff probs in 7
dcounts_sum_inv = (counts * dprobs).sum(1, keepdim=True) # Diff counts_sum_inv in 6
dcounts = counts_sum_inv * dprobs # Diff counts in 6
dcounts_sum = (-counts_sum ** -2) * dcounts_sum_inv # Diff counts_sum in 5
dcounts += torch.ones_like(counts) * dcounts_sum # Diff counts in 4 and adding it with previous dcounts value from 6
dnorm_logits = counts * dcounts # Diff norm_logits in 3
dlogits = dnorm_logits.clone() # Diff logits in 2 it comes out to be same as dnorm_logits
dlogit_maxes = (-dnorm_logits).sum(1, keepdim=True) # Diff logit_maxes in 2
dlogits += F.one_hot(logits.max(1).indices, num_classes=logits.shape[1]) * dlogit_maxes # Diff logits in 1 and adding it with previous dlogits from 2

# Comparing Manual Gradients with PyTorch Gradients

compare('logprobs', dlogprobs, logprobs)
compare('probs', dprobs, probs)
compare('counts_sum_inv', dcounts_sum_inv, counts_sum_inv)
compare('counts_sum', dcounts_sum, counts_sum)
compare('counts', dcounts, counts)
compare('norm_logits', dnorm_logits, norm_logits)
compare('logit_maxes', dlogit_maxes, logit_maxes)
compare('logits', dlogits, logits)

logprobs        | exact: True  | approximate: True  | maxdiff: 0.0
probs           | exact: True  | approximate: True  | maxdiff: 0.0
counts_sum_inv  | exact: True  | approximate: True  | maxdiff: 0.0
counts_sum      | exact: True  | approximate: True  | maxdiff: 0.0
counts          | exact: True  | approximate: True  | maxdiff: 0.0
norm_logits     | exact: True  | approximate: True  | maxdiff: 0.0
logit_maxes     | exact: True  | approximate: True  | maxdiff: 0.0
logits          | exact: True  | approximate: True  | maxdiff: 0.0


In [48]:
# Linear Layer 2

# 1. logits = h @ W2 + b2

dh = dlogits @ W2.T # Diff h in 1 keep in mind the shapes
dW2 = h.T @ dlogits # Diff W2 in 1 keep in mind the shapes
db2 = dlogits.sum(0) # Diff b2 in 1 and keep in mind the shapes

# Comparing Manual Gradients with PyTorch Gradients

compare('h', dh, h)
compare('W2', dW2, W2)
compare('b2', db2, b2)

h               | exact: True  | approximate: True  | maxdiff: 0.0
W2              | exact: True  | approximate: True  | maxdiff: 0.0
b2              | exact: True  | approximate: True  | maxdiff: 0.0


In [51]:
# Differentiation of Tanh (Activation Function)

# 1. h = torch.tanh(hpreact)

dhpreact = (1.0 - h ** 2) * dh # Diff Tan(X) = 1 - Tan^2(X)

# Comparing Manual Gradients with PyTorch Gradients

compare('hpreact', dhpreact, hpreact) # Exactly They are not equal due to Floating Point Precision but this small difference can be neglected here

hpreact         | exact: False | approximate: True  | maxdiff: 4.656612873077393e-10


In [52]:
# BatchNorm Layer

# 1. bnmeani = 1/n * hprebn.sum(0, keepdim=True) # Xbar
# 2. bndiff = hprebn - bnmeani # X - Xbar
# 3. bndiff2 = bndiff ** 2 # (X -Xbar) ^ 2
# 4. bnvar = 1/(n-1) * bndiff2.sum(0, keepdim=True) # Bessel's Correction Dividing by (n-1) not n for better variance i.e 1/(n-1) * sigma((X - Xbar) ^ 2)
# 5. bnvar_inv = (bnvar + 1e-5) ** -0.5
# 6. bnraw = bndiff * bnvar_inv
# 7. hpreact = bngain * bnraw + bnbias

dbngain = (bnraw * dhpreact).sum(0, keepdim=True) # Diff bngain in 7 and keeping dim same
dbnraw = bngain * dhpreact # Diff bnraw in 7
dbnbias = dhpreact.sum(0, keepdim=True) # Diff bnbias in 7 and making same shape
dbndiff = bnvar_inv * dbnraw # Diff bndiff in 6
dbnvar_inv = (bndiff * dbnraw).sum(0, keepdim=True) # Diff bnvar_inv in 6 and making similar shape
dbnvar = (-0.5 * (bnvar + 1e-5) ** -1.5) * dbnvar_inv # Diff bnvar in 5
dbndiff2 = (1.0/(n-1)) * torch.ones_like(bndiff2) * dbnvar # Diff dbndiff2 in 4
dbndiff += (2 * bndiff) * dbndiff2 # Diff bndiff in 3 and adding it with previous dbndiff from 6
dhprebn = dbndiff.clone() # Diff hprebn in 2
dbnmeani = (-dbndiff).sum(0) # Diff bnmeani in 2
dhprebn += (1.0/n) * (torch.ones_like(hprebn) * dbnmeani) # Diff hprebn in 1 and adding it with previous dhprebn from 2

# Comparing Manual Gradients with PyTorch Gradients

compare('bngain', dbngain, bngain)
compare('bnbias', dbnbias, bnbias)
compare('bnraw', dbnraw, bnraw)
compare('bnvar_inv', dbnvar_inv, bnvar_inv)
compare('bnvar', dbnvar, bnvar)
compare('bndiff2', dbndiff2, bndiff2)
compare('bndiff', dbndiff, bndiff)
compare('bnmeani', dbnmeani, bnmeani)
compare('hprebn', dhprebn, hprebn)

bngain          | exact: False | approximate: True  | maxdiff: 1.862645149230957e-09
bnbias          | exact: False | approximate: True  | maxdiff: 3.725290298461914e-09
bnraw           | exact: False | approximate: True  | maxdiff: 4.656612873077393e-10
bnvar_inv       | exact: False | approximate: True  | maxdiff: 3.725290298461914e-09
bnvar           | exact: False | approximate: True  | maxdiff: 6.984919309616089e-10
bndiff2         | exact: False | approximate: True  | maxdiff: 2.1827872842550278e-11
bndiff          | exact: False | approximate: True  | maxdiff: 4.656612873077393e-10
bnmeani         | exact: False | approximate: True  | maxdiff: 7.450580596923828e-09
hprebn          | exact: False | approximate: True  | maxdiff: 6.984919309616089e-10


In [53]:
# Linear Layer 1

# 1. hprebn = embcat @ W1 + b1

dembcat = dhprebn @ W1.T # Diff embcat in 1
dW1 = embcat.T @ dhprebn # Diff W1 in 1
db1 = dhprebn.sum(0)

# Comparing Manual Gradients with PyTorch Gradients

compare('embcat', dembcat, embcat)
compare('W1', dW1, W1)
compare('b1', db1, b1)

embcat          | exact: False | approximate: True  | maxdiff: 1.862645149230957e-09
W1              | exact: False | approximate: True  | maxdiff: 1.0011717677116394e-08
b1              | exact: False | approximate: True  | maxdiff: 6.752088665962219e-09


In [56]:
# Embedding Layer

# 1. emb = C[Xbatch] # Embedded the characters into vectors
# 2. embcat = emb.view(emb.shape[0], -1) # Concatenate the vectors

demb = dembcat.view(emb.shape) # Diff emb in 2
dC = torch.zeros_like(C) # Diff C in 1
for k in range(Xbatch.shape[0]):
  for j in range(Xbatch.shape[1]):
    idx = Xbatch[k, j]
    dC[idx] += demb[k, j]

# Comparing Manual Gradients with PyTorch Gradients

compare('emb', demb, emb)
compare('C', dC, C)

emb             | exact: False | approximate: True  | maxdiff: 1.862645149230957e-09
C               | exact: False | approximate: True  | maxdiff: 1.4901161193847656e-08


## BackPropogating in 1 Go

In [60]:
# Cross Entropy

# forward pass

# before:
# logit_maxes = logits.max(1, keepdim=True).values
# norm_logits = logits - logit_maxes # subtract max for numerical stability
# counts = norm_logits.exp()
# counts_sum = counts.sum(1, keepdims=True)
# counts_sum_inv = counts_sum**-1 # if I use (1.0 / counts_sum) instead then I can't get backprop to be bit exact...
# probs = counts * counts_sum_inv
# logprobs = probs.log()
# loss = -logprobs[range(n), Yb].mean()

# Now:
new_loss = F.cross_entropy(logits, ybatch)
print(new_loss.item(), 'diff:', (new_loss - loss).item())

# Backward Pass

dlogits = None
dlogits = F.softmax(logits, 1)
dlogits[range(n), ybatch] -= 1
dlogits /= n

compare('logits', dlogits, logits) # Very Minimal Difference so we can neglect

3.3945655822753906 diff: 0.0
logits          | exact: False | approximate: True  | maxdiff: 5.587935447692871e-09


In [66]:
# Batch Norm

# forward pass

# before:
# bnmeani = 1/n*hprebn.sum(0, keepdim=True)
# bndiff = hprebn - bnmeani
# bndiff2 = bndiff**2
# bnvar = 1/(n-1)*(bndiff2).sum(0, keepdim=True) # note: Bessel's correction (dividing by n-1, not n)
# bnvar_inv = (bnvar + 1e-5)**-0.5
# bnraw = bndiff * bnvar_inv
# hpreact = bngain * bnraw + bnbias

# Now:
new_hpreact = bngain * (hprebn - hprebn.mean(0, keepdim=True)) / torch.sqrt(hprebn.var(0, keepdim=True, unbiased=True) + 1e-5) + bnbias # unbiased hyperparameter represents that we keep bessel's correction in mind and it will divide by (n-1) instead of n
print('max diff:', (new_hpreact - hpreact).abs().max())

# Backward Pass
dhprebn = bngain * bnvar_inv / n * (n * dhpreact - dhpreact.sum(0) - n/(n-1) * bnraw * (dhpreact * bnraw).sum(0))
compare('hprebn', dhprebn, hprebn)

max diff: tensor(4.7684e-07, grad_fn=<MaxBackward1>)
hprebn          | exact: False | approximate: True  | maxdiff: 9.313225746154785e-10


## Training MLP From Scratch

In [99]:
n_embd = 10 # Dimentionality of the character embedding Vector
n_hidden = 64 # Number of Neurons in the hidden layer of MLP

g = torch.Generator().manual_seed(42) # For Reproducibility
C = torch.randn((vocal_size, n_embd), generator=g)

# Layer 1
W1 = torch.randn((n_embd * block_size, n_hidden), generator=g) * (5/3) / ((n_embd * block_size) ** 0.5) # It is a weight initialization technique know as Gaussian Init
b1 = torch.randn(n_hidden, generator=g) * 0.1 # It is of no use because of Batch Normalization but is used just for fun

# Layer 2
W2 = torch.randn((n_hidden, vocal_size), generator=g) * 0.1
b2 = torch.randn(vocal_size, generator=g) * 0.1

# BatchNorm Parameters
bngain = torch.randn((1, n_hidden)) * 0.1 + 1.0
bnbias = torch.randn((1, n_hidden)) * 0.1

# Parameters
parameters = [C, W1, b1, W2, b2, bngain, bnbias]
print(sum(p.nelement() for p in parameters)) # Number of Parameters in Total
for p in parameters:
  p.requires_grad = True

# Optimization
max_steps = 60000
batch_size = 512
n = batch_size
lossi = []

with torch.no_grad():

  # Starting Optimization
  for i in range(max_steps):

    # Constructing a mini-batch
    batch_size = n = 32
    idx = torch.randint(0, Xtrain.shape[0], (batch_size,), generator=g)
    Xbatch, ybatch = Xtrain[idx], ytrain[idx]

    # Forward Pass

    emb = C[Xbatch] # Embedded the characters into vectors
    embcat = emb.view(emb.shape[0], -1) # Concatenate the vectors

    # Linear Layer 1
    hprebn = embcat @ W1 + b1 # Hidden Layer Before Batch Norm i.e pre-batchnorm

    # BatchNorm Layer
    bnmeani = 1/n * hprebn.sum(0, keepdim=True) # Xbar
    bndiff = hprebn - bnmeani # X - Xbar
    bndiff2 = bndiff ** 2 # (X -Xbar) ^ 2
    bnvar = 1/(n-1) * bndiff2.sum(0, keepdim=True) # Bessel's Correction Dividing by (n-1) not n for better variance i.e 1/(n-1) * sigma((X - Xbar) ^ 2)
    bnvar_inv = (bnvar + 1e-5) ** -0.5
    bnraw = bndiff * bnvar_inv
    hpreact = bngain * bnraw + bnbias

    # adding Non-Linearity by using Activation Function
    h = torch.tanh(hpreact)

    # Linear Layer 2
    logits = h @ W2 + b2

    # Cross Entropy Loss (Same as F.cross_entropy(logits, ybatch))
    logit_maxes = logits.max(1, keepdim=True).values
    norm_logits = logits - logit_maxes # Substracting max logits value from all logits to provide stability
    counts = norm_logits.exp()
    counts_sum = counts.sum(1, keepdims=True)
    counts_sum_inv = counts_sum ** -1
    probs = counts * counts_sum_inv
    logprobs = probs.log()
    loss = -logprobs[range(n), ybatch].mean()

    # PyTorch Backward Pass
    for p in parameters:
      p.grad = None
      # loss.backward()

    # Manual BackProp

    # Cross Entropy Loss
    dlogits = F.softmax(logits, 1)
    dlogits[range(n), ybatch] -= 1
    dlogits /= n

    # Linear Layer 2
    dh = dlogits @ W2.T
    dW2 = h.T @ dlogits
    db2 = dlogits.sum(0)

    # Activation Function Tanh
    dhpreact = (1.0 - h ** 2) * dh

    # BatchNorm BackProp
    dbngain = (bnraw * dhpreact).sum(0, keepdim=True)
    dbnbias = dhpreact.sum(0, keepdim=True)
    dhprebn = bngain * bnvar_inv / n * (n * dhpreact - dhpreact.sum(0) - n/(n-1) * bnraw * (dhpreact * bnraw).sum(0))

    # Linear Layer 1
    dembcat = dhprebn @ W1.T
    dW1 = embcat.T @ dhprebn
    db1 = dhprebn.sum(0)

    # Embedding Layer
    demb = dembcat.view(emb.shape)
    dC = torch.zeros_like(C)
    for k in range(Xbatch.shape[0]):
      for j in range(Xbatch.shape[1]):
        idx = Xbatch[k, j]
        dC[idx] += demb[k, j]

    grads = [dC, dW1, db1, dW2, db2, dbngain, dbnbias]

    # Update
    lr = 0.5 if i<20000 else 0.01
    for p, grad in zip(parameters, grads):
      # p.data += -lr * p.grad  # Old way used in (loss.backward())
      p.data += -lr * grad # New Way used in manual Backprop

    # track stats
    if i % 10000 == 0: # print every once in a while
      print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
    lossi.append(loss.log10().item())

4137
      0/  60000: 3.4107
  10000/  60000: 1.8082
  20000/  60000: 2.1200
  30000/  60000: 1.7357
  40000/  60000: 2.2835
  50000/  60000: 1.7885


In [100]:
# calibrate the batch norm at the end of training

with torch.no_grad():
  # pass the training set through
  emb = C[Xtrain]
  embcat = emb.view(emb.shape[0], -1)
  hpreact = embcat @ W1 + b1
  # measure the mean/std over the entire training set
  bnmean = hpreact.mean(0, keepdim=True)
  bnvar = hpreact.var(0, keepdim=True, unbiased=True)

In [101]:
# evaluate train and val loss

@torch.no_grad() # this decorator disables gradient tracking
def split_loss(split):
  x,y = {
    'train': (Xtrain, ytrain),
    'val': (Xval, yval),
    'test': (Xtest, ytest),
  }[split]
  emb = C[x] # (N, block_size, n_embd)
  embcat = emb.view(emb.shape[0], -1) # concat into (N, block_size * n_embd)
  hpreact = embcat @ W1 + b1
  hpreact = bngain * (hpreact - bnmean) * (bnvar + 1e-5)**-0.5 + bnbias
  h = torch.tanh(hpreact) # (N, n_hidden)
  logits = h @ W2 + b2 # (N, vocab_size)
  loss = F.cross_entropy(logits, y)
  print(split, loss.item())

split_loss('train')
split_loss('val')

train 1.9334039688110352
val 1.9393078088760376


In [102]:
# Generating Some Names
def generate_names(number):

  out =[]
  for _ in range(number):

    context = [0] * block_size
    name = ''
    while True:
      emb = C[torch.tensor([context])] # (1,block_size,d)
      embcat = emb.view(emb.shape[0], -1) # concat into (N, block_size * n_embd)
      hpreact = embcat @ W1 + b1
      hpreact = bngain * (hpreact - bnmean) * (bnvar + 1e-5)**-0.5 + bnbias
      h = torch.tanh(hpreact) # (N, n_hidden)
      logits = h @ W2 + b2 # (N, vocab_size)

      probs = F.softmax(logits, dim = 1)
      idx = torch.multinomial(probs, num_samples=1).item()
      context = context[1:] + [idx]
      if idx == 0:
        break
      name += (idx2char[idx])
    out.append(name)
  return out


In [113]:
generate_names(15)

['div',
 'dh',
 'shinesh',
 'shgino',
 'n',
 's',
 'el',
 'v',
 'ni',
 'muth',
 'enth',
 's',
 's',
 'koh',
 'il']