<a href="https://colab.research.google.com/github/Yash-invic/Architecting-LLMs-WiDS/blob/main/week3/week3_exercises.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Week 3

# Setup

In [None]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline
# Load the corpus (one name per line). The context manager guarantees the
# file handle is closed as soon as the contents have been read.
with open('names.txt', 'r') as f:
    words = f.read().splitlines()

# Character vocabulary: every distinct character in the corpus, sorted, gets
# an index starting at 1; index 0 is reserved for the '.' boundary token.
chars = sorted(set(''.join(words)))
stoi = {s: i + 1 for i, s in enumerate(chars)}
stoi['.'] = 0
# Reverse mapping (index -> character), used to decode model outputs.
itos = {i: s for s, i in stoi.items()}
vocab_size = len(itos)
def build_dataset(words, block_size):
    """Turn a list of words into (context, next-character) training tensors.

    Each word is extended with a trailing '.' and scanned left to right. For
    every character, the indices of the ``block_size`` preceding characters
    form one input row and the character's own index is the target. Contexts
    at the start of a word are left-padded with 0, the index the file assigns
    to '.'.

    Args:
        words: iterable of strings whose characters are all keys of ``stoi``.
        block_size: number of context characters per example.

    Returns:
        A tuple ``(X, Y)`` of tensors: ``X`` holds one context per row and
        ``Y`` holds the matching target index.
    """
    contexts = []
    targets = []
    for word in words:
        # Every word starts from an all-padding window.
        window = [0] * block_size
        for token in (stoi[ch] for ch in word + '.'):
            contexts.append(window)
            targets.append(token)
            # Slide the window: drop the oldest index, append the newest.
            window = [*window[1:], token]
    return torch.tensor(contexts), torch.tensor(targets)
import random

# Shuffle the words reproducibly, then cut them into 80% train, 10% dev and
# 10% test before converting each part into tensors.
random.seed(42)
random.shuffle(words)
block_size = 3  # number of preceding characters fed to the model
n1, n2 = (int(fraction * len(words)) for fraction in (0.8, 0.9))
(Xtr, Ytr), (Xdev, Ydev), (Xte, Yte) = (
    build_dataset(subset, block_size)
    for subset in (words[:n1], words[n1:n2], words[n2:])
)

# Part 1 E01

In [None]:
# Results reported for this configuration:
#   final training loss = 2.0665, dev loss = 2.1050
#   minimum loss obtained during the 200,000 steps = 1.8347
#   (presumably the smallest per-step mini-batch loss in lossi -- TODO confirm)
#
# Model: character embeddings -> linear -> batch norm -> tanh -> linear,
# trained with cross-entropy and plain mini-batch SGD.
embd = 10  # embedding dimensions per character (larger embedding)
hidden = 200 # number of hidden neurons (larger hidden layer)
maxstepss = 200000 # number of SGD steps (longer training for better results)
batch_size = 32
lossi = []
# All random draws below come from this one generator, so the order of the
# following statements determines the initial values.
g = torch.Generator().manual_seed(2147483647)
# Embedding table: one embd-dimensional row per vocabulary entry.
C = torch.randn((vocab_size, embd), generator=g)
# Scaled by (5/3) / sqrt(fan_in); 5/3 is the gain PyTorch recommends for tanh.
W1 = torch.randn((embd * block_size, hidden), generator=g) * (5/3)/((embd * block_size)**0.5)
# NOTE: b1 is added before batch norm, whose mean subtraction removes any
# per-column constant, so b1 has no effect on the normalized activations.
b1 = torch.randn(hidden, generator=g) * 0.01
# Small output weights and a zero output bias keep the initial logits near
# zero, so the first loss is close to that of a uniform prediction.
W2 = torch.randn((hidden, vocab_size), generator=g) * 0.01
b2 = torch.randn(vocab_size, generator=g) * 0
# Learnable batch-norm scale and shift.
bngain = torch.ones((1, hidden))
bnbias = torch.zeros((1, hidden))
# Running batch-norm statistics; updated without gradients during training
# and read by split_loss at evaluation time.
bnmean_running = torch.zeros((1, hidden))
bnstd_running = torch.ones((1, hidden))
parameters = [C, W1, b1, W2, b2, bngain, bnbias]
print(f"Number of parameters: {sum(p.nelement() for p in parameters)}")
for p in parameters:
    p.requires_grad = True
# training
for i in range(maxstepss):
    # forward pass
    # sample a random mini-batch of row indices into the training set
    ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
    Xb, Yb = Xtr[ix], Ytr[ix]
    emb = C[Xb]
    # concatenate the block_size embeddings of each example into one row
    embcat = emb.view(emb.shape[0], -1)
    hpreact = embcat @ W1 + b1
    # batch norm: normalize each hidden unit with this mini-batch's statistics
    bnmeani = hpreact.mean(0, keepdim=True)
    bnstdi = hpreact.std(0, keepdim=True)
    hpreact = bngain * (hpreact - bnmeani) / bnstdi + bnbias
    with torch.no_grad():
        # exponential moving average of the batch statistics (weight 0.001)
        bnmean_running = 0.999 * bnmean_running + 0.001 * bnmeani
        bnstd_running = 0.999 * bnstd_running + 0.001 * bnstdi
    h = torch.tanh(hpreact)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Yb)
    # backward pass
    # clear the gradients of the previous step before backpropagating
    for p in parameters:
        p.grad = None
    loss.backward()
    # update
    # learning rate: 0.1 for the first 100k steps, then 0.01
    lr = 0.1 if i < 100000 else 0.01
    for p in parameters:
        p.data += -lr * p.grad
    # progress report every 10,000 steps (loss of the current mini-batch)
    if i % 10000 == 0:
        print(f'{i:7d}/{maxstepss:7d}: {loss.item():.4f}')
    lossi.append(loss.item())
@torch.no_grad()
def split_loss(split):
    """Print the cross-entropy loss of the current model on one data split.

    The forward pass mirrors training, except that batch norm uses the
    running mean/std instead of per-batch statistics. Gradients are disabled
    by the decorator.

    Args:
        split: one of 'train', 'dev' or 'test'; any other key raises KeyError.

    Returns:
        None. The split name and its loss are printed.
    """
    datasets = {'train': (Xtr, Ytr), 'dev': (Xdev, Ydev), 'test': (Xte, Yte)}
    inputs, targets = datasets[split]

    # Embed every context and flatten each example into a single row.
    flat_emb = C[inputs].view(inputs.shape[0], -1)
    pre_bn = flat_emb @ W1 + b1
    # Batch norm with the fixed running statistics.
    normalized = bngain * (pre_bn - bnmean_running) / bnstd_running + bnbias
    activations = torch.tanh(normalized)
    scores = activations @ W2 + b2
    print(split, F.cross_entropy(scores, targets).item())
# Report the loss on the training and dev splits (the test split is not
# evaluated here).
print("Final loss:")
for split in ('train', 'dev'):
    split_loss(split)

# Part 1 E02

In [None]:
# A model that assigns equal probability to every character has loss
# -log(1 / vocab_size), which is ~3.29 for a 27-character vocabulary.
# Scaling the output weights by 0.01 and zeroing the output bias keeps the
# initial logits close to zero, so the untrained model starts approximately
# at this loss.
# This avoids the steep initial drop in the loss curve and the early training
# steps that would otherwise be spent just shrinking over-confident logits.
# So yes: the initialization can be tuned so that the starting loss is much
# closer to the uniform loss.
#
# NOTE: this cell rebinds the module-level C, W1 and b1, so split_loss no
# longer evaluates the Part 1 E01 model once it has run.
g = torch.Generator().manual_seed(2147483647)
W2ti = torch.randn((hidden, vocab_size), generator=g) * 0.01
b2ti = torch.randn(vocab_size, generator=g) * 0
C = torch.randn((vocab_size, embd), generator=g)
W1 = torch.randn((embd * block_size, hidden), generator=g)
b1 = torch.randn(hidden, generator=g)
# One forward pass on a random mini-batch of 32 training examples.
ix = torch.randint(0, Xtr.shape[0], (32,), generator=g)
emb = C[Xtr[ix]]
h = torch.tanh(emb.view(-1, embd*block_size) @ W1 + b1)
logits = h @ W2ti + b2ti
loss = F.cross_entropy(logits, Ytr[ix])
print(f"Loss with tiny weights: {loss.item():.4f}")
# Derive the reference value from the actual vocabulary size rather than a
# hard-coded 27, so it stays correct if the corpus changes.
print(f"Theoretical uniform loss: {-torch.log(torch.tensor(1.0 / vocab_size)).item():.4f}")

# Part 1 E03

In [None]:
# Implementing the direct (skip) connections described in the paper
# (presumably Bengio et al. 2003 -- TODO confirm the reference).
# They let the input embeddings bypass the hidden layer, so the model can
# easily learn linear relationships between the context and the output.
embd = 10
hidden = 200
maxstepss = 20000
# All random draws below come from this one generator, so the order of the
# following statements determines the initial values.
g = torch.Generator().manual_seed(2147483647)
C = torch.randn((vocab_size, embd), generator=g)
W1 = torch.randn((embd * block_size, hidden), generator=g) * (5/3)/((embd * block_size)**0.5)
b1 = torch.randn(hidden, generator=g) * 0.01
W2 = torch.randn((hidden, vocab_size), generator=g) * 0.01
b2 = torch.randn(vocab_size, generator=g) * 0
# Direct connection: maps the concatenated embeddings straight to the logits.
W_skip = torch.randn((embd * block_size, vocab_size), generator=g) * 0.01
bngain = torch.ones((1, hidden))
bnbias = torch.zeros((1, hidden))
# Running batch-norm statistics, updated without gradients inside the loop.
bnmean_running = torch.zeros((1, hidden))
bnstd_running = torch.ones((1, hidden))
parameters = [C, W1, b1, W2, b2, W_skip, bngain, bnbias]
for p in parameters: p.requires_grad = True
lossi = []
for i in range(maxstepss):
    # forward pass on a random mini-batch of 32 training examples
    ix = torch.randint(0, Xtr.shape[0], (32,), generator=g)
    Xb, Yb = Xtr[ix], Ytr[ix]
    emb = C[Xb]
    embcat = emb.view(emb.shape[0], -1)
    hpreact = embcat @ W1 + b1
    # batch norm with this mini-batch's statistics
    bnmeani = hpreact.mean(0, keepdim=True)
    bnstdi = hpreact.std(0, keepdim=True)
    hpreact = bngain * (hpreact - bnmeani) / bnstdi + bnbias
    with torch.no_grad():
        bnmean_running = 0.999 * bnmean_running + 0.001 * bnmeani
        bnstd_running = 0.999 * bnstd_running + 0.001 * bnstdi
    h = torch.tanh(hpreact)
    # hidden-layer path plus the direct embedding-to-logits path
    logits = (h @ W2 + b2) + (embcat @ W_skip)
    loss = F.cross_entropy(logits, Yb)
    # backward pass: reset gradients, then backpropagate
    for p in parameters: p.grad = None
    loss.backward()
    # SGD update; learning rate 0.1 for the first 10k steps, then 0.01
    lr = 0.1 if i < 10000 else 0.01
    for p in parameters: p.data += -lr * p.grad
    if i % 1000 == 0: print(f'{i}: {loss.item():.4f}')
    lossi.append(loss.item())
# NOTE: this prints the loss of the last mini-batch only, not a loss computed
# over a whole data split.
print(f"Final Loss with skip connections: {loss.item():.4f}")

# Part 2 E01

In [None]:
# What happens:
# Initializing all weights and biases to zero causes a symmetry-breaking
# failure. Every hidden neuron computes the same value and receives the exact
# same gradient update, so the neurons can never learn distinct features and
# the hidden layer collapses into a single effective neuron.
# In this setup it is even worse: h = tanh(0) = 0 and W2 = 0, so the
# gradients of W1, b1 and W2 are all exactly zero and only b2 can change.
#
# NOTE: C, g, embd and hidden are reused from the previously executed cell.
W1 = torch.zeros((embd * block_size, hidden))
b1 = torch.zeros(hidden)
W2 = torch.zeros((hidden, vocab_size))
b2 = torch.zeros(vocab_size)
parameters = [W1, b1, W2, b2]
for p in parameters:
    p.requires_grad = True
# One forward/backward pass on a random mini-batch of 32 training examples.
ix = torch.randint(0, Xtr.shape[0], (32,), generator=g)
emb = C[Xtr[ix]]
h = torch.tanh(emb.view(-1, embd*block_size) @ W1 + b1)
logits = h @ W2 + b2
loss = F.cross_entropy(logits, Ytr[ix])
loss.backward()
print(f"Loss: {loss.item()}") # should be ~3.29, the uniform loss: all logits are zero
print(f"Output of hidden layer (h): \n{h[0, :10]}") # should be all 0s
print(f"Gradient of W1: \n{W1.grad[:5, :5]}") # all zeros: no gradient flows back through W2 = 0

# Part 2 E02

In [None]:
# Folding batch norm into the preceding linear layer.
# With fixed running statistics, the inference-time computation
#     bngain * ((x @ W + b) - mean) / std + bnbias
# can be rewritten as x @ W_folded + b_folded with
#     W_folded = bngain * W / std
#     b_folded = bnbias + bngain * (b - mean) / std
# After folding, the maximum difference between the two outputs is negligible
# (floating-point rounding only), which shows that inference can run without
# an explicit batch-norm layer.
embd = 10
hidden = 20
g = torch.Generator().manual_seed(2147483647)
C = torch.randn((vocab_size, embd), generator=g)
W = torch.randn((embd * block_size, hidden), generator=g)
b = torch.randn(hidden, generator=g)
bngain = torch.ones((1, hidden))
bnbias = torch.zeros((1, hidden))
bnmean_running = torch.zeros((1, hidden))
bnstd_running = torch.ones((1, hidden))
parameters = [C, W, b, bngain, bnbias]
for p in parameters:
    p.requires_grad = True

# Nothing is trained in this cell, so every computation runs without autograd.
with torch.no_grad():
    # Calibration: estimate the running mean/std from 100 random mini-batches.
    # Only the batch statistics are needed, so the normalized activations are
    # not computed here.
    for i in range(100):
        ix = torch.randint(0, Xtr.shape[0], (32,), generator=g)
        emb = C[Xtr[ix]]
        hpreact = emb.view(-1, embd*block_size) @ W + b
        bnmeani = hpreact.mean(0, keepdim=True)
        bnstdi = hpreact.std(0, keepdim=True)
        bnmean_running = 0.9 * bnmean_running + 0.1 * bnmeani
        bnstd_running = 0.9 * bnstd_running + 0.1 * bnstdi

    # Reference output: linear layer followed by explicit batch norm, on the
    # first five training examples.
    emb = C[Xtr[:5]]
    embcat = emb.view(-1, embd*block_size)
    hpreact = embcat @ W + b
    h_original = bngain * (hpreact - bnmean_running) / bnstd_running + bnbias

    # Fold the batch-norm scale and shift into the linear layer's parameters.
    W_folded = bngain * W / bnstd_running
    b_folded = bnbias - (bngain * bnmean_running / bnstd_running) + (bngain * b / bnstd_running)

    # Folded output: a single linear layer on the same inputs.
    h_folded = embcat @ W_folded + b_folded

print("Original output:")
print(h_original[0, :5])
print("\nFolded output:")
print(h_folded[0, :5])
diff = (h_original - h_folded).abs().max()
print(f"\nMax Difference: {diff.item()}")