# PyTorch Internals
http://blog.ezyang.com/2019/05/pytorch-internals/

# Exercises

In [5]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

# read in all the words; the context manager closes the file handle as soon as
# its contents have been read, instead of leaving it open until garbage collection
with open('../names.txt', 'r') as names_file:
    words = names_file.read().splitlines()
words = words[:100]  # restrict the dataset to the first 100 lines of the file
words[:8]

# vocabulary: every distinct character that occurs in the words, in sorted order
chars = sorted(set("".join(words)))

# character -> integer: characters are numbered from 1 upward; '.' is assigned index 0
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['.'] = 0

# integer -> character: the inverse lookup of stoi
itos = {idx: ch for ch, idx in stoi.items()}

# dataset hyperparameter
block_size = 3  # context length: how many characters do we take to predict the next one?

def build_dataset(words):
    """Convert words into (context, next-character) index pairs.

    Every word is extended with a trailing '.' and scanned character by
    character. For each character, the current context (a list of the
    `block_size` most recent indices, starting as all zeros) is recorded as an
    input row and the character's `stoi` index as its target.

    Args:
        words: iterable of strings whose characters (plus '.') are keys of `stoi`.

    Returns:
        (X, Y): tensors built from the recorded contexts and targets. Their
        shapes are also printed.
    """
    contexts, targets = [], []
    for word in words:
        window = [0] * block_size
        for target in (stoi[ch] for ch in word + '.'):
            contexts.append(window)
            targets.append(target)
            # slide the window: drop the oldest index, append the newest
            window = window[1:] + [target]

    X = torch.tensor(contexts)
    Y = torch.tensor(targets)
    print(X.shape, Y.shape)
    return X, Y

import random

# shuffle the words in place with a fixed seed so the split is repeatable
random.seed(42)
random.shuffle(words)

# split boundaries: first 80% -> train, next 10% -> dev, remainder -> test
n1, n2 = (int(frac * len(words)) for frac in (0.8, 0.9))

Xtr, Ytr = build_dataset(words[:n1])      # training split
Xdev, Ydev = build_dataset(words[n1:n2])  # dev / validation split
Xte, Yte = build_dataset(words[n2:])      # test split

torch.Size([546, 3]) torch.Size([546])
torch.Size([73, 3]) torch.Size([73])
torch.Size([65, 3]) torch.Size([65])


In [2]:
# model sizes and RNG seed
seed = 2147483647
inp, emb_sz, hidden, out = 27, 2, 300, 27

g = torch.Generator().manual_seed(seed)

# All tensors are drawn from the same generator, so the order of these
# statements determines the values each one receives.
C = torch.randn(inp, emb_sz, generator=g)                   # lookup table indexed by character index
W1 = torch.randn(emb_sz * block_size, hidden, generator=g)  # first layer weights
b1 = torch.randn(hidden, generator=g)                       # first layer bias
W2 = torch.randn(hidden, out, generator=g)                  # second layer weights
b2 = torch.randn(out, generator=g)                          # second layer bias

parameters = [C, W1, b1, W2, b2]
for p in parameters:
    p.requires_grad_(True)

# total number of scalar parameters
sum(p.numel() for p in parameters)

10281

E01: Tune the hyperparameters of the training to beat my best validation loss of 2.2

In [3]:
def forward(X, Y, epochs=1, bs=32, lr=0.1):
    """Run `epochs` minibatch gradient-descent steps on the global parameters.

    Each step draws `bs` row indices (independently, using the global generator
    `g`), prints the cross-entropy loss on that minibatch, and updates every
    tensor in the global `parameters` list in place.

    Args:
        X: tensor of input contexts; rows are looked up in `C`.
        Y: tensor of target indices, aligned with the rows of X.
        epochs: number of update steps to run (one minibatch per step).
        bs: minibatch size.
        lr: learning rate applied to each gradient.

    Returns:
        None. The loss of every step is printed.
    """
    n_examples = X.shape[0]
    flat_dim = emb_sz * block_size
    for _ in range(epochs):
        batch = torch.randint(0, n_examples, (bs,), generator=g)
        xb, yb = X[batch], Y[batch]

        # forward pass
        emb = C[xb]
        act = torch.tanh(emb.view(-1, flat_dim) @ W1 + b1)
        logits = act @ W2 + b2
        loss = F.cross_entropy(logits, yb)
        print(loss.item())

        # backward pass: clear stale gradients, then backpropagate
        for p in parameters:
            p.grad = None
        loss.backward()

        # gradient-descent update
        for p in parameters:
            p.data -= lr * p.grad

forward(Xtr, Ytr, epochs=10)

25.907936096191406


E02: I was not careful with the initialization of the network in this video. (1) What is the loss you'd get if the predicted probabilities at initialization were perfectly uniform? What loss do we achieve? (2) Can you tune the initialization to get a starting loss that is much more similar to (1)?

In [3]:
# Loss of a perfectly uniform prediction: every one of the `out` classes gets
# probability 1/out, so the expected loss is -log(1/out).
probs = torch.ones((Ytr.shape[0], out)) / out
# F.cross_entropy expects unnormalized logits and applies log-softmax itself;
# feeding it probabilities would normalize twice. For actual probabilities,
# take the log and use the negative log-likelihood loss directly.
loss = F.nll_loss(probs.log(), Ytr)
loss

tensor(3.2958)

In [4]:
# model sizes and RNG seed
seed = 2147483647
inp, emb_sz, hidden, out = 27, 2, 300, 27

g = torch.Generator().manual_seed(seed)

# allocate uninitialized storage; values are filled in by the loop below
C = torch.empty(inp, emb_sz)
W1 = torch.empty(emb_sz * block_size, hidden)
b1 = torch.empty(hidden)
W2 = torch.empty(hidden, out)
b2 = torch.empty(out)

parameters = [C, W1, b1, W2, b2]
for p in parameters:
    # fill from the uniform distribution (default bounds of uniform_) using the
    # seeded generator, then enable gradient tracking
    torch.nn.init.uniform_(p, generator=g)
    p.requires_grad_(True)

# total number of scalar parameters
sum(p.numel() for p in parameters)

10281

E03: Read the Bengio et al. 2003 paper (link above), implement and try any idea from the paper. Did it work?

In [None]:
# propagate for subset of chars

subset = ['a', 'e', 'i', 'o', 'u']

def forward(X, Y, epochs=1, bs=32, lr=0.1):
    """Run minibatch steps, updating parameters only for batches that hit `subset`.

    Each step draws `bs` row indices using the global generator `g` and prints
    the cross-entropy loss on that minibatch. The backward pass and the
    in-place update of the global `parameters` run only when at least one
    target in the minibatch is the index of a character listed in `subset`.

    Args:
        X: tensor of input contexts; rows are looked up in `C`.
        Y: tensor of integer target indices, aligned with the rows of X
           (expected to use the `stoi` encoding, as produced by build_dataset).
        epochs: number of steps to run (one minibatch per step).
        bs: minibatch size.
        lr: learning rate applied to each gradient.

    Returns:
        None. Losses and "Subset found!" markers are printed.
    """
    # Targets are integer indices, so translate the subset characters into
    # indices once. Comparing tensor elements against the raw strings never
    # matches (tensors hash by identity), which left the update branch dead.
    # Characters missing from the vocabulary are skipped.
    subset_ix = {stoi[ch] for ch in subset if ch in stoi}

    for _ in range(epochs):
        ix = torch.randint(0, X.shape[0], (bs,), generator=g)

        emb = C[X[ix]]
        h = torch.tanh(emb.view(-1, emb_sz * block_size) @ W1 + b1)
        logits = h @ W2 + b2
        loss = F.cross_entropy(logits, Y[ix])
        print(loss.item())

        # .tolist() yields plain Python ints, which compare by value
        if subset_ix.intersection(Y[ix].tolist()):
            print("Subset found!")

            for p in parameters:
                p.grad = None
            loss.backward()

            for p in parameters:
                p.data += -lr * p.grad

forward(Xtr, Ytr, epochs=10)