1.Implement the network on the image and overfit the first batch (of 32 samples)

In [1]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import torch.nn.functional as F

In [2]:
words = open("names.txt", "r").read().splitlines()
chars = sorted(list(set("".join(words))))
stoi = {s:idx + 1 for idx, s in enumerate(chars)}
stoi["."] = 0
itos = {idx: s for s, idx in stoi.items()}
block_size = 3


In [4]:
def build_dataset(words):  
    X, Y = [], []
    for w in words:
        #print(w)
        context = [0] * block_size
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix]

    X = torch.tensor(X)
    Y = torch.tensor(Y)
    print(X.shape, Y.shape)
    return X, Y
x, y = build_dataset(words)


torch.Size([228146, 3]) torch.Size([228146])


In [5]:
m = len(x)

rand_idxs = torch.randperm(m)
x = x[rand_idxs]
y = y[rand_idxs]
n1 = int(0.8 * m)
n2 = int(0.9 * m)

x_train = x[:n1]
y_train = y[:n1]
x_val = x[n1:n2]
y_val = y[n1:n2]
x_test = x[n2:]
y_test = y[n2:]

In [6]:
c = torch.randn((27, 2))
c[x_train].shape
emb_train = c[x_train].view(-1, 6)
emb_train.shape
w1 = torch.randn((6, 100))
b1 = torch.randn((1, 100))
w1.shape
w2 = torch.randn((100, 27))
b2 = torch.randn((1, 27))
c = torch.randn((27, 2))
w1 = torch.randn((6, 100))
b1 = torch.randn((1, 100))
w2 = torch.randn((100, 27))
b2 = torch.randn((1, 27))
params = [c, w1, b1, w2, b2]


In [7]:
BATCH_SIZE = 32
x_batch = x[: BATCH_SIZE, :]
y_batch = y[: BATCH_SIZE]
x_batch.shape
y_batch.shape

torch.Size([32])

In [8]:
LEARN_RATE = 0.1
N_ITERS = 10000

g = torch.Generator().manual_seed(2147483647)
c = torch.randn((27, 2), generator=g)
w1 = torch.randn((6, 100), generator=g)
b1 = torch.randn(100, generator=g)
w2 = torch.randn((100, 27), generator=g)
b2 = torch.randn(27, generator=g)
params = [c, w1, b1, w2, b2]

for p in params:
    p.requires_grad = True

for k in range(N_ITERS):
    emb = c[x_batch]
    h = torch.tanh(emb.view(-1, 6) @ w1 + b1)
    logits = h @ w2 + b2
    loss = F.cross_entropy(logits, y_batch)


    for p in params:
        p.grad = None

    loss.backward()

    for p in params:
        p.data = p.data - LEARN_RATE * p.grad

print(f"loss: {loss.item()}")

loss: 0.10325682163238525


1.Tune the hyperparameters of the training to beat my best validation loss of 2.2

In [9]:
params = {
    "batch_size": [16, 32, 64, 128, 256],
    "learn_rate": [0.1, 0.05, 0.01, 0.001]
}

from sklearn.model_selection import ParameterGrid
param_grid = ParameterGrid(params)

for p in param_grid:
    print(p)

{'batch_size': 16, 'learn_rate': 0.1}
{'batch_size': 16, 'learn_rate': 0.05}
{'batch_size': 16, 'learn_rate': 0.01}
{'batch_size': 16, 'learn_rate': 0.001}
{'batch_size': 32, 'learn_rate': 0.1}
{'batch_size': 32, 'learn_rate': 0.05}
{'batch_size': 32, 'learn_rate': 0.01}
{'batch_size': 32, 'learn_rate': 0.001}
{'batch_size': 64, 'learn_rate': 0.1}
{'batch_size': 64, 'learn_rate': 0.05}
{'batch_size': 64, 'learn_rate': 0.01}
{'batch_size': 64, 'learn_rate': 0.001}
{'batch_size': 128, 'learn_rate': 0.1}
{'batch_size': 128, 'learn_rate': 0.05}
{'batch_size': 128, 'learn_rate': 0.01}
{'batch_size': 128, 'learn_rate': 0.001}
{'batch_size': 256, 'learn_rate': 0.1}
{'batch_size': 256, 'learn_rate': 0.05}
{'batch_size': 256, 'learn_rate': 0.01}
{'batch_size': 256, 'learn_rate': 0.001}


In [12]:
from sklearn.model_selection import ParameterGrid
from tqdm import tqdm

params = {
    "batch_size": [16, 32, 64, 128, 256],
    "learn_rate": [0.1, 0.05, 0.01],
    "num_iters": [50000]
}

param_grid = ParameterGrid(params)

for pset in tqdm(param_grid):
    g = torch.Generator().manual_seed(2147483647)
    c = torch.randn((27, 10), generator=g, requires_grad=True)
    w1 = torch.randn((30, 200), generator=g, requires_grad=True)
    b1 = torch.randn(200, generator=g)
    w2 = torch.randn((200, 27), generator=g)
    b2 = torch.randn(27, generator=g)
    params = [c, w1, b1, w2, b2]

    for p in params:
        p.requires_grad = True

    for k in range(pset["num_iters"]):
        rand_idxs = torch.randint(0, x_train.shape[0], (pset["batch_size"], ))
        x_batch = x_train[rand_idxs]
        y_batch = y_train[rand_idxs]

        emb = c[x_batch]
        h = torch.tanh(emb.view(-1, w1.shape[0]) @ w1 + b1)
        logits = h @ w2 + b2
        loss = F.cross_entropy(logits, y_batch)
        
        for p in params:
            p.grad = None
        loss.backward()

        lr = (pset["learn_rate"] / 10) if k > pset["num_iters"] / 2 else pset["learn_rate"] 
        for p in params:
            p.data = p.data - lr * p.grad

    with torch.no_grad():
        emb = c[x_val]
        h = torch.tanh(emb.view(-1, w1.shape[0]) @ w1 + b1)
        logits = h @ w2 + b2
        val_loss = F.cross_entropy(logits, y_val)
        print(f"pset: {pset}, val_loss: {val_loss.item()}")

  7%|▋         | 1/15 [00:14<03:20, 14.30s/it]

pset: {'batch_size': 16, 'learn_rate': 0.1, 'num_iters': 50000}, val_loss: 2.2741599082946777


 13%|█▎        | 2/15 [00:28<03:08, 14.48s/it]

pset: {'batch_size': 16, 'learn_rate': 0.05, 'num_iters': 50000}, val_loss: 2.308612108230591


 20%|██        | 3/15 [00:43<02:53, 14.43s/it]

pset: {'batch_size': 16, 'learn_rate': 0.01, 'num_iters': 50000}, val_loss: 2.514777183532715


 27%|██▋       | 4/15 [00:58<02:43, 14.87s/it]

pset: {'batch_size': 32, 'learn_rate': 0.1, 'num_iters': 50000}, val_loss: 2.246750593185425


 33%|███▎      | 5/15 [01:14<02:31, 15.14s/it]

pset: {'batch_size': 32, 'learn_rate': 0.05, 'num_iters': 50000}, val_loss: 2.2910361289978027


 40%|████      | 6/15 [01:30<02:17, 15.33s/it]

pset: {'batch_size': 32, 'learn_rate': 0.01, 'num_iters': 50000}, val_loss: 2.5218966007232666


 47%|████▋     | 7/15 [01:49<02:11, 16.49s/it]

pset: {'batch_size': 64, 'learn_rate': 0.1, 'num_iters': 50000}, val_loss: 2.2315168380737305


 53%|█████▎    | 8/15 [02:07<02:00, 17.28s/it]

pset: {'batch_size': 64, 'learn_rate': 0.05, 'num_iters': 50000}, val_loss: 2.28310227394104


 60%|██████    | 9/15 [02:26<01:45, 17.62s/it]

pset: {'batch_size': 64, 'learn_rate': 0.01, 'num_iters': 50000}, val_loss: 2.540093421936035


 67%|██████▋   | 10/15 [02:50<01:38, 19.70s/it]

pset: {'batch_size': 128, 'learn_rate': 0.1, 'num_iters': 50000}, val_loss: 2.2319602966308594


 73%|███████▎  | 11/15 [03:15<01:24, 21.13s/it]

pset: {'batch_size': 128, 'learn_rate': 0.05, 'num_iters': 50000}, val_loss: 2.274763584136963


 80%|████████  | 12/15 [03:40<01:07, 22.41s/it]

pset: {'batch_size': 128, 'learn_rate': 0.01, 'num_iters': 50000}, val_loss: 2.5039072036743164


 87%|████████▋ | 13/15 [04:17<00:53, 26.95s/it]

pset: {'batch_size': 256, 'learn_rate': 0.1, 'num_iters': 50000}, val_loss: 2.2291252613067627


 93%|█████████▎| 14/15 [04:55<00:30, 30.27s/it]

pset: {'batch_size': 256, 'learn_rate': 0.05, 'num_iters': 50000}, val_loss: 2.266962766647339


100%|██████████| 15/15 [05:31<00:00, 22.11s/it]

pset: {'batch_size': 256, 'learn_rate': 0.01, 'num_iters': 50000}, val_loss: 2.514359474182129





In [13]:
g = torch.Generator().manual_seed(2147483647)
c = torch.randn((27, 10), generator=g, requires_grad=True)
w1 = torch.randn((30, 200), generator=g, requires_grad=True)
b1 = torch.randn(200, generator=g)
w2 = torch.randn((200, 27), generator=g)
b2 = torch.randn(27, generator=g)
params = [c, w1, b1, w2, b2]
losses = []
LEARN_RATE = 0.1
BATCH_SIZE = 256
N_ITERS = 200000

In [14]:
for  p in params:
    p.requires_grad = True

for k in range(N_ITERS):
    rand_idxs = torch.randint(0, x_train.shape[0], (BATCH_SIZE, ))
    x_batch = x_train[rand_idxs]
    y_batch = y_train[rand_idxs]

    emb = c[x_batch]
    h = torch.tanh(emb.view(-1, w1.shape[0]) @ w1 + b1)
    logits = h @ w2 + b2
    loss = F.cross_entropy(logits, y_batch)
    losses.append(loss.item())
    
    for p in params:
        p.grad = None
    loss.backward()

    if k > N_ITERS / 2:
        lr = (LEARN_RATE / 10)
    elif k > N_ITERS / 4:
        lr = (LEARN_RATE / 100)
    else:
        lr = LEARN_RATE

    for p in params:
        p.data = p.data - lr * p.grad

with torch.no_grad():
    emb = c[x_val]
    h = torch.tanh(emb.view(-1, w1.shape[0]) @ w1 + b1)
    logits = h @ w2 + b2
    val_loss = F.cross_entropy(logits, y_val)
    print(f"val_loss: {val_loss.item()}")

val_loss: 2.1909658908843994


2. I was not careful with the intialization of the network in this video. (1) What is the loss you'd get if the predicted probabilities at initialization were perfectly uniform? What loss do we achieve? (2) Can you tune the initialization to get a starting loss that is much more similar to (1)?

In [16]:
g = torch.Generator().manual_seed(61616161)
c = torch.randn((27, 2), generator=g)
w1 = torch.randn((6, 100), generator=g)
b1 = torch.randn(100, generator=g)
w2 = torch.randn((100, 27), generator=g)
b2 = torch.randn(27, generator=g)

In [17]:
emb = c[x_train]
h = torch.tanh(emb.view(-1, w1.shape[0]) @ w1 + b1)
logits = h @ w2 + b2
print(torch.softmax(logits, dim=1))
loss = F.cross_entropy(logits, y_train)

print(f"initial loss: {loss.item()}")

tensor([[4.6983e-14, 1.1985e-04, 3.5999e-08,  ..., 1.9626e-08, 2.7613e-05,
         1.2537e-03],
        [6.0896e-11, 3.2000e-04, 1.4174e-09,  ..., 5.5973e-10, 1.0600e-03,
         1.4048e-01],
        [1.3202e-09, 2.3920e-09, 4.6479e-06,  ..., 3.2859e-11, 4.0674e-07,
         1.1041e-03],
        ...,
        [1.5881e-17, 1.8376e-06, 5.6858e-12,  ..., 3.4350e-13, 2.2004e-09,
         3.9131e-04],
        [2.1407e-09, 1.1824e-07, 6.2860e-11,  ..., 1.5720e-11, 4.5586e-13,
         5.6347e-07],
        [7.1390e-09, 3.1715e-07, 4.8401e-06,  ..., 1.8034e-06, 1.2815e-07,
         5.5459e-01]])
initial loss: 15.979963302612305


In [19]:
c = torch.full((27, 10), fill_value=1.0)
w1 = torch.full((30, 200), fill_value=0.02)
b1 = torch.full((1,200), fill_value=0.0)
w2 = torch.full((200, 27), fill_value=1.0)
b2 = torch.full((1,27), fill_value=0.0)
# Forward pass
emb = c[x_train]
h = torch.tanh(emb.view(-1, w1.shape[0]) @ w1 + b1)
logits = h @ w2 + b2
print(torch.softmax(logits, dim=1))
loss = F.cross_entropy(logits, y_train)

print(f"initial loss: {loss.item()}")

tensor([[0.0370, 0.0370, 0.0370,  ..., 0.0370, 0.0370, 0.0370],
        [0.0370, 0.0370, 0.0370,  ..., 0.0370, 0.0370, 0.0370],
        [0.0370, 0.0370, 0.0370,  ..., 0.0370, 0.0370, 0.0370],
        ...,
        [0.0370, 0.0370, 0.0370,  ..., 0.0370, 0.0370, 0.0370],
        [0.0370, 0.0370, 0.0370,  ..., 0.0370, 0.0370, 0.0370],
        [0.0370, 0.0370, 0.0370,  ..., 0.0370, 0.0370, 0.0370]])
initial loss: 3.29583740234375


3. Read the Bengio et al 2003 paper (link above), implement and try any idea from the paper. Did it work?

In [20]:
g = torch.Generator().manual_seed(6161616161)
c = torch.randn((27, 10), generator=g, requires_grad=True)
w1 = torch.randn((30, 200), generator=g, requires_grad=True)
b1 = torch.randn(200, generator=g, requires_grad=True)
w2 = torch.randn((200, 27), generator=g, requires_grad=True)
w0 = torch.randn((30, 27), generator=g, requires_grad=True)
b2 = torch.randn(27, generator=g, requires_grad=True)

params = [c, w1, b1, w2, b2, w0]

In [23]:
BATCH_SIZE = 32
x_batch = x[: BATCH_SIZE, :]
y_batch = y[: BATCH_SIZE]
x_batch.shape
y_batch.shape

torch.Size([32])

In [24]:
LEARN_RATE = 0.1
N_ITERS = 1000

g = torch.Generator().manual_seed(32109832)
c = torch.randn((27, 10), generator=g)
w1 = torch.randn((30, 200), generator=g)
b1 = torch.randn(200, generator=g)
w2 = torch.randn((200, 27), generator=g)
b2 = torch.randn(27, generator=g)
w0 = torch.randn((30, 27), generator=g)
params = [c, w1, b1, w2, b2, w0]
losses = []

for p in params:
    p.requires_grad = True

for k in range(N_ITERS):
    emb = c[x_batch]
    h = torch.tanh(emb.view(-1, 3 * c.shape[1]) @ w1 + b1)
    logits = h @ w2 + b2 + emb.view(-1, w1.shape[0]) @ w0
    loss = F.cross_entropy(logits, y_batch)
    losses.append(loss.item())
    for p in params:
        p.grad = None

    loss.backward()

    for p in params:
        p.data = p.data - LEARN_RATE * p.grad

print(f"loss: {loss.item()}")

loss: 0.10337723791599274


In [25]:
from sklearn.model_selection import ParameterGrid
from tqdm import tqdm

params = {
    "batch_size": [16, 32, 64, 128, 256],
    "learn_rate": [0.1, 0.05, 0.01],
    "num_iters": [50000]
}

param_grid = ParameterGrid(params)

for pset in tqdm(param_grid):
    g = torch.Generator().manual_seed(127207)
    c = torch.randn((27, 10), generator=g)
    w1 = torch.randn((30, 200), generator=g)
    b1 = torch.randn(200, generator=g)
    w2 = torch.randn((200, 27), generator=g)
    b2 = torch.randn(27, generator=g)
    w0 = torch.randn((30, 27), generator=g)
    params = [c, w1, b1, w2, b2, w0]

    for p in params:
        p.requires_grad = True

    for k in range(pset["num_iters"]):
        rand_idxs = torch.randint(0, x_train.shape[0], (pset["batch_size"], ))
        x_batch = x_train[rand_idxs]
        y_batch = y_train[rand_idxs]

        emb = c[x_batch]
        h = torch.tanh(emb.view(-1, 3 * c.shape[1]) @ w1 + b1)
        logits = h @ w2 + b2 + emb.view(-1, w1.shape[0]) @ w0
        loss = F.cross_entropy(logits, y_batch)

        for p in params:
            p.grad = None

        loss.backward()
        lr = (pset["learn_rate"] / 10) if k > pset["num_iters"] / 2 else pset["learn_rate"] 
        for p in params:
            p.data = p.data - lr * p.grad

    with torch.no_grad():
        emb = c[x_val]
        h = torch.tanh(emb.view(-1, w1.shape[0]) @ w1 + b1)
        logits = h @ w2 + b2 + emb.view(-1, w1.shape[0]) @ w0
        val_loss = F.cross_entropy(logits, y_val)
        print(f"pset: {pset}, val_loss: {val_loss.item()}")

  7%|▋         | 1/15 [00:16<03:49, 16.43s/it]

pset: {'batch_size': 16, 'learn_rate': 0.1, 'num_iters': 50000}, val_loss: 2.258974313735962


 13%|█▎        | 2/15 [00:32<03:33, 16.39s/it]

pset: {'batch_size': 16, 'learn_rate': 0.05, 'num_iters': 50000}, val_loss: 2.2936909198760986


 20%|██        | 3/15 [00:49<03:17, 16.45s/it]

pset: {'batch_size': 16, 'learn_rate': 0.01, 'num_iters': 50000}, val_loss: 2.4179043769836426


 27%|██▋       | 4/15 [01:06<03:06, 16.91s/it]

pset: {'batch_size': 32, 'learn_rate': 0.1, 'num_iters': 50000}, val_loss: 2.2432901859283447


 33%|███▎      | 5/15 [01:24<02:53, 17.31s/it]

pset: {'batch_size': 32, 'learn_rate': 0.05, 'num_iters': 50000}, val_loss: 2.286076068878174


 40%|████      | 6/15 [01:43<02:38, 17.65s/it]

pset: {'batch_size': 32, 'learn_rate': 0.01, 'num_iters': 50000}, val_loss: 2.452589750289917


 47%|████▋     | 7/15 [02:04<02:30, 18.83s/it]

pset: {'batch_size': 64, 'learn_rate': 0.1, 'num_iters': 50000}, val_loss: 2.24276065826416


 53%|█████▎    | 8/15 [02:25<02:16, 19.43s/it]

pset: {'batch_size': 64, 'learn_rate': 0.05, 'num_iters': 50000}, val_loss: 2.282175302505493


 60%|██████    | 9/15 [02:46<01:59, 19.85s/it]

pset: {'batch_size': 64, 'learn_rate': 0.01, 'num_iters': 50000}, val_loss: 2.4560420513153076


 67%|██████▋   | 10/15 [03:12<01:49, 21.99s/it]

pset: {'batch_size': 128, 'learn_rate': 0.1, 'num_iters': 50000}, val_loss: 2.236204147338867


 73%|███████▎  | 11/15 [03:39<01:34, 23.50s/it]

pset: {'batch_size': 128, 'learn_rate': 0.05, 'num_iters': 50000}, val_loss: 2.27168607711792


 80%|████████  | 12/15 [04:06<01:13, 24.50s/it]

pset: {'batch_size': 128, 'learn_rate': 0.01, 'num_iters': 50000}, val_loss: 2.4331679344177246


 87%|████████▋ | 13/15 [05:20<01:18, 39.42s/it]

pset: {'batch_size': 256, 'learn_rate': 0.1, 'num_iters': 50000}, val_loss: 2.2296738624572754


 93%|█████████▎| 14/15 [06:01<00:39, 39.88s/it]

pset: {'batch_size': 256, 'learn_rate': 0.05, 'num_iters': 50000}, val_loss: 2.2757456302642822


100%|██████████| 15/15 [06:43<00:00, 26.90s/it]

pset: {'batch_size': 256, 'learn_rate': 0.01, 'num_iters': 50000}, val_loss: 2.4330084323883057





In [26]:
from sklearn.model_selection import ParameterGrid
from tqdm import tqdm

params = {
    "batch_size": [16, 32, 64, 128, 256],
    "learn_rate": [0.1, 0.05, 0.01],
    "num_iters": [50000]
}

param_grid = ParameterGrid(params)
reg_strength = 0.01

for pset in tqdm(param_grid):
    g = torch.Generator().manual_seed(613726192)
    c = torch.randn((27, 10), generator=g)
    w1 = torch.randn((30, 200), generator=g)
    b1 = torch.randn(200, generator=g)
    w2 = torch.randn((200, 27), generator=g)
    b2 = torch.randn(27, generator=g)
    w0 = torch.randn((30, 27), generator=g)
    params = [c, w1, b1, w2, b2, w0]

    for p in params:
        p.requires_grad = True
    for k in range(pset["num_iters"]):
        rand_idxs = torch.randint(0, x_train.shape[0], (pset["batch_size"], ))
        x_batch = x_train[rand_idxs]
        y_batch = y_train[rand_idxs]

        emb = c[x_batch]
        h = torch.tanh(emb.view(-1, 3 * c.shape[1]) @ w1 + b1)
        logits = h @ w2 + b2 + emb.view(-1, w1.shape[0]) @ w0
        loss = F.cross_entropy(logits, y_batch) + reg_strength * (w0.pow(2).mean() + w1.pow(2).mean() + w2.pow(2).mean())

        for p in params:
            p.grad = None

        loss.backward()

        lr = (pset["learn_rate"] / 10) if k > pset["num_iters"] / 2 else pset["learn_rate"] 
        for p in params:
            p.data = p.data - lr * p.grad

    with torch.no_grad():
        emb = c[x_val]
        h = torch.tanh(emb.view(-1, w1.shape[0]) @ w1 + b1)
        logits = h @ w2 + b2 + emb.view(-1, w1.shape[0]) @ w0
        val_loss = F.cross_entropy(logits, y_val)
        print(f"pset: {pset}, val_loss: {val_loss.item()}")

  7%|▋         | 1/15 [00:23<05:30, 23.58s/it]

pset: {'batch_size': 16, 'learn_rate': 0.1, 'num_iters': 50000}, val_loss: 2.2648887634277344


 13%|█▎        | 2/15 [00:47<05:06, 23.59s/it]

pset: {'batch_size': 16, 'learn_rate': 0.05, 'num_iters': 50000}, val_loss: 2.2890281677246094


 20%|██        | 3/15 [01:11<04:44, 23.74s/it]

pset: {'batch_size': 16, 'learn_rate': 0.01, 'num_iters': 50000}, val_loss: 2.494035482406616


 27%|██▋       | 4/15 [01:35<04:25, 24.15s/it]

pset: {'batch_size': 32, 'learn_rate': 0.1, 'num_iters': 50000}, val_loss: 2.2565503120422363


 33%|███▎      | 5/15 [02:00<04:04, 24.45s/it]

pset: {'batch_size': 32, 'learn_rate': 0.05, 'num_iters': 50000}, val_loss: 2.2891697883605957


 40%|████      | 6/15 [02:25<03:42, 24.67s/it]

pset: {'batch_size': 32, 'learn_rate': 0.01, 'num_iters': 50000}, val_loss: 2.4573726654052734


 47%|████▋     | 7/15 [02:56<03:33, 26.64s/it]

pset: {'batch_size': 64, 'learn_rate': 0.1, 'num_iters': 50000}, val_loss: 2.239196538925171


 53%|█████▎    | 8/15 [03:26<03:13, 27.62s/it]

pset: {'batch_size': 64, 'learn_rate': 0.05, 'num_iters': 50000}, val_loss: 2.2776100635528564


 60%|██████    | 9/15 [03:55<02:48, 28.09s/it]

pset: {'batch_size': 64, 'learn_rate': 0.01, 'num_iters': 50000}, val_loss: 2.453270435333252


 67%|██████▋   | 10/15 [04:29<02:29, 29.89s/it]

pset: {'batch_size': 128, 'learn_rate': 0.1, 'num_iters': 50000}, val_loss: 2.2315444946289062


 73%|███████▎  | 11/15 [05:03<02:04, 31.15s/it]

pset: {'batch_size': 128, 'learn_rate': 0.05, 'num_iters': 50000}, val_loss: 2.2656126022338867


 80%|████████  | 12/15 [05:39<01:37, 32.50s/it]

pset: {'batch_size': 128, 'learn_rate': 0.01, 'num_iters': 50000}, val_loss: 2.448518991470337


 87%|████████▋ | 13/15 [06:28<01:15, 37.53s/it]

pset: {'batch_size': 256, 'learn_rate': 0.1, 'num_iters': 50000}, val_loss: 2.229033946990967


 93%|█████████▎| 14/15 [07:15<00:40, 40.64s/it]

pset: {'batch_size': 256, 'learn_rate': 0.05, 'num_iters': 50000}, val_loss: 2.2734968662261963


100%|██████████| 15/15 [08:09<00:00, 32.63s/it]

pset: {'batch_size': 256, 'learn_rate': 0.01, 'num_iters': 50000}, val_loss: 2.427614688873291





In [27]:
from sklearn.model_selection import ParameterGrid
from tqdm import tqdm

params = {
    "batch_size": [16, 32, 64, 128, 256],
    "learn_rate": [0.1, 0.05, 0.01],
    "num_iters": [50000]
}

param_grid = ParameterGrid(params)
reg_strength = 0.01
beta = 0.9 

for pset in tqdm(param_grid):
    g = torch.Generator().manual_seed(61616161616161611)
    c = torch.randn((27, 10), generator=g)
    w1 = torch.randn((30, 200), generator=g)
    b1 = torch.randn(200, generator=g)
    w2 = torch.randn((200, 27), generator=g)
    b2 = torch.randn(27, generator=g)
    w0 = torch.randn((30, 27), generator=g)
    params = [c, w1, b1, w2, b2, w0]
    param_grads = [0.0 for _ in range(len(params))]

    for p in params:
        p.requires_grad = True

    for k in range(pset["num_iters"]):
        rand_idxs = torch.randint(0, x_train.shape[0], (pset["batch_size"], ))
        x_batch = x_train[rand_idxs]
        y_batch = y_train[rand_idxs]

        emb = c[x_batch]
        h = torch.tanh(emb.view(-1, 3 * c.shape[1]) @ w1 + b1)
        logits = h @ w2 + b2 + emb.view(-1, w1.shape[0]) @ w0
        loss = F.cross_entropy(logits, y_batch) + reg_strength * (w0.pow(2).mean() + w1.pow(2).mean() + w2.pow(2).mean())

        for p in params:
            p.grad = None

        loss.backward()

        lr = (pset["learn_rate"] / 10) if k > pset["num_iters"] / 2 else pset["learn_rate"] 
        for idx, p in enumerate(params):
            param_grads[idx] = beta * param_grads[idx] + (1 - beta) * p.grad
            p.data = p.data - lr * param_grads[idx]

    with torch.no_grad():
        emb = c[x_val]
        h = torch.tanh(emb.view(-1, w1.shape[0]) @ w1 + b1)
        logits = h @ w2 + b2 + emb.view(-1, w1.shape[0]) @ w0
        val_loss = F.cross_entropy(logits, y_val)
        print(f"pset: {pset}, val_loss: {val_loss.item()}")


  7%|▋         | 1/15 [00:27<06:27, 27.66s/it]

pset: {'batch_size': 16, 'learn_rate': 0.1, 'num_iters': 50000}, val_loss: 2.2670199871063232


 13%|█▎        | 2/15 [00:56<06:05, 28.08s/it]

pset: {'batch_size': 16, 'learn_rate': 0.05, 'num_iters': 50000}, val_loss: 2.2923951148986816


 20%|██        | 3/15 [01:25<05:43, 28.60s/it]

pset: {'batch_size': 16, 'learn_rate': 0.01, 'num_iters': 50000}, val_loss: 2.422217607498169


 27%|██▋       | 4/15 [01:53<05:14, 28.57s/it]

pset: {'batch_size': 32, 'learn_rate': 0.1, 'num_iters': 50000}, val_loss: 2.2561957836151123


 33%|███▎      | 5/15 [02:22<04:44, 28.45s/it]

pset: {'batch_size': 32, 'learn_rate': 0.05, 'num_iters': 50000}, val_loss: 2.289093494415283


 40%|████      | 6/15 [02:51<04:18, 28.75s/it]

pset: {'batch_size': 32, 'learn_rate': 0.01, 'num_iters': 50000}, val_loss: 2.408313751220703


 47%|████▋     | 7/15 [03:23<04:00, 30.02s/it]

pset: {'batch_size': 64, 'learn_rate': 0.1, 'num_iters': 50000}, val_loss: 2.2334611415863037


 53%|█████▎    | 8/15 [03:56<03:35, 30.78s/it]

pset: {'batch_size': 64, 'learn_rate': 0.05, 'num_iters': 50000}, val_loss: 2.2751669883728027


 60%|██████    | 9/15 [04:28<03:06, 31.16s/it]

pset: {'batch_size': 64, 'learn_rate': 0.01, 'num_iters': 50000}, val_loss: 2.4027657508850098


 67%|██████▋   | 10/15 [05:05<02:45, 33.07s/it]

pset: {'batch_size': 128, 'learn_rate': 0.1, 'num_iters': 50000}, val_loss: 2.23679518699646


 73%|███████▎  | 11/15 [05:43<02:17, 34.47s/it]

pset: {'batch_size': 128, 'learn_rate': 0.05, 'num_iters': 50000}, val_loss: 2.266667604446411


 80%|████████  | 12/15 [06:22<01:47, 35.80s/it]

pset: {'batch_size': 128, 'learn_rate': 0.01, 'num_iters': 50000}, val_loss: 2.4048216342926025


 87%|████████▋ | 13/15 [07:13<01:21, 40.52s/it]

pset: {'batch_size': 256, 'learn_rate': 0.1, 'num_iters': 50000}, val_loss: 2.2498393058776855


 93%|█████████▎| 14/15 [08:04<00:43, 43.69s/it]

pset: {'batch_size': 256, 'learn_rate': 0.05, 'num_iters': 50000}, val_loss: 2.2753329277038574


100%|██████████| 15/15 [08:55<00:00, 35.71s/it]

pset: {'batch_size': 256, 'learn_rate': 0.01, 'num_iters': 50000}, val_loss: 2.406517505645752





In [28]:
g = torch.Generator().manual_seed(6161616161116)
c = torch.randn((27, 10), generator=g)
w1 = torch.randn((30, 200), generator=g)
b1 = torch.randn(200, generator=g)
w2 = torch.randn((200, 27), generator=g)
b2 = torch.randn(27, generator=g)
w0 = torch.randn((30, 27), generator=g)
params = [c, w1, b1, w2, b2, w0]
param_grads = [0.0 for _ in range(len(params))]
LEARN_RATE = 0.1
BATCH_SIZE = 256
N_ITERS = 200000
BETA = 0.9 
REG_STRENGTH = 0.01

In [29]:
for p in params:
    p.requires_grad = True

for k in range(N_ITERS):
    rand_idxs = torch.randint(0, x_train.shape[0], (BATCH_SIZE, ))
    x_batch = x_train[rand_idxs]
    y_batch = y_train[rand_idxs]

    emb = c[x_batch]
    h = torch.tanh(emb.view(-1, 3 * c.shape[1]) @ w1 + b1)
    logits = h @ w2 + b2 + emb.view(-1, w1.shape[0]) @ w0
    loss = F.cross_entropy(logits, y_batch) + REG_STRENGTH * (w0.pow(2).mean() + w1.pow(2).mean() + w2.pow(2).mean())

    
    for p in params:
        p.grad = None

    loss.backward()

    
    if k > N_ITERS / 2:
        lr = (LEARN_RATE / 10)
    elif k > N_ITERS / 4:
        lr = (LEARN_RATE / 100)
    else:
        lr = LEARN_RATE

    for idx, p in enumerate(params):
        param_grads[idx] = BETA * param_grads[idx] + (1 - beta) * p.grad
        p.data = p.data - lr * param_grads[idx]

with torch.no_grad():
    emb = c[x_val]
    h = torch.tanh(emb.view(-1, w1.shape[0]) @ w1 + b1)
    logits = h @ w2 + b2 + emb.view(-1, w1.shape[0]) @ w0
    val_loss = F.cross_entropy(logits, y_val)
    print(f"val_loss: {val_loss.item()}")

val_loss: 2.193235397338867


In [31]:
g = torch.Generator().manual_seed(694390)

for _ in range(20):
    out = []
    context = [0] * block_size 
    while True:
      emb = c[torch.tensor([context])] 
      h = torch.tanh(emb.view(1, -1) @ w1 + b1)
      logits = h @ w2 + b2 + emb.view(-1, w1.shape[0]) @ w0
      probs = F.softmax(logits, dim=1)
      ix = torch.multinomial(probs, num_samples=1, generator=g).item()
      context = context[1:] + [ix]
      out.append(ix)
      if ix == 0:
        break
    
    print(''.join(itos[i] for i in out))

kassela.
marvionna.
aver.
abriegh.
tay.
larika.
aiburlet.
keew.
kenex.
ashellen.
mila.
metalo.
cryl.
ketina.
evievi.
xokiriah.
mikannah.
brity.
fer.
elis.
