# 02 — Neural bigram model

Replace the count-based bigram model with a trainable neural model:
- one-hot encoded inputs
- linear layer
- softmax
- cross-entropy loss
- gradient descent

In [1]:
import torch
from pathlib import Path

# Seed torch's global RNG so any unseeded randomness in this notebook is reproducible.
torch.manual_seed(1337)

# load data
# Use the current directory as the repo root when it contains data/; otherwise
# fall back to its parent if that one does (covers running the notebook from a
# folder one level below the repo root).
REPO_ROOT = Path.cwd()
if not (REPO_ROOT / "data").exists() and (REPO_ROOT.parent / "data").exists():
    REPO_ROOT = REPO_ROOT.parent

data_path = REPO_ROOT / "data" / "names.txt"
# Pass the encoding explicitly: the default depends on the platform locale.
words = data_path.read_text(encoding="utf-8").splitlines()

# vocabulary
# Index 0 is reserved for the "." boundary token; real characters start at 1.
chars = sorted(set("".join(words)))
stoi = {s: i + 1 for i, s in enumerate(chars)}
stoi["."] = 0
itos = {i: s for s, i in stoi.items()}
vocab_size = len(stoi)

# build dataset
# Every word contributes its consecutive (current char, next char) pairs,
# with "." marking both the start and the end of the word.
xs, ys = [], []

for w in words:
    chs = ["."] + list(w) + ["."]
    for ch1, ch2 in zip(chs, chs[1:]):
        xs.append(stoi[ch1])
        ys.append(stoi[ch2])

# Convert the index lists to tensors: xs are inputs, ys the matching targets.
xs = torch.tensor(xs)
ys = torch.tensor(ys)

xs.shape, ys.shape

(torch.Size([228146]), torch.Size([228146]))

In [2]:
import torch.nn.functional as F

# initialize weights
# A dedicated, explicitly seeded generator keeps the initial W reproducible.
g = torch.Generator()
g.manual_seed(2147483647)

# W is (vocab_size, vocab_size), drawn from a standard normal and tracked by autograd.
W = torch.randn(vocab_size, vocab_size, generator=g, requires_grad=True)

In [3]:
# One-hot encode the input indices: row n is all zeros except a 1 at column xs[n].
xenc = F.one_hot(xs, vocab_size).float()  # (N, vocab_size)

# Linear layer without bias: the one-hot row times W gives that example's logits.
logits = torch.matmul(xenc, W)  # (N, vocab_size)

# Cross-entropy on the logits (softmax + negative log-likelihood of the targets);
# this is the same NLL objective.
loss = F.cross_entropy(logits, ys)
loss.item()

3.758953332901001

In [4]:
lr = 40.0
steps = 4000

# The inputs never change, so one-hot encode them once instead of on every step.
xenc = F.one_hot(xs, num_classes=vocab_size).float()  # (N, vocab_size)

# steps + 1 iterations so the last progress line is printed at i == steps.
for i in range(steps + 1):
    # forward
    logits = xenc @ W
    loss = F.cross_entropy(logits, ys)

    # backward
    W.grad = None  # drop the previous gradient so it does not accumulate
    loss.backward()

    # update
    # Plain gradient-descent step; done under no_grad so autograd does not
    # record the in-place modification of W.
    with torch.no_grad():
        W -= lr * W.grad

    # The reported loss is the one computed before this step's update.
    if i % 200 == 0:
        print(f"{i:4d}/{steps}  loss={loss.item():.4f}")

   0/4000  loss=3.7590
 200/4000  loss=2.4648
 400/4000  loss=2.4589
 600/4000  loss=2.4571
 800/4000  loss=2.4563
1000/4000  loss=2.4558
1200/4000  loss=2.4554
1400/4000  loss=2.4552
1600/4000  loss=2.4550
1800/4000  loss=2.4549
2000/4000  loss=2.4548
2200/4000  loss=2.4547
2400/4000  loss=2.4547
2600/4000  loss=2.4546
2800/4000  loss=2.4546
3000/4000  loss=2.4545
3200/4000  loss=2.4545
3400/4000  loss=2.4545
3600/4000  loss=2.4544
3800/4000  loss=2.4544
4000/4000  loss=2.4544


In [5]:
def sample_name_neural(W, stoi, itos, g, max_len=30):
    """Sample one name from the neural bigram model.

    Starts at the boundary token (index 0) and repeatedly draws the next
    character index from ``softmax(one_hot(ix) @ W)``. Sampling stops when
    index 0 is drawn again or after ``max_len`` draws.

    Args:
        W: Square weight matrix; row ``i`` holds the logits for the character
            that follows index ``i``. Its first dimension is used as the
            number of one-hot classes.
        stoi: Character-to-index mapping. Not used by the sampler; kept so
            existing calls keep working.
        itos: Index-to-character mapping used to decode sampled indices.
        g: ``torch.Generator`` that drives the multinomial sampling.
        max_len: Maximum number of characters to generate.

    Returns:
        The sampled name as a string, without the boundary token.
    """
    # Take the class count from W itself rather than from a notebook-level
    # global, so the function only depends on its arguments.
    num_classes = W.shape[0]

    out = []
    ix = 0  # index 0 is the boundary token: start of a name
    # Sampling needs no gradients; avoid building an autograd graph per step.
    with torch.no_grad():
        for _ in range(max_len):
            x = F.one_hot(torch.tensor([ix]), num_classes=num_classes).float()  # (1, num_classes)
            logits = x @ W                                                      # (1, num_classes)
            probs = F.softmax(logits, dim=1).squeeze(0)                         # (num_classes,)
            ix = torch.multinomial(probs, num_samples=1, replacement=True, generator=g).item()
            if ix == 0:
                # Boundary token drawn again: the name is finished.
                break
            out.append(itos[ix])
    return "".join(out)

# Separate, explicitly seeded generator so the printed samples are reproducible.
g_sample = torch.Generator()
g_sample.manual_seed(2147483647)

# Draw and print ten names using the current weights W.
for _ in range(10):
    print(sample_name_neural(W, stoi, itos, g=g_sample))

cexze
momasurailezitynn
konimittain
llayn
ka
da
staiyaubrtthrigotai
moliellavo
ke
teda
