In [None]:
import time
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

from collections import Counter
from tqdm import tqdm

In [None]:
# Report whether PyTorch can see a CUDA device; later cells create tensors on the GPU.
torch.cuda.is_available()

In [None]:
# Create new tensors on the GPU by default.
# torch.set_default_tensor_type is deprecated in recent PyTorch releases; the
# supported replacement is set_default_device (plus set_default_dtype for the
# dtype). Fall back to the legacy call on versions that predate it.
if hasattr(torch, "set_default_device"):
    torch.set_default_device("cuda")
else:
    torch.set_default_tensor_type(torch.cuda.FloatTensor)

In [None]:
# Floating-point dtype used as the global default and for the one-hot inputs.
default_dtype = torch.float32

# Newly created floating-point tensors will use this dtype.
torch.set_default_dtype(default_dtype)

In [None]:
# Load the corpus: one word per line, line terminators stripped.
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default.
with open('names.txt', 'r', encoding='utf-8') as file:
    words = file.read().splitlines()

In [None]:
def bigrams(words):
    """Yield every adjacent pair of symbols in each word.

    Each word is wrapped in a leading and a trailing '.' boundary marker
    before pairing, so the first pair of a word is ('.', first symbol) and
    the last is (last symbol, '.').

    Args:
        words: iterable of words; each word is itself an iterable of symbols
            (e.g. a string).

    Yields:
        2-tuples ``(current, following)``, in word order.
    """
    for word in words:
        padded = ['.', *word, '.']
        yield from zip(padded, padded[1:])

In [None]:
# Bigram frequency table as (bigram, count) pairs, most frequent first.
# Counting over the *sorted* bigrams fixes the insertion order, so bigrams with
# equal counts come out in lexicographic order (the sort by count is stable).
data = Counter(sorted(bigrams(words))).most_common()

In [None]:
# Vocabulary: every distinct character that occurs in the corpus, sorted.
chars = sorted(set(''.join(words)))
# Character -> index. Corpus characters are numbered from 1; index 0 is
# reserved for the '.' boundary marker.
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['.'] = 0
# Index -> character (inverse of stoi).
itos = {idx: ch for ch, idx in stoi.items()}
# Number of distinct symbols, boundary marker included.
n = len(itos)

In [None]:
# Bigram count matrix, explicitly kept on the CPU: N[i, j] is filled with the
# number of times symbol i is followed by symbol j.
N = torch.zeros(n, n, dtype=torch.int32, device='cpu')

In [None]:
# Fill the count matrix from the corpus.
# The bigrams are aggregated with a Counter first and each cell is then
# *assigned* once. This makes the cell idempotent (re-running it no longer
# doubles the counts) and replaces one tensor update per bigram occurrence
# with one per distinct bigram.
N.zero_()
for (c1, c2), count in Counter(bigrams(words)).items():
    N[stoi[c1], stoi[c2]] = count

In [None]:
# Heat map of the bigram counts, with every cell annotated by the bigram it
# stands for (upper label) and its count (lower label).
fig, ax = plt.subplots(figsize=(16, 16), dpi=200)
ax.imshow(N, cmap='Blues')

label_style = dict(ha="center", color="gray")
for row in range(n):
    for col in range(n):
        # Matrix row -> y coordinate, matrix column -> x coordinate.
        ax.text(col, row, itos[row] + itos[col], va="bottom", **label_style)
        ax.text(col, row, N[row, col].item(), va="top", **label_style)
ax.axis("off")
fig.tight_layout()

In [None]:
# Probability of each symbol following the '.' boundary marker (row 0 of N),
# i.e. the distribution of first characters.
p = N[0].float()
p = p / p.sum()
# Seeded CPU generator so that sampling is reproducible.
g = torch.Generator(device='cpu').manual_seed(42)

In [None]:
# Row-normalise the counts: P[i, j] is the estimated probability that
# symbol j follows symbol i (each row sums to 1).
P = N.float()
P = P / P.sum(dim=1, keepdim=True)

In [None]:
def makeone():
    """Sample one word from the count-based bigram model.

    Starts at index 0 (the '.' boundary marker) and repeatedly draws the next
    symbol from the row of ``P`` belonging to the current symbol, using the
    module-level generator ``g``, until index 0 is drawn again.

    Returns:
        The sampled word as a string, without the boundary markers.
    """
    letters = []
    ix = 0
    while True:
        ix = torch.multinomial(P[ix], num_samples=1, replacement=True, generator=g).item()
        if ix == 0:
            return ''.join(letters)
        letters.append(itos[ix])

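Now we want to evaluate the model. The idea is to calculate the likelihood of the dataset given the model parameters; in practice we sum the log-probabilities of all bigrams and report the average negative log-likelihood per bigram (lower is better).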

In [None]:
# Evaluate the count-based model: total log-likelihood of the corpus, the
# single most likely word, and the average negative log-likelihood per bigram.

# The log of the probability table is loop-invariant, so take it once.
log_P = torch.log(P)

ll = 0
# Counted under its own name: `n` is the vocabulary size that the plotting
# cell depends on, and must not be overwritten here.
n_bigrams = 0
bestword = ''
bestll = -float('inf')
for word in words:
    chs = ['.'] + list(word) + ['.']
    wordll = 0
    for c1, c2 in zip(chs, chs[1:]):
        lp = log_P[stoi[c1], stoi[c2]]
        ll += lp
        wordll += lp
        n_bigrams += 1
    # Track the word with the highest total log-probability.
    if wordll > bestll:
        bestll = wordll
        bestword = word
print(bestword, bestll)
nnll = -ll/n_bigrams
print(nnll)

Create a dataset. We do this by turning character indices into one-hot vectors. The first character of each bigram is the input $x$; the second character is the target $y$.

In [None]:
# Training pairs: for every bigram in the corpus, the index of its first
# symbol goes to xs (input) and the index of its second symbol to ys (target).
x_idx, y_idx = [], []
for first, second in bigrams(words):
    x_idx.append(stoi[first])
    y_idx.append(stoi[second])
xs = torch.tensor(x_idx)
ys = torch.tensor(y_idx)

In [None]:
# One-hot encode the inputs, one row per training bigram. The width is taken
# from the vocabulary table instead of a hard-coded 27, so it always matches
# the symbol indices produced by stoi.
xenc = F.one_hot(xs, num_classes=len(itos)).to(default_dtype)

In [None]:
# Seeded CUDA generator used only for weight initialisation. It gets its own
# name so that it does not replace the CPU generator `g` that makeone()
# samples with (a CUDA generator cannot be used with CPU probabilities).
g_init = torch.Generator(device='cuda').manual_seed(2147483647)
# Single linear layer: one row of logits per input symbol, one column per
# output symbol. Sized from the vocabulary rather than a hard-coded 27.
W = torch.randn(len(itos), len(itos), generator=g_init, requires_grad=True)

In [None]:
# Shortcut: we can jump straight to a perfect set of weights by taking the log of the count-based probabilities (the small constant avoids log(0))
# W = torch.log(P+0.0000000001).cuda(); W.requires_grad = True

In [None]:
# Fit W by full-batch gradient descent on the average negative log-likelihood.
NUM_STEPS = 100000    # number of gradient steps
LEARNING_RATE = 1.0   # step size
LOG_EVERY = 1000      # print the loss every this many steps

t = time.time()
print(f"{'epoch':>6} {'loss':>10} {'time,s':>7}")
for i in range(NUM_STEPS):
    logits = xenc @ W  # log-counts
    # cross_entropy is the mean of -log softmax(logits)[target]: the same loss
    # as exponentiating, normalising and taking the log by hand, but computed
    # with log-sum-exp so large logits cannot overflow.
    # (optional regularisation: + 20*(W**2).mean())
    loss = F.cross_entropy(logits, ys)
    if i % LOG_EVERY == 0:
        tt = time.time()
        print(f"{i+1:6} {loss.item():10.5f} {tt-t:>7.2f}")
        t = tt

    W.grad = None  # zero out the gradients
    loss.backward()
    # Update the parameter outside of autograd instead of going through .data.
    with torch.no_grad():
        W -= LEARNING_RATE * W.grad

Sample from the net.

In [None]:
# Current symbol index for sampling; 0 is the '.' boundary marker.
i = 0

In [None]:
# Sample one word from the trained network, one symbol at a time.
i = 0  # always start from the '.' boundary marker so the cell is self-contained
sampled_chars = []
with torch.no_grad():  # inference only: do not record an autograd graph through W
    while True:
        # Names are kept distinct from the training tensors (xenc, logits,
        # probs) so the training cell can still be re-run after sampling.
        x_step = F.one_hot(torch.tensor([i]), W.shape[0]).to(W.dtype)
        step_probs = torch.softmax(x_step @ W, dim=1)

        i = torch.multinomial(step_probs, num_samples=1, replacement=True).item()

        # Index 0 is the boundary marker: the word is complete.
        if i == 0:
            break

        sampled_chars.append(itos[i])
w = ''.join(sampled_chars)
print(w)