In [None]:
import torch
import torch.nn.functional as F

# Bigram-based training

In [None]:
# Load the dataset: one name per line. The context manager closes the file
# handle as soon as the read finishes instead of leaving it open until the
# garbage collector gets around to it.
with open('names.txt', 'r') as f:
    names = f.read().splitlines()

# Count how often each adjacent character pair (bigram) occurs. Every name is
# wrapped in the explicit '<S>' / '<E>' sentinels so that "first character"
# and "last character" are counted as bigrams too.
freq = {}

for n in names:
    chs = ['<S>'] + list(n) + ['<E>']
    # zip(chs, chs[1:]) yields the (current, next) tuples used as dict keys.
    for bigram in zip(chs, chs[1:]):
        freq[bigram] = freq.get(bigram, 0) + 1

In [None]:
# Build the character vocabulary. Index 0 is reserved for '.', the single
# token that marks both the start and the end of a name; the characters found
# in `names` are numbered from 1 in sorted order.
chars = sorted(set(''.join(names)))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}

# N[i, j] counts how often character j follows character i. The table is sized
# from the vocabulary rather than a hard-coded 28: an extra row that is never
# incremented sums to zero, so normalising each row by its sum (as a later
# cell does) divides 0 by 0 and fills that row with NaN.
vocab_size = len(stoi)
N = torch.zeros((vocab_size, vocab_size), dtype=torch.int32)

for n in names:
    chs = ['.'] + list(n) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        N[ix1, ix2] += 1

In [None]:
# Normalise every row of the count table so that row i becomes the sampling
# distribution over the character that follows character i.
P = N.float()
row_totals = P.sum(1, keepdim=True)
P = P / row_totals

g = torch.Generator().manual_seed(2147483647)

# Draw 1000 names. Each one starts at index 0 ('.') and keeps sampling the
# next character from the current character's row until index 0 comes up
# again; the terminating '.' is part of the printed string.
for sample_no in range(1000):
    pieces = []
    ix = 0
    finished = False
    while not finished:
        row = P[ix]
        drawn = torch.multinomial(row, num_samples=1, replacement=True, generator=g)
        ix = drawn.item()
        pieces.append(itos[ix])
        finished = (ix == 0)
    print(''.join(pieces))
    

In [None]:
# Score the bigram table on a list of words: add up log P[current, next] over
# every bigram (including the leading and trailing '.'), then report the
# total, its negation, and the negation averaged over the number of bigrams.
log_likelihood = 0.0
n = 0

for w in ["test"]:
    symbols = ['.'] + list(w) + ['.']
    for prev_ch, next_ch in zip(symbols, symbols[1:]):
        prob = P[stoi[prev_ch], stoi[next_ch]]
        logprob = prob.log()
        log_likelihood = log_likelihood + logprob
        n = n + 1
        print(f'{prev_ch}{next_ch}: {prob:.4f} {logprob:.4f}')

print(f'{log_likelihood=}')
nll = -log_likelihood
print(f'{nll=}')
print(f'{nll/n}')

# A Neural-Network-Based Approach

In [None]:
# Build the training set for the neural model: for every bigram, the index of
# the current character goes into `xs` (input) and the index of the character
# that follows it goes into `ys` (target).
xs, ys = [], []

for n in names:
    chs = ['.'] + list(n) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        xs.append(stoi[ch1])
        ys.append(stoi[ch2])

xs = torch.tensor(xs)
ys = torch.tensor(ys)
nelems = xs.nelement()  # number of (input, target) pairs

# Draw the initial 27x27 weights from a seeded generator so that a fresh
# Restart & Run All starts training from the same point every time. A bare
# torch.randn call uses the global RNG, which the visible cells never seed.
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27, 27), generator=g, requires_grad=True)

In [None]:
# Hyper-parameters of the gradient-descent loop.
NUM_STEPS = 100
LEARNING_RATE = 50.0
REG_STRENGTH = 0.01  # weight of the mean-squared-weight penalty added to the loss

# The one-hot encoding of the inputs does not change between steps, so it is
# built once here instead of on every iteration. The number of classes is
# taken from W so the matrix product below always lines up.
xenc = F.one_hot(xs, num_classes=W.shape[0]).float()

for i in range(NUM_STEPS):
    # forward pass
    logits = xenc @ W
    # F.cross_entropy is the mean negative log-probability that
    # softmax(logits) assigns to the targets -- the same quantity as
    # exponentiating, normalising and taking -log by hand, but computed through
    # log-softmax so that large logits cannot overflow exp() into inf/NaN.
    loss = F.cross_entropy(logits, ys) + REG_STRENGTH * (W**2).mean()
    print(loss.item())

    # backward pass
    W.grad = None
    loss.backward()  # walks the graph recorded during the forward pass and fills W.grad

    # Gradient-descent step. torch.no_grad() keeps the in-place update out of
    # the autograd graph without reaching around autograd via `.data`.
    with torch.no_grad():
        W -= LEARNING_RATE * W.grad

In [None]:
# Sample 5 names from the trained weights. A seeded generator makes the
# printed names reproducible across runs, matching how the count-based model
# is sampled earlier in the notebook.
g = torch.Generator().manual_seed(2147483647)

# Sampling is inference only: no_grad stops PyTorch from recording an autograd
# graph for every character drawn (W has requires_grad=True).
with torch.no_grad():
    for i in range(5):
        out = []
        ix = 0  # index 0 is '.', the start/end token

        while True:
            # Multiplying a one-hot vector for `ix` by W just selects row ix of
            # W, so index it directly; softmax turns that row of logits into
            # next-character probabilities without overflowing exp().
            p = F.softmax(W[ix], dim=0)

            ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
            out.append(itos[ix])
            if ix == 0:
                break
        print(''.join(out))
    