In [None]:
words = open("names.txt", 'r').read().splitlines()

In [None]:
b = {}
for w in words:
    # Add start and ending characters
    chs = ["<S>"] + list(w) + ["<E>"] 
    # Get pairs of characters in a sliding window and add to dict
    for ch1, ch2 in zip(chs, chs[1:]):
        bigram = (ch1, ch2)
        b[bigram] = b.get(bigram, 0) + 1

In [None]:
# Sort by highest occurence
sorted(b.items(), key = lambda kv: kv[1], reverse = True)

In [None]:
import torch

In [None]:
N = torch.zeros((27, 27), dtype = torch.int32)

In [None]:
# Get all letters of the alphabet in a list
chars = sorted(list(set(''.join(words))))

# Create mapping
s_to_i = {s:i for i, s in enumerate(chars, start=1)}
s_to_i['.'] = 0
s_to_i

# Reverse Mapping
i_to_s = {i:s for s, i in s_to_i.items()}
i_to_s

In [None]:
for w in words:
    # Add start and ending characters
    chs = ["."] + list(w) + ["."] 
    # Get pairs of characters in a sliding window and add to dict
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = s_to_i[ch1]
        ix2 = s_to_i[ch2]
        N[ix1, ix2] += 1

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.imshow(N, cmap='Blues')

for i in range(27):
    for j in range(27):
        chstr = i_to_s[i] + i_to_s[j]
        plt.text(j, i, chstr, ha="center", va="bottom", color="gray", fontsize='4')
        plt.text(j, i, N[i, j].item(), ha="center", va="top", color="gray", fontsize='4')

plt.axis('off')

In [None]:
N[0]

In [None]:
p = N[0].float()
p = p / p.sum()
p

In [None]:
# We do +1 to smooth the model and prevent -inf prob from log
P = (N+1).float()
# Sum each row of the vector, and return sum in column (1) format
# Keep dim is True to hold dimensions and will be used for broadcasting
P /= P.sum(1, keepdim=True) # /= is faster because in place and doesn't create new memory
P[0].sum()

In [None]:
# Generator for multimonial
g = torch.Generator().manual_seed(214748)

for i in range(10):
    out = []
    ix = 0
    while True:

        p = P[ix] # Made code below more efficient on top
        # p = N[ix].float()
        # Get prob for each p (first letter)
        # p = p / p.sum()
        # print(p)
        
        # draw random samples from a dataset where some items are more likely to be picked than others
        ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        out.append(i_to_s[ix])
        if ix == 0:
            break
    print(''.join(out))
    

In [17]:
# GOAL: maximise the likelihood of data w.r.t. model parameters
# equivalent to maximising log likelihood
# equivalent to minimising negative log likelihood
# equivalent to minmising the average nll 

# We use log because log(a * b * c) = log(a) + log(b) + log(c)

log_likelihood = 0.0
n = 0

for w in words:
    # Add start and ending characters
    chs = ["."] + list(w) + ["."] 
    # Get pairs of characters in a sliding window and add to dict
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = s_to_i[ch1]
        ix2 = s_to_i[ch2]
        prob = P[ix1, ix2]
        logprob = torch.log(prob)
        log_likelihood += logprob
        n += 1
        print(f'{ch1}{ch2}: {prob:.4f}{logprob:.4f}')

print(f'{log_likelihood=}')
nll = -log_likelihood
print(f'{nll=}')
print(f'{nll/n=}')


.e: 0.0478-3.0410
em: 0.0377-3.2793
mm: 0.0253-3.6753
ma: 0.3885-0.9454
a.: 0.1958-1.6305
.o: 0.0123-4.3965
ol: 0.0779-2.5526
li: 0.1774-1.7293
iv: 0.0152-4.1845
vi: 0.3508-1.0476
ia: 0.1380-1.9807
a.: 0.1958-1.6305
.a: 0.1376-1.9835
av: 0.0246-3.7041
va: 0.2473-1.3971
a.: 0.1958-1.6305
.i: 0.0185-3.9919
is: 0.0743-2.5998
sa: 0.1478-1.9119
ab: 0.0160-4.1363
be: 0.2455-1.4044
el: 0.1589-1.8396
ll: 0.0962-2.3408
la: 0.1876-1.6733
a.: 0.1958-1.6305
.s: 0.0641-2.7468
so: 0.0654-2.7270
op: 0.0121-4.4180
ph: 0.1947-1.6364
hi: 0.0955-2.3485
ia: 0.1380-1.9807
a.: 0.1958-1.6305
.c: 0.0481-3.0339
ch: 0.1869-1.6774
ha: 0.2937-1.2251
ar: 0.0963-2.3405
rl: 0.0325-3.4256
lo: 0.0496-3.0047
ot: 0.0149-4.2032
tt: 0.0670-2.7031
te: 0.1281-2.0549
e.: 0.1948-1.6357
.m: 0.0792-2.5358
mi: 0.1885-1.6687
ia: 0.1380-1.9807
a.: 0.1958-1.6305
.a: 0.1376-1.9835
am: 0.0482-3.0321
me: 0.1228-2.0971
el: 0.1589-1.8396
li: 0.1774-1.7293
ia: 0.1380-1.9807
a.: 0.1958-1.6305
.h: 0.0273-3.6011
ha: 0.2937-1.2251
ar: 0.0963

## Full Run

In [None]:
# create a training set for the bigrams (x,y)
xs, ys = [], []

for w in words:
    # Add start and ending characters
    chs = ["."] + list(w) + ["."] 
    # Get pairs of characters in a sliding window and add to dict
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = s_to_i[ch1]
        ix2 = s_to_i[ch2]
        print(ch1, ch2)
        xs.append(ix1)
        ys.append(ix2)

xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()

In [None]:
import torch.nn.functional as F

# initialise the network
W = torch.randn(27, 27, requires_grad=True)

for k in range(100):
    # -- forward pass
    # xenc(5, 27), W = (27,27)
    xenc = F.one_hot(xs, num_classes=27).float()
    logits = (xenc @ W) # predicts log counts
    # this creates the softmax function
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdim=True) # probs for next character
    loss = -probs[torch.arange(num), ys].log().mean() + 0.1 * (W**2).mean() # applying regularisation - way to penalise using large weights

    # -- backward pass
    W.grad = None # set to zero
    loss.backward()

    # -- update
    W.data += -50 * W.grad 
    print(loss)

