In [3]:
import torch
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
words = open('names.txt', 'r').read().splitlines()
words[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [5]:
min_length = min(len(w) for w in words)
max_length = max(len(w) for w in words)
count = len(words)
print(f'{min_length=} {max_length=} {count=}')

min_length=2 max_length=15 count=32033


In [6]:
characters = ['.'] + sorted(list(set(''.join(words))))
ALPHABET_SIZE = len(characters)
N = torch.zeros((ALPHABET_SIZE, ALPHABET_SIZE), dtype=torch.int32)

In [7]:
chr_to_index = {s:i for i,s in enumerate(characters)}
index_to_chr = {i:s for i,s in enumerate(characters)}

class StoiMapper(object):
    def __getitem__(self, key):
        return chr_to_index[key[0]] * ALPHABET_SIZE + chr_to_index[key[1]]


class ItosMapper(object):
    def __getitem__(self, key):
        return index_to_chr[key // ALPHABET_SIZE] + index_to_chr[key % ALPHABET_SIZE]

stoi = StoiMapper()
itos = ItosMapper()
stoi['aa']

28

In [8]:
# Create the training set of all the bigrams
xs, ys = [], []

for w in words:
    chs = ['.', '.'] + list(w) + ['.']
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        x_index = stoi[ch1 + ch2]
        y_index = chr_to_index[ch3]
        xs.append(x_index)
        ys.append(y_index)

xs = torch.tensor(xs)
ys = torch.tensor(ys)

In [9]:
import torch.nn.functional as F
xenc = F.one_hot(xs, num_classes=ALPHABET_SIZE ** 2).float()
xenc

tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 1., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 1., 0., 0.]])

In [10]:
xenc.shape

torch.Size([228146, 729])

In [11]:
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((ALPHABET_SIZE ** 2, ALPHABET_SIZE), generator=g, requires_grad=True)
xenc @ W

tensor([[ 1.5674, -0.2373, -0.0274,  ..., -0.0707,  2.4968,  2.4448],
        [ 0.4724,  1.4830,  0.3175,  ..., -0.4275, -2.1259,  0.9604],
        [ 0.1275,  1.7862,  0.9084,  ..., -0.0410,  0.4848, -0.9423],
        ...,
        [ 0.5146, -1.0181, -1.2665,  ..., -1.0288,  0.5508, -1.0114],
        [-0.9299,  0.3116,  1.3902,  ...,  2.1475,  1.6333,  1.9126],
        [-0.5997,  0.4037, -0.3309,  ..., -0.5076, -2.0352, -0.1582]],
       grad_fn=<MmBackward0>)

In [29]:
learning_rate = -100
# gradient descent
for k in range(100):
    # forward pass
    xenc = F.one_hot(xs, num_classes=ALPHABET_SIZE ** 2).float()
    logits = xenc @ W # log-counts
    loss = F.cross_entropy(logits, ys) + 0.001 * (W**2).mean()
    print(loss.item())

    # backward pass
    W.grad = None
    loss.backward()

    # update
    W.data += learning_rate * W.grad

2.279918909072876
2.2796058654785156
2.2792952060699463
2.278986692428589
2.2786805629730225
2.278376817703247
2.2780747413635254
2.2777748107910156
2.2774770259857178
2.277181625366211
2.276887893676758
2.2765963077545166
2.2763068675994873
2.27601957321167
2.2757339477539062
2.2754502296447754
2.2751686573028564
2.274888515472412
2.274610996246338
2.274334669113159
2.2740607261657715
2.2737886905670166
2.2735178470611572
2.2732491493225098
2.272982120513916
2.272716999053955
2.272453784942627
2.2721922397613525
2.2719321250915527
2.2716739177703857
2.2714173793792725
2.271162271499634
2.270909070968628
2.2706573009490967
2.2704074382781982
2.2701587677001953
2.269912004470825
2.2696666717529297
2.269422769546509
2.2691807746887207
2.268939971923828
2.26870059967041
2.268462896347046
2.2682266235351562
2.2679922580718994
2.26775860786438
2.267526626586914
2.267296314239502
2.2670674324035645
2.2668395042419434
2.266613245010376
2.266388416290283
2.266165018081665
2.2659428119659424
2.

In [30]:
# sample neural net
g = torch.Generator().manual_seed(2147483647)

for i in range(20):
    out = []
    ix = 0
    while True:
        xenc = F.one_hot(torch.tensor([ix]), num_classes=ALPHABET_SIZE**2).float()
        logits = xenc @ W
        p = logits.exp()
        ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        new_letter = index_to_chr[ix]
        out.append(new_letter)
        if new_letter == '.':
            break
    print(''.join(out))


misabrucheneviwozeluchinitalqujaaquprhezasinarahelilelalaumalameroluolimalyumi.
gadoadamazocanananolorakhelladabhaborhemadaxruxanelivisufazahujoqumananozreyusinanisisavitrelivaluyaleroyuquwauxisadayalanoledalenememamamanilizalozevazakalemanazyroloxtorojoavatodanosharyivarokanimanevaishelaucanatinakrelueloralunalyumaro.
jarodelatikanadaroshaemadanarufinarhavasyaravilelevamumabelinangraroadezyoqusalujadelespedoryoladedanabryaninoshamarasaocaremazhehalosurolondemaluvaelazishelelivizetralevanelizalarelalumalabraholinoluxtrumunotyakezastatyatanilelananahanajadelixqbavevamalamaosowelomyeyareirishayogrorhamarhastomanasariscladocrolomomaniyadalitazuorutraivememadamidadakicouralemuroxayauzabralomilhofrouflayososanalasaryasuomelyianyamemazecrilamaralomcotelesilayadelanyaamesedanocrorarirelurelaasyaremaladelalanamesarileustanaloshaalamakamivalelulaylikaphellalezaninanakelevitadalanifodadaveselolarakedemalizelanirmalurozyamasanorozabemchilalaeliroletalufrazabraledanemesarogelemaxchanavizevalamisul

In [28]:
x = W[stoi['ad']].exp()
x /= x.sum()
x[chr_to_index['.']].item()

0.0976322591304779

In [16]:
# Learnings
# - One hot is useful for categorical data
# - Loss function of NLL is useful for categorical data as opposed to MSE for regression
# - Softmax is useful for categorical data

# Questions:
# Q: Why is the learning rate value negative?
# A: Because we are trying to minimize the loss function, and the gradient is pointing in the direction of the steepest ascent.
#    So we need to go in the opposite direction.
# Q: What is -0.1 a reasonable learning rate?

# counting loss 2.45
# nn loss 2.46