# my_makemore

In [1]:
%pip install torch

Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch
from torch import nn
import random

### Preparing the training data

In [3]:
# Read the whole names dataset into memory. The encoding is pinned so the text that is
# read does not depend on the platform's default locale.
with open('names.txt', 'r', encoding='utf-8') as file:
    contents = file.read()

In [4]:
# We'll use '.' as the start/end delimiter. We could make them different but not sure it makes
# a difference and this way the processing into bigrams here is really simple.
#
# The names are re-extracted from `contents` on every run, treating both line breaks and '.'
# as separators, stripping surrounding whitespace and dropping empty entries. That way:
#   * re-running this cell does not keep wrapping the text in additional '.' characters, and
#   * a trailing newline or blank line in the file does not produce a '..' bigram
#     (which would teach the model to emit empty names).
names = [name.strip() for name in contents.replace('.', '\n').splitlines() if name.strip()]
contents = '.' + '.'.join(names) + '.'
print(contents[:20] + " ... " + contents[-20:])
# Every pair of adjacent characters, including the pairs that touch a '.' delimiter.
bigrams = [x + y for x, y in zip(contents[:-1], contents[1:])]
print(bigrams[:20] + bigrams[-20:])

.emma.olivia.ava.isa ... n.zyrie.zyron.zzyzx.
['.e', 'em', 'mm', 'ma', 'a.', '.o', 'ol', 'li', 'iv', 'vi', 'ia', 'a.', '.a', 'av', 'va', 'a.', '.i', 'is', 'sa', 'ab', 'an', 'n.', '.z', 'zy', 'yr', 'ri', 'ie', 'e.', '.z', 'zy', 'yr', 'ro', 'on', 'n.', '.z', 'zz', 'zy', 'yz', 'zx', 'x.']


In [5]:
# Build the vocabulary: every distinct character that occurs in the bigrams, in sorted order,
# together with lookup tables in both directions. Working with integer indices makes it easy
# to encode characters as one-hot vectors for input into our model.
letters = sorted({char for bigram in bigrams for char in bigram})
ltoi = dict(zip(letters, range(len(letters))))
itol = dict(enumerate(letters))
# Round trip for 'a': print its index, then map the index back (shown as the cell output).
print(ltoi['a'])
itol[ltoi['a']]

1


'a'

In [6]:
def one_hot(letter: str) -> torch.Tensor:
    """Encode a single character as a one-hot vector over the vocabulary.

    The result has ``len(letters)`` entries, all 0.0 except for a 1.0 at position
    ``ltoi[letter]``. A character that is not a key of ``ltoi`` raises ``KeyError``.
    """
    size = len(letters)
    hot_index = ltoi[letter]
    return torch.tensor([1.0 if position == hot_index else 0.0 for position in range(size)])

In [7]:
def _encode_bigram(bigram):
    """Return (one-hot of the first character, vocabulary index of the second character)."""
    return one_hot(bigram[0]), ltoi[bigram[1]]


# `converted` is a lazy, single-use iterator: the list() call below consumes it completely,
# so it is empty after this cell has run.
converted = map(_encode_bigram, bigrams)
first_pair = list(converted)[0]
print(first_pair)

(tensor([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0.]), 5)


### Set up the model

In [8]:
class NextLetterSimple(nn.Module):
    """Single linear layer mapping a one-hot current character to next-character logits.

    Args:
        vocab_size: Number of distinct characters; used as both the input and the output
            width. When omitted it defaults to ``len(letters)``, the notebook's vocabulary.
    """

    def __init__(self, vocab_size=None):
        super().__init__()
        if vocab_size is None:
            # Resolved at construction time so the default follows the current vocabulary.
            vocab_size = len(letters)
        self.linear = nn.Linear(vocab_size, vocab_size)

    def forward(self, x):
        """Return the unnormalised scores (logits) the linear layer produces for ``x``."""
        return self.linear(x)

In [9]:
class NextLetterFancy(nn.Module):
    """Three-layer ReLU MLP mapping a one-hot current character to next-character logits.

    Args:
        vocab_size: Number of distinct characters; used as both the input and the output
            width. When omitted it defaults to ``len(letters)``, the notebook's vocabulary.
        hidden_size: Width of the two hidden layers.
    """

    def __init__(self, vocab_size=None, hidden_size=512):
        super().__init__()
        if vocab_size is None:
            # Resolved at construction time so the default follows the current vocabulary.
            vocab_size = len(letters)
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(vocab_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, vocab_size),
        )

    def forward(self, x):
        """Return the unnormalised scores (logits) the stack produces for ``x``."""
        return self.linear_relu_stack(x)

In [10]:
# Pick the best available backend: CUDA first, then Apple's MPS, otherwise plain CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using mps device


In [11]:
def do_test(num_names, model, max_len=100):
    """Sample ``num_names`` names from a bigram ``model`` and print them, one per line.

    The model is evaluated once on every character of the vocabulary (as one-hot rows) to
    get a table of next-character distributions. Each name is then generated by repeatedly
    sampling from the row of the current character, starting from '.' and stopping when '.'
    is drawn again or the name has reached ``max_len`` characters.

    Args:
        num_names: How many names to generate.
        model: Module mapping one-hot rows of width ``len(letters)`` to logits of the same
            width.
        max_len: Upper bound on the length of a generated name; guards against a model
            that (almost) never emits '.'.

    Note:
        The model is left in eval mode when this function returns.
    """
    # Run on whatever device the model lives on rather than assuming the notebook-wide
    # `device`; that global is only the fallback for a model without parameters.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device(device)
    # Row i of the identity matrix is exactly the one-hot encoding of letters[i].
    inputs = torch.eye(len(letters)).to(target_device)
    model.eval()
    with torch.no_grad():
        output = model(inputs)
        # Bring the small probability table to the CPU once, so drawing each character
        # does not force a separate device-to-host synchronisation.
        probs = torch.nn.functional.softmax(output, dim=1).cpu()

    boundary = ltoi['.']
    for i in range(num_names):
        chars = []
        current = torch.multinomial(probs[boundary], 1).item()
        while current != boundary and len(chars) < max_len:
            chars.append(itol[current])
            current = torch.multinomial(probs[current], 1).item()
        print(f"{i}: {''.join(chars)}")

In [12]:
# set up the training X, y
# X: one-hot encoding of each bigram's first character. It is built with a single vectorised
#    call rather than by creating and stacking one small tensor per bigram in Python.
# y: vocabulary index of each bigram's second character (class-index targets for the loss).
first_indices = torch.tensor([ltoi[g[0]] for g in bigrams], dtype=torch.long)
inputs = torch.nn.functional.one_hot(first_indices, num_classes=len(letters))
inputs = inputs.to(torch.get_default_dtype())  # one_hot yields integers; the model needs floats
expected = torch.tensor([ltoi[g[1]] for g in bigrams], dtype=torch.long)
inputs = inputs.to(device)
expected = expected.to(device)

In [13]:
# Sanity check: the first training label must be the index of the second character of the
# first bigram. Written as a bare expression that lookup is silently discarded (only the last
# expression of a cell is displayed), so the comparison is asserted instead.
assert expected[0].item() == ltoi[bigrams[0][1]]
expected[0]

tensor(5, device='mps:0')

In [14]:
def train(inputs, expected, model, epochs, print_loss=False, lr=1e-1):
    """Fit ``model`` with full-batch SGD on a cross-entropy objective.

    Args:
        inputs: Batch that is fed to ``model`` on every epoch.
        expected: Targets handed to ``nn.CrossEntropyLoss`` together with the predictions
            (class indices in this notebook).
        model: Module optimised in place; it is switched to train mode.
        epochs: Number of gradient steps; each step uses the whole batch.
        print_loss: When true, print the loss at every epoch.
        lr: Learning rate for ``torch.optim.SGD``.

    The loss of the last forward pass is printed at the end when at least one epoch ran.
    """
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    model.train()

    # Discard gradients that may have been accumulated before this call so they cannot
    # contaminate the first update.
    optimizer.zero_grad()

    # Stays None when epochs == 0, so the summary below never touches an unbound name.
    loss = None
    for epoch in range(epochs):
        pred = model(inputs)
        loss = loss_fn(pred, expected)
        if print_loss:
            print(f"epoch: {epoch} loss: {loss.item():>7f}")
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    if loss is not None:
        print(f"loss={loss}")

In [15]:
# Seed PyTorch's random number generators so the initial weights and the sampled names can
# be reproduced when the notebook is re-run in the same environment.
torch.manual_seed(0)

model_simple = NextLetterSimple().to(device)

print('Model output before training:')
do_test(10, model_simple)

train(inputs, expected, model_simple, 5000, print_loss=True)

print('Model output after training:')
do_test(10, model_simple)

Model output before training:
0: cyvdbzbczyvhklidrmgryawipqyeggothdt
1: otvdiwveflzftpxxwgtrper
2: bhndgmtmdkmfcvlpvwkpqnrinlwvrcmgcccgnznohupqhwvqfswgdlr
3: 
4: outndfhebhmudvwgn
5: scwevetthrncvtay
6: whyvnmwcjtxrofbmvtru
7: xiddvikfvkimcmloxxxebfwg
8: idgkgauwkbkyla
9: xphphrcuhexgmuhxcudojbgkrtfhbtutu
epoch: 0 loss: 3.349842
epoch: 1 loss: 3.344769
epoch: 2 loss: 3.339730
epoch: 3 loss: 3.334724
epoch: 4 loss: 3.329753
epoch: 5 loss: 3.324815
epoch: 6 loss: 3.319911
epoch: 7 loss: 3.315041
epoch: 8 loss: 3.310205
epoch: 9 loss: 3.305403
epoch: 10 loss: 3.300635
epoch: 11 loss: 3.295900
epoch: 12 loss: 3.291199
epoch: 13 loss: 3.286532
epoch: 14 loss: 3.281899
epoch: 15 loss: 3.277299
epoch: 16 loss: 3.272733
epoch: 17 loss: 3.268200
epoch: 18 loss: 3.263701
epoch: 19 loss: 3.259236
epoch: 20 loss: 3.254803
epoch: 21 loss: 3.250405
epoch: 22 loss: 3.246040
epoch: 23 loss: 3.241708
epoch: 24 loss: 3.237409
epoch: 25 loss: 3.233144
epoch: 26 loss: 3.228911
epoch: 27 loss: 3.224712
epo

In [16]:
# Seed PyTorch's random number generators so the initial weights and the sampled names can
# be reproduced when the notebook is re-run in the same environment.
torch.manual_seed(0)

model_fancy = NextLetterFancy().to(device)

print('Model output before training:')
do_test(10, model_fancy)

train(inputs, expected, model_fancy, 500, print_loss=True)

print('Model output after training:')
do_test(10, model_fancy)

Model output before training:
0: balvjpmawdrsrgbqgklnrtmxmexhrdznfswdututkdcarqizuwpukojsl
1: zxeaaomkfy
2: dxohyhpdivplcchntlqydscgtzcuzhx
3: prjmdschgftqiwgu
4: rrw
5: obceonfneapzfjpvjgmuu
6: vvg
7: o
8: zhsn
9: jtfqvoodvuoalffmtocixmpdykcapqtfmwlyqwfix
epoch: 0 loss: 3.297643
epoch: 1 loss: 3.287173
epoch: 2 loss: 3.276818
epoch: 3 loss: 3.266499
epoch: 4 loss: 3.256229
epoch: 5 loss: 3.245959
epoch: 6 loss: 3.235652
epoch: 7 loss: 3.225281
epoch: 8 loss: 3.214813
epoch: 9 loss: 3.204230
epoch: 10 loss: 3.193515
epoch: 11 loss: 3.182645
epoch: 12 loss: 3.171602
epoch: 13 loss: 3.160381
epoch: 14 loss: 3.148987
epoch: 15 loss: 3.137416
epoch: 16 loss: 3.125685
epoch: 17 loss: 3.113778
epoch: 18 loss: 3.101738
epoch: 19 loss: 3.089610
epoch: 20 loss: 3.077421
epoch: 21 loss: 3.065210
epoch: 22 loss: 3.053015
epoch: 23 loss: 3.040875
epoch: 24 loss: 3.028873
epoch: 25 loss: 3.017064
epoch: 26 loss: 3.005491
epoch: 27 loss: 2.994216
epoch: 28 loss: 2.983295
epoch: 29 loss: 2.972780
epo

## Observations
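- The deeper NN is a _lot_ slower to train, which is expected: it has far more parameters than the single linear layer.
- Bigrams don't get you very far: the model only ever conditions on the single previous character.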
  