# Building makemore exercise

## E01
> Train a trigram language model, i.e. take two characters as an input to predict the 3rd one. Feel free to use either counting or a neural net. Evaluate the loss; Did it improve over a bigram model?

### Counting

In [1]:
from collections import defaultdict, Counter
import numpy
import torch
from matplotlib import pyplot as plt
import torch.nn.functional as F

Read in the data

In [2]:
# Load the names corpus: one entry per line of the file, with surrounding
# whitespace (including the trailing newline) stripped.
with open('../data/names.txt') as f:
    words = [line.strip() for line in f]

In [3]:
words[:10], len(words)

(['emma',
  'olivia',
  'ava',
  'isabella',
  'sophia',
  'charlotte',
  'mia',
  'amelia',
  'harper',
  'evelyn'],
 32033)

In [4]:
def generate_tripling(words):
    """Yield every run of three consecutive characters of each word.

    Each word is wrapped in a single leading and a single trailing '.'
    boundary marker before the sliding window is applied, so the first
    triple of a word is ``('.', w[0], w[1])`` and the last one ends in '.'.
    An empty word produces no triples.
    """
    for word in words:
        padded = ['.', *word, '.']
        # Three staggered views of the same list form the sliding window.
        yield from zip(padded, padded[1:], padded[2:])

In [5]:
# Vocabulary: '.' (index 0) is the word-boundary marker, then 'a'..'z' (1..26).
alphabets = '.abcdefghijklmnopqrstuvwxyz'
# Character -> integer index.
stoi = {char: idx for idx, char in enumerate(alphabets)}
# Integer index -> character (inverse of stoi).
itos = {idx: char for char, idx in stoi.items()}

In [6]:
for ch1, ch2, ch3 in generate_tripling(words[:3]): print(ch1, ch2, ch3)

. e m
e m m
m m a
m a .
. o l
o l i
l i v
i v i
v i a
i a .
. a v
a v a
v a .


In [56]:
sum(1 for ch1, ch2, ch3 in generate_tripling(words))

196113

In [7]:
def generate_tripling_counter(words):
    """Count how often each character triple occurs in ``words``.

    Returns a ``Counter`` keyed by the ``(ch1, ch2, ch3)`` tuples produced
    by ``generate_tripling``.
    """
    # Counter tallies every item of the iterable it is constructed from.
    return Counter(generate_tripling(words))

In [8]:
tripling_counter = generate_tripling_counter(words)
tripling_counter.most_common(10)

[(('a', 'h', '.'), 1714),
 (('n', 'a', '.'), 1673),
 (('a', 'n', '.'), 1509),
 (('o', 'n', '.'), 1503),
 (('.', 'm', 'a'), 1453),
 (('.', 'j', 'a'), 1255),
 (('.', 'k', 'a'), 1254),
 (('e', 'n', '.'), 1217),
 (('l', 'y', 'n'), 976),
 (('y', 'n', '.'), 953)]

In [54]:
len(tripling_counter)

6037

In [9]:
def create_matrix(corpus=None):
    """Build the 3-D tensor of trigram counts.

    Args:
        corpus: iterable of words to count. When omitted, the notebook-level
            ``words`` list is used, which is what the original
            zero-argument version always did. Passing a subset (e.g. a
            training split) counts only that subset.

    Returns:
        An int32 tensor ``N`` of shape ``(V, V, V)`` with ``V = len(stoi)``,
        where ``N[i, j, k]`` is the number of times the character with index
        ``k`` followed the character pair with indices ``(i, j)``.
    """
    source = words if corpus is None else corpus
    # Size the tensor from the vocabulary instead of a hard-coded 27.
    vocab_size = len(stoi)
    N = torch.zeros((vocab_size, vocab_size, vocab_size), dtype=torch.int32)
    for ch1, ch2, ch3 in generate_tripling(source):
        N[stoi[ch1], stoi[ch2], stoi[ch3]] += 1
    return N

In [10]:
N = create_matrix(); N.shape

torch.Size([27, 27, 27])

In [11]:
N[1, 8, 0]

tensor(1714, dtype=torch.int32)

In [12]:
# Add-one smoothing so that no triple ends up with zero probability, then
# normalise along the last axis so every P[i, j, :] sums to 1.
P = N.float() + 1
P /= P.sum(-1, keepdim=True)

In [13]:
def generate_tripling_prob(words):
    """Yield ``(ch1, ch2, ch3, prob)`` for every character triple in ``words``.

    ``prob`` is the entry of the notebook-level tensor ``P`` at the indices
    of the three characters, looked up through ``stoi``.
    """
    for triple in generate_tripling(words):
        i, j, k = (stoi[ch] for ch in triple)
        yield (*triple, P[i, j, k])

In [14]:
for ch1, ch2, ch3, prob in generate_tripling_prob(words[:3]): 
    print(ch1, ch2, ch3, prob)

. e m tensor(0.1855)
e m m tensor(0.1269)
m m a tensor(0.3744)
m a . tensor(0.0669)
. o l tensor(0.2494)
o l i tensor(0.1084)
l i v tensor(0.0219)
i v i tensor(0.2669)
v i a tensor(0.1578)
i a . tensor(0.3657)
. a v tensor(0.0550)
a v a tensor(0.1882)
v a . tensor(0.1405)


In [15]:
def generate_names(count, P):
    """Sample ``count`` names from a trigram probability tensor.

    Args:
        count: number of names to generate.
        P: tensor indexed as ``P[ix1, ix2]``; each such row is used as the
            probability vector over the next character index.

    Yields:
        Each sampled name as a string, including its terminating '.'.
    """
    # The seeded generator is passed to every multinomial call so that the
    # sampled names are reproducible. Previously it was created but unused,
    # so sampling silently fell back to the global RNG.
    # NOTE(review): torch.Generator() is a CPU generator - assumes P lives on
    # the CPU; confirm if P is ever moved to another device.
    g = torch.Generator().manual_seed(2147483647)
    for _ in range(count):
        out = []
        # Start from the ('.', '.') context.
        # NOTE(review): generate_tripling pads with a single leading '.', so
        # the pair ('.', '.') never occurs as a context in the counts; with
        # the P built in this notebook, P[0, 0] is only the add-one-smoothed
        # (uniform) row, i.e. the first character is drawn uniformly.
        ix1, ix2 = 0, 0
        while True:
            p = P[ix1, ix2]
            # Slide the two-character context window forward by one.
            ix1 = ix2
            ix2 = torch.multinomial(
                p, num_samples=1, replacement=True, generator=g
            ).item()
            out.append(itos[ix2])
            if ix2 == 0:  # index 0 is '.', the end-of-word marker
                break
        yield ''.join(out)

In [16]:
for name in generate_names(5, P): print(name)

bryemoxoniqqiibaireolist.
ennovayledilase.
olform.
fa.
ottias.


In [17]:
def log_likelihood(words):
    """Return the mean log-probability of all character triples in ``words``.

    Probabilities come from ``generate_tripling_prob``; the sum of their
    logs is divided by the number of triples seen.
    """
    total = 0
    n_triples = 0
    # Only the probability (last element of each yielded tuple) is needed.
    for *_, p in generate_tripling_prob(words):
        total += torch.log(p)
        n_triples += 1
    return total / n_triples

In [53]:
len(P)

27

In [18]:
log_likelihood(words)

tensor(-2.0927)

Negative log-likelihood (lower is better):

In [19]:
- log_likelihood(words)

tensor(2.0927)

### NN

In [20]:
def generate_training_set(words):
    """Turn ``words`` into index tensors for the neural trigram model.

    Returns:
        ``(xs, ys)`` where row ``n`` of ``xs`` holds the ``stoi`` indices of
        the first two characters of the n-th triple (shape ``(n, 2)``) and
        ``ys[n]`` holds the index of its third character.
    """
    triples = [
        tuple(stoi[ch] for ch in triple) for triple in generate_tripling(words)
    ]
    first = torch.tensor([t[0] for t in triples])
    second = torch.tensor([t[1] for t in triples])
    targets = torch.tensor([t[2] for t in triples])
    # Stack as two rows, then transpose so each example is one row.
    inputs = torch.vstack((first, second)).t()
    return inputs, targets

#### Sample dataset

In [21]:
xs, ys = generate_training_set(words[:1])

In [22]:
xs.shape, ys.shape

(torch.Size([4, 2]), torch.Size([4]))

In [23]:
xenc = F.one_hot(xs, num_classes=27)

In [24]:
xenc.shape

torch.Size([4, 2, 27])

In [25]:
xenc

tensor([[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0],
         [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0]],

        [[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0]],

        [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0]],

        [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0],
         [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0]]])

In [26]:
xenc_flattened = xenc.view(4, -1).float()

In [27]:
W = torch.randn((27*2, 27))
W

tensor([[-1.4784, -0.0288,  0.9226,  ..., -0.2990, -1.4303,  1.3789],
        [ 0.5463, -0.0881,  0.5297,  ...,  0.2599,  0.6881, -0.2659],
        [ 1.2698,  1.2695,  0.3575,  ..., -0.8792, -0.0050, -0.0139],
        ...,
        [-0.9656, -0.4642, -0.8045,  ..., -0.1527,  1.3452,  0.1447],
        [-0.3415,  0.3170, -1.6429,  ..., -1.5612, -0.4946, -1.3341],
        [-0.2823,  2.7903,  1.6254,  ..., -0.5698,  0.9835, -0.7987]])

In [28]:
logits = xenc_flattened @ W # log counts

In [29]:
counts = logits.exp()

In [30]:
prob = counts/counts.sum(1, keepdims=True)

In [31]:
prob.shape

torch.Size([4, 27])

In [32]:
ys

tensor([13, 13,  1,  0])

In [33]:
prob[0, 13], prob[1, 13], prob[2, 1], prob[3, 0]

(tensor(0.0590), tensor(0.0087), tensor(0.0375), tensor(0.0484))

In [34]:
-prob[torch.arange(4), ys].log().mean()

tensor(3.4730)

#### Train

In [89]:
xs, ys = generate_training_set(words)

In [90]:
xenc = F.one_hot(xs, num_classes=27)
xenc_flattened = xenc.view(len(xenc), -1).float()

In [91]:
xenc_flattened.dtype

torch.float32

In [92]:
xenc_flattened.shape, ys.shape

(torch.Size([196113, 54]), torch.Size([196113]))

In [93]:
def train(X, y, epochs, lr, num_classes=27):
    """Fit a single linear layer + softmax with full-batch gradient descent.

    Args:
        X: float tensor of shape ``(num_examples, num_features)``.
        y: integer tensor of shape ``(num_examples,)`` holding the target
            class index of each example (values in ``[0, num_classes)``).
        epochs: number of gradient-descent steps to take.
        lr: learning rate (step size).
        num_classes: number of output classes. Defaults to 27, the width the
            weight matrix was previously hard-coded to.

    Returns:
        The trained weight tensor of shape ``(num_features, num_classes)``,
        with ``requires_grad=True``.
    """
    num = X.shape[0]
    print(num)
    # Size the weights from the input instead of a hard-coded (54, 27), so
    # other context lengths / encodings can be trained with the same code.
    W = torch.randn((X.shape[1], num_classes), requires_grad=True)
    for i in range(epochs):
        logits = X @ W
        # cross_entropy is the mean negative log-softmax at the target index,
        # i.e. the same loss as exp/normalise/log/mean, but computed stably
        # (no overflow to inf/nan for large logits).
        loss = F.cross_entropy(logits, y)
        print(f'Epoch {i} Loss {loss}')

        W.grad = None
        loss.backward()
        # Update the leaf tensor in place without recording the step in the
        # autograd graph.
        with torch.no_grad():
            W -= lr * W.grad
    return W

In [94]:
xenc_flattened.shape[0]

196113

In [95]:
model = train(xenc_flattened, ys, 100, 50)

196113
Epoch 0 Loss 4.2755351066589355
Epoch 1 Loss 3.4453465938568115
Epoch 2 Loss 3.105201482772827
Epoch 3 Loss 2.9067018032073975
Epoch 4 Loss 2.7822177410125732
Epoch 5 Loss 2.697117805480957
Epoch 6 Loss 2.6361849308013916
Epoch 7 Loss 2.590273141860962
Epoch 8 Loss 2.554192066192627
Epoch 9 Loss 2.524949550628662
Epoch 10 Loss 2.500777244567871
Epoch 11 Loss 2.480471134185791
Epoch 12 Loss 2.4631757736206055
Epoch 13 Loss 2.448251724243164
Epoch 14 Loss 2.4352245330810547
Epoch 15 Loss 2.423734426498413
Epoch 16 Loss 2.413508176803589
Epoch 17 Loss 2.4043331146240234
Epoch 18 Loss 2.396044969558716
Epoch 19 Loss 2.388511896133423
Epoch 20 Loss 2.3816282749176025
Epoch 21 Loss 2.375307321548462
Epoch 22 Loss 2.369478940963745
Epoch 23 Loss 2.364084005355835
Epoch 24 Loss 2.3590734004974365
Epoch 25 Loss 2.3544061183929443
Epoch 26 Loss 2.3500466346740723
Epoch 27 Loss 2.3459651470184326
Epoch 28 Loss 2.3421366214752197
Epoch 29 Loss 2.3385379314422607
Epoch 30 Loss 2.335150241851

Here the loss is lower, so the trigram model is an improvement over the bigram model.

In [79]:
xenc = F.one_hot(torch.tensor([0, 0]), num_classes=27).float()

In [80]:
xenc.shape

torch.Size([2, 27])

### Prediction

In [87]:
def generate_words(count=5):
    """Sample names from the trained neural trigram model and print them.

    Reads the notebook-level weight matrix ``model`` and the index-to-char
    mapping ``itos``.

    Args:
        count: number of names to print. Defaults to 5, the value that was
            previously hard-coded.
    """
    # `model` comes back from train() with requires_grad=True; sampling needs
    # no gradients, so skip building an autograd graph on every step.
    with torch.no_grad():
        for _ in range(count):
            out = []
            # Start from the ('.', '.') context.
            # NOTE(review): the training pairs built from generate_tripling
            # never have '.' in the second position, so the weight rows for
            # that input appear to receive no gradient - the first sampled
            # character may be poorly modelled; confirm.
            ix1, ix2 = 0, 0
            while True:
                xenc = F.one_hot(torch.tensor([ix1, ix2]), num_classes=27).float()
                # Flatten the two one-hot vectors into a single input row.
                logits = xenc.view(1, -1) @ model  # predict log-counts
                p = torch.softmax(logits, dim=1)
                # Slide the two-character context window forward by one.
                ix1 = ix2
                ix2 = torch.multinomial(p, num_samples=1, replacement=True).item()
                out.append(itos[ix2])
                if ix2 == 0:  # index 0 is '.', the end-of-word marker
                    break
            print(''.join(out))

In [88]:
generate_words()

san.
udany.
arie.
en.
dansiy.


## E02
> split up the dataset randomly into 80% train set, 10% dev set, 10% test set. Train the bigram and trigram models only on the training set. Evaluate them on dev and test splits. What can you see?

In [96]:
xenc_flattened.shape[0]

196113