# Building makemore exercise

## E01
> Train a trigram language model, i.e. take two characters as an input to predict the 3rd one. Feel free to use either counting or a neural net. Evaluate the loss; Did it improve over a bigram model?

### Counting

In [78]:
from collections import defaultdict, Counter
import numpy
import torch
from matplotlib import pyplot as plt
import torch.nn.functional as F
from torch.utils.data import random_split

Read in the data

In [24]:
# Load the names corpus: one name per line, surrounding whitespace removed.
with open('../data/names.txt') as f:
    words = [line.strip() for line in f]

In [25]:
# Show the first ten names together with the total number of names loaded.
words[:10], len(words)

(['emma',
  'olivia',
  'ava',
  'isabella',
  'sophia',
  'charlotte',
  'mia',
  'amelia',
  'harper',
  'evelyn'],
 32033)

In [26]:
def generate_tripling(words):
    """Yield every consecutive character triple (trigram) of each word.

    Each word is wrapped in a single '.' marker on both sides before the
    sliding window of width three is applied, so the first triple of a word
    starts with '.' and the last one ends with '.'. A word with no characters
    produces no triples.
    """
    for word in words:
        padded = ['.', *word, '.']
        yield from zip(padded, padded[1:], padded[2:])

In [27]:
# Vocabulary: '.' (index 0) marks word boundaries, followed by a-z (1-26).
alphabets = '.abcdefghijklmnopqrstuvwxyz'
# Character -> integer index, and the inverse lookup.
stoi = {char: index for index, char in enumerate(alphabets)}
itos = {index: char for char, index in stoi.items()}

In [28]:
# Sanity check: list the trigrams produced by the first three names.
for ch1, ch2, ch3 in generate_tripling(words[:3]): print(ch1, ch2, ch3)

. e m
e m m
m m a
m a .
. o l
o l i
l i v
i v i
v i a
i a .
. a v
a v a
v a .


In [29]:
# Total number of trigrams (training examples) produced by the whole dataset.
sum(1 for ch1, ch2, ch3 in generate_tripling(words))

196113

In [30]:
def generate_tripling_counter(words):
    """Count how many times each character triple occurs in ``words``.

    Returns a ``Counter`` keyed by ``(ch1, ch2, ch3)`` tuples.
    """
    # Counter consumes the trigram stream directly; each yielded tuple is a key.
    return Counter(generate_tripling(words))

In [31]:
# Count every trigram in the dataset and show the ten most frequent ones.
tripling_counter = generate_tripling_counter(words)
tripling_counter.most_common(10)

[(('a', 'h', '.'), 1714),
 (('n', 'a', '.'), 1673),
 (('a', 'n', '.'), 1509),
 (('o', 'n', '.'), 1503),
 (('.', 'm', 'a'), 1453),
 (('.', 'j', 'a'), 1255),
 (('.', 'k', 'a'), 1254),
 (('e', 'n', '.'), 1217),
 (('l', 'y', 'n'), 976),
 (('y', 'n', '.'), 953)]

In [32]:
# Number of distinct trigrams that actually occur in the data.
len(tripling_counter)

6037

In [33]:
def create_matrix(word_list=None):
    """Build the 27x27x27 tensor of trigram counts.

    ``N[i, j, k]`` is the number of times the character with index ``k``
    follows the pair of characters with indices ``(i, j)``; characters are
    mapped to indices through ``stoi``.

    Args:
        word_list: names to count. Defaults to the module-level ``words`` so
            existing ``create_matrix()`` calls keep working; pass a subset
            (e.g. a training split) to count only those names.

    Returns:
        An int32 tensor of shape (27, 27, 27).
    """
    if word_list is None:
        word_list = words
    N = torch.zeros((27, 27, 27), dtype=torch.int32)
    # Tally in Python first so the tensor is written once per distinct
    # trigram instead of once per occurrence.
    for (ch1, ch2, ch3), n in Counter(generate_tripling(word_list)).items():
        N[stoi[ch1], stoi[ch2], stoi[ch3]] = n
    return N

In [34]:
# Count trigrams over the full dataset and confirm the tensor's shape.
N = create_matrix(); N.shape

torch.Size([27, 27, 27])

In [35]:
# Count of the trigram ('a', 'h', '.'): indices 1, 8 and 0 in `alphabets`.
N[1, 8, 0]

tensor(1714, dtype=torch.int32)

In [36]:
# Add one to every count (so no trigram has zero probability), then normalise
# over the last axis so that each P[i, j, :] sums to 1.
P = (N + 1).float()
P /= P.sum(dim=-1, keepdim=True)

In [37]:
def generate_tripling_prob(words):
    """Yield ``(ch1, ch2, ch3, prob)`` for every trigram in ``words``.

    ``prob`` is the entry of the module-level table ``P`` for that trigram,
    i.e. ``P[stoi[ch1], stoi[ch2], stoi[ch3]]``.
    """
    for trigram in generate_tripling(words):
        i, j, k = (stoi[ch] for ch in trigram)
        yield (*trigram, P[i, j, k])

In [38]:
# Show the table probability of each trigram in the first three names.
for ch1, ch2, ch3, prob in generate_tripling_prob(words[:3]): 
    print(ch1, ch2, ch3, prob)

. e m tensor(0.1855)
e m m tensor(0.1269)
m m a tensor(0.3744)
m a . tensor(0.0669)
. o l tensor(0.2494)
o l i tensor(0.1084)
l i v tensor(0.0219)
i v i tensor(0.2669)
v i a tensor(0.1578)
i a . tensor(0.3657)
. a v tensor(0.0550)
a v a tensor(0.1882)
v a . tensor(0.1405)


In [39]:
def generate_names(count, P, seed=2147483647):
    """Sample ``count`` names from a trigram probability table.

    Args:
        count: number of names to generate.
        P: tensor indexed as ``P[ix1, ix2]`` that gives the sampling weights
            for the next character index given the previous two indices.
        seed: seed for the private random generator, so repeated calls
            produce the same names.

    Yields:
        Each sampled name as a string, including the terminating '.'.
    """
    # The generator must be handed to torch.multinomial explicitly; merely
    # creating it does not make the sampling reproducible.
    g = torch.Generator().manual_seed(seed)
    for _ in range(count):
        out = []
        # Start from the boundary context ('.', '.').
        # NOTE(review): generate_tripling pads each word with a single '.',
        # so a table built from it never observes this context and the first
        # character is drawn from the smoothing term alone - confirm intended.
        ix1, ix2 = 0, 0
        while True:
            p = P[ix1, ix2]
            ix1 = ix2
            ix2 = torch.multinomial(
                p, num_samples=1, replacement=True, generator=g
            ).item()
            out.append(itos[ix2])
            if ix2 == 0:  # sampled the end-of-word marker
                break
        yield ''.join(out)

In [40]:
# Sample five names from the count-based trigram table.
for name in generate_names(5, P): print(name)

adreyonth.
paow.
hair.
zykkley.
weighlyah.


In [41]:
def log_likelihood(words):
    """Return the mean log-probability of all trigrams in ``words``.

    Probabilities come from ``generate_tripling_prob``; the result is the sum
    of their logs divided by the number of trigrams (an average, not a total).
    """
    log_probs = [torch.log(prob) for *_, prob in generate_tripling_prob(words)]
    return sum(log_probs) / len(log_probs)

In [42]:
# Size of P's first dimension (one slice per possible first character).
len(P)

27

In [43]:
# Mean log-likelihood of the count-based trigram model over the whole dataset.
log_likelihood(words)

tensor(-2.0927)

Negative log-likelihood (the mean log-likelihood above, negated), which is the loss of the counting model:

In [44]:
# Negate the mean log-likelihood to get a loss where lower is better.
- log_likelihood(words)

tensor(2.0927)

### NN

In [45]:
def generate_training_set(words):
    """Encode every trigram in ``words`` as integer inputs and a target.

    Returns:
        xs: tensor with one row per trigram holding the ``stoi`` indices of
            the two context characters.
        ys: tensor with the ``stoi`` index of the character that follows
            each context pair.
    """
    first_ix, second_ix, target_ix = [], [], []
    for trigram in generate_tripling(words):
        a, b, c = (stoi[ch] for ch in trigram)
        first_ix.append(a)
        second_ix.append(b)
        target_ix.append(c)
    # Pair up the two context columns row by row: shape (num_trigrams, 2).
    xs = torch.stack((torch.tensor(first_ix), torch.tensor(second_ix)), dim=1)
    ys = torch.tensor(target_ix)
    return xs, ys

#### Sample dataset

In [46]:
# Build a tiny example set from the first name only, to walk through the maths.
xs, ys = generate_training_set(words[:1])

In [47]:
# Check the shapes of the inputs (two indices per example) and the targets.
xs.shape, ys.shape

(torch.Size([4, 2]), torch.Size([4]))

In [48]:
# One-hot encode both context characters over the 27-symbol vocabulary.
xenc = F.one_hot(xs, num_classes=27)

In [49]:
# Shape is (examples, context characters, vocabulary size).
xenc.shape

torch.Size([4, 2, 27])

In [50]:
# Inspect the raw one-hot vectors: a single 1 per context character.
xenc

tensor([[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0],
         [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0]],

        [[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0]],

        [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0]],

        [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0],
         [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0]]])

In [51]:
# Lay the two one-hot vectors of each of the 4 examples side by side as one
# float row, so a single weight matrix can consume both context characters.
xenc_flattened = xenc.view(4, -1).float()

In [52]:
# Random initial weights: 54 inputs (two one-hot vectors) -> 27 output scores.
W = torch.randn((27*2, 27))
W

tensor([[-0.7410,  0.5983, -0.0857,  ..., -0.5280, -1.9061,  0.5682],
        [ 0.9326,  0.3938,  1.8656,  ..., -1.4538,  0.0360,  0.9010],
        [ 0.2878,  0.0030, -1.1351,  ..., -1.6832,  0.2385, -1.9534],
        ...,
        [-0.3587, -1.5526,  0.6228,  ...,  0.4512,  0.5258, -0.4301],
        [ 0.0477,  1.0556, -0.4355,  ...,  1.2603, -0.7227, -1.9504],
        [-0.6375, -0.0029, -0.4922,  ..., -0.0877,  0.9230,  1.1457]])

In [53]:
# Forward pass: one score per candidate next character for every example.
logits = xenc_flattened @ W # log counts

In [54]:
# Exponentiate so every score becomes a positive pseudo-count.
counts = logits.exp()

In [55]:
# Normalise each row to sum to 1; together with exp() above this is a softmax.
prob = counts/counts.sum(1, keepdims=True)

In [56]:
# One probability distribution over 27 characters per example.
prob.shape

torch.Size([4, 27])

In [57]:
# Target character indices for the example set.
ys

tensor([13, 13,  1,  0])

In [58]:
# Probability the untrained weights assign to the correct next character of
# each example (column indices are the targets in `ys`).
prob[0, 13], prob[1, 13], prob[2, 1], prob[3, 0]

(tensor(0.0352), tensor(0.0554), tensor(0.0118), tensor(0.0079))

In [59]:
# Loss for the example set: mean negative log-probability of the targets.
-prob[torch.arange(4), ys].log().mean()

tensor(3.8801)

#### Train

In [60]:
# Rebuild the inputs and targets, this time from every name in the dataset.
xs, ys = generate_training_set(words)

In [61]:
# First example: the two context indices and the target index.
xs[0], ys[0]

(tensor([0, 5]), tensor(13))

In [62]:
# One-hot encode both context characters of every example, then join the two
# 27-wide vectors of each example into a single float row.
xenc = F.one_hot(xs, num_classes=27)
xenc_flattened = xenc.view(xenc.shape[0], -1).float()

In [63]:
# Confirm the inputs are floats, as required for the matrix multiply with W.
xenc_flattened.dtype

torch.float32

In [64]:
# One 54-wide input row and one target per trigram.
xenc_flattened.shape, ys.shape

(torch.Size([196113, 54]), torch.Size([196113]))

In [65]:
def train(X, y, epochs, lr, num_classes=27):
    """Fit a single linear layer + softmax by full-batch gradient descent.

    Args:
        X: float tensor of shape (num_examples, num_features), e.g. the
            flattened one-hot encodings of the context characters.
        y: integer tensor of shape (num_examples,) holding the target class
            index of each example.
        epochs: number of gradient steps; every step uses all examples.
        lr: learning rate.
        num_classes: number of output classes (27 = '.' plus a-z).

    Returns:
        The trained weight tensor of shape (num_features, num_classes),
        still flagged ``requires_grad=True``.
    """
    num, num_features = X.shape
    print(num)
    # Size the weights from the data instead of hard-coding 54 inputs, so the
    # same loop trains a bigram (27 inputs) or a trigram (54 inputs) model.
    W = torch.randn((num_features, num_classes), requires_grad=True)
    for i in range(epochs):
        logits = X @ W
        # Mean negative log-likelihood of the targets under softmax(logits).
        # cross_entropy works in log space, so large logits cannot overflow
        # the way an explicit exp() can.
        loss = F.cross_entropy(logits, y)
        print(f'Epoch {i} Loss {loss}')

        W.grad = None
        loss.backward()
        # Plain gradient-descent step, kept out of the autograd graph.
        with torch.no_grad():
            W -= lr * W.grad
    return W

In [66]:
# Number of training examples (rows) in the encoded dataset.
xenc_flattened.shape[0]

196113

In [67]:
# Train on the full dataset: 100 epochs with a learning rate of 50.
model = train(xenc_flattened, ys, 100, 50)

196113
Epoch 0 Loss 3.99216365814209
Epoch 1 Loss 3.3084821701049805
Epoch 2 Loss 3.0252816677093506
Epoch 3 Loss 2.855762481689453
Epoch 4 Loss 2.7456722259521484
Epoch 5 Loss 2.6689293384552
Epoch 6 Loss 2.613008975982666
Epoch 7 Loss 2.570136547088623
Epoch 8 Loss 2.5359742641448975
Epoch 9 Loss 2.507956027984619
Epoch 10 Loss 2.484590768814087
Epoch 11 Loss 2.4648184776306152
Epoch 12 Loss 2.4479176998138428
Epoch 13 Loss 2.4333159923553467
Epoch 14 Loss 2.4205989837646484
Epoch 15 Loss 2.4094254970550537
Epoch 16 Loss 2.399540424346924
Epoch 17 Loss 2.3907299041748047
Epoch 18 Loss 2.3828296661376953
Epoch 19 Loss 2.3757007122039795
Epoch 20 Loss 2.369234085083008
Epoch 21 Loss 2.363337755203247
Epoch 22 Loss 2.357937812805176
Epoch 23 Loss 2.352971315383911
Epoch 24 Loss 2.3483874797821045
Epoch 25 Loss 2.344142436981201
Epoch 26 Loss 2.3401999473571777
Epoch 27 Loss 2.3365275859832764
Epoch 28 Loss 2.3331000804901123
Epoch 29 Loss 2.329892873764038
Epoch 30 Loss 2.32688617706298

The loss here is lower than the bigram model's loss, so the trigram model is an improvement over the bigram model.

In [68]:
# One-hot encode the start context used for sampling: index 0 ('.') twice.
xenc = F.one_hot(torch.tensor([0, 0]), num_classes=27).float()

In [69]:
# Two one-hot rows of width 27; they are flattened to one row of 54 before use.
xenc.shape

torch.Size([2, 27])

### Prediction

In [70]:
def generate_words(count=5):
    """Sample names from the trained trigram network and print them.

    Uses the module-level weight matrix ``model`` and the ``itos`` lookup.
    Each name is printed with its terminating '.'.

    Args:
        count: number of names to sample (defaults to the original 5).
    """
    # Sampling needs no gradients; `model` has requires_grad=True, so without
    # this every forward pass would build an autograd graph.
    with torch.no_grad():
        for _ in range(count):
            out = []
            # Start from the boundary context ('.', '.').
            # NOTE(review): the training data never has '.' as the second
            # context character, so the weights for that input are never
            # updated and the first sampled letter depends on their random
            # initialisation - confirm intended.
            ix1, ix2 = 0, 0
            while True:
                xenc = F.one_hot(torch.tensor([ix1, ix2]), num_classes=27).float()
                logits = xenc.view(1, -1) @ model  # predict log-counts
                p = F.softmax(logits, dim=1)
                ix1 = ix2
                ix2 = torch.multinomial(p, num_samples=1, replacement=True).item()
                out.append(itos[ix2])
                if ix2 == 0:  # sampled the end-of-word marker
                    break
            print(''.join(out))

In [71]:
# Print five names sampled from the trained network.
generate_words()

om.
yaliany.
la.
unk.
yna.


## E02
> split up the dataset randomly into 80% train set, 10% dev set, 10% test set. Train the bigram and trigram models only on the training set. Evaluate them on dev and test splits. What can you see?

In [87]:
# Total number of encoded examples; the split below partitions these rows.
xenc_num = xenc_flattened.shape[0]

In [98]:
# The row indices 0..xenc_num-1 that will be shuffled into the three splits.
range(xenc_num)

range(0, 196113)

In [105]:
# Randomly partition the example row indices into 10% test, 10% validation
# and 80% training, with a fixed seed for reproducibility. The split is over
# individual trigram examples, not over whole names.
test_subset, valid_subset, train_subset = random_split(range(xenc_num), [0.1, 0.1, 0.8], 
                          generator=torch.Generator().manual_seed(42))

In [106]:
# random_split was given range(xenc_num), so each subset's `indices` are the
# selected row numbers themselves; turn them into index tensors.
train_idx = torch.tensor(train_subset.indices)
valid_idx = torch.tensor(valid_subset.indices)
test_idx = torch.tensor(test_subset.indices)

In [107]:
# Sizes of the three splits.
train_idx.shape, valid_idx.shape, test_idx.shape

(torch.Size([156890]), torch.Size([19611]), torch.Size([19612]))

In [111]:
# Select the encoded inputs and targets that belong to each split.
x_train, y_train = xenc_flattened[train_idx], ys[train_idx]
x_valid, y_valid = xenc_flattened[valid_idx], ys[valid_idx]
x_test, y_test = xenc_flattened[test_idx], ys[test_idx]

In [135]:
# Train the trigram model on the training split only: 100 epochs, learning rate 10.
model_trigram = train(x_train, y_train, 100, 10)

156890
Epoch 0 Loss 4.207601547241211
Epoch 1 Loss 4.009314060211182
Epoch 2 Loss 3.8391053676605225
Epoch 3 Loss 3.690490961074829
Epoch 4 Loss 3.560753107070923
Epoch 5 Loss 3.448688507080078
Epoch 6 Loss 3.3531737327575684
Epoch 7 Loss 3.272313117980957
Epoch 8 Loss 3.203514337539673
Epoch 9 Loss 3.1441755294799805
Epoch 10 Loss 3.0922048091888428
Epoch 11 Loss 3.0461015701293945
Epoch 12 Loss 3.0048084259033203
Epoch 13 Loss 2.9675512313842773
Epoch 14 Loss 2.933732509613037
Epoch 15 Loss 2.902869939804077
Epoch 16 Loss 2.8745672702789307
Epoch 17 Loss 2.8484928607940674
Epoch 18 Loss 2.8243699073791504
Epoch 19 Loss 2.801967144012451
Epoch 20 Loss 2.781090259552002
Epoch 21 Loss 2.7615761756896973
Epoch 22 Loss 2.7432875633239746
Epoch 23 Loss 2.726107358932495
Epoch 24 Loss 2.7099366188049316
Epoch 25 Loss 2.694688081741333
Epoch 26 Loss 2.6802890300750732
Epoch 27 Loss 2.666672945022583
Epoch 28 Loss 2.653782844543457
Epoch 29 Loss 2.6415672302246094
Epoch 30 Loss 2.629980325698

Loss on the dev set

In [133]:
# Dev-set loss: mean negative log-probability that the trained trigram
# weights assign to the correct next character.
# NOTE(review): this cell's execution count (133) is lower than the training
# cell's (135), so the recorded output may belong to an earlier
# model_trigram - re-run to confirm.
logits_valid = x_valid @ model_trigram
counts_valid = torch.exp(logits_valid)
pred_valid = counts_valid / counts_valid.sum(dim=1, keepdim=True)
-torch.log(pred_valid[torch.arange(x_valid.shape[0]), y_valid]).mean()

tensor(2.3278, grad_fn=<NegBackward0>)

Loss on the test set

In [134]:
# Test-set loss, computed the same way as the dev-set loss.
# NOTE(review): this cell's execution count (134) is lower than the training
# cell's (135), so the recorded output may belong to an earlier
# model_trigram - re-run to confirm.
logits_test = x_test @ model_trigram
counts_test = torch.exp(logits_test)
pred_test = counts_test / counts_test.sum(dim=1, keepdim=True)
-torch.log(pred_test[torch.arange(x_test.shape[0]), y_test]).mean()

tensor(2.3411, grad_fn=<NegBackward0>)

The losses on the validation and test sets are about the same as each other, and comparable to the loss of the trigram model on the training set.

The same holds for the bigram model, which I computed in `nbs/lecture_notes/02_building_makemore.ipynb`: its validation and test losses are about the same as its training loss.

The trigram model is better than the bigram model (it reaches a lower loss).