# Building makemore exercise

## E01
> Train a trigram language model, i.e. take two characters as an input to predict the 3rd one. Feel free to use either counting or a neural net. Evaluate the loss; Did it improve over a bigram model?

### Counting

In [1]:
from collections import defaultdict, Counter
import numpy
import torch
from matplotlib import pyplot as plt
import torch.nn.functional as F
from torch.utils.data import random_split

Read in the data

In [2]:
# Load the names file: one entry per line, with surrounding whitespace removed.
with open('../data/names.txt') as f:
    words = [line.strip() for line in f]

In [3]:
words[:10], len(words)

(['emma',
  'olivia',
  'ava',
  'isabella',
  'sophia',
  'charlotte',
  'mia',
  'amelia',
  'harper',
  'evelyn'],
 32033)

In [4]:
def generate_tripling(words):
    """Yield every run of three consecutive characters in each word.

    Each word is wrapped in a single '.' marker on both sides before the
    three-wide window slides over it, so the first triple of a word is
    ('.', first_char, second_char) and the last triple ends with '.'.
    An empty word yields nothing.

    Note: because only one leading '.' is added, the first character of a
    word only ever appears as context, never as the third (predicted) item.
    """
    for word in words:
        padded = ['.', *word, '.']
        yield from zip(padded, padded[1:], padded[2:])

In [5]:
# Vocabulary: '.' (index 0) is the word-boundary marker, followed by a-z.
alphabets = '.abcdefghijklmnopqrstuvwxyz'
# character -> integer index
stoi = {ch: idx for idx, ch in enumerate(alphabets)}
# integer index -> character
itos = {idx: ch for ch, idx in stoi.items()}

In [6]:
for ch1, ch2, ch3 in generate_tripling(words[:3]): print(ch1, ch2, ch3)

. e m
e m m
m m a
m a .
. o l
o l i
l i v
i v i
v i a
i a .
. a v
a v a
v a .


In [7]:
sum(1 for ch1, ch2, ch3 in generate_tripling(words))

196113

In [8]:
def generate_tripling_counter(words):
    """Count how often each (ch1, ch2, ch3) triple occurs in `words`.

    Returns a Counter keyed by the 3-tuples produced by generate_tripling.
    """
    return Counter(generate_tripling(words))

In [9]:
tripling_counter = generate_tripling_counter(words)
tripling_counter.most_common(10)

[(('a', 'h', '.'), 1714),
 (('n', 'a', '.'), 1673),
 (('a', 'n', '.'), 1509),
 (('o', 'n', '.'), 1503),
 (('.', 'm', 'a'), 1453),
 (('.', 'j', 'a'), 1255),
 (('.', 'k', 'a'), 1254),
 (('e', 'n', '.'), 1217),
 (('l', 'y', 'n'), 976),
 (('y', 'n', '.'), 953)]

In [10]:
len(tripling_counter)

6037

In [11]:
def create_matrix():
    """Build the 27x27x27 int32 tensor of trigram counts for the global `words`.

    Entry [i, j, k] is the number of triples whose three characters map,
    through `stoi`, to the indices i, j and k respectively.
    """
    counts = torch.zeros((27, 27, 27), dtype=torch.int32)
    for triple in generate_tripling(words):
        i, j, k = (stoi[ch] for ch in triple)
        counts[i, j, k] += 1
    return counts

In [12]:
N = create_matrix(); N.shape

torch.Size([27, 27, 27])

In [13]:
N[1, 8, 0]

tensor(1714, dtype=torch.int32)

In [14]:
# Add one to every count so no triple ends up with zero probability, then
# normalise along the last axis so each (ch1, ch2) slice sums to 1.
P = (N + 1).float()
P /= P.sum(dim=-1, keepdim=True)

In [15]:
def generate_tripling_prob(words):
    """Yield each triple of `words` together with its probability under `P`.

    Yields:
        (ch1, ch2, ch3, prob), where prob is the 0-d tensor
        P[stoi[ch1], stoi[ch2], stoi[ch3]].
    """
    for triple in generate_tripling(words):
        idx = tuple(stoi[ch] for ch in triple)
        yield (*triple, P[idx])

In [16]:
for ch1, ch2, ch3, prob in generate_tripling_prob(words[:3]): 
    print(ch1, ch2, ch3, prob)

. e m tensor(0.1855)
e m m tensor(0.1269)
m m a tensor(0.3744)
m a . tensor(0.0669)
. o l tensor(0.2494)
o l i tensor(0.1084)
l i v tensor(0.0219)
i v i tensor(0.2669)
v i a tensor(0.1578)
i a . tensor(0.3657)
. a v tensor(0.0550)
a v a tensor(0.1882)
v a . tensor(0.1405)


In [17]:
def generate_names(count, P):
    """Sample `count` names from the trigram probability table `P`.

    Args:
        count: number of names to generate.
        P: tensor indexed as P[ix1, ix2], giving the distribution over the
            next character index for the two-character context (ix1, ix2).

    Yields:
        Each sampled name as a string, including the terminating '.'.
    """
    # Fixed seed. The generator is passed to the sampler (it was previously
    # created but never used), so repeated calls produce the same names.
    g = torch.Generator(device=P.device).manual_seed(2147483647)
    for _ in range(count):
        out = []
        # Start from the ('.', '.') context.
        # NOTE(review): generate_tripling pads with a single leading '.', so
        # this context presumably has no observed counts and the first
        # character is drawn from whatever P[0, 0] holds after smoothing.
        ix1, ix2 = 0, 0
        while True:
            p = P[ix1, ix2]
            ix1 = ix2
            ix2 = torch.multinomial(
                p, num_samples=1, replacement=True, generator=g
            ).item()
            out.append(itos[ix2])
            if ix2 == 0:  # sampled the end-of-word marker
                break
        yield ''.join(out)

In [18]:
for name in generate_names(5, P): print(name)

juandswxedilyiahj.
iahmoja.
nise.
zie.
bro.


In [19]:
def log_likelihood(words):
    """Return the mean log-probability of the triples in `words` under `P`.

    Raises ZeroDivisionError when `words` produces no triples.
    """
    log_probs = [torch.log(prob) for *_, prob in generate_tripling_prob(words)]
    return sum(log_probs) / len(log_probs)

In [20]:
len(P)

27

In [21]:
log_likelihood(words)

tensor(-2.0927)

Negative log-likelihood

In [22]:
- log_likelihood(words)

tensor(2.0927)

### NN

In [23]:
def generate_training_set(words):
    """Turn `words` into index tensors for the trigram neural net.

    Returns:
        xs: tensor with one row per triple, holding the `stoi` indices of the
            two context characters.
        ys: 1-D tensor holding the `stoi` index of the character to predict.
    """
    encoded = [
        (stoi[a], stoi[b], stoi[c]) for a, b, c in generate_tripling(words)
    ]
    first = torch.tensor([row[0] for row in encoded])
    second = torch.tensor([row[1] for row in encoded])
    # Stack as (2, n), then transpose so each row is one (ch1, ch2) context.
    xs = torch.vstack((first, second)).T
    ys = torch.tensor([row[2] for row in encoded])
    return xs, ys

#### Sample dataset

In [24]:
xs, ys = generate_training_set(words[:1])

In [25]:
xs.shape, ys.shape

(torch.Size([4, 2]), torch.Size([4]))

In [26]:
xenc = F.one_hot(xs, num_classes=27)

In [27]:
xenc.shape

torch.Size([4, 2, 27])

In [28]:
xenc

tensor([[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0],
         [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0]],

        [[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0]],

        [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0]],

        [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0],
         [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0]]])

In [29]:
xenc_flattened = xenc.view(4, -1).float()

In [30]:
W = torch.randn((27*2, 27))
W

tensor([[-0.6339,  1.0934,  1.5940,  ..., -0.5997, -0.3661, -1.5327],
        [ 0.2762,  0.7459, -1.6092,  ...,  0.9993,  0.0276, -0.5299],
        [ 0.8111, -0.1221, -0.3052,  ...,  0.4578,  0.6346, -0.9694],
        ...,
        [ 0.0223,  0.0072, -0.8025,  ..., -0.2540, -2.2072,  0.5459],
        [-2.1170, -0.1290, -0.9017,  ...,  0.8187,  2.0031,  1.3627],
        [ 1.4702, -1.0431,  0.4211,  ..., -0.3957, -0.2581, -0.0458]])

In [31]:
logits = xenc_flattened @ W # log counts

In [32]:
counts = logits.exp()

In [33]:
prob = counts/counts.sum(1, keepdims=True)

In [34]:
prob.shape

torch.Size([4, 27])

In [35]:
ys

tensor([13, 13,  1,  0])

In [36]:
prob[0, 13], prob[1, 13], prob[2, 1], prob[3, 0]

(tensor(0.0405), tensor(0.0056), tensor(0.0850), tensor(0.0354))

In [37]:
-prob[torch.arange(4), ys].log().mean()

tensor(3.5515)

#### Train

In [38]:
xs, ys = generate_training_set(words)

In [39]:
xs[0], ys[0]

(tensor([0, 5]), tensor(13))

In [40]:
xenc = F.one_hot(xs, num_classes=27)
xenc_flattened = xenc.view(len(xenc), -1).float()

In [41]:
xenc_flattened.dtype

torch.float32

In [42]:
xenc_flattened.shape, ys.shape

(torch.Size([196113, 54]), torch.Size([196113]))

In [43]:
def train(X, y, epochs, lr, num_classes=27):
    """Fit a single linear layer + softmax with full-batch gradient descent.

    Args:
        X: 2-D float tensor with one row per example, e.g. the flattened
            one-hot encoding of the two context characters.
        y: integer tensor with one target class index per example.
        epochs: number of gradient-descent steps.
        lr: learning rate.
        num_classes: number of output classes (defaults to 27).

    Returns:
        The trained weight tensor of shape (X.shape[1], num_classes), with
        requires_grad=True.
    """
    num, num_features = X.shape
    print(num)
    # Size the weights from the input instead of hard-coding 54 rows.
    W = torch.randn((num_features, num_classes), requires_grad=True)
    rows = torch.arange(num)
    for i in range(epochs):
        logits = X @ W
        # log_softmax is the numerically stable form of
        # log(exp(logits) / exp(logits).sum(1)), which overflows for large logits.
        log_prob = torch.log_softmax(logits, dim=1)
        loss = -log_prob[rows, y].mean()
        print(f'Epoch {i} Loss {loss}')

        W.grad = None
        loss.backward()
        # Apply the step without recording it in the autograd graph.
        with torch.no_grad():
            W -= lr * W.grad
    return W

In [44]:
xenc_flattened.shape[0]

196113

In [45]:
model = train(xenc_flattened, ys, 100, 50)

196113
Epoch 0 Loss 4.214053630828857
Epoch 1 Loss 3.3863525390625
Epoch 2 Loss 3.074662923812866
Epoch 3 Loss 2.9096202850341797
Epoch 4 Loss 2.7980833053588867
Epoch 5 Loss 2.719160556793213
Epoch 6 Loss 2.658668279647827
Epoch 7 Loss 2.6116385459899902
Epoch 8 Loss 2.5733280181884766
Epoch 9 Loss 2.5419392585754395
Epoch 10 Loss 2.5155203342437744
Epoch 11 Loss 2.493177652359009
Epoch 12 Loss 2.473928689956665
Epoch 13 Loss 2.4572641849517822
Epoch 14 Loss 2.4426419734954834
Epoch 15 Loss 2.429755687713623
Epoch 16 Loss 2.418288230895996
Epoch 17 Loss 2.4080440998077393
Epoch 18 Loss 2.3988263607025146
Epoch 19 Loss 2.390503406524658
Epoch 20 Loss 2.3829457759857178
Epoch 21 Loss 2.376060724258423
Epoch 22 Loss 2.3697593212127686
Epoch 23 Loss 2.3639745712280273
Epoch 24 Loss 2.358642816543579
Epoch 25 Loss 2.353714942932129
Epoch 26 Loss 2.3491451740264893
Epoch 27 Loss 2.344896078109741
Epoch 28 Loss 2.3409347534179688
Epoch 29 Loss 2.3372325897216797
Epoch 30 Loss 2.3337647914886

Here the loss is lower, so the trigram model is an improvement over the bigram model.

In [46]:
xenc = F.one_hot(torch.tensor([0, 0]), num_classes=27).float()

In [47]:
xenc.shape

torch.Size([2, 27])

### Prediction

In [48]:
def generate_words():
    """Print five words sampled from the trained trigram weights `model`.

    Each word starts from the context (0, 0) and grows one character at a
    time until index 0 (the '.' marker) is sampled; the marker is included
    in the printed string.
    """
    for _ in range(5):
        chars = []
        prev, cur = 0, 0
        finished = False
        while not finished:
            context = torch.tensor([prev, cur])
            encoded = F.one_hot(context, num_classes=27).float().view(1, -1)
            scores = (encoded @ model).exp()  # exponentiated log-counts
            p = scores / scores.sum(1, keepdims=True)
            sampled = torch.multinomial(p, num_samples=1, replacement=True).item()
            prev, cur = cur, sampled
            chars.append(itos[cur])
            finished = cur == 0
        print(''.join(chars))

In [49]:
generate_words()

arlya.
ilane.
imbzaracirvileito.
uren.
aarvan.


## E02
> split up the dataset randomly into 80% train set, 10% dev set, 10% test set. Train the bigram and trigram models only on the training set. Evaluate them on dev and test splits. What can you see?

In [50]:
xenc_num = xenc_flattened.shape[0]

In [51]:
range(xenc_num)

range(0, 196113)

In [52]:
test_subset, valid_subset, train_subset = random_split(range(xenc_num), [0.1, 0.1, 0.8], 
                          generator=torch.Generator().manual_seed(42))

In [53]:
# Each Subset wraps range(xenc_num), so the item at position i equals the
# stored index itself. Read `.indices` directly rather than having
# torch.tensor walk the Subset one __getitem__ call at a time.
train_idx = torch.tensor(train_subset.indices)
valid_idx = torch.tensor(valid_subset.indices)
test_idx = torch.tensor(test_subset.indices)

In [54]:
train_idx.shape, valid_idx.shape, test_idx.shape

(torch.Size([156890]), torch.Size([19611]), torch.Size([19612]))

In [55]:
x_train, y_train = xenc_flattened[train_idx], ys[train_idx]
x_valid, y_valid = xenc_flattened[valid_idx], ys[valid_idx]
x_test, y_test = xenc_flattened[test_idx], ys[test_idx]

In [56]:
model_trigram = train(x_train, y_train, 100, 10)

156890
Epoch 0 Loss 4.166134357452393
Epoch 1 Loss 3.9657092094421387
Epoch 2 Loss 3.7897253036499023
Epoch 3 Loss 3.637380838394165
Epoch 4 Loss 3.5074849128723145
Epoch 5 Loss 3.3979551792144775
Epoch 6 Loss 3.305875539779663
Epoch 7 Loss 3.228022575378418
Epoch 8 Loss 3.161454677581787
Epoch 9 Loss 3.1038119792938232
Epoch 10 Loss 3.053314685821533
Epoch 11 Loss 3.0086400508880615
Epoch 12 Loss 2.9687905311584473
Epoch 13 Loss 2.932997941970825
Epoch 14 Loss 2.900658130645752
Epoch 15 Loss 2.8712871074676514
Epoch 16 Loss 2.8444929122924805
Epoch 17 Loss 2.8199524879455566
Epoch 18 Loss 2.797395944595337
Epoch 19 Loss 2.776595115661621
Epoch 20 Loss 2.757355213165283
Epoch 21 Loss 2.7395074367523193
Epoch 22 Loss 2.7229044437408447
Epoch 23 Loss 2.7074191570281982
Epoch 24 Loss 2.6929397583007812
Epoch 25 Loss 2.679368257522583
Epoch 26 Loss 2.6666200160980225
Epoch 27 Loss 2.6546199321746826
Epoch 28 Loss 2.6433022022247314
Epoch 29 Loss 2.6326088905334473
Epoch 30 Loss 2.622488975

Loss on the dev set

In [57]:
def evaluate_loss(model, x, y):
    """Return the mean negative log-likelihood of targets `y` under `model`.

    Args:
        model: weight tensor; `x @ model` gives one row of logits per example.
        x: input tensor with one row per example.
        y: integer tensor of target class indices, one per example.

    Returns:
        A 0-d tensor holding the loss, with no autograd graph attached.
    """
    # Evaluation only: do not build a graph even if `model` requires grad.
    with torch.no_grad():
        logits = x @ model
        # Stable equivalent of log(exp(logits) / exp(logits).sum(1)).
        log_pred = torch.log_softmax(logits, dim=1)
        return -log_pred[torch.arange(x.shape[0]), y].mean()

In [59]:
evaluate_loss(model_trigram, x_valid, y_valid)

tensor(2.3818, grad_fn=<NegBackward0>)

Loss on the test set

In [60]:
evaluate_loss(model_trigram, x_test, y_test)

tensor(2.3922, grad_fn=<NegBackward0>)

For the trigram model, the losses on the validation and test sets are about the same as the loss on the training set.

The same holds for the bigram model that I computed in `nbs/lecture_notes/02_building_makemore.ipynb`: the validation and test sets have about the same losses as the training set.

The trigram model is better than the bigram model.

## E03
> Use the dev set to tune the strength of smoothing (or regularization) for the trigram model - i.e. try many possibilities and see which one works best based on the dev set loss. What patterns can you see in the train and dev set loss as you tune this strength? Take the best setting of the smoothing and evaluate on the test set once and at the end. How good of a loss do you achieve?

In [61]:
def train(X, y, epochs, lr, regularization_param, print_at_every_epoch=False,
          num_classes=27):
    """Fit a linear layer + softmax with an L2 penalty on the weights.

    Args:
        X: 2-D float tensor with one row per example.
        y: integer tensor with one target class index per example.
        epochs: number of full-batch gradient-descent steps.
        lr: learning rate.
        regularization_param: multiplier on the sum of squared weights that
            is added to the loss. The penalty is a sum, not a mean, so its
            scale grows with the number of weights.
        print_at_every_epoch: when True, print the loss at every step;
            otherwise print only the last computed loss.
        num_classes: number of output classes (defaults to 27).

    Returns:
        The trained weight tensor of shape (X.shape[1], num_classes), with
        requires_grad=True.
    """
    num, num_features = X.shape
    # Size the weights from the input instead of hard-coding 54 rows.
    W = torch.randn((num_features, num_classes), requires_grad=True)
    rows = torch.arange(num)
    loss = None
    for i in range(epochs):
        logits = X @ W
        # Numerically stable form of log(exp(logits) / exp(logits).sum(1)).
        log_prob = torch.log_softmax(logits, dim=1)
        data_loss = -log_prob[rows, y].mean()

        # regularization
        regularization_loss = regularization_param * (W ** 2).sum()
        loss = data_loss + regularization_loss

        if print_at_every_epoch:
            print(f'Epoch {i} Loss {loss}')

        W.grad = None
        loss.backward()
        # Apply the step without recording it in the autograd graph.
        with torch.no_grad():
            W -= lr * W.grad

    # The original ended with a body-less `if not print_at_every_epoch:`,
    # which is a syntax error.
    # NOTE(review): assumed intent is a one-line summary when per-epoch
    # printing is off. This is the loss computed before the final update.
    if not print_at_every_epoch and loss is not None:
        print(f'Final Loss {loss}')
    return W

In [None]:
train(x_train, y_train, 100, 10, 0)