# Building makemore exercise

1. Train a trigram language model, i.e. take two characters as input to predict the third one. Feel free to use either counting or a neural net. Evaluate the loss: did it improve over a bigram model?

### Counting

In [4]:
from collections import defaultdict, Counter
import numpy
import torch
from matplotlib import pyplot as plt

Read in the data

In [5]:
# Load one name per line, stripping surrounding whitespace (including the newline).
with open('../data/names.txt') as f:
    words = [line.strip() for line in f]

In [6]:
# Peek at the first ten entries of the loaded list.
words[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [7]:
def generate_tripling(words):
    """Yield every run of three consecutive characters for each word.

    Each word is wrapped with a single '.' marker on both sides before the
    sliding window is applied, so a word of length n produces n triples
    (an empty word produces none).

    Note that only one start marker is prepended: the first triple of a word
    is ('.', first_char, second_char_or_'.'), and no triple has the word's
    first character in the third position.

    Args:
        words: iterable of strings.

    Yields:
        Tuples ``(ch1, ch2, ch3)`` of single-character strings.
    """
    boundary = '.'
    for word in words:
        padded = [boundary, *word, boundary]
        # Three staggered views of the same sequence form the sliding window.
        yield from zip(padded, padded[1:], padded[2:])

In [14]:
# Vocabulary: the '.' boundary marker at index 0 followed by a-z at 1..26.
alphabets = '.abcdefghijklmnopqrstuvwxyz'
# Character -> integer index.
stoi = {char: idx for idx, char in enumerate(alphabets)}
# Integer index -> character (inverse of stoi).
itos = {idx: char for char, idx in stoi.items()}

In [15]:
# Show the character triples produced for the first three names.
for triple in generate_tripling(words[:3]):
    print(*triple)

. e m
e m m
m m a
m a .
. o l
o l i
l i v
i v i
v i a
i a .
. a v
a v a
v a .


In [27]:
def generate_tripling_counter(words):
    """Count how often each character triple occurs across `words`.

    Args:
        words: iterable of strings, forwarded to `generate_tripling`.

    Returns:
        A `Counter` keyed by ``(ch1, ch2, ch3)`` tuples.
    """
    # Counter tallies the hashable tuples straight from the generator.
    return Counter(generate_tripling(words))

In [32]:
# Tally triples over the whole word list and show the ten most frequent ones.
tripling_counter = generate_tripling_counter(words)
tripling_counter.most_common(10)

[(('a', 'h', '.'), 1714),
 (('n', 'a', '.'), 1673),
 (('a', 'n', '.'), 1509),
 (('o', 'n', '.'), 1503),
 (('.', 'm', 'a'), 1453),
 (('.', 'j', 'a'), 1255),
 (('.', 'k', 'a'), 1254),
 (('e', 'n', '.'), 1217),
 (('l', 'y', 'n'), 976),
 (('y', 'n', '.'), 953)]

In [18]:
def create_matrix():
    """Build the trigram count tensor from the module-level `words`.

    Uses the module-level `stoi` mapping to turn each character of a triple
    into an index.

    Returns:
        An int32 tensor of shape (27, 27, 27) where entry ``[i, j, k]`` is the
        number of triples from `generate_tripling(words)` whose characters map
        to indices ``i``, ``j``, ``k``.
    """
    counts = torch.zeros((27, 27, 27), dtype=torch.int32)
    for triple in generate_tripling(words):
        first, second, third = (stoi[ch] for ch in triple)
        counts[first, second, third] += 1
    return counts

In [20]:
# Build the count tensor, then display its shape.
N = create_matrix()
N.shape

torch.Size([27, 27, 27])

In [34]:
# Count for the triple ('a', 'h', '.'), i.e. indices (1, 8, 0) under stoi.
N[stoi['a'], stoi['h'], stoi['.']]

tensor(1714, dtype=torch.int32)

In [37]:
# Add-one smoothing: every triple gets a count of at least 1, so no
# probability is exactly zero.
P = (N + 1).float()
# Normalise along the last axis so each P[i, j, :] sums to 1.
P /= P.sum(-1, keepdims=True)

In [46]:
def generate_tripling_prob(words):
    """Yield each character triple of `words` together with its entry in `P`.

    Characters are mapped to indices with the module-level `stoi`, and the
    value is read from the module-level tensor `P`.

    Args:
        words: iterable of strings, forwarded to `generate_tripling`.

    Yields:
        Tuples ``(ch1, ch2, ch3, prob)`` where ``prob`` is
        ``P[stoi[ch1], stoi[ch2], stoi[ch3]]``.
    """
    for first, second, third in generate_tripling(words):
        entry = P[stoi[first], stoi[second], stoi[third]]
        yield first, second, third, entry

In [47]:
# Show every triple of the first three names alongside its value in P.
for *chars, prob in generate_tripling_prob(words[:3]):
    print(*chars, prob)

. e m tensor(0.1855)
e m m tensor(0.1269)
m m a tensor(0.3744)
m a . tensor(0.0669)
. o l tensor(0.2494)
o l i tensor(0.1084)
l i v tensor(0.0219)
i v i tensor(0.2669)
v i a tensor(0.1578)
i a . tensor(0.3657)
. a v tensor(0.0550)
a v a tensor(0.1882)
v a . tensor(0.1405)


In [60]:
def generate_names(count, P, generator=None):
    """Sample `count` names character by character from the table `P`.

    Sampling for each name starts from the index pair (0, 0) and repeatedly
    draws the next index from ``P[ix1, ix2]`` until index 0 is drawn.

    Args:
        count: number of names to yield.
        P: tensor indexed as ``P[ix1, ix2]`` to obtain the weights passed to
            `torch.multinomial` for the next character index.
        generator: optional `torch.Generator` forwarded to
            `torch.multinomial` for reproducible sampling. ``None`` (the
            default) uses torch's global random state, as before.

    Yields:
        Each sampled name as a string. The character for the terminating
        index 0 (``itos[0]``) is included at the end.
    """
    # NOTE(review): the first draw uses the context (0, 0). That row of P is
    # only informative if P was built from triples that start with two
    # boundary markers -- confirm against how P is constructed.
    for _ in range(count):
        out = []
        ix1, ix2 = 0, 0
        while True:
            p = P[ix1, ix2]
            # Slide the context window: the previous second index becomes the first.
            ix1 = ix2
            ix2 = torch.multinomial(
                p, num_samples=1, replacement=True, generator=generator
            ).item()
            out.append(itos[ix2])
            if ix2 == 0:
                break
        yield ''.join(out)

In [61]:
# Sample five names from P and print one per line.
for name in generate_names(5, P):
    print(name)

wyn.
wii.
ryanjaimakeigh.
em.
nortuster.


In [64]:
def log_likelihood(words):
    """Return the mean log-probability over all character triples of `words`.

    Probabilities come from `generate_tripling_prob`; their logs are summed
    and divided by the number of triples.

    Args:
        words: iterable of strings.

    Returns:
        The average log-probability per triple.

    Raises:
        ZeroDivisionError: if `words` produces no triples.
    """
    total = 0
    n_triples = 0
    for *_, prob in generate_tripling_prob(words):
        n_triples += 1
        total = total + torch.log(prob)
    return total / n_triples

In [65]:
# Mean log-probability per triple over the full word list.
log_likelihood(words)

tensor(-2.0927)

Negative log-likelihood, averaged per trigram (the loss):

In [66]:
# Negate the mean log-probability to get a loss where lower is better.
-log_likelihood(words)

tensor(2.0927)