In [1]:
import time
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

from collections import Counter
from tqdm import tqdm

The goal is to implement a trigram model in two ways: using counts and using a neural network.

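How do we construct these trigrams, and what is the idea if we use counts? A trigram is a sequence of three characters. We want to model the probability of seeing a particular character given the previous two. How do we do that for the beginning of a word? Does a word begin with two '.' elements? Yes: each word is padded with two '.' markers at the start and one '.' marker at the end, so the first letter is predicted from the context ('.', '.').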

In [2]:
# Check whether PyTorch can see a CUDA-capable GPU.
torch.cuda.is_available()

True

In [3]:
# Create new tensors on the GPU by default when one is available; otherwise stay on the CPU
# instead of failing. torch.set_default_tensor_type is deprecated in recent PyTorch releases,
# so prefer torch.set_default_device and only fall back to the legacy call where it is missing.
default_device = "cuda" if torch.cuda.is_available() else "cpu"
if hasattr(torch, "set_default_device"):
    torch.set_default_device(default_device)
elif default_device == "cuda":
    torch.set_default_tensor_type(torch.cuda.FloatTensor)

In [4]:
# Floating-point dtype installed as PyTorch's default; reused later when casting the
# one-hot encoded inputs.
default_dtype = torch.float32
torch.set_default_dtype(default_dtype)

In [5]:
# Read the training words, one per line. The encoding is stated explicitly so that
# decoding does not depend on the platform's locale default.
with open('names.txt', 'r', encoding='utf-8') as file:
    words = file.read().splitlines()

In [6]:
def trigrams(words):
    """Yield every (c1, c2, c3) character trigram of every word.

    Each word is padded with two '.' markers in front and one '.' marker behind,
    so the first trigram of a word is ('.', '.', first_char) and the last one
    ends in '.'.
    """
    for word in words:
        padded = ['.', '.', *word, '.']
        for start in range(len(padded) - 2):
            yield padded[start], padded[start + 1], padded[start + 2]

In [7]:
# Alphabet: every distinct character that occurs in any word, in sorted order.
chars = sorted(set(''.join(words)))

A trigram count model maps two characters to the single character that follows them. What dimensions should the count lookup table have? Well, what are all the possible two-character contexts that we might see? A context can be two dots, a dot followed by a letter, or two letters. It cannot be a letter followed by a dot, because the word would already have terminated. So the number of contexts is $1\cdot1 + 1\cdot26 + 26\cdot26 = 27\cdot26 + 1 = 27\cdot27-26$.

In [8]:
# Boundary marker character used to pad the start and end of each word.
DOT = '.'

In [9]:
# Map each valid two-character context to a row index. The order is:
# the start context (DOT, DOT), then (DOT, letter), then every (letter, letter) pair.
contexts = [(DOT, DOT)]
contexts += [(DOT, letter) for letter in chars]
contexts += [(first, second) for first in chars for second in chars]
btoi = {context: index for index, context in enumerate(contexts)}

In [10]:
# Inverse of btoi: row index -> two-character context.
itob = dict(zip(btoi.values(), btoi.keys()))

In [11]:
# Map each character to a column index: DOT gets index 0, followed by the letters in
# alphabet order. The inverse mapping (itoc) is built separately from the finished ctoi;
# the stale single-entry `itoc = {i: c}` that used to end this cell has been removed.
ctoi = {ch: index for index, ch in enumerate([DOT] + chars)}

In [12]:
# Inverse of ctoi: column index -> character.
itoc = dict(zip(ctoi.values(), ctoi.keys()))

In [13]:
# m: number of two-character contexts (one per entry of itob).
# n: number of characters that can follow a context (one per entry of itoc).
m = len(itob)
n = len(itoc)

# Count-based model

In [14]:
# Integer count table, explicitly kept on the CPU: N[context, next_char] will hold how
# often next_char follows context.
N = torch.zeros((m, n), dtype=torch.int32, device='cpu')

In [15]:
# Start from a clean table so that re-running this cell does not add onto earlier counts.
N.zero_()
# Tally each distinct trigram once in Python, then write the totals into the table.
for (c1, c2, c3), count in Counter(trigrams(words)).items():
    N[btoi[(c1, c2)], ctoi[c3]] += count

In [16]:
# Normalise each row of counts into a conditional distribution over the next character.
# NOTE(review): a context that never occurs has a zero row sum, so its row becomes NaN
# (0/0) — presumably harmless as long as such contexts are never looked up; confirm.
counts_float = N.float()
row_totals = counts_float.sum(dim=1, keepdim=True)
P = counts_float / row_totals

In [17]:
g = torch.Generator(device='cpu').manual_seed(42)

def makeone():
    """Sample one word from the count-based trigram table P.

    Starts in context index 0 (the ('.', '.') start context), repeatedly draws the
    next character index from the current context's row of P using the generator g,
    and stops when index 0 (the '.' end marker) is drawn.

    Returns the sampled characters joined into a string, without any '.' markers.
    """
    context = 0
    letters = []
    while True:
        row = P[context]
        drawn = torch.multinomial(row, num_samples=1, replacement=True, generator=g).item()
        if drawn == 0:
            return ''.join(letters)
        ch = itoc[drawn]
        # Slide the context window: keep the second character of the current context
        # and append the character that was just drawn.
        context = btoi[(itob[context][1], ch)]
        letters.append(ch)

Now we want to evaluate the model. The idea is to calculate the likelihood of the dataset given the model parameters; in practice we report the mean negative log-likelihood per trigram (lower is better).

In [18]:
# Sum the log-probability the count model assigns to every trigram in the data, then
# report the negated average (i.e. the mean negative log-likelihood per trigram).
ll = 0
k = 0
for first, second, following in trigrams(words):
    ll += torch.log(P[btoi[(first, second)], ctoi[following]])
    k += 1
print(f"mean negative likelihood is {-ll/k}")

mean negative likelihood is 2.185652017593384


# Gradient descent optimization

Create a dataset. We do this by turning indices into one-hot vectors. The two-character context of a trigram (encoded as a single index) is an $x$, and the character that follows it is a $y$.

In [19]:
# Encode every trigram as a (context index, next-character index) training example.
examples = [(btoi[(c1, c2)], ctoi[c3]) for c1, c2, c3 in trigrams(words)]
xs = torch.tensor([context_index for context_index, _ in examples])
ys = torch.tensor([target_index for _, target_index in examples])

In [20]:
# One-hot encode each context index into a vector of length m, cast to the default float dtype.
xenc = F.one_hot(xs, num_classes=m).to(dtype=default_dtype)

In [21]:
# Seed a generator on whatever device new tensors are created on by default, so that it
# always matches the device torch.randn allocates W on (instead of hard-coding 'cuda').
g = torch.Generator(device=torch.empty(0).device).manual_seed(2147483647)
# Trainable weights: one row per two-character context (m), one column per candidate
# next character (n).
W = torch.randn(m, n, generator=g, requires_grad=True)

### Train using one-hot encoding

In [22]:
# Full-batch gradient descent on W, with the forward pass written as a one-hot matmul.
learning_rate = 50
num_steps = 3000 + 1
report_every = 1000

t = time.time()
print(f"{'epoch':>6} {'loss':>10} {'time,s':>7}")
for step in range(num_steps):
    # --- ONE-HOT ---
    logits = xenc @ W  # log-counts
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdim=True)
    # Mean negative log-probability of the correct next character.
    # An optional L2 penalty could be added here: + 20*(W**2).mean()
    loss = -probs[torch.arange(len(xs)), ys].log().mean()
    if step % report_every == 0:
        tt = time.time()
        print(f"{step+1:6} {loss.item():10.5f} {tt-t:>7.2f}")
        t = tt

    W.grad = None  # zero out the gradients
    loss.backward()
    # Update the parameters in place without recording the update in the autograd graph.
    with torch.no_grad():
        W -= learning_rate * W.grad

 epoch       loss  time,s
     1    3.75721    0.39
  1001    2.23654    2.64
  2001    2.21365    2.70
  3001    2.20539    2.70


### Train using indexing

In [23]:
# Same full-batch gradient descent, but the forward pass selects rows of W by index
# instead of multiplying by one-hot vectors. W is not re-initialised in this cell, so
# training continues from its current values.
learning_rate = 50
num_steps = 3000 + 1
report_every = 1000

t = time.time()
print(f"{'epoch':>6} {'loss':>10} {'time,s':>7}")
for step in range(num_steps):
    # --- INDEXING ---
    logits = W[xs]  # log-counts
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdim=True)
    # Mean negative log-probability of the correct next character.
    # An optional L2 penalty could be added here: + 20*(W**2).mean()
    loss = -probs[torch.arange(len(xs)), ys].log().mean()
    if step % report_every == 0:
        tt = time.time()
        print(f"{step+1:6} {loss.item():10.5f} {tt-t:>7.2f}")
        t = tt

    W.grad = None  # zero out the gradients
    loss.backward()
    # Update the parameters in place without recording the update in the autograd graph.
    with torch.no_grad():
        W -= learning_rate * W.grad

 epoch       loss  time,s
     1    2.20538    0.00
  1001    2.20098   20.02
  2001    2.19820   20.38
  3001    2.19626   20.38


In [24]:
# Sample one word from the trained network, starting in context index 0 (the
# ('.', '.') start context) and stopping when index 0 (the '.' end marker) is drawn.
context = 0
s = ''
with torch.no_grad():  # sampling does not need an autograd graph
    while True:
        # Use dedicated names for the single-row tensors so the training design matrix
        # `xenc` is not overwritten and the training cells stay re-runnable.
        context_onehot = F.one_hot(torch.tensor([context]), m).float()
        context_logits = context_onehot @ W
        context_counts = context_logits.exp()
        context_probs = context_counts / context_counts.sum(1, keepdim=True)
        ci = torch.multinomial(context_probs, num_samples=1, replacement=True, generator=g).item()
        if ci == 0:
            break
        # Slide the context window: second character of the old context + the new character.
        context = btoi[(itob[context][1], itoc[ci])]
        s += itoc[ci]
print(s)

aarthusia
