In [3]:
# imports
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
# Load the dataset: one name per line.
# A context manager is used so the file handle is closed as soon as the read is done
# (a bare `open(...).read()` leaves closing to the garbage collector).
with open('_resources/names.txt', 'r') as f:
    words = f.read().splitlines()

# Count how often each bigram occurs, using '<S>' / '<E>' as start / end markers.
# Keys are (ch1, ch2) tuples, values are occurrence counts.
b = {}
for w in words:
    chs = ['<S>'] + list(w) + ['<E>']
    for ch1, ch2 in zip(chs, chs[1:]):
        bigram = (ch1, ch2)
        b[bigram] = b.get(bigram, 0) + 1

# Vocabulary: every distinct character that appears in the names, in sorted order
chars = sorted(set(''.join(words)))

# stoi: character -> integer index. Index 0 is reserved for the special '.' token,
# which marks both the start and the end of a name; real characters start at 1.
stoi = {s: i + 1 for i, s in enumerate(chars)}
stoi['.'] = 0

# itos: the inverse mapping, integer index -> character
itos = {i: s for s, i in stoi.items()}

# Integer count matrix of shape (27, 27), initialised to zeros.
# NOTE(review): 27 presumably equals len(stoi) (26 letters + '.') for this dataset - confirm.
N = torch.zeros((27, 27), dtype=torch.int32)

In [54]:
# Create the dataset
#
# Every pair of adjacent characters in a '.'-padded name is one training example:
#   xs: index of the first character (the input to the neural net)
#   ys: index of the character that follows it (the label)
xs, ys = [], []

for w in words:
    # indices of the padded name, e.g. '.emma.' -> [0, 5, 13, 13, 1, 0]
    ixs = [stoi[ch] for ch in ['.'] + list(w) + ['.']]
    # each position except the last is an input; its successor is the label
    xs.extend(ixs[:-1])
    ys.extend(ixs[1:])

xs = torch.tensor(xs) # for ['emma']: [ 0,  5, 13, 13,  1]
ys = torch.tensor(ys) # for ['emma']: [ 5, 13, 13,  1,  0]

# total number of (input, label) examples
num = xs.nelement()

print('number of examples:', num)

number of examples: 228146


In [61]:
# Initialize the 'network'

# Randomly initialize the weights of 27 neurons; each neuron receives 27 inputs,
# so the weights form a single (27, 27) matrix drawn from a standard normal distribution.
# A fixed seed keeps the initialization reproducible, and requires_grad=True
# makes autograd track W so that loss.backward() can populate W.grad.
g = torch.Generator()
g.manual_seed(2147483647)
W = torch.randn(27, 27, generator=g, requires_grad=True)

In [66]:
# Mean of the squared weights: the quantity an L2 regularization term would penalize
W.pow(2).mean()

tensor(2.1080, grad_fn=<MeanBackward0>)

In [89]:
# Gradient descent
#
# Each iteration runs a forward pass over the whole dataset, computes the average
# negative log likelihood of the correct next characters, backpropagates,
# and takes one gradient step on W.

# Input to the network: one-hot encoding of every input index, shape [num, 27].
# It does not depend on W, so it is built once here instead of on every iteration.
xenc = F.one_hot(xs, num_classes=27).float()

for k in range(100):

    # Forward pass: plug all the input examples (`xs`) into the neural net

    # predict log-counts
    # xenc is one-hot, so multiplying a row like [0,0,0,0,1,0,...] by W just plucks out one row of W
    logits = xenc @ W
    # btw: the next 2 lines together are called a 'softmax'
    counts = logits.exp() # counts, equivalent to N
    probs = counts / counts.sum(1, keepdims=True) # probabilities for next character
    # regularization: "spring force"... W wants to be 0, and the probabilities want to be uniform (regularization part),
    # but they also simultaneously want to match up the probabilities as indicated by the data (everything before regularization)
    # (the regularization term is currently commented out, so only the data term is optimized)
    loss = -probs[torch.arange(num), ys].log().mean() # + 0.01 * (W**2).mean() # regularize the loss
    print(loss.item())

    # Backward pass

    W.grad = None # reset the gradient (None is treated like zero and avoids accumulating across iterations)
    loss.backward()

    # Update: one gradient descent step with learning rate 50.
    # The in-place update runs under no_grad so autograd does not record it;
    # this replaces writing through `W.data`, which bypasses autograd's tracking.
    with torch.no_grad():
        W -= 50 * W.grad

# The printed loss approaches ~2.46, which is very similar to what we got in the first example by counting
# But the advantage of this approach is that it's more flexible
# ie in future exercises we will take more and more past characters and feed them into bigger and bigger neural nets,
# ...until we arrive at GPT-2

2.4623453617095947
2.462298631668091
2.462252140045166
2.4622061252593994
2.46216082572937
2.462116003036499
2.462071418762207
2.4620275497436523
2.461984157562256
2.4619410037994385
2.4618983268737793
2.4618561267852783
2.4618144035339355
2.461773157119751
2.4617323875427246
2.4616920948028564
2.4616518020629883
2.4616119861602783
2.4615726470947266
2.461533784866333
2.4614953994750977
2.4614572525024414
2.4614193439483643
2.4613821506500244
2.4613449573516846
2.461308240890503
2.4612717628479004
2.461236000061035
2.46120023727417
2.461164712905884
2.461129665374756
2.461095094680786
2.4610607624053955
2.461026668548584
2.4609928131103516
2.4609594345092773
2.4609262943267822
2.460893392562866
2.4608609676361084
2.4608287811279297
2.460797071456909
2.4607648849487305
2.460733652114868
2.460702896118164
2.4606716632843018
2.4606411457061768
2.460610866546631
2.460580587387085
2.4605510234832764
2.4605214595794678
2.4604921340942383
2.460463047027588
2.4604339599609375
2.460405588150024

In [47]:
# Walk through the first five training examples one at a time:
# these are the five bigrams inside `.emma.`
# NOTE(review): `probs` is read from the kernel namespace; presumably it is the
# output of the most recent forward pass over the full dataset - confirm cell order.
nlls = torch.zeros(5)
for i in range(5):
    # i-th bigram: input character index and label character index
    x, y = xs[i].item(), ys[i].item()

    # probability the net assigns to the correct next character,
    # its log likelihood, and the negative log likelihood (this example's loss)
    p = probs[i, y]
    logp = torch.log(p)
    nll = -logp
    nlls[i] = nll

    print('-------')
    print(f'bigram example {i+1}: {itos[x]}{itos[y]} (indexes {x},{y})')
    print('input to the neural net:', x)
    print('output probabilities from the neural net:', probs[i])
    print('label (actual next character):', y)
    print('probability assigned by the net to the current character:', p.item())
    print('log likelihood', logp.item())
    print('negative log likelihood:', nll.item())

print('==========')
# averaging the per-example values is the same computation as the training loss,
# restricted to these five examples
print('average negative log likelihood, i.e. loss =', nlls.mean().item())

-------
bigram example 1: .e (indexes 0,5)
input to the neural net: 0
output probabilities from the neural net: tensor([0.0621, 0.0106, 0.0131, 0.0045, 0.0178, 0.0284, 0.0029, 0.0244, 0.0146,
        0.0328, 0.0084, 0.0292, 0.0097, 0.0088, 0.0516, 0.2141, 0.0618, 0.0027,
        0.0262, 0.0058, 0.0354, 0.0116, 0.0031, 0.0209, 0.0125, 0.1468, 0.1401],
       grad_fn=<SelectBackward0>)
label (actual next character): 5
probability assigned by the net to the current character: 0.028360897675156593
log likelihood -3.562743902206421
negative log likelihood: 3.562743902206421
-------
bigram example 2: em (indexes 5,13)
input to the neural net: 5
output probabilities from the neural net: tensor([0.0294, 0.0777, 0.0252, 0.0519, 0.1780, 0.0293, 0.0096, 0.0338, 0.0100,
        0.0305, 0.0689, 0.0233, 0.0118, 0.0400, 0.0111, 0.0319, 0.0294, 0.0047,
        0.0885, 0.0219, 0.0485, 0.0304, 0.0500, 0.0028, 0.0121, 0.0022, 0.0472],
       grad_fn=<SelectBackward0>)
label (actual next character): 13
pr

In [94]:
# Sample from the neural net
#
# Generates five names, one character at a time, starting from the '.' token (index 0)
# and stopping as soon as '.' is sampled again.
g = torch.Generator().manual_seed(2147483647)

# Sampling is inference only. W has requires_grad=True, so without no_grad every
# forward pass below would build an autograd graph that is never used.
with torch.no_grad():
    for i in range(5):

        out = []
        ix = 0 # start token '.'
        while True:
            # The count-based version looked the distribution up directly: p = P[ix]
            # Here the same distribution is produced by a forward pass through the net:
            xenc = F.one_hot(torch.tensor([ix]), num_classes=27).float() # take ix, encode it into one-hot row of xenc
            logits = xenc @ W # predict log-counts
            counts = logits.exp() # counts, equivalent to N
            p = counts / counts.sum(1, keepdims=True) # probabilities for next character

            # draw the next character index from p
            ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
            out.append(itos[ix])
            if ix == 0:
                # sampled the end token; it has already been appended, so the name ends with '.'
                break
        print(''.join(out))

cexze.
momasurailezitynn.
konimittain.
llayn.
ka.


In [95]:
# our current "neural net" is very simple (the forward pass)
# over time we will keep making that more and more complex until we get to transformers