In [1]:
# name generation model
# based on this paper: https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf

In [2]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline

In [3]:
# read in all the words (one name per line of names.txt)
# a with-block closes the file handle as soon as the contents have been read,
# instead of leaving it open until the garbage collector gets to it
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [4]:
len(words)

32033

In [5]:
# vocabulary: every distinct character that occurs in the names, in sorted order
chars = sorted(set(''.join(words)))
# character -> integer; the letters are numbered from 1 so that 0 stays free for '.'
stoi = {s: i for i, s in enumerate(chars, start=1)}
stoi['.'] = 0
# integer -> character: the inverse lookup of stoi
itos = dict((i, s) for s, i in stoi.items())
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [6]:
# build the dataset: one (context, next character) pair per position in each name

block_size = 3 # context length: how many characters do we take to predict the next one?
               # (the original paper uses a context of 3 words)
X, Y = [], []
for w in words[:5]: # only the first five names, which keeps the printout short
    print(w)
    context = block_size * [0] # every name starts from a window full of index 0, i.e. '...'
    for ch in w + '.': # the trailing '.' is the final target for each name
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        print(''.join(map(itos.__getitem__, context)), '--->', itos[ix])
        # rolling window: drop the oldest index and append the newest.
        # this builds a NEW list, so the one already stored in X is not modified
        context = [*context[1:], ix]

X, Y = torch.tensor(X), torch.tensor(Y)

emma
... ---> e
..e ---> m
.em ---> m
emm ---> a
mma ---> .
olivia
... ---> o
..o ---> l
.ol ---> i
oli ---> v
liv ---> i
ivi ---> a
via ---> .
ava
... ---> a
..a ---> v
.av ---> a
ava ---> .
isabella
... ---> i
..i ---> s
.is ---> a
isa ---> b
sab ---> e
abe ---> l
bel ---> l
ell ---> a
lla ---> .
sophia
... ---> s
..s ---> o
.so ---> p
sop ---> h
oph ---> i
phi ---> a
hia ---> .


In [7]:
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([32, 3]), torch.int64, torch.Size([32]), torch.int64)

In [13]:
X

tensor([[ 0,  0,  0],
        [ 0,  0,  5],
        [ 0,  5, 13],
        [ 5, 13, 13],
        [13, 13,  1],
        [ 0,  0,  0],
        [ 0,  0, 15],
        [ 0, 15, 12],
        [15, 12,  9],
        [12,  9, 22],
        [ 9, 22,  9],
        [22,  9,  1],
        [ 0,  0,  0],
        [ 0,  0,  1],
        [ 0,  1, 22],
        [ 1, 22,  1],
        [ 0,  0,  0],
        [ 0,  0,  9],
        [ 0,  9, 19],
        [ 9, 19,  1],
        [19,  1,  2],
        [ 1,  2,  5],
        [ 2,  5, 12],
        [ 5, 12, 12],
        [12, 12,  1],
        [ 0,  0,  0],
        [ 0,  0, 19],
        [ 0, 19, 15],
        [19, 15, 16],
        [15, 16,  8],
        [16,  8,  9],
        [ 8,  9,  1]])

In [8]:
# embed input in lower dimensional space
# original paper embeds 17,000 words in 30 dimensional space
# we have 27 possible input characters. let's try a 2 dimensional space.
C = torch.randn((27, 2))

In [9]:
C[torch.tensor([5, 6, 7])] # using list or tensor as index instead of number gives us a tensor of the respective values in C

tensor([[ 1.2794,  0.9109],
        [ 0.1409,  0.4732],
        [ 0.6917, -0.0952]])

In [10]:
emb = C[X] # we can also index with multidimensional integers
emb

tensor([[[ 0.0538, -0.7437],
         [ 0.0538, -0.7437],
         [ 0.0538, -0.7437]],

        [[ 0.0538, -0.7437],
         [ 0.0538, -0.7437],
         [ 1.2794,  0.9109]],

        [[ 0.0538, -0.7437],
         [ 1.2794,  0.9109],
         [-1.3216, -0.7413]],

        [[ 1.2794,  0.9109],
         [-1.3216, -0.7413],
         [-1.3216, -0.7413]],

        [[-1.3216, -0.7413],
         [-1.3216, -0.7413],
         [-0.6083,  1.1529]],

        [[ 0.0538, -0.7437],
         [ 0.0538, -0.7437],
         [ 0.0538, -0.7437]],

        [[ 0.0538, -0.7437],
         [ 0.0538, -0.7437],
         [ 0.4639,  0.7713]],

        [[ 0.0538, -0.7437],
         [ 0.4639,  0.7713],
         [ 0.5806,  0.4032]],

        [[ 0.4639,  0.7713],
         [ 0.5806,  0.4032],
         [ 0.4205, -0.9643]],

        [[ 0.5806,  0.4032],
         [ 0.4205, -0.9643],
         [-0.0470,  0.4804]],

        [[ 0.4205, -0.9643],
         [-0.0470,  0.4804],
         [ 0.4205, -0.9643]],

        [[-0.0470,  0

In [12]:
emb.shape

torch.Size([32, 3, 2])

In [15]:
# hidden layer
W1 = torch.randn((6, 100)) # weights
# 6 x 100 because:
# (number of inputs == 6 == embedding dimensions (2) x n-embeddings per input (3))
#   x 
# (number of neurons in this layer == some arbitrary amount of neurons (100))
b1 = torch.randn(100) # biases

In [16]:
# what we want to do -- but the shapes do not line up, so the matmul raises.
# the error itself is the point of this cell, so catch it and print the message:
# an uncaught exception would stop "Restart Kernel -> Run All" right here
try:
    emb @ W1 + b1
except RuntimeError as err:
    print(f'RuntimeError: {err}')

RuntimeError: mat1 and mat2 shapes cannot be multiplied (96x2 and 6x100)

In [17]:
# but it doesn't work because each element in emb has size (3x2) instead of size 6.
# we need to smush these together somehow.
# there are multiple ways to achieve this depending on exact requirements
# we'll use torch.cat

In [21]:
torch.cat([emb[:, 0, :], emb[:, 1, :], emb[:, 2, :]], 1)
# this will do it, but it's ugly because it's hard-coded.
# what if we wanted to change the dimensions of emb from e.g. (_, 3, _) to (_, M, _) ?

tensor([[ 0.0538, -0.7437,  0.0538, -0.7437,  0.0538, -0.7437],
        [ 0.0538, -0.7437,  0.0538, -0.7437,  1.2794,  0.9109],
        [ 0.0538, -0.7437,  1.2794,  0.9109, -1.3216, -0.7413],
        [ 1.2794,  0.9109, -1.3216, -0.7413, -1.3216, -0.7413],
        [-1.3216, -0.7413, -1.3216, -0.7413, -0.6083,  1.1529],
        [ 0.0538, -0.7437,  0.0538, -0.7437,  0.0538, -0.7437],
        [ 0.0538, -0.7437,  0.0538, -0.7437,  0.4639,  0.7713],
        [ 0.0538, -0.7437,  0.4639,  0.7713,  0.5806,  0.4032],
        [ 0.4639,  0.7713,  0.5806,  0.4032,  0.4205, -0.9643],
        [ 0.5806,  0.4032,  0.4205, -0.9643, -0.0470,  0.4804],
        [ 0.4205, -0.9643, -0.0470,  0.4804,  0.4205, -0.9643],
        [-0.0470,  0.4804,  0.4205, -0.9643, -0.6083,  1.1529],
        [ 0.0538, -0.7437,  0.0538, -0.7437,  0.0538, -0.7437],
        [ 0.0538, -0.7437,  0.0538, -0.7437, -0.6083,  1.1529],
        [ 0.0538, -0.7437, -0.6083,  1.1529, -0.0470,  0.4804],
        [-0.6083,  1.1529, -0.0470,  0.4

In [34]:
# ans: use torch.unbind
#torch.unbind(emb, 1) # == [emb[:, 0, :], emb[:, 1, :], emb[:, 2, :]]
torch.cat(torch.unbind(emb, 1), 1)
# unfortunately, this is SUPA INEFFICIENT, because torch.cat will copy everything...

tensor([[ 0.0538, -0.7437,  0.0538, -0.7437,  0.0538, -0.7437],
        [ 0.0538, -0.7437,  0.0538, -0.7437,  1.2794,  0.9109],
        [ 0.0538, -0.7437,  1.2794,  0.9109, -1.3216, -0.7413],
        [ 1.2794,  0.9109, -1.3216, -0.7413, -1.3216, -0.7413],
        [-1.3216, -0.7413, -1.3216, -0.7413, -0.6083,  1.1529],
        [ 0.0538, -0.7437,  0.0538, -0.7437,  0.0538, -0.7437],
        [ 0.0538, -0.7437,  0.0538, -0.7437,  0.4639,  0.7713],
        [ 0.0538, -0.7437,  0.4639,  0.7713,  0.5806,  0.4032],
        [ 0.4639,  0.7713,  0.5806,  0.4032,  0.4205, -0.9643],
        [ 0.5806,  0.4032,  0.4205, -0.9643, -0.0470,  0.4804],
        [ 0.4205, -0.9643, -0.0470,  0.4804,  0.4205, -0.9643],
        [-0.0470,  0.4804,  0.4205, -0.9643, -0.6083,  1.1529],
        [ 0.0538, -0.7437,  0.0538, -0.7437,  0.0538, -0.7437],
        [ 0.0538, -0.7437,  0.0538, -0.7437, -0.6083,  1.1529],
        [ 0.0538, -0.7437, -0.6083,  1.1529, -0.0470,  0.4804],
        [-0.6083,  1.1529, -0.0470,  0.4

In [35]:
# even BETTER + EFFICIENT ans: use tensor.view!
# changes how tensor is INDEXED instead of STORED. efficient!
# changes storage offset, strides, and shapes
# the row count is read from emb itself rather than hard-coded as 32, so this
# cell keeps working when the dataset is built from more than the first five names
emb.view(emb.shape[0], 6)

tensor([[ 0.0538, -0.7437,  0.0538, -0.7437,  0.0538, -0.7437],
        [ 0.0538, -0.7437,  0.0538, -0.7437,  1.2794,  0.9109],
        [ 0.0538, -0.7437,  1.2794,  0.9109, -1.3216, -0.7413],
        [ 1.2794,  0.9109, -1.3216, -0.7413, -1.3216, -0.7413],
        [-1.3216, -0.7413, -1.3216, -0.7413, -0.6083,  1.1529],
        [ 0.0538, -0.7437,  0.0538, -0.7437,  0.0538, -0.7437],
        [ 0.0538, -0.7437,  0.0538, -0.7437,  0.4639,  0.7713],
        [ 0.0538, -0.7437,  0.4639,  0.7713,  0.5806,  0.4032],
        [ 0.4639,  0.7713,  0.5806,  0.4032,  0.4205, -0.9643],
        [ 0.5806,  0.4032,  0.4205, -0.9643, -0.0470,  0.4804],
        [ 0.4205, -0.9643, -0.0470,  0.4804,  0.4205, -0.9643],
        [-0.0470,  0.4804,  0.4205, -0.9643, -0.6083,  1.1529],
        [ 0.0538, -0.7437,  0.0538, -0.7437,  0.0538, -0.7437],
        [ 0.0538, -0.7437,  0.0538, -0.7437, -0.6083,  1.1529],
        [ 0.0538, -0.7437, -0.6083,  1.1529, -0.0470,  0.4804],
        [-0.6083,  1.1529, -0.0470,  0.4

In [37]:
h = torch.tanh(emb.view(emb.shape[0], 6) @ W1 + b1)
# activations of inputs emb (h for "hidden states")
# could do emb.view(-1, 6) and pytorch would figure out what the first dimension needs to be in order to make the size work
h.shape

torch.Size([32, 100])

In [38]:
h

tensor([[-0.5682,  0.0052,  0.3113,  ...,  0.5839, -0.1124,  0.9348],
        [ 0.8612,  0.7693, -0.9890,  ...,  0.9578,  0.4915, -0.9999],
        [-0.9992, -0.8587, -0.0996,  ...,  0.9258,  0.2002,  0.9999],
        ...,
        [ 0.9914, -0.9194, -0.7099,  ...,  0.5958, -0.8240, -0.9951],
        [-0.0539, -0.1342, -0.9891,  ...,  0.9787, -0.3020,  0.9816],
        [ 0.6416, -0.8814,  0.9068,  ..., -0.0152, -0.4138, -0.9641]])

In [None]:
# let's make sure the expression "emb.view(emb.shape[0], 6) @ W1 + b1"
# is broadcasting correctly

In [41]:
(emb.view(emb.shape[0], 6) @ W1).shape

torch.Size([32, 100])

In [42]:
b1.shape

torch.Size([100])

In [43]:
# align on right, make missing dimensions size 1, then copy all dimensions of size 1
# 32 100    32 100    32 100
#    100 ->  1 100 -> 32 100
# so it's correct (note: in case this looks wrong because we know 
#                        matrix multiplication needs (N, M) x (M, O) dimensions,
#                        remember we're not doing a multiplication on these
#                        two values, we're doing an addition, so we actually
#                        do want identical sizes)

In [44]:
# final layer
W2 = torch.randn((100, 27)) # input size -> 27 characters
b2 = torch.randn(27)

In [45]:
logits = h @ W2 + b2

In [46]:
logits.shape

torch.Size([32, 27])

In [48]:
counts = logits.exp()

In [49]:
prob = counts / counts.sum(1, keepdims=True)

In [50]:
prob.shape

torch.Size([32, 27])

In [52]:
prob.sum(1, keepdims=True)

tensor([[1.0000],
        [1.0000],
        [1.0000],
        [1.0000],
        [1.0000],
        [1.0000],
        [1.0000],
        [1.0000],
        [1.0000],
        [1.0000],
        [1.0000],
        [1.0000],
        [1.0000],
        [1.0000],
        [1.0000],
        [1.0000],
        [1.0000],
        [1.0000],
        [1.0000],
        [1.0000],
        [1.0000],
        [1.0000],
        [1.0000],
        [1.0000],
        [1.0000],
        [1.0000],
        [1.0000],
        [1.0000],
        [1.0000],
        [1.0000],
        [1.0000],
        [1.0000]])

In [54]:
prob

tensor([[5.5315e-13, 4.0494e-08, 5.5371e-07, 3.1887e-03, 1.2082e-09, 9.9565e-01,
         2.7799e-09, 6.4958e-11, 9.3622e-09, 4.1729e-14, 3.5972e-11, 1.1511e-04,
         2.2061e-04, 2.4117e-08, 1.5218e-09, 8.9600e-05, 7.2825e-11, 1.4529e-04,
         2.9543e-04, 8.4674e-16, 7.2751e-07, 1.6533e-04, 5.0219e-10, 3.8599e-06,
         1.2581e-04, 8.1741e-12, 4.2301e-09],
        [6.0061e-16, 7.4413e-06, 5.7400e-09, 6.1091e-06, 3.9415e-06, 9.9492e-01,
         1.7604e-08, 2.0985e-07, 1.2121e-07, 9.6651e-13, 6.6366e-12, 5.1197e-05,
         6.9465e-07, 1.5825e-09, 1.9254e-03, 5.8585e-07, 3.6856e-12, 1.0943e-10,
         2.2530e-10, 9.5484e-15, 2.0856e-05, 1.9202e-06, 1.2106e-09, 5.7655e-09,
         3.0658e-03, 6.5779e-15, 3.9983e-08],
        [2.0248e-07, 1.7439e-12, 1.8397e-12, 3.0563e-10, 1.6212e-13, 3.6910e-08,
         8.6244e-09, 4.0961e-13, 9.9812e-11, 4.8850e-10, 1.4072e-09, 2.3538e-06,
         1.7619e-09, 9.1004e-08, 1.3357e-13, 2.2425e-09, 4.2242e-09, 5.8343e-09,
         1.0000e+

In [58]:
# let's look at how likely the neural net thought the actual outputs were:
# for each row of prob, pick out the entry at that row's true next-character index.
# the row index is sized from Y instead of a hard-coded 32, so it follows the dataset size
prob[torch.arange(Y.shape[0]), Y]

tensor([9.9565e-01, 1.5825e-09, 9.1004e-08, 3.0565e-09, 1.1007e-08, 8.9600e-05,
        1.6730e-06, 6.3718e-10, 2.4990e-13, 6.5318e-10, 1.7440e-08, 6.1507e-14,
        4.0494e-08, 8.3169e-06, 2.8573e-10, 6.8234e-03, 4.1729e-14, 4.1202e-18,
        5.2989e-08, 1.7135e-10, 3.0272e-05, 3.8408e-08, 7.4208e-09, 3.0639e-08,
        7.2071e-07, 8.4674e-16, 4.9502e-09, 1.0467e-12, 2.3548e-08, 1.0559e-10,
        4.7070e-08, 9.7461e-13])

In [59]:
# that's pretty bad, but it's ok because we haven't trained it at all yet

In [56]:
torch.arange(32)

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31])

In [53]:
Y

tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9,  1,  0,  1, 22,  1,  0,  9, 19,
         1,  2,  5, 12, 12,  1,  0, 19, 15, 16,  8,  9,  1,  0])

In [61]:
# average negative log likelihood of the true next characters.
# the row index is sized from Y instead of a hard-coded 32, so it follows the dataset size
loss = -prob[torch.arange(Y.shape[0]), Y].log().mean()
loss

tensor(19.5794)

In [None]:
# ----------------------------------------------------------------
# TIDIED UP! (more respectable)
# ----------------------------------------------------------------

In [73]:
X.shape, Y.shape # dataset

(torch.Size([32, 3]), torch.Size([32]))

In [74]:
# one seeded generator feeds every parameter below (in this order), so the
# initial values are reproducible from run to run
g = torch.Generator()
g.manual_seed(2 ** 31 - 1)
C = torch.randn(27, 2, generator=g)    # embedding table: one 2-d row per character index
W1 = torch.randn(6, 100, generator=g)  # hidden layer weights
b1 = torch.randn(100, generator=g)     # hidden layer biases
W2 = torch.randn(100, 27, generator=g) # output layer weights: hidden activations -> 27 logits
b2 = torch.randn(27, generator=g)      # output layer biases
parameters = [C, W1, b1, W2, b2]

In [75]:
sum(p.nelement() for p in parameters) # number of parameters in total

3481

In [76]:
# forward pass with a hand-written softmax + negative log likelihood.
# N below is the number of examples (rows of X); it is read off the tensors
# instead of being hard-coded as 32, so this cell keeps working if the dataset
# is rebuilt from more names
emb = C[X] # (N, 3, 2)
# flatten each example's embeddings into one row; keeping N fixed and inferring
# the width means a mismatch with W1 fails right at the matmul
h = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1) # (N, 100)
logits = h @ W2 + b2 # (N, 27)
counts = logits.exp()
prob = counts / counts.sum(1, keepdims=True) # normalise so every row sums to 1
loss = -prob[torch.arange(Y.shape[0]), Y].log().mean()
loss

tensor(17.7697)

In [84]:
# ----------------------------------------------------------------
# TIDIED UP, 2! (even MORE respectable)
# ----------------------------------------------------------------

In [85]:
X.shape, Y.shape # dataset

(torch.Size([32, 3]), torch.Size([32]))

In [86]:
# every parameter is drawn from the same seeded generator, in the order listed,
# so the starting values are identical on every run
g = torch.Generator().manual_seed(2 ** 31 - 1)
C, W1, b1, W2, b2 = parameters = [
    torch.randn(shape, generator=g)
    for shape in (
        (27, 2),    # C:  embedding table
        (6, 100),   # W1: hidden layer weights
        (100,),     # b1: hidden layer biases
        (100, 27),  # W2: output layer weights
        (27,),      # b2: output layer biases
    )
]

In [87]:
sum(p.nelement() for p in parameters) # number of parameters in total

3481

In [88]:
# same forward pass as before, but the loss now comes from F.cross_entropy
# instead of the hand-rolled exp / normalise / log / mean.
# andrej explains a bunch of reasons to use F.cross_entropy instead of making
# your own loss function:
# 1. efficiency -- pytorch can skip creating intermediate tensors which waste
#    memory, and may be able to group operations together
# 2. the backward pass is said to be more efficient too
# 3. better numerical behaviour -- you skip the bug where calling logits.exp()
#    ends up giving you floating point infinity
emb = C[X] # [32, 3, 2]
h = (emb.view(-1, 6) @ W1 + b1).tanh() # (32, 100)
logits = h @ W2 + b2 # (32, 27)
loss = F.cross_entropy(input=logits, target=Y)
loss

tensor(17.7697)

In [84]:
# ----------------------------------------------------------------
# ITERATION 3, MAKE IT ITERATE! (now we're LEARNING with MACHINES)
# ----------------------------------------------------------------

In [85]:
X.shape, Y.shape # dataset

(torch.Size([32, 3]), torch.Size([32]))

In [86]:
# seeded generator shared by all the draws below -> reproducible initial parameters
g = torch.Generator().manual_seed(0x7FFFFFFF) # 0x7FFFFFFF == 2 ** 31 - 1
# embedding table
C = torch.randn([27, 2], generator=g)
# hidden layer: weights, then biases
W1 = torch.randn([6, 100], generator=g)
b1 = torch.randn([100], generator=g)
# output layer: weights, then biases
W2 = torch.randn([100, 27], generator=g)
b2 = torch.randn([27], generator=g)
parameters = [C, W1, b1, W2, b2]

In [89]:
sum(p.nelement() for p in parameters) # number of parameters in total

3481

In [90]:
# turn on gradient tracking for every parameter, so that loss.backward()
# fills in p.grad for the update step
for p in parameters:
    p.requires_grad_(True)

In [93]:
learning_rate = 0.1 # step size of each gradient-descent update

for _ in range(1000):
    # forward pass
    emb = C[X] # [32, 3, 2]
    h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # (32, 100)
    logits = h @ W2 + b2 # (32, 27)
    loss = F.cross_entropy(logits, Y)
    print(loss.item())
    # backward pass
    for p in parameters:
        p.grad = None # clear the previous step's gradient before computing a new one
    loss.backward()
    # update
    # the in-place step runs under torch.no_grad() rather than going through
    # p.data: autograd does not record the update, but still sees the in-place
    # modification, which writes through .data hide from it
    with torch.no_grad():
        for p in parameters:
            p += -learning_rate * p.grad

0.323263019323349
0.3221277594566345
0.32101118564605713
0.319913774728775
0.31883668899536133
0.31778037548065186
0.3167456090450287
0.31573277711868286
0.3147422671318054
0.31377437710762024
0.3128291666507721
0.3119066655635834
0.3110068142414093
0.31012940406799316
0.30927425622940063
0.3084408938884735
0.3076290190219879
0.3068382740020752
0.3060680031776428
0.30531778931617737
0.30458715558052063
0.30387547612190247
0.30318212509155273
0.30250686407089233
0.3018488585948944
0.30120766162872314
0.300582617521286
0.2999732494354248
0.2993791997432709
0.29879969358444214
0.29823437333106995
0.2976827323436737
0.2971442639827728
0.29661864042282104
0.2961055040359497
0.2956041097640991
0.29511430859565735
0.2946355640888214
0.29416772723197937
0.29371026158332825
0.29326289892196655
0.2928251624107361
0.2923969030380249
0.29197773337364197
0.29156750440597534
0.29116570949554443
0.2907721996307373
0.290386825799942
0.29000917077064514
0.2896389365196228
0.2892761826515198
0.288920521