In [142]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

In [143]:
# Read the names dataset, one name per line. The context manager closes the
# file handle deterministically instead of leaving it to the garbage collector.
with open('names.txt', 'r') as f:
    words = f.read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [144]:
# total number of names read from the file
len(words)

32033

In [145]:
# character vocabulary plus lookup tables in both directions
chars = sorted(set(''.join(words)))
# characters found in the data are numbered from 1; index 0 is reserved for '.'
stoi = {s: i for i, s in enumerate(chars, start=1)}
stoi['.'] = 0
# inverse table: integer index -> character
itos = dict((i, s) for s, i in stoi.items())
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [170]:
# build the dataset

block_size = 3 # context length: how many characters do we take to predict the next one?
inputs, targets = [], []
for w in words:
    # every word starts from a window made entirely of index 0 (the '.' token)
    window = [0] * block_size
    # walk the word plus a trailing '.', already mapped to integer indices
    for ix in (stoi[ch] for ch in w + '.'):
        inputs.append(window)       # the current context ...
        targets.append(ix)          # ... predicts this character
        window = window[1:] + [ix]  # slide: drop the oldest index, append the newest

X = torch.tensor(inputs)
Y = torch.tensor(targets)

In [171]:
# sanity check: X holds one row of block_size context indices per example,
# Y holds the matching target index; both are built from Python ints
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([228146, 3]), torch.int64, torch.Size([228146]), torch.int64)

In [148]:
# embedding lookup table: one randomly initialised 2-dimensional vector per row,
# 27 rows in total
C = torch.randn((27, 2))

In [149]:
# index the table with the whole integer tensor at once: every entry of X is
# replaced by the corresponding row of C, so the result has shape X.shape + (2,)
emb = C[X]
emb.shape

torch.Size([32, 3, 2])

In [150]:
# hidden layer parameters: a (6, 100) weight matrix and a 100-element bias,
# i.e. 6 inputs mapped to 100 hidden units
W1 = torch.randn((6, 100))
b1 = torch.randn(100)

In [151]:
# emb @ W1 + b1
# this won't work as written: emb is 3-dimensional (rows, 3, 2) and must first be flattened to (rows, 6) to match the 6 input rows of W1

In [152]:
# this will work 
# view(-1, 6) reshapes emb into rows of 6 values (the -1 lets PyTorch infer the
# row count), which can then be multiplied by the (6, 100) W1; tanh is applied
# elementwise to the result
h = torch.tanh(emb.view(-1, 6) @ W1 + b1)
h

tensor([[-0.8742, -0.8591,  0.6014,  ...,  0.2661,  0.7489,  0.9776],
        [-0.9738,  0.4383, -0.0635,  ...,  0.9833,  1.0000,  0.9997],
        [-0.6078, -0.9945,  0.8831,  ..., -0.8877, -1.0000,  0.9745],
        ...,
        [-0.6249, -0.9984,  0.9604,  ..., -0.8198, -0.7372,  0.9457],
        [ 0.1864, -0.9154,  0.8577,  ..., -0.1857, -0.3292,  0.7009],
        [-0.9282, -0.8980,  0.8828,  ..., -0.4432,  0.9999,  0.9945]])

In [153]:
# expect one row of 100 hidden activations per example
h.shape

torch.Size([32, 100])

In [154]:
# shape of the matrix product alone: (rows, 6) @ (6, 100) -> (rows, 100)
(emb.view(-1, 6) @ W1).shape

torch.Size([32, 100])

In [155]:
# b1 is 1-D with 100 entries, so adding it to a (rows, 100) matrix broadcasts
# the same bias vector onto every row
b1.shape

torch.Size([100])

In [156]:
# output layer parameters: a (100, 27) weight matrix and a 27-element bias,
# i.e. 100 hidden units mapped to 27 output scores
W2 = torch.randn((100, 27))
b2 = torch.randn(27)

In [157]:
# unnormalised output scores: (rows, 100) @ (100, 27) + (27,) -> (rows, 27)
logits = h @ W2 + b2

In [158]:
# expect one row of 27 scores per example
logits.shape

torch.Size([32, 27])

In [159]:
# display the raw logits (these are not probabilities yet)
logits

tensor([[ -8.9315,  -1.5398,   0.9254,  -6.5288,  -1.1674,   6.0055,  -1.0987,
           8.0733,  -6.7984,  -6.1542,   6.0707,  -2.9507,  -0.2381,  -3.0500,
          -9.4917,  -0.2935,  -3.9497,  -9.7294,   4.9842,  -1.8987,   5.5143,
         -12.9327,  -7.3810,  -1.9113,   9.0722,  12.2058,  -8.3445],
        [ -0.8935,   9.9130,   7.9156,  -5.1824,   1.8979,   9.8237,  -6.9917,
           4.2284,  -3.6098,  -3.0913,   7.9280,  -3.8731,   0.4485,  -2.6366,
          -3.9186,  -8.8504,  -1.4811,   0.5883,  -3.5460,   2.6017,  -2.8674,
         -23.3399,  -5.6375,  -6.4986,   2.1219,  14.9639, -12.6961],
        [ -4.4338, -12.8605,   9.9954,   9.0294,   4.1604,  -8.6931,   7.1021,
           2.7697,   5.6043,  -1.5752,   1.0870,  -0.4906,   1.8511,   4.0801,
          -9.3868,   7.3674,  -9.4683,  -6.3988,   2.4718,  -2.0402,  -1.0523,
           6.1155,   0.8294,  -3.8077,  -4.6265,  -7.0551,  -3.7027],
        [ 12.2795,  -5.9020,   4.0471,  -1.3756,   5.9885,  -1.7657,   0.8638,


In [160]:
# first half of a hand-written softmax: exponentiate so every score is positive
# NOTE(review): exp() of a large logit overflows to inf in floating point; this
# manual form has no max-subtraction to guard against that
counts = logits.exp()

In [161]:
# second half of the softmax: divide each row by its own sum (the kept
# dimension makes the (rows, 1) sums broadcast across the columns), so every
# row of prob sums to 1
prob = counts / counts.sum(1, keepdims=True)

In [162]:
# same shape as logits: one row of 27 values per example
prob.shape

torch.Size([32, 27])

In [163]:
# Negative log-likelihood: for every example pick the probability assigned to
# its true next character (row i, column Y[i]), then average the negative logs.
# The row index is sized from Y so it always pairs one-to-one with the targets;
# a hard-coded arange(32) only works when the dataset has exactly 32 examples
# and fails to broadcast against a longer Y.
loss = -prob[torch.arange(Y.shape[0]), Y].log().mean()
loss

tensor(17.5913)

In [164]:
# ------------- now made respectable -------------

In [177]:
# dataset
# confirm the shapes of the inputs X and targets Y before training on them
X.shape, Y.shape 

(torch.Size([228146, 3]), torch.Size([228146]))

In [178]:
# Draw every initial value from one explicitly seeded generator so that a fresh
# kernel reproduces the same starting parameters (the seed value is arbitrary).
g = torch.Generator().manual_seed(2147483647)

# embedding table: 27 rows of 2-dimensional vectors
C = torch.randn((27, 2), generator=g)

# hidden layer: 6 inputs -> 100 units
W1 = torch.randn((6, 100), generator=g)
b1 = torch.randn(100, generator=g)

# output layer: 100 units -> 27 scores
W2 = torch.randn((100, 27), generator=g)
b2 = torch.randn(27, generator=g)

# every trainable tensor in one list so later code can loop over them
parameters = [C, W1, b1, W2, b2]

In [179]:
# total number of scalar parameters: 27*2 + 6*100 + 100 + 100*27 + 27 = 3481
sum(p.nelement() for p in parameters) # number of parameters in total

3481

In [180]:
# tensors created by torch.randn do not track gradients by default; opt every
# parameter in so that a later backward() call fills in p.grad
for p in parameters:
    p.requires_grad = True

In [199]:
# Minibatch SGD: 100 steps, each on a random sample of the dataset.
batch_size = 32  # examples drawn per step
lr = 0.1         # step size of the gradient update

for _ in range(100):

    # minibatch construct: random row indices into the dataset (with replacement)
    ix = torch.randint(0, X.shape[0], (batch_size,))

    # forward pass
    emb = C[X[ix]] # (batch_size, 3, 2)
    h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # (batch_size, 100)
    logits = h @ W2 + b2 # (batch_size, 27)
    loss = F.cross_entropy(logits, Y[ix])

    # backward pass: clear stale gradients first, since backward() accumulates
    for p in parameters:
        p.grad = None
    loss.backward()

    # update: modify the parameters in place with autograd recording disabled,
    # rather than reaching around autograd through `.data`
    with torch.no_grad():
        for p in parameters:
            p -= lr * p.grad

# loss of the last minibatch only (a noisy estimate of the dataset loss)
print(loss.item())

2.57481050491333


In [200]:
# evaluate the loss on the entire dataset rather than on a minibatch;
# below, N = X.shape[0] (all examples), not the minibatch size
emb = C[X] # (N, 3, 2)
h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # (N, 100)
logits = h @ W2 + b2 # (N, 27)
loss = F.cross_entropy(logits, Y)
loss

tensor(2.6629, grad_fn=<NllLossBackward0>)