# makemore, but made with an MLP (Multi-Layer Perceptron)

In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

# Select the compute device once: the Apple-Silicon GPU backend (MPS) when it is
# available, the CPU otherwise. `mps_device` is always bound (the name is kept because
# other cells refer to it), so the notebook also runs on machines without MPS.
mps_device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
torch.set_default_device(mps_device)  # new tensors are created on this device by default

In [2]:
# read in all the words, one per line; the context manager closes the file as soon
# as its contents have been read instead of leaving the handle open
with open('names.txt', 'r') as f:
    words = f.read().splitlines()
words[:8] # show first 8 words

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [3]:
# Size of the corpus: `words` holds one entry per line of names.txt.
len(words) # how many words do we have?

32033

In [4]:
# Vocabulary: every distinct character that occurs in the words, in sorted order.
chars = sorted(set(''.join(words)))
# Characters are numbered from 1 so that index 0 stays free for the '.' marker.
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0  # special character appended to every word when building examples
# Inverse mapping (integer -> character), built by flipping every stoi entry.
itos = {idx: ch for ch, idx in stoi.items()}
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


## Building the dataset

In [24]:
# Context length: the number of preceding characters used to predict the next one.
block_size = 3

# Training examples: X collects the context windows, Y the index of the character
# that follows each window.
X, Y = [], []
for w in words:
    # Every word starts from a window filled with index 0 (the '.' marker).
    context = [0] * block_size
    # The trailing '.' makes the end of the word a prediction target as well.
    for ch in w + '.':
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        # Slide the window: drop the oldest index and append the new one. A new list is
        # built each time, so the windows already stored in X are never modified.
        context = [*context[1:], ix]

# Convert the nested Python lists into tensors.
X = torch.tensor(X)
Y = torch.tensor(Y)

In [6]:
# Sanity-check the dataset tensors: X has one row of `block_size` context indices per
# training example and Y holds the matching next-character index.
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([228146, 3]), torch.int64, torch.Size([228146]), torch.int64)

In [7]:
# Embedding lookup table: 27 rows of 2 values each, drawn from a standard normal.
# Row i is the vector looked up for integer index i.
C = torch.randn(27, 2)

In [8]:
# Index the lookup table with the whole integer tensor X at once: every index in X is
# replaced by the corresponding row of C, so the result has X's shape plus a trailing
# embedding dimension.
emb = C[X]
emb.shape  # shape of the embedding tensor

torch.Size([228146, 3, 2])

In [9]:
# Hidden layer parameters, drawn from a standard normal.
# W1 maps 6 inputs to 100 hidden units; b1 holds one bias per hidden unit.
W1 = torch.randn(6, 100)
b1 = torch.randn((100,))

In [10]:
# Flatten each example's embeddings into a single row so it can be multiplied by W1,
# then add the bias and squash with tanh. view() reshapes without building a new
# tensor, unlike the alternative torch.cat(torch.unbind(emb, 1), dim=1).
h = (emb.view(emb.shape[0], -1) @ W1 + b1).tanh()

In [11]:
# Inspect the hidden-layer activations (tanh output, so every value lies in [-1, 1]).
h

tensor([[-0.7160,  1.0000,  0.9999,  ...,  0.9955, -0.9979,  0.0370],
        [-0.9986,  0.9793,  0.6316,  ...,  1.0000,  0.9982,  0.1726],
        [-0.9677,  0.9920,  0.6065,  ...,  0.3565,  0.7759, -0.6143],
        ...,
        [ 0.8945,  0.9964,  0.9853,  ...,  0.9897, -0.9824, -0.1019],
        [-0.2721,  0.9980,  0.9790,  ...,  0.8297, -0.9997, -0.4355],
        [ 0.0701,  0.6992,  0.4785,  ...,  0.9997,  0.4676,  0.1409]],
       device='mps:0')

In [12]:
# One row per training example, one column per hidden unit (W1 has 100 columns).
h.shape

torch.Size([228146, 100])

In [13]:
# Output layer parameters, drawn from a standard normal.
# W2 maps the 100 hidden units to 27 output scores; b2 holds one bias per score.
W2 = torch.randn(100, 27)
b2 = torch.randn((27,))

In [14]:
# Output layer: an affine map of the hidden activations gives one raw score (logit)
# per output class for every example.
logits = torch.matmul(h, W2) + b2

In [15]:
# One row per training example, one column per output class (W2 has 27 columns).
logits.shape

torch.Size([228146, 27])

In [16]:
# Exponentiate the logits to obtain positive, unnormalized scores ("counts").
counts = torch.exp(logits)

In [17]:
# Divide every row by its own sum so that each row becomes a probability distribution;
# keepdim=True keeps the sum as a column so it broadcasts across the row.
prob = counts / counts.sum(dim=1, keepdim=True)

In [18]:
# Same shape as counts: dividing by the per-row sum keeps every row and column.
prob.shape

torch.Size([228146, 27])

In [19]:
# Re-check the dataset shapes: one context row in X for every label in Y.
X.shape, Y.shape

(torch.Size([228146, 3]), torch.Size([228146]))

In [20]:
# Re-create every parameter from a seeded generator so the run is reproducible.
# A generator must live on the device the tensors are created on. That device is read
# from the current default (the device of a freshly created tensor), so this cell does
# not depend on a device variable defined elsewhere in the notebook.
g = torch.Generator(device=torch.empty(0).device).manual_seed(2147483647)
C = torch.randn((27, 2), generator=g)  # embedding lookup table
W1 = torch.randn((6, 100), generator=g)  # hidden layer weights
b1 = torch.randn(100, generator=g)  # hidden layer biases
W2 = torch.randn((100, 27), generator=g)  # output layer weights
b2 = torch.randn(27, generator=g)  # output layer biases
parameters = [C, W1, b1, W2, b2]  # everything that gets trained

In [21]:
# Total number of scalar values across all parameter tensors.
sum(p.nelement() for p in parameters)

3481

In [22]:
# Ask autograd to track every parameter so that loss.backward() fills in p.grad.
for p in parameters:
    p.requires_grad_(True)

In [23]:
# Full-batch gradient descent on the MLP.

# Keep the data on the same device as the parameters. The device is read from C itself,
# so this cell does not depend on a device variable defined elsewhere in the notebook.
X, Y = X.to(C.device), Y.to(C.device)

learning_rate = 0.1  # step size; constant, so it is set once outside the loop
num_steps = 1000  # number of gradient-descent updates

for _ in range(num_steps):
    # Forward pass
    emb = C[X]  # get embeddings for the input characters
    h = torch.tanh(emb.view(emb.shape[0], -1) @ W1 + b1)  # hidden layer activations
    logits = h @ W2 + b2  # one raw score per output class
    # F.cross_entropy fuses the softmax and the negative log-likelihood. It equals
    #   counts = logits.exp(); prob = counts / counts.sum(1, keepdim=True)
    #   loss = -prob[torch.arange(X.shape[0]), Y].log().mean()
    # without materializing the intermediate tensors.
    loss = F.cross_entropy(logits, Y)

    # Backward pass
    for p in parameters:
        p.grad = None  # clear gradients left over from the previous step
    loss.backward()  # compute gradients

    # Update parameters; no_grad keeps the in-place step out of the autograd graph
    with torch.no_grad():
        for p in parameters:
            p -= learning_rate * p.grad

# Loss from the last forward pass, i.e. measured before the final update was applied.
print(loss.item())  # print the loss value

2.5788886547088623
