In [139]:
%pylab inline
import numpy as np
from tqdm import trange
import torch
import torch.nn.functional as F

%pylab is deprecated, use %matplotlib inline and import the required libraries.
Populating the interactive namespace from numpy and matplotlib


In [None]:
# from https://www.youtube.com/watch?v=TCH_1BHY58I

In [6]:
# Load the corpus: the file's contents are split on any whitespace, giving one
# list entry per name. The context manager closes the file handle as soon as
# the read finishes instead of leaving it open for the life of the kernel.
with open("./names.txt", 'r') as names_file:
    words = names_file.read().split()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [32]:
# Number of names in the corpus (32033 for this names.txt).
len(words)

32033

In [31]:
# Vocabulary: every distinct character in the corpus gets an integer id
# starting at 1; id 0 is reserved for '.', the marker used to pad the start of
# a word and to terminate it.
chars = sorted(set(''.join(words)))
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['.'] = 0
# Reverse lookup: id -> character.
itos = {idx: ch for ch, idx in stoi.items()}

In [62]:
# 26 distinct characters appear in the names. The '.' marker is not part of
# `chars`; it is added to `stoi` separately, so the full vocabulary has
# 26 + 1 = 27 symbols.
len(chars)

26

In [39]:
# Sanity check of the id -> character lookup. The ids are spelled out inline
# because the original cell iterated over a variable `a` that no earlier cell
# defines, which raises NameError on Restart & Run All. Ids 1, 2, 3 reproduce
# the recorded output 'abc'.
''.join(itos[i] for i in (1, 2, 3))

'abc'

In [171]:
# Build the dataset.
# Each example pairs a context (the ids of the previous `block_size`
# characters) stored in X with the id of the character that follows it,
# stored in Y.
block_size = 3  # context length: how many previous characters predict the next
X, Y = [], []

for w in words:
    # Every word starts from an all-zero context (id 0 is '.').
    context = block_size * [0]
    for c in w + ".":
        ix = stoi[c]
        X += [context]
        Y += [ix]

        # Slide the window: drop the oldest id, append the newest one.
        context = [*context[1:], ix]

X, Y = torch.tensor(X), torch.tensor(Y)

In [172]:
# X has one row of `block_size` context ids per example; Y has the matching
# next-character id for each row.
X.shape, Y.shape

(torch.Size([228146, 3]), torch.Size([228146]))

In [203]:
# INITIALIZATION
# Layer sizes are derived from the vocabulary and the context length instead
# of hard-coding 27, so the model stays consistent if either one changes.
dim_emb = 2     # dimensionality of each character embedding
nneurons = 100  # width of the hidden layer
vocab_size = len(stoi)  # 27 for this corpus: 26 characters + '.'
g = torch.Generator().manual_seed(2147483647)

# Lookup table C: one `dim_emb`-dimensional embedding row per vocabulary entry.
C = torch.randn((vocab_size, dim_emb), generator=g)

# Hidden layer. Its input is the `block_size` context embeddings laid side by
# side, i.e. block_size * dim_emb features (3 * 2 = 6 with the current values).
W1 = torch.randn((block_size * dim_emb, nneurons), generator=g)
b1 = torch.randn(nneurons, generator=g)

# Output layer: one logit per vocabulary entry.
W2 = torch.randn((nneurons, vocab_size), generator=g)
b2 = torch.randn(vocab_size, generator=g)

parameters = [C, W1, b1, W2, b2]

# Track gradients for every trainable tensor.
for p in parameters:
    p.requires_grad = True

In [199]:
# Learning-rate sweep table: 1000 exponents evenly spaced in [-3, 0], giving
# candidate learning rates from 1e-3 up to 1 on a log scale.
lre = torch.linspace(-3, 0, 1000)
# `lrs` is the name the training cell's (commented-out) sweep line indexes.
lrs = 10**lre
# The original name is kept as an alias so any cell still using `klrs` works.
klrs = lrs

In [206]:
# TRAINING
# Minibatch SGD: every iteration samples `batch_size` random examples, runs
# the forward pass, backpropagates the cross-entropy loss and takes one step.
epochs = 10000  # number of minibatch steps (not full passes over the data)
lr = 0.01
batch_size = 32
# losses = []

for ep in (t := trange(epochs)):
    # *** forward pass
    # Sample the minibatch from the seeded generator `g` so that a fresh
    # Restart & Run All reproduces the same training trajectory.
    ix = torch.randint(0, X.shape[0], (batch_size,), generator=g)

    emb = C[X[ix]]  # (batch_size, block_size, dim_emb)
    # Flatten the per-character embeddings into one feature row per example.
    h = torch.tanh(emb.view(-1, block_size * dim_emb) @ W1 + b1)
    logits = h @ W2 + b2
    # cross_entropy is the mean negative log-probability of the targets under
    # softmax(logits); it replaces the manual exp / normalise / log version.
    loss = F.cross_entropy(logits, Y[ix])
    # Accuracy is only a progress metric, so keep it out of the autograd graph.
    with torch.no_grad():
        accuracy = (logits.argmax(dim=1) == Y[ix]).float().mean()

    # *** backward pass
    for p in parameters:
        p.grad = None
    loss.backward()

    # *** update
    # lr = lrs[ep]  # learning-rate sweep; the 1000-entry table needs epochs <= 1000
    # Update in place under no_grad() rather than through `.data`, which
    # bypasses autograd's in-place correctness checks.
    with torch.no_grad():
        for p in parameters:
            p -= lr * p.grad

    # losses.append(loss.item())
    t.set_description(f" loss: {loss.item():.4f}, accuracy: {accuracy.item():.4f}")

 loss: 2.6435, accuracy: 0.1250: 100%|█| 10000/10000 [00:31<00:00, 318.


In [192]:
# eval on the entire set
# X / Y are the same examples the training loop samples from, so these numbers
# are training metrics, not held-out ones.
# Evaluation needs no gradients: without no_grad() autograd would record a
# graph over every example at once, costing memory for nothing.
with torch.no_grad():
    emb = C[X]
    h = torch.tanh(emb.view(-1, block_size * dim_emb) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y)
    # Fraction of examples whose highest-scoring logit matches the target.
    accuracy = (logits.argmax(dim=1) == Y).float().mean()
print(f"loss: {loss} | accuracy: {accuracy}")
    

loss: 2.6750986576080322 | accuracy: 0.21990305185317993
