In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
words = open('names.txt', 'r').read().splitlines()

In [12]:
chrs = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i, s in enumerate(chrs)}
stoi['.'] = 0
itos = {i:s for s, i in stoi.items()}

In [19]:
block_size = 3 # chars intaken to build next char

def build_dataset(words):
    X, Y = [], []

    for word in words:
        context = [0] * block_size
        for ch in word + '.':
            chi = stoi[ch]
            X.append(context)
            Y.append(chi)
            # print(f'{''.join([itos[i] for i in context])} --> {itos[chi]}')
            context = context[1:] + [chi] # crop context and append
    
    X = torch.tensor(X)
    Y = torch.tensor(Y)
    print(X.shape, Y.shape)
    
    return X, Y

In [22]:
# splitting dataset 80 / 10 / 10
import random
random.seed(42)
random.shuffle(words)

n1 = int(0.8 * (len(words)))
n2 = int(0.9 * (len(words)))

X_train, Y_train = build_dataset(words[:n1])
X_dev, Y_dev = build_dataset(words[n1:n2])
X_test, Y_test = build_dataset(words[n2:])

torch.Size([182580, 3]) torch.Size([182580])
torch.Size([22767, 3]) torch.Size([22767])
torch.Size([22799, 3]) torch.Size([22799])


In [39]:
g = torch.Generator().manual_seed(2147483647)
C = torch.randn((27, 10), generator=g) # 3 input layer
W1 = torch.randn((30, 200), generator=g) # input = 3*10, 200 nodes ( 3 = block_size)
b1 = torch.randn((200), generator=g) # 200 bias
W2 = torch.randn((200, 27), generator=g) # 200 input, 27 output (27 chrs)
b2 = torch.randn((27), generator=g) # 27 bias
parameters = [C, W1, b1, W2, b2]
print(f'Total number of parameters: {sum(p.nelement() for p in parameters)}')

Total number of parameters: 11897


In [40]:
# setting grad for backpropogation
for p in parameters:
    p.requires_grad = True

In [41]:
# learning rate experiment
lre = torch.linspace(-3, 0, 1000) # 1D tensor from -3 to 0, with 1000 value
lrs = 10**lre # exponential (10^-3 = 0.001)

lri = []
lossi = []
stepi = []

In [None]:
for i in range(200000):

    # creating minibatch
    ix = torch.randint(0, X_train.shape[0], (32,))

    # forward pass
    emb = C[X_train[ix]]
    h = torch.tanh(emb.view(-1, 30) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y_train[ix])
    # print(loss.item())

    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()

    # upgrade
    lr = 0.1 if i < 100000 else 0.01
    for p in parameters:
        p.data += -lr * p.grad

    # logging
    if i % 2000 == 0:
        print(f'Epoch: {i}, loss: {loss.item()}')


    # tracking
    # lri.append(lre[i])
    stepi.append(i)
    lossi.append(loss.log10().item())

In [None]:
plt.plot(stepi, lossi)

In [46]:
# training loss
emb = C[X_train]
h = torch.tanh(emb.view(-1, 30) @ W1 + b1) # 32, 100
logits = h @ W2 + b2
loss = F.cross_entropy(logits, Y_train)
loss

tensor(2282667.5000, grad_fn=<NllLossBackward0>)

In [47]:
# validation loss
emb = C[X_dev]
h = torch.tanh(emb.view(-1, 30) @ W1 + b1)
logits = h @ W2 + b2
loss = F.cross_entropy(logits, Y_dev)
loss

tensor(2286880., grad_fn=<NllLossBackward0>)

In [49]:
# test loss
emb = C[X_test]
h = torch.tanh(emb.view(-1, 30) @ W1 + b1)
logits = h @ W2 + b2
loss = F.cross_entropy(logits, Y_test)
loss

tensor(2285509.2500, grad_fn=<NllLossBackward0>)

In [None]:
# sampling from model
g = torch.Generator().manual_seed(2147483647 + 10)

for _ in range(10):
    out = []
    context = [0] * block_size
    while True:
        emb = C[torch.tensor([context])] # (1,block_size,d)
        h = torch.tanh(emb.view(1, -1) @ W1 + b1)
        logits = h @ W2 + b2
        probs = F.softmax(logits, dim=1)
        ix = torch.multinomial(probs, num_samples=1, generator=g).item()
        context = context[1:] + [ix]
        out.append(ix)
        if ix == 0:
            break
    print(''.join(itos[i] for i in out))
