In [37]:
import torch
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt

In [38]:
words = open('names.txt', 'r').read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [39]:
chars = sorted(list(set(''.join(words))))
str_to_inx = {str:inx for inx, str in enumerate(chars, start=1)}
str_to_inx['.'] = 0
inx_to_str = {inx:str for inx, str in str_to_inx.items()}
str_to_inx

{'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26,
 '.': 0}

In [40]:
# build the dataset

block_size = 3 # context length: how many characters do we take to predict the next one?
X, Y = [], []
for word in words[:5]:
    word = block_size * '.' + word + '.'
    # for ch1, ch2, ch3, ch4 in zip(word, word[1:], word[2:], word[3:]):
    #     # print(''.join([ch1, ch2, ch3]), '=>', ch4)
    #     X.append([str_to_inx[ch] for ch in [ch1, ch2, ch3]])
    #     Y.append(str_to_inx[ch4])
    end_inx = block_size
    for start_inx, char in enumerate(word[block_size:]):
        X.append([str_to_inx[ch] for ch in word[start_inx:end_inx]])
        Y.append(str_to_inx[char])
        end_inx += 1


X, Y = torch.tensor(X), torch.tensor(Y)
Y

tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9,  1,  0,  1, 22,  1,  0,  9, 19,
         1,  2,  5, 12, 12,  1,  0, 19, 15, 16,  8,  9,  1,  0])

In [41]:
X.dtype, X.shape, Y.dtype, Y.shape

(torch.int64, torch.Size([32, 3]), torch.int64, torch.Size([32]))

In [42]:
X[:5]

tensor([[ 0,  0,  0],
        [ 0,  0,  5],
        [ 0,  5, 13],
        [ 5, 13, 13],
        [13, 13,  1]])

In [43]:
C = torch.randn((27, 2))
C[:15]

tensor([[-0.1071, -1.0641],
        [-1.4979,  0.0274],
        [-0.8261, -1.0237],
        [-0.1853, -1.2831],
        [-1.0521,  0.2812],
        [-0.3013, -1.3081],
        [ 1.0218, -0.6376],
        [ 0.1985, -0.5644],
        [-0.5787,  0.2149],
        [-1.0239, -0.5308],
        [-0.2686,  0.4993],
        [-1.4395, -0.2274],
        [-0.4591, -0.2764],
        [ 1.1824,  0.3577],
        [-1.1010, -1.7622]])

In [44]:
C[X]

tensor([[[-0.1071, -1.0641],
         [-0.1071, -1.0641],
         [-0.1071, -1.0641]],

        [[-0.1071, -1.0641],
         [-0.1071, -1.0641],
         [-0.3013, -1.3081]],

        [[-0.1071, -1.0641],
         [-0.3013, -1.3081],
         [ 1.1824,  0.3577]],

        [[-0.3013, -1.3081],
         [ 1.1824,  0.3577],
         [ 1.1824,  0.3577]],

        [[ 1.1824,  0.3577],
         [ 1.1824,  0.3577],
         [-1.4979,  0.0274]],

        [[-0.1071, -1.0641],
         [-0.1071, -1.0641],
         [-0.1071, -1.0641]],

        [[-0.1071, -1.0641],
         [-0.1071, -1.0641],
         [ 0.5842, -0.0817]],

        [[-0.1071, -1.0641],
         [ 0.5842, -0.0817],
         [-0.4591, -0.2764]],

        [[ 0.5842, -0.0817],
         [-0.4591, -0.2764],
         [-1.0239, -0.5308]],

        [[-0.4591, -0.2764],
         [-1.0239, -0.5308],
         [-0.3372, -0.1407]],

        [[-1.0239, -0.5308],
         [-0.3372, -0.1407],
         [-1.0239, -0.5308]],

        [[-0.3372, -0

In [45]:
emb = C[X]
emb.shape

torch.Size([32, 3, 2])

In [46]:
# hidden layer, # of inputs = 6 = 2 * 3 (to the neuron), # of neurons - a variable
W1 = torch.randn((6, 100)) # 100 - neurons
b1 = torch.randn(100)

In [55]:
torch.cat(torch.unbind(emb, 1), 1) == emb.view(emb.shape[0], 6) 

tensor([[True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, T

In [54]:
emb.view(emb.shape[0], 6) == torch.cat([emb[:,0,:],emb[:,1,:],emb[:,2,:]],1)

tensor([[True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, T

   Most of the times Tanh function is usually used in hidden layers of a neural network because its values lies between -1 to 1 that’s why the mean for the hidden layer comes out be 0 or its very close to 0, hence tanh functions helps in centering the data by bringing mean close to 0 which makes learning for the next layer much easier.

In [56]:
h = torch.tanh(emb.view(emb.shape[0], 6) @ W1 + b1)

In [57]:
h.shape

torch.Size([32, 100])

In [58]:
W2 = torch.randn((100, 27))
b2 = torch.randn((27))

In [67]:
logits = h @ W2 + b2
logits.shape

torch.Size([32, 27])

In [78]:
counts = logits.exp()
probs = counts / counts.sum(dim=1, keepdim=True)
loss = -probs[torch.arange(X.shape[0]), Y].log().mean()
loss

tensor(20.6457)