In [None]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline

In [None]:
# read in all the words (one name per line); the with-block closes the file
# handle as soon as the contents are read instead of leaving it open
with open('/content/sample_data/names.txt', 'r') as f:
    words = f.read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [None]:
# number of lines (names) read from the file
len(words)

32033

In [None]:
# Vocabulary: every distinct character that occurs in the names, sorted.
chars = sorted(set(''.join(words)))
# Character -> integer id. The sorted characters take ids 1..len(chars);
# '.' is inserted afterwards with id 0, so it comes last in insertion order.
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['.'] = 0
# Integer id -> character: the inverse lookup, built in stoi's insertion order.
itos = {idx: ch for ch, idx in stoi.items()}
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [None]:
# build the dataset: every example pairs the ids of the `block_size` preceding
# characters (X) with the id of the character that follows them (Y)

block_size = 3 # context length: how many characters do we take to predict the next one?
X, Y = [], []
for w in words:
    # ids of the name followed by the '.' terminator, left-padded with
    # block_size zeros so the first real character already has a full window
    ids = [0] * block_size + [stoi[ch] for ch in w + '.']
    # slide a window of width block_size over the padded ids; the element
    # right after each window is the prediction target
    for k in range(len(ids) - block_size):
        X.append(ids[k:k + block_size])
        Y.append(ids[k + block_size])

X = torch.tensor(X)
Y = torch.tensor(Y)

In [None]:
# sanity check: shape and dtype of the inputs X and the targets Y
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([228146, 3]), torch.int64, torch.Size([228146]), torch.int64)

In [None]:
# Embedding table: one 2-dimensional vector (row) for each of the 27 character ids.
# Seed the global RNG before the first random draw so that this tensor, and every
# later torch.randn call in the notebook, is identical on each Restart & Run All.
torch.manual_seed(2147483647)
C = torch.randn((27, 2))
C

tensor([[ 2.3712, -0.0491],
        [ 0.1340,  1.0703],
        [-0.4043, -0.5734],
        [-0.8195, -0.6560],
        [ 2.3479,  2.8454],
        [-0.4569, -1.5401],
        [-0.7480, -0.6382],
        [ 0.3405, -0.2371],
        [-0.8738,  0.0493],
        [ 0.9964,  0.2031],
        [-0.9057, -0.9593],
        [ 0.1187, -1.3600],
        [-0.0612, -0.4605],
        [-0.7176,  0.6154],
        [ 0.2024, -0.3024],
        [-0.9413, -1.2887],
        [-0.3117, -0.1852],
        [-1.0062,  0.0728],
        [-0.1232, -1.8310],
        [ 1.7396, -0.6001],
        [-1.5120, -0.4626],
        [-0.1749, -0.2309],
        [ 0.7361, -0.4735],
        [-1.0235, -1.1217],
        [-0.2125, -0.5951],
        [-0.0673, -0.5226],
        [ 2.0622,  0.1354]])

In [None]:
# peek at the inputs: each row holds the block_size character ids of one context window
X

tensor([[ 0,  0,  0],
        [ 0,  0,  5],
        [ 0,  5, 13],
        ...,
        [26, 26, 25],
        [26, 25, 26],
        [25, 26, 24]])

In [None]:
# map the inputs X to their embeddings: indexing C with the integer tensor X
# replaces every character id by the matching row of C, adding a trailing
# dimension of size 2 to X's shape
emb = C[X]
emb

tensor([[[ 2.3712, -0.0491],
         [ 2.3712, -0.0491],
         [ 2.3712, -0.0491]],

        [[ 2.3712, -0.0491],
         [ 2.3712, -0.0491],
         [-0.4569, -1.5401]],

        [[ 2.3712, -0.0491],
         [-0.4569, -1.5401],
         [-0.7176,  0.6154]],

        ...,

        [[ 2.0622,  0.1354],
         [ 2.0622,  0.1354],
         [-0.0673, -0.5226]],

        [[ 2.0622,  0.1354],
         [-0.0673, -0.5226],
         [ 2.0622,  0.1354]],

        [[-0.0673, -0.5226],
         [ 2.0622,  0.1354],
         [-0.2125, -0.5951]]])

In [None]:
# (number of examples, block_size, embedding dimension)
emb.shape

torch.Size([228146, 3, 2])

In [None]:
# hidden layer parameters: 6 inputs (3 context characters x 2 embedding dims) -> 100 units
W1 = torch.randn(6, 100)
b1 = torch.randn((100,))

In [None]:
# unbind along dim 1 splits emb into a tuple with one tensor per context position
len(torch.unbind(emb, 1))

3

In [None]:
# first slice: the embedding of the first context character of every example
torch.unbind(emb, 1)[0]

tensor([[ 2.3712, -0.0491],
        [ 2.3712, -0.0491],
        [ 2.3712, -0.0491],
        ...,
        [ 2.0622,  0.1354],
        [ 2.0622,  0.1354],
        [-0.0673, -0.5226]])

In [None]:
# each slice drops dim 1 and keeps the remaining two dims of emb
torch.unbind(emb, 1)[0].shape

torch.Size([228146, 2])

In [None]:
# we now have 3 tensors of shape (228146, 2); concatenating them along dim 1
# gives a single (228146, 6) tensor
torch.cat(torch.unbind(emb, 1),1)

tensor([[ 2.3712, -0.0491,  2.3712, -0.0491,  2.3712, -0.0491],
        [ 2.3712, -0.0491,  2.3712, -0.0491, -0.4569, -1.5401],
        [ 2.3712, -0.0491, -0.4569, -1.5401, -0.7176,  0.6154],
        ...,
        [ 2.0622,  0.1354,  2.0622,  0.1354, -0.0673, -0.5226],
        [ 2.0622,  0.1354, -0.0673, -0.5226,  2.0622,  0.1354],
        [-0.0673, -0.5226,  2.0622,  0.1354, -0.2125, -0.5951]])

In [None]:
# confirm the concatenated shape
torch.cat(torch.unbind(emb, 1),1).shape

torch.Size([228146, 6])

In [40]:
# view() gives the same values as unbind + cat, but it only reinterprets the
# existing storage with a new shape instead of building a new tensor.
# The row count comes from emb itself and -1 lets view() infer the flattened
# width, so this cell keeps working if the dataset size, block_size or
# embedding dimension changes.
emb.view(emb.shape[0], -1)

tensor([[ 2.3712, -0.0491,  2.3712, -0.0491,  2.3712, -0.0491],
        [ 2.3712, -0.0491,  2.3712, -0.0491, -0.4569, -1.5401],
        [ 2.3712, -0.0491, -0.4569, -1.5401, -0.7176,  0.6154],
        ...,
        [ 2.0622,  0.1354,  2.0622,  0.1354, -0.0673, -0.5226],
        [ 2.0622,  0.1354, -0.0673, -0.5226,  2.0622,  0.1354],
        [-0.0673, -0.5226,  2.0622,  0.1354, -0.2125, -0.5951]])

In [43]:
# so finally we can apply the first layer: a matrix multiply of the flattened inputs with W1, plus the bias b1
# note: -1 tells view() to infer that dimension from the element count, which here works out to emb.shape[0]
emb.view(-1, 6) @ W1 + b1

tensor([[ -3.5315,   6.4929,   4.3976,  ...,  -2.7966,  -2.2413,  -0.6848],
        [-10.6152,   3.7463,  -0.1935,  ...,  -2.0375,   0.2731,   1.5617],
        [ -2.2791,   0.8871,   0.4137,  ...,   0.7567,  -3.1662,   3.5476],
        ...,
        [ -6.4230,   4.5809,   0.2495,  ...,  -1.0632,  -0.7823,   0.9898],
        [ -0.2600,   4.5404,   3.5010,  ...,  -2.6450,  -1.3485,   1.0368],
        [ -5.5992,   1.7473,   1.2197,  ...,   0.2007,  -0.9737,  -1.2299]])

In [46]:
# apply tanh as the non-linearity of the hidden layer
h = torch.tanh(emb.view(-1, 6) @ W1 + b1)
# note: the `+ b1` broadcasts as follows:
# emb.view(-1, 6) @ W1   (228146, 100)
# b1                     (        100)
# =>
# emb.view(-1, 6) @ W1   (228146, 100)
# b1                     (     1, 100) => shapes are aligned from the right and b1 is repeated across rows,
#                                         so the same bias vector is added to every row, which is what we want
h

tensor([[-0.9983,  1.0000,  0.9997,  ..., -0.9926, -0.9776, -0.5946],
        [-1.0000,  0.9989, -0.1911,  ..., -0.9666,  0.2665,  0.9157],
        [-0.9793,  0.7100,  0.3916,  ...,  0.6391, -0.9965,  0.9983],
        ...,
        [-1.0000,  0.9998,  0.2444,  ..., -0.7869, -0.6540,  0.7573],
        [-0.2543,  0.9998,  0.9982,  ..., -0.9900, -0.8737,  0.7766],
        [-1.0000,  0.9411,  0.8396,  ...,  0.1980, -0.7503, -0.8425]])

In [48]:
# one row of 100 hidden activations per example
h.shape

torch.Size([228146, 100])

In [47]:
# output layer parameters: 100 hidden units -> 27 scores, one per character id
W2 = torch.randn(100, 27)
b2 = torch.randn((27,))

In [49]:
# output layer: unnormalized scores (logits), b2 is broadcast across the rows
logits = h @ W2 + b2

In [50]:
# one row of 27 scores per example
logits.shape

torch.Size([228146, 27])

In [51]:
# exponentiate the logits element-wise to get strictly positive, unnormalized scores
# NOTE(review): exp() of a large float32 logit overflows to inf
counts = logits.exp()

In [52]:
# softmax over the character dimension: each row of prob is positive and sums to 1.
# Mathematically this is counts / counts.sum(1, keepdims=True), but F.softmax works
# from the logits directly and is numerically stable, whereas exponentiating first
# can overflow to inf and turn the division into nan.
prob = F.softmax(logits, dim=1)

In [53]:
# same shape as logits: one row of 27 probabilities per example
prob.shape

torch.Size([228146, 27])