[Paper Reference](https://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf)

In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read the names dataset, one name per line. The context manager closes the
# file handle as soon as the read is done instead of leaving it open.
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [3]:
# Number of names read from names.txt (one per line).
len(words)

32033

In [4]:
# Vocabulary: every distinct character occurring in the names, in sorted order.
chars = sorted(set(''.join(words)))
# Characters are numbered from 1; index 0 is reserved for the '.' marker.
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['.'] = 0
# Reverse lookup (index -> character), in the same insertion order as stoi.
itos = {index: char for char, index in stoi.items()}
itos

{1: 'a',
 2: 'b',
 3: 'c',
 4: 'd',
 5: 'e',
 6: 'f',
 7: 'g',
 8: 'h',
 9: 'i',
 10: 'j',
 11: 'k',
 12: 'l',
 13: 'm',
 14: 'n',
 15: 'o',
 16: 'p',
 17: 'q',
 18: 'r',
 19: 's',
 20: 't',
 21: 'u',
 22: 'v',
 23: 'w',
 24: 'x',
 25: 'y',
 26: 'z',
 0: '.'}

In [5]:
# Build a small demo dataset from the first five names.
block_size = 3  # how many preceding characters are used to predict the next one
contexts, targets = [], []
for name in words[:5]:
    print(name)
    context = [0] * block_size  # start from an all-'.' (index 0) context
    for ix in (stoi[ch] for ch in name + '.'):
        contexts.append(context)
        targets.append(ix)
        print(''.join(itos[i] for i in context), '---->', itos[ix])
        # Slide the window: drop the oldest index, append the one just seen.
        context = [*context[1:], ix]

X = torch.tensor(contexts)  # examples: one row of block_size indices each
Y = torch.tensor(targets)   # labels: index of the character that follows

emma
... ----> e
..e ----> m
.em ----> m
emm ----> a
mma ----> .
olivia
... ----> o
..o ----> l
.ol ----> i
oli ----> v
liv ----> i
ivi ----> a
via ----> .
ava
... ----> a
..a ----> v
.av ----> a
ava ----> .
isabella
... ----> i
..i ----> s
.is ----> a
isa ----> b
sab ----> e
abe ----> l
bel ----> l
ell ----> a
lla ----> .
sophia
... ----> s
..s ----> o
.so ----> p
sop ----> h
oph ----> i
phi ----> a
hia ----> .


In [6]:
# Inspect the dataset tensors: X has one row of block_size indices per example,
# Y has one label per example.
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([32, 3]), torch.int64, torch.Size([32]), torch.int64)

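`X` holds the examples (each row is a context of `block_size` character indices), and `Y` holds the labels (the index of the character that follows each context).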

In [7]:
# Seed PyTorch's global RNG so that C (and the weights drawn from the same
# global RNG in later cells) come out identical on every Restart & Run All.
torch.manual_seed(2147483647)
# Lookup table: one 2-dimensional embedding row for each of the 27 character
# indices, drawn uniformly from [0, 1).
C = torch.rand(27, 2)

In [8]:
# Row 5 of the lookup table: the 2-d embedding of character index 5.
C[5]

tensor([0.6241, 0.0930])

In [9]:
# One-hot encode index 5 and multiply by C: the matrix product picks out row 5
# of C, so the result is the same vector as C[5].
F.one_hot(torch.tensor(5), num_classes=27).float() @ C

tensor([0.6241, 0.0930])

In [10]:
# Index the lookup table with the whole index tensor X at once: every integer
# in X is replaced by its 2-d row of C, giving shape (examples, block_size, 2).
emb = C[X]
emb.shape

torch.Size([32, 3, 2])

In [11]:
# Hidden-layer parameters. Each example feeds in 3 context positions x 2
# embedding dims = 6 values; the layer has 100 units.
fan_in, hidden_units = 6, 100
W1 = torch.randn(fan_in, hidden_units)
b1 = torch.randn(hidden_units)

In [12]:
# Concatenate the embeddings of the three context positions side by side,
# so each example becomes one row of 6 values.
torch.cat((emb[:, 0], emb[:, 1], emb[:, 2]), dim=1).shape

torch.Size([32, 6])

In [13]:
# The three per-position slices that get concatenated: one (examples, 2)
# tensor for each of the 3 context positions.
[emb[:, pos, :] for pos in range(3)]

[tensor([[0.4078, 0.2866],
         [0.4078, 0.2866],
         [0.4078, 0.2866],
         [0.6241, 0.0930],
         [0.9623, 0.1863],
         [0.4078, 0.2866],
         [0.4078, 0.2866],
         [0.4078, 0.2866],
         [0.5498, 0.2820],
         [0.8586, 0.3946],
         [0.7895, 0.2352],
         [0.1907, 0.6550],
         [0.4078, 0.2866],
         [0.4078, 0.2866],
         [0.4078, 0.2866],
         [0.1200, 0.9014],
         [0.4078, 0.2866],
         [0.4078, 0.2866],
         [0.4078, 0.2866],
         [0.7895, 0.2352],
         [0.8140, 0.8377],
         [0.1200, 0.9014],
         [0.7275, 0.1998],
         [0.6241, 0.0930],
         [0.8586, 0.3946],
         [0.4078, 0.2866],
         [0.4078, 0.2866],
         [0.4078, 0.2866],
         [0.8140, 0.8377],
         [0.5498, 0.2820],
         [0.6504, 0.5814],
         [0.7506, 0.1616]]),
 tensor([[0.4078, 0.2866],
         [0.4078, 0.2866],
         [0.6241, 0.0930],
         [0.9623, 0.1863],
         [0.9623, 0.1863],

In [14]:
# Same concatenation without spelling out each position: unbind splits emb
# along dim 1 into a tuple of per-position slices.
torch.cat(emb.unbind(dim=1), dim=1).shape

torch.Size([32, 6])

[Resume from here](https://youtu.be/TCH_1BHY58I?t=1430)

In [21]:
# Toy tensor holding the integers 0..17, used to experiment with view.
a = torch.arange(0, 18)

In [22]:
# Display a: the integers 0 through 17 as a 1-d tensor.
a

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17])

In [23]:
# Regroup the same 18 elements as a 3 x 3 x 2 tensor; the elements keep their
# original order.
a.view(3, 3, 2)

tensor([[[ 0,  1],
         [ 2,  3],
         [ 4,  5]],

        [[ 6,  7],
         [ 8,  9],
         [10, 11]],

        [[12, 13],
         [14, 15],
         [16, 17]]])

[Reference on view](https://blog.ezyang.com/2019/05/pytorch-internals/)

In [43]:
# Flatten each example's 3 x 2 block of embeddings into one row of 6 values.
# -1 lets view infer the number of rows, so this keeps working when the
# dataset has more than 32 examples (the hidden-layer cell uses the same form).
emb.view(-1, 6).shape

torch.Size([32, 6])

In [45]:
# emb still has its 3-d shape: the result of the view call was not assigned back.
emb.shape

torch.Size([32, 3, 2])

### Understanding `torch.Tensor.view`

In [51]:
# Example 3 x 3 x 2 tensor holding the integers 1..18, written out block by block.
nested_values = [
    [[1, 2], [3, 4], [5, 6]],
    [[7, 8], [9, 10], [11, 12]],
    [[13, 14], [15, 16], [17, 18]],
]
tensor = torch.tensor(nested_values)
tensor.shape

torch.Size([3, 3, 2])

In [58]:
# The same 18 values regrouped as 9 blocks of shape (1, 2); the order in which
# the elements are read is unchanged.
tensor.view(9, 1, 2)

tensor([[[ 1,  2]],

        [[ 3,  4]],

        [[ 5,  6]],

        [[ 7,  8]],

        [[ 9, 10]],

        [[11, 12]],

        [[13, 14]],

        [[15, 16]],

        [[17, 18]]])

In [63]:
# Hidden layer: flatten each example's embeddings to 6 values, apply the
# affine map (W1, b1), then squash with tanh.
flat_emb = emb.view(-1, 6)
h = (flat_emb @ W1 + b1).tanh()

In [70]:
# One row of 100 hidden activations per example.
h.shape

torch.Size([32, 100])

In [76]:
# Output-layer parameters: map the 100 hidden activations to 27 scores,
# one per character index.
W2 = torch.randn(100, 27)
b2 = torch.randn(W2.shape[1])

In [71]:
# (hidden units, output scores): 100 inputs from h, 27 outputs.
W2.shape

torch.Size([100, 27])

In [81]:
# Output layer: one unnormalised score per character index for every example.
logits = torch.matmul(h, W2) + b2
logits.shape

torch.Size([32, 27])

In [83]:
# Exponentiate the logits to get positive "counts". Subtracting each row's
# maximum first keeps exp() from overflowing to inf for large logits; it
# rescales every row by a constant, so the normalised probabilities computed
# from counts are mathematically unchanged.
counts = (logits - logits.max(1, keepdim=True).values).exp()

In [84]:
# Normalise each row of counts so it sums to 1, giving a probability
# distribution over the 27 character indices for every example.
row_totals = counts.sum(dim=1, keepdim=True)
prob = counts / row_totals

In [85]:
# Same shape as logits: one row of 27 probabilities per example.
prob.shape

torch.Size([32, 27])

In [86]:
# For every example, pick the probability the model assigned to its true label.
# The row index is derived from Y rather than hard-coded to 32, so this keeps
# working when the dataset size changes.
prob[torch.arange(Y.shape[0]), Y]

tensor([9.9933e-01, 1.7867e-12, 7.5371e-12, 1.3108e-04, 8.1031e-08, 8.5639e-12,
        5.5566e-08, 6.5239e-13, 1.1142e-06, 1.0033e-14, 1.7782e-06, 4.9662e-08,
        3.6511e-06, 4.8122e-07, 3.8690e-07, 8.7253e-09, 2.0429e-12, 1.1279e-06,
        1.8466e-03, 6.0728e-07, 9.9999e-01, 8.9751e-05, 7.4862e-06, 1.1481e-04,
        1.4722e-08, 1.6829e-06, 1.8020e-10, 6.2329e-13, 1.0148e-04, 7.3568e-15,
        1.2633e-05, 1.7157e-08])

In [93]:
# Average negative log-likelihood of the true labels. The row index is derived
# from Y rather than hard-coded to 32, so the loss stays valid for any dataset
# size.
loss = -prob[torch.arange(Y.shape[0]), Y].log().mean()

[Resume from here](https://youtu.be/TCH_1BHY58I?t=1947)