In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read the names, one per line. The context manager closes the file handle
# as soon as the contents are loaded instead of leaving it open.
with open("names.txt", "r") as names_file:
    words = names_file.read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [3]:
# Character vocabulary: every distinct character occurring in the names,
# in sorted order.
chars = sorted(set(''.join(words)))

# Lookup tables between characters and integer ids (and back).
# The sorted characters get ids 1..len(chars).
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
itos = {idx: ch for idx, ch in enumerate(chars, start=1)}

# id 0 is reserved for the '.' marker
stoi['.'] = 0
itos[0] = '.'

In [36]:
# Build the dataset.

block_size = 3  # context length: how many previous characters are fed to the net

X, Y = [], []  # contexts (lists of character ids) and their target ids

# (An earlier string-based attempt padded each word with block_size leading
# '.' characters and sliced out windows; the integer version below replaced it.)

# Slide a window of block_size ids over every word, starting from all zeros
# (the '.' id) and ending on the closing '.'.
for word in words:
    window = [0] * block_size
    for ch in word + '.':
        target = stoi[ch]
        X.append(window)
        Y.append([target])  # the label/target, stored as a one-element row
        # print(''.join(itos[i] for i in window), "-->", ch)
        window = window[1:] + [target]  # drop the oldest id, append the newest

X = torch.tensor(X)
Y = torch.tensor(Y)

In [37]:
# Cram the 27 characters into a 2-dimensional space: these are our EMBEDDINGS
# (each of the 27 characters is represented by a 2-dimensional vector).

# look-up table holding one embedding row per character:
C = torch.randn(27, 2)

In [38]:
# we want embeddings for all the integers in matrix X:
# indexing C with the integer tensor X looks up one row of C per entry of X,
# so the result has X's shape plus a trailing embedding dimension
emb = C[X]
emb.shape

# emb is the 1st layer of our MLP

torch.Size([228146, 3, 2])

In [7]:
# Hidden (2nd) layer parameters: 6 inputs -> 100 units.
w1 = torch.randn(6, 100)
b1 = torch.randn((100,))

# The hidden layer needs each example's embeddings side by side in one row.
#
# METHOD 1 (not flexible with block size): concatenate the slices by hand
#   torch.cat([emb[:, 0, :], emb[:, 1, :], emb[:, 2, :]], 1).shape
# Inefficient, as torch.cat creates new memory.
#
# METHOD 2: unbind dimension 1. This returns exactly the sequence of
# tensors written out by hand in METHOD 1, whatever the block size is.
ips = torch.unbind(emb, dim=1)
changed = torch.cat(ips, dim=1).shape  # only the resulting shape is kept

In [39]:
# METHOD 3: (more efficient) reshape emb instead of concatenating slices.
# Look at emb's current shape first to see which dimensions have to be merged.
emb.shape


torch.Size([228146, 3, 2])

In [9]:
# Hidden states (2nd layer): view each example's embeddings as one row of
# 6 values, apply the affine map (w1 is (6, 100)) and squash with tanh.
hidden_in = emb.view(-1, 6)
h = torch.tanh(hidden_in @ w1 + b1)
h.shape

torch.Size([32, 100])

In [10]:
# peek at the hidden activations (the tanh output computed above)
h

tensor([[ 0.7563, -0.8238, -0.6941,  ...,  0.8139,  0.9687,  0.1659],
        [ 0.9473, -0.9942, -0.7948,  ...,  0.3868,  0.9993,  0.7937],
        [ 0.2126, -0.9987,  0.3770,  ...,  0.9963,  1.0000,  0.9477],
        ...,
        [-0.1014, -0.9972,  0.9949,  ...,  0.9885,  0.9121,  0.7394],
        [ 0.9788,  0.8237, -0.7886,  ..., -0.7689,  0.9999,  0.8654],
        [-0.0976, -0.9372, -0.8782,  ...,  0.3995,  0.9751,  1.0000]])

In [11]:
# output layer parameters: 100 hidden units -> 27 scores
w2 = torch.randn(100, 27)
b2 = torch.randn((27,))

In [12]:
# output layer: project the hidden activations through w2, then add the bias
logits = (h @ w2) + b2

In [13]:
# one row of logits per example, one column per column of w2
logits.shape

torch.Size([32, 27])

In [14]:
# Softmax by hand. exp() of a large logit overflows to inf and turns the
# division below into nan, so shift every row by its own maximum first.
# The shift cancels in the ratio, so prob is mathematically unchanged.
row_max = logits.max(1, keepdim=True).values
counts = (logits - row_max).exp()  # fake counts, rescaled so each row's largest is 1

prob = counts / counts.sum(1, keepdim=True)

In [15]:
# sanity check: prob was normalised row-wise, so a row should sum to 1
prob[0].sum()

tensor(1.)

In [16]:
# Negative log-likelihood: for every example, pluck out the probability the
# model assigned to its target in Y, then average the negative logs.
#
# Y can still be a column of shape (N, 1) at this point. Indexing with a
# column broadcasts it against the row indices and selects a whole grid of
# entries instead of one per example, so flatten it first. The row count is
# taken from prob rather than hard-coded, so any number of examples works.
rows = torch.arange(prob.shape[0])
loss = -prob[rows, Y.reshape(-1)].log().mean()
# NOTE(review): this flags only the loss tensor itself; it does not by itself
# make gradients reach tensors that were created without requires_grad.
loss.requires_grad = True

In [40]:
# --------------- arranged: -------------------
# Re-create every parameter from a single seeded generator so that the
# initial values are the same on each run.
g = torch.Generator()
g.manual_seed(2147483647)

C = torch.randn(27, 2, generator=g)      # embedding / look-up table
w1 = torch.randn(6, 100, generator=g)    # hidden layer weights
b1 = torch.randn((100,), generator=g)    # hidden layer bias
w2 = torch.randn(100, 27, generator=g)   # output layer weights
b2 = torch.randn((27,), generator=g)     # output layer bias

parameters = [C, w1, b1, w2, b2]

In [41]:
# total number of scalar values across all parameter tensors:
sum(map(torch.Tensor.nelement, parameters))

3481

In [42]:
# F.cross_entropy raised an error with the column-shaped Y, so flatten it to
# 1-D. reshape(-1) leaves an already-flat Y untouched, so re-running this
# cell is safe (squeeze(1) raises once the second dimension is gone).
Y = Y.reshape(-1)

In [43]:
# switch on gradient tracking for every tensor in the parameter list
for param in parameters:
    param.requires_grad = True

In [46]:
learning_rate = 0.1  # step size for plain gradient descent

for _ in range(20):
    # ---- forward pass ----
    # embeddings for each input context: (N, 3, 2), N = number of rows in X
    emb = C[X]

    # hidden layer output: flatten each context to 6 values, affine map,
    # then tanh (so the activations lie between -1 and 1)
    h = torch.tanh(emb.view(-1, 6) @ w1 + b1)  # (N, 100)

    logits = h @ w2 + b2  # (N, 27)

    # The loss is the mean negative log-likelihood. Written out by hand:
    #   counts = logits.exp()
    #   prob = counts / counts.sum(1, keepdim=True)
    #   loss = -prob[torch.arange(len(X)), Y].log().mean()
    # F.cross_entropy is the more efficient way in PyTorch (used in
    # classification problems):
    # 1. forward pass and backward pass can be much more efficient
    # 2. it subtracts the largest number from the logits, so that we don't
    #    get inf (while doing .exp())
    loss = F.cross_entropy(logits, Y)
    print(loss.item())

    # ---- backward pass ----
    for p in parameters:
        p.grad = None
    loss.backward()

    # ---- update ----
    # Step the parameters in place under no_grad so the update itself is not
    # recorded by autograd, without reaching around autograd through `.data`.
    with torch.no_grad():
        for p in parameters:
            p += -learning_rate * p.grad

8.33366584777832
8.147385597229004
7.968584060668945
7.797062397003174
7.63268518447876
7.475356578826904
7.324982166290283
7.181427478790283
7.044488906860352
6.913883686065674
6.789266586303711
6.670251369476318
6.556445598602295
6.447472095489502
6.342978477478027
6.242645740509033
6.1461873054504395
6.053345203399658
5.96389102935791
5.87762451171875
