In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
words = open("names.txt", "r").read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [None]:
# build vocab of characters:
chars = sorted(list(set(''.join(words))))

# create mapping from int to chars and vice-versa
stoi, itos = {}, {}
for num, char in enumerate(chars):
    stoi[char] = num + 1
    itos[num+1] = char

stoi['.'] = 0
itos[0] = '.'

In [4]:
# build the dataset

block_size = 3 # context-definition: how many prev chars to be fed to the net

X, Y = [], [] # list of tensors of ints corresponding to characters 

"""my for loop"""
# for word in words[:3]:
#     for _ in range(block_size):
#         word = '.' + word
#     word += '.'
#     for start_idx in range(0,len(word)-block_size):
#         input_context = ""
#         # star from start_idx and take the next "block_size" chars
#         for a in range(block_size):
#             input_context += word[start_idx + a]
#         X.append(input_context)
#         Y.append(word[start_idx + block_size])
# for idx in range(len(X)):
#     print(X[idx]," ---> ", Y[idx])

"""andrej's for loop"""
for word in words:
    context = [0] * block_size
    for ch in word + '.':
        Y.append([stoi[ch]]) # this is the label/target
        X.append(context)
        # print(''.join(itos[i] for i in context), "-->", ch)
        context = context[1:] + [stoi[ch]]

X = torch.tensor(X)
Y = torch.tensor(Y)

In [5]:
# lets cram these 27 chars into 2 dimentional space: (this is our EMBEDDINGS)
# (each of the 27 chars is represented by a 2-dimentional vector)

# look-up table for creating embedddings:
C = torch.randn((27, 2))

In [6]:
# we want embeddings for all the integers in matrix X:
emb = C[X]
emb.shape

# emb is our 1st layer of MLP

torch.Size([228146, 3, 2])

In [7]:
# 2nd layer (hidden)
w1 = torch.randn((6, 100))
b1 = torch.randn(100)

# getting inputs from the 1st layer for the second layer
# METHOD 1: (not flexible with block-size)

# torch.cat([emb[:, 0, :], emb[:, 1, :], emb[:, 2, :]], 1).shape
"""inefficiend, as torch.cat creates new memory"""

# METHOD 2:
# we want to unbind the 1st dimention:
ips = torch.unbind(emb, 1) # this is exactly the sequence of tensors passed in METHOD 1
# returns a sequence 
changed = torch.cat(ips, 1).shape

In [8]:
# METHOD 3: (more efficient)
emb.shape


torch.Size([228146, 3, 2])

In [9]:
h = torch.tanh(emb.view(-1, 6) @ w1 + b1) # hidden states (2nd layer)
# w1 => (6, 100)
h.shape

torch.Size([228146, 100])

In [10]:
h

tensor([[ 0.7528, -0.1936,  0.9004,  ...,  0.9958,  0.6418,  0.7108],
        [ 0.8455,  0.9491,  0.9660,  ...,  0.9949,  0.1586, -0.4045],
        [ 0.1003, -0.9989, -0.9029,  ...,  0.8927,  0.9890,  0.9557],
        ...,
        [-0.5398, -0.3526, -0.8893,  ...,  0.8386, -0.9337, -1.0000],
        [-0.9650, -0.9892, -0.9828,  ..., -0.9143,  0.1100, -0.9999],
        [ 0.7725, -0.9992, -0.9624,  ...,  0.9987,  0.2231, -0.9999]])

In [11]:
w2 = torch.randn((100, 27))
b2 = torch.randn(27)

In [12]:
logits = h @ w2 + b2

In [13]:
logits.shape

torch.Size([228146, 27])

In [14]:
counts = logits.exp() # fake couns

prob = counts / counts.sum(1, keepdim=True)

In [15]:
prob[0].sum()

tensor(1.0000)

In [16]:
# plucking out the probs of the targets present in Y:

loss = -prob[torch.arange(32), Y].log().mean()
loss.requires_grad = True

In [17]:
# --------------- arranged:-------------------
g = torch.Generator().manual_seed(2147483647)
C = torch.randn((27, 2), generator=g) # embedding/look-up table
w1 = torch.randn((6, 100), generator=g)
b1 = torch.randn(100, generator=g)
w2 = torch.randn((100, 27), generator=g)
b2 = torch.randn(27, generator=g)
parameters = [C, w1, b1, w2, b2]

In [18]:
# total number of parameters:
sum(p.nelement() for p in parameters)

3481

In [19]:
Y = Y.squeeze(1) # else gettig an error in F.cross_entropy

In [20]:
for p in parameters: 
    p.requires_grad = True

In [63]:
ix = torch.randint(0, X.shape[0], (32, )) # 32 is the batch size

In [65]:
for _ in range(20): 
    # getting embeddings for each input
    emb = C[X[ix]] # (32, 3, 2)

    # hidden layer output:
    h =  torch.tanh(emb.view(-1, 6) @ w1 + b1) # between 0 to 1
    # (32, 100)

    logits = h @ w2 + b2 # (32, 27)
    # counts = logits.exp()
    # prob = counts / counts.sum(1, keepdim=True)
    # loss = -prob[torch.arange(len(X)), Y].log().mean()
    # loss is negative log likelihood

    # more efficient way in pytorch: (used in classification problems)
    loss = F.cross_entropy(logits, Y[ix])
    print(loss.item())

    # reasons for using F.cross_entropy: 
    # 1. forward pass and backward pass can be much more efficient 
    # 3. it subtracts largest num from logits, so that we don't get inf (while doing .exp())

    # backward pass:
    for p in parameters:
        p.grad = None
    loss.backward()

    # update:
    for p in parameters:
        p.data += -0.1 * p.grad

1.1950815916061401
1.1401885747909546
1.0870945453643799
1.0356354713439941
0.9856897592544556
0.9371851682662964
0.8901216983795166
0.8446082472801208
0.8009154200553894
0.7595261335372925
0.7211036086082458
0.6862764954566956
0.6553112268447876
0.6279760599136353
0.6037631034851074
0.5821837186813354
0.5628863573074341
0.5456346273422241
0.5302464962005615
0.5165478587150574


In [26]:
torch.randint(0, 5, (10, ))

tensor([4, 4, 1, 0, 1, 0, 3, 3, 4, 2])

In [31]:
X.shape

torch.Size([228146, 3])

In [66]:
# how do we know the right learning rate