In [27]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for making figures
%matplotlib inline


we are going to create a mlp here according to the image given in the paper  - Bengio et al. 2003 MLP language model paper (pdf).

In [28]:
# read in all the words
words = open('/Users/sairam/Downloads/makemore-master/names.txt', 'r').read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [29]:
len(words)

32033

In [30]:
# build the vocabulary of characters and mappings to/from integers
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


 the main objective we are performing here is we are giving 3 inputs (block size ) and we are expecting the model to predict the 4th chracter all 3 characters are stored in x and the corresponding right 4th character is stored in y 

In [31]:
# build the dataset

block_size = 3 # context length: how many characters do we take to predict the next one?
X, Y = [], []
for w in words:
  
  # print(w)
  context = [0] * block_size
  for ch in w + '.':
    ix = stoi[ch]
    X.append(context)
    Y.append(ix)
    # print(''.join(itos[i] for i in context), '--->', itos[ix])
    context = context[1:] + [ix] # crop and append
  
X = torch.tensor(X)
Y = torch.tensor(Y)


In [32]:
block_size = 3 # context length: how many characters do we take to predict the next one?

def build_dataset(words):  
  X, Y = [], []
  for w in words:

    #print(w)
    context = [0] * block_size
    for ch in w + '.':
      ix = stoi[ch]
      X.append(context)
      Y.append(ix)
      #print(''.join(itos[i] for i in context), '--->', itos[ix])
      context = context[1:] + [ix] # crop and append

  X = torch.tensor(X)
  Y = torch.tensor(Y)
  print(X.shape, Y.shape)
  return X, Y

import random
random.seed(42)
random.shuffle(words)
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

Xtr, Ytr = build_dataset(words[:n1])
Xdev, Ydev = build_dataset(words[n1:n2])
Xte, Yte = build_dataset(words[n2:])

torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])


In [33]:
print(X),print(Y)

tensor([[ 0,  0,  0],
        [ 0,  0,  5],
        [ 0,  5, 13],
        ...,
        [26, 26, 25],
        [26, 25, 26],
        [25, 26, 24]])
tensor([ 5, 13, 13,  ..., 26, 24,  0])


(None, None)

In [34]:
X.shape,X.dtype,Y.shape

(torch.Size([228146, 3]), torch.int64, torch.Size([228146]))

In [35]:
C=torch.randn((27,2))#here we are embedding each of our 27 characters into two feature vectors

C

tensor([[-1.0913e+00, -8.5314e-01],
        [ 1.3844e-01, -2.8442e+00],
        [-4.8429e-01,  7.1311e-01],
        [ 6.1174e-01,  3.0853e-02],
        [ 1.1137e+00, -5.7607e-01],
        [ 1.1076e+00, -7.1426e-01],
        [-3.2160e-01, -8.7181e-01],
        [ 1.0347e-01,  4.8481e-01],
        [ 1.3271e+00,  1.3897e+00],
        [ 5.4794e-01,  9.8996e-01],
        [-5.0172e-02,  9.1731e-01],
        [ 4.1628e-01,  4.9472e-02],
        [ 1.7292e+00, -1.7667e-01],
        [-1.1792e+00,  9.5789e-01],
        [-2.8957e-01,  5.4085e-01],
        [ 1.1142e-01,  2.8481e-01],
        [-4.0273e-01,  5.1519e-02],
        [-4.5896e-01, -1.4237e+00],
        [ 2.1864e+00, -1.7478e-02],
        [ 9.8049e-01, -7.9047e-01],
        [-4.6231e-01,  4.4643e-01],
        [-1.2057e+00, -7.4790e-01],
        [-6.0430e-01, -2.6668e-03],
        [-4.5323e-01, -4.0506e-01],
        [-2.7592e+00, -8.6137e-01],
        [-1.9368e-01,  9.2884e-01],
        [ 4.8495e-01,  3.0556e-01]])

In [36]:
emb=C[X]#here we are embedding each character in X matrix , so each row of X has 3 characters and each characters is converted into embedding of two 
# so the final shape of C[X] is 32,3,2

now below we will be creating our hidden layer

In [37]:
W1=torch.randn((6,100))# the input to this hidden layer will be 6 as we have 3 input nodes containing 2 embeddings each and the second parameter 
#100 is up to us as it is the number of neurons we want inthis hidden layer 
b1=torch.randn(100)

now the problem above is that we cannot multiply (32,3,2) matrix by (6,100) , remember here only we are doing our operation of wx+b , the main objective we are performing here is we are giving 3 inputs (block size ) and we are expecting the model to predict the 4th chracter. our first embedding row C[X][0] is represented as this (3,2)

tensor([[-1.0430, -0.3342],
        [-1.0430, -0.3342],
        [-1.0430, -0.3342]])

we can just change the view of this and make it 32,6 as we can take all three row and place them right next to each other , by doing this we will be able to multiple (32,6) embedding matrix by (6,100) hidden layer weights we have initialized and this would give us (32,100) matrix 



now the number 32 has arrived because we have taken the first 5 names which happened to have 32 examples of x and y , in future we have to change it to train on whole dataset, so instead of hardcoding the value 32 into this we will change it to emb.shape[0] which will contain the len of whole dataset,Finally 

In [38]:
emb.shape[0]

228146

In [39]:
h=torch.tanh(emb.view(emb.shape[0],6) @ W1+b1)

In [40]:
h.shape# this is our hidden layer

torch.Size([228146, 100])

Below we will be creating our final output layer , according to the diagram in the paper

In [41]:
W2=torch.randn((100,27))# we will be getting a input of 100 from the hidden layer and our output will be one among 27 characters (remember we are predicting the 4th character)
B2=torch.randn(27)


In [42]:
logits= h@W2 +B2 # (32,100) @ (100,27) + 27 biases = (32,27)

In [43]:
logits.shape

torch.Size([228146, 27])

now that we have the logits , as we did in our previous session , we will exponentiate them , and then normalize them so that the row(which are the probabilities of the next character to come ) has the sum of 1 

In [44]:
# loss = -probs[torch.arange(32), Y].log().mean()# this plucks out the probabilities assigned to the correct numbers which should come after 
# #the 3 chars basically the indexes  represented by Y 


In [45]:
loss=F.cross_entropy(logits,Y)
loss

# the above we were performing is equivalent to calling cross_entropy loss on logits 

tensor(20.2193)

#so summing everything up we can write the final forward pass as follows 



In [46]:
Xtr.shape,Ytr.shape

(torch.Size([182625, 3]), torch.Size([182625]))

In [47]:
g = torch.Generator().manual_seed(2147483647) # for reproducibility
C = torch.randn((27, 2), generator=g)
W1 = torch.randn((6, 300), generator=g)# i have later increased the the number of neurons in hidden layer to 300
b1 = torch.randn(300, generator=g)
W2 = torch.randn((300, 27), generator=g)
b2 = torch.randn(27, generator=g)
parameters = [C, W1, b1, W2, b2]

In [48]:
sum(p.nelement() for p in parameters)

10281

In [49]:
for p in parameters:
    p.requires_grad=True

In [50]:

#forward pass
for _ in range(10000):
    ix=torch.randint(0,Xtr.shape[0],(32,))
    emb=C[Xtr[ix]]#(32,3,2)
    h=torch.tanh(emb.view(emb.shape[0],6) @ W1+b1)#(32,100)
    logits= h@W2 +b2#(32,27)
    loss=F.cross_entropy(logits,Ytr[ix])
    for p in parameters:
        p.grad = None
    loss.backward()
    for p in parameters:
        p.data += -0.01* p.grad

print(loss.item())


2.6026203632354736


In [51]:
emb = C[Xdev] # (32, 3, 2)
h = torch.tanh(emb.view(-1,6) @ W1 + b1) # (32, 100)
logits = h @ W2 + b2 # (32, 27)
loss = F.cross_entropy(logits, Ydev)
loss

tensor(2.5127, grad_fn=<NllLossBackward0>)