<a href="https://colab.research.google.com/github/ameyaoka/-makemore-/blob/main/makemore_MPL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# A neural probabilistic language model



### mlp - multilayer perceptron

In [1]:
import torch 
import torch.nn.functional as F
import matplotlib.pyplot
%matplotlib inline

In [2]:
! wget https://raw.githubusercontent.com/karpathy/makemore/master/names.txt

--2023-06-09 11:24:46--  https://raw.githubusercontent.com/karpathy/makemore/master/names.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 228145 (223K) [text/plain]
Saving to: ‘names.txt’


2023-06-09 11:24:46 (7.93 MB/s) - ‘names.txt’ saved [228145/228145]



In [3]:
words =  open('names.txt','r').read().splitlines()

In [4]:
words[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [5]:
len(words) # total vocabulary 

32033

- The set() function is used to remove duplicate characters, ensuring each character appears only once.
- list() is then used to convert the set back into a list.'        
sorted() is applied to sort the characters in alphabetical order.

In [6]:
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.']=0
itos = {i:s for s,i in stoi.items()}
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


## Build the dataset 

In [7]:

block_size = 3  # how many chars serve as input for prediction of next word 
X ,Y =[],[]         # Initialize empty lists for input-output pairs.

for w in words[:5]: # iterate over words (first 5)

  print(w)              # print word 
  context = [0]*block_size      # initialize list with name context .
                                # This means that initially, the context list
                                # is filled with block_size number of zeros
                                # block_size =3 , context = [0,0,0]
  for ch in w + '.':        #Iterate over each character in the current word,
    ix= stoi[ch]            # convert the character to its corresponding index 
    X.append(context)        # Append the current context to the input list "X
    Y.append(ix)              # append current index to output list Y  
    print(''.join(itos[i] for i in context), '--->', itos[ix])# Append the current context to the input list "X
    context = context[1:] + [ix]    # Update the context by removing the first element and adding the current index
  
X = torch.tensor(X)  # Convert the input list "X" to a PyTorch tensor
Y = torch.tensor(Y)  # Convert the output list "Y" to a PyTorch tensor

emma
... ---> e
..e ---> m
.em ---> m
emm ---> a
mma ---> .
olivia
... ---> o
..o ---> l
.ol ---> i
oli ---> v
liv ---> i
ivi ---> a
via ---> .
ava
... ---> a
..a ---> v
.av ---> a
ava ---> .
isabella
... ---> i
..i ---> s
.is ---> a
isa ---> b
sab ---> e
abe ---> l
bel ---> l
ell ---> a
lla ---> .
sophia
... ---> s
..s ---> o
.so ---> p
sop ---> h
oph ---> i
phi ---> a
hia ---> .


In [8]:
X.shape , X.dtype , Y.shape , Y.dtype

(torch.Size([32, 3]), torch.int64, torch.Size([32]), torch.int64)

In [9]:
X # training examples

tensor([[ 0,  0,  0],
        [ 0,  0,  5],
        [ 0,  5, 13],
        [ 5, 13, 13],
        [13, 13,  1],
        [ 0,  0,  0],
        [ 0,  0, 15],
        [ 0, 15, 12],
        [15, 12,  9],
        [12,  9, 22],
        [ 9, 22,  9],
        [22,  9,  1],
        [ 0,  0,  0],
        [ 0,  0,  1],
        [ 0,  1, 22],
        [ 1, 22,  1],
        [ 0,  0,  0],
        [ 0,  0,  9],
        [ 0,  9, 19],
        [ 9, 19,  1],
        [19,  1,  2],
        [ 1,  2,  5],
        [ 2,  5, 12],
        [ 5, 12, 12],
        [12, 12,  1],
        [ 0,  0,  0],
        [ 0,  0, 19],
        [ 0, 19, 15],
        [19, 15, 16],
        [15, 16,  8],
        [16,  8,  9],
        [ 8,  9,  1]])

In [10]:
Y # labels  

tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9,  1,  0,  1, 22,  1,  0,  9, 19,
         1,  2,  5, 12, 12,  1,  0, 19, 15, 16,  8,  9,  1,  0])

In [11]:
C = torch.randn((27,2))

In [12]:
C

tensor([[-0.6600, -0.7575],
        [-0.5145, -1.7448],
        [-2.4160,  0.2244],
        [ 0.8445,  1.6032],
        [ 0.2172,  1.4561],
        [ 0.0522,  2.0993],
        [ 0.9500, -1.4552],
        [-1.7349,  0.1087],
        [ 0.1026,  0.7291],
        [-0.0655, -1.2086],
        [-0.0517, -1.9406],
        [-0.3543, -0.1366],
        [ 2.2147,  0.0240],
        [-0.3967, -0.4748],
        [ 1.2574, -2.2430],
        [-1.6160,  0.3758],
        [ 0.3545,  0.1855],
        [-0.3210, -0.6135],
        [ 1.6804, -0.2795],
        [ 0.1026, -1.0732],
        [ 0.6562, -0.0504],
        [-0.0296, -0.7490],
        [-0.9290, -0.4704],
        [-0.3983,  1.1339],
        [ 1.7175,  0.1215],
        [-0.0433, -0.3369],
        [-1.0604, -0.6501]])

In [13]:
F.one_hot(torch.tensor(5),num_classes=27)

tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0])

In [14]:
# Take one hot vect and mulitply by C
# one_hot encoding by default is int . so need to convert to float.
F.one_hot(torch.tensor(5),num_classes=27).float() @ C


tensor([0.0522, 2.0993])

In [15]:
C[5]

tensor([0.0522, 2.0993])

- both output of above lines are same  .

- Pytorch indexing -- learn

In [16]:
emb = C[X]
emb.shape

torch.Size([32, 3, 2])

In [17]:
# weights
W1 = torch.randn((6,100))
# bias
b1 = torch.randn(100)   

In [18]:
torch.cat([emb[:,0,:],emb[:,1,:],emb[:,2,:]],1)

tensor([[-0.6600, -0.7575, -0.6600, -0.7575, -0.6600, -0.7575],
        [-0.6600, -0.7575, -0.6600, -0.7575,  0.0522,  2.0993],
        [-0.6600, -0.7575,  0.0522,  2.0993, -0.3967, -0.4748],
        [ 0.0522,  2.0993, -0.3967, -0.4748, -0.3967, -0.4748],
        [-0.3967, -0.4748, -0.3967, -0.4748, -0.5145, -1.7448],
        [-0.6600, -0.7575, -0.6600, -0.7575, -0.6600, -0.7575],
        [-0.6600, -0.7575, -0.6600, -0.7575, -1.6160,  0.3758],
        [-0.6600, -0.7575, -1.6160,  0.3758,  2.2147,  0.0240],
        [-1.6160,  0.3758,  2.2147,  0.0240, -0.0655, -1.2086],
        [ 2.2147,  0.0240, -0.0655, -1.2086, -0.9290, -0.4704],
        [-0.0655, -1.2086, -0.9290, -0.4704, -0.0655, -1.2086],
        [-0.9290, -0.4704, -0.0655, -1.2086, -0.5145, -1.7448],
        [-0.6600, -0.7575, -0.6600, -0.7575, -0.6600, -0.7575],
        [-0.6600, -0.7575, -0.6600, -0.7575, -0.5145, -1.7448],
        [-0.6600, -0.7575, -0.5145, -1.7448, -0.9290, -0.4704],
        [-0.5145, -1.7448, -0.9290, -0.4

- **generalization of above code**

In [19]:
torch.cat(torch.unbind(emb,1),1)

tensor([[-0.6600, -0.7575, -0.6600, -0.7575, -0.6600, -0.7575],
        [-0.6600, -0.7575, -0.6600, -0.7575,  0.0522,  2.0993],
        [-0.6600, -0.7575,  0.0522,  2.0993, -0.3967, -0.4748],
        [ 0.0522,  2.0993, -0.3967, -0.4748, -0.3967, -0.4748],
        [-0.3967, -0.4748, -0.3967, -0.4748, -0.5145, -1.7448],
        [-0.6600, -0.7575, -0.6600, -0.7575, -0.6600, -0.7575],
        [-0.6600, -0.7575, -0.6600, -0.7575, -1.6160,  0.3758],
        [-0.6600, -0.7575, -1.6160,  0.3758,  2.2147,  0.0240],
        [-1.6160,  0.3758,  2.2147,  0.0240, -0.0655, -1.2086],
        [ 2.2147,  0.0240, -0.0655, -1.2086, -0.9290, -0.4704],
        [-0.0655, -1.2086, -0.9290, -0.4704, -0.0655, -1.2086],
        [-0.9290, -0.4704, -0.0655, -1.2086, -0.5145, -1.7448],
        [-0.6600, -0.7575, -0.6600, -0.7575, -0.6600, -0.7575],
        [-0.6600, -0.7575, -0.6600, -0.7575, -0.5145, -1.7448],
        [-0.6600, -0.7575, -0.5145, -1.7448, -0.9290, -0.4704],
        [-0.5145, -1.7448, -0.9290, -0.4

In [20]:
a = torch.arange(18)

In [21]:
a.shape

torch.Size([18])

In [22]:
a.view(3,3,2)

tensor([[[ 0,  1],
         [ 2,  3],
         [ 4,  5]],

        [[ 6,  7],
         [ 8,  9],
         [10, 11]],

        [[12, 13],
         [14, 15],
         [16, 17]]])

In [23]:
a.view(9,2)

tensor([[ 0,  1],
        [ 2,  3],
        [ 4,  5],
        [ 6,  7],
        [ 8,  9],
        [10, 11],
        [12, 13],
        [14, 15],
        [16, 17]])

- storage remains same but seen as different 
-Blog below goes in depth 
- http://blog.ezyang.com/2019/05/pytorch-internals/

**Imp**
- **A tensor is always representated as one dim vector.**
- **when we call view some internal attributes of view of tensor changes .**


In [24]:
a.storage()

  a.storage()


 0
 1
 2
 3
 4
 5
 6
 7
 8
 9
 10
 11
 12
 13
 14
 15
 16
 17
[torch.storage.TypedStorage(dtype=torch.int64, device=cpu) of size 18]

- more effecient way . 

In [25]:
emb.shape

torch.Size([32, 3, 2])

In [26]:
emb.view(32,6)

tensor([[-0.6600, -0.7575, -0.6600, -0.7575, -0.6600, -0.7575],
        [-0.6600, -0.7575, -0.6600, -0.7575,  0.0522,  2.0993],
        [-0.6600, -0.7575,  0.0522,  2.0993, -0.3967, -0.4748],
        [ 0.0522,  2.0993, -0.3967, -0.4748, -0.3967, -0.4748],
        [-0.3967, -0.4748, -0.3967, -0.4748, -0.5145, -1.7448],
        [-0.6600, -0.7575, -0.6600, -0.7575, -0.6600, -0.7575],
        [-0.6600, -0.7575, -0.6600, -0.7575, -1.6160,  0.3758],
        [-0.6600, -0.7575, -1.6160,  0.3758,  2.2147,  0.0240],
        [-1.6160,  0.3758,  2.2147,  0.0240, -0.0655, -1.2086],
        [ 2.2147,  0.0240, -0.0655, -1.2086, -0.9290, -0.4704],
        [-0.0655, -1.2086, -0.9290, -0.4704, -0.0655, -1.2086],
        [-0.9290, -0.4704, -0.0655, -1.2086, -0.5145, -1.7448],
        [-0.6600, -0.7575, -0.6600, -0.7575, -0.6600, -0.7575],
        [-0.6600, -0.7575, -0.6600, -0.7575, -0.5145, -1.7448],
        [-0.6600, -0.7575, -0.5145, -1.7448, -0.9290, -0.4704],
        [-0.5145, -1.7448, -0.9290, -0.4

In [27]:
emb.view(32,6) == torch.cat(torch.unbind(emb,1),1)

tensor([[True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, T

In [28]:
h= torch.tan(emb.view(-1,6) @ W1 + b1)

In [29]:
h

tensor([[-1.2931e+00, -3.1555e-03,  2.5725e+00,  ...,  4.5177e+00,
         -7.7827e+00,  4.1133e+00],
        [-4.4199e-01,  4.5517e-01, -2.5949e+00,  ..., -1.1722e+00,
          3.7941e-01, -3.9073e-01],
        [ 5.9821e-01,  7.5502e-01,  2.5025e+00,  ..., -3.9093e-01,
          1.0260e+00,  8.2895e-01],
        ...,
        [ 8.8530e-01, -1.6355e+01, -1.1575e+00,  ...,  2.4817e+00,
          2.4210e-01,  6.2626e-01],
        [-3.9058e-01, -7.2485e-02, -4.9877e+00,  ...,  1.0667e+00,
          1.1304e+00,  1.3725e+01],
        [-8.8970e-01, -9.0477e-01,  2.3066e+00,  ..., -1.7233e+01,
         -5.7079e+01,  1.8746e+00]])

- inputs are 100 
- outputs are 27 ( possible category)
- bias are 27

In [30]:
W2 = torch.randn((100,27)) 

b2 = torch.randn(27)

- logits = output 
- 

In [31]:
logits = h @ W2 +b2

In [32]:
logits.shape

torch.Size([32, 27])

In [33]:
counts = logits.exp()

In [34]:
# normalised
prob = counts / counts.sum(1,keepdims=True)

In [35]:
loss = -prob[torch.arange(32),Y].log().mean()

In [36]:
F.cross_entropy(logits,Y)

tensor(511.5619)

### Full neural network  neural net 

- Dataset
- X - input is (32,3)(3 words )
- Y -  labels (32)(expected word)



In [37]:
X.shape , Y.shape 

(torch.Size([32, 3]), torch.Size([32]))

1. g - This will ensure that the random numbers generated by the torch.randn functions are reproducible.
2. This line creates a tensor C of shape (27, 10) and fills it with random numbers from a normal distribution with mean 0 and variance 1. 

3. w1,w2,b1,b2, weights and biases.

4. parameters : This line creates a list parameters containing the tensors C, W1, b1, W2, and b2. This list will be used to train the neural network.

In [46]:


g = torch.Generator().manual_seed(2147483647) # for reproducibility
C = torch.randn((27, 2), generator=g)
W1 = torch.randn((6, 100), generator=g)
b1 = torch.randn(100, generator=g)
W2 = torch.randn((100, 27), generator=g)
b2 = torch.randn(27, generator=g)
parameters = [C, W1, b1, W2, b2]



In [57]:
for p in parameters:
  p.requires_grad = True


-  the sum function to calculate the total number of parameters in the list parameters

- nelement() function is used to count the number of elements in a tensor. 

In [41]:
sum(p.nelement() for p in parameters) # number of parameters in total


11897

In [59]:
# fordward pass
for _ in range(1000):
  emb = C[X]
  h = torch.tanh(emb.view(-1,6) @ W1 + b1 )
  logits = h @ W2 +b2 
  loss = F.cross_entropy(logits ,Y)
# backward pass
  for  p in parameters:
    p.grad = None 
  loss.backward()

# update 
  for p in parameters:
    p.data += -0.1 * p.grad

print(loss.item())

0.2556455135345459


In [None]:
# training split , dev/validation split , test split
# 80% ,  10% , 10%