<a href="https://colab.research.google.com/github/ameyaoka/-makemore-/blob/main/makemore_MPL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# A neural probabilistic language model



In [134]:
import torch 
import torch.nn.functional as F
import matplotlib.pyplot
%matplotlib inline

In [109]:
! wget https://raw.githubusercontent.com/karpathy/makemore/master/names.txt

--2023-06-07 13:42:29--  https://raw.githubusercontent.com/karpathy/makemore/master/names.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 228145 (223K) [text/plain]
Saving to: ‘names.txt.2’


2023-06-07 13:42:29 (36.6 MB/s) - ‘names.txt.2’ saved [228145/228145]



In [110]:
words =  open('names.txt','r').read().splitlines()

In [111]:
words[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [112]:
len(words) # total vocabulary 

32033

- The set() function is used to remove duplicate characters, ensuring each character appears only once.
- list() is then used to convert the set back into a list.'        
sorted() is applied to sort the characters in alphabetical order.

In [113]:
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.']=0
itos = {i:s for s,i in stoi.items()}
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


## Build the dataset 

In [143]:

block_size = 3  # how many chars serve as input for prediction of next word 
X ,Y =[],[]         # Initialize empty lists for input-output pairs.

for w in words[:5]: # iterate over words (first 5)

  print(w)              # print word 
  context = [0]*block_size      # initialize list with name context .
                                # This means that initially, the context list
                                # is filled with block_size number of zeros
                                # block_size =3 , context = [0,0,0]
  for ch in w + '.':        #Iterate over each character in the current word,
    ix= stoi[ch]            # convert the character to its corresponding index 
    X.append(context)        # Append the current context to the input list "X
    Y.append(ix)              # append current index to output list Y  
    print(''.join(itos[i] for i in context), '--->', itos[ix])# Append the current context to the input list "X
    context = context[1:] + [ix]    # Update the context by removing the first element and adding the current index
  
X = torch.tensor(X)  # Convert the input list "X" to a PyTorch tensor
Y = torch.tensor(Y)  # Convert the output list "Y" to a PyTorch tensor

emma
... ---> e
..e ---> m
.em ---> m
emm ---> a
mma ---> .
olivia
... ---> o
..o ---> l
.ol ---> i
oli ---> v
liv ---> i
ivi ---> a
via ---> .
ava
... ---> a
..a ---> v
.av ---> a
ava ---> .
isabella
... ---> i
..i ---> s
.is ---> a
isa ---> b
sab ---> e
abe ---> l
bel ---> l
ell ---> a
lla ---> .
sophia
... ---> s
..s ---> o
.so ---> p
sop ---> h
oph ---> i
phi ---> a
hia ---> .


In [144]:
X.shape , X.dtype , Y.shape , Y.dtype

(torch.Size([32, 3]), torch.int64, torch.Size([32]), torch.int64)

In [145]:
X # training examples

tensor([[ 0,  0,  0],
        [ 0,  0,  5],
        [ 0,  5, 13],
        [ 5, 13, 13],
        [13, 13,  1],
        [ 0,  0,  0],
        [ 0,  0, 15],
        [ 0, 15, 12],
        [15, 12,  9],
        [12,  9, 22],
        [ 9, 22,  9],
        [22,  9,  1],
        [ 0,  0,  0],
        [ 0,  0,  1],
        [ 0,  1, 22],
        [ 1, 22,  1],
        [ 0,  0,  0],
        [ 0,  0,  9],
        [ 0,  9, 19],
        [ 9, 19,  1],
        [19,  1,  2],
        [ 1,  2,  5],
        [ 2,  5, 12],
        [ 5, 12, 12],
        [12, 12,  1],
        [ 0,  0,  0],
        [ 0,  0, 19],
        [ 0, 19, 15],
        [19, 15, 16],
        [15, 16,  8],
        [16,  8,  9],
        [ 8,  9,  1]])

In [146]:
Y # labels  

tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9,  1,  0,  1, 22,  1,  0,  9, 19,
         1,  2,  5, 12, 12,  1,  0, 19, 15, 16,  8,  9,  1,  0])

In [147]:
C = torch.randn((27,2))

In [148]:
C

tensor([[ 2.0904e-03,  9.0645e-01],
        [ 1.1368e+00,  1.6517e+00],
        [ 6.7847e-01, -9.8973e-02],
        [ 2.2176e-01,  7.9648e-01],
        [-1.8063e+00,  6.9835e-01],
        [-1.0929e+00,  8.5351e-01],
        [-1.3887e+00,  1.2002e-01],
        [-1.3192e+00,  1.3315e+00],
        [ 1.5518e+00,  9.5298e-01],
        [-1.1259e+00, -9.3239e-01],
        [ 9.4702e-01, -2.0137e-01],
        [ 1.0914e+00,  1.3549e+00],
        [-1.9764e+00,  3.0578e-01],
        [-5.6006e-02,  6.4116e-01],
        [-9.8936e-01,  2.8582e-01],
        [ 1.4573e+00,  4.2988e-01],
        [ 8.1093e-02,  5.6186e-01],
        [ 2.5375e-01, -2.9978e+00],
        [-7.7257e-02,  1.2575e-01],
        [-7.8492e-01,  9.3908e-01],
        [ 7.9190e-01, -1.1250e-01],
        [-5.0543e-01,  5.6309e-01],
        [-1.7296e-02, -1.1071e+00],
        [ 1.8864e-01, -9.7440e-01],
        [ 5.8567e-01,  1.2346e-01],
        [ 1.9585e+00,  1.8582e+00],
        [-1.2695e+00, -1.6762e-01]])

In [149]:
F.one_hot(torch.tensor(5),num_classes=27)

tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0])

In [150]:
# Take one hot vect and mulitply by C
# one_hot encoding by default is int . so need to convert to float.
F.one_hot(torch.tensor(5),num_classes=27).float() @ C


tensor([-1.0929,  0.8535])

In [151]:
C[5]

tensor([-1.0929,  0.8535])

- both output of above lines are same  .

- Pytorch indexing -- learn

In [152]:
emb = C[X]
emb.shape

torch.Size([32, 3, 2])

In [153]:
# weights
W1 = torch.randn((6,100))
# bias
b1 = torch.randn(100)   

In [154]:
torch.cat([emb[:,0,:],emb[:,1,:],emb[:,2,:]],1)

tensor([[ 0.0021,  0.9065,  0.0021,  0.9065,  0.0021,  0.9065],
        [ 0.0021,  0.9065,  0.0021,  0.9065, -1.0929,  0.8535],
        [ 0.0021,  0.9065, -1.0929,  0.8535, -0.0560,  0.6412],
        [-1.0929,  0.8535, -0.0560,  0.6412, -0.0560,  0.6412],
        [-0.0560,  0.6412, -0.0560,  0.6412,  1.1368,  1.6517],
        [ 0.0021,  0.9065,  0.0021,  0.9065,  0.0021,  0.9065],
        [ 0.0021,  0.9065,  0.0021,  0.9065,  1.4573,  0.4299],
        [ 0.0021,  0.9065,  1.4573,  0.4299, -1.9764,  0.3058],
        [ 1.4573,  0.4299, -1.9764,  0.3058, -1.1259, -0.9324],
        [-1.9764,  0.3058, -1.1259, -0.9324, -0.0173, -1.1071],
        [-1.1259, -0.9324, -0.0173, -1.1071, -1.1259, -0.9324],
        [-0.0173, -1.1071, -1.1259, -0.9324,  1.1368,  1.6517],
        [ 0.0021,  0.9065,  0.0021,  0.9065,  0.0021,  0.9065],
        [ 0.0021,  0.9065,  0.0021,  0.9065,  1.1368,  1.6517],
        [ 0.0021,  0.9065,  1.1368,  1.6517, -0.0173, -1.1071],
        [ 1.1368,  1.6517, -0.0173, -1.1

- **generalization of above code**

In [155]:
torch.cat(torch.unbind(emb,1),1)

tensor([[ 0.0021,  0.9065,  0.0021,  0.9065,  0.0021,  0.9065],
        [ 0.0021,  0.9065,  0.0021,  0.9065, -1.0929,  0.8535],
        [ 0.0021,  0.9065, -1.0929,  0.8535, -0.0560,  0.6412],
        [-1.0929,  0.8535, -0.0560,  0.6412, -0.0560,  0.6412],
        [-0.0560,  0.6412, -0.0560,  0.6412,  1.1368,  1.6517],
        [ 0.0021,  0.9065,  0.0021,  0.9065,  0.0021,  0.9065],
        [ 0.0021,  0.9065,  0.0021,  0.9065,  1.4573,  0.4299],
        [ 0.0021,  0.9065,  1.4573,  0.4299, -1.9764,  0.3058],
        [ 1.4573,  0.4299, -1.9764,  0.3058, -1.1259, -0.9324],
        [-1.9764,  0.3058, -1.1259, -0.9324, -0.0173, -1.1071],
        [-1.1259, -0.9324, -0.0173, -1.1071, -1.1259, -0.9324],
        [-0.0173, -1.1071, -1.1259, -0.9324,  1.1368,  1.6517],
        [ 0.0021,  0.9065,  0.0021,  0.9065,  0.0021,  0.9065],
        [ 0.0021,  0.9065,  0.0021,  0.9065,  1.1368,  1.6517],
        [ 0.0021,  0.9065,  1.1368,  1.6517, -0.0173, -1.1071],
        [ 1.1368,  1.6517, -0.0173, -1.1

In [156]:
a = torch.arange(18)

In [157]:
a.shape

torch.Size([18])

In [158]:
a.view(3,3,2)

tensor([[[ 0,  1],
         [ 2,  3],
         [ 4,  5]],

        [[ 6,  7],
         [ 8,  9],
         [10, 11]],

        [[12, 13],
         [14, 15],
         [16, 17]]])

In [159]:
a.view(9,2)

tensor([[ 0,  1],
        [ 2,  3],
        [ 4,  5],
        [ 6,  7],
        [ 8,  9],
        [10, 11],
        [12, 13],
        [14, 15],
        [16, 17]])

- storage remains same but seen as different 
-Blog below goes in depth 
- http://blog.ezyang.com/2019/05/pytorch-internals/

**Imp**
- **A tensor is always representated as one dim vector.**
- **when we call view some internal attributes of view of tensor changes .**


In [160]:
a.storage()

 0
 1
 2
 3
 4
 5
 6
 7
 8
 9
 10
 11
 12
 13
 14
 15
 16
 17
[torch.storage.TypedStorage(dtype=torch.int64, device=cpu) of size 18]

- more effecient way . 

In [161]:
emb.shape

torch.Size([32, 3, 2])

In [162]:
emb.view(32,6)

tensor([[ 0.0021,  0.9065,  0.0021,  0.9065,  0.0021,  0.9065],
        [ 0.0021,  0.9065,  0.0021,  0.9065, -1.0929,  0.8535],
        [ 0.0021,  0.9065, -1.0929,  0.8535, -0.0560,  0.6412],
        [-1.0929,  0.8535, -0.0560,  0.6412, -0.0560,  0.6412],
        [-0.0560,  0.6412, -0.0560,  0.6412,  1.1368,  1.6517],
        [ 0.0021,  0.9065,  0.0021,  0.9065,  0.0021,  0.9065],
        [ 0.0021,  0.9065,  0.0021,  0.9065,  1.4573,  0.4299],
        [ 0.0021,  0.9065,  1.4573,  0.4299, -1.9764,  0.3058],
        [ 1.4573,  0.4299, -1.9764,  0.3058, -1.1259, -0.9324],
        [-1.9764,  0.3058, -1.1259, -0.9324, -0.0173, -1.1071],
        [-1.1259, -0.9324, -0.0173, -1.1071, -1.1259, -0.9324],
        [-0.0173, -1.1071, -1.1259, -0.9324,  1.1368,  1.6517],
        [ 0.0021,  0.9065,  0.0021,  0.9065,  0.0021,  0.9065],
        [ 0.0021,  0.9065,  0.0021,  0.9065,  1.1368,  1.6517],
        [ 0.0021,  0.9065,  1.1368,  1.6517, -0.0173, -1.1071],
        [ 1.1368,  1.6517, -0.0173, -1.1

In [163]:
emb.view(32,6) == torch.cat(torch.unbind(emb,1),1)

tensor([[True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, T

In [164]:
h= torch.tan(emb.view(-1,6) @ W1 + b1)

In [165]:
h

tensor([[ -0.0665,  -1.3373,   0.7257,  ...,  -0.0448,  -7.7892,  -7.8338],
        [ -1.9753,   2.0500,  -1.1315,  ...,  -0.5698,   0.5867,   5.2612],
        [  0.6016,   0.7166,  -0.6400,  ...,   1.4561,  -1.9683,  -0.8288],
        ...,
        [ -0.1505,  -1.5884,  -0.6301,  ...,  -0.2086,   3.9526,  -0.5199],
        [ -0.9674,   0.3879,  -1.8017,  ...,  -0.7731,  -3.0949,   3.8567],
        [  3.3939,  -4.9078,  -0.7281,  ...,   2.0472, -16.3581,  -0.6247]])

In [166]:
W2 = torch.randn((100,27))

b2 = torch.randn(27)

In [167]:
logits = h @ W2 +b2

In [168]:
logits.shape

torch.Size([32, 27])

In [169]:
counts = logits.exp()

In [170]:
# normalised
prob = counts / counts.sum(1,keepdims=True)

In [171]:
loss = -prob[torch.arange(32),Y].log().mean()

In [172]:
F.cross_entropy(logits,Y)

tensor(409.0743)

### arrange neural net 

In [173]:
# forward pass
  emb = C[Xtr[ix]] # (32, 3, 2)
  h = torch.tanh(emb.view(-1, 30) @ W1 + b1) # (32, 100)
  logits = h @ W2 + b2 # (32, 27)
  loss = F.cross_entropy(logits, Ytr[ix])
  #print(loss.item())
  
  # backward pass
  for p in parameters:
    p.grad = None
  loss.backward()
  

IndentationError: ignored