# Makemore 3

In [1]:
import torch
import matplotlib.pyplot as plt
import torch.nn.functional as F
%matplotlib inline

In [2]:
words =  open("names.txt",'r').read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [3]:
len(words)

32033

#### build vocab of chars and mapping from/to integers

In [4]:
chars =  list(set(''.join(words)))
stoi = {ch:i+1 for i,ch in enumerate(chars)}
stoi['.'] = 0
itos = { i:ch for ch,i in stoi.items()}
vocab_size = len(stoi)
print(stoi)
print("----------------------------")
print(vocab_size)

{'m': 1, 'k': 2, 'h': 3, 'v': 4, 'u': 5, 'l': 6, 'd': 7, 'c': 8, 'a': 9, 'p': 10, 'b': 11, 'n': 12, 'r': 13, 'e': 14, 'j': 15, 'o': 16, 't': 17, 'g': 18, 'f': 19, 's': 20, 'y': 21, 'z': 22, 'i': 23, 'w': 24, 'x': 25, 'q': 26, '.': 0}
----------------------------
27


In [5]:
# create the dataset with block_size and label
block_size = 3
def build_dataset(words):
    context = [0]*block_size
    X,Y = [],[]
    for w in words:
        # print(w)
        for ch in w+'.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            # print(''.join([itos[x]for x in context])," ----> ",itos[ix])
            context = context[1:] + [ix]
    X,Y = torch.tensor(X),torch.tensor(Y)
    print(X.shape,Y.shape)
    return X,Y

import random
random.seed(42)
random.shuffle(words)

n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

Xtr,Ytr = build_dataset(words[:n1])      #80%
Xdev,Ydev = build_dataset(words[n1:n2])  #10%
Xte,Yte = build_dataset(words[n2:])      #10%

torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])


### Parameters init

In [7]:
# khaiming normal init for tanh non linerality 
# kn = (5/3) / (n_embd*block_size)**0.5

### [batch noramlization](https://arxiv.org/pdf/1502.03167)

In [8]:
# take the mean of each nueron activation values 
# subtract the mean from hpreact and divide by std of the same
# add gain and shift (equivalant to w and b) 
# so nuerons can learn learn those features of the distribution from data

In [9]:
n_embd = 10 # the dimentionality of the character embedding vector
n_hidden=200 # number of nuerons in hidden layer of the MLP

g  = torch.Generator().manual_seed(2147483647)  # for reproducibility
C  = torch.randn((vocab_size,n_embd),           generator=g)
W1 = torch.randn((n_embd*block_size,n_hidden),  generator=g) * (5/3)/((n_embd * block_size)**0.5) #* 0.2
# b1 = torch.randn(n_hidden,                      generator=g) * 0.01 
W2 = torch.randn((n_hidden,vocab_size),         generator=g) * 0.01
b2 = torch.randn(vocab_size,                    generator=g) * 0 

bngain = torch.ones((1,n_hidden))
bnbias = torch.randn((1,n_hidden))

bnmean_running = torch.zeros((1,n_hidden))
bnstd_running = torch.ones((1,n_hidden))

parameters = [C,W1,W2,b2,bngain,bnbias]
print(sum(p.nelement() for p in parameters)) # total number of parameters
for p in parameters:
    p.requires_grad = True


12097


In [11]:
max_steps = 200000
batch_size = 32
lossi = []
for i in range(200000):
    
    #minibatch creation
    ix = torch.randint(0,Xtr.shape[0],(batch_size,),generator=g)
    Xb,Yb = Xtr[ix],Ytr[ix]
    
    # Forward Pass
    emb = C[Xb] # embed chars into vectors
    embcat = emb.view(emb.shape[0],-1) # concatenate the vectors
    hpreact = embcat @ W1 #+ b1   # hidden layer pre-activation
    
    # batchnorm layer
    bnmeani = hpreact.mean(0,keepdim=True)
    bnstdi = hpreact.std(0,keepdim=True)
    hpreact = bngain * (hpreact - bnmeani / bnstdi) + bnbias
    
    # nonlinear Layer
    h = torch.tanh(hpreact)   #hidden layer 
    logits = h @ W2 + b2   #output layer
    loss = F.cross_entropy(logits,Yb) #loss function
    
    with torch.no_grad():
        bnmean_running = bnmean_running * 0.999 + 0.001 * bnmeani
        bnstd_running = bnstd_running * 0.999 + 0.001 * bnstdi
        
    #  backward pass
    for p in parameters:
        p.grad = None
    loss.backward()
    
    lr = 0.1 if i < 100000 else 0.01
    for p in parameters:
        p.data += -lr * p.grad

    # track stats
    if i%10000 == 0:
        print(f'{i:7d}/{max_steps:7d}: {loss.item():4f}')
#         break
    lossi.append(loss.log10().item())

      0/ 200000: 2.042166
  10000/ 200000: 1.997786
  20000/ 200000: 2.348114
  30000/ 200000: 2.022869
  40000/ 200000: 2.180098
  50000/ 200000: 2.016807
  60000/ 200000: 2.273655
  70000/ 200000: 2.351571
  80000/ 200000: 2.079373
  90000/ 200000: 1.781339
 100000/ 200000: 2.128270
 110000/ 200000: 2.275824
 120000/ 200000: 2.084750
 130000/ 200000: 1.645144
 140000/ 200000: 1.859368
 150000/ 200000: 1.959660
 160000/ 200000: 1.942291
 170000/ 200000: 1.736757
 180000/ 200000: 2.109583
 190000/ 200000: 2.178949


In [12]:
# since we coupled the batch of training examples and with mean and std 
# but during inference we are only using single example at a time 
# we have to caliberate the inference by calculating mean and std of traning data 

In [14]:
# caliberate the inference by calculating mean std over training set

with torch.no_grad():
    emb = C[Xtr]
    embcat = emb.view(-1,n_embd*block_size)
    hpreact = embcat @ W1 #+ b1
    bnmean = hpreact.mean(0,keepdim=True)
    bnstd = hpreact.std(0,keepdim=True)

In [None]:
# above cell is understanding purpose only , running mean and running std is used in production

### training loss 

In [None]:
@torch.no_grad() # this decorator disables gradient tracking 
def split_losss(split):
    X,Y={
        'train': (Xtr,Ytr) ,
        'val':   (Xdev,Ydev) ,
        'test':  (Xte,Yte)
    }[split]
    emb = C[X] # (32, 3, 2)
    embcat = emb.view(emb.shape[0],-1)
    hpreact = embcat @ W1 #+ b1  
    hpreact = bngain * (hpreact-bnmean_running / bnstd_running) + bnbias
    h = torch.tanh(hpreact)
    logits = h @ W2 + b2 
    loss = F.cross_entropy(logits, Y)
    print(split,loss.item())

split_losss("train")
split_losss("val")

In [None]:
# sample from the model
g = torch.Generator().manual_seed(2147483647 + 10)

for _ in range(20):
    
    out = []
    context = [0] * block_size # initialize with all ...
    while True:
        emb = C[torch.tensor([context])] # (1,block_size,d)
        embcat = emb.view(emb.shape[0],-1)
        hpreact = embcat @ W1 #+ b1 
        hpreact = bngain * (hpreact-bnmean_running / bnstd_running) + bnbias
        h = torch.tanh(hpreact)
        logits = h @ W2 + b2
        probs = F.softmax(logits, dim=1)
        # Sample from the distribution
        ix = torch.multinomial(probs, num_samples=1, generator=g).item()
        # shift the context window and track the smaples 
        context = context[1:] + [ix]
        # if  we sample the special '.' token, break
        out.append(ix)
        if ix == 0:
            break
    
    print(''.join(itos[i] for i in out)) # decode and print the generated word