<a href="https://colab.research.google.com/github/alibekk93/NLP_practice/blob/main/Micrograd_tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Andrej Karpathy's Makemore-1 tutorial from https://www.youtube.com/watch?v=PaCmpygFfXo and https://github.com/karpathy/nn-zero-to-hero/tree/master/lectures/makemore

https://karpathy.ai/zero-to-hero.html

# Libraries

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from tqdm import tqdm

torch.manual_seed(1993)

<torch._C.Generator at 0x7fd23433e2d0>

# Shakespeare

## Setup

In [None]:
#@title ##### download data
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [None]:
#@title ##### open file
# Read the entire downloaded corpus into a single in-memory string.
with open('input.txt', mode='r', encoding='utf-8') as corpus_file:
  shakespeareText = corpus_file.read()

In [None]:
#@title ##### looking at the data
# Report the corpus size, then preview its first 200 characters.
corpus_length = len(shakespeareText)
print(f'Length of dataset in characters: {corpus_length}')
print(shakespeareText[:200])

Length of dataset in characters: 1115394
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you


In [None]:
#@title ##### setup character vocab
# The vocabulary is every distinct character of the corpus, in sorted order.
shakespeareChars = sorted(set(shakespeareText))
shakespeareVocabSize = len(shakespeareChars)
print(''.join(shakespeareChars), f'Vocab size is {shakespeareVocabSize}', sep='\n')


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Vocab size is 65


In [None]:
#@title ##### mapping characters and indices
# Two lookup tables: index -> character, and its inverse character -> index.
shakespeareItoS = dict(enumerate(shakespeareChars))
shakespeareStoI = {ch: i for i, ch in shakespeareItoS.items()}

def shakespeareEncode(s):
  """Turn a string into the list of vocabulary indices of its characters."""
  return [shakespeareStoI[c] for c in s]

def shakespeareDecode(l):
  """Turn an iterable of vocabulary indices back into a string."""
  return ''.join(shakespeareItoS[i] for i in l)

# round trip: encode a sample sentence, then decode it back
sample_ids = shakespeareEncode('Shakespeare is cool!')
print(sample_ids)
print(shakespeareDecode(sample_ids))

# a more complex tokenizer by Google: https://github.com/google/sentencepiece
# a more complex tokenizer by OpenAI: https://github.com/openai/tiktoken

[31, 46, 39, 49, 43, 57, 54, 43, 39, 56, 43, 1, 47, 57, 1, 41, 53, 53, 50, 2]
Shakespeare is cool!


In [None]:
#@title ##### encoding Shakespeare into a tensor
# One integer token id per character of the corpus.
data = torch.tensor(
    shakespeareEncode(shakespeareText),
    dtype=torch.int64,  # torch.long is an alias of torch.int64
)
print(data.shape, data.dtype)
print(data[:200])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59])


In [None]:
#@title ##### splitting train and validation sets
# The first 10% of the tokens are held out for validation; the remaining 90% are used for training.
n = int(0.1 * len(data))
val_data, train_data = data[:n], data[n:]

## Data loader

In [None]:
#@title ##### setting up batch parameters
# batch_size: how many independent sequences are sampled into one batch
# block_size: maximum context length
batch_size, block_size = 4, 8

In [None]:
#@title ##### function to get a batch
def get_batch(validation=False):
  """Sample a random batch of input windows x and target windows y.

  Reads the notebook globals `train_data`, `val_data`, `batch_size` and
  `block_size`.

  Args:
    validation: draw from `val_data` when truthy, otherwise from `train_data`.

  Returns:
    (x, y): two stacked tensors with `batch_size` rows each. Every row of x is
    a window of `block_size` consecutive tokens; the matching row of y is the
    window that starts one position later in the same source.
  """
  if validation:
    source = val_data
  else:
    source = train_data
  # random window starts; the upper bound leaves room for the shifted target window
  starts = torch.randint(len(source) - block_size, (batch_size,))
  inputs, targets = [], []
  for start in starts:
    inputs.append(source[start:start + block_size])
    targets.append(source[start + 1:start + block_size + 1])
  return torch.stack(inputs), torch.stack(targets)

In [None]:
#@title ##### example: getting a batch
xb, yb = get_batch()
print('inputs:', xb.shape, xb, sep='\n')
print('targets:', yb.shape, yb, sep='\n')

print('----')

# spell out every (context -> next token) training example packed inside the batch
for b in range(batch_size):
  for t in range(block_size):
    context, target = xb[b, :t+1], yb[b, t]
    print(f'when input is {context.tolist()} the target is {target}')

inputs:
torch.Size([4, 8])
tensor([[41, 53, 51, 51, 39, 52, 42, 43],
        [57, 41, 53, 56, 43,  1, 51, 43],
        [58,  1, 63, 53, 59, 56,  1, 47],
        [50, 53, 59, 56,  8,  1, 35, 46]])
targets:
torch.Size([4, 8])
tensor([[53, 51, 51, 39, 52, 42, 43, 42],
        [41, 53, 56, 43,  1, 51, 43,  1],
        [ 1, 63, 53, 59, 56,  1, 47, 52],
        [53, 59, 56,  8,  1, 35, 46, 39]])
----
when input is [41] the target is 53
when input is [41, 53] the target is 51
when input is [41, 53, 51] the target is 51
when input is [41, 53, 51, 51] the target is 39
when input is [41, 53, 51, 51, 39] the target is 52
when input is [41, 53, 51, 51, 39, 52] the target is 42
when input is [41, 53, 51, 51, 39, 52, 42] the target is 43
when input is [41, 53, 51, 51, 39, 52, 42, 43] the target is 42
when input is [57] the target is 41
when input is [57, 41] the target is 53
when input is [57, 41, 53] the target is 56
when input is [57, 41, 53, 56] the target is 43
when input is [57, 41, 53, 56, 43]

## Modelling

### Bigram

In [None]:
#@title ##### class for Bigram Language Model
class BigramLanguageModel(nn.Module):
  """Bigram language model: next-token logits depend only on the current token.

  The only parameter is a (vocab_size, vocab_size) embedding table; row i holds
  the logits for the token that follows token i.
  """

  def __init__(self, vocab_size):
    """Create the lookup table.

    Args:
      vocab_size: number of distinct tokens; used both as the number of rows
        and as the row width of the table.
    """
    super().__init__()
    # each token directly reads off the logits for the next token from a lookup table
    self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

  def forward(self, idx, targets=None):
    """Compute next-token logits and, when targets are given, the loss.

    Args:
      idx: (B,T) tensor of integer token indices.
      targets: optional (B,T) tensor of integer next-token indices.

    Returns:
      (logits, loss). Without targets, logits is (B,T,C) and loss is None.
      With targets, logits is flattened to (B*T,C) and loss is the
      cross-entropy between the flattened logits and the flattened targets.
    """
    logits = self.token_embedding_table(idx) # (B,T,C)
    # identity test: `== None` would dispatch to the tensor's `==` operator instead
    if targets is None:
      return logits, None
    # F.cross_entropy wants the class dimension second, so fold batch and time together
    B, T, C = logits.shape
    logits = logits.reshape(B*T, C)
    # reshape (unlike view) also accepts non-contiguous targets
    targets = targets.reshape(B*T)
    loss = F.cross_entropy(logits, targets)
    return logits, loss

  @torch.no_grad()
  def generate(self, idx, max_new_tokens):
    """Extend every sequence in idx by max_new_tokens sampled tokens.

    Runs under torch.no_grad(): sampling never back-propagates, so no autograd
    graph is recorded for the forward passes.

    Args:
      idx: (B,T) tensor of indices in the current context.
      max_new_tokens: number of tokens to sample and append.

    Returns:
      (B, T+max_new_tokens) tensor: idx followed by the sampled tokens.
    """
    for _ in range(max_new_tokens):
      # predict; no targets are passed, so the returned loss is None and is dropped
      logits = self(idx)[0]
      # focus on last time step
      logits = logits[:, -1, :] # (B,C)
      # get probabilities using softmax
      probs = F.softmax(logits, dim=-1)
      # sample from distribution
      idx_next = torch.multinomial(probs, num_samples=1) # (B,1)
      # append sample to the running sequence
      idx = torch.cat((idx, idx_next), dim=1) # (B,T+1)
    return idx

In [None]:
#@title ##### example: initializing a Bigram model
m_bigram = BigramLanguageModel(shakespeareVocabSize)
# forward pass of the untrained model on the example batch
logits, loss = m_bigram(xb, yb)
print(logits.shape, loss, sep='\n')
# set initial idx as a new line
idx = torch.zeros((1,1), dtype=torch.long)
generated = m_bigram.generate(idx, max_new_tokens=100)
print(shakespeareDecode(generated[0].tolist()))

torch.Size([32, 65])
tensor(4.8409, grad_fn=<NllLossBackward0>)

h?pLpEVk!3VauGxoDWjF$plVZ,
d'?NqeYo,d
dOOKymbA3d aDMzgsk$sTkOWQ,dTipQN:muGtK!G&I.jJ3N;.3vFhRKiS!Kx J


In [None]:
#@title ##### creating a PyTorch optimizer
# AdamW over every parameter of the bigram model
learning_rate = 1e-3
optimizer = torch.optim.AdamW(params=m_bigram.parameters(), lr=learning_rate)

In [None]:
#@title ##### training the model
# NOTE: get_batch() reads the global batch_size, so this assignment also changes
# the batch size of every get_batch() call made after this cell has run
batch_size = 32
for step in tqdm(range(20_000)):
  # clear the gradients left over from the previous step
  optimizer.zero_grad(set_to_none=True)
  # sample a batch and evaluate the loss on it
  xb, yb = get_batch()
  logits, loss = m_bigram(xb, yb)
  # back-propagate and update the parameters
  loss.backward()
  optimizer.step()

# blank line, then the loss of the last training batch only
print('', loss.item(), sep='\n')

100%|██████████| 20000/20000 [00:38<00:00, 526.01it/s]

2.432394504547119





In [None]:
# sample 100 new tokens from the model, continuing from the context stored in idx
sampled = m_bigram.generate(idx, max_new_tokens=100)
print(shakespeareDecode(sampled[0].tolist()))


Thin moteffodyourare br hat ticis I o; burishayf Wh ait
ARCOu He-t be me r giond nd th o t OFirarmif
