 - From https://www.youtube.com/watch?v=kCc8FmEb1nY&t=3895s
 - Repo: https://github.com/karpathy/nanoGPT
 - google colab book: https://colab.research.google.com/drive/1JMLa53HDuA-i7ZBmqV7ZnA3c_fvtXnx-?usp=sharing


## 1. Importing library

In [1]:
import numpy as np     
import pandas as pd    
import matplotlib.pyplot as plt        
import torch           


In [2]:
has_mps = torch.backends.mps.is_built()
has_mps

True

In [3]:
torch.cuda.is_available()

False

## 2. Get Data

In [4]:
!pwd

/Users/abhishekde/Desktop/Projects/DS/pytorch


In [5]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2024-11-28 11:20:59--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2024-11-28 11:20:59 (11.6 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [6]:
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [8]:
len(text)

1115394

In [11]:
text[:1000]

"First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you know Caius Marcius is chief enemy to the people.\n\nAll:\nWe know't, we know't.\n\nFirst Citizen:\nLet us kill him, and we'll have corn at our own price.\nIs't a verdict?\n\nAll:\nNo more talking on't; let it be done: away, away!\n\nSecond Citizen:\nOne word, good citizens.\n\nFirst Citizen:\nWe are accounted poor citizens, the patricians good.\nWhat authority surfeits on would relieve us: if they\nwould yield us but the superfluity, while it were\nwholesome, we might guess they relieved us humanely;\nbut they think we are too dear: the leanness that\nafflicts us, the object of our misery, is as an\ninventory to particularise their abundance; our\nsufferance is a gain to them Let us revenge this with\nour pikes, ere we become rakes: for the gods know I\nspeak this in hunger 

In [12]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print("".join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [21]:
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}


In [22]:
"h" in stoi

True

In [23]:
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: "".join([itos[c] for c in l])

In [24]:
print(encode("hii there"))

[46, 47, 47, 1, 58, 46, 43, 56, 43]


In [25]:
print(decode(encode("hii there")))

hii there


In [26]:
import torch

In [31]:
data = torch.tensor(encode(text), dtype = torch.long)

In [28]:
print(data.shape, data.dtype)
print(data[:100])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [29]:
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

In [32]:
block_size = 8
train_data[: block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [34]:
x = train_data[:block_size]
y = train_data[1: block_size+1]
print(train_data[:block_size+1])
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context}, target is {target}")

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])
when input is tensor([18]), target is 47
when input is tensor([18, 47]), target is 56
when input is tensor([18, 47, 56]), target is 57
when input is tensor([18, 47, 56, 57]), target is 58
when input is tensor([18, 47, 56, 57, 58]), target is 1
when input is tensor([18, 47, 56, 57, 58,  1]), target is 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]), target is 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]), target is 58


In [36]:
torch.manual_seed(1332)
batch_size = 4 # how many independent sequences will be processed in parallel
block_size = 8 # what is the maximum length of predictions?

def get_batch(split):
    data = train_data if split=="train" else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1: i+1+block_size] for i in ix])
    return x, y


xb, yb = get_batch("train")
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)



inputs:
torch.Size([4, 8])
tensor([[44,  1, 61, 47, 58, 46,  1, 63],
        [27, 44,  1, 53, 59, 56,  1, 45],
        [40, 59, 58,  1, 39,  1, 57, 50],
        [ 1, 61, 46, 39, 58,  6,  1, 53]])
targets:
torch.Size([4, 8])
tensor([[ 1, 61, 47, 58, 46,  1, 63, 53],
        [44,  1, 53, 59, 56,  1, 45, 56],
        [59, 58,  1, 39,  1, 57, 50, 47],
        [61, 46, 39, 58,  6,  1, 53,  5]])


In [38]:
for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t+1]
        target = yb[b, t]
        print(f"when input is {context}, target is {target}")

when input is tensor([44]), target is 1
when input is tensor([44,  1]), target is 61
when input is tensor([44,  1, 61]), target is 47
when input is tensor([44,  1, 61, 47]), target is 58
when input is tensor([44,  1, 61, 47, 58]), target is 46
when input is tensor([44,  1, 61, 47, 58, 46]), target is 1
when input is tensor([44,  1, 61, 47, 58, 46,  1]), target is 63
when input is tensor([44,  1, 61, 47, 58, 46,  1, 63]), target is 53
when input is tensor([27]), target is 44
when input is tensor([27, 44]), target is 1
when input is tensor([27, 44,  1]), target is 53
when input is tensor([27, 44,  1, 53]), target is 59
when input is tensor([27, 44,  1, 53, 59]), target is 56
when input is tensor([27, 44,  1, 53, 59, 56]), target is 1
when input is tensor([27, 44,  1, 53, 59, 56,  1]), target is 45
when input is tensor([27, 44,  1, 53, 59, 56,  1, 45]), target is 56
when input is tensor([40]), target is 59
when input is tensor([40, 59]), target is 58
when input is tensor([40, 59, 58]), ta

In [39]:
import torch
import torch.nn as nn   
import torch.nn.functional as F
torch.manual_seed(1332)

<torch._C.Generator at 0x7f9c7a0ae1b0>

In [40]:
nn.Embedding?

[0;31mInit signature:[0m
[0mnn[0m[0;34m.[0m[0mEmbedding[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mnum_embeddings[0m[0;34m:[0m [0mint[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0membedding_dim[0m[0;34m:[0m [0mint[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpadding_idx[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mint[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_norm[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mfloat[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mnorm_type[0m[0;34m:[0m [0mfloat[0m [0;34m=[0m [0;36m2.0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mscale_grad_by_freq[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msparse[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0m_weight[0m[0;34m:[0m [0mUni

In [44]:
em = nn.Embedding(10, 10)
em

Embedding(10, 10)

In [45]:
dir(em)

['T_destination',
 '__annotations__',
 '__call__',
 '__class__',
 '__constants__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_apply',
 '_backward_hooks',
 '_backward_pre_hooks',
 '_buffers',
 '_call_impl',
 '_compiled_call_impl',
 '_fill_padding_idx_with_zero',
 '_forward_hooks',
 '_forward_hooks_always_called',
 '_forward_hooks_with_kwargs',
 '_forward_pre_hooks',
 '_forward_pre_hooks_with_kwargs',
 '_get_backward_hooks',
 '_get_backward_pre_hooks',
 '_get_name',
 '_is_full_backward_hook',
 '_load_from_state_dict',
 '_load_state_dict_post_hooks',
 '_load_state_dict_pre_hooks',
 '_maybe_warn_non_full_backward_hook',
 '_modules

In [46]:
em.weight

Parameter containing:
tensor([[-1.0175, -0.2013,  0.7380,  0.6510, -0.0429, -0.4522, -1.5231,  0.3527,
          0.3391,  0.5226],
        [-1.4219, -0.0108, -0.3183,  1.1943, -1.1718, -0.1582, -1.7327, -0.5154,
         -2.0280, -2.1273],
        [-0.0853, -0.3235,  0.3193, -0.6140, -1.3470,  0.3112,  0.2305,  0.2442,
         -0.1316, -1.1528],
        [ 1.0391, -1.7309,  1.6593,  0.0595, -0.2043, -0.8348, -1.3504,  1.7978,
          0.2779,  0.0637],
        [-1.8127,  0.2159, -0.3877,  1.3709, -0.4423,  0.5503,  1.1506,  0.9796,
          0.8442, -0.0377],
        [-0.7907,  1.4548, -1.2190,  0.5885,  0.0604,  1.7116,  0.9181,  0.2841,
         -1.2448,  0.5812],
        [ 0.4969, -0.4833,  0.9447, -0.4740, -1.5136, -1.4680,  0.5735, -0.0234,
          0.4481,  0.5951],
        [ 0.7591, -0.4095,  0.1171, -1.0498, -0.4200, -1.5601, -0.9316, -1.5470,
         -0.0929,  2.1709],
        [ 1.0895,  1.9371,  0.0048,  0.5337, -1.1086,  0.2672,  0.6059,  0.1532,
         -1.0170,  0.1772

In [47]:
vocab_size

65

In [62]:
torch.multinomial?

[0;31mDocstring:[0m
multinomial(input, num_samples, replacement=False, *, generator=None, out=None) -> LongTensor

Returns a tensor where each row contains :attr:`num_samples` indices sampled
from the multinomial (a stricter definition would be multivariate,
refer to torch.distributions.multinomial.Multinomial for more details)
probability distribution located in the corresponding row
of tensor :attr:`input`.

.. note::
    The rows of :attr:`input` do not need to sum to one (in which case we use
    the values as weights), but must be non-negative, finite and have
    a non-zero sum.

Indices are ordered from left to right according to when each was sampled
(first samples are placed in first column).

If :attr:`input` is a vector, :attr:`out` is a vector of size :attr:`num_samples`.

If :attr:`input` is a matrix with `m` rows, :attr:`out` is an matrix of shape
:math:`(m \times \text{num\_samples})`.

If replacement is ``True``, samples are drawn with replacement.

If not, they are d

In [100]:
block_size

8

In [106]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device


'cpu'

In [None]:
## BiGram language model
n_embd = 32 
class BiGramLanguageModel(nn.Module):
    
    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)
        
        
    def forward(self, idx, targets=None):
        B, T = idx.shape
        
        tok_emb = self.token_embedding_table(idx)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))
        x = tok_emb + pos_emb
        logits = self.lm_head(x)
        
        if targets is None:
            loss = None
        else: 
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss
    
    
    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self(idx)
            logits = logits[:, -1, :]
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx
      
    
m = BiGramLanguageModel()
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)
        
        
        
idx = torch.zeros((1,1), dtype=torch.long)
print(decode())
    

TypeError: __init__() takes 1 positional argument but 2 were given

In [68]:
idx = torch.zeros((1,1), dtype=torch.long)
decode(m.generate(idx, max_new_tokens = 100)[0].tolist())

"\nVm-Yb;oU3gcqLiBN-&YDOlmlTDJAohXTR$NDXuB,WJpA\ny?!kMnABVPmQlWgHB3,oc'qCbvlnyM:TZkPopJnG&vrCoIrs&-Skt Z"

In [69]:
# create a pytorch optimizer object
optimizer = torch.optim.AdamW(m.parameters(), lr = 1e-3)

In [76]:
# Train the BiGram model
batch_size = 32
for steps in range(10000):
    
    # sample a batch of data
    xb, yb = get_batch("train")
    
    # evaluate loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    
print(loss.item())
    

2.423017978668213


In [78]:
idx = torch.zeros((1,1), dtype=torch.long)
decode(m.generate(idx, max_new_tokens = 1000)[0].tolist())

"\nHigamee ut ares atru poo s l-ieilouney halavire t on blis.\nFII' cughio tho y ners? If wesith \nPrs d gindu ave thile monghe in mimbivefro m merathand I y k's an y. A:\nGulursouitwolthe w ARIG macore hicieazererk y ice r ipe to pass apthis, kivee urkn Who wnd the? tovepe s hillindorweovistomary LAnenoviof aren tond fos athe hat my more, y 'd, y winf\n\nDor s putheye ale'sereacat IUKE:'d ithyowsblly l!\n\nHAnd and hethy, t mirs ajerantas Buime o mseditithall alou s,\nFe j; s IOLidin ick hmy y\nI'Rilyoll:\nAR: tcerare ave; lon, qu tlicla ss h, sorgs acaroves s pllos f mave vaslase one fingo w, sofeanthe, ith icintharind y thed ghan ICHathes HAn s heous ajove ind be 'd,\nY:\nAUThowein: 'sppres fe h intout\nMy lled s:\nIO'sene or'st he.\nBuchu t t bove mbr non\nINClouleerf ch mase apserr revetothinerdothowe d nthe wofod nqu ETRCARS:\nWebrunlliserat,\nRENG ppthe?\nAle the playondo fally blom ryouthe ts:\nTes m w le'lendondyore prsumso isagaspevantsasiove oulller\nAR:\nTUSTCEWelplaryord g

### The mathematical trick in self-attention

In [None]:
torch.manual_seed(1332)
B, T, C = 4, 8, 2 # batch, time, channels
x = torch.rand(B,T,C)
x.shape

torch.Size([4, 8, 2])

In [98]:
torch.allclose(x_bow, x_bow2)

True

In [81]:
x[:2, :2, :2]

tensor([[[0.2151, 0.3679],
         [0.7507, 0.9052]],

        [[0.3940, 0.9941],
         [0.4775, 0.9544]]])

In [82]:
x[0]

tensor([[0.2151, 0.3679],
        [0.7507, 0.9052],
        [0.2884, 0.7917],
        [0.2249, 0.1734],
        [0.0311, 0.2682],
        [0.9956, 0.7544],
        [0.3207, 0.2828],
        [0.6517, 0.0474]])

In [None]:
# version 1
x_bow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1]
        x_bow[b,t] = torch.mean(xprev, 0)

In [85]:
x_bow[0]

tensor([[0.2151, 0.3679],
        [0.4829, 0.6366],
        [0.4180, 0.6883],
        [0.3697, 0.5596],
        [0.3020, 0.5013],
        [0.4176, 0.5435],
        [0.4038, 0.5062],
        [0.4348, 0.4489]])

In [89]:
torch.tril(torch.ones((3,3)))

tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])

In [99]:
# version 2
wei = torch.tril(torch.ones((T,T)))
wei = wei/torch.sum(wei, 1, keepdim=True) 
x_bow2 = wei @ x # (B, T, T) @ (B, T, C)  -> (B, T, C)
x_bow2[0]

tensor([[0.2151, 0.3679],
        [0.4829, 0.6366],
        [0.4180, 0.6883],
        [0.3697, 0.5596],
        [0.3020, 0.5013],
        [0.4176, 0.5435],
        [0.4038, 0.5062],
        [0.4348, 0.4489]])

In [92]:
torch.manual_seed(32)
a = torch.tril(torch.ones((3,3)))
a = a/torch.sum(a, 1, keepdim=True)
b = torch.randint(0, 10, (3,2)).float()
c = a @ b
print(f"a={a}")
print(f"b={b}")
print(f"c={c}")

a=tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
b=tensor([[5., 3.],
        [3., 4.],
        [4., 6.]])
c=tensor([[5.0000, 3.0000],
        [4.0000, 3.5000],
        [4.0000, 4.3333]])


In [93]:
torch.matmul(a, b) # same as a @ b

tensor([[5.0000, 3.0000],
        [4.0000, 3.5000],
        [4.0000, 4.3333]])

In [109]:
# version 4: self attention
torch.manual_seed(32)
B, T, C = 4, 8, 32
x = torch.randn(B, T, C)

tril = torch.tril(torch.ones(T,T))
wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float("-inf"))
wei = F.softmax(wei, dim=1)
out = wei @ x
print(out.shape)  



torch.Size([4, 8, 32])
