In [1]:
# We" always start with a dataset to train on. Let's download the tiny shakespeare dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2025-05-21 19:32:14--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2025-05-21 19:32:14 (90.7 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [2]:
# read it

with open('input.txt', 'r', encoding='utf-8') as f:
  text = f.read()



In [3]:
print("Length", len(text))

Length 1115394


In [4]:
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [5]:
# getting full vocab
chars = sorted(list(set(text)))
vocab_size = len(chars)

print(f"Vocabulary: {''.join(chars)}, Vocab Size: {vocab_size}")

Vocabulary: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz, Vocab Size: 65


In [6]:
# creating the mapping
itos = {i:s for i,s in enumerate(chars)}
stoi = {s:i for i,s in itos.items()}

encode = lambda l: [stoi[c] for c in l]
decode = lambda l: "".join([itos[c] for c in l])


print(encode('hii there'))
print(decode(encode('hii there')))

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


In [7]:
# use pytorch to access and store it
import torch # we use PyTorch: https://pytorch.org
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000]) # the 1000 characters we looked at earier will to the GPT look like this

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

In [8]:
# splitting data into train and test
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]


In [9]:
block_size = 8
train_data[:block_size + 1] # we gotta do block size + 1 cuz we'll use 8 chars as context to genrate the 9th one

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [10]:
xb = train_data[:block_size + 1]
yb = train_data[1:block_size + 1]


for t in range(block_size):
  print("Input", xb[:t+1])
  print("Output Given input", yb[t])



Input tensor([18])
Output Given input tensor(47)
Input tensor([18, 47])
Output Given input tensor(56)
Input tensor([18, 47, 56])
Output Given input tensor(57)
Input tensor([18, 47, 56, 57])
Output Given input tensor(58)
Input tensor([18, 47, 56, 57, 58])
Output Given input tensor(1)
Input tensor([18, 47, 56, 57, 58,  1])
Output Given input tensor(15)
Input tensor([18, 47, 56, 57, 58,  1, 15])
Output Given input tensor(47)
Input tensor([18, 47, 56, 57, 58,  1, 15, 47])
Output Given input tensor(58)


In [11]:
torch.manual_seed(1337)

batch_size = 4
block_size = 8

def get_batch(split):

  data = train_data if split=='train' else val_data
  ix = torch.randint(0,len(data)-block_size, (batch_size,))
  xb = torch.stack([data[i:i+block_size] for i in ix])
  yb = torch.stack([data[i+1:i+block_size+1] for i in ix])


  return xb, yb


xb,yb = get_batch('train')
print('inputs')
print(xb.shape)
print(xb)

print('outputs')
print(yb.shape)
print(yb)


for b in range(batch_size):

  for t in range(block_size):
    context = xb[b, :t+1]
    target = yb[b, t]

    print(f"Context: {context.tolist()}, target: {target}")

inputs
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
outputs
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
Context: [24], target: 43
Context: [24, 43], target: 58
Context: [24, 43, 58], target: 5
Context: [24, 43, 58, 5], target: 57
Context: [24, 43, 58, 5, 57], target: 1
Context: [24, 43, 58, 5, 57, 1], target: 46
Context: [24, 43, 58, 5, 57, 1, 46], target: 43
Context: [24, 43, 58, 5, 57, 1, 46, 43], target: 39
Context: [44], target: 53
Context: [44, 53], target: 56
Context: [44, 53, 56], target: 1
Context: [44, 53, 56, 1], target: 58
Context: [44, 53, 56, 1, 58], target: 46
Context: [44, 53, 56, 1, 58, 46], target: 39
Context: [44, 53, 56, 1, 58, 46, 39], target: 58
Context: [44, 53, 56, 1, 58, 46, 39, 5

## Baseline: Bigram Language Model

In [12]:
import torch
import torch.nn as nn
from torch.nn import functional as F

torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):

  def __init__(self, vocab_size):
    super().__init__()
    self.embedding_table = nn.Embedding(vocab_size, vocab_size)

  def forward(self, idx, target=None):

    logits = self.embedding_table(idx)
    B,T,C = logits.shape

    if target is not None:
      logits = logits.view(B*T,C)
      target = target.view(B*T,)
      loss = F.cross_entropy(logits, target)
    else:
      loss = None

    return logits, loss

  def generate(self, idx, max_num_tokens):
    for _ in range(max_num_tokens):
      logits, loss = self(idx) # B x T x C
      # print(logits.shape)
      logits = logits[:, -1, :] # take last layer dim alone
      probs = F.softmax(logits, dim=-1) # column wise for each batch

      idx_new = torch.multinomial(probs, num_samples=1)
      idx = torch.cat((idx, idx_new), dim=1) # concat along dim 1, so it becomes Bx T+1

    return idx



m = BigramLanguageModel(vocab_size)

logits, loss = m(xb, yb)
print(logits.shape, loss)
generated_result = m.generate(torch.zeros((1,1),dtype=torch.long), 100)
print(decode(generated_result[0].tolist()))

torch.Size([32, 65]) tensor(4.8786, grad_fn=<NllLossBackward0>)

Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


In [51]:
torch.zeros((1,1),dtype=torch.long)

tensor([[0]])

In [46]:
logits.shape, loss

(torch.Size([32, 65]), tensor(4.8786, grad_fn=<NllLossBackward0>))

In [41]:
emb = nn.Embedding(vocab_size, vocab_size)

emb_out = emb(xb)

print(xb.shape, emb_out.shape)



emb_out[:,-1,:] # assumption is that you're using the last word to generate

torch.Size([4, 65])

In [34]:
B,T,C = emb_out.shape

yb.view(-1)

tensor([43, 58,  5, 57,  1, 46, 43, 39, 53, 56,  1, 58, 46, 39, 58,  1, 58,  1,
        58, 46, 39, 58,  1, 46, 17, 27, 10,  0, 21,  1, 54, 39])

In [40]:
emb_out.view(B*T, C)

tensor([[-1.5101, -0.0948,  1.0927,  ..., -0.6126, -0.6597,  0.7624],
        [ 0.3323, -0.0872, -0.7470,  ..., -0.6716, -0.9572, -0.9594],
        [ 0.2475, -0.6349, -1.2909,  ...,  1.3064, -0.2256, -1.8305],
        ...,
        [-2.1910, -0.7574,  1.9656,  ..., -0.3580,  0.8585, -0.6161],
        [ 0.5978, -0.0514, -0.0646,  ..., -1.4649, -2.0555,  1.8275],
        [-0.6787,  0.8662, -1.6433,  ...,  2.3671, -0.7775, -0.2586]],
       grad_fn=<ViewBackward0>)

In [39]:
yb.view(B*T)

tensor([43, 58,  5, 57,  1, 46, 43, 39, 53, 56,  1, 58, 46, 39, 58,  1, 58,  1,
        58, 46, 39, 58,  1, 46, 17, 27, 10,  0, 21,  1, 54, 39])

In [87]:
# training the bigram model

optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [91]:
batch_size = 32
for _ in range(10000):
  xb, yb = get_batch('train')
  logits, loss = m(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()
print(loss.item())

2.4210023880004883


In [92]:
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_num_tokens=500)[0].tolist()))



Thaiby Lulfaseabot alt mpw thar, d is cthe parn gers b, ace t d t tllined y sss u dst hothin h.
An,

QUThiroligmushielowilit?-f s aishe se achyano OUST:
Juy, wn pr ber,
KE w r wourw. thate ma ore,
TOpacedairar me hileof rrut pit; chin! aire ur ye yon outhe hed;
Mut,
S:
Iforoupem blf ad s thoomat I:
Chthe!
A theamurswor RD anouponts vear he erm y s heasineisinel; wevin:
Th s y sthis:
G hestist, oriturea,
Whes,

Agallare bes T:
Toiniseinllldet y, perekeil fus m aghaked sikie, ke angupes bee tisha


## Attention: Intuition

What are we building?
Let's understand that first. First we want the tokens to communicate with one anohter. Currently they aren't.

How?

Most naive example is an average.
But, we only want present tokens interacting with past, not with future. how do we do that?

In [13]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [14]:
# naive way
torch.manual_seed(1337)
B,T,C = 4,8,3

x = torch.randn((B,T,C))
xbow = torch.zeros((B,T,C))
for b in range(B):
  for t in range(T):
    xprev = x[b,:t+1] # T,C
    xbow[b,t] = xprev.mean(axis=0)


In [15]:
# improve on this
'''
Thats a lot of for loops.
We can optimize this. Let's do it on a matrix instead
'''
xtril = torch.tril(torch.ones(T,T))
xtril /= torch.sum(xtril,dim=1,keepdim=True)

output = xtril @ x
torch.allclose(output,xbow)

True

In [16]:
# improve on this further
head_size = 16
Q = nn.Linear(C, head_size)
K = nn.Linear(C, head_size)
V = nn.Linear(C, head_size)

q = Q(x) # B T hs
k = K(x) # B T hs

wei = q @ k.transpose(-2,-1)

xtril = torch.tril(torch.ones(T,T))
wei = F.softmax(wei.masked_fill(xtril==0, value=float('-inf')) * head_size**0.5,dim=-1) # we divide by sqrt(head_size) to normalize. If we don't the values will look peaky

v = V(x) # B T HS
xbow = wei @ v
# torch.allclose(output,output2)


In [17]:
torch.softmax(torch.tensor([0.1, -0.2, 0.3]),dim=-1)

tensor([0.3376, 0.2501, 0.4123])

In [18]:
torch.softmax(torch.tensor([0.1, -0.2, 0.3])*8,dim=-1) # here if you see its sooo sharp towards the third option

tensor([0.1655, 0.0150, 0.8195])

In [19]:
torch.manual_seed(1337)

class MultiHeadAttention(nn.Module):
  def __init__(self, n_embd, head_size, block_size, num_heads):
    super().__init__()
    self.heads = nn.ModuleList([SelfAttention(n_embd, head_size, block_size) for _ in range(num_heads)])

  def forward(self, x):
    x = torch.cat([h(x) for h in self.heads],dim=-1)
    return x


class SelfAttention(nn.Module):
    def __init__(self, n_embd, head_size, block_size):
        super().__init__()
        self.q = nn.Linear(n_embd, head_size, bias=False)
        self.k = nn.Linear(n_embd, head_size, bias=False)
        self.v = nn.Linear(n_embd, head_size, bias=False)
        self.hs = head_size
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, token_emb):
        B,T,C = token_emb.shape
        q = self.q(token_emb) # B T HS
        k = self.k(token_emb) # B T HS
        v = self.v(token_emb) # B T HS

        wei = q @ k.transpose(-2,-1) # B T HS @ B HS T
        wei = wei.masked_fill(self.tril[:T,:T] == 0, float('-inf')) * self.hs ** 0.5
        wei = F.softmax(wei,dim=-1) # B T T
        xbow = wei @ v # B T T @ B T HS
        return xbow # B T HS


class BigramLanguageModel(nn.Module):

  def __init__(self, vocab_size, n_embd, block_size, head_size):
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
    self.position_embedding_table = nn.Embedding(block_size, n_embd)
    # self.sa_head = SelfAttention(n_embd, head_size, block_size)
    self.sa_heads = MultiHeadAttention(n_embd, head_size//num_heads, block_size, num_heads)
    self.lm_head = nn.Linear(head_size, vocab_size)

  def forward(self, idx, target=None):
    B,T = idx.shape
    token_emb = self.token_embedding_table(idx) #B,T,n_embd
    positional_output = self.position_embedding_table(torch.tensor(torch.arange(0,T),device=device)) #T,n_embd
    x = token_emb + positional_output # B T n_embd
    x = self.sa_heads(x) # B, T, HS
    logits = self.lm_head(x) # B,T,vocab_size


    if target is not None:
      B,T,C = logits.shape
      logits = logits.view(B*T,C)
      target = target.view(B*T,)
      loss = F.cross_entropy(logits, target)
    else:
      loss = None

    return logits, loss

  def generate(self, idx, max_num_tokens):
    for i in range(max_num_tokens):
      idx_needed = idx[:, -block_size:]
      logits, loss = self(idx_needed) # B x T x C
      # print(logits.shape)
      logits = logits[:, -1, :] # take last layer dim alone
      probs = F.softmax(logits, dim=-1) # column wise for each batch

      idx_new = torch.multinomial(probs, num_samples=1)
      idx = torch.cat((idx, idx_new), dim=1) # concat along dim 1, so it becomes Bx T+1
    return idx

@torch.no_grad()
def evaluate_model(model):
    '''
    get train loss, val loss
    '''
    model.eval()
    out_dict = {}
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)

        for k in range(eval_iters):
            xb,yb = get_batch(split)
            logits, loss = model(xb,yb)
            losses[k] = loss.item()
        out_dict[f'{split}_loss'] = losses.mean().item()
    model.train()
    return out_dict





eval_iters = 200
batch_size = 32
eval_interval = 300
max_steps = 10000
n_embd = 32
block_size = 8
head_size = 16
num_heads = 4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Detected Device", device)



m = BigramLanguageModel(vocab_size, n_embd, block_size, head_size).to(device)



Detected Device cpu


In [20]:
logits, loss = m(torch.randint(0,vocab_size,(1,1)))

  positional_output = self.position_embedding_table(torch.tensor(torch.arange(0,T),device=device)) #T,n_embd


In [21]:
logits.shape

torch.Size([1, 1, 65])

In [22]:
generated_result = m.generate(torch.zeros((1,1),dtype=torch.long), 1000)


  positional_output = self.position_embedding_table(torch.tensor(torch.arange(0,T),device=device)) #T,n_embd


In [23]:
print(decode(generated_result[0].tolist()))


tDrK-k;E;:MZflJNcytT&-wBPfMZRLNcNBaabFdFAytVxFyFU:fGikXDRSLTbUPL?$La.'jH;
kWKiyHzkmGsoCZg&auYkLsPNae'eC.iXN!pwJI-kC'j$OPdu!RafvAr.jUCivH$.WFEhgC?t:W'PBuGz;3Cf&;&tkXo$T.sTAFj.e; rCYh3,O.ztffkK.vEa
CMmPXcn?qeCkkVfMXnFF inUEmZvKZWmYEX:fcydFuFwC.3cnp'&fSTHrbfciCBofpypsfeYM?:MmtyzMrJnVXcODsGujnLNp'?cmZHu3YvFe.rshrIH,J,Z!ri,All$aS
;oukLtFBYz:aPe;e$Lu$aZeSpSncIbWYNmmgEqsXxBaJ!HPR$ApDUyQ:QnXjg
CRAua&.kyxuV;zY: Cb;sRNZF':I
EF;dKtK& '?,QePiFWm
UhXpmhrrvssn&:.qiajsAMliNiYPbDKFe.!d?JTHEMQsnkNf FsU dBcWxmgsY?VhSRc
.-lhP?pCs ,pZxs
CNW&aSKKeq:nhVpdN3,FR'piiGVwyxo3$VwacF$fWjoVBxNVrkgRzOGC!vGrrzm
gDLY.L Nsbc.3rMi$f
-F &.bBsQLqUbXUyAqja!hQw-.;TQeYq-KnjR$aMbkGG'h-jGFn3BoHFCXac
RfuSdJ$UJmuDsfkkWAgx.i-qqMPF hfJkwiETnOnuqpDgpy?grMiWGeREdMm.duYxNKhUEV?uD$UDq
YTRjX;3I$ hVPZNPVnCxqzmSlnYeWTM3nuHiPXbGexA'OSnvBSxLG saBaV-Ro3d-NrbBaxkq$jNgST3NFgVq$ u!eY&GG$SxHRlkSbK-
terOlos;eShueGHK?3SSAlI
Uo
eRWtEPtxE$Zi.H!CMfHmaXbdeFY;.YiGBY-HnfedbK NB?&yY
VTfYDhq?z&ypiGm.P&X,B,uk-DXnGszhb3BaeL!EDk-k
CQEAuH3iSDQmqbAoBgp$Zr!j!

In [50]:
class BatchNorm1d(nn.Module):

  def __init__(self, n_embd, momentum=1e-3, eps=1e-8, training=True):

    self.gamma = torch.ones((1,n_embd))
    self.beta = torch.zeros((1,n_embd))

    self.running_mean = torch.zeros((1,n_embd))
    self.running_var = torch.ones((1,n_embd))
    self.training = training
    self.eps = eps
    self.momentum = momentum

  def __call__(self,x):
    # x size - B,C
    if not self.training:
      xmean = self.running_mean
      xvar = self.running_var

    else:
      xmean = x.mean(0,keepdim=True) # 1,C
      xvar = x.var(0,keepdim=True) # 1,C

    x_bar = (x-xmean)/torch.sqrt(xvar+self.eps) # 1,c
    x_bar = self.gamma * x_bar + self.beta # 1, c
    if self.training:
      with torch.no_grad():
        self.running_mean = self.momentum * xmean + (1-self.momentum) * self.running_mean
        self.running_var = self.momentum * xmean + (1-self.momentum) * self.running_var

    return x_bar



In [51]:
bn = BatchNorm1d(n_embd)
x = torch.randn(4,n_embd)

In [52]:
out = bn(x)

In [55]:
out[:,0].mean()

tensor(0.)

In [54]:
out[:,2].std()

tensor(1.)

In [64]:
class LayerNorm:

  def __init__(self, n_embd, eps=1e-8):

    self.gamma = torch.ones((1,n_embd))
    self.beta = torch.zeros((1,n_embd))
    self.eps = eps

  def __call__(self,x):
    # x size - B,C

    xmean = x.mean(1,keepdim=True) # B,1
    xvar = x.var(1,keepdim=True) # B,1

    x_bar = (x-xmean)/torch.sqrt(xvar+self.eps) # B,C
    x_bar = self.gamma * x_bar + self.beta # B,C

    return x_bar

  def parameters(self):
    return [self.gamma + self.beta]



In [66]:
x = torch.randn(4,n_embd)
ln = LayerNorm(n_embd)
out = ln(x)
out[2,:].mean(), out[2,:].std()

(tensor(-1.1176e-08), tensor(1.))