<a href="https://colab.research.google.com/github/bd1ng/mini-gpt-tutorial/blob/main/mini_gpt_tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Obtain dummy text (raw)
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2025-11-10 13:58:39--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2025-11-10 13:58:39 (30.3 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [None]:
# Read the whole corpus into memory as a single string.
with open('input.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [None]:
# Tokenize Text: character-level vocabulary built from the corpus itself.

chars = sorted(set(text))             # unique characters in a deterministic order
dec = dict(enumerate(chars))          # integer id -> character
enc = {ch: i for i, ch in dec.items()}  # character -> integer id


def encode(s):
    """Turn a string into the list of integer ids of its characters."""
    return [enc[c] for c in s]


def decode(l):
    """Turn an iterable of integer ids back into the string they spell."""
    return ''.join(dec[c] for c in l)


# Round-trip sanity check.
encoded = encode('hi there!')
print(encoded)
print(decode(encoded))


[46, 47, 1, 58, 46, 43, 56, 43, 2]
hi there!


In [None]:
# start torch
import torch
# Encode the entire corpus once and keep it as a 1-D int64 tensor of token ids.
data = torch.tensor(
    encode(text),
    dtype=torch.long,
)

In [None]:
# train/val splits
#
# Alternative three-way split, kept for reference only (not executed):
#   n_train = int(0.8 * len(data))
#   n_val = int(0.9 * len(data))
#   train_data = data[:n_train]      # first 80% train
#   val_data = data[n_train:n_val]   # next 10% will be val
#   test_data = data[n_val:]         # final 10% will be test

n = int(0.9 * len(data))  # index at which the validation tail starts
train_data, val_data = data[:n], data[n:]  # first 90% train, rest val


In [None]:
block_size = 8  # context length: how many tokens one training chunk feeds the model
train_data[:block_size+1]  # block_size+1 tokens yield block_size (context -> next token) pairs

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [None]:
torch.manual_seed(1337)  # fix the RNG so the randomly sampled batch below is reproducible
batch_size = 4 # independent sequences to run in parallel
block_size = 8 # max context length for predictions

def get_batch(split):
  """Sample one random batch of (inputs, targets) from the chosen split.

  ``split == 'train'`` reads from ``train_data``; anything else reads from
  ``val_data``. Both returned tensors have shape (batch_size, block_size),
  and the targets are the inputs shifted one position to the right.
  """
  source = train_data if split == 'train' else val_data
  # batch_size random start offsets, each leaving room for block_size + 1 tokens
  starts = torch.randint(len(source) - block_size, (batch_size,))
  inputs = [source[s:s + block_size] for s in starts]
  targets = [source[s + 1:s + block_size + 1] for s in starts]
  return torch.stack(inputs), torch.stack(targets)

xb, yb = get_batch('train')
for label, batch in (('inputs:', xb), ('targets:', yb)):
    print(label)
    print(batch.shape)
    print(batch)

print('----')

# Each row of xb packs block_size training examples: the prefix up to
# position t is the context and yb[b, t] is the token that follows it.
for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context, target = xb[b, :t+1], yb[b, t]
        print(f"when input is {context.tolist()} the target: {target}")


inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
----
when input is [24] the target: 43
when input is [24, 43] the target: 58
when input is [24, 43, 58] the target: 5
when input is [24, 43, 58, 5] the target: 57
when input is [24, 43, 58, 5, 57] the target: 1
when input is [24, 43, 58, 5, 57, 1] the target: 46
when input is [24, 43, 58, 5, 57, 1, 46] the target: 43
when input is [24, 43, 58, 5, 57, 1, 46, 43] the target: 39
when input is [44] the target: 53
when input is [44, 53] the target: 56
when input is [44, 53, 56] the target: 1
when input is [44, 53, 56, 1] the target: 58
when input is [44, 53, 56, 1, 58] the target: 46
when input is [44, 53

In [None]:
print(xb)  # the input batch sampled above; it is what gets passed to the model below

tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])


In [None]:
# We need the vocab_size from all unique chars
vocab_size = len(chars)  # one token id per distinct character in the corpus

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
  """Bigram model: next-token logits depend only on the current token.

  Each token id selects a row of a (vocab_size, vocab_size) embedding table
  and that row is used directly as the logits for the following token.
  """

  def __init__(self, vocab_size):
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

  def forward(self, idx, targets=None):
    """Return ``(logits, loss)`` for a (B, T) tensor of token ids.

    Without ``targets`` the logits keep shape (B, T, C) and loss is None.
    With ``targets`` the logits are flattened to (B*T, C) and loss is the
    cross-entropy against the flattened targets.
    """
    logits = self.token_embedding_table(idx)
    if targets is None:
      return logits, None

    batch, time, channels = logits.shape
    flat_logits = logits.view(batch * time, channels)  # cross_entropy wants (N, C)
    flat_targets = targets.view(batch * time)
    return flat_logits, F.cross_entropy(flat_logits, flat_targets)

  def generate(self,idx,max_new_tokens):
    """Sample ``max_new_tokens`` tokens, appending each to ``idx`` (B, T)."""
    for _ in range(max_new_tokens):
      logits, _loss = self(idx)  # forward pass without targets -> (B, T, C)
      last_step = logits[:, -1, :]  # (B, C): only the last position predicts the next token
      probs = F.softmax(last_step, dim=1)
      sampled = torch.multinomial(probs, num_samples=1)  # (B, 1)
      idx = torch.cat((idx, sampled), dim=1)  # extend along the time dimension
    return idx

m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

# Kick off generation from a 1x1 tensor holding token id 0 (a batch of one
# sequence of length one). Id 0 is the first entry of the sorted vocabulary --
# presumably the newline character; confirm with decode([0]).
idx = torch.zeros((1,1), dtype=torch.long)
max_new_tokens = 100

generated = m.generate(idx, max_new_tokens)
print(decode(generated[0].tolist())) # row 0 is the first (and only) sequence in the returned batch



torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)

SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ


In [None]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3) # AdamW over all model parameters; author's note: lr is typically ~1e-4, but smaller models can handle a higher LR

In [None]:
batch_size = 32
log_interval = 1000  # report every N steps; one print per step floods the cell output
for steps in range(100000): # increase number of steps for good results...

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)  # drop stale gradients before backprop
    loss.backward()
    optimizer.step()

    # Single-batch loss is noisy; a periodic sample is enough to watch the trend.
    if steps % log_interval == 0:
        print(f"step {steps}: loss = {loss.item():.4f}")

print(f"loss = {loss.item()}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
2.4576823711395264
2.311732292175293
2.555223226547241
2.3821189403533936
2.4820756912231445
2.499711513519287
2.4206106662750244
2.4204540252685547
2.442276954650879
2.5140843391418457
2.454397678375244
2.4492290019989014
2.4507150650024414
2.5304956436157227
2.3795864582061768
2.45661997795105
2.4679205417633057
2.375251054763794
2.534993886947632
2.485304594039917
2.468569040298462
2.4627294540405273
2.3540732860565186
2.47808575630188
2.5648343563079834
2.413731575012207
2.433547258377075
2.4962892532348633
2.3065569400787354
2.405484437942505
2.43231201171875
2.4564034938812256
2.3056411743164062
2.4106132984161377
2.4288716316223145
2.4984121322631836
2.459920883178711
2.4548566341400146
2.448111057281494
2.4195027351379395
2.475379228591919
2.4630260467529297
2.4892494678497314
2.439258575439453
2.4204254150390625
2.5259950160980225
2.422452688217163
2.5025899410247803
2.380354166030884
2.5568909645080566
2.5054650

KeyboardInterrupt: 

In [None]:
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[0].tolist()))


D:
THat
Iomy; w tee.
UCHAntshon st when frd t? ym fo arusterercorausthe faprrveaks fos I freld bo cother k,

LAg t, m spe buclove, t:
OLolathelect prollll ayorus y g n ghe wontoturous mass, idngonar avors Wilotod
AS d.
Forbeeimelen: ar byrind tr to IV:
LATooorerapuebr par MEEELLAUKESous anderishat whathap.
STho, ow.
IO, e woreevean.
Mouthat oike onind Letizooo,
Dishe u s Chierrrathind sts JUCry NGRG watitr th then tenghect de thenger
Whiechy G d his e thad over,-thyoves courot d fe,-h, ese mon m


In [None]:
# Create a random (B, T, C) tensor to experiment with.
torch.manual_seed(42)
B, T, C = 4, 8, 2  # batch, time, channels
x = torch.randn(B, T, C)
x.shape

torch.Size([4, 8, 2])

In [None]:
# Version 1 (explicit loops): xbow[b, t] is the mean of x[b, 0..t] over time.
xbow = torch.zeros((B, T, C))
for b in range(B):
  for t in range(T):
    xprev = x[b, :t + 1]  # (t+1, C): every token up to and including position t
    xbow[b, t] = xprev.mean(dim=0)  # average over the time axis

In [None]:
# math trick for averaging - matrix multiplication trick

torch.manual_seed(42)

# Lower-triangular ones: row i has ones in columns 0..i, so a @ b sums rows 0..i of b.
a = torch.tril(torch.ones(3, 3))
print('a=')
print(a)
print('--')

# Divide every row by its own sum so each row holds averaging weights.
a = a / a.sum(dim=1, keepdim=True)
print('a=')
print(a)
print('--')

# Random integers in [0, 10) as floats, shape (3, 2), so that a @ b is defined.
b = torch.randint(0, 10, (3, 2)).float()
print('b=')
print(b)
print('--')

# Row i of c is the mean of rows 0..i of b.
c = a @ b
print('--')
print('c=')
print(c)

a=
tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])
--
a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
--
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
--
c=
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [None]:
# Version 2: the same running mean expressed as one matrix multiply.
wei = torch.tril(torch.ones(T, T))  # lower-triangular (T, T) ones
wei = wei / wei.sum(dim=1, keepdim=True)  # each row now averages positions 0..t
xbow2 = wei @ x  # (T, T) @ (B, T, C): wei is broadcast over the batch dimension

In [None]:
# Version 3: the same averaging weights obtained through a masked softmax.
tril = torch.tril(torch.ones(T, T))  # lower-triangular (T, T) ones
# Start from all-zero scores and set every position above the diagonal to -inf.
wei = torch.zeros((T, T)).masked_fill(tril == 0, float('-inf'))
# Softmax over the last dim (each row): -inf entries become 0 and the
# remaining equal scores share the weight uniformly.
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x

tensor([[[ 1.9269,  1.4873],
         [ 1.4138, -0.3091],
         [ 1.1687, -0.6176],
         [ 0.8657, -0.8644],
         [ 0.5422, -0.3617],
         [ 0.3864, -0.5354],
         [ 0.2272, -0.5388],
         [ 0.1027, -0.3762]],

        [[ 1.6423, -0.1596],
         [ 0.5725,  0.1400],
         [ 0.1289,  0.4528],
         [ 0.2969,  0.7597],
         [ 0.4933,  0.8671],
         [ 0.5129,  0.9450],
         [ 0.4065,  0.8160],
         [ 0.3242,  0.8215]],

        [[-1.3847, -0.8712],
         [-0.8040,  0.4231],
         [-0.4297,  0.1405],
         [-0.2459, -0.0882],
         [-0.5082,  0.1285],
         [-0.5701,  0.0069],
         [-0.6707,  0.3092],
         [-0.7412,  0.2095]],

        [[-0.9138, -0.6581],
         [-0.4179, -0.0662],
         [-0.4413,  0.3530],
         [-0.5344,  0.0808],
         [-0.7082,  0.0718],
         [-0.6008,  0.1724],
         [-0.5289,  0.4113],
         [-0.6109,  0.5329]]])

In [None]:
n_embed = 32  # embedding width (number of features per token)

class BigramLanguageModel(nn.Module):
  """Bigram model with a learned embedding followed by a linear output head.

  Token ids are embedded into ``n_embed`` features and a linear layer maps
  those features to ``vocab_size`` logits for the next token.
  """

  def __init__(self, vocab_size):
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size, n_embed) # now C in form of n_embed number of features
    # The layer class is nn.Linear (capital L); nn.linear does not exist and
    # raised AttributeError as soon as the model was constructed.
    self.linear = nn.Linear(n_embed, vocab_size) # maps the n_embed features to vocab_size logits

  def forward(self, idx, targets=None):
    """Return ``(logits, loss)`` for a (B, T) tensor of token ids.

    Without ``targets`` the logits have shape (B, T, vocab_size) and loss is
    None. With ``targets`` the logits are flattened to (B*T, vocab_size) and
    loss is the cross-entropy against the flattened targets.
    """
    tok_emb = self.token_embedding_table(idx) # (B, T, n_embed)
    logits = self.linear(tok_emb) # (B, T, vocab_size)

    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)  # cross_entropy expects (N, C) logits
      targets = targets.view(B*T)

      loss = F.cross_entropy(logits, targets)
    return logits, loss

  def generate(self,idx,max_new_tokens):
    """Sample ``max_new_tokens`` tokens, appending each to ``idx`` (B, T)."""
    for _ in range(max_new_tokens):
      logits, loss = self(idx) # forward pass without targets
      logits = logits[:,-1,:] # (B, C): keep only the last time step
      probs = F.softmax(logits, dim=1) # normalise over the vocabulary dimension
      idx_next = torch.multinomial(probs, num_samples = 1) # (B, 1) sampled token ids
      idx = torch.cat((idx,idx_next), dim=1) # append the sample along the time dimension
    return idx

m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

# Kick off generation from a 1x1 tensor holding token id 0 (a batch of one
# sequence of length one). Id 0 is the first entry of the sorted vocabulary --
# presumably the newline character; confirm with decode([0]).
idx = torch.zeros((1,1), dtype=torch.long)
max_new_tokens = 100

generated = m.generate(idx, max_new_tokens)
print(decode(generated[0].tolist())) # row 0 is the first (and only) sequence in the returned batch



In [None]:
# self-attention starts here.
torch.manual_seed(1337)
B, T, C = 4, 8, 32  # batch, time, channels
x = torch.randn(B, T, C)

# one head within a transformer
head_size = 16 # head_size is a hyperparameter
key = nn.Linear(C, head_size, bias=False)    # nn.Linear(in_features, out_features, bias=...)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k = key(x)    # (B, T, 16)
q = query(x)  # (B, T, 16)

# Matrix multiplication rule:
#   A: (m x n), B: (n x p)  ->  A @ B: (m x p)
#   The inner dimensions (columns of A, rows of B) must match; the outer
#   dimensions give the shape of the result.
# For a batched matmul (@), PyTorch treats everything except the last two
# dimensions as batch dimensions, so A.shape[-1] must equal B.shape[-2].
# transpose(-2, -1) swaps the last two dimensions: (B, T, 16) -> (B, 16, T).
# Result: wei holds the attention scores (dot product of q and k for every
# pair of positions).
wei = q @ k.transpose(-2, -1)  # (B, T, 16) @ (B, 16, T) ---> (B, T, T)

tril = torch.tril(torch.ones(T, T))  # lower-triangular (T, T) mask
wei = wei.masked_fill(tril == 0, float('-inf'))  # mask everything above the diagonal with -inf
wei = F.softmax(wei, dim=-1)  # softmax now gives zero weight to positions beyond the current t

v = value(x)
out = wei @ v

out.shape

torch.Size([4, 8, 16])


Notes:
- Attention is a **communication mechanism**. Can be seen as nodes in a directed graph looking at each other and aggregating information with a weighted sum from all nodes that point to them, with data-dependent weights.
- There is no notion of space. Attention simply acts over a set of vectors. This is why we need to positionally encode tokens.
- Examples across the batch dimension are, of course, processed completely independently and never "talk" to each other.
- In an "encoder" attention block just delete the single line that does masking with `tril`, allowing all tokens to communicate. This block here is called a "decoder" attention block because it has triangular masking, and is usually used in autoregressive settings, like language modeling.
- "self-attention" just means that the keys and values are produced from the same source as queries. In "cross-attention", the queries still get produced from x, but the keys and values come from some other, external source (e.g. an encoder module)
- "Scaled" attention additionally multiplies `wei` by 1/sqrt(head_size) (i.e. divides it by sqrt(head_size)). This makes it so that when the inputs Q, K have unit variance, `wei` has unit variance too, so Softmax stays diffuse and does not saturate too much. Illustration below.




In [None]:
# important final step: scaled attention

k = torch.randn(B,T,head_size)
q = torch.randn(B,T,head_size)

# Scale the attention scores by 1/sqrt(head_size).
#
# Why is this scaling necessary? randn draws values with mean 0 and variance 1.
# Each q.k dot product is a sum of head_size such products, and the variance of
# a sum of independent terms is the sum of their variances, so the unscaled
# scores have variance of about head_size. With a large variance, softmax
# becomes extremely peaked (attention collapses onto a single token).
#
# Variance is the expected squared deviation from the mean, so multiplying the
# scores by a factor s multiplies their variance by s**2. To bring the variance
# back to 1 the factor must therefore be 1/sqrt(head_size), not 1/head_size.
#
# (Kept as comments rather than a bare string: a string literal as the last
# expression of a cell is echoed as that cell's output.)
wei = q @ k.transpose(-2, -1) * head_size**-0.5



In [None]:
### Finalized Code

import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameters
batch_size = 64 # sequences to process in parallel
block_size = 256 # maximum context length for predictions
max_iters = 5000 # total training iterations
eval_interval = 500 # run estimate_loss every eval_interval steps (every 500 here)
learning_rate = 1e-3 # step size passed to the AdamW optimizer
device = 'cuda' if torch.cuda.is_available() else 'cpu' # use the GPU when one is available
eval_iters = 200 # number of mini-batches to average over during each evaluation
n_embd = 384 # embedding width
n_head = 6 # attention heads per block (head_size = n_embd // n_head)
n_layer = 6 # number of stacked transformer blocks
dropout = 0.2 # dropout probability used by the attention and feed-forward layers
# ------------

torch.manual_seed(1337)

# wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('input.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# all unique chars, in a deterministic (sorted) order
chars = sorted(set(text))
vocab_size = len(chars)

# mapping between characters and integer ids
itos = dict(enumerate(chars))             # id -> char
stoi = {ch: i for i, ch in itos.items()}  # char -> id


def encode(s):
    """Encoder: take a string, output a list of integers."""
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of integers, output a string."""
    return ''.join([itos[i] for i in l])


# Train and validation splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data))  # index at which the validation tail starts
train_data, val_data = data[:n], data[n:]  # first 90% train, rest val

# data loading
def get_batch(split):
    """Sample one random batch of (inputs, targets) and move it to ``device``.

    ``split == 'train'`` reads from ``train_data``; anything else reads from
    ``val_data``. Both tensors have shape (batch_size, block_size) and the
    targets are the inputs shifted one position to the right.
    """
    source = train_data if split == 'train' else val_data
    # batch_size random start offsets, each leaving room for block_size + 1 tokens
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return inputs.to(device), targets.to(device)

@torch.no_grad()  # evaluation only: no gradients are tracked inside
def estimate_loss():
    """Average the loss over ``eval_iters`` random batches for each split.

    Puts ``model`` in eval mode while measuring and back in train mode
    before returning. Returns a dict with keys 'train' and 'val', each
    holding the mean loss as a 0-dim tensor.
    """
    model.eval()
    averages = {}
    for split in ('train', 'val'):  # compare fit (train) against generalisation (val)
        batch_losses = torch.zeros(eval_iters)
        for i in range(eval_iters):
            inputs, labels = get_batch(split)
            _, batch_loss = model(inputs, labels)
            batch_losses[i] = batch_loss.item()
        averages[split] = batch_losses.mean()
    model.train()
    return averages

class Head(nn.Module):
    """One head of causal (masked) self-attention.

    Projects the input to key, query and value vectors of size ``head_size``,
    scores every query against every key at the same or an earlier position,
    and returns the softmax-weighted sum of the values.
    """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False) # nn.Linear(in_features, out_features, bias)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular (block_size, block_size) causal mask. Registered as a
        # buffer: it is saved and moved with the module but is not a parameter.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map x of shape (B, T, n_embd) to attention output (B, T, head_size)."""
        _, T, _ = x.shape
        k = self.key(x)   # (B, T, head_size)
        q = self.query(x) # (B, T, head_size)
        head_size = k.shape[-1]
        # Attention scores ("affinities"). Scale by 1/sqrt(head_size): q and k
        # live in head_size dimensions, so that is the number of terms in each
        # dot product. Scaling by the embedding width (x.shape[-1]) instead
        # shrinks the scores too much whenever head_size != n_embd.
        wei = q @ k.transpose(-2, -1) * head_size**-0.5 # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        # Causal mask: a position may only attend to itself and earlier positions.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)

        # perform the weighted aggregation of the values
        v = self.value(x) # (B, T, head_size)
        out = wei @ v # (B, T, T) @ (B, T, head_size) -> (B, T, head_size)
        return out

class MultiHeadAttention(nn.Module):
    """Several self-attention heads run in parallel, then projected to n_embd."""

    def __init__(self, num_heads, head_size):
        super().__init__()
        # ModuleList registers every Head as a submodule; h(x) runs that
        # head's forward on x.
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # The concatenated head outputs are num_heads * head_size wide. That
        # equals n_embd only when n_embd is divisible by num_heads, so size the
        # projection input from the heads instead of assuming n_embd.
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map x (B, T, n_embd) to the projected multi-head output (B, T, n_embd)."""
        out = torch.cat([h(x) for h in self.heads], dim=-1) # (B, T, num_heads * head_size)
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):
    """Position-wise MLP: expand to 4 * n_embd, ReLU, project back, dropout."""

    def __init__(self, n_embd):
        super().__init__()
        hidden = 4 * n_embd  # width of the hidden layer
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),                  # the non-linearity
            nn.Linear(hidden, n_embd),  # back to the input width
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to the last dimension of x; the shape is unchanged."""
        return self.net(x)

class Block(nn.Module):
    """Transformer block: self-attention, then a feed-forward MLP.

    Both sub-layers are wrapped in a residual connection and see a
    layer-normalised copy of their input (LayerNorm applied before each
    sub-layer rather than after it).
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        # Each head works on an n_embd // n_head wide slice of features.
        per_head = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, per_head)  # communication
        self.ffwd = FeedFoward(n_embd)                  # computation
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Return x refined by attention and the MLP; the shape is unchanged."""
        # "x + ..." is the residual connection: refine the input, don't overwrite it.
        attended = x + self.sa(self.ln1(x))
        return attended + self.ffwd(self.ln2(attended))

class BigramLanguageModel(nn.Module):
    """Decoder-only transformer language model.

    The class keeps the name used by the earlier bigram cells, but it is no
    longer a bigram lookup: token and position embeddings are summed, passed
    through ``n_layer`` transformer blocks and a final LayerNorm, and a linear
    head turns the result into next-token logits.
    """

    def __init__(self):
        super().__init__()
        # one n_embd-dimensional vector per token id
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        # one n_embd-dimensional vector per position inside a block; attention
        # itself has no notion of order, so position is injected here
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        # stack of n_layer transformer blocks
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        # final layer norm
        self.ln_f = nn.LayerNorm(n_embd)
        # maps features to vocab_size logits
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Return ``(logits, loss)`` for a (B, T) tensor of token ids.

        Without ``targets`` the logits have shape (B, T, vocab_size) and loss
        is None. With ``targets`` (also (B, T)) the logits are flattened to
        (B*T, vocab_size) and loss is the cross-entropy against the flattened
        targets. T must not exceed block_size, the size of the position table.
        """
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        # Build the position indices on the same device as the input instead of
        # the notebook-level `device` global, so the model keeps working when
        # it is moved or called with tensors that live elsewhere.
        positions = torch.arange(T, device=idx.device)
        pos_emb = self.position_embedding_table(positions) # (T,C)
        # Broadcasting repeats pos_emb along the batch dimension: a position
        # means the same thing in every sequence of the batch.
        x = tok_emb + pos_emb # (B,T,C)

        x = self.blocks(x) # sequential transformer blocks
        x = self.ln_f(x) # layer norm
        logits = self.lm_head(x) # raw scores; softmax is applied by the loss / by generate

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)  # cross_entropy expects (N, C) logits
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Sample ``max_new_tokens`` tokens, appending each to ``idx`` (B, T)."""
        for _ in range(max_new_tokens):
            # The position table only covers block_size positions, so condition
            # on at most the last block_size tokens.
            idx_cond = idx[:, -block_size:]

            logits, loss = self(idx_cond) # forward pass without targets

            logits = logits[:, -1, :] # (B, C): keep only the last time step
            probs = F.softmax(logits, dim=-1) # logits -> probabilities
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1) sampled token ids
            idx = torch.cat((idx, idx_next), dim=1) # append the sample along the time dimension
        return idx


model = BigramLanguageModel()
m = model.to(device)  # nn.Module.to moves the module in place and returns it, so m is model
# print the number of parameters in the model
n_params = sum(p.numel() for p in m.parameters())
print(n_params/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# `step` rather than `iter`: a module-level loop variable called `iter` would
# shadow the builtin iter() for every cell that runs afterwards.
for step in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)  # drop stale gradients before backprop
    loss.backward()
    optimizer.step()

# generate from the model
# estimate_loss() leaves the model in train mode, so switch to eval mode here;
# otherwise dropout stays active and randomly perturbs every sampling step.
model.eval()
context = torch.zeros((1, 1), dtype=torch.long, device=device)  # start from a single token with id 0
with torch.no_grad():  # sampling needs no gradients; skip building the autograd graph
    print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))


10.788929 M parameters
step 0: train loss 4.2849, val loss 4.2823
step 500: train loss 1.6774, val loss 1.8435
step 1000: train loss 1.3518, val loss 1.5843
step 1500: train loss 1.2349, val loss 1.5048
step 2000: train loss 1.1519, val loss 1.4862
step 2500: train loss 1.0872, val loss 1.4836
step 3000: train loss 1.0267, val loss 1.5018
step 3500: train loss 0.9672, val loss 1.5243
step 4000: train loss 0.9077, val loss 1.5528
step 4500: train loss 0.8489, val loss 1.5667
step 4999: train loss 0.7897, val loss 1.6271

But with a hungry and unstain wind
To him remombered and swer to meet a flesh?
To-morrow I will make our mitle heir, night,
If he would be speeded by my soldier words:
By him had I a legs,
And say 'I incensel both a longing:
But merry that I hope have left none.
But, is he not the time here come to their beds,
Yet with their liverands true goods and leads.

JULIET:
Mercy, at least I suppose; they do prove, ask 'twere!'
Ay, in white silver subs he that seems slaves,
In t

NameError: name 'decode' is not defined