In [66]:
import torch
import torch.nn as nn
import torch.nn.functional as F 
import numpy 
import transformers
import tiktoken
from transformers import AutoTokenizer

In [67]:
torch.manual_seed(1337)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'


In [68]:
"""todos
    EX1: The n-dimensional tensor mastery challenge: Combine the `Head` and `MultiHeadAttention`
     into one class that processes all the heads in parallel, treating the heads as another batch dimension (answer is in nanoGPT).
     
    EX2: Train the GPT on your own dataset of choice! What other data could be fun to blabber on about? (A fun advanced suggestion 

    if you like: train a GPT to do addition of two numbers, i.e. a+b=c. You may find it helpful to predict the digits of c in reverse
     order, as the typical addition algorithm (that you're hoping it learns) would proceed right to left too. You may want to modify 
     the data loader to simply serve random problems and skip the generation of train.bin, val.bin. You may want to mask out the loss 
     at the input positions of a+b that just specify the problem using y=-1 in the targets (see CrossEntropyLoss ignore_index). Does 
     your Transformer learn to add? Once you have this, swole doge project: build a calculator clone in GPT, for all of +-*/. Not an easy problem. You may need Chain of Thought traces.)

    EX3: Find a dataset that is very large, so large that you can't see a gap between train and val loss. Pretrain the transformer 
    on this data, then initialize with that model and finetune it on tiny shakespeare with a smaller number of steps and lower learning
     rate. Can you obtain a lower validation loss by the use of pretraining?
     
    EX4: Read some transformer papers and implement one additional feature or change that people seem to use. Does it improve the performance of your GPT?
"""

"todos\n    EX1: The n-dimensional tensor mastery challenge: Combine the `Head` and `MultiHeadAttention`\n     into one class that processes all the heads in parallel, treating the heads as another batch dimension (answer is in nanoGPT).\n     \n    EX2: Train the GPT on your own dataset of choice! What other data could be fun to blabber on about? (A fun advanced suggestion \n\n    if you like: train a GPT to do addition of two numbers, i.e. a+b=c. You may find it helpful to predict the digits of c in reverse\n     order, as the typical addition algorithm (that you're hoping it learns) would proceed right to left too. You may want to modify \n     the data loader to simply serve random problems and skip the generation of train.bin, val.bin. You may want to mask out the loss \n     at the input positions of a+b that just specify the problem using y=-1 in the targets (see CrossEntropyLoss ignore_index). Does \n     your Transformer learn to add? Once you have this, swole doge project: bu

In [69]:
# params
# B, T, C
batch_size = 1
block_size = 32
n_embd = 64
head_size = 8
learning_rate =1e-4
n_layer = 2
n_head = 8
dropout = 0.2
max_iters = 50


In [70]:
with open('input.txt', 'r') as f:
    data = f.read()

# Tokenize inputs 
model_id = "meta-llama/Llama-3.1-8B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenized_data = tokenizer.encode(data)
print(f"input token length {len(tokenized_data)}")

# Create vocabulary and mappings
vocab = sorted(set(tokenized_data))
vocab_size = len(vocab)
print(f"vocab size = {vocab_size}")

# These mappings are necessary
map_data = {old_id: new_id for new_id, old_id in enumerate(vocab)}
unmap_data = {new_id: old_id for new_id, old_id in enumerate(vocab)}

# Map tokens to new vocabulary
mapped_data = [map_data[token] for token in tokenized_data]

# Create train test splits
n = int(0.8 * len(mapped_data))
train_data = mapped_data[:n]
val_data = mapped_data[n:]

train_data = torch.tensor(train_data, dtype=torch.long)
val_data = torch.tensor(val_data, dtype=torch.long)
print(f"train data size: {train_data.shape}")
print(f"val data size: {val_data.shape}")

Token indices sequence length is longer than the specified maximum sequence length for this model (488544 > 131072). Running this sequence through the model will result in indexing errors


input token length 488544
vocab size = 14368
train data size: torch.Size([390835])
val data size: torch.Size([97709])


In [71]:
def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data)-block_size, (batch_size,))     # get batch_size starting points
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])

    return x, y

xb, yb = get_batch('train')

print(xb.shape)
print(yb.shape)

torch.Size([1, 32])
torch.Size([1, 32])


In [72]:
class LayerNorm1D:
    # normalise each batch -> that way to get diff from mean and unit variance
    def __init__(self, dim, eps=1e-5, momentum=0.1): # params
        self.eps = eps
        self.gamma = torch.ones(dim)    
        self.beta = torch.zeros(dim)

    def __call__(self, x):
        xmean = x.mean(1, keepdim= True)    # batch mean
        xvar = x.var(1, keepdim=True)          # batch var
        xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalise a row, columwise 
        self.out = self.gamma * xhat + self. beta
        return self.out
        
    def parameters(self):       
        return [self.gamma, self.beta]

module = LayerNorm1D(100)   # 100 dimensions
x = torch.randn(32, 100)    # 32 batch, 100 dims

x = module(x)
x.shape

torch.Size([32, 100])

In [73]:
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()

        # linear layers
        self.key = nn.Linear(n_embd, head_size, bias=False)      # (B, T, head_size)
        self.query = nn.Linear(n_embd, head_size, bias=False)   # (B, T, head_size)
        self.value = nn.Linear(n_embd, head_size, bias=False)   # (B, T, head_size)
        self.register_buffer('tril', torch.ones(block_size, block_size))    # (B, T, T)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x) # B, T, head_size   
        q = self.key(x) # B, T, head_size 
        tril = self.tril[:T, :T].to(device)    

        # attention scores
        wei = q @ k.transpose(-2, -1) #keepdim=True) # (B, T, head_size) @ (B, head_size, T) -> keep batch dim so (B, T, T)     
        wei = wei.masked_fill(tril == 0, float('-inf'))     # lower triangular matrix (B, T, T)
        wei = F.softmax(wei, dim = -1)              # normalise 
        wei = self.dropout(wei)                      # dropout

        v = self.key(x) # B, T, head_size 

        out = wei @ v            # (B, T, T) @ (B, T, C) -> (B, T, C)
        
        return out

class MultiHeadAttention(nn.Module):
    def __init__(self, n_head, head_size):
        super().__init__()
        self.heads = nn.ModuleList(Head(head_size) for _ in range(n_head))
        self.proj = nn.Linear(n_head * head_size, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)     # (B, T, C) # each head_size gets added together
        out = self.dropout(self.proj(out))
        return out

In [74]:
# combine into causal self attention

class CausalSelfAttention(nn.Module):
    def __init___(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList()  # B, T, head_size, num_heads


In [75]:
class FeedForward(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(n_embd, 4*n_embd), 
                                nn.ReLU(),
                                nn.Linear(4*n_embd, n_embd),
                                nn.Dropout(dropout),)

    def forward(self, x):
        return self.net(x)          # B, T, C

In [76]:
class Block(nn.Module):
    # transformer block, attention -> mlp
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)    # create this many heads, fits into n_ebd
        self.ffd = FeedForward()
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffd(self.ln2(x))

        return x

In [82]:
class GPT(nn.Module):
    def __init__(self):
        super().__init__()
        assert vocab_size is not None
        assert block_size is not None
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)       # final layer norm after mlp?
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        pos = torch. arange(0, T, dtype=torch.long, device=device)  # (T)

        # forward GPT model
        tok_emb = self.token_embedding_table(idx)    # B, T, n_embd
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))   # (T, n_embd)
        x = tok_emb + pos_emb # B, T, C 
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is B, T indices of context, unembedded

        for _ in range(max_new_tokens):
            # get block size of last input
            idx_cond = idx[:, -block_size:]
            # get predicitons
            logits, _ = self(idx_cond)   
            
            # last time step?
            logits = logits[:, -1, :] # (B, C)
            probs = F.softmax(logits, dim=-1)
            # sample from dist
            idx_next = torch.multinomial(probs, num_samples = 1) # B, 1
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
            
        return idx

In [85]:
def decode(tokens):
    # First unmap from our custom vocabulary back to original token IDs
    original_tokens = [unmap_data[token] for token in tokens]
    # Then decode using the tokenizer
    return tokenizer.decode(original_tokens)

max_iters= 10000
model = GPT().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)


for iter in range(max_iters):

    # evaluate the loss on train / val every so often
    xb, yb = get_batch('train')
    xb = xb.to(device)
    yb = yb.to(device)
    
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    if iter % 1000 == 0:
        print(f'Iter {iter}, loss: {loss.item()}')


model.eval()
context = torch.zeros((1, 1), dtype=torch.long, device=device)  # start with zeros tensor

# ignore gradients for efficiency
with torch.no_grad():
    
    y = model.generate(context, max_new_tokens=10)
    print(decode(y[0].tolist()))



Iter 0, loss: 9.842464447021484
Iter 1000, loss: 7.147793292999268
Iter 2000, loss: 6.804561138153076
Iter 3000, loss: 6.6118597984313965
Iter 4000, loss: 7.111612319946289
Iter 5000, loss: 6.9859771728515625
Iter 6000, loss: 5.455924034118652
Iter 7000, loss: 5.6383771896362305
Iter 8000, loss: 6.018691062927246
Iter 9000, loss: 6.063382148742676
! You?” ask thatov
s, I believe
