In [154]:
import torch
import torch.nn as nn
import torch.nn.functional as F 
import numpy 
import transformers
from datasets import load_dataset
import tiktoken
from transformers import AutoTokenizer

In [155]:
torch.manual_seed(1337)

<torch._C.Generator at 0x27c7ff32690>

In [156]:
"""todos
    EX1: The n-dimensional tensor mastery challenge: Combine the `Head` and `MultiHeadAttention`
     into one class that processes all the heads in parallel, treating the heads as another batch dimension (answer is in nanoGPT).
     
    EX2: Train the GPT on your own dataset of choice! What other data could be fun to blabber on about? (A fun advanced suggestion 

    if you like: train a GPT to do addition of two numbers, i.e. a+b=c. You may find it helpful to predict the digits of c in reverse
     order, as the typical addition algorithm (that you're hoping it learns) would proceed right to left too. You may want to modify 
     the data loader to simply serve random problems and skip the generation of train.bin, val.bin. You may want to mask out the loss 
     at the input positions of a+b that just specify the problem using y=-1 in the targets (see CrossEntropyLoss ignore_index). Does 
     your Transformer learn to add? Once you have this, swole doge project: build a calculator clone in GPT, for all of +-*/. Not an easy problem. You may need Chain of Thought traces.)

    EX3: Find a dataset that is very large, so large that you can't see a gap between train and val loss. Pretrain the transformer 
    on this data, then initialize with that model and finetune it on tiny shakespeare with a smaller number of steps and lower learning
     rate. Can you obtain a lower validation loss by the use of pretraining?
     
    EX4: Read some transformer papers and implement one additional feature or change that people seem to use. Does it improve the performance of your GPT?
"""

"todos\n    EX1: The n-dimensional tensor mastery challenge: Combine the `Head` and `MultiHeadAttention`\n     into one class that processes all the heads in parallel, treating the heads as another batch dimension (answer is in nanoGPT).\n     \n    EX2: Train the GPT on your own dataset of choice! What other data could be fun to blabber on about? (A fun advanced suggestion \n\n    if you like: train a GPT to do addition of two numbers, i.e. a+b=c. You may find it helpful to predict the digits of c in reverse\n     order, as the typical addition algorithm (that you're hoping it learns) would proceed right to left too. You may want to modify \n     the data loader to simply serve random problems and skip the generation of train.bin, val.bin. You may want to mask out the loss \n     at the input positions of a+b that just specify the problem using y=-1 in the targets (see CrossEntropyLoss ignore_index). Does \n     your Transformer learn to add? Once you have this, swole doge project: bu

In [157]:
torch.set_printoptions(linewidth=200)

In [158]:
# params
# B, T, C
batch_size = 4
block_size = 100
n_embd = 16
head_size = 4
learning_rate =1e-4
n_layer = 4
n_head = 4
dropout = 0.0
max_iters = 5000


In [159]:
with open('input.txt', 'r') as f:
    data = f.read()

# encoding 
enc = tiktoken.get_encoding('cl100k_base')
data = enc.encode(data)

toks = sorted(list(set(data)))
vocab_size = len(toks)

n = int(0.8*len(data))
train_data = data[:n]
val_data = data[n:]


train_data = torch.tensor(train_data)
val_data = torch.tensor(val_data)


In [160]:
def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data)-block_size, (batch_size,))     # get batch_size starting points
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')

print(xb.shape)


# x needs to be embedded. 

torch.Size([4, 100])


In [161]:
class LayerNorm1D:
    # normalise each batch -> that way to get diff from mean and unit variance
    def __init__(self, dim, eps=1e-5, momentum=0.1): # params
        self.eps = eps
        self.gamma = torch.ones(dim)    
        self.beta = torch.zeros(dim)

    def __call__(self, x):
        xmean = x.mean(1, keepdim= True)    # batch mean
        xvar = x.var(1, keepdim=True)          # batch var
        xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalise a row, columwise 
        self.out = self.gamma * xhat + self. beta
        return self.out
        
    def parameters(self):       
        return [self.gamma, self.beta]

module = LayerNorm1D(100)   # 100 dimensions
x = torch.randn(32, 100)    # 32 batch, 100 dims

x = module(x)
x.shape

torch.Size([32, 100])

In [162]:
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()

        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.ones(block_size, block_size))

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x) # B, T, head_size   
        q = self.key(x) # B, T, head_size 
        v = self.key(x) # B, T, head_size 

        # attention scores
        wei = q @ k.transpose(-2, -1, keepdim=True) # (B, T, head_size) @ (B, head_size, T) -> keep batch dim so (B, T, T)     
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))     # lower triangular matrix (B, T, T)
        wei = F.softmax(1, dim = -1)              # normailise 
        out = wei @ v            # (B, T, T) @ (B, T, C) -> (B, T, C)

        return out

class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList(Head(head_size) for _ in range(num_heads))
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out



In [163]:
class FeedForward(nn.Module):
    def __init__():
        super().__init__()
        self.net = nn.Sequential(nn.Linear(n_embd, 4*n_embd),
                                nn.ReLU(),
                                nn.Linear(4*n_embd, n_embd),
                                nn.Dropout(dropout),)

    def forward(self, x):
        return self.net(x)

In [164]:
class Block(nn.Module):
    # transformer block, attention -> mlp
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)    # create this many heads, fits into n_ebd
        self.ffd = FeedForward
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = self.sa(self.ln1(x))
        x = x + self.ffd(self.ln2(x))

        return x

In [165]:
class GPT(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(n_embd, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, block_size)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)       # final layer norm after mlp?
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        tok_emb = self.token_embedding_table(idx) # embed x     # B, T, C
        pos_emb = self.position_embedding_table(torch.arange(T, device = device))   # T, C
        x = tok_emb + pos_emb # B, T, C 
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = lm_head(x)

        if targets is None:
            loss = None
        else:
            B, T, C = logit.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is B, T indices of context, unembedded
        for _ in range(max_new_tokens):
            # get block size of last input
            idx_cond = idx[:, -block_size:]
            # get predicitons
            logits, loss = self(idx_cond)   
            # last time step?
            logits = logits[:, -1, :] # (B, C)
            # sample from dist
            idx_next = torch.multinomial(probs, num_samples = 1) # B, 1
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
            return idx

In [166]:
model = GPT()
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

    # evaluate the loss on train / val every so often
    xb, yb = get_batch('train')

    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

context = torch.zeros((1,1), dtype=torch.long)

IndexError: index out of range in self

In [None]:
# combine single and multihead in 1 class, all heads in parallel

In [None]:
# masked fill, lower triangle matrix so past tokens cant look ahead to future tokens
# lower triangular matrix tells you how much weight to put on each 

# the Q, K dot product is the weightings for the @ with the values. Basically the difference between what i am 
# look for and what i contain determines how much weight to place on the values




    def __init__():
        super()__init__()
        token_embedding_table = nn.Embedding(vocab_size, n_embd)        # for each unique token, get its embedding
        position_embedding_table = nn.Embedding(block_size, n_emb)      # get a unique embedding for the position of a token in block size
        lm_head = nn.Linear(n_embd, )           

    def forward(self, idx):
        tok_embd = token_embedding_table(idx)
        pos_embd = position_embedding_table(idx)

        x = tok_embd + pos_+ embd


class SelfAttention(nn.Module):
    def __init__(self):
        super().__init__()
        token_embedding_table = nn.Embedding(vocab_size, n_embd)
        position_embedding_table = nn.Embedding(block_size, n_emb)
        lm_head = nn.Linear(n_embd, )

    def forward(self):



        B, T, C = 4, 8, 32
        x = torch.randn(B, T, C)

        # each head learns it own unique projection of the embeddings down to its head size.
        # different heads hopefully capture different features
        key = nn.Linear(C, head_size, bias=False)
        query = nn.Linear(C, head_size, bias=False)
        value = nn.Linear(C, head_size, bias=False)

        k = key(x)
        q = query(x)
        v = value(x)

        # k and q are B, T, head_size
        # only transpose time and head dims
        # (B, T, 16) @ (B, 16, T) -> (B, T, T)
        # T, T is the affinities matrix
        wei = q @ k.transpose(-2, -1)

        # then do lower triangle masking
        tril = torch.tril(torch.ones(T, T))
        wei = wei.masked_fill(tril==0, float('-inf'))
        wei = F.softmax(wei, dim=1)           # softmax horizontally, which is across dim1, not down dim 0
        out = wei @ v
        return out

In [None]:
def 

In [57]:
B, T, C = 4, 8, 32
head_size = 16

x = torch.randn(B, T, C)

# each head learns it own unique projection of the embeddings down to its head size.
# different heads hopefully capture different features
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)

k = key(x)
q = query(x)

# k and q are B, T, head_size

# only transpose time and head dims
# (B, T, 16) @ (B, 16, T) -> (B, T, T)
# T, T is the affinities matrix
wei = q @ k.transpose(-2, -1)

# then do lower triangle masking
tril = torch.tril(torch.ones(T, T))
wei = wei.masked_fill(tril==0, float('-inf'))
wei = F.softmax(wei, dim=1)           # softmax horizontally, which is across dim1, not down dim 0
out = wei @ x
print(out)

tensor([[[ 2.0627e-03, -3.2660e-03, -5.1835e-03,  ...,  2.5971e-03, -1.0526e-02, -7.4462e-03],
         [ 1.7579e-02, -2.0755e-02, -3.0835e-02,  ...,  1.6845e-02, -6.8175e-02, -4.9836e-02],
         [ 1.0973e-03, -9.4295e-04, -1.3573e-03,  ...,  5.0895e-05, -2.1686e-03, -8.4499e-04],
         ...,
         [-7.7978e-02,  7.4575e-02,  5.0641e-02,  ...,  2.0182e-01,  2.6490e-01,  3.8846e-01],
         [-1.4495e-01, -3.5787e-01,  8.8779e-01,  ...,  2.7113e-01,  8.1441e-01,  1.4782e-01],
         [-6.6962e-01, -1.3192e+00, -3.4279e-01,  ...,  8.3123e-01,  1.2434e+00, -8.8896e-01]],

        [[-7.1018e-03, -2.9788e-02, -4.1602e-02,  ...,  5.8691e-03, -1.1214e-01,  2.1592e-01],
         [ 6.0431e-02, -1.2342e-01, -8.2011e-02,  ...,  8.5697e-03, -2.1339e-01,  5.0892e-01],
         [-1.9269e-01,  3.4826e-01, -8.9946e-01,  ..., -5.2624e-01, -4.4395e-01,  7.1422e-02],
         ...,
         [ 6.9391e-01, -8.0423e-01,  2.2054e+00,  ...,  2.6638e-01, -7.1175e-01,  1.7694e-01],
         [ 1.7009e-0