In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F 
from transformers import AutoTokenizer
from dataclasses import dataclass

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os

# Seed torch's global RNG so weight init and batch sampling are repeatable.
torch.manual_seed(1337)

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Allocator hint for the CUDA caching allocator (expandable segments).
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"


In [None]:
'''TODOs
- RMSNorm
- wandb logging
- RoPE positional encoding
- ring attention
- KV cache
- weight sharing
- config class
'''

In [None]:
# --- hyperparameters --------------------------------------------------------
# Shape shorthand used in comments below: B = batch_size, T = block_size, C = n_embd

# batching
batch_size, block_size = 32, 512

# model size
n_embd, n_head, n_layer = 768, 6, 6
dropout = 0.2

# optimisation
learning_rate = 1e-4
max_iters = 10_000

# Toggle for torch.compile in the training cell.
# NOTE: this name shadows the builtin `compile` for the rest of the notebook.
compile = True


In [5]:
# Load the raw corpus. The encoding is given explicitly so the text read does
# not depend on the platform's default locale encoding.
with open('shakespeare.txt', 'r', encoding='utf-8') as f:
    data = f.read()

# tokenize
# NOTE: `encode` is called with its defaults, so the tokenizer prepends its
# begin-of-text token; that token therefore becomes part of `vocab` below and
# later cells that call `tokenizer.encode(...)` rely on it being mappable.
model_id = "meta-llama/Llama-3.1-8B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenized_data = tokenizer.encode(data)
print(f"input token length {len(tokenized_data)}")

# create vocab: only the token ids that actually occur in the corpus
vocab = sorted(set(tokenized_data))
vocab_size = len(vocab)
print(f"vocab size = {vocab_size}")

# Compact the tokenizer's large id space down to [0, vocab_size):
#   map_data:   original tokenizer id -> compact id
#   unmap_data: compact id -> original tokenizer id
map_data = {old_id: new_id for new_id, old_id in enumerate(vocab)}
unmap_data = dict(enumerate(vocab))

# map inputs
mapped_data = [map_data[token] for token in tokenized_data]

# train / validation split: first 80% of the token stream for training,
# the remaining 20% for validation
n = int(0.8 * len(mapped_data))
train_data = torch.tensor(mapped_data[:n], dtype=torch.long)
val_data = torch.tensor(mapped_data[n:], dtype=torch.long)
print(f"train data size: {train_data.shape}")
print(f"val data size: {val_data.shape}")


Token indices sequence length is longer than the specified maximum sequence length for this model (301769 > 131072). Running this sequence through the model will result in indexing errors


input token length 301769
vocab size = 12132
train data size: torch.Size([241415])
val data size: torch.Size([60354])


In [6]:
# Round-trip check on the first 50 characters:
# text -> tokenizer ids -> compact ids -> tokenizer ids -> text
sample_text = data[:50]
print(sample_text)
tok50 = [map_data[tid] for tid in tokenizer.encode(sample_text)]
print(tok50)

# one decoded piece per line ...
for c in tok50:
    piece = tokenizer.decode([unmap_data[c]])
    print(piece)
# ... then the whole sequence decoded in one call
original_ids = [unmap_data[c] for c in tok50]
print(tokenizer.decode(original_ids))


# Show how the tokenizer splits a long word into sub-word pieces
# (raw tokenizer ids here, no compact-id mapping involved).
word = 'disentanglement'
toks = tokenizer.encode(word)
for c in toks:
    print(tokenizer.decode(c))

First Citizen:
Before we proceed any further, hear
[12131, 2529, 9033, 261, 3967, 309, 4004, 515, 2287, 2, 2974]
<|begin_of_text|>
First
 Citizen
:

Before
 we
 proceed
 any
 further
,
 hear
<|begin_of_text|>First Citizen:
Before we proceed any further, hear
<|begin_of_text|>
dis
ent
ang
lement


In [7]:
def get_batch(split):
    """Sample a random batch of (input, target) windows from one data split.

    `split` selects `train_data` when it equals 'train'; any other value
    selects `val_data`. Returns a pair `(x, y)` of stacked windows, each with
    `batch_size` rows of `block_size` tokens, where every row of `y` is the
    matching row of `x` shifted one position to the right.
    """
    source = val_data if split != 'train' else train_data
    # random window starts; the upper bound leaves room for the shifted target
    starts = torch.randint(len(source) - block_size, (batch_size,))

    inputs, targets = [], []
    for start in starts:
        stop = start + block_size
        inputs.append(source[start:stop])
        targets.append(source[start + 1:stop + 1])

    return torch.stack(inputs), torch.stack(targets)

# Smoke test: draw one training batch and show the shape of inputs and targets.
xb, yb = get_batch('train')

print(xb.shape, yb.shape, sep='\n')

torch.Size([32, 512])
torch.Size([32, 512])


In [None]:
# masked multi head attention

class CausalSelfAttention(nn.Module):
    """Multi-head causal (masked) self-attention.

    Args:
        n_head: number of attention heads; must divide the embedding width.
        embd_dim: embedding width. When omitted, the notebook-level `n_embd`
            is used, which matches the previous behaviour.

    The forward pass maps a (B, T, C) tensor to a (B, T, C) tensor, where each
    position only attends to itself and earlier positions.
    """

    def __init__(self, n_head, embd_dim=None):
        super().__init__()
        embd_dim = n_embd if embd_dim is None else embd_dim
        # heads must tile the embedding dimension exactly
        assert embd_dim % n_head == 0
        # one projection that produces q, k and v for all heads at once
        self.c_attn = nn.Linear(embd_dim, 3 * embd_dim)
        # output projection
        self.c_proj = nn.Linear(embd_dim, embd_dim)
        self.n_head = n_head
        self.n_embd = embd_dim

        # TODO: nanoGPT-style scaled init
        # TODO: regularise (dropout)

    def forward(self, x):
        B, T, C = x.shape       # batch, sequence length, embedding size
        # Use the module's own head count and the input's width rather than
        # notebook globals, so the layer stays correct when it is built with
        # values that differ from the globals.
        head_size = C // self.n_head

        # q, k, v for all heads in one matmul: (B, T, C) -> (B, T, 3*C)
        qkv = self.c_attn(x)
        q, k, v = qkv.split(C, dim=2)   # each (B, T, C)

        # split the channel dim into heads and move heads next to batch:
        # (B, T, C) -> (B, T, nh, hs) -> (B, nh, T, hs)
        q = q.view(B, T, self.n_head, head_size).transpose(1, 2)
        k = k.view(B, T, self.n_head, head_size).transpose(1, 2)
        v = v.view(B, T, self.n_head, head_size).transpose(1, 2)

        # is_causal=True masks out future positions
        y = F.scaled_dot_product_attention(q, k, v, is_causal=True)

        # merge heads back: (B, nh, T, hs) -> (B, T, C)
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        y = self.c_proj(y)

        return y
    

class MLP(nn.Module):
    """Position-wise feed-forward network: widen 4x, GELU, project back, dropout.

    Layer sizes and the dropout rate come from the notebook-level `n_embd`
    and `dropout`.
    """

    def __init__(self):
        super().__init__()
        hidden_dim = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden_dim),   # hidden
            nn.GELU(),                       # nonlinearity
            nn.Linear(hidden_dim, n_embd),   # out
            nn.Dropout(dropout),             # regularise
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)   # (B, T, C)



class Layernorm(nn.Module):
    """Layer normalisation over the last (embedding) dimension.

    Each position's feature vector is shifted to zero mean and scaled to unit
    variance, then passed through a learnable per-feature scale and bias.

    Args:
        dim: size of the last dimension. When omitted, the notebook-level
            `n_embd` is used.
        eps: small constant added to the variance for numerical stability.
    """

    def __init__(self, dim=None, eps=1e-5):
        super().__init__()
        dim = n_embd if dim is None else dim
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))
        self.bias = nn.Parameter(torch.zeros(dim))

    def forward(self, x):
        # statistics are taken per position, across the feature dimension only
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        x_hat = (x - mean) * torch.rsqrt(var + self.eps)
        return x_hat * self.weight + self.bias
    
class RMSNorm(nn.Module):
    """Root-mean-square normalisation over the last (embedding) dimension.

    Like layer norm, but without mean subtraction and without a bias: each
    feature vector is divided by its root mean square and multiplied by a
    learnable per-feature scale.

    Args:
        dim: size of the last dimension. When omitted, the notebook-level
            `n_embd` is used.
        eps: small constant added to the mean square for numerical stability.
    """

    def __init__(self, dim=None, eps=1e-6):
        super().__init__()
        dim = n_embd if dim is None else dim
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        # take in a batch and return it normalised along the embedding dimension
        inv_rms = torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
        return x * inv_rms * self.weight


class Block(nn.Module):
    """One transformer block: pre-norm attention and pre-norm MLP, each wrapped
    in a residual connection.

    Sizes come from the notebook-level `n_embd` and `n_head`.
    """

    def __init__(self):
        super().__init__()
        # attention sub-layer (layer norm over the embedding dim, then attention)
        self.ln_1 = nn.LayerNorm(n_embd)
        self.csa = CausalSelfAttention(n_head)
        # feed-forward sub-layer
        self.ln_2 = nn.LayerNorm(n_embd)
        self.ffd = MLP()

    def forward(self, x):
        attn_out = self.csa(self.ln_1(x))
        x = x + attn_out
        mlp_out = self.ffd(self.ln_2(x))
        return x + mlp_out
    
class GPT(nn.Module):
    """Decoder-only transformer language model.

    Token and learned positional embeddings feed a stack of `Block`s, followed
    by a final layer norm and a linear head over the vocabulary. Sizes are read
    from the notebook-level `vocab_size`, `block_size`, `n_embd` and `n_layer`.
    """

    def __init__(self):
        super().__init__()
        assert vocab_size is not None
        assert block_size is not None
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block() for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)       # final layer norm on the output of the last block
        self.lm_head = nn.Linear(n_embd, vocab_size)

        # TODO: weight sharing

    def forward(self, idx, targets=None):
        """Run the model on a batch of token ids.

        Args:
            idx: (B, T) integer token ids, with T no larger than the number of
                positions in the positional embedding table.
            targets: optional (B, T) integer token ids to score against.

        Returns:
            `(logits, loss)`. Without targets, logits are (B, T, vocab) and
            loss is None. With targets, logits are flattened to (B*T, vocab)
            and loss is the mean cross-entropy.
        """
        B, T = idx.shape
        max_len = self.position_embedding_table.num_embeddings
        assert T <= max_len, f"sequence length {T} exceeds context size {max_len}"

        # Build positions on the same device as the input instead of relying on
        # the notebook-level `device`, so the model works wherever it is placed.
        pos = torch.arange(T, dtype=torch.long, device=idx.device)  # (T)

        # forward GPT model
        tok_emb = self.token_embedding_table(idx)       # (B, T, n_embd)
        pos_emb = self.position_embedding_table(pos)    # (T, n_embd)
        x = tok_emb + pos_emb                           # (B, T, C), broadcast over batch
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            # cross_entropy expects (N, classes) logits and (N,) targets
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample `max_new_tokens` tokens after `idx`.

        Args:
            idx: (B, T) integer token ids used as the prompt.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, T + max_new_tokens) tensor: the prompt followed by the samples.
        """
        # the positional table defines how much context the model can see
        max_len = self.position_embedding_table.num_embeddings

        for _ in range(max_new_tokens):
            # crop the running sequence to the last `max_len` tokens
            idx_cond = idx[:, -max_len:]
            # get predictions
            logits, _ = self(idx_cond)

            # only the last time step predicts the next token
            logits = logits[:, -1, :]  # (B, C)
            probs = F.softmax(logits, dim=-1)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)

        return idx

In [10]:
@dataclass
class GPTConfig:
    """Container for the model hyperparameters.

    Not consumed by the model code shown in this notebook yet: `GPT` and its
    sub-modules still read the notebook-level globals of the same names.
    """
    vocab_size: int     # number of distinct token ids
    block_size: int     # number of positions in the context window
    n_layer: int        # number of transformer blocks
    n_head: int         # attention heads per block
    n_embd: int         # embedding width

In [13]:
def decode(tokens):
    """Turn an iterable of compact vocabulary ids back into text.

    Each id is first translated to its original tokenizer id through
    `unmap_data`; the resulting list is then decoded by `tokenizer`.
    """
    return tokenizer.decode([unmap_data[compact_id] for compact_id in tokens])

# Build the model and optimizer. `max_iters`, `learning_rate` and `compile`
# come from the hyperparameter cell; they are deliberately not re-assigned
# here so that cell stays the single place to tune them.
model = GPT().to(device)
if compile:
    model = torch.compile(model)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Training loop. The counter is called `step` rather than `iter` so the
# builtin `iter` is not shadowed for the rest of the notebook.
for step in range(max_iters):
    model.train()
    xb, yb = get_batch('train')
    xb = xb.to(device)
    yb = yb.to(device)

    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    # every 100 steps: report the training loss and print a short sample
    if step % 100 == 0:
        print(f'Iter {step}, loss: {loss.item()}')
        model.eval()    # disable dropout while sampling; train() is restored at the top of the loop
        with torch.no_grad():
            # Prompt is the single compact id 0, i.e. the lowest tokenizer id
            # present in the corpus, not a begin-of-text marker.
            context = torch.zeros((1, 1), dtype=torch.long, device=device)

            y = model.generate(context, max_new_tokens=10)
            print(decode(y[0].tolist()))

# final, longer sample after training
model.eval()
context = torch.zeros((1, 1), dtype=torch.long, device=device)  # same single-token prompt as above

# ignore gradients for efficiency
with torch.no_grad():
    y = model.generate(context, max_new_tokens=100)
    print(decode(y[0].tolist()))



Iter 0, loss: 9.577692985534668
!Measure rob happenedFine felt aloneodiac acheswordrops
Iter 100, loss: 5.760305404663086
! if above;
How sh me and beauty ofRY
Iter 200, loss: 5.056351661682129
! soINGS things it be looks, good, for
Iter 300, loss: 4.927411079406738
! so truth: then be Norfolk you
counter hundred
Iter 400, loss: 4.498869895935059
! another
corner patience, and stand creating me,
Iter 500, loss: 4.344297885894775
! pu child with pure mastered at pity is the
Iter 600, loss: 4.250061988830566
! my guilt slaves, my pains, I dream!

Iter 700, loss: 3.9501209259033203
! think orices, if, know of you were
Iter 800, loss: 3.828605890274048
! in peace of 'good harloter in him
Iter 900, loss: 3.836083173751831
! I wisely unto thee, I hope
The v
Iter 1000, loss: 3.6055996417999268
! I am no most estate. double fitness
The
Iter 1100, loss: 3.5751936435699463
! death to blame up,
Becomes it me
Iter 1200, loss: 3.4682202339172363
! I will do beat me know these not!

B
Iter 1300, los

In [15]:
# Draw a longer sample (1000 new tokens), reusing the `model` and the
# single-token `context` prompt left behind by the training cell.
with torch.no_grad():
    y = model.generate(context, max_new_tokens=1000)
    long_sample = decode(y[0].tolist())
    print(long_sample)


! and accursed effect
Cannot be accused, the great place of your dwellingure,
That you shall welcome! So with the Capulets
If not have your scope Buckingham behests,
For this day you: Cleomenes and Dion, beat them to Westminster,
For the king, great Hostiliumny doth haste.

First Soldier:
To the queen to send the contrary me
Should not yet distinctly ranges. When he was not most noble
Only in a town: as any thing possible
The which is dishonour, so grieving
With a them; and falling fabric. 'Tis time:
I have of but yet I be endured
I was a son o' the Volsce like of many of mine,
The effects of them too gross and so dishonour
We have yielded to fear'd their eyes and request
With painted to break the cushion, not
Of, which I would have given already.

POLIXENES:
On, my lord!

HERMIONE:
Was not! there?

POLIXENES:
The king hath been born to-morrow affords.

HERMIONE:
Nay, but that,
There is even your graces
You were boys:
Nay, but you will?

POLIXENES:
We were as twinn'd with limber vows,
