In [26]:
import torch
import torch.nn as nn #neural network layers
import torch.nn.functional as F #stateless functions -> use relu

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



In [None]:
with open("corpus.txt", "r") as f:
    data = f.read()
    data = data.replace('\n', '')  # Replaces all '\n' with an empty string

chars = data
# Build tokenizer
tokenizer = Tokenizer(BPE())
tokenizer.pre_tokenizer = Whitespace()
trainer = BpeTrainer(special_tokens=["[UNK]", "[PAD]"])
tokenizer.train_from_iterator(data, trainer)

# Encode/decode
ids = tokenizer.encode(data).ids
print(ids)
print(tokenizer.decode(ids))
# Batch sampler
def get_batch(batch_size, context_size):
    ix = torch.randint(len(train_ids) - context_size, (batch_size,))
    x = torch.stack([torch.tensor(train_ids[i:i+context_size]) for i in ix])
    y = torch.stack([torch.tensor(train_ids[i+1:i+context_size+1]) for i in ix])
    return x, y

#build dataset with vocabulary (in this case characters)
text = 'hello world hello world hello world hello worlx'
#chars = sorted(list(set(text)))

stoi = {ch: i for i, ch in enumerate(chars)} # chsar -> index
#allows lookup dict for tokenization and decoding
itos = {i: ch for ch, i in stoi.items()}
vocab_size = len(chars)
#creates the functions to map back and forth between the index and the chars
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

#create a "TENSOR" of index ids
# Encode dataset into token IDs

train_ids = [stoi[c] for c in data]
#train_ids = train_ids.to(device)
#3data = torch.tensor(encode(text), dtype=torch.long)





[45, 58, 75, 58, 65, 62, 67, 60, 62, 67, 73, 61, 58, 31, 68, 68, 66, 47, 68, 60, 58, 73, 61, 58, 71, 30, 54, 71, 58, 54, 67, 57, 33, 58, 58, 57, 62, 67, 60, 40, 78, 40, 68, 66, 50, 54, 67, 73, 72, 73, 68, 43, 54, 71, 73, 54, 64, 58, 62, 67, 54, 30, 65, 54, 72, 72, 62, 56, 40, 68, 57, 58, 71, 67, 34, 71, 54, 67, 57, 69, 54, 71, 58, 67, 73, 47, 71, 54, 57, 62, 73, 62, 68, 67, 12, 36, 73, 84, 72, 41, 68, 73, 34, 68, 62, 67, 60, 73, 68, 35, 54, 69, 69, 58, 67, 12, 46, 61, 58, 61, 54, 73, 58, 73, 61, 62, 72, 12, 28, 57, 75, 62, 56, 58, 55, 78, 40, 62, 56, 61, 58, 65, 65, 58, 35, 58, 71, 66, 54, 67, 46, 58, 69, 73, 16, 22, 10, 16, 14, 16, 19, 22, 24, 14, 14, 28, 40, 28, 72, 68, 56, 62, 54, 65, 66, 58, 57, 62, 54, 69, 68, 72, 73, 76, 62, 73, 61, 54, 67, 58, 66, 68, 63, 62, 68, 75, 58, 71, 73, 61, 58, 56, 61, 62, 65, 57, 6, 72, 59, 54, 56, 58, 12, 43, 61, 68, 73, 68, 62, 65, 65, 74, 72, 73, 71, 54, 73, 62, 68, 67, 55, 78, 46, 65, 54, 73, 58, 12, 43, 61, 68, 73, 68, 72, 55, 78, 34, 58, 73, 7

In [29]:
#parameters

embed_dim = 16
context_size = 32
num_heads = 2
num_layers = 1

In [30]:
#transformer block
#transformer block is the layer that gets stacked many times. #Allows tokens to attend to other tokens to gather context <- thats the MultiheadAttention module. "How much should token i pay attention to token j?" each head focuses on different relationships

class TransformerBlock(nn.Module):
    def __init__(self,embed_dim, num_heads):
        super().__init__()
        self.attn = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)
        self.ff = nn.Sequential(#this is the FeedForward Network
            nn.Linear(embed_dim, embed_dim*4),#expand
            nn.ReLU(),#nonlinearity
            nn.Linear(embed_dim*4, embed_dim)#contract back
        )
        self.ln1 = nn.LayerNorm(embed_dim)#Normalization functions
        self.ln2 = nn.LayerNorm(embed_dim)
    def forward(self, x):
        T = x.size(1)
        mask = torch.triu(torch.ones(T, T, device=x.device), diagonal=1).bool()  # True where we block
        attn_out, _ = self.attn(x, x, x, attn_mask=mask, need_weights=False)
        x = self.ln1(x+attn_out) #uses POSTNORM: LayerNorm after adding residuals... bad practices
        ff_out = self.ff(x)
        x = self.ln2(x+ ff_out) #another residual sent through LayerNorm
        return x

In [31]:
class TinyLLM(nn.Module):
    def __init__(self, vocab_size, embed_dim, context_size, num_heads):
        super().__init__()
        self.token_embed = nn.Embedding(vocab_size, embed_dim) #maps token ids to vectors..... vocab_size X embed_dim
        self.pos_embed = nn.Embedding(context_size, embed_dim) #learned positional embeddings for the shape of context_size and embed_dim
        self.transformer = TransformerBlock(embed_dim, num_heads) #Exciting! We call our TransformerBlock Module

        #Tie the input embedding weights with the output layer for efficiency + performance.
        self.lm_head = nn.Linear(embed_dim, vocab_size, bias=False)
        self.lm_head.weight = self.token_embed.weight  # weight tying
 #Linear is a fully connected layer, with an input vector length of embed_dim, and adds weight matrix, bias vector: y = xW^T + b

    def forward(self, idx):
        B, T = idx.shape #Batch Size and Sequence Length
        tok_emb = self.token_embed(idx) #remember, maps the token id, to the vectors (B, T, E) <- E is the size of the vector representing each token in the model. Like the resolution of the token's brain.
        pos_emb = self.pos_embed(torch.arange(T, device = idx.device)) #this returns T, E
        x = tok_emb + pos_emb
        x = self.transformer(x) 
        logits = self.lm_head(x) 
        return logits
    
    #Positional Embedding size is limited by context_size. Context_size must be larger than the max T you ever pass
    #Weight Tying - best practice to tie token_embed.weight and lm_head.weight ..... (use the same matrix)

In [32]:
#Model Instantiation and Optimizer

model = TinyLLM(vocab_size, embed_dim, context_size, num_heads).to(device) #Call the class, using previously defined params

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

# Scheduler (linear warmup then decay)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=1000)

#optimizer = torch.optim.AdamW(model.parameters(), lr=1e-2) #Adam with decoupled weight delay..... common optimizer for transformers. The param lr is very large for anything but toy problems..... you'd want a much smaller LR for real training.....
#An optimizer is part of the Model Training Pipeline that updates the model's params so the model learns from data. IE we start with random weights, and they tweak the weights with the optimizer to reduce prediction loss

#LR is "Learning Rate" - controls how big each step is when updating weights. Smaller is more accurate, but slower



#Moving model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu"); model.to(device)
#

TinyLLM(
  (token_embed): Embedding(16822, 16)
  (pos_embed): Embedding(32, 16)
  (transformer): TransformerBlock(
    (attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=16, out_features=16, bias=True)
    )
    (ff): Sequential(
      (0): Linear(in_features=16, out_features=64, bias=True)
      (1): ReLU()
      (2): Linear(in_features=64, out_features=16, bias=True)
    )
    (ln1): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
    (ln2): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=16, out_features=16822, bias=False)
)

In [33]:
#TRAIN OUR MODEL!!!

for epoch in range(2000): #train 200 times
    xb, yb = get_batch(batch_size=32, context_size=context_size)

    
    '''for chars
    #get random batch for contiguous slice of length context size.
    start = torch.randint(0, len(data)-context_size, (1,))  
    #input sequence of length context_size, shape after batch size 1?
    xb = data[start : start + context_size].unsqueeze(0) #input
    #implementing next-token prediction for each position
    yb = data[start + 1 : start + context_size + 1].unsqueeze(0) #target
    '''

    #these update the weights
    ''' for chars
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    '''
    optimizer.zero_grad()
    logits = model(xb) #feeding data

    #flatten the batch and time dimensions. cross entropy computes the average loss over these positions
    loss = F.cross_entropy(logits.view(-1, vocab_size), yb.view(-1))

    loss.backward()
    optimizer.step()
    scheduler.step()
    


    if epoch % 50 == 0:
        print(f"Epoch {epoch}, loss {loss.item():.4f}")

Epoch 0, loss 17.1799
Epoch 50, loss 16.2958
Epoch 100, loss 15.5178
Epoch 150, loss 14.8475
Epoch 200, loss 14.2162
Epoch 250, loss 13.7517
Epoch 300, loss 13.0044
Epoch 350, loss 12.3937
Epoch 400, loss 12.3682
Epoch 450, loss 11.8875
Epoch 500, loss 11.5255
Epoch 550, loss 11.4615
Epoch 600, loss 11.3953
Epoch 650, loss 11.3151
Epoch 700, loss 11.0360
Epoch 750, loss 11.0249
Epoch 800, loss 10.7925
Epoch 850, loss 10.8456
Epoch 900, loss 10.8851
Epoch 950, loss 11.1023
Epoch 1000, loss 10.7649
Epoch 1050, loss 10.7217
Epoch 1100, loss 10.8352
Epoch 1150, loss 10.9030
Epoch 1200, loss 11.0054
Epoch 1250, loss 10.8142
Epoch 1300, loss 10.8080
Epoch 1350, loss 10.4932
Epoch 1400, loss 10.4749
Epoch 1450, loss 10.2722
Epoch 1500, loss 10.4162
Epoch 1550, loss 10.1658
Epoch 1600, loss 9.9472
Epoch 1650, loss 9.8945
Epoch 1700, loss 9.7828
Epoch 1750, loss 9.4718
Epoch 1800, loss 9.3692
Epoch 1850, loss 9.1221
Epoch 1900, loss 8.8858
Epoch 1950, loss 8.7080


In [34]:
#Text Generation

#actual using the model
context = torch.tensor([[stoi['h'], stoi['e'], stoi['l'], stoi['l']]], dtype=torch.long)

for _ in range(20):
    context_in = context[:, -context_size:] 
    logits = model(context_in) #repeatedly runs the model on the entire context sequence
    probs = F.softmax(logits[:,-1,:],dim=-1) #takes the logits for the last position (the models prediction for next token)
    next_id = torch.multinomial(probs, num_samples = 1) #samples from probability distribution to pick up the next token
    context = torch.cat([context, next_id], dim=1)
#append that token to the context

print("Generated:", decode(context[0].tolist()))



KeyError: 11478

In [None]:
torch.optim.SGD(model.parameters())


SGD (
Parameter Group 0
    dampening: 0
    differentiable: False
    foreach: None
    fused: None
    lr: 0.001
    maximize: False
    momentum: 0
    nesterov: False
    weight_decay: 0
)