# CS336 Assignments

| # | Topic                         | Description                                 |
|---|-------------------------------|---------------------------------------------|
| 1 | Basics                        | Train an LLM from scratch                   |
| 2 | Systems                       | Make it run fast!                           |
| 3 | Scaling                       | Make it performant at a FLOP budget         |
| 4 | Data                          | Prepare the right datasets                  |
| 5 | Alignment & Reasoning RL      | Align it to real-world use cases            |

# Assignment #1
- Implement all of the components (tokenizer, model, loss function, optimizer) necessary to train a standard Transformer language model
- Train a minimal language model

In [2]:
import warnings
warnings.filterwarnings("ignore")

import torch
import lovely_tensors as lt
lt.monkey_patch()

import tiktoken

from datasets import load_dataset
import joblib

from torch.utils.data import TensorDataset, DataLoader


In [3]:
def crossentropyloss(preds, actuals):
    probs = preds.squeeze().sigmoid()
    eps = 1e-8
    loss = - actuals * (probs + eps).log() - (1 - actuals) * ((1 - probs) + eps).log()
    return loss.mean()

class LayerNorm_custom(torch.nn.Module):
    def __init__(self, eps=1e-8):
        super().__init__()
        # self.scale = torch.nn.Parameter(torch.randn())
        # self.shift = torch.nn.Parameter(torch.randn())
        self.eps = eps

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True)

        norm_tensor = (x - mean) / torch.sqrt(var + self.eps)
        return norm_tensor
    
class FFNWithNorm(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.linear = torch.nn.Parameter(torch.randn(input_dim, hidden_dim))
        self.act = GeLU_custom()
        self.ln = LayerNorm_custom()
        self.linear2 = torch.nn.Parameter(torch.randn(hidden_dim, output_dim))

    def forward(self, x):
        o = x @ self.linear
        o = self.act(o)
        o = self.ln(o)
        o = o @ self.linear2
        return o
    
    def state_dict(self):
        from collections import OrderedDict

        sd = OrderedDict()
        sd['linear'] = self.linear.data
        sd['linear2'] = self.linear2.data
        return sd

class GeLU_custom(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        o = 0.5 * x * (1 + torch.tanh(torch.sqrt(torch.tensor(2) / torch.pi) * (x + 0.044715 * torch.pow(x, 3))))
        return o

class AdamW_custom:
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2):
        self.params = list(params)
        self.lr = lr
        self.beta1, self.beta2 = betas
        self.wd = weight_decay
        self.eps = eps
        self.step_count = 0

        # Initialize momentum and velocity for each parameter
        self.m = [torch.zeros_like(p) for p in self.params]
        self.v = [torch.zeros_like(p) for p in self.params]

        self.ckpt = {}
        self.ckpt['param_groups'] = [{
            'lr': lr,
            'eps': eps,
            'betas': (self.beta1, self.beta2),
            'weight_decay': self.wd
        }]

    def zero_grad(self):
        for p in self.params:
            if p.grad is not None:
                p.grad.zero_()

    def step(self):
        self.step_count += 1

        for i, param in enumerate(self.params):
            if param.grad is None:
                continue

            grad = param.grad.data

            # Apply weight decay directly to parameters (AdamW style)
            param.data.mul_(1 - self.lr * self.wd)

            # Update biased first moment estimate
            # beta1 * i + (1-beta1) * grad
            self.m[i] = self.beta1 * self.m[i] + (1 - self.beta1) * grad

            # Update biased second raw moment estimate
            self.v[i] = self.beta2 * self.v[i] + (1 - self.beta2) * grad.pow(2)
            
            # Compute bias-corrected first moment estimate
            m_hat = self.m[i] / (1 - self.beta1 ** self.step_count)

            # Compute bias-corrected second raw moment estimate
            v_hat = self.v[i] / (1 - self.beta2 ** self.step_count)

            # Update parameters
            param.data.add_(m_hat * -self.lr/ (v_hat.sqrt() + self.eps))

        

    def state_dict(self):
        return self.ckpt

In [5]:
tinystories = load_dataset("roneneldan/TinyStories")
tinystories

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 2119719
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 21990
    })
})

In [9]:
[tinystories['train'][x] for x in range(10)]

[{'text': 'One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with it because it was sharp. Lily wanted to share the needle with her mom, so she could sew a button on her shirt.\n\nLily went to her mom and said, "Mom, I found this needle. Can you share it with me and sew my shirt?" Her mom smiled and said, "Yes, Lily, we can share the needle and fix your shirt."\n\nTogether, they shared the needle and sewed the button on Lily\'s shirt. It was not difficult for them because they were sharing and helping each other. After they finished, Lily thanked her mom for sharing the needle and fixing her shirt. They both felt happy because they had shared and worked together.'},
 {'text': 'Once upon a time, there was a little car named Beep. Beep loved to go fast and play in the sun. Beep was a healthy car because he always had good fuel. Good fuel made Beep happy and strong.\n\nOne day, Beep was driving in the park when he saw a big tree. The tree had 

## Train a BPE tokenizer on TinyStories dataset

In [11]:
# TBD

## Run trained tokenizer to convert tinystories to sequence of Integer IDs

In [10]:
# TBD

# Train a transformer LM on TinyStories dataset

In [12]:
# TBD

In [173]:
bs = 8

torch.manual_seed(42)
data = torch.randn(30, 5)
labels = torch.randn(30)
labels = torch.where(labels < 0.7, 0, 1)

train_dataset = TensorDataset(data, labels)
train_dataloader = DataLoader(train_dataset, batch_size=bs, shuffle=False, num_workers=0)

In [196]:
n_epochs = 10

torch.manual_seed(42)
model = FFNWithNorm(5, 10, 1)

optimizer = AdamW_custom(model.parameters())
# optimizer = torch.optim.AdamW(model.parameters())


for e in range(n_epochs):

    train_loss = 0.0
    train_batches = 0
    for i, (data, labels) in enumerate(train_dataloader):
        logits = model(data)
        optimizer.zero_grad()

        loss = crossentropyloss(logits, labels)
        
        # backward pass
        loss.backward()

        # update weights
        optimizer.step()

        # logging
        train_loss += loss.item()
        train_batches += 1

    avg_train_loss = train_loss / train_batches

    print(f"epoch: {i: 03d}/{n_epochs: 03d} | train_loss: {avg_train_loss: .4f}")


epoch:  03/ 10 | train_loss:  1.8256
epoch:  03/ 10 | train_loss:  1.8074
epoch:  03/ 10 | train_loss:  1.7905
epoch:  03/ 10 | train_loss:  1.7737
epoch:  03/ 10 | train_loss:  1.7572
epoch:  03/ 10 | train_loss:  1.7409
epoch:  03/ 10 | train_loss:  1.7247
epoch:  03/ 10 | train_loss:  1.7087
epoch:  03/ 10 | train_loss:  1.6929
epoch:  03/ 10 | train_loss:  1.6772


In [230]:
ckpt = {
    'model_state': model.state_dict(),
    'optimizer_state': optimizer.state_dict()
}

ckpt

{'model_state': OrderedDict([('linear',
               tensor[5, 10] n=50 x∈[-2.132, 1.904] μ=0.099 σ=1.048),
              ('linear2',
               tensor[10, 1] x∈[-2.091, 1.252] μ=-0.066 σ=1.188 [[-0.572], [1.252], [-1.554], [-1.138], [0.864], [0.138], [-2.091], [0.925], [0.757], [0.762]])]),
 'optimizer_state': {'param_groups': [{'lr': 0.001,
    'eps': 1e-08,
    'betas': (0.9, 0.999),
    'weight_decay': 0.01}]}}

In [240]:
import joblib

joblib.dump(ckpt, "model.pth")

['model.pth']

In [241]:
ckpt = joblib.load("model.pth")
ckpt

{'model_state': OrderedDict([('linear',
               tensor[5, 10] n=50 x∈[-2.132, 1.904] μ=0.099 σ=1.048),
              ('linear2',
               tensor[10, 1] x∈[-2.091, 1.252] μ=-0.066 σ=1.188 [[-0.572], [1.252], [-1.554], [-1.138], [0.864], [0.138], [-2.091], [0.925], [0.757], [0.762]])]),
 'optimizer_state': {'param_groups': [{'lr': 0.001,
    'eps': 1e-08,
    'betas': (0.9, 0.999),
    'weight_decay': 0.01}]}}

## Generate samples and evaluate perplexity using the trained transformer LM

In [14]:
# TBD

## Train models on OpenWebText and submit perplexities to a leaderboard

In [15]:
# TBD