# CS336 Assignments

| # | Topic                         | Description                                 |
|---|-------------------------------|---------------------------------------------|
| 1 | Basics                        | Train an LLM from scratch                   |
| 2 | Systems                       | Make it run fast!                           |
| 3 | Scaling                       | Make it performant at a FLOP budget         |
| 4 | Data                          | Prepare the right datasets                  |
| 5 | Alignment & Reasoning RL      | Align it to real-world use cases            |

# Assignment #1
- Implement all of the components (tokenizer, model, loss function, optimizer) necessary to train a standard Transformer language model
- Train a minimal language model

In [1]:
import warnings
warnings.filterwarnings("ignore")

import torch
import lovely_tensors as lt
lt.monkey_patch()

import tiktoken

from datasets import load_dataset

In [75]:
def crossentropyloss(actuals, preds):
    probs = preds.squeeze().sigmoid()
    eps = 1e-8
    loss = - actuals * (probs + eps).log() - (1 - actuals) * ((1 - probs) + eps).log()
    return loss.mean()

class AdamW_custom:
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2):
        self.params = list(params)
        self.lr = lr
        self.beta1, self.beta2 = betas
        self.wd = weight_decay
        self.eps = eps
        self.step_count = 0

        # Initialize momentum and velocity for each parameter
        self.m = [torch.zeros_like(p) for p in self.params]
        self.v = [torch.zeros_like(p) for p in self.params]

    def zero_grad(self):
        for p in self.params:
            if p.grad is not None:
                p.grad.zero_()

    def step(self):
        self.step_count += 1

        for i, param in enumerate(self.params):
            if param.grad is None:
                continue

            grad = param.grad.data

            # Apply weight decay directly to parameters (AdamW style)
            param.data.mul_(1 - self.lr * self.wd)

            # Update biased first moment estimate
            # beta1 * i + (1-beta1) * grad
            self.m[i] = self.beta1 * self.m[i] + (1 - self.beta1) * grad

            # Update biased second raw moment estimate
            self.v[i] = self.beta2 * self.v[i] + (1 - self.beta2) * grad.pow(2)
            
            # Compute bias-corrected first moment estimate
            m_hat = self.m[i] / (1 - self.beta1 ** self.step_count)

            # Compute bias-corrected second raw moment estimate
            v_hat = self.v[i] / (1 - self.beta2 ** self.step_count)

            # Update parameters
            param.data.add_(m_hat * -self.lr/ (v_hat.sqrt() + self.eps))



## layer norm

In [49]:
class FFN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(5, 6)
        self.relu = torch.nn.ReLU()

    def forward(self, x):
        return self.relu(self.linear(x))
    
torch.manual_seed(42)
data = torch.randn(3, 5)
model = FFN()
output = model(data)

output.v

tensor[3, 6] n=18 x∈[0., 1.286] μ=0.331 σ=0.392 grad ReluBackward0
tensor([[0.2151, 0.1047, 0.0000, 0.0000, 0.2167, 0.0521],
        [0.4768, 0.0000, 0.7975, 0.3168, 1.2863, 0.6434],
        [0.0000, 0.0000, 0.9230, 0.7498, 0.1746, 0.0000]],
       grad_fn=<ReluBackward0>)

In [50]:
mean = output.mean(dim=-1)
var = output.var(dim=-1)

mean, var

(tensor[3] x∈[0.098, 0.587] μ=0.331 σ=0.245 grad MeanBackward1 [0.098, 0.587, 0.308],
 tensor[3] x∈[0.010, 0.193] μ=0.126 σ=0.101 grad VarBackward0 [0.010, 0.193, 0.175])

We see that the mean of each layer is not zero and the standard deviation is not close to 1. With layer normalization, we can bring them to 0.

In [51]:
(output - mean) / torch.sqrt(var)

RuntimeError: The size of tensor a (6) must match the size of tensor b (3) at non-singleton dimension 1

In [52]:
mean = output.mean(dim=-1, keepdim=True)
var = output.var(dim=-1, keepdim=True)

mean, var

(tensor[3, 1] x∈[0.098, 0.587] μ=0.331 σ=0.245 grad MeanBackward1 [[0.098], [0.587], [0.308]],
 tensor[3, 1] x∈[0.010, 0.193] μ=0.126 σ=0.101 grad VarBackward0 [[0.010], [0.193], [0.175]])

In [53]:
(output - mean) / torch.sqrt(var)

tensor[3, 6] n=18 x∈[-1.335, 1.591] μ=-6.623e-09 σ=0.939 grad DivBackward0

0 mean and 1 variance! :) 

Let's wrap this as a PyTorch layer now.

In [54]:
class LayerNorm_custom(torch.nn.Module):
    def __init__(self, eps=1e-8):
        super().__init__()
        # self.scale = torch.nn.Parameter(torch.randn())
        # self.shift = torch.nn.Parameter(torch.randn())
        self.eps = eps

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True)

        norm_tensor = (x - mean) / torch.sqrt(var + self.eps)
        return norm_tensor


Let's add this to the above FFN class and see if the outputs look normalized!

In [55]:
class FFNWithNorm(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(5, 6)
        self.relu = torch.nn.ReLU()
        self.ln = LayerNorm_custom()

    def forward(self, x):
        o = self.relu(self.linear(x))
        o = self.ln(x)
        return o
    
torch.manual_seed(42)
data = torch.randn(3, 5)
model = FFNWithNorm()
output = model(data)


mean = output.mean(dim=-1, keepdim=True)
var = output.var(dim=-1, keepdim=True)

(output - mean) / torch.sqrt(var)

tensor[3, 5] n=15 x∈[-1.776, 1.646] μ=3.974e-09 σ=0.926

In [63]:
class LayerNorm_custom2(torch.nn.Module):
    def __init__(self, emb_dim, eps=1e-8):
        super().__init__()
        self.scale = torch.nn.Parameter(torch.ones(emb_dim))
        self.shift = torch.nn.Parameter(torch.zeros(emb_dim))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True)

        norm_tensor = (x - mean) / torch.sqrt(var + self.eps)
        output = self.scale * norm_tensor + self.shift
        return output


In [64]:
class FFNWithNorm(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(5, 6)
        self.relu = torch.nn.ReLU()
        self.ln = LayerNorm_custom2(1)

    def forward(self, x):
        o = self.relu(self.linear(x))
        o = self.ln(x)
        return o

torch.manual_seed(42)
data = torch.randn(3, 5)
model = FFNWithNorm()
output = model(data)


mean = output.mean(dim=-1, keepdim=True)
var = output.var(dim=-1, keepdim=True)

(output - mean) / torch.sqrt(var)

tensor[3, 5] n=15 x∈[-1.776, 1.646] μ=3.974e-09 σ=0.926 grad DivBackward0

## GeLU activation

In [81]:
class GeLU_custom(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        o = 0.5 * x * (1 + torch.tanh(torch.sqrt(torch.tensor(2) / torch.pi) * (x + 0.044715 * torch.pow(x, 3))))
        return o
    

class FFNWithNorm(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(5, 6)
        self.act = GeLU_custom()
        self.ln = LayerNorm_custom()

    def forward(self, x):
        o = self.linear(x)
        o = self.act(o)
        o = self.ln(o)
        return o

torch.manual_seed(42)
data = torch.randn(3, 5)
model = FFNWithNorm()
output = model(data)
output

tensor[3, 6] n=18 x∈[-1.151, 1.720] μ=1.325e-08 σ=0.939 grad DivBackward0