# LiteByte: Online Byte-Level MLP Model

This notebook demonstrates the core components of the LiteByte architecture in a minimal setting. It runs in under 30 minutes on a Google Colab free-tier GPU.

In [None]:
# Install required libraries
!pip install torch numpy

In [None]:
# Define a simplified LiteByte block (no expert routing, 1 patch, 1 OrgLayer)
import torch
import torch.nn as nn
import torch.nn.functional as F

class LiteByte(nn.Module):
    """Minimal byte-level MLP block operating on fixed-size byte patches.

    Each input byte is embedded, squashed with a sigmoid, and groups of
    ``patch_size`` consecutive embeddings are concatenated into one patch
    vector. Every patch vector is layer-normalised, passed through a
    residual feed-forward network and projected to ``vocab_size`` logits.

    Args:
        vocab_size: Number of distinct input symbols (256 for raw bytes).
        vocab_dim: Embedding width of a single byte.
        hidden_dim: Width of a patch vector; must equal
            ``patch_size * vocab_dim`` because patches are formed by
            concatenating embeddings, not by a learned projection.
        patch_size: Number of consecutive bytes merged into one patch.

    Raises:
        ValueError: If ``patch_size`` is not positive or
            ``hidden_dim != patch_size * vocab_dim``.
    """

    def __init__(
        self,
        vocab_size: int = 256,
        vocab_dim: int = 64,
        hidden_dim: int = 512,
        patch_size: int = 8,
    ) -> None:
        super().__init__()
        if patch_size <= 0:
            raise ValueError(f"patch_size must be positive, got {patch_size}")
        # Fail at construction time instead of with an opaque shape error
        # inside LayerNorm on the first forward pass.
        if hidden_dim != patch_size * vocab_dim:
            raise ValueError(
                f"hidden_dim ({hidden_dim}) must equal patch_size * vocab_dim "
                f"({patch_size} * {vocab_dim} = {patch_size * vocab_dim})"
            )
        self.patch_size = patch_size
        self.embed = nn.Embedding(vocab_size, vocab_dim)
        self.norm1 = nn.LayerNorm(hidden_dim)
        # NOTE(review): the FFN ends with a GELU, so the residual branch is
        # passed through an activation before being added back -- confirm
        # this is intended rather than a linear output layer.
        self.ffn = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim * 4),
            nn.GELU(),
            nn.Linear(hidden_dim * 4, hidden_dim),
            nn.GELU(),
        )
        self.proj = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Compute per-patch logits for a batch of byte sequences.

        Args:
            x: Integer tensor of shape ``(batch, seq_len)`` holding symbol
                indices; ``seq_len`` must be a multiple of ``patch_size``.

        Returns:
            Tensor of shape ``(batch, seq_len // patch_size, vocab_size)``.

        Raises:
            ValueError: If ``x`` is not 2-D or ``seq_len`` is not divisible
                by ``patch_size``.
        """
        if x.dim() != 2:
            raise ValueError(
                f"expected input of shape (batch, seq_len), got {tuple(x.shape)}"
            )
        seq_len = x.shape[1]
        if seq_len % self.patch_size != 0:
            raise ValueError(
                f"sequence length {seq_len} is not divisible by "
                f"patch_size {self.patch_size}"
            )

        x = torch.sigmoid(self.embed(x))
        B, T, D = x.shape
        # Concatenate the embeddings of each run of `patch_size` bytes into a
        # single patch vector of width patch_size * D == hidden_dim.
        x = x.reshape(B, T // self.patch_size, self.patch_size * D)
        x = self.norm1(x)
        x = self.ffn(x) + x  # residual connection around the FFN
        logits = self.proj(x)
        return logits

# Smoke test: push one random batch of byte IDs through a freshly built model.
model = LiteByte()
x = torch.randint(high=256, size=(2, 64))  # 2 sequences of 64 byte values in [0, 256)
logits = model(x)
logits.shape  # expected torch.Size([2, 8, 256]): one row of logits per 8-byte patch

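You can now train the model with a standard cross-entropy loss, performing one online update per batch. Note that the model emits one set of logits per 8-byte patch (output shape `(batch, seq_len // 8, 256)`), so the targets passed to the loss must supply one label per patch rather than one per input byte.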