In [3]:
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# Read the whole file in one go, then break it into one entry per line.
with open('names.txt') as f:
    content = f.read()
words = content.splitlines()
len(words)

32033

In [5]:
# Vocabulary: every distinct character that occurs in `words`, in sorted order.
chars = sorted(set(''.join(words)))
# Characters are numbered from 1 upwards; index 0 is kept for the '.' marker.
stoi = {ch: ix for ix, ch in enumerate(chars, start=1)}
stoi['.'] = 0
# Reverse lookup: index -> character.
itos = {ix: ch for ch, ix in stoi.items()}
vocab_size = len(itos)

In [6]:
import torch

# Build dataset splits
def build_dataset(words: list[str], block_size: int) -> (torch.tensor, torch.tensor):
    x, y = [], []
    for w in words:
        #print(w)
        context = [0] * block_size
        for ch in w + '.':
            ix = stoi[ch]
            x.append(context)
            y.append(ix)
            #print(''.join(itos[i] for i in context), "--->", ch)
            # Advance the rolling window of context
            context = context[1:] + [ix]
    X = torch.tensor(x)
    Y = torch.tensor(y)
    print(X.shape, Y.shape)
    return X, Y

import random
# Seed the global RNG before shuffling so a fresh run always produces the same order.
# Note that `words` is shuffled in place.
random.seed(42)
random.shuffle(words)

# Split points: the first 80% is train, the next 10% is dev, the rest is test.
n1, n2 = (int(frac * len(words)) for frac in (0.8, 0.9))

CONTEXT_SZ = 5
Xtr, Ytr = build_dataset(words[:n1], CONTEXT_SZ)
Xdev, Ydev = build_dataset(words[n1:n2], CONTEXT_SZ)
Xte, Yte = build_dataset(words[n2:], CONTEXT_SZ)

torch.Size([182625, 5]) torch.Size([182625])
torch.Size([22655, 5]) torch.Size([22655])
torch.Size([22866, 5]) torch.Size([22866])


# Model

In [8]:
def cmp(s: str, dt: torch.Tensor, t: torch.Tensor) -> None:
    """Compare a manually computed gradient with the gradient stored on `t`.

    Prints a single row containing the label, whether the two gradients are
    exactly equal, whether they are close under `torch.allclose` defaults,
    and the largest absolute element-wise difference.

    Args:
        s: Label printed at the start of the row (padded to 15 characters).
        dt: Manually computed gradient.
        t: Tensor whose `.grad` attribute holds the reference gradient.

    Raises:
        ValueError: If `t.grad` is None, i.e. there is nothing to compare to.
    """
    if t.grad is None:
        raise ValueError(f"{s}: tensor has no .grad to compare against")
    ex = torch.all(dt == t.grad).item()
    app = torch.allclose(dt, t.grad)
    maxdiff = (dt - t.grad).abs().max().item()
    print(f"{s:15s} | exact: {str(ex):5s} | approx: {str(app):5s} | maxdiff: {maxdiff}")

In [12]:
from typing import Union

class Linear:
    """A fully connected layer computing `x @ weight + bias`."""

    def __init__(
        self,
        fan_in: int,
        fan_out: int,
        generator: Union[torch.Generator, None] = None,
        weight_gain: Union[float, None] = None,
        bias_gain: float = 0.1,
    ) -> None:
        """Initialize Linear.

        Args:
            fan_in: Number of input features.
            fan_out: Number of output features.
            generator: Optional RNG used for BOTH the weight and the bias
                draw, so that a seeded generator fully determines the layer.
                When None, torch's global RNG is used.
            weight_gain: Scale applied to the standard-normal weights.
                Defaults to (5/3) / sqrt(fan_in); 5/3 is the value
                `torch.nn.init.calculate_gain('tanh')` returns.
            bias_gain: Scale applied to the standard-normal bias.
        """
        if weight_gain is None:
            weight_gain = (5/3) / (fan_in**0.5)
        self.weight = torch.randn((fan_in, fan_out), generator=generator) * weight_gain
        # Original note: the bias is redundant when a batch norm directly
        # follows this layer, but it is created and used anyway.
        self.bias = torch.randn(fan_out, generator=generator) * bias_gain
        # Output of the most recent forward pass (None until first call).
        self.out: Union[torch.Tensor, None] = None

    def __call__(self, x: torch.Tensor) -> torch.Tensor:
        """Forward pass: store and return `x @ weight + bias`."""
        self.out = x @ self.weight + self.bias
        return self.out

    def parameters(self) -> list:
        """Return the layer's tensors as `[weight, bias]`."""
        return [self.weight, self.bias]

    def __str__(self) -> str:
        return self.__class__.__name__


class BatchNorm1d:
    """A batch normalization layer over dimension 0 of its input.

    Unlike `torch.nn.BatchNorm1d`, this layer tracks a running standard
    deviation (not a variance), uses `Tensor.std` with its default settings
    for the batch statistic, and adds `eps` to the standard deviation rather
    than to the variance under the square root.
    """

    def __init__(
        self,
        dim: int,
        eps: float = 1e-5,
        momentum: float = 0.001,
        generator: Union[torch.Generator, None] = None,
    ) -> None:
        """Initialize BatchNorm1d.

        Args:
            dim: Number of features being normalized.
            eps: Small constant added to the standard deviation before dividing.
            momentum: Weight of the current batch statistic in the running
                buffers: `new = (1 - momentum) * old + momentum * batch`.
            generator: Optional RNG for the gain/bias initialization. When
                None, torch's global RNG is used.
        """
        self.eps = eps
        self.momentum = momentum
        self.training = True
        # Adds small random numbers to unmask gradient errors
        self.bngain = torch.randn((1, dim), generator=generator) * 0.1 + 1.0
        self.bnbias = torch.randn((1, dim), generator=generator) * 0.1
        # Buffers, updated with a running momentum update (not by gradients)
        self.running_mean = torch.zeros((1, dim))
        self.running_std = torch.ones((1, dim))
        # Output of the most recent forward pass (None until first call).
        self.out: Union[torch.Tensor, None] = None

    def __call__(self, x: torch.Tensor) -> torch.Tensor:
        """Forward pass.

        In training mode the batch mean/std are used and folded into the
        running buffers; otherwise the running buffers are used unchanged.
        """
        if not self.training:
            xmean = self.running_mean
            xstd = self.running_std
        else:
            xmean = x.mean(0, keepdim=True)
            xstd = x.std(0, keepdim=True)

            # Update buffers outside of autograd tracking.
            with torch.no_grad():
                self.running_mean = (1-self.momentum) * self.running_mean + self.momentum * xmean
                self.running_std = (1-self.momentum) * self.running_std + self.momentum * xstd

        # Preserve last value for statistics.
        # TODO: this divides by (std + eps), not sqrt(var + eps). Fix?
        self.out = self.bngain * (x - xmean) / (xstd + self.eps) + self.bnbias
        return self.out

    def parameters(self) -> list:
        """Return the trainable tensors as `[bngain, bnbias]`."""
        return [self.bngain, self.bnbias]

    def __str__(self) -> str:
        return self.__class__.__name__


class Nonlinearity:
    """Parameter-free activation layer that applies tanh."""

    def __init__(self) -> None:
        """Create the tanh module and an empty slot for the latest output."""
        self.out: Union[torch.Tensor, None] = None
        self.act = torch.nn.Tanh()

    def parameters(self):
        """This layer has nothing to train, so the list is always empty."""
        return list()

    def __call__(self, x: torch.Tensor) -> torch.Tensor:
        """Forward pass: apply the activation, remember the result, return it."""
        activated = self.act(x)
        self.out = activated
        return activated

    def __str__(self) -> str:
        return type(self).__name__

In [13]:
class Model:
    """Character-level MLP: embedding table followed by a stack of layers.

    The layer sizes come from the module-level constants EMBED_SZ,
    CONTEXT_SZ and HIDDEN_LAYER_SZ.
    """

    def __init__(self, vocab_size: int, generator: Union[torch.Generator, None] = None) -> None:
        """Create the embedding table and layers and enable gradients.

        Args:
            vocab_size: Number of rows in the embedding table `C`.
            generator: Optional RNG passed to the embedding and the Linear
                layer initialization. When None, torch's global RNG is used.
        """
        self.C = torch.randn((vocab_size, EMBED_SZ), generator=generator)
        self.layers = [
            Linear(EMBED_SZ * CONTEXT_SZ, HIDDEN_LAYER_SZ, generator=generator),
            Nonlinearity(),
            BatchNorm1d(HIDDEN_LAYER_SZ),
        ]

        # Put every layer in training mode (only some layers read this flag).
        for layer in self.layers:
            layer.training = True

        # Reset parameters for training
        for p in self.parameters():
            p.requires_grad = True

    def parameters(self) -> list:
        """Return the embedding table followed by every layer's parameters."""
        return [self.C] + [p for layer in self.layers for p in layer.parameters()]

    @property
    def num_parameters(self) -> int:
        """Total number of parameters."""
        return sum(p.nelement() for p in self.parameters())

    def forward_pass(self, x: torch.Tensor) -> torch.Tensor:
        """Forward pass.

        Args:
            x: Integer index tensor used to look up rows of `C`.

        Returns:
            The output of the last layer.
        """
        # Shape is x.shape + (EMBED_SZ,), e.g. (N, CONTEXT_SZ, EMBED_SZ).
        emb = self.C[x]
        xhat = emb.view(emb.shape[0], -1)  # concatenate the vectors

        for layer in self.layers:
            xhat = layer(xhat)

        return xhat