In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
with open("names.txt", "r") as file:
    words = file.read().splitlines()

words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [3]:
# build a vocabulary of characters and mappings to/from integers
chars = sorted(list(set("".join(words))))
stoi = {s: i+1 for i, s in enumerate(chars)}
stoi["."] = 0
itos = {i: s for s, i in stoi.items()}
vocab_size = len(itos)
print(itos)
print(vocab_size)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}
27


In [4]:
import random
random.seed(42)
random.shuffle(words)

In [69]:
block_size = 8  # context length: how many characters do we take to predict the next one
def build_dataset(words):

    X, Y = [], []
    for w in words:

        context = [0] * block_size
        for ch in w + ".":
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix]    # crop and append

    # basically we have a rolling window for context
    X = torch.tensor(X)
    Y = torch.tensor(Y)
    print(X.shape, Y.shape)
    return X, Y

n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

Xtr, Ytr = build_dataset(words[:n1])
Xdev, Ydev = build_dataset(words[n1:n2])
Xte, Yte = build_dataset(words[n2:])

torch.Size([182625, 8]) torch.Size([182625])
torch.Size([22655, 8]) torch.Size([22655])
torch.Size([22866, 8]) torch.Size([22866])


In [70]:
for x, y in zip(Xtr[:20], Ytr[:20]):
    print(f"{''.join(itos[ix.item()] for ix in x)} --> {itos[y.item()]}")

........ --> y
.......y --> u
......yu --> h
.....yuh --> e
....yuhe --> n
...yuhen --> g
..yuheng --> .
........ --> d
.......d --> i
......di --> o
.....dio --> n
....dion --> d
...diond --> r
..diondr --> e
.diondre --> .
........ --> x
.......x --> a
......xa --> v
.....xav --> i
....xavi --> e


In [71]:
import abc
from typing import Any, List, Optional


class Layer(abc.ABC):
    @abc.abstractmethod
    def __call__(self, x: torch.tensor) -> torch.tensor:
        pass

    @abc.abstractmethod
    def parameters() -> List[torch.tensor]:
        pass

In [97]:
from typing import Any, List


class Linear(Layer):
    def __init__(
        self,
        in_features: int,
        out_features: int,
        bias: bool = True,
        device: Optional[str] = None,
    ) -> None:
        self.weights = torch.randn((in_features, out_features)) / in_features ** 0.5    # NOTE: kaiming init
        self.bias = torch.zeros(out_features) if bias else None

    def __call__(self, x: torch.tensor) -> torch.tensor:

        self.out = x @ self.weights
        if self.bias is not None:
            self.out += self.bias
        return self.out

    def parameters(self) -> List[torch.tensor]:
        return [self.weights] + ([] if self.bias is None else [self.bias])


class BatchNorm1d(Layer):
    def __init__(
        self,
        num_features: int,
        eps: float = 1e-5,
        momentum: float = 0.1,
    ) -> None:
        self.eps = eps
        self.momentum = momentum
        self.training = True
        # parameters
        self.gamma = torch.ones(num_features)
        self.beta = torch.zeros(num_features)
        # buffers (trained with a running 'momentum update')
        self.running_mean = torch.zeros(num_features)
        self.running_var = torch.ones(num_features)

    def __call__(self, x: torch.tensor) -> torch.tensor:
        # calculate the forward pass
        if self.training:
            if x.ndim == 2:
                dim = 0
            elif x.ndim == 3:
                dim = (0, 1)

            xmean = x.mean(dim, keepdim=True)
            xvar = x.var(dim, keepdim=True)
        else:  # inference
            xmean = self.running_mean
            xvar = self.running_var

        xhat = (x - xmean) / torch.sqrt(xvar + self.eps)
        self.out = xhat * self.gamma + self.beta

        # update the buffers
        if self.training:
            with torch.no_grad():
                # updated using the exponential moving average
                self.running_mean = (
                    1 - self.momentum
                ) * self.running_mean + self.momentum * xmean
                self.running_var = (
                    1 - self.momentum
                ) * self.running_var + self.momentum * xvar
        return self.out

    def parameters(self) -> List[torch.tensor]:
        return [self.gamma, self.beta]


class Tanh(Layer):
    def __init__(self) -> None:
        pass

    def __call__(self, x: torch.tensor) -> torch.tensor:
        self.out = torch.tanh(x)
        return self.out

    def parameters(self) -> Optional[List]:
        return []


# same name in torch.nn
class Embedding(Layer):
    def __init__(self, num_embeddings, embedding_dim) -> None:
        self.weights = torch.randn((num_embeddings, embedding_dim))

    def __call__(self, IX: torch.tensor) -> torch.tensor:
        self.out = self.weights[IX]
        return self.out

    def parameters(self) -> List:
        return [self.weights]


class FlattenConsecutive(Layer):
    def __init__(self, n: int) -> None:
        self.n = n

    def __call__(self, x: torch.tensor) -> torch.tensor:
        B, T, C = x.shape
        x = x.view(B, T//self.n, C*self.n)
        if x.shape[1] == 1:
            x = x.squeeze(1)

        self.out = x
        return self.out

    def parameters(self) -> List:
        return []


class Sequential(Layer):

    def __init__(self, layers: List[Layer]) -> None:
        self.layers = layers

    def __call__(self, x) -> Any:
        for layer in self.layers:
            x = layer(x)
        self.out = x
        return self.out

    def parameters(self) -> List[torch.tensor]:
        return [p for layer in self.layers for p in layer.parameters()]

In [98]:
torch.manual_seed(42);

In [105]:
n_embd = 24  # the dimensionality of the character embedding vectors
n_hidden = 128  # the number of neurons in the hidden layer of the MLP

model = Sequential([
    Embedding(vocab_size, n_embd),
    FlattenConsecutive(2), Linear(n_embd * 2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
    FlattenConsecutive(2), Linear(n_hidden * 2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
    FlattenConsecutive(2), Linear(n_hidden * 2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
    Linear(n_hidden, vocab_size)     # , bias=True),
    # NOTE: the bias if False for the first linear layer as it is followed by the batchnorm layer and it is False for the last layer as it is not followed by the batchnorm layer.
])

# NOTE: there are similar results when placing batchnorm after activation layers

with torch.no_grad():
    # last layer: make less confident
    model.layers[-1].weights *= 0.1


# parameters = [C] + [p for layer in layers for p in layer.parameters()]
# parameters = [p for layer in layers for p in layer.parameters()]
parameters = model.parameters()

print(f"Total num of elements: {sum(p.nelement() for p in parameters)}")

for p in parameters:
    p.requires_grad = True

Total num of elements: 76579


In [106]:
# NOTE: We are crushing too much information for our basic network (data compression during learning) by simply scaling the dataset or vocab size to 8
ix = torch.randint(0, Xtr.shape[0], (4,))
Xb, Yb = Xtr[ix], Ytr[ix]
model(Xb).shape

torch.Size([4, 27])

In [108]:
for layer in model.layers:
    print(f"{layer.__class__.__name__} : {tuple(layer.out.shape)}")

Embedding : (32, 8, 24)
FlattenConsecutive : (32, 4, 48)
Linear : (32, 4, 128)
BatchNorm1d : (32, 4, 128)
Tanh : (32, 4, 128)
FlattenConsecutive : (32, 2, 256)
Linear : (32, 2, 128)
BatchNorm1d : (32, 2, 128)
Tanh : (32, 2, 128)
FlattenConsecutive : (32, 256)
Linear : (32, 128)
BatchNorm1d : (32, 128)
Tanh : (32, 128)
Linear : (32, 27)


In [107]:
n_epochs = 200000
batch_size = 32
lossi = []

for i in range(n_epochs):
    # minibatch construct
    ix = torch.randint(0, Xtr.shape[0], (batch_size,))
    Xbatch, Ybatch = Xtr[ix], Ytr[ix]

    # forward pass
    logits = model(Xbatch)
    loss = F.cross_entropy(logits, target=Ybatch)

    for p in parameters:
        p.grad = None
    loss.backward()

    # parameter update
    lr = 0.1 if i < 150000 else 0.01

    for p in parameters:
        p.data += -lr * p.grad

    # track stats
    if i % 10000 == 0:
        print(f"{i:7d}/{n_epochs:7d}: {loss.item():.4f}")

    lossi.append(loss.log10().item())

    break

      0/ 200000: 3.3111


## Performance log

Write down every new feature you did or bug you fixed: the new result

In [62]:
plt.plot(torch.tensor(lossi).view(-1, 1000).mean(1));

RuntimeError: shape '[-1, 1000]' is invalid for input of size 1

In [63]:
# put layers into eval mode (especially needed for batchnorm specifically)
for layer in model.layers:
    layer.training = False

In [64]:
@torch.no_grad()
def split_loss(split: str):
    x, y = {
        "train": (Xtr, Ytr),
        "val": (Xdev, Ydev),
        "test": (Xte, Yte)
    }[split]

    logits = model(x)
    loss = F.cross_entropy(logits, y)
    print(split, loss.item())

split_loss("train")
split_loss("val")

train 3.8625447750091553
val 3.867837429046631


In [65]:
# sample from the model

for _ in range(20):
    out = []
    context = [0] * block_size

    while True:
        logits = model(torch.tensor([context]))

        # NOTE you were getting nan probs as you didnt set the layers to inference/eval mode
        probs = F.softmax(logits, dim=1)
        # sample from the distribution
        ix = torch.multinomial(probs, num_samples=1).item()
        # shift the context window and track the samples
        context = context[1:] + [ix]
        out.append(ix)

        if ix == 0:
            break

    print("".join(itos[i] for i in out))

aelxptlp.
tblqxnymscljnnnalxibt.
pmgknnyackttelvjwuqpdarjgfvmkydreqpulpfncqvesolvmrnzwjsftsocspzaksyeckujnq.
olpppulturx.
encxpswgvgugyyyizveqdkjutdhrzm.
fyfnmuufhrmlvespegbsblwzj.
esoqsyqfeod.
lxgkifglvghvetpbnf.
hxdundjgzdefop.
m.
fzriufjehsniflsuddnfj.
klrtcobzgxpnltorawlvxnm.
eeppdmyynkoblqegh.
awcqxnyou.
mm.
ayvclvtuidmcujpdnyn.
mmhrybioutgrrawsoloppbxkyfzlcunynhtdbs.
g.
darwqw.
xej.


## Problem with NaNs and infs

The problem was with the batchnorm layer being still in training mode as it is trying to calculate the variance/spread of a single unit (when we are doing inference) and a variance of a single number is not a number as we can see below. 

In [68]:
torch.var(torch.tensor  ([5.0]))

  torch.var(torch.tensor([5.0]))


tensor(nan)