In [11]:
import random

import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

In [8]:
# Load the corpus from disk and split it into one entry per line.
path_in = "../data/names.txt"
with open(path_in, mode='r') as names_file:
    words = names_file.read().splitlines()
# Echo the corpus size and a small sample as the cell output.
len(words), words[:5]

(32033, ['emma', 'olivia', 'ava', 'isabella', 'sophia'])

In [9]:
# Distinct characters that occur anywhere in the corpus, in sorted order.
chars = sorted(set(''.join(words)))
# Character -> index, starting at 1 so that index 0 stays free for '.'.
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
# Reverse lookup: index -> character.
itos = {idx: ch for ch, idx in stoi.items()}
# One slot per corpus character plus one for '.'.
vocab_size = len(chars) + 1

In [12]:
# Dataset construction: number of preceding character indices fed to the model.
block_size = 3

def build_dataset(words):
    """Turn a list of words into (context, target) index tensors.

    Each word is extended with a trailing '.', and for every character the
    current context window (the `block_size` previous indices, starting as all
    zeros) is paired with that character's index from the module-level `stoi`.

    Returns:
        A tuple `(X, Y)` where `X` stacks the context windows and `Y` holds
        the index of the character that follows each window.
    """
    contexts, targets = [], []
    for word in words:
        window = [0] * block_size
        for target in (stoi[ch] for ch in word + '.'):
            contexts.append(window)
            targets.append(target)
            # Slide the window: drop the oldest index, append the newest.
            window = window[1:] + [target]
    return torch.tensor(contexts), torch.tensor(targets)

# Seeded shuffle followed by an 80% / 10% / 10% train / val / test split.
# NOTE: random.shuffle reorders `words` in place, so re-running this cell
# without reloading `words` shuffles an already-shuffled list.
random.seed(42)
random.shuffle(words)
n1, n2 = (int(fraction * len(words)) for fraction in (0.8, 0.9))
(Xtr, Ytr), (Xval, Yval), (Xte, Yte) = (
    build_dataset(subset) for subset in (words[:n1], words[n1:n2], words[n2:])
)
Xtr.shape, Ytr.shape, Xval.shape, Yval.shape, Xte.shape, Yte.shape

(torch.Size([182625, 3]),
 torch.Size([182625]),
 torch.Size([22655, 3]),
 torch.Size([22655]),
 torch.Size([22866, 3]),
 torch.Size([22866]))

In [2]:
# Seeded random generator; Linear.__init__ passes it to torch.randn as `generator=g`.
# The model-construction cell later rebinds `g` to a fresh generator with the same seed.
g = torch.Generator().manual_seed(123)

In [4]:
class Linear:
    """Affine layer computing `x @ weight (+ bias)`.

    The weight has shape `(fan_in, fan_out)` and is drawn with `torch.randn`
    from the module-level generator `g`, then divided by `sqrt(fan_in)`.
    The bias, when enabled, starts as zeros of shape `(fan_out,)`.
    """

    def __init__(self, fan_in, fan_out, bias=True):
        scale = fan_in**0.5
        self.weight = torch.randn((fan_in, fan_out), generator=g) / scale
        if bias:
            self.bias = torch.zeros(fan_out)
        else:
            self.bias = None

    def __call__(self, x):
        """Apply the layer to `x`; the result is also kept on `self.out`."""
        result = x @ self.weight
        if self.bias is not None:
            # In-place add, so `result` stays the same tensor object.
            result += self.bias
        self.out = result
        return result

    def parameters(self):
        """Return the layer's tensors: the weight, then the bias if present."""
        tensors = [self.weight]
        if self.bias is not None:
            tensors.append(self.bias)
        return tensors


In [6]:

class BatchNorm1d:
    """Batch normalization over dimension 0 of the input.

    While `self.training` is True, each call normalizes with the statistics of
    the current batch and folds them into exponential moving averages. When
    `self.training` is False, the stored running statistics are used instead
    and are left untouched.

    Args:
        dim: number of features; parameters and buffers have shape `(1, dim)`,
            which matches the batch statistics of an input shaped `(batch, dim)`.
        epsilon: constant added to the variance before taking the square root.
        momentum: weight of the current batch statistics in the running
            averages (the previous running value is weighted `1 - momentum`).
    """

    def __init__(self, dim, epsilon=1e-5, momentum=0.1):
        self.epsilon = epsilon
        self.momentum = momentum
        self.training = True
        # parameters (returned by `parameters()`)
        self.bngain = torch.ones((1, dim))
        self.bnbias = torch.zeros((1, dim))
        # buffers (running statistics, updated outside of autograd)
        self.bnmean_running = torch.zeros((1, dim))
        self.bnvar_running  = torch.ones((1, dim))

    def __call__(self, x):
        """Normalize `x`, apply gain and bias, and return the result.

        The result is also stored on `self.out`. In training mode the running
        mean and variance are updated as a side effect.
        """
        if self.training:
            # Statistics of the current batch (variance uses the unbiased estimator).
            bnmeani = x.mean(0, keepdim=True)
            bnvari = x.var(0, keepdim=True, unbiased=True)
        else:
            bnmeani = self.bnmean_running
            bnvari = self.bnvar_running
        xhat = (x - bnmeani) / (bnvari + self.epsilon)**0.5
        self.out = self.bngain * xhat + self.bnbias
        if self.training:
            # The running statistics are bookkeeping only, so keep them out of the graph.
            with torch.no_grad():
                # Fixed: these lines previously read `self.momentun`, an attribute
                # that is never set, so every training-mode call raised AttributeError.
                self.bnmean_running = (1 - self.momentum) * self.bnmean_running + self.momentum * bnmeani
                self.bnvar_running = (1 - self.momentum) * self.bnvar_running + self.momentum * bnvari
        return self.out

    def parameters(self):
        """Return the tensors listed as parameters: gain, then bias."""
        return [self.bngain, self.bnbias]


In [7]:
class Tanh:
    """Element-wise hyperbolic tangent layer with no parameters."""

    def __call__(self, x):
        """Return `torch.tanh(x)`; the result is also kept on `self.out`."""
        activated = torch.tanh(x)
        self.out = activated
        return activated

    def parameters(self):
        """Return an empty list: this layer has nothing to train."""
        return list()

In [14]:
# Model hyperparameters: embedding width per character and hidden-layer width.
n_embd = 10
n_hidden = 100

# Fresh seeded generator so the initialization below is repeatable.
g = torch.Generator().manual_seed(123)

# Embedding table: one row of n_embd values per vocabulary entry.
C = torch.randn((vocab_size, n_embd), generator=g)

# Input block, four hidden Linear+Tanh blocks, then the output projection.
# Layers are created strictly in this order, so draws from `g` happen in the
# same sequence as spelling every layer out by hand.
layers = [
    Linear(n_embd * block_size, n_hidden), Tanh(),
    *(module for _ in range(4) for module in (Linear(n_hidden, n_hidden), Tanh())),
    Linear(n_hidden, vocab_size),
]

# Rescale the initial weights without recording the operations in autograd.
with torch.no_grad():
    for layer in layers[:-1]:
        if isinstance(layer, Linear):
            # 5/3 is the value torch.nn.init.calculate_gain documents for tanh.
            layer.weight *= 5/3
    # Shrink the output layer's weights by 10x.
    layers[-1].weight *= 0.1

# Collect every trainable tensor (embedding first) and enable gradients on each.
parameters = [C, *(p for layer in layers for p in layer.parameters())]
for p in parameters:
    p.requires_grad = True