In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

### Preparing the dataset

In [2]:
# Read the corpus: one name per line. The `with` block closes the file handle
# deterministically instead of leaving it to the garbage collector.
with open("names.txt", "r") as names_file:
    words = names_file.read().splitlines()

In [3]:
# Alphabet: every distinct character that occurs in any name, in sorted order.
chars = sorted(set("".join(words)))

# Character -> integer id. Ids start at 1 because id 0 is reserved for the
# "." token that marks both the start padding and the end of a name.
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi["."] = 0

# Inverse mapping (integer id -> character), used when decoding samples.
itos = dict(zip(stoi.values(), stoi.keys()))

vocab_size = len(stoi)
vocab_size

27

In [4]:
block_size = 3  # context length: how many preceding characters predict the next one


def build_dataset(words, context_size=None):
    """Turn a list of words into (context, next-character) training pairs.

    Every word is terminated with "." and scanned left to right. For each
    character, the ids of the `context_size` preceding characters (left-padded
    with id 0) form one input row and the character's own id is the target.

    Args:
        words: iterable of strings whose characters are all keys of `stoi`.
        context_size: number of context characters per example. Defaults to
            the module-level `block_size`, read at call time, so the dataset
            always agrees with the value the model is built with.

    Returns:
        (X, Y): `X` is an int64 tensor of shape (N, context_size) and `Y` an
        int64 tensor of shape (N,), where N is the total number of characters
        including one "." terminator per word. Empty input gives N == 0.

    Raises:
        ValueError: if `context_size` is smaller than 1.
        KeyError: if a word contains a character missing from `stoi`.
    """
    if context_size is None:
        context_size = block_size
    if context_size < 1:
        raise ValueError(f"context_size must be >= 1, got {context_size}")

    contexts, targets = [], []

    for w in words:
        context = [0] * context_size  # start-of-word padding
        for ch in w + ".":
            ix = stoi[ch]
            contexts.append(context)
            targets.append(ix)
            # Slide the window; this builds a new list, so rows already
            # appended to `contexts` are never mutated.
            context = context[1:] + [ix]

    # Explicit dtype and shape keep the result well-formed for empty input,
    # where a bare torch.tensor([]) would be a float tensor of shape (0,).
    X = torch.tensor(contexts, dtype=torch.long).view(len(contexts), context_size)
    Y = torch.tensor(targets, dtype=torch.long)

    return X, Y

import random

# Shuffle a copy rather than `words` itself. Shuffling `words` in place makes
# this cell non-idempotent: every re-run would reshuffle the already shuffled
# list and produce a different train/dev/test split. With the same seed and the
# same list length, the permutation on a fresh kernel is unchanged.
shuffled_words = list(words)
random.seed(42)
random.shuffle(shuffled_words)

# Split boundaries: first 80% train, next 10% dev, final 10% test.
n1 = int(0.8*len(shuffled_words))
n2 = int(0.9*len(shuffled_words))

Xtr, Ytr = build_dataset(shuffled_words[:n1])
Xdev, Ydev = build_dataset(shuffled_words[n1:n2])
Xte, Yte = build_dataset(shuffled_words[n2:])

### Defining the layers

In [5]:
from makemore.linear import Linear
from makemore.tanh import Tanh
from makemore.batchNorm1d import BatchNorm1d

In [6]:
n_embed = 10    # size of each character embedding vector
n_hidden = 100  # width of every hidden layer
g = torch.Generator().manual_seed(2147483647)

# Embedding table: one row of n_embed values per vocabulary entry.
C = torch.randn((vocab_size, n_embed), generator=g)

# Five Linear -> BatchNorm1d -> Tanh blocks (the first consumes the flattened
# context embeddings, the other four are n_hidden wide), followed by a
# Linear -> BatchNorm1d output block that produces one logit per character.
# Modules are constructed in the same order as they appear in the network.
hidden_inputs = [n_embed * block_size] + [n_hidden] * 4
layers = []
for fan_in in hidden_inputs:
    layers += [Linear(fan_in, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh()]
layers += [Linear(n_hidden, vocab_size, bias=False), BatchNorm1d(vocab_size)]

hidden_gain = 1.0  # 5/3 is the alternative noted in the original code
with torch.no_grad():
    # last layer: make less confident by shrinking the final BatchNorm1d gamma
    layers[-1].gamma *= 0.1
    # all other layers: apply the gain to every Linear weight
    for module in layers[:-1]:
        if isinstance(module, Linear):
            module.weight *= hidden_gain

# Everything that gets trained: the embedding table plus each layer's parameters.
parameters = [C]
for module in layers:
    parameters.extend(module.parameters())

print(sum(param.nelement() for param in parameters)) # number of parameters in total
for param in parameters:
    param.requires_grad = True

47024


### Training loop

In [11]:
%%time

max_steps = 200000
batch_size = 32
lr_decay_step = 100000  # learning rate drops from 0.1 to 0.01 at this step
lossi = []  # log10 of the minibatch loss, one entry per step
ud = []     # per step: log10(std(lr * grad) / std(param)) for every parameter

# The Evaluation section sets `training = False` on every layer. Reset the flag
# so that re-running this cell afterwards does not train in evaluation mode.
# NOTE(review): assumes the layer classes read `.training` as a plain flag — confirm.
for layer in layers:
    layer.training = True

for i in range(max_steps):

    # minibatch: draw indices from the seeded generator `g` so that a fresh
    # run reproduces the same sequence of batches
    ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
    Xb, Yb = Xtr[ix], Ytr[ix]

    # forward pass: embed the context characters, flatten, run through the layers
    emb = C[Xb]
    x = emb.view(-1, block_size * n_embed)

    for layer in layers:
        x = layer(x)

    loss = F.cross_entropy(x, Yb)

    # backward pass
    for layer in layers:
        layer.out.retain_grad() # AFTER_DEBUG: would take out retain_grad

    for p in parameters:
        p.grad = None

    loss.backward()

    # plain SGD update with a single step-wise learning-rate decay
    lr = 0.1 if i < lr_decay_step else 0.01
    for p in parameters:
        p.data += -lr * p.grad

    # track stats
    if i % 10000 == 0: # print every once in a while
        print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
    lossi.append(loss.log10().item())

    with torch.no_grad(): # how large the updates are compared to the values
        ud.append([((lr*p.grad).std() / p.data.std()).log10().item() for p in parameters])

      0/ 200000: 2.1189
  10000/ 200000: 2.4210
  20000/ 200000: 1.8325
  30000/ 200000: 1.9717
  40000/ 200000: 1.9369
  50000/ 200000: 2.8048
  60000/ 200000: 1.9833
  70000/ 200000: 2.0637
  80000/ 200000: 2.3088
  90000/ 200000: 2.2504
 100000/ 200000: 2.0926
 110000/ 200000: 2.0370
 120000/ 200000: 1.8982
 130000/ 200000: 1.5689
 140000/ 200000: 2.0867
 150000/ 200000: 1.9026
 160000/ 200000: 1.6841
 170000/ 200000: 1.5990
 180000/ 200000: 1.8545
 190000/ 200000: 2.1288
CPU times: user 55min 2s, sys: 13.5 s, total: 55min 15s
Wall time: 9min 54s


### Evaluation

In [12]:
# Put every layer into evaluation mode by clearing its `training` flag
# before computing split losses and sampling.
for module in layers:
    setattr(module, "training", False)

In [13]:
@torch.no_grad()
def split_loss(split):
    """Print the cross-entropy loss of the current model on one data split.

    Args:
        split: one of "train", "dev" or "test"; any other value raises KeyError.

    The whole split is pushed through the embedding table and all layers in a
    single forward pass, with gradient tracking disabled by the decorator.
    Nothing is returned; the split name and loss value are printed.
    """
    datasets = {
        "train": (Xtr, Ytr),
        "test": (Xte, Yte),
        "dev": (Xdev, Ydev),
    }
    inputs, targets = datasets[split]

    # Look up the embeddings and flatten each example's context into one row.
    embedded = C[inputs]
    activations = embedded.view(embedded.shape[0], -1)

    for module in layers:
        activations = module(activations)

    loss = F.cross_entropy(activations, targets)
    print(split, loss.item())

# Report the loss on the training and dev splits.
for split_name in ("train", "dev"):
    split_loss(split_name)

train 1.96268630027771
dev 2.09045672416687


### Sample from the model

In [16]:
# Dedicated seeded generator so the sampled names are reproducible across runs.
sample_gen = torch.Generator().manual_seed(2147483647 + 10)

# Sampling needs no gradients; without no_grad() every forward pass would build
# an autograd graph because the parameters have requires_grad=True.
# NOTE(review): each forward pass here uses a batch of one, so the layers
# presumably must already be in evaluation mode (see the Evaluation section,
# which sets `training = False`) — confirm against the BatchNorm1d implementation.
with torch.no_grad():
    for _ in range(20):
        context = [0]*block_size  # start from the all-padding context
        out = []

        while True:
            # forward pass for a single context
            emb = C[torch.tensor([context])]
            x = emb.view(emb.shape[0], -1)
            for layer in layers:
                x = layer(x)

            logits = x
            probs = F.softmax(logits, dim=1)

            # draw the next character id from the predicted distribution
            ix = torch.multinomial(probs, num_samples=1, generator=sample_gen).item()

            # slide the context window and record the sampled id
            context = context[1:] + [ix]
            out.append(ix)

            # id 0 is the "." token: the name is finished
            if ix == 0:
                break

        print("".join(itos[i] for i in out))
    
    

a
av
avr
avri
avrin
avring
avrings
avringst
avringste
avringstel
avringstell
avringstelle
avringstellen
a
ar
ari
aria
ariau
ariaun
b
be
ben
bent
benty
bentyn
a
an
ani
aniy
aniya
aniyah
g
ge
gem
gemi
gemia
gemiah
s
st
sto
ston
stone
s
so
sor
sorr
sorre
sorres
sorresk
sorreska
m
ma
mar
mari
k
ka
kam
kama
kamar
kamark
kamarke
kamarkel
d
da
dax
j
ja
jak
jaky
jakyr
jakyri
jakyria
jakyrian
jakyriann
jakyrianne
jakyrianner
n
ny
nyl
nyla
nylan
nylani
k
ka
kai
kair
kaire
kairel
m
ma
mav
mave
mavee
r
ra
rae
raey
raeyi
raeyio
t
th
tha
thai
thail
thaila
thailan
thailana
a
al
ale
alem
alemo
alemon
alemone
alemoneo
d
de
des
dest
desto
deston
destoni
destonis
h
ha
han
hana
hanat
hanata
hanatal
hanatale
hanatalek
hanataleko
hanatalekob
hanatalekobe
hanatalekober
m
me
mel
melo
