In [1]:
import torch
from torch import nn
import torch.nn.functional as F

In [2]:
# Load the dataset: one name per line. The context manager closes the file
# handle as soon as the read finishes instead of leaving it to the garbage
# collector.
with open('names.txt','r') as f:
    names = f.read().splitlines()

In [3]:
# Peek at the first ten entries to sanity-check the load.
names[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [4]:
# Vocabulary: every distinct character occurring in any name, in sorted order.
chars = sorted(set(''.join(names)))

In [5]:
# Character <-> index lookup tables. Index 0 is reserved for the '.' marker;
# the characters in `chars` are numbered from 1 in their sorted order.
stoi = {'.': 0}
itos = {0: '.'}
for idx, ch in enumerate(chars, start=1):
    stoi[ch] = idx
    itos[idx] = ch

In [10]:
def get_data(words, block_size=3):
    """Turn words into (context, next-character) training pairs.

    Each word is terminated with '.', and every character (including the
    terminator) becomes one example whose context is the `block_size`
    preceding character indices, left-padded with 0. Characters are mapped to
    indices through the module-level `stoi` table.

    Args:
        words: iterable of strings whose characters are all keys of `stoi`.
        block_size: number of context characters per example (default 3).

    Returns:
        (X, Y): X is an int64 tensor of shape (N, block_size) holding the
        contexts, Y is an int64 tensor of shape (N,) holding the targets.
        For empty input N is 0 and the shapes/dtypes are still as stated.

    Raises:
        KeyError: if a character is not present in `stoi`.
    """
    X = []
    Y = []
    for w in words:
        context = [0] * block_size
        for c in w + '.':
            ix = stoi[c]
            X.append(context)
            Y.append(ix)
            # slide the window; this builds a new list, so the one stored in X is untouched
            context = context[1:] + [ix]
    # Explicit dtype and shape keep the result well-formed when there are no
    # examples (torch.tensor([]) alone would give a float tensor of shape (0,)).
    X = torch.tensor(X, dtype=torch.long).view(len(X), block_size)
    Y = torch.tensor(Y, dtype=torch.long)
    return X,Y

In [11]:
# Build the dataset from every name: X holds the contexts, Y the next-character targets.
X,Y = get_data(names)

In [13]:
# First ten (context, target) pairs as a sanity check.
X[:10],Y[:10]

(tensor([[ 0,  0,  0],
         [ 0,  0,  5],
         [ 0,  5, 13],
         [ 5, 13, 13],
         [13, 13,  1],
         [ 0,  0,  0],
         [ 0,  0, 15],
         [ 0, 15, 12],
         [15, 12,  9],
         [12,  9, 22]]),
 tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9]))

In [180]:
# MLP parameters, all drawn from a standard normal:
#   C       27 x 10   embedding table
#   W1, b1  30 -> 200 hidden layer
#   W2, b2  200 -> 27 output layer
C = torch.randn(27, 10)
W1 = torch.randn(30, 200)
b1 = torch.randn(200)
W2 = torch.randn(200, 27)
b2 = torch.randn(27)

# Every tensor above is trained, so switch gradient tracking on for each.
params = [C, W1, b1, W2, b2]
for p in params:
    p.requires_grad_()
def train(X, Y, steps=200000, batch_size=32):
    """Train the module-level MLP parameters with minibatch SGD.

    Reads the globals `C`, `W1`, `b1`, `W2`, `b2` and the list `params`, and
    updates the tensors in `params` in place. Prints the minibatch loss every
    10000 steps.

    Args:
        X: integer tensor of contexts; rows are sampled along dim 0 and used
            to index the embedding table `C`.
        Y: integer tensor of target indices aligned with X along dim 0.
        steps: number of SGD steps (default 200000).
        batch_size: examples drawn per step (default 32).

    The learning rate is 0.1 for the first half of the steps and 0.01 after.
    """
    decay_step = steps // 2
    for i in range(steps):
        ixs = torch.randint(0, len(X), (batch_size,))
        Xt = X[ixs]
        Yt = Y[ixs]
        embed = C[Xt].view(len(Xt), -1)  # concatenate the context embeddings
        h = torch.tanh(embed @ W1 + b1)
        logits = h @ W2 + b2
        # cross_entropy is the same mean negative log-likelihood as an explicit
        # exp()/sum() softmax, but computed stably: exponentiating large logits
        # by hand overflows to inf and turns the loss and gradients into nan.
        loss = F.cross_entropy(logits, Yt)
        if i % 10000 == 0:
            print(loss.item())
        for p in params:
            p.grad = None
        loss.backward()
        lr = 0.1 if i < decay_step else 0.01
        with torch.no_grad():
            for p in params:
                p -= lr * p.grad
# Run training on the whole dataset; the loss is printed every 10000 steps.
train(X,Y)

24.610027313232422
2.338359832763672
2.7370457649230957
1.9095947742462158
2.1854395866394043
2.2639575004577637
2.5666255950927734
2.377969980239868
2.2268550395965576
2.2027294635772705
2.302860975265503
2.217515468597412
2.135256052017212
2.180211305618286
2.3095669746398926
2.3808343410491943
2.0995092391967773
2.1182806491851807
1.7603278160095215
2.0243823528289795


In [139]:
# Fresh parameters for the batch-normalised MLP.
C = torch.randn(27, 10)
W1 = torch.randn(30, 200)
# b1 is created but left out of `params`: the training loop in this cell
# computes `embed @ W1` without adding a bias.
b1 = torch.randn(200)
W2 = torch.randn(200, 27)
b2 = torch.randn(27)

# batch norm: learnable scale and shift, initialised to the identity transform
bngain = torch.ones(1, 200)
bnbais = torch.zeros(1, 200)

params = [C, W1, W2, b2, bngain, bnbais]
for p in params:
    p.requires_grad_()

# Running estimates of the hidden pre-activation statistics; these are updated
# outside of autograd and are not part of `params`.
bnmean_running = torch.zeros(1, 200)
bnstd_running = torch.ones(1, 200)


# Minibatch SGD for the batch-normalised MLP: 300000 steps of 32 examples,
# learning rate 0.1 for the first 100000 steps and 0.01 afterwards.
for i in range(300000):
    ixs = torch.randint(0,len(X),(32,))
    Xt = X[ixs] # B,3
    Yt = Y[ixs]
    embed = C[Xt].view(len(Xt),-1) # B,30

    # batch-norm layer: standardise each hidden unit over the batch, then
    # apply the learnable scale (bngain) and shift (bnbais)
    hpreact = embed @ W1
    bnmeani = hpreact.mean(dim=0,keepdim=True)
    bnstdi = hpreact.std(dim=0,keepdim=True)
    # the small epsilon guards against dividing by zero when a unit has
    # (near-)constant pre-activations across the batch
    hpreact = bngain * (hpreact-bnmeani)/(bnstdi + 1e-5) + bnbais
    with torch.no_grad():
        # exponential moving averages of the batch statistics
        bnmean_running = 0.999 * bnmean_running + 0.001*bnmeani
        bnstd_running = 0.999 * bnstd_running + 0.001*bnstdi
    h = torch.tanh(hpreact)
    logits = h @ W2 + b2
    # numerically stable mean negative log-likelihood; a hand-written
    # exp()/sum() softmax overflows to inf/nan for large logits
    loss = F.cross_entropy(logits,Yt)
    if i % 10000 == 0:
        print(loss.item())
    for p in params:
        p.grad = None
    loss.backward()
    lr = 0.1 if i < 100000 else 0.01
    with torch.no_grad():
        for p in params:
            p -= lr*p.grad


21.266706466674805
2.8845198154449463
2.579099655151367
2.392408847808838
2.1954123973846436
2.257458209991455
2.210493564605713
2.5479586124420166
2.135347843170166
2.5191166400909424
2.5528926849365234
2.3193135261535645
2.350252151489258
2.2613630294799805
1.8815114498138428
1.7844417095184326
2.576780080795288
2.0307910442352295
2.3529863357543945
2.232245445251465
2.2092714309692383
2.0538454055786133
1.5810080766677856
2.514617919921875
2.434250593185425
2.193006992340088
2.2512848377227783
2.339474678039551
2.5057222843170166
2.113816261291504


In [None]:
class Linear:
    """Linear map `x @ W`, optionally followed by adding a bias vector.

    Both the weight and the optional bias are drawn from a standard normal.
    The result of the most recent call is kept on `self.out`.
    """

    def __init__(self,input_size,output_size,bais=False):
        """Create a (input_size, output_size) weight and, if `bais` is truthy, a bias."""
        self.W = torch.randn(input_size, output_size)
        if bais:
            self.bais = torch.randn(output_size)
        else:
            self.bais = None

    def __call__(self,x):
        """Apply the layer to `x`, remember the result on `self.out` and return it."""
        result = x @ self.W
        if self.bais is not None:
            result = result + self.bais
        self.out = result
        return result

    def parameters(self):
        """Return the trainable tensors: the weight, then the bias when present."""
        if self.bais is None:
            return [self.W]
        return [self.W, self.bais]

    
class BatchNorm:
    """Batch normalisation over dim 0 with a learnable scale and shift.

    While `training` is True the layer normalises with the statistics of the
    current batch and folds them into running estimates; otherwise it
    normalises with the running estimates. The result of the most recent call
    is kept on `self.out`.

    Args:
        output_size: number of features being normalised.
        eps: constant added to the variance before the square root.
        momentum: weight given to the current batch statistics in the running
            estimates. The defaults reproduce the previously hard-coded
            0.999 / 0.001 update and 1e-5 epsilon.
    """

    def __init__(self,output_size,eps=1e-5,momentum=0.001):
        self.eps = eps
        self.momentum = momentum
        self.training = True
        # parameters (returned by parameters())
        self.gamma = torch.ones(output_size)
        self.beta = torch.zeros(output_size)
        # running statistics, updated outside of autograd
        self.bnmean_running = torch.zeros(output_size)
        self.bnvar_running = torch.ones(output_size)

    def __call__(self,x):
        """Normalise `x` along dim 0, apply gamma/beta, store and return the result."""
        if self.training:
            mean = x.mean(dim=0,keepdim=True)
            var = x.var(dim=0,keepdim=True)
        else:
            mean = self.bnmean_running
            var = self.bnvar_running
        bn = (x-mean)/torch.sqrt(var+self.eps)
        self.out = self.gamma * bn + self.beta
        if self.training:
            # the running estimates are bookkeeping only: keep them out of the graph
            with torch.no_grad():
                m = self.momentum
                self.bnmean_running = (1 - m) * self.bnmean_running + m * mean
                self.bnvar_running = (1 - m) * self.bnvar_running + m * var
        return self.out

    def parameters(self):
        """Return the trainable tensors [gamma, beta]."""
        return [self.gamma,self.beta]

class Tanh:
    """Element-wise tanh activation with no trainable parameters.

    The result of the most recent call is kept on `self.out`.
    """

    def __call__(self,x):
        """Apply tanh to `x`, remember the result on `self.out` and return it."""
        activated = torch.tanh(x)
        self.out = activated
        return activated

    def parameters(self):
        """Return an empty list: this layer has nothing to train."""
        return []

# Embedding table plus a six-stage MLP. Stages 1-5 are Linear -> BatchNorm ->
# Tanh with 200 hidden units; the last stage is Linear -> BatchNorm producing
# 27 outputs. All Linear layers are built without a bias.
C = torch.randn(27, 10)
linear1, bn1 = Linear(30, 200, False), BatchNorm(200)
linear2, bn2 = Linear(200, 200, False), BatchNorm(200)
linear3, bn3 = Linear(200, 200, False), BatchNorm(200)
linear4, bn4 = Linear(200, 200, False), BatchNorm(200)
linear5, bn5 = Linear(200, 200, False), BatchNorm(200)
linear6, bn6 = Linear(200, 27, False), BatchNorm(27)
tanh = Tanh()

layers = [
    linear1, bn1, Tanh(),
    linear2, bn2, Tanh(),
    linear3, bn3, Tanh(),
    linear4, bn4, Tanh(),
    linear5, bn5, Tanh(),
    linear6, bn6,
]

# Collect every trainable tensor (embedding table first) and enable gradients.
params = [C]
for layer in layers:
    params += layer.parameters()

for p in params:
    p.requires_grad_()


# Minibatch SGD for the layer-based network: 200000 steps of 32 examples,
# learning rate 0.1 for the first 150000 steps and 0.01 afterwards.
for i in range(200000):
    ixs = torch.randint(0,len(X),(32,))
    Xt = X[ixs] # B,3
    Yt = Y[ixs]
    embed = C[Xt].view(len(Xt),-1) # B,30
    # forward pass: thread the flattened embeddings through every layer
    logits = embed
    for layer in layers:
        logits = layer(logits)
    # numerically stable mean negative log-likelihood, replacing the
    # hand-written exp()/sum() softmax
    loss = F.cross_entropy(logits,Yt)
    if i % 10000 == 0:
        print(loss.item())
    for p in params:
        p.grad = None
    loss.backward()
    lr = 0.1 if i < 150000 else 0.01
    with torch.no_grad():
        for p in params:
            p -= lr*p.grad


3.7300968170166016
2.536337375640869
2.420426845550537
2.192359685897827
2.326446056365967
2.5079922676086426
2.130180835723877
2.50620436668396
2.3601365089416504
2.254638671875
2.535080909729004
2.0358543395996094
2.3286173343658447
2.511019229888916
1.7870302200317383
1.9730944633483887
2.0979163646698


In [245]:
# Model-size check: add up the element counts of every tensor in `params`.
print(sum(p.nelement() for p in params)) # number of parameters in total


173670


2

In [246]:
# linear2 was built without a bias, so parameters() holds only its weight.
len(linear2.parameters())

1

In [247]:


# Number of trainable tensors exposed by each layer, in network order.
for l in layers:
    print(len(l.parameters()))

1
2
0
1
2
0
1
2
0
1
2
0
1
2
0
1


In [271]:
# Switch to inference mode so BatchNorm normalises with its running statistics
# (the flag is set on every layer; only BatchNorm reads it).
for layer in layers:
    layer.training = False

n_context = 3  # context length used when the dataset was built (block_size in get_data)

# Sample 20 names. No gradients are needed here, so skip building the autograd graph.
with torch.no_grad():
    for _ in range(20):
        context = [0] * n_context  # start from the all-'.' context
        word = []
        while True:
            c = torch.tensor(context).view(1, n_context)
            embed = C[c].view(1, -1)  # concatenate the context embeddings
            logits = embed
            for layer in layers:
                logits = layer(logits)
            probs = F.softmax(logits,dim=1)
            ix = torch.multinomial(probs,num_samples=1).item()
            word.append(itos[ix])
            if ix == 0:
                # index 0 is the '.' terminator: the name is complete
                break
            context = context[1:]+[ix]
        print(''.join(word))
    

jaiva.
dayon.
carriela.
kaya.
dela.
adelon.
zaharus.
johnndere.
lejievena.
hai.
elissa.
jana.
romir.
jaishzyn.
kmin.
ange.
avorthanievia.
catriel.
evee.
pran.


In [210]:
# Let's train a deeper network
# The classes we create here are the same API as nn.Module in PyTorch

class Linear:
  """Linear layer `x @ weight (+ bias)`.

  The weight is drawn from the module-level generator `g` and divided by
  sqrt(fan_in); the bias, when requested, starts at zero. The result of the
  most recent call is kept on `self.out`.
  """

  def __init__(self, fan_in, fan_out, bias=True):
    scale = fan_in**0.5
    self.weight = torch.randn((fan_in, fan_out), generator=g) / scale
    if bias:
      self.bias = torch.zeros(fan_out)
    else:
      self.bias = None

  def __call__(self, x):
    result = x @ self.weight
    if self.bias is not None:
      result = result + self.bias
    self.out = result
    return result

  def parameters(self):
    # the weight always comes first; the bias follows only when the layer has one
    if self.bias is None:
      return [self.weight]
    return [self.weight, self.bias]


class BatchNorm1d:
  """Batch normalisation over dim 0 with a learnable scale and shift.

  In training mode the layer normalises with the current batch statistics and
  blends them into its running buffers with weight `momentum`; in eval mode
  (`training = False`) it normalises with the buffers. The result of the most
  recent call is kept on `self.out`.
  """

  def __init__(self, dim, eps=1e-5, momentum=0.1):
    self.training = True
    self.eps = eps
    self.momentum = momentum
    # learnable scale / shift, returned by parameters()
    self.gamma = torch.ones(dim)
    self.beta = torch.zeros(dim)
    # running statistics, maintained outside of backprop
    self.running_mean = torch.zeros(dim)
    self.running_var = torch.ones(dim)

  def __call__(self, x):
    is_training = self.training
    # choose which statistics to normalise with
    if is_training:
      xmean = x.mean(0, keepdim=True)
      xvar = x.var(0, keepdim=True)
    else:
      xmean = self.running_mean
      xvar = self.running_var
    # standardise, then scale and shift
    xhat = (x - xmean) / torch.sqrt(xvar + self.eps)
    self.out = self.gamma * xhat + self.beta
    if is_training:
      # fold the batch statistics into the buffers without tracking gradients
      with torch.no_grad():
        keep = 1 - self.momentum
        self.running_mean = keep * self.running_mean + self.momentum * xmean
        self.running_var = keep * self.running_var + self.momentum * xvar
    return self.out

  def parameters(self):
    return [self.gamma, self.beta]

class Tanh:
  """Element-wise tanh activation; keeps its latest output on `self.out`."""

  def __call__(self, x):
    activated = torch.tanh(x)
    self.out = activated
    return activated

  def parameters(self):
    # nothing to train in an activation layer
    return []
# Hyper-parameters of the deeper network.
vocab_size = 27 # number of distinct tokens the model can emit
block_size = 3 # number of context characters fed to the model
n_embd = 10 # the dimensionality of the character embedding vectors
n_hidden = 100 # the number of neurons in the hidden layer of the MLP
g = torch.Generator().manual_seed(2147483647) # for reproducibility

# The embedding table is sized from vocab_size (it was a literal 27) so it
# cannot drift out of sync with the output layer below.
C = torch.randn((vocab_size, n_embd),    generator=g)
layers = [
  Linear(n_embd * block_size, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
  Linear(           n_hidden, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
  Linear(           n_hidden, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
  Linear(           n_hidden, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
  Linear(           n_hidden, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
  Linear(           n_hidden, vocab_size, bias=False), BatchNorm1d(vocab_size),
]
# Alternative architecture without batch norm (kept for comparison):
# layers = [
#   Linear(n_embd * block_size, n_hidden), Tanh(),
#   Linear(           n_hidden, n_hidden), Tanh(),
#   Linear(           n_hidden, n_hidden), Tanh(),
#   Linear(           n_hidden, n_hidden), Tanh(),
#   Linear(           n_hidden, n_hidden), Tanh(),
#   Linear(           n_hidden, vocab_size),
# ]

# Initialisation tweaks, applied in place and outside of autograd.
with torch.no_grad():
  # last layer: make less confident by shrinking the final batch-norm scale
  layers[-1].gamma *= 0.1
  #layers[-1].weight *= 0.1
  # all other layers: apply gain to the Linear weights
  gain = 1.0 #5/3
  for layer in layers[:-1]:
    if isinstance(layer, Linear):
      layer.weight *= gain

# Gather every trainable tensor, embedding table first.
parameters = [C]
for layer in layers:
  parameters.extend(layer.parameters())
print(sum(p.nelement() for p in parameters)) # number of parameters in total
for p in parameters:
  p.requires_grad_()

47024


In [211]:
# same optimization as last time
# Minibatch SGD over the whole dataset. Two diagnostics are recorded per step:
#   lossi - log10 of the minibatch loss
#   ud    - per parameter tensor, log10 of std(lr * grad) / std(data)
max_steps = 200000
batch_size = 32
lossi = []
ud = []
# X and Y are used as the training set as-is (no split is made here).
Xtr = X
Ytr = Y
for i in range(max_steps):
  
  # minibatch construct: batch_size row indices drawn from the seeded generator g
  ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
  Xb, Yb = Xtr[ix], Ytr[ix] # batch X,Y
  
  # forward pass
  emb = C[Xb] # embed the characters into vectors
  x = emb.view(emb.shape[0], -1) # concatenate the vectors
  for layer in layers:
    x = layer(x)
  loss = F.cross_entropy(x, Yb) # loss function
  
  # backward pass
  for layer in layers:
    # keep the gradient of each layer's (non-leaf) output so it can be inspected
    layer.out.retain_grad() # AFTER_DEBUG: would take out retain_grad
  for p in parameters:
    p.grad = None
  loss.backward()
  
  # update
  lr = 0.1 if i < 150000 else 0.01 # step learning rate decay
  for p in parameters:
    p.data += -lr * p.grad

  # track stats
  if i % 10000 == 0: # print every once in a while
    print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
  lossi.append(loss.log10().item())
  with torch.no_grad():
    # update-to-data ratio: size of this step relative to the size of each parameter tensor
    ud.append([((lr*p.grad).std() / p.data.std()).log10().item() for p in parameters])

#   if i >= 1000:
#     break # AFTER_DEBUG: would take out obviously to run full optimization

      0/ 200000: 3.2655
  10000/ 200000: 2.1920
  20000/ 200000: 2.9674
  30000/ 200000: 2.1824
  40000/ 200000: 2.0169
  50000/ 200000: 2.0693
  60000/ 200000: 1.9622
  70000/ 200000: 2.1996
  80000/ 200000: 2.1361
  90000/ 200000: 2.0254
 100000/ 200000: 1.8179
 110000/ 200000: 1.7120
 120000/ 200000: 2.1412
 130000/ 200000: 1.9781
 140000/ 200000: 2.3923
 150000/ 200000: 2.1340
 160000/ 200000: 2.0724
 170000/ 200000: 1.9253
 180000/ 200000: 2.3488
 190000/ 200000: 2.3244


In [212]:
# put layers into eval mode: BatchNorm1d then normalizes with its running
# statistics instead of the batch statistics (the flag is set on every layer,
# but only BatchNorm1d reads it)
for layer in layers:
  layer.training = False

In [266]:
# sample from the model
# Re-seed the generator so the sampled names are reproducible.
g = torch.Generator().manual_seed(2147483647 + 10)

# Draw 20 names, one character at a time.
for _ in range(20):
    
    out = []
    context = [0] * block_size # initialize with all ...
    while True:
      # forward pass the neural net
      emb = C[torch.tensor([context])] # (1,block_size,n_embd)
      x = emb.view(emb.shape[0], -1) # concatenate the vectors
      for layer in layers:
        x = layer(x)
      logits = x
      # turn the logits into a probability distribution over the next character
      probs = F.softmax(logits, dim=1)
      # sample from the distribution
      ix = torch.multinomial(probs, num_samples=1, generator=g).item()
      # shift the context window and track the samples
      context = context[1:] + [ix]
      out.append(ix)
      # if we sample the special '.' token, break
      if ix == 0:
        break
    
    print(''.join(itos[i] for i in out)) # decode and print the generated word

carmah.
amoriq.
khi.
mili.
taty.
skanden.
jazhitn.
deliah.
jarqui.
nellara.
chaiiv.
kaleigh.
ham.
joce.
quint.
shon.
marianni.
watelo.
dearyxi.
jace.


In [226]:
# Make sure every layer is in inference mode so batch norm uses its running
# statistics.
for layer in layers:
    layer.training = False

# Sample 20 names from the global RNG. No gradients are needed here, so skip
# building the autograd graph.
with torch.no_grad():
    for _ in range(20):
        context = [0] * block_size  # start from the all-'.' context
        word = []
        while True:
            c = torch.tensor(context).view(1, block_size)
            embed = C[c].view(1, -1)  # concatenate the context embeddings
            logits = embed
            for layer in layers:
                logits = layer(logits)
            # F.softmax is the numerically stable form of exp(logits) / sum(exp(logits))
            probs = F.softmax(logits, dim=1)
            ix = torch.multinomial(probs,num_samples=1).item()
            word.append(itos[ix])
            if ix == 0:
                # index 0 is the '.' terminator: the name is complete
                break
            context = context[1:]+[ix]
        print(''.join(word))

dani.
rydore.
azton.
sophan.
damyrani.
madomi.
chrish.
meymontreviden.
kaity.
kyrick.
angeliyah.
mieranc.
corvii.
aza.
huz.
malia.
rosalan.
lona.
alesticus.
moed.
