In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline


In [2]:
words = open('names.txt').read().splitlines()
print(len(words), 'words')
print(max(len(word) for word in words), 'max word length')
print(words[:10])

32033 words
15 max word length
['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia', 'harper', 'evelyn']


In [3]:
chars = sorted(list(set(''.join(words))))
stoi = {w: i+1 for i, w in enumerate(chars)}
stoi['.'] = 0
itos = {i: w for w, i in stoi.items()}
vocab_size = len(itos)
print(itos)
print(vocab_size, 'characters')

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}
27 characters


In [4]:
import random
random.seed(42)
random.shuffle(words)

In [5]:
block_size = 3
device = 'cuda'

def build_dataset(words):
    X = []
    Y = []
    for word in words:
        ctx = [0] * block_size
        for c in word + '.':
            X.append(ctx)
            Y.append(stoi[c])
            #print(''.join(itos[x] for x in ctx), '--->',c)
            ctx = ctx[1:] + [stoi[c]]

    X = torch.tensor(X, device=device)
    Y = torch.tensor(Y, device=device)
    return X, Y


n1 = int(len(words) * .8)
n2 = int(len(words) * .9)

Xtrain, Ytrain = build_dataset(words[:n1])
Xdev, Ydev = build_dataset(words[n1:n2])
Xtest, Ytest = build_dataset(words[n2:])

In [6]:
for x, y in zip(Xtrain[:20], Ytrain[:20]):
    print(''.join(itos[x_.item()] for x_ in x), '--->', itos[y.item()])

... ---> y
..y ---> u
.yu ---> h
yuh ---> e
uhe ---> n
hen ---> g
eng ---> .
... ---> d
..d ---> i
.di ---> o
dio ---> n
ion ---> d
ond ---> r
ndr ---> e
dre ---> .
... ---> x
..x ---> a
.xa ---> v
xav ---> i
avi ---> e


# Layer Types

In [7]:
class Linear:
    def __init__(self, fan_in, fan_out, device='cpu', bias=True):
        self.weight = torch.randn((fan_in, fan_out), device=device) / fan_in**0.5
        self.b = torch.zeros(fan_out, device=device) if bias else None

    def __call__(self, x):
        self.out = x @ self.weight
        if self.b is not None:
            self.out += self.b
        return self.out
    
    def parameters(self):
        return [self.weight] + ([] if self.b is None else [self.b])
    
class BatchNorm1d:
    def __init__(self, fan_in, eps=1e-5, momentum=0.1, device='cpu'):
        self.eps = eps
        self.momentum = momentum
        self.gamma = torch.ones(fan_in, device=device)
        self.beta = torch.zeros(fan_in, device=device)
        self.mean_running = torch.zeros(fan_in, device=device)
        self.var_running = torch.ones(fan_in, device=device)
        self.training = True

    def __call__(self, x):
        if self.training:
            mean = x.mean(0, keepdim=True)
            var = x.var(0, keepdim=True)
        else:
            mean = self.mean_running
            var = self.var_running

        self.out = (x - mean) / (var + self.eps).sqrt()
        self.out = self.out * self.gamma + self.beta

        if self.training:
            with torch.no_grad():
                self.mean_running = self.momentum * self.mean_running + (1 - self.momentum) * mean
                self.var_running = self.momentum * self.var_running + (1 - self.momentum) * var

        return self.out 
        
    def parameters(self):
        return [self.gamma, self.beta]

class Tanh:
    def __call__(self, x):
        self.out = torch.tanh(x)
        return self.out
    
    def parameters(self):
        return []
    
class Embedding:
    def __init__(self, vocab_size, embedding_size, device='cpu'):
        self.weight = torch.randn((vocab_size, embedding_size), device=device)

    def __call__(self, ix):
        self.out = self.weight[ix]
        return self.out
    
    def parameters(self):
        return [self.weight]
    
class Flatten:
    def __call__(self, x):
        self.out = x.view(x.shape[0], -1)
        return self.out
    
    def parameters(self):
        return []
    
class Sequential:
    def __init__(self, layers):
        self.layers = layers

    def __call__(self, x):
        for layer in self.layers:
            x = layer(x)
        self.out = x
        return x

    def parameters(self):
        return [p for layer in self.layers for p in layer.parameters()]

# Network Setup

In [8]:
torch.manual_seed(42)

<torch._C.Generator at 0x7f6d470594d0>

In [9]:
n_embd = 10
n_hidden = 200

model = Sequential([
    Embedding(vocab_size, n_embd, device=device),
    Flatten(),
    Linear(n_embd * block_size, n_hidden, bias=False, device=device), BatchNorm1d(n_hidden, device=device), Tanh(),
    Linear(n_hidden, vocab_size, device=device),
])

with torch.no_grad():
  model.layers[-1].weight *= 0.1

print("Model Params:", sum(p.nelement() for p in model.parameters()))

for p in model.parameters():
    p.requires_grad = True

Model Params: 12097


# Training

In [10]:
batch_size = 32
max_steps = 200000
lossi = []

for i in range(max_steps):
    ix = torch.randint(0, Xtrain.shape[0], (batch_size,))
    Xb, Yb = Xtrain[ix], Ytrain[ix]

    logits = model(Xb)
    loss = F.cross_entropy(logits, Yb)

    #-- backprop
    for p in model.parameters():
        p.grad = None
    loss.backward()

    #learning_rate = lrs[i]
    lr = 0.1 if i < 100000 else 0.01
    for p in model.parameters():
        p.data += -lr * p.grad

    if i % 10000 == 0:
        print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')

    lossi.append(loss.log10().item())
    # if i >= 1:
    #     break

print(loss)

      0/ 200000: 3.2972
  10000/ 200000: 2.0884
  20000/ 200000: 2.2309
  30000/ 200000: 2.1892
  40000/ 200000: 1.8902
  50000/ 200000: 2.3622
  60000/ 200000: 2.4263
  70000/ 200000: 2.5687
  80000/ 200000: 1.8453
  90000/ 200000: 2.2090
 100000/ 200000: 2.3640
 110000/ 200000: 2.0855
 120000/ 200000: 2.2904
 130000/ 200000: 2.1505
 140000/ 200000: 2.1970
 150000/ 200000: 2.2652
 160000/ 200000: 1.8880
 170000/ 200000: 2.3349
 180000/ 200000: 2.1135
 190000/ 200000: 2.2555
tensor(1.9021, device='cuda:0', grad_fn=<NllLossBackward0>)


In [1]:
plt.plot(torch.tensor(lossi).view(-1, 1000).mean(1))

NameError: name 'plt' is not defined

In [37]:
for layer in model.layers:
    layer.training = False

In [38]:
@torch.no_grad()
def split_loss(split):
    x, y = {
        'train': (Xtrain, Ytrain),
        'dev': (Xdev, Ydev),
        'test': (Xtest, Ytest),
    }[split]

    logits = model(x)
    loss = F.cross_entropy(logits, y)
    return loss

loss_train = split_loss('train')
print('train', loss_train)

loss_dev = split_loss('dev')
print('dev', loss_dev)

train tensor(3.2701, device='mps:0')
dev tensor(3.2695, device='mps:0')


# Sample from model

In [40]:
for _ in range(20):
    
    out = []
    context = [0] * block_size # initialize with all ...
    while True:
      # forward pass the neural net

      logits = model(torch.tensor([context]))
      # prediction
      probs = F.softmax(logits, dim=1)
      # sample from the distribution
      ix = torch.multinomial(probs, num_samples=1).item()
      # shift the context window and track the samples
      context = context[1:] + [ix]
      out.append(ix)
      # if we sample the special '.' token, break
      if ix == 0:
        break
    
    print(''.join(itos[i] for i in out)) # decode and print the generated word

spuvsxxlzzxgrrgedumn.
vyaez.
fasadwfha.
qclnylteiyqevhycucxuxayxigsifpeqyqxczebmzpecjmbdigjfnrkvugnivdcwkeanjrodzswadpbnnmjeijdcgigduozyylouzxcoqznkomageatgrqqznryogbrmbnwvqblemmhektgjqtz.
ahxh.
ryhvwvsbhmgovboebsmfmjcvlelkrhkqcubjbdtabo.
.
lvelglfszvolddoavqrquzihwhmytrryimazkdgvyfbfulvgvtvfiygfsextmzxonlcyakjzeqfzskmnqgpvqvqoxskjpaqhufpfqorboqck.
dxugttkpvmvwwavggnaxcbscjjbnblhvlsguzmgycvgausvishjwx.
rnqhecded.
kumaulbfcl.
thqblru.
ackxwvnlnzowhrerbnisawshigookqoynzqoruoywppcvizdeqlniler.
dhjikxrumskk.
htpgcarhmjxezbagpgvuwwcejreazxlfpoylhpkkbjvwhchwigrkbuozkjwuffuttwnxaewtvppwyjnscnfyot.
rkexjkijawurajajkoqsyaxqnrqhuhxfdb.
dtkyebvjgocdcxbosvxwbssvjygq.
qblxzsiyrhnfypahohjgabjjjaufluehmgnmrx.
rxjjalaznlzmzgxkwmlcpbxlsuadjebvoddpitukhqxqdogukzxivrwagaaqnhuemhlvsshpol.
milmbikiefdlfykjmmvahjluqaictkf.


In [45]:
"e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855" == "79f7de7a1a48f82b08b2d7f16503daf9de7de7423f4e70372a30919b9cbe35b6"

False