In [1]:
# Imports + setup
import torch
import torch.nn.functional as F

# Read in data: one name per line. The context manager closes the file handle
# as soon as the contents have been read instead of leaving it open.
with open('names.txt', 'r') as f:
    words = f.read().splitlines()

# Build vocabulary: every distinct character that appears in the names, sorted
chars = sorted(set(''.join(words)))
# Character -> index. Letters start at 1 because index 0 is given to '.'
stoi = {s: i + 1 for i, s in enumerate(chars)}
stoi['.'] = 0
# Index -> character (inverse of stoi)
itos = {i: s for s, i in stoi.items()}

In [2]:
# Building the dataset (inputs and outputs)

context_size = 3 # Number of characters to use to predict the next character
X, Y = [], []

for w in words:
    # Index sequence for the word: context_size zeros of padding, the word's
    # characters, then the closing '.'
    padded = [0] * context_size + [stoi[ch] for ch in w + '.']
    # Slide a window over the sequence: each window of context_size indices is
    # an input, and the index immediately after the window is its target
    for pos in range(len(padded) - context_size):
        X.append(padded[pos:pos + context_size])
        Y.append(padded[pos + context_size])

X = torch.tensor(X) # Inputs/context
Y = torch.tensor(Y) # Outputs

In [27]:
torch.manual_seed(42) # Seed PyTorch's global RNG so the random initialisation (and the loss computed from it) is repeatable

C = torch.randn((27, 2)) # Lookup table to embed input (27 characters, each stored in 2 dimensions)
emb = C[X] # For each context in X, embed all of its characters: one row of C per character index

W1 = torch.randn((6, 100)) # Hidden-layer weights: 6 inputs (3 context characters x 2 dimensions) -> 100 neurons
b1 = torch.randn(100) # Hidden-layer biases, one per neuron

In [28]:
# Calculations for the first (hidden) layer. Each context's 3 character embeddings (2 dimensions
# each) are flattened into one row of 6 values, giving an n x 6 matrix where n is the number of
# inputs. Multiplying by W1 and adding b1 gives an n x 100 matrix: one activation per input for
# each of the 100 neurons in the layer.
emb_flat = emb.view(emb.shape[0], 6)
h = (emb_flat @ W1 + b1).tanh()

In [29]:
# Output layer: each of the 27 neurons (one per character) has 100 weights, one per hidden activation
W2 = torch.randn(100, 27)
# One bias per output neuron
b2 = torch.randn(27)

In [30]:
# Final outputs (n x 27): one unnormalized score (logit) per character for each input
logits = torch.matmul(h, W2) + b2

In [31]:
# Loss calculation (mean negative log-likelihood of the correct next character).
# cross_entropy turns the logits into a normalized probability distribution for each input
# and averages the negative log probability given to each target in Y.
loss = F.cross_entropy(input=logits, target=Y)
loss

tensor(20.6314)

In [4]:
# Better Setup
torch.manual_seed(42) # Seed PyTorch's global RNG so the initialisation (and later random batch selection) is repeatable

C = torch.randn((27, 2)) # Embedding lookup table (27 characters, each stored in 2 dimensions)
W1 = torch.randn((6, 100)) # Hidden-layer weights: 6 inputs (3 context characters x 2 dimensions) -> 100 neurons
b1 = torch.randn(100) # Hidden-layer biases
W2 = torch.randn((100, 27)) # Output-layer weights: 100 hidden activations -> 27 neurons (one per character)
b2 = torch.randn(27) # Output-layer biases
parameters = [C, W1, b1, W2, b2]
for p in parameters:
    p.requires_grad = True # Track gradients so loss.backward() populates p.grad

In [5]:
# Finding optimal learning rate
# Candidate rates are spaced evenly on a log scale, from 10^-3 up to 10^0
lre = torch.linspace(start=-3, end=0, steps=1000) # 1000 exponents between -3 and 0
lrs = 10.0 ** lre # Learning rate for each exponent

In [8]:
# Training Network

lri = []
lossi = []
# Graphing lri vs lossi helps find an optimal learning rate. To run that sweep, set
# lr = lrs[i] inside the loop (for at most len(lrs) steps) and append lr and
# loss.item() to lri / lossi on every step.

lr = 0.1 # Step size for plain gradient descent

for i in range(10000):

    # Randomly select batch to optimize
    ix = torch.randint(0, X.shape[0], (32,)) # Select 32 random rows

    # Forward Pass
    emb = C[X[ix]]
    # Flatten each context's embeddings to W1's input width rather than a hard-coded number
    h = torch.tanh(emb.view(emb.shape[0], W1.shape[0]) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y[ix])

    # Backward Pass
    for p in parameters:
        p.grad = None # Zero out all gradients
    loss.backward() # Calculate new gradients

    # Parameter update. Done under no_grad so autograd does not record the
    # in-place step (the supported replacement for mutating p.data).
    with torch.no_grad():
        for p in parameters:
            p -= lr * p.grad

# Loss over the whole dataset. Nothing is backpropagated from here, so skip
# building the autograd graph for this large forward pass.
with torch.no_grad():
    emb = C[X]
    h = torch.tanh(emb.view(emb.shape[0], W1.shape[0]) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Y)
print(loss.item())

2.4527599811553955


In [None]:
#Training split 80%
#What we did above, training for better parameters

#dev/validation split 10%
#Optimize hyperparameters(size of layers, inputs, etc)

#Test split 10%
#Evaluate final model performance after optimizations

#What is the point: Avoid overfitting, where the model memorizes the training data instead of learning patterns that generalize

In [9]:
#Splitting the dataset into 3 splits (train / dev / test)
def build_dataset(words, context_size=3):
    """Build next-character prediction examples from a list of words.

    Every word is terminated with '.' and scanned left to right. Each
    character becomes a target, and its input is the `context_size` character
    indices that precede it; positions before the start of the word are
    filled with index 0.

    Args:
        words: iterable of strings. Every character (and '.') must be a key
            of the module-level `stoi` mapping.
        context_size: number of preceding characters used to predict the
            next character. Must be at least 1. Defaults to 3.

    Returns:
        A tuple (X, Y) of integer tensors. X has shape
        (num_examples, context_size) and holds the contexts; Y has shape
        (num_examples,) and holds the index of the character following each
        context. With no examples, X has shape (0, context_size).

    Raises:
        ValueError: if context_size is smaller than 1.
        KeyError: if a character is missing from `stoi`.
    """
    if context_size < 1:
        raise ValueError(f"context_size must be at least 1, got {context_size}")

    X, Y = [], []

    for w in words:
        context = [0] * context_size
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            # Slide the window: drop the oldest index, append the newest.
            # This builds a new list, so rows already stored in X are unaffected.
            context = context[1:] + [ix]

    # Explicit dtype and shape keep the result an integer (n, context_size)
    # tensor even when there are no examples at all.
    X = torch.tensor(X, dtype=torch.long).reshape(len(Y), context_size) # Inputs/context
    Y = torch.tensor(Y, dtype=torch.long) # Outputs

    return X, Y

import random

# Shuffle with a fixed seed so the three splits are random but repeatable.
# NOTE: this shuffles `words` in place, so re-running this cell without first
# reloading `words` shuffles the already-shuffled list and gives a different split.
random.seed(42)
random.shuffle(words)

# Split boundaries: first 80% for training, next 10% for dev, last 10% for test
num_words = len(words)
n1, n2 = int(0.8 * num_words), int(0.9 * num_words)

train_words, dev_words, test_words = words[:n1], words[n1:n2], words[n2:]
Xtr, Ytr = build_dataset(train_words)
Xdev, Ydev = build_dataset(dev_words)
Xtest, Ytest = build_dataset(test_words)

In [10]:
# Resetup
torch.manual_seed(42) # Seed PyTorch's global RNG so the initialisation (and later random batch selection) is repeatable

C = torch.randn((27, 2)) # Embedding lookup table (27 characters, each stored in 2 dimensions)
W1 = torch.randn((6, 100)) # Hidden-layer weights: 6 inputs (3 context characters x 2 dimensions) -> 100 neurons
b1 = torch.randn(100) # Hidden-layer biases
W2 = torch.randn((100, 27)) # Output-layer weights: 100 hidden activations -> 27 neurons (one per character)
b2 = torch.randn(27) # Output-layer biases
parameters = [C, W1, b1, W2, b2]
for p in parameters:
    p.requires_grad = True # Track gradients so loss.backward() populates p.grad

In [14]:
# Training Network Using only training set

lri = []
lossi = []
# Graphing lri vs lossi helps find an optimal learning rate. To run that sweep, set
# lr = lrs[i] inside the loop (for at most len(lrs) steps) and append lr and
# loss.item() to lri / lossi on every step.

lr = 0.1 # Step size for plain gradient descent

for i in range(10000):

    # Randomly select batch to optimize
    ix = torch.randint(0, Xtr.shape[0], (32,)) # Select 32 random rows

    # Forward Pass
    emb = C[Xtr[ix]]
    # Flatten each context's embeddings to W1's input width rather than a hard-coded number
    h = torch.tanh(emb.view(emb.shape[0], W1.shape[0]) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Ytr[ix])

    # Backward Pass
    for p in parameters:
        p.grad = None # Zero out all gradients
    loss.backward() # Calculate new gradients

    # Parameter update. Done under no_grad so autograd does not record the
    # in-place step (the supported replacement for mutating p.data).
    with torch.no_grad():
        for p in parameters:
            p -= lr * p.grad

# Loss on the dev split, which was not used for training. Nothing is
# backpropagated from here, so skip building the autograd graph.
with torch.no_grad():
    emb = C[Xdev]
    h = torch.tanh(emb.view(emb.shape[0], W1.shape[0]) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Ydev)
print(loss.item())

2.3674027919769287


In [26]:
#Dev setup to optimize hyperparameters
torch.manual_seed(42) # Seed PyTorch's global RNG so the initialisation (and later random batch selection) is repeatable

C = torch.randn((27, 10)) # Embedding lookup table (27 characters, each stored in 10 dimensions)
W1 = torch.randn((30, 200)) # Hidden-layer weights: 30 inputs (3 context characters x 10 dimensions) -> 200 neurons
b1 = torch.randn(200) # Hidden-layer biases
W2 = torch.randn((200, 27)) # Output-layer weights: 200 hidden activations -> 27 neurons (one per character)
b2 = torch.randn(27) # Output-layer biases
parameters = [C, W1, b1, W2, b2]
for p in parameters:
    p.requires_grad = True # Track gradients so loss.backward() populates p.grad

In [None]:
#Optimizing Hyperparameters:
lri = []
lossi = []

lr = 0.05 # Step size for plain gradient descent

for i in range(50000):

    # Randomly select batch to optimize
    ix = torch.randint(0, Xtr.shape[0], (32,)) # Select 32 random rows

    # Forward Pass
    emb = C[Xtr[ix]]
    # Flatten each context's embeddings to W1's input width, so changing the
    # embedding size in the setup cell needs no matching edit here
    h = torch.tanh(emb.view(emb.shape[0], W1.shape[0]) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Ytr[ix])

    # Backward Pass
    for p in parameters:
        p.grad = None # Zero out all gradients
    loss.backward() # Calculate new gradients

    # Parameter update. Done under no_grad so autograd does not record the
    # in-place step (the supported replacement for mutating p.data).
    with torch.no_grad():
        for p in parameters:
            p -= lr * p.grad

# Loss on the dev split, used to compare hyperparameter choices. Nothing is
# backpropagated from here, so skip building the autograd graph.
with torch.no_grad():
    emb = C[Xdev]
    h = torch.tanh(emb.view(emb.shape[0], W1.shape[0]) @ W1 + b1)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, Ydev)
print(loss.item())