# TODO: Improvements based on lec2
1. Initialization matters: we want the logits to be close to 0 at the very beginning to avoid large initial losses.
2. Saturation of the tanh function: it breaks the gradient flow. Plot the intermediate tanh values to see how it saturates.
   1. Gradient always decreases as we go back through tanh.
   2. Dead neurons: tanh (or sigmoid or relu) saturates and causes the gradient to be 0 for all instances.
   3. Initialization: the motivation is to keep the variance of each layer's output distribution the same as that of its input. If there is no activation, multiply the weights by sqrt(1 / num_of_inputs).
   4. Kaiming initialization: for relu, multiply the weights by sqrt(2 / num_of_inputs). The factor 2 is called the "gain"; it compensates for the fact that relu is not symmetric around 0 (it zeroes out the negative half of its inputs). The gain has different values for different activation functions (see the PyTorch docs about weight initialization).
   5. Modern techniques like Adam, BN, residual connections, make initialization less important, but still useful.

In [1]:
import torch
import torch.nn.functional as F
from matplotlib import pyplot as plt
import random

# Data preparation

In [None]:
# Load the name list (one name per line). The context manager guarantees the
# file handle is closed, and the explicit encoding makes decoding independent
# of the platform default.
with open("../names.txt", "r", encoding="utf-8") as names_file:
    words = names_file.read().splitlines()

# Character vocabulary. Joining the words with '.' guarantees that the '.'
# boundary token is part of the character set.
chars = sorted(set('.'.join(words)))
# Character -> index and index -> character lookup tables.
stoi = {char: i for i, char in enumerate(chars)}
itos = dict(enumerate(chars))

In [None]:
# Cumulative split points: words before ratio1 go to train, ratio1..ratio2 to dev,
# the remainder to test.
ratio1 = 0.8
ratio2 = 0.9
# Context length: number of previous characters used to predict the next one.
chunk = 3


def create_data(ws, chunk=3):
    """Turn words into (context, next-character) training pairs.

    Every word is extended with the '.' ending token. The context starts as
    `chunk` zeros and slides one character to the right after each emitted pair.

    Args:
        ws: iterable of words; every character must be a key of `stoi`.
        chunk: context length.

    Returns:
        Tuple (X, Y) of tensors built from the context lists and target indices.
    """
    contexts, targets = [], []

    for w in ws:
        window = [0] * chunk  # Padding context made of index 0
        # Don't forget the ending token.
        for token in (stoi[c] for c in w + '.'):
            contexts.append(window)
            targets.append(token)
            # Build a new list (no in-place mutation) so stored contexts stay intact.
            window = window[1:] + [token]

    return torch.tensor(contexts), torch.tensor(targets)

# Shuffle in place before splitting so the train/dev/test splits are random.
# NOTE(review): no random seed is set here, so the split differs between runs.
random.shuffle(words) # No obvious effect (original words already shuffled)
# Split indices derived from the cumulative ratios.
n1 = int(ratio1 * len(words))
n2 = int(ratio2 * len(words))
# Train / dev / test sets of (context, target) pairs.
Xt, Yt = create_data(words[:n1], chunk=chunk)
Xdev, Ydev = create_data(words[n1:n2], chunk=chunk)
Xte, Yte = create_data(words[n2:], chunk=chunk)
print(Xt.shape)
print(len(Xdev))

# Construct the MLP

In [None]:
# Scratch check: multiplying a one-hot encoding by a (27, 3) matrix selects rows
# of that matrix, i.e. it acts as an embedding lookup.
X_ = F.one_hot(Xt, num_classes=27).float()
print(X_.shape)
W = torch.randn(27, 3)
# The last dimension goes from 27 (one-hot) to 3 (embedding size).
print((X_ @ W).shape)

In [None]:
embedding_size = 3


def get_embedding(embedding_size, vocab_size=27):
    """Create a trainable, randomly initialised embedding table.

    Args:
        embedding_size: number of features per token.
        vocab_size: number of distinct tokens (rows of the table). Defaults to
            27, the value previously hard-coded here.

    Returns:
        A (vocab_size, embedding_size) tensor drawn from a standard normal
        distribution, with `requires_grad=True`.
    """
    return torch.randn(vocab_size, embedding_size, requires_grad=True)

In [None]:
def get_MLP(chunk, embedding_size, num_neurons, vocab_size=27):
    """Create the weights and biases of a fully connected network.

    Layer sizes are `chunk * embedding_size -> num_neurons[0] -> ... ->
    num_neurons[-1] -> vocab_size`. All parameters are drawn from a standard
    normal distribution.

    Every weight AND every bias is created with `requires_grad=True`.
    Previously the input-layer and output-layer biases were plain tensors, so
    they kept their random initial values throughout training.

    Args:
        chunk: context length (number of embedded tokens fed to the network).
        embedding_size: number of features per embedded token.
        num_neurons: sizes of the hidden layers; an empty sequence yields a
            single linear layer from the input straight to the output.
        vocab_size: number of output logits. Defaults to 27, the value
            previously hard-coded here.

    Returns:
        Tuple (Ws, bs): lists of weight matrices and bias vectors, one entry
        per layer, ordered from input to output.
    """
    # Consecutive pairs of this list are the (fan_in, fan_out) of each layer.
    layer_sizes = [chunk * embedding_size, *num_neurons, vocab_size]

    Ws = []
    bs = []
    for fan_in, fan_out in zip(layer_sizes, layer_sizes[1:]):
        Ws.append(torch.randn(fan_in, fan_out, requires_grad=True))
        bs.append(torch.randn(fan_out, requires_grad=True))

    return Ws, bs

# Smoke test: build an MLP with one hidden layer of 100 units.
# Note that W and b are the *lists* of weights / biases returned by get_MLP.
W, b = get_MLP(chunk, embedding_size, [100])
print(W[0].shape)

# Train & val scripts

In [None]:
def go_through(xx, embed, Ws, bs):
    """Run the forward pass: embedding lookup, tanh, then the MLP.

    Args:
        xx: integer tensor of token indices, shape (batch, context_length).
        embed: embedding table, shape (vocab_size, embedding_size).
        Ws: list of weight matrices, ordered from input to output layer.
        bs: list of bias vectors matching `Ws`.

    Returns:
        Tensor of logits with shape (batch, Ws[-1].shape[1]).
    """
    # Through embedding.
    # Indexing the table selects the same rows as `one_hot(xx) @ embed`, but
    # works for any vocabulary size (no hard-coded 27) and does not
    # materialise a dense one-hot tensor.
    hidden = embed[xx].view(xx.shape[0], -1)
    hidden = torch.tanh(hidden) # Easy to forget

    # Through MLP
    last_layer = len(Ws) - 1
    for i, (W, b) in enumerate(zip(Ws, bs)):
        hidden = hidden @ W + b
        if i < last_layer: # No activation after the last layer
            hidden = torch.tanh(hidden)
    return hidden

In [None]:
def train_MLP(X, Y, embed, Ws, bs, batch_size, epochs, lr):
    """Train the embedding and the MLP with plain mini-batch SGD.

    Args:
        X: tensor of input contexts (token indices), one row per example.
        Y: tensor of target token indices, aligned with `X`.
        embed: embedding table (updated in place).
        Ws: list of weight matrices (updated in place).
        bs: list of bias vectors (updated in place when they have a gradient).
        batch_size: number of examples sampled (with replacement) per step.
        epochs: number of SGD steps; each step uses ONE random mini-batch,
            not a full pass over the data.
        lr: learning rate.

    Returns:
        List with the mini-batch loss (Python float) of every step.
    """
    params = [embed, *Ws, *bs]
    loss_list = []

    for _ in range(epochs):
        # Zero out the grad (easy to forget)
        for p in params:
            p.grad = None

        # Select batch
        indices = torch.randint(low=0, high=X.shape[0], size=(batch_size,))
        logits = go_through(X[indices], embed, Ws, bs)

        # Compute loss
        loss = F.cross_entropy(logits, Y[indices])
        loss.backward()
        loss_list.append(loss.item())

        # Update params. In-place updates of leaf tensors must not be tracked
        # by autograd; torch.no_grad() is the supported way to do that
        # (instead of going through `.data`).
        with torch.no_grad():
            for p in params:
                # Parameters that do not require grad have no gradient: skip.
                if p.grad is not None:
                    p -= lr * p.grad

    return loss_list

# embed = get_embedding(embedding_size)
# embed.requires_grad = True
# Ws, bs = get_MLP(chunk, embedding_size, [100])
# loss_list = train_MLP(Xt, Yt, embed, Ws, bs, 64, 10000, 0.1)
# plt.plot(loss_list)

In [None]:
def dev_MLP(X, Y, embed, Ws, bs):
    """Print and return the cross-entropy loss of the model on (X, Y).

    Args:
        X: tensor of input contexts (token indices).
        Y: tensor of target token indices, aligned with `X`.
        embed: embedding table.
        Ws: list of weight matrices.
        bs: list of bias vectors.

    Returns:
        The loss as a Python float.
    """
    # Evaluation only: no need to record an autograd graph for the whole set.
    with torch.no_grad():
        loss = F.cross_entropy(go_through(X, embed, Ws, bs), Y)
    print(f"loss: {loss}")
    return loss.item()

# Train & Eval the MLP

In [None]:
# Model hyper-parameters.
chunk = 3  # context length
embedding_size = 32  # features per embedded character
num_neurons = [128, 256, 128]  # hidden layer sizes

# Fresh, randomly initialised parameters.
embed = get_embedding(embedding_size)
Ws, bs = get_MLP(chunk, embedding_size, num_neurons)

# Show the weight shape of every layer.
for W, b in zip(Ws, bs):
    print(W.shape)

# Training hyper-parameters.
batch_size = 32
# NOTE(review): the training call in the next cell passes 50000 and 0.01 as
# literals, so `epochs` and `lr` below are not used by it.
epochs = 100000
lr = 0.1

In [None]:
# Train and plot the per-step mini-batch loss.
# NOTE(review): the step count (50000) and learning rate (0.01) are literals
# here rather than the `epochs` / `lr` variables defined in the setup cell.
loss_list = train_MLP(Xt, Yt, embed, Ws, bs, batch_size, 50000, 0.01)
plt.plot(loss_list)

In [None]:
# Report the cross-entropy loss on the dev split.
dev_MLP(Xdev, Ydev, embed, Ws, bs)

# Infer

In [64]:
def infer_names(num, embed, Ws, bs): # Can't infer in parallel
    """Sample `num` names from the model and print them, one per line.

    Generation is autoregressive: start from a context of `chunk` zeros,
    sample the next token from the softmax of the logits, slide the context,
    and stop once token 0 is sampled. Relies on the module-level `chunk`
    (context length) and `itos` (index -> character) defined in earlier cells.

    Args:
        num: number of names to generate.
        embed: embedding table.
        Ws: list of weight matrices.
        bs: list of bias vectors.
    """
    for _ in range(num):
        context = [0] * chunk
        sampled = []
        # Sampling only: no autograd graph needed.
        with torch.no_grad():
            while True:
                logits = go_through(torch.tensor(context).unsqueeze(0), embed, Ws, bs)
                # softmax is numerically stable (unlike exp / sum of exp) and
                # normalises per row.
                probs = F.softmax(logits, dim=1)
                # .item() keeps the context a list of plain ints rather than
                # a mix of ints and tensors.
                next_token = torch.multinomial(probs[0], num_samples=1).item()
                sampled.append(next_token)
                context = context[1:] + [next_token]
                if next_token == 0:
                    break
        print(''.join(itos[token] for token in sampled))
    
# Sample and print two names using the current parameters.
infer_names(2, embed, Ws, bs)

jip.
luncekealie.
