# Predict a name from its first few letters

In [None]:
import torch
import torch.nn.functional as F
import random
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Report whether CUDA is usable, then choose the device every tensor in this
# notebook is created on: the first GPU when available, otherwise the CPU.
print(f"{torch.cuda.is_available()=}")
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(device)

In [None]:
import string
# Vocabulary tables: index 0 is the '.' token, 'a'..'z' take indices 1..26.
# itos maps index -> character, stoi is the inverse mapping.
itos = dict(enumerate('.' + string.ascii_lowercase))
stoi = {ch: idx for idx, ch in itos.items()}
print(stoi)
voc_size = len(itos)
print(f"{voc_size=}")

In [None]:
def encode(ss):
    """Return the list of vocabulary indices for the characters of ``ss``.

    Each character is looked up in ``stoi``; a character that is not a key
    of ``stoi`` raises ``KeyError``.
    """
    return list(map(stoi.__getitem__, ss))

def decode(ii, tilldot=False):
    """Turn a sequence of vocabulary indices back into a string via ``itos``.

    When ``tilldot`` is true, decoding stops at the first index 0 that comes
    after at least one non-zero index; leading zeros are still emitted.
    When ``tilldot`` is false, every index is decoded.
    """
    chars = []
    seen_letter = False
    for idx in ii:
        if idx == 0:
            # A zero only terminates the output once a real letter was seen.
            if tilldot and seen_letter:
                break
        else:
            seen_letter = True
        chars.append(itos[idx])
    return ''.join(chars)

In [None]:
# Load the word list (one entry per line) and shuffle it in place so the
# later train/validation/test split is random.
names_f = "names.txt"
# Explicit encoding: without it open() falls back to the platform locale,
# which makes the decoded text machine-dependent.
with open(names_f, encoding="utf-8") as f:
    words = f.read().splitlines()

# Uncomment to make the shuffle (and therefore the split) reproducible.
#random.seed(42)
random.shuffle(words)
print(words[:3])
print(len(words))

In [None]:
def add_word(w, bsz, X, Y):
    """Append the (context, next index) training pairs of one word to X and Y.

    The context starts as ``bsz`` zeros. For every character of ``w`` the
    current context is appended to ``X`` and the character's ``stoi`` index
    to ``Y``; the context then drops its oldest entry and gains that index.
    After the last character one more pair is appended whose target is 0.

    Args:
        w: the word; every character must be a key of ``stoi``.
        bsz: number of indices in each context window.
        X: list that receives the context lists (mutated in place).
        Y: list that receives the target indices (mutated in place).

    Raises:
        KeyError: if a character of ``w`` is missing from ``stoi``.
    """
    xi = [0] * bsz
    for y in w:
        yi = stoi[y]
        X.append(xi)
        Y.append(yi)
        # Build a NEW list for the shifted window so the list that was just
        # stored in X is not modified.
        xi = xi[1:] + [yi]
    X.append(xi)
    Y.append(0)

In [None]:
# Hyper-parameters.
att = 3        # context length: number of preceding indices fed to the model
emb = 10       # width of one embedding row
hidden = 200   # width of the hidden Linear layer

# Collect the (context, target) pairs of every word, then move them to
# tensors on the selected device.
Xa, Ya = [], []
for w in words:
    add_word(w, att, Xa, Ya)
X, Y = (torch.tensor(rows, device=device) for rows in (Xa, Ya))
print(f"{X.shape=}")
print(f"{Y.shape=}")

In [None]:
# Cut the example tensors at 80% and 90% of their length:
# [0, n1) is training, [n1, n2) is validation, [n2, end) is test.
n_total = len(X)
n1 = int(n_total * 0.8)
n2 = int(n_total * 0.9)
print("Split Global Dataset on lines:", n1, n2)
X_tr, X_val, X_tst = X[:n1], X[n1:n2], X[n2:]
Y_tr, Y_val, Y_tst = Y[:n1], Y[n1:n2], Y[n2:]
print(f"{X_tr.shape=}")
print(f"{X_val.shape=}")
print(f"{X_tst.shape=}")

In [None]:
from typing import Any


class Linear:
    """Affine layer computing ``x @ weight (+ bias)``.

    ``weight`` has shape ``(in_f, out_f)`` and is drawn from a standard
    normal scaled by ``in_f ** -0.5``; ``bias`` is a zero vector of length
    ``out_f``, or ``None`` when ``bias=False``. The most recent output is
    kept on ``self.out``.
    """

    def __init__(self, in_f, out_f, bias=True, device=None, dtype=None) -> None:
        factory = dict(dtype=dtype, device=device)
        self.weight = torch.randn(size=(in_f, out_f), **factory) * (in_f ** -0.5)
        self.bias = torch.zeros(out_f, **factory) if bias else None

    def __call__(self, x):
        result = x @ self.weight
        if self.bias is not None:
            # In-place add on the freshly created product tensor.
            result += self.bias
        self.out = result
        return result

    def parameters(self):
        """Return the layer's tensors: ``[weight]`` or ``[weight, bias]``."""
        return [self.weight] if self.bias is None else [self.weight, self.bias]
    
class Tanh:
    """Element-wise tanh activation with no trainable tensors.

    The most recent output is kept on ``self.out``.
    """

    def __call__(self, x) -> Any:
        activated = torch.tanh(x)
        self.out = activated
        return activated

    def parameters(self):
        """Return an empty list: this layer has nothing to train."""
        return list()
    
class BatchNorm1d:
    """Batch normalisation over dimension 0 with running statistics.

    While ``training`` is true the input is normalised with the mean and
    variance of the current batch (``x.mean(0)`` / ``x.var(0)``), and the
    running mean/variance are moved towards those batch values by
    ``momentum``. Otherwise the stored running statistics are used and left
    unchanged. ``gamma`` and ``beta`` scale and shift the normalised values.
    The most recent output is kept on ``self.out``.
    """

    def __init__(self, num_f, eps=1e-05, momentum=0.1, device=None, dtype=None) -> None:
        self.eps = eps
        self.momentum = momentum
        self.training = True
        factory = dict(dtype=dtype, device=device)
        # Trainable scale and shift.
        self.gamma = torch.ones(num_f, **factory)
        self.beta = torch.zeros(num_f, **factory)
        # Statistics used when not training; updated outside autograd.
        self.running_mean = torch.zeros(num_f, **factory)
        self.running_var = torch.ones(num_f, **factory)

    def __call__(self, x):
        if not self.training:
            normed = (x - self.running_mean) / torch.sqrt(self.running_var + self.eps)
            self.out = self.gamma * normed + self.beta
            return self.out

        batch_mean = x.mean(0, keepdim=True)
        batch_var = x.var(0, keepdim=True)
        normed = (x - batch_mean) / torch.sqrt(batch_var + self.eps)
        self.out = self.gamma * normed + self.beta
        with torch.no_grad():
            keep = 1 - self.momentum
            self.running_mean = keep * self.running_mean + self.momentum * batch_mean
            self.running_var = keep * self.running_var + self.momentum * batch_var
        return self.out

    def parameters(self):
        """Return the trainable tensors ``[gamma, beta]``."""
        return [self.gamma, self.beta]


In [None]:
# Uncomment for reproducible initial weights.
#torch.manual_seed(42)

# Embedding table: one row of width `emb` per vocabulary entry.
E_w = torch.randn(size=(voc_size, emb), device=device)

# Network applied to the flattened embeddings of the `att` context indices.
layers = [
    Linear(emb * att, hidden, device=device),
    BatchNorm1d(hidden, device=device),
    Tanh(),
    Linear(hidden, voc_size, device=device),
]

with torch.no_grad():
    layers[-1].weight *= 0.1 # reduce last layer confidence

# Flat list of every trainable tensor, embedding table first.
params = [E_w]
for layer in layers:
    params.extend(layer.parameters())
nparams = sum(t.numel() for t in params)
print(f"{nparams=}")
for t in params:
    t.requires_grad_()

In [None]:
def forward(x):
    """Run the model on index tensor ``x`` and return the last layer's output.

    The rows of ``E_w`` selected by ``x`` are flattened from dimension 1
    onward and then passed through every entry of ``layers`` in order.
    """
    activations = E_w[x].flatten(1)
    for layer in layers:
        activations = layer(activations)
    return activations

def calc_loss(L, Y):
    """Return the cross-entropy between logits ``L`` and target indices ``Y``."""
    return F.cross_entropy(input=L, target=Y)

def backward(loss: torch.Tensor):
    """Zero the existing gradients of ``params`` and backpropagate ``loss``.

    Tensors whose ``grad`` is still ``None`` are skipped when zeroing.
    """
    for grad in (p.grad for p in params):
        if grad is not None:
            grad.zero_()
    loss.backward()

def update_params(step):
    """Apply one gradient-descent update of size ``step`` to every tensor in ``params``.

    The update is written through ``.data`` so it is not recorded by
    autograd. Every tensor must already have a gradient.
    """
    for p in params:
        p.data.sub_(step * p.grad)


In [None]:
def get_batch(X0, Y0, n):
    """Draw ``n`` random rows (with replacement) from a dataset.

    Row indices are sampled uniformly from ``[0, X0.shape[0])`` on the same
    device as ``X0``, so the function works for any dataset passed in
    rather than only for one whose length equals a global constant.

    Args:
        X0: input tensor, indexed along dimension 0.
        Y0: target tensor with at least as many rows as ``X0``.
        n: number of rows to draw.

    Returns:
        Tuple ``(X0[rids], Y0[rids])`` using the same row indices for both.

    Raises:
        RuntimeError: if ``X0`` has no rows (empty sampling range).
    """
    rids = torch.randint(0, X0.shape[0], (n,), device=X0.device)
    return X0[rids], Y0[rids]

In [None]:
# Log of averaged training losses; the training loop appends to it and the
# plotting cell reads it.
LL = []

# Mini-batch size, and a first random mini-batch of that size.
batch = 32
X0, Y0 = get_batch(X_tr, Y_tr, n=batch)

In [None]:
# Training Loop
# Loss of the freshly initialised model on the current mini-batch.
L = forward(X0)
loss = calc_loss(L, Y0)
print("No Training. loss:", loss.item())

# initial training: a single update with step 0.1, then the loss on a newly
# drawn mini-batch.
for _ in range(1):
    backward(loss)
    update_params(0.1)
    X0, Y0 = get_batch(X_tr, Y_tr, batch)
    L = forward(X0)
    loss = calc_loss(L, Y0)
print("After Initial training. loss: ", loss.item())


In [None]:
# print("dead neurons:", torch.where(torch.abs(H0_w.grad)<0.0001, 1,0).sum())
# with torch.no_grad():
#     X0, Y0 = X_val, Y_val
#     Ey = E_w[X0].flatten(1)
#     H0_y = torch.tanh(Ey @ H0_w + H0_b)
#     L = H0_y @ H1_w + H1_b
#     gg = H0_y.flatten().detach().cpu().numpy()
#     print(gg.shape)
#     plt.hist(gg, bins=50)

In [None]:
# Put every layer in training mode (BatchNorm1d then uses batch statistics).
for layer in layers:
    layer.training = True

N = 20000  # updates performed for each step size
steps = [0.1, 0.03, 0.01, 0.003, 0.0001]
# Other schedules that were tried:
#   [0.01, 0.003], [0.1], [0.01], [0.03], [0.0001]

# Loss of the first mini-batch; each iteration below backpropagates the loss
# computed at the end of the previous iteration.
X0, Y0 = get_batch(X_tr, Y_tr, batch)
L = forward(X0)
loss = calc_loss(L, Y0)

WIN = []  # losses collected since the last entry was added to LL
for step in steps:
    for i in range(N):
        backward(loss)
        update_params(step)
        X0, Y0 = get_batch(X_tr, Y_tr, batch)
        L = forward(X0)
        loss = calc_loss(L, Y0)
        WIN.append(loss.item())
        if not i % 200:
            # Every 200th iteration: log the mean of the window and reset it.
            LL.append(np.mean(WIN))
            WIN = []

    print(f"Step {step} done. Loss: {LL[-1]}")

In [None]:
# Validation / Test
# Switch every layer out of training mode so BatchNorm1d uses its running
# statistics, then report the loss on both held-out splits without autograd.
for layer in layers:
    layer.training = False

with torch.no_grad():
    for title, inputs, targets in (
        ("Validation loss", X_val, Y_val),
        ("Test loss", X_tst, Y_tst),
    ):
        L = forward(inputs)
        loss = calc_loss(L, targets)
        print(title, loss)

In [None]:
# Plot loss
# Logged losses together with their 20-point moving average.
df = pd.DataFrame(LL, columns=['X'])
X_col = df['X']
MA_X_col = X_col.rolling(window=20).mean()

plt.figure(figsize=(20, 5))
for series, fmt, name in ((X_col, 'b-', 'loss'), (MA_X_col, 'r-', 'MA20')):
    plt.plot(series, fmt, label=name)
plt.grid(linestyle='--')
plt.legend(loc='upper center')
plt.show()

print("Tail of Moving Average column")
MA_X_col[-10:]

In [None]:
# Generate one name by sampling, starting from a fixed prefix.
s = encode("emm")

# Only one row is fed at a time. In training mode BatchNorm1d would take the
# variance of that single row (NaN) and also overwrite its running
# statistics, so make sure the layers are in evaluation mode.
for l in layers:
    l.training = False

# Sampling needs no gradients; skip building the autograd graph.
with torch.no_grad():
    for i in range(100):  # hard cap on the number of sampled characters
        x = torch.tensor([s[-att:]], device=device)  # last `att` indices as context
        L = forward(x)
        L = torch.softmax(L, dim=-1)
        ci = torch.multinomial(L, num_samples=1).item()
        #ci = int(torch.argmax(L).item())  # greedy alternative
        if ci == 0:
            # Index 0 is the '.' token: stop here.
            break
        s.append(ci)
print(decode(s))

In [None]:
# Complete the first `att` letters of several real words and print each
# generated string next to the word its prefix came from.
beg = []
# Kept under its own name: rebinding `batch` here would replace the integer
# mini-batch size that the training cells pass to get_batch.
sample_words = words[30:55]
for w in sample_words:
    if len(w) < att:
        # Left-pad short words with '.' up to `att` characters. The count
        # must be att - len(w); the reverse is negative and pads nothing,
        # leaving rows of unequal length that torch.tensor rejects.
        w = "." * (att - len(w)) + w
    beg.append(encode(w[:att]))
x = torch.tensor(beg, device=device)
#print(x)

# Generation should use BatchNorm1d's running statistics and leave them
# untouched, so make sure the layers are in evaluation mode.
for l in layers:
    l.training = False

# No gradients are needed while sampling.
with torch.no_grad():
    for i in range(7):  # append 7 sampled indices to every row
        L = forward(x[:, -att:])
        L = torch.softmax(L, dim=-1)
        y = torch.multinomial(L, num_samples=1)
        #y = torch.argmax(L, dim=1, keepdim=True)  # greedy alternative
        x = torch.cat([x, y], dim=-1)

for row, source_word in zip(x.detach().cpu().numpy(), sample_words):
    print(decode(row, True), "   ", source_word)


In [None]:
def plot_emb():
    """Scatter-plot columns 4 and 5 of ``E_w``, labelling each point with its character.

    One point is drawn per row of the embedding table and the matching
    ``itos`` character is written on top of it.
    NOTE(review): requires ``E_w`` to have at least 6 columns.
    """
    plt.figure(figsize=(8, 8))
    xs = E_w[:, 4].detach().cpu().numpy()
    ys = E_w[:, 5].detach().cpu().numpy()
    plt.scatter(xs, ys, s=200)
    for i in range(voc_size):
        plt.text(float(xs[i]), float(ys[i]), itos[i], ha="center", va="center", color="white")

In [None]:
# Draw the scatter plot of embedding columns 4 and 5.
plot_emb()