In [370]:
from net import Tensor, Network, Embedding, Recurrent, Vanilla, MLP, char_tokenize, stoi, itos, sgd, parse_txt
from net.util import SEED

import math
import numpy as np

In [371]:
# Hyper-parameters shared by the data, model, training and sampling cells.
BLOCK_SIZE = 3  # context length: passed to char_tokenize and used to size the sampling window
FEATURES = 32  # passed to both Embedding and the Vanilla cell
VOCAB_SIZE = 27  # presumably 26 letters plus the '.' boundary token -- TODO confirm against `vocab`
HIDDEN_SIZE = 64  # passed to the Vanilla cell; also the first entry of `size`
size = (HIDDEN_SIZE, 128, VOCAB_SIZE)  # passed to MLP; presumably (input, hidden, output) widths -- confirm in net
LR = 0.1  # base learning rate; the training cell scales it before calling sgd
STEPS = 10000  # passed to sgd
BATCH_SIZE = 128  # passed to sgd
TEMPERATURE = 0.8  # softmax temperature for sampling (values below 1 sharpen the distribution)

In [372]:
# Load the names and tokenize them into (xs, ys) pairs using a BLOCK_SIZE context,
# then build the two lookup tables between characters and integer ids.
names = parse_txt("../data/names.txt")
xs, ys, vocab = char_tokenize(names, BLOCK_SIZE)
str_to_int = stoi(vocab)
int_to_str = itos(vocab)


# Contiguous 80% / 10% / 10% split by position: train = [0, b1), test = [b1, b2),
# dev = [b2, end). No shuffling happens here, so the split follows the order
# produced by char_tokenize.
# NOTE(review): x_train and x_dev are built with Tensor's default grad_required,
# while x_test and every y_* pass grad_required=False -- confirm this is intended.
b1 = math.floor(len(xs) * 0.8)
b2 = math.floor(len(xs) * 0.9)
x_train = Tensor(xs[:b1])
y_train = Tensor(ys[:b1], grad_required=False)
x_test = Tensor(xs[b1:b2], grad_required=False)
y_test = Tensor(ys[b1:b2], grad_required=False)
x_dev = Tensor(xs[b2:])
y_dev = Tensor(ys[b2:], grad_required=False)

In [373]:
# Assemble the model as a Network of three blocks, in order: emb -> rnn -> mlp.
emb = Embedding(FEATURES, VOCAB_SIZE)
cell = Vanilla(HIDDEN_SIZE, FEATURES)
rnn = Recurrent(cell)  # wraps the Vanilla cell
mlp = MLP(size, 'tanh')  # 'tanh' presumably selects the activation -- confirm in net
model = Network([emb, rnn, mlp])

In [410]:
# Training run on the train split.
# NOTE(review): the rate passed to sgd is LR * 0.01 (0.1 -> 0.001), which looks like
# a fine-tuning pass on an already-trained model. The execution count of this cell
# jumps well past the model-construction cell, so the saved weights presumably
# depend on earlier runs that are not recorded here -- a fresh Restart & Run All
# may not reproduce the logged losses.
model.training = True
sgd(model, x_train, y_train, LR * 0.01, BATCH_SIZE, STEPS)

Loss: 2.415642953473757 on step: 1
Loss: 2.3356541100472836 on step: 101
Loss: 2.1422598807628024 on step: 201
Loss: 2.0225547344157726 on step: 301
Loss: 2.455628622132769 on step: 401
Loss: 2.126248081684291 on step: 501
Loss: 2.381793575977924 on step: 601
Loss: 2.130827288401496 on step: 701
Loss: 2.2229562423225335 on step: 801
Loss: 2.0321112520848614 on step: 901
Loss: 2.0335286462055246 on step: 1001
Loss: 2.2134474695272277 on step: 1101
Loss: 2.188841525177631 on step: 1201
Loss: 2.2080059349760983 on step: 1301
Loss: 2.382658257780363 on step: 1401
Loss: 2.2479546940149895 on step: 1501
Loss: 2.3816655689237747 on step: 1601
Loss: 2.187193901803687 on step: 1701
Loss: 2.145356558399481 on step: 1801
Loss: 2.4684136907357375 on step: 1901
Loss: 2.3798719573402813 on step: 2001
Loss: 2.2560180469249276 on step: 2101
Loss: 2.2947701608076803 on step: 2201
Loss: 2.0876094801206593 on step: 2301
Loss: 2.357226536608525 on step: 2401
Loss: 2.2585861371652416 on step: 2501
Loss: 2.

In [411]:
# Evaluate on the held-out test split: switch off training mode, run a forward
# pass over x_test, then display the scalar loss against y_test
# (presumably cross-entropy -- confirm in net).
model.training = False
model.forward(x_test)
model.loss(y_test).item()

2.3320191794219136

In [412]:
from difflib import SequenceMatcher

def is_similar(a, b, threshold=0.8):
    """Return True when `a` and `b` are near-duplicates, ignoring case.

    Similarity is difflib's SequenceMatcher ratio (a float in [0, 1]) computed
    on the lower-cased strings; the pair counts as similar only when the ratio
    is strictly greater than `threshold`.
    """
    matcher = SequenceMatcher(a=a.lower(), b=b.lower())
    score = matcher.ratio()
    return score > threshold


# Sampling settings for this cell. TEMPERATURE comes from the hyper-parameter cell.
TOP_K = 4  # sample only among the k most likely next characters
NUM_NAMES = 20  # number of distinct names to print
MAX_NAME_LEN = 30  # safety cap so a model that never emits '.' cannot loop forever
MAX_ATTEMPTS = NUM_NAMES * 50  # safety cap on rejected (empty / near-duplicate) samples


def sample_next_index(logits, temperature=TEMPERATURE, top_k=TOP_K):
    """Draw the next token id from `logits` with temperature and top-k sampling.

    Args:
        logits: 1-D numpy array of unnormalized scores, one per vocabulary entry.
        temperature: divisor applied to the logits before the softmax.
        top_k: number of highest-probability entries kept as candidates.

    Returns:
        One element of the top-k index array, drawn with np.random.choice using
        the renormalized top-k probabilities.
    """
    scaled = logits / temperature
    scaled = scaled - np.max(scaled)  # subtract the max so exp() cannot overflow
    probs = np.exp(scaled)
    probs = probs / probs.sum()

    # Keep the k most likely entries and renormalize them to sum to 1.
    top_indices = np.argsort(probs)[-top_k:]
    top_probs = probs[top_indices]
    top_probs = top_probs / top_probs.sum()

    return np.random.choice(top_indices, p=top_probs)


# Inference mode; set here so the cell does not rely on an earlier cell having done it.
model.training = False

boundary_id = str_to_int['.']  # '.' both pads the initial context and ends a name
seen_names = set()
attempts = 0
while len(seen_names) < NUM_NAMES and attempts < MAX_ATTEMPTS:
    attempts += 1
    context = [boundary_id] * BLOCK_SIZE
    name = ""
    while len(name) < MAX_NAME_LEN:
        x = Tensor(np.array(context), grad_required=False)
        model.forward(x)
        # The output of the last block holds the logits for the next character.
        logits = model.blocks[-1].out.value.flatten()

        next_id = sample_next_index(logits)
        ch = int_to_str[next_id]
        if ch == '.':
            break

        name += ch
        context = context[1:] + [next_id]  # slide the window forward by one token

    # Skip empty samples and anything too close to a name already printed.
    if not name or any(is_similar(name, word) for word in seen_names):
        continue

    print(name.capitalize())
    seen_names.add(name)


Amarie
Marah
Jaya
Anna
Jaylee
Jamillia
Jani
Kayla
Kalena
Kallie
Merya
Ala
Kendelina
Ari
Jennah
Kenso
Amora
Amille
Aley
Ariah
