In [21]:
import torch
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Load the dataset as a list of strings, one entry per line of the file.
# The context manager closes the file handle as soon as the contents are read,
# instead of leaving it open until the garbage collector gets to it.
with open('../_resources/names.txt', 'r') as names_file:
    words = names_file.read().splitlines()

In [90]:
# Vocabulary: every distinct character that appears in the dataset, sorted.
chars = sorted(set(''.join(words)))

# Character -> index. Index 0 is reserved for the '.' boundary token,
# so the characters from the data are numbered starting at 1.
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0

# Index -> character (inverse of stoi). The stable sort moves the '.' entry to
# the front of the dict and leaves every other entry in its existing order.
itos = {idx: ch for ch, idx in stoi.items()}
itos = dict(sorted(itos.items(), key=lambda pair: pair[1] != '.'))

## Plan

- start with 2 dots
- ensure the probability of sequences like ".a." is 0 (i.e. newly generated names must also always start with `..`)

so `emma` would be:
.. => e
.e => m
em => m
mm => a
ma => .

In [56]:
# Trigram frequency table: (ch1, ch2, ch3) -> number of occurrences over all words.
b = {}
for word in words:
    # Two leading dots give the first character a full two-character context;
    # the single trailing dot marks the end of the word.
    padded = ['.', '.', *word, '.']
    # Slide a window of three consecutive characters across the padded word.
    for trigram in zip(padded, padded[1:], padded[2:]):
        b[trigram] = b.get(trigram, 0) + 1

In [17]:
# Ten most frequent trigrams, highest count first (ties keep their insertion order).
sorted(b.items(), key=lambda kv: kv[1], reverse=True)[:10]

[(('.', '.', 'a'), 4410),
 (('.', '.', 'k'), 2963),
 (('.', '.', 'm'), 2538),
 (('.', '.', 'j'), 2422),
 (('.', '.', 's'), 2055),
 (('a', 'h', '.'), 1714),
 (('.', '.', 'd'), 1690),
 (('n', 'a', '.'), 1673),
 (('.', '.', 'r'), 1639),
 (('.', '.', 'l'), 1572)]

In [20]:
# Dense trigram count tensor: N[i, j, k] counts how often character k follows
# the character pair (i, j), with characters mapped to indices through stoi.
# NOTE(review): the hard-coded 27 presumably means 26 letters plus '.' --
# confirm it matches len(stoi) for the dataset in use.
N = torch.zeros((27, 27, 27), dtype=torch.int32)

for word in words:
    # Same padding as the dict-based count: two start dots, one end dot.
    padded = ['.', '.', *word, '.']
    for trigram in zip(padded, padded[1:], padded[2:]):
        ix1, ix2, ix3 = (stoi[ch] for ch in trigram)
        N[ix1, ix2, ix3] += 1

In [89]:
# Turn the trigram counts into next-character distributions P[ix1, ix2, :].
# Step 1: add-one smoothing, then normalise over the last axis (ix3).
P = (N + 1).float()
P /= P.sum(2, keepdim=True)  # the sums have shape [27, 27, 1] and broadcast over ix3

# Step 2: trigrams that never occur in the data are treated as illegal, so
# their probability is forced to 0 and each row is renormalised across ix3.
P[N == 0] = 0
row_mass = P.sum(2, keepdim=True)
# A context that never occurs at all (e.g. a letter followed by '.', since the
# end dot is always the last character of a trigram) has every entry zeroed
# above, so its row_mass is 0 and a plain division would fill the row with NaN.
# Clamping the divisor to the smallest positive float leaves every non-empty
# row unchanged and keeps the empty rows at exactly 0.
P /= row_mass.clamp_min(torch.finfo(P.dtype).tiny)

# Sample 100 names from the trigram model with a fixed seed, so that re-running
# the cell reproduces the same list.
g_offset = 0  # added to the base seed; change it to draw a different batch
g = torch.Generator().manual_seed(2147483647 + g_offset)
out = []

for draw in range(100):
    # Every name starts from the double-dot context.
    name = ['.', '.']
    while True:
        # Distribution over the next character given the last two characters.
        context = P[stoi[name[-2]], stoi[name[-1]]]
        next_ix = torch.multinomial(context, num_samples=1, replacement=True, generator=g).item()
        next_ch = itos[next_ix]
        if next_ch != '.':
            name.append(next_ch)
            continue
        # Sampling '.' ends the name; strip the two leading dots before storing it.
        out.append(''.join(name[2:]))
        break

print(out)

['ce', 'bra', 'jalius', 'rochityharlonimittain', 'luwak', 'ka', 'da', 'samiyah', 'javer', 'gotai', 'moriellavoji', 'preda', 'kaley', 'maside', 'en', 'aviyah', 'folspihiliven', 'tahlasuzurd', 'del', 'jol', 'pen', 'aisan', 'jaridynne', 'zam', 'der', 'jair', 'tagaikayshaabelarl', 'khysteeven', 'abricayharien', 'xzai', 'kalmaravaysor', 'myson', 'laitynna', 'evedriseriyen', 'kyille', 'lahmie', 'marah', 'ammikamaxemmy', 'asharle', 'alcalha', 'jayceasve', 'selane', 'nellay', 'ra', 'adaliyana', 'isa', 'dougen', 'luishya', 'joharlan', 'annutsaoishri', 'bo', 'ra', 'bekarie', 'vikace', 'ara', 'jayk', 'jagh', 'crylesterliyla', 'ca', 'ro', 'prah', 'ye', 'en', 'aidgessir', 'mer', 'decla', 'tie', 'khamedahzymareizaymarvituricon', 'cri', 'alkirajamere', 'morad', 'lie', 'mariannanfelian', 'keonteahaj', 'ka', 'rena', 'mon', 'keika', 'suynn', 'micia', 'am', 'rocinsofinslie', 'kenie', 'zakiha', 'denovi', 'kar', 'kas', 'try', 'azemir', 'ret', 'ta', 'ley', 'ke', 'sa', 'carlorcagatai', 'versimikavallin', 'sk