In [10]:
# Read the corpus, one name per line. The context manager guarantees the
# file handle is closed once the text has been read.
with open("names.txt", encoding="utf-8") as names_file:
    raw_lines = names_file.read().splitlines()

# Strip first and filter afterwards, so that whitespace-only lines are
# dropped instead of surviving as empty names.
words = [name for name in (line.strip() for line in raw_lines) if name]

# getting the char library from data set
all_data = ''.join(words)
chars = sorted(set(all_data))

# '|' is reserved as the start/end-of-word marker. If it also occurred inside
# the data, the assignment below would overwrite its index and leave a hole
# in the index space, so fail early with a clear message instead.
if '|' in chars:
    raise ValueError("names.txt contains '|', which is reserved as the word boundary marker")

# Real characters are numbered from 1 in sorted order; the marker gets index 0.
stoi = {ch: idx + 1 for idx, ch in enumerate(chars)}
stoi['|'] = 0
itos = {ind: st for st, ind in stoi.items()}

In [11]:
import torch
import torch.nn.functional as F

In [12]:
# Collect every adjacent character pair (bigram) of every word as integer ids.
# Each word is wrapped in '|' markers so the pairs also cover "start of word ->
# first char" and "last char -> end of word".
first_ids, second_ids = [], []

for word in words:
    padded = ["|", *word, "|"]
    for prev_ch, next_ch in zip(padded[:-1], padded[1:]):
        first_ids.append(stoi[prev_ch])
        second_ids.append(stoi[next_ch])

inputs = torch.tensor(first_ids)
targets = torch.tensor(second_ids)
lib_len = inputs.numel()  # total number of bigrams in the data set

print(f'Library length: {lib_len}')

# Set up the 'network': a single square weight matrix with one row and one
# column per entry of the character library, drawn from a seeded generator
# so the starting weights are reproducible.
ch_library_len = len(stoi)

g = torch.Generator().manual_seed(412987498012)
W = torch.randn(
    (ch_library_len, ch_library_len),
    generator=g,
    requires_grad=True,
)

Library length: 228146


In [13]:
# Hyper-parameters of the plain gradient-descent loop.
num_steps = 200      # lower this to see samples from a barely trained model
learning_rate = 50

# The one-hot encoding of the inputs and the row index never change between
# steps, so build them once instead of re-allocating a
# (lib_len, ch_library_len) matrix on every iteration.
xenc = F.one_hot(inputs, num_classes=ch_library_len).float()  # input to the network
row_index = torch.arange(lib_len)

# Placeholder so the final print still works when num_steps is 0.
loss = torch.tensor(float("nan"))

# gradient descent
for k in range(num_steps):
    # forward pass
    logits = xenc @ W  # predict log-counts
    counts = logits.exp()  # counts, equivalent to N
    probs = counts / counts.sum(1, keepdim=True)  # probabilities for next character
    # mean negative log-probability assigned to the true next character
    loss = -probs[row_index, targets].log().mean()

    # backward pass
    W.grad = None
    loss.backward()

    # update: modify the leaf tensor in place without recording the step in
    # the autograd graph (preferred over writing through W.data)
    with torch.no_grad():
        W -= learning_rate * W.grad

print(f'Normalized Likelihood (Loss): {loss.item()}')

Normalized Likelihood (Loss): 2.4626340866088867


In [14]:
# Length of every word in the data set, as an integer tensor.
word_lens = torch.tensor([len(w) for w in words])

# Histogram of word lengths: entry n is the number of words with exactly n
# characters. bincount yields the same counts as summing a
# (num_words, max_len + 1) one-hot matrix over its rows, without building
# that matrix or looping over the tensor in Python to find the maximum.
lengths = torch.bincount(word_lens)

# frequency table of word lengths of data
len_frequency = lengths / lengths.sum()

In [15]:
g = torch.Generator().manual_seed(412987498012)

# Draw a target length for each of the 5 names from the empirical
# word-length distribution.
w_lens = torch.multinomial(len_frequency, num_samples=5, replacement=True, generator=g)

# Sampling needs no gradients; keep autograd from tracking operations on W.
with torch.no_grad():
    for target_len in w_lens.tolist():
        ix = 0  # index 0 is the '|' marker, i.e. start of a word
        name = []
        while len(name) < target_len:
            wenc = F.one_hot(torch.tensor([ix]), num_classes=ch_library_len).float()
            logits = wenc @ W  # predicting log-counts
            counts = logits.exp()  # exponentiating - getting real counts, no negatives
            prob_of_next_char = counts / counts.sum(1, keepdim=True)

            ix = torch.multinomial(
                prob_of_next_char, num_samples=1, replacement=True, generator=g
            ).item()

            if ix == 0:
                # The end marker was sampled before the target length was
                # reached: discard the attempt and start over. ix is already 0,
                # so the next draw is conditioned on the start marker again.
                # This also applies at the last position, so the '|' marker is
                # never emitted as part of a printed name.
                name = []
            else:
                name.append(itos[ix])
        print(''.join(name))

vomiilal
hoboroni
kalanee
brrinte
kefamie
