In [16]:
import torch
import torch.nn.functional as F

In [9]:
# get name dataset
with open('names.txt', 'r') as file:
    lines = [line.strip() for line in file]
# get unique chars
chars = sorted(list(set(''.join(lines))))
stoi = {s:i+1 for i, s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s, i in stoi.items()}

In [10]:
sz = len(stoi.items())
N = torch.zeros([sz, sz], dtype=torch.int32)
print(N.shape)
N.dtype

torch.Size([27, 27])


torch.int32

In [11]:
# create bigram matrix
for line in lines:
    chars = ['.'] + list(line) + ['.']
    for pr, sc in zip(chars, chars[1:]):
        idx1 = stoi[pr]
        idx2 = stoi[sc]
        N[idx1][idx2] += 1
N

tensor([[   0, 4410, 1306, 1542, 1690, 1531,  417,  669,  874,  591, 2422, 2963,
         1572, 2538, 1146,  394,  515,   92, 1639, 2055, 1308,   78,  376,  307,
          134,  535,  929],
        [6640,  556,  541,  470, 1042,  692,  134,  168, 2332, 1650,  175,  568,
         2528, 1634, 5438,   63,   82,   60, 3264, 1118,  687,  381,  834,  161,
          182, 2050,  435],
        [ 114,  321,   38,    1,   65,  655,    0,    0,   41,  217,    1,    0,
          103,    0,    4,  105,    0,    0,  842,    8,    2,   45,    0,    0,
            0,   83,    0],
        [  97,  815,    0,   42,    1,  551,    0,    2,  664,  271,    3,  316,
          116,    0,    0,  380,    1,   11,   76,    5,   35,   35,    0,    0,
            3,  104,    4],
        [ 516, 1303,    1,    3,  149, 1283,    5,   25,  118,  674,    9,    3,
           60,   30,   31,  378,    0,    1,  424,   29,    4,   92,   17,   23,
            0,  317,    1],
        [3983,  679,  121,  153,  384, 1271,   82,

In [13]:
# demonstration of broadcasting semantics and associated nuances
# we need the whole row to have the sum of the row for the row to be converted to a distribution
# so its important how we broadcast the tensor
dist = N.float()
sumarr = dist.sum(1, keepdim=True)
bad_sumarr = dist.sum(1)
# print(sumarr.shape)
# print(bad_sumarr.shape)
dummy_arr = torch.zeros([sz, sz])
correct_arr = dummy_arr + sumarr
# print(correct_arr)
bad_dist = dummy_arr + bad_sumarr
# print(bad_dist)

#actual useful code
dist = dist / sumarr

In [46]:
# sample using multinomial

for i in range(20):
    outvec= []
    idx = 0
    while True:
        d = dist[idx]
        idx = torch.multinomial(d, num_samples=1, replacement=True).item()
        outvec.append(itos[idx])
        if idx == 0:
            break
    print(''.join(outvec))

shasofz.
ki.
khanan.
nyci.
adenee.
zilelyllyze.
aron.
cyri.
dazyia.
le.
hirixtilamizaretaynnay.
re.
jaxaynn.
jaynsolosailoeyliomana.
lydivime.
bera.
h.
jageni.
ko.
be.


In [29]:
# now, we move from the hand computed probability approach to using a single layer of neurons
# loss - average negative loss likelihood of next character, we train weights to push these predictions closer to correct
# probabilities
# create the training dataset - feature - prev character, label - next character
xs, ys = [], []
for line in lines:
    chars = ['.'] + list(line) + ['.']
    for ch1, ch2 in zip(chars, chars[1:]):
        idx1 = stoi[ch1]
        idx2 = stoi[ch2]
        xs.append(idx1)
        ys.append(idx2)
xs = torch.tensor(xs)
ys = torch.tensor(ys)
xnum = xs.nelement()
xnum

228146

In [38]:
# initialize the layer and train
W = torch.randn((sz, sz), requires_grad=True)
num_epochs = 20

for i in range(num_epochs):
    xenc = F.one_hot(xs, num_classes=sz).float()
    logits = xenc @ W
    # softmax essentially
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdims=True)
    # clever stuff, this will mean fuck all on a revisit unless you know the semantics we're using here
    loss = -probs[torch.arange(xnum), ys].log().mean()
    print(loss.item())

    #backprop
    W.grad = None
    loss.backward()

    W.data += -100*W.grad

3.8103139400482178
3.176689386367798
2.9380834102630615
2.8131165504455566
2.7476646900177
2.7000184059143066
2.679823875427246
2.63059663772583
2.6152689456939697
2.601663827896118
2.6081130504608154
2.575822353363037
2.5718016624450684
2.565399408340454
2.57899808883667
2.550689935684204
2.5504701137542725
2.5458931922912598
2.562288522720337
2.5354316234588623


In [42]:
# sample and generate

for i in range(10):

    out = []
    ix = 0
    while True:
        # you only need it to have the index
        xenc = F.one_hot(torch.tensor([ix]), num_classes=sz).float()
        logits = xenc @ W
        counts = logits.exp()
        p = counts / counts.sum(1, keepdims = True)

        #sample
        ix = torch.multinomial(p, num_samples=1, replacement=True).item()
        out.append(itos[ix])
        if(ix == 0):
            break
    print(''.join(out))

mdianaranajpvocpfa.
bwe.
pshakiiogel.
avivarieonariaynnay.
viartronanlenixxtrxfikkamari.
chariariayninnan.
sjass.
vianademan.
gkaikenah.
mi.
