In [1]:
# Load the dataset: one name per line.  The `with` block guarantees the file
# handle is closed as soon as the contents have been read.
with open("names.txt", 'r') as names_file:
    words = names_file.read().splitlines()
words[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [2]:
# Quick sanity check of the dataset: length range and total number of names.
name_lengths = [len(name) for name in words]
print(f"Shortest name: {min(name_lengths)}")
print(f"Longest  name: {max(name_lengths)}")
print(f"Number of names: {len(words)}")

Shortest name: 2
Longest  name: 15
Number of names: 32033


In [3]:
import torch
# Vocabulary: every distinct character that appears in the names, sorted.
chars = sorted(set(''.join(words)))
# Character -> integer ID.  IDs start at 1 because 0 is reserved for the
# special '.' token that marks both the start and the end of a name.
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
NUM_CHARS = len(stoi)
# Inverse mapping: integer ID -> character.
itos = {idx: ch for ch, idx in stoi.items()}
# Integer matrix with one row and one column per character, initialised to zero.
N = torch.zeros(NUM_CHARS, NUM_CHARS, dtype=torch.int32)
N.shape

torch.Size([27, 27])

In [4]:
# ---- Time for a neural network approach ------

# Build the training sets.  Every bigram (pair of adjacent characters)
# contributes one example: xs gets the ID of the first character and ys the
# ID of the character that follows it.
xs, ys = [], []
for w in words:
    # Surround the word with the '.' token so that the transitions into the
    # first letter and out of the last letter are part of the data.
    letters = ['.', *w, '.']
    # Pair each position n with position n+1.
    for current_ch, next_ch in zip(letters[:-1], letters[1:]):
        xs.append(stoi[current_ch])
        ys.append(stoi[next_ch])  # The next letter is the desired value.
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num_inputs = xs.numel()
num_inputs

228146

In [10]:
# One-hot encode the inputs: each character ID becomes a row of NUM_CHARS
# floats that is 1.0 at the position of that ID and 0.0 everywhere else.
import torch.nn.functional as F
x_encoded = F.one_hot(xs, num_classes=NUM_CHARS).to(torch.float32)
x_encoded.size()

torch.Size([228146, 27])

In [20]:
# Concrete example: the first five input IDs and their one-hot rows.
sample = xs[:5]
print(sample)
sample_encoded = F.one_hot(sample, num_classes=NUM_CHARS).to(torch.float32)
sample_encoded

tensor([ 0,  5, 13, 13,  1])


tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [25]:
# Seeded generator so the random initial weights are the same on every run.
g = torch.Generator().manual_seed(2147483647)
# Initialize weights for a set of neurons.  There is one output neuron per
# character: the targets in ys index the output columns and sampled output
# indices are looked up in itos, so the layer width has to track the
# vocabulary size rather than being a separate hard-coded number.
NUM_NEURONS = NUM_CHARS
W = torch.randn((NUM_CHARS, NUM_NEURONS), generator=g, requires_grad=True)
# Initialize the gradient.
W.grad = None
W[0]

tensor([ 1.5674, -0.2373, -0.0274, -1.1008,  0.2859, -0.0296, -1.5471,  0.6049,
         0.0791,  0.9046, -0.4713,  0.7868, -0.3284, -0.4330,  1.3729,  2.9334,
         1.5618, -1.6261,  0.6772, -0.8404,  0.9849, -0.1484, -1.4795,  0.4483,
        -0.0707,  2.4968,  2.4448], grad_fn=<SelectBackward0>)

In [30]:
# Test the output = W . x + b  (for now b is zero so ignore)

# We interpret the network's raw outputs as log-counts ("logits"): the log of
# the counts we would get if we sampled the model some huge number of times.
# In the simple bigram model we literally counted examples in the training
# data; here we never count, we just read the outputs as the log of those
# counts.
logits = torch.matmul(x_encoded, W)

# Exponentiating the logits gives back (positive) counts, like N above, and
# normalising each row turns the counts into probabilities.  These two steps
# together are also called softmax.
counts = torch.exp(logits)
probs = counts / counts.sum(dim=1, keepdim=True)
# At this point we have the probabilities for every training example.  Each
# row is one input case, and the entries along that row are the probabilities
# the model assigns to each character being the next one.

# This process of going once through the network, multiplying x by W, is a
# "forward pass".
print("probs[0]:", probs[0])

# Now we calculate the loss: the average negative log likelihood of the
# correct next character.  Row i, column ys[i] picks the probability the
# model gave to the true target of example i.
target_probs = probs[torch.arange(num_inputs), ys]
loss = -torch.log(target_probs).mean()
loss.item()

probs[0]: tensor([0.0607, 0.0100, 0.0123, 0.0042, 0.0168, 0.0123, 0.0027, 0.0232, 0.0137,
        0.0313, 0.0079, 0.0278, 0.0091, 0.0082, 0.0500, 0.2378, 0.0603, 0.0025,
        0.0249, 0.0055, 0.0339, 0.0109, 0.0029, 0.0198, 0.0118, 0.1537, 0.1459],
       grad_fn=<SelectBackward0>)


3.758953332901001

In [31]:
# BTW, here is what's happening in the indexing expression used for the loss
# (probs[torch.arange(num_inputs), ys]), written out as an explicit loop.
# for i in range(num_inputs):
#     x = xs[i]  <-- input to network
#     y = ys[i]  <-- expected output
#     probability_for_next_char = probs[i, y]
#     log_likelihood = torch.log(probability_for_next_char)
#     neg_log_likelihood = -log_likelihood
#     all_negll.append(neg_log_likelihood)
# avg_neg_ll = all_negll.mean()
for i in range(5):  # Five samples are enough to see the pattern.
    x = xs[i].item()  # <-- input to network
    y = ys[i].item()  # <-- expected output
    # Probability the model gave to the correct next char, then its
    # log likelihood and negative log likelihood.
    probability_for_next_char = probs[i, y]
    log_likelihood = probability_for_next_char.log()
    neg_log_likelihood = -log_likelihood

    print("-----")
    print("input to the net:   ", x)
    print("expected next char: ", y)
    # print("output probs of the model: ", probs[i])
    print("Probability assigned to char by the model: ", probability_for_next_char.item())
    print("log likelihood: ", log_likelihood.item())
    print("neg ll: ", neg_log_likelihood.item())


-----
input to the net:    0
expected next char:  5
Probability assigned to char by the model:  0.012286253273487091
log likelihood:  -4.3992743492126465
neg ll:  4.3992743492126465
-----
input to the net:    5
expected next char:  13
Probability assigned to char by the model:  0.018050702288746834
log likelihood:  -4.014570713043213
neg ll:  4.014570713043213
-----
input to the net:    13
expected next char:  13
Probability assigned to char by the model:  0.026691533625125885
log likelihood:  -3.623408794403076
neg ll:  3.623408794403076
-----
input to the net:    13
expected next char:  1
Probability assigned to char by the model:  0.07367684692144394
log likelihood:  -2.6080667972564697
neg ll:  2.6080667972564697
-----
input to the net:    1
expected next char:  0
Probability assigned to char by the model:  0.0149775305762887
log likelihood:  -4.201204299926758
neg ll:  4.201204299926758


In [13]:
# Backward pass.
# Clear any gradient left over from an earlier backward() call first;
# otherwise backward() would add the new gradient on top of the old one.
W.grad = None
loss.backward()

# Note how awesome this is.  loss was calculated using lots of math operations, and
# it started by going through the network.  torch kept track of all of the math
# operations, so we can just call .backward() on loss to go all the way backward
# to the start of the network and compute all of the gradients.

# Now use the resulting gradient to update all of the params.
# We will go 10% along the negative of the gradient.
# The update runs under no_grad() so that it is not itself recorded as part of
# the computation graph; this replaces writing through W.data, which hides the
# in-place change from autograd.
with torch.no_grad():
    W -= 0.1 * W.grad

In [32]:
# Loop to do more training.

# Gradient descent
LEARNING_RATE = 10

# The one-hot inputs and the row indices used to pick out the target
# probabilities are the same on every iteration, so build them once up front
# instead of 200 times inside the loop.
x_encoded = F.one_hot(xs, num_classes=NUM_CHARS).float()
example_rows = torch.arange(num_inputs)

for k in range(200):
    # forward pass: logits -> softmax -> average negative log likelihood
    logits = x_encoded @ W
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdim=True)
    loss = -probs[example_rows, ys].log().mean()
    print(loss.item())

    # backward pass
    W.grad = None  # Reset gradients before computing them again!
    loss.backward()

    # Update.  Done under no_grad() so the parameter step is not tracked by
    # autograd (preferred over mutating W.data directly).
    with torch.no_grad():
        W -= LEARNING_RATE * W.grad

3.758953332901001
3.6702592372894287
3.591153860092163
3.520017385482788
3.4557948112487793
3.3977162837982178
3.345163583755493
3.2975902557373047
3.254483938217163
3.2153544425964355
3.17973256111145
3.147189140319824
3.117339849472046
3.0898516178131104
3.064443588256836
3.0408785343170166
3.0189590454101562
2.998518705368042
2.9794163703918457
2.9615297317504883
2.9447529315948486
2.928992509841919
2.9141645431518555
2.900193929672241
2.887012004852295
2.874558925628662
2.8627758026123047
2.851613759994507
2.841024398803711
2.830965757369995
2.8213980197906494
2.812286138534546
2.8035969734191895
2.7953009605407715
2.7873711585998535
2.7797818183898926
2.7725107669830322
2.7655372619628906
2.7588419914245605
2.7524070739746094
2.7462172508239746
2.7402572631835938
2.7345142364501953
2.7289748191833496
2.7236287593841553
2.7184643745422363
2.713472604751587
2.708644390106201
2.7039716243743896
2.699446678161621
2.6950621604919434
2.6908113956451416
2.6866886615753174
2.6826879978179

In [37]:
# We can now sample from our trained model.

# Generate new names using our model.
g = torch.Generator().manual_seed(1337)
# Sampling only needs forward passes, so gradient tracking is switched off.
with torch.no_grad():
    for i in range(20):
        out_name = []
        ix = 0  # Start with the start char.
        while True:
            # Instead of a table lookup (p = P[ix]), run the current char
            # through the network.  The temporaries get their own names so the
            # full-training-set tensors x_encoded / logits / counts are not
            # overwritten by these single-row versions.
            ix_encoded = F.one_hot(torch.tensor([ix]), num_classes=NUM_CHARS).float()
            next_logits = ix_encoded @ W   # Predict log-counts
            next_counts = next_logits.exp()
            p = next_counts / next_counts.sum(1, keepdim=True)  # Probabilities for next char.
            # torch.multinomial will sample from a given probability.
            ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
            if ix == 0:
                # Index 0 is the '.' token: the name is finished.
                break
            out_name.append(itos[ix])
        print(''.join(out_name))

myliena
r
a
ahi
gsammian
n
xxtfpcaleldfenixe
kar
jaranigf
meceeja
ty
sbenna
jathe
tyzimcalajan
ma
vmanpcciloly
jai
royu
ka
aqamyn
