In [1]:
import torch
import torch.nn.functional as F

In [2]:
# Load the dataset: names.txt holds one name per line.
# The context manager closes the file handle as soon as the contents are read,
# instead of leaving it open until the garbage collector gets to it.
with open('names.txt', 'r') as f:
  words = f.read().splitlines()
words[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [3]:
# Total number of names (lines) loaded from the dataset.
len(words)

32033

In [4]:
# Character-level vocabulary.
#
# Every distinct character in the dataset gets an integer id starting at 1
# (in sorted order), and the special '.' start/end-of-word marker gets id 0.
# For this dataset that yields: . = 0, a = 1, b = 2, ..., z = 26.
chars = sorted(set(''.join(words)))
stoi = dict(zip(chars, range(1, len(chars) + 1)))  # character -> integer
stoi['.'] = 0
itos = {idx: ch for ch, idx in stoi.items()}       # integer -> character

In [5]:
# Build the bigram training set (x, y): xs[i] is the id of a character and
# ys[i] is the id of the character that immediately follows it.
# Each word is wrapped in '.' markers so the model also sees which characters
# start a name and which ones end it.
xs, ys = [], []

for w in words:
  chs = ['.', *w, '.']
  for ch1, ch2 in zip(chs[:-1], chs[1:]):
    ix1, ix2 = stoi[ch1], stoi[ch2]
    xs.append(ix1)
    ys.append(ix2)

# Convert the Python lists of ids into integer tensors.
xs = torch.tensor(xs)
ys = torch.tensor(ys)

num = xs.numel()
print('number of examples: ', num)

number of examples:  228146


In [6]:
# Peek at the encoded inputs (xs) and their next-character targets (ys).
xs, ys

(tensor([ 0,  5, 13,  ..., 25, 26, 24]),
 tensor([ 5, 13, 13,  ..., 26, 24,  0]))

In [7]:
# One-hot encode the input ids: each integer becomes a 27-dimensional vector
# that is 1.0 at the position of that id and 0.0 everywhere else.
# The cast to float32 is needed because one_hot returns integers, while the
# matrix multiply with the weights works on floats.
xenc = F.one_hot(xs, num_classes=27).to(torch.float32)
xenc

tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 1., 0.],
        [0., 0., 0.,  ..., 0., 0., 1.],
        [0., 0., 0.,  ..., 1., 0., 0.]])

In [8]:
# One row per training example, one column per one-hot class.
xenc.shape

torch.Size([228146, 27])

In [9]:
# Initialise the single linear layer: a 27x27 weight matrix drawn from a
# standard normal distribution. A dedicated, seeded generator makes the
# initial weights (and therefore the training run that starts from them)
# reproducible across kernel restarts.
init_g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27, 27), generator=init_g, requires_grad=True)

# Matrix multiplication computes (x*w) for every example at once: because each
# row of xenc is one-hot, row i of the result is simply the row of W selected
# by xs[i], i.e. the weighted sum of the one-hot encoded features.
logits = xenc @ W
logits

tensor([[-0.6235, -1.5779, -0.4353,  ...,  0.4561, -0.3079,  0.5908],
        [-0.7750,  0.8377,  1.9633,  ...,  1.3582, -0.2448, -1.0104],
        [ 0.0557,  2.0179,  0.1426,  ..., -0.3167,  1.1315,  1.6629],
        ...,
        [-1.0436,  1.3638,  0.9527,  ..., -1.5812,  0.6337, -0.2417],
        [-1.0589,  0.1361, -0.8992,  ..., -1.7689,  1.1836, -0.7171],
        [-0.1194,  1.0546,  0.0868,  ..., -1.9429, -1.7843,  0.1652]],
       grad_fn=<MmBackward0>)

In [10]:
# Softmax, written out by hand:
#   1. exponentiate the logits so every entry becomes a positive "count";
#   2. divide each row by its own sum so the row adds up to 1 and can be read
#      as a probability distribution over the next character.
counts = torch.exp(logits)
probs = counts / counts.sum(dim=1, keepdim=True)
probs

tensor([[0.0173, 0.0067, 0.0209,  ..., 0.0510, 0.0238, 0.0584],
        [0.0080, 0.0403, 0.1241,  ..., 0.0678, 0.0136, 0.0063],
        [0.0231, 0.1645, 0.0252,  ..., 0.0159, 0.0678, 0.1154],
        ...,
        [0.0056, 0.0618, 0.0410,  ..., 0.0033, 0.0298, 0.0124],
        [0.0093, 0.0306, 0.0109,  ..., 0.0046, 0.0872, 0.0130],
        [0.0203, 0.0657, 0.0250,  ..., 0.0033, 0.0038, 0.0270]],
       grad_fn=<DivBackward0>)

In [11]:
# gradient descent: full-batch training of W on the bigram data
NUM_STEPS = 100        # number of full-batch updates
LEARNING_RATE = 50.0   # step size applied to the gradient
REG_STRENGTH = 0.01    # weight of the L2 penalty that pulls W towards zero

# The inputs never change between steps, so the one-hot encoding and the row
# indices used to pick out the target probabilities are built once up front
# instead of being recomputed on every iteration.
xenc = F.one_hot(xs, num_classes=27).float() # input to the network: one-hot encoding
example_idx = torch.arange(num)

for k in range(NUM_STEPS):

  # forward pass
  logits = xenc @ W # predict log-counts

  # softmax implementation
  counts = logits.exp()
  probs = counts / counts.sum(1, keepdims=True) # probabilities for next character

  # negative log likelihood loss + L2 regularization:
  # probs[example_idx, ys] is the probability the model assigned to the
  # character that actually came next in each training example.
  loss = -probs[example_idx, ys].log().mean() + REG_STRENGTH*(W**2).mean()
  print(loss.item())

  # backward pass
  W.grad = None # reset the gradient so it does not accumulate across steps
  loss.backward()

  # update: done under no_grad so the in-place step is not recorded by
  # autograd (preferred over touching W.data, which bypasses autograd's
  # safety checks).
  with torch.no_grad():
    W += -LEARNING_RATE * W.grad

3.7106027603149414
3.37222957611084
3.156460762023926
3.0200746059417725
2.9315707683563232
2.867912530899048
2.819028377532959
2.77999210357666
2.747936248779297
2.7210605144500732
2.6981966495513916
2.6785390377044678
2.6615073680877686
2.6466641426086426
2.633666753768921
2.6222383975982666
2.6121466159820557
2.603194236755371
2.59521484375
2.5880656242370605
2.5816283226013184
2.575803279876709
2.570507526397705
2.565673351287842
2.561244249343872
2.5571727752685547
2.5534188747406006
2.5499484539031982
2.5467326641082764
2.5437464714050293
2.540968656539917
2.5383784770965576
2.5359609127044678
2.5336992740631104
2.5315802097320557
2.5295932292938232
2.5277259349823
2.5259697437286377
2.524315595626831
2.5227551460266113
2.5212819576263428
2.5198893547058105
2.5185706615448
2.5173213481903076
2.516136407852173
2.5150113105773926
2.5139412879943848
2.512923240661621
2.511953592300415
2.511029005050659
2.5101468563079834
2.5093040466308594
2.508498191833496
2.50772762298584
2.506989

In [12]:
# finally, sample from the 'neural net' model
# A seeded generator makes the sampled names reproducible.
g = torch.Generator().manual_seed(345678987654)

# Sampling is pure inference: no_grad stops autograd from building a graph
# through W for every generated character. The sampled values are unaffected.
with torch.no_grad():
  for i in range(10):

    out = []
    ix = 0 # start from the '.' marker (id 0)
    while True:
      xenc = F.one_hot(torch.tensor([ix]), num_classes=27).float()
      logits = xenc @ W # predict log-counts
      counts = logits.exp() # counts
      p = counts / counts.sum(1, keepdims=True) # probabilities for next character

      # draw the next character id from the predicted distribution
      ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
      out.append(itos[ix])
      if ix == 0: # sampled the '.' marker again: the name is finished
        break
    print(''.join(out))

lee.
autinaushan.
n.
be.
ahara.
alona.
arillain.
ha.
chiursh.
li.
