In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

In [2]:
words = open("names.txt", mode='r').read().split()
print(words[:10])

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia', 'harper', 'evelyn']


In [5]:
# create a mapping to/from characters and integers  

chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i, s in enumerate(chars)}
stoi["."] = 0
itos = {i:s for s, i in stoi.items()}
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [207]:
# create dataset 

block_size = 3 
X, Y = [], []

for word in words:
    context = [0] * block_size
    
    for ch in word + '.':
        X.append(context)
        ix = stoi[ch]
        Y.append(ix)
        context = context[1:] + [ix]

# check that the dataset is correct 
for x, y in zip(X[:12],Y[:12]):
    print(''.join(itos[ix] for ix in x), end="")
    print(" predicts", itos[y])

# convert dataset to tensors 
X = torch.tensor(X)
Y = torch.tensor(Y)

... predicts e
..e predicts m
.em predicts m
emm predicts a
mma predicts .
... predicts o
..o predicts l
.ol predicts i
oli predicts v
liv predicts i
ivi predicts a
via predicts .


In [223]:
X.shape, Y.shape

(torch.Size([228146, 3]), torch.Size([228146]))

In [224]:
#initialized weights and biases

g = torch.Generator().manual_seed(2147483647)
C = torch.randn((27,2), generator=g)
W1 = torch.randn((6, 100), generator=g)
b1 = torch.randn((100), generator=g)
W2 = torch.randn((100, 27), generator=g)
b2 = torch.randn((27), generator=g)
parameters = [C, W1, b1, W2, b2]
for p in parameters:
    p.requires_grad = True

In [227]:
for _ in range(1000):

    mini_batch_ix = torch.randint(0, X.shape[0], (32,))

    # forward pass
    emb = C[X[mini_batch_ix]]
    h = torch.tanh(emb.view(-1,6) @ W1 + b1)
    logits = h @ W2 + b2
    #counts = logits.exp()
    #prob = counts / counts.sum(1, keepdims=True)
    
    # backward pass 
    
    W1.grad = None
    W2.grad = None 
    loss = F.cross_entropy(logits, Y[mini_batch_ix])
    print(loss.item())
    
    for p in parameters:
        p.grad = None
    
    loss.backward()
    
    for p in parameters: 
        p.data += -0.01 * p.grad

2.38461971282959
2.5741660594940186
2.4802987575531006
2.4038054943084717
2.586205005645752
2.568932294845581
2.7873167991638184
2.3976540565490723
2.339299440383911
2.694056987762451
2.4467921257019043
2.428407669067383
2.4735748767852783
2.8345396518707275
2.609112024307251
2.804691791534424
2.310946464538574
2.2887086868286133
2.385289430618286
2.635430335998535
2.851534128189087
2.670628070831299
2.435506582260132
2.5619864463806152
2.2639975547790527
2.4767494201660156
2.444340229034424
2.3209140300750732
2.841078281402588
2.2808480262756348
2.440336227416992
2.4084415435791016
2.769502878189087
2.747025966644287
2.4086050987243652
2.3636178970336914
3.089571475982666
2.516012668609619
2.9065427780151367
2.870877265930176
2.5580201148986816
2.436622381210327
2.5494680404663086
2.582615852355957
2.717586040496826
2.7399446964263916
2.490036725997925
2.36069655418396
2.427973747253418
2.4738268852233887
2.213611602783203
2.512845277786255
2.8238754272460938
2.3744351863861084
2.4872