In [82]:
import torch
import string
import torch.nn.functional as F
import torch.nn as nn

In [1]:
# Load the training names, one per line. The context manager guarantees the
# file handle is closed instead of lingering until garbage collection.
with open('names.txt', 'r') as names_file:
    words = names_file.read().splitlines()

In [3]:
# Preview the first five entries to sanity-check that the file was split into names.
words[:5]

['emma', 'olivia', 'ava', 'isabella', 'sophia']

In [34]:
# Lookup tables between integer ids and characters: the lowercase letters take
# ids 1..26 and '.' (used as the start/end-of-word marker) takes id 0.
i2x = dict(enumerate(string.ascii_lowercase, start=1))
i2x[0] = '.'
# Inverse mapping: character -> id.
x2i = {ch: idx for idx, ch in i2x.items()}

## Bigram
### Experimenting with 1 word

In [68]:
# Collect (current char id, next char id) pairs from the first word only,
# padding the word with '.' on both sides to mark its start and end.
xs, ys = [], []
for name in words[:1]:
    chars = ['.', *name, '.']
    for cur, nxt in zip(chars, chars[1:]):
        xs.append(x2i[cur])
        ys.append(x2i[nxt])

# Convert the id lists to tensors for one-hot encoding and indexing.
xs = torch.tensor(xs)
ys = torch.tensor(ys)

In [71]:
# One-hot encode the input ids and target ids over the 27-symbol vocabulary,
# cast to float so they can be multiplied with the weight matrix.
X_train, y_train = (
    F.one_hot(idx, num_classes=27).float() for idx in (xs, ys)
)

In [128]:
# Seeded generator so the random initial weights are reproducible across runs.
g = torch.Generator()
g.manual_seed(2147483647)
# 27 x 27 weight matrix drawn from a standard normal; gradients are tracked so
# it can be trained.
W = torch.randn((27, 27), generator=g, requires_grad=True)

In [137]:
# Logits: multiply the one-hot inputs by the weight matrix, then show the shape.
out = torch.matmul(X_train, W)
out.shape

torch.Size([5, 27])

In [138]:
# Turn the logits into per-row probability distributions, then score the model
# by the average negative log-likelihood it assigns to the true next characters.
softmax = nn.Softmax(dim=1)   # every row should sum up to 1
probs = softmax(out)
# Derive the row indices from the number of targets instead of hard-coding 5,
# so this cell keeps working when the slice of words used to build `ys` changes.
row_idx = torch.arange(ys.shape[0])
loss = -probs[row_idx, ys].log().mean()   # negative mean log likelihood
loss

tensor(3.3930, grad_fn=<NegBackward0>)

In [136]:
# Take a single Adam optimisation step on W using the current `loss`.
optim = torch.optim.Adam([W], lr=0.1)
optim.zero_grad()   # clear any gradient already stored on W
loss.backward()     # backpropagate from the loss to populate W.grad
optim.step()        # apply the Adam update to W

##### Naive Gradient Descent #######
# Manual alternative to the optimizer calls above, kept for reference
# (plain gradient descent with a step size of 0.1):
# W.grad = None
# loss.backward()
# W.data -= 0.1*W.grad

### Using all inputs

In [227]:
# prepping the data
# Build (current char id, next char id) pairs over every name, padding each
# name with '.' at both ends to mark its start and end.
xs, ys = [], []
for name in words:
    chars = ['.', *name, '.']
    for cur, nxt in zip(chars, chars[1:]):
        xs.append(x2i[cur])
        ys.append(x2i[nxt])

xs = torch.tensor(xs)
ys = torch.tensor(ys)

# One-hot encodings over the 27-symbol vocabulary.
X_train = F.one_hot(xs, num_classes=27).float()
y_train = F.one_hot(ys, num_classes=27).float()

# Fresh, reproducibly seeded weights plus the softmax layer and optimizer used
# by the training loop.
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((27, 27), generator=g, requires_grad=True)
softmax = nn.Softmax(dim=1)
optim = torch.optim.Adam([W], lr=0.1)

print(X_train.shape)

torch.Size([228146, 27])


In [228]:
# gradient descent
# Run 40 full-batch Adam steps on W and report the final loss.
for _ in range(40):

    # forward pass
    out = X_train @ W
    # cross_entropy fuses log-softmax with the mean negative log-likelihood of
    # the target ids. It is mathematically the same quantity as
    # -softmax(out)[arange(N), ys].log().mean(), but avoids log(0) when a
    # probability underflows and avoids rebuilding an index tensor every step.
    nll = F.cross_entropy(out, ys)
    loss = nll + 0.01*(W**2).mean()   # add an L2 regularization term

    # backward pass
    optim.zero_grad()
    loss.backward()
    optim.step()

print(loss.item())

2.497866153717041


In [245]:
# sampling from the model
g = torch.Generator().manual_seed(2147483647)

# Sampling only reads W, so autograd is switched off: otherwise every matmul
# with W (which has requires_grad=True) records a graph that is never used.
with torch.no_grad():
    for _ in range(5):
        name = []
        i = 0   # id 0 is '.', the start-of-word marker
        while True:
            # use the trained weights to make predictions
            x = F.one_hot(torch.tensor([i]), num_classes=27).float()
            out = x @ W
            probs = softmax(out)
            # draw the next character id from the predicted distribution
            i = torch.multinomial(probs, num_samples=1, replacement=True, generator=g).item()
            name.append(i2x[i])
            if i == 0:   # sampled '.', which ends the word
                break
        print(''.join(name))

mor.
axx.
minaynnnyles.
kondoderaea.
memumizarie.


## Trigram

In [268]:
# prepping the data
# Each example maps two consecutive character ids to the id that follows them.
# Names are padded with two leading '.' tokens and one trailing '.'.
x1s, x2s, ys = [], [], []
for name in words:
    chars = ['.', '.', *name, '.']
    for first, second, target in zip(chars, chars[1:], chars[2:]):
        x1s.append(x2i[first])
        x2s.append(x2i[second])
        ys.append(x2i[target])

x1s = torch.tensor(x1s)
x2s = torch.tensor(x2s)
ys = torch.tensor(ys)

# One-hot encode both context positions and join them side by side.
X1_train = F.one_hot(x1s, num_classes=27).float()
X2_train = F.one_hot(x2s, num_classes=27).float()
X_train = torch.column_stack((X1_train, X2_train))   # X_train shape: N x 54
y_train = F.one_hot(ys, num_classes=27).float()

# Fresh, reproducibly seeded weights plus the softmax layer and optimizer used
# by the training loop.
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((54, 27), generator=g, requires_grad=True)  # matrix shape: 54 x 27 -> outputs still have 27 classes
softmax = nn.Softmax(dim=1)
optim = torch.optim.Adam([W], lr=0.6)

print(X_train.shape)

torch.Size([228146, 54])


In [269]:
# gradient descent
# Run 40 full-batch Adam steps on W, printing the loss after every step.
for _ in range(40):

    # forward pass
    out = X_train @ W
    # cross_entropy fuses log-softmax with the mean negative log-likelihood of
    # the target ids. It is mathematically the same quantity as
    # -softmax(out)[arange(N), ys].log().mean(), but avoids log(0) when a
    # probability underflows and avoids rebuilding an index tensor every step.
    nll = F.cross_entropy(out, ys)
    loss = nll + 0.01*(W**2).mean()   # add an L2 regularization term

    # backward pass
    optim.zero_grad()
    loss.backward()
    optim.step()

    print(loss.item())

4.242240905761719
3.1926229000091553
2.728522300720215
2.5960476398468018
2.5908966064453125
2.6151044368743896
2.6266603469848633
2.6185386180877686
2.6052286624908447
2.587240219116211
2.5617809295654297
2.534749746322632
2.5137593746185303
2.500807762145996
2.491429328918457
2.4801177978515625
2.465301513671875
2.4496428966522217
2.436999797821045
2.4292776584625244
2.424954891204834
2.421311616897583
2.4167377948760986
2.4111599922180176
2.405439853668213
2.40049147605896
2.3964712619781494
2.3929433822631836
2.3893401622772217
2.3855104446411133
2.381984233856201
2.3793246746063232
2.37713885307312
2.374643564224243
2.371906042098999
2.3695249557495117
2.367680788040161
2.3661837577819824
2.3648431301116943
2.363551616668701


In [270]:
# sampling from the model
g = torch.Generator().manual_seed(2147483647)

# Sampling only reads W, so autograd is switched off: otherwise every matmul
# with W (which has requires_grad=True) records a graph that is never used.
with torch.no_grad():
    for _ in range(20):
        name = []
        # Context starts as two '.' tokens (id 0), matching the training padding.
        i = 0
        j = 0
        while True:
            x1 = F.one_hot(torch.tensor([i]), num_classes=27).float()
            x2 = F.one_hot(torch.tensor([j]), num_classes=27).float()
            x = torch.column_stack((x1, x2))  # shape: 1 x 54
            out = x @ W
            probs = softmax(out)
            # draw the next character id from the predicted distribution
            k = torch.multinomial(probs, num_samples=1, replacement=True, generator=g).item()
            name.append(i2x[k])
            if k == 0:   # sampled '.', which ends the word
                break
            # slide the two-character context window forward by one
            i, j = j, k
        print(''.join(name))

mor.
ays.
minayloryeer.
klini.
hokad.
menseny.
rie.
pahiilaia.
elon.
hamirierien.
elyonn.
ga.
ta.
celyn.
ilan.
lumioh.
mije.
ai.
ea.
jijanca.


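The trigram model ends at a slightly lower training loss than the bigram model (about 2.36 vs. 2.50 after 40 steps), but the sampled names are only marginally better.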