In [23]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for visualising

In [24]:
# read in all the words
words = open("data.txt", encoding="utf-8").read().splitlines()
words = [i.strip() for i in words if i]

In [25]:
len(words)

905

In [26]:
# build the vocabulary of characters and mappings to /from integers
chars = sorted(list(set(''.join(words)))) # getting the character library
lib_len = len(chars)
stoi = {val:key+1 for key, val in enumerate(chars)}
stoi['|'] = 0
itos = {ind:st for st, ind in stoi.items()}
print(itos)

{1: '-', 2: 'a', 3: 'b', 4: 'c', 5: 'd', 6: 'e', 7: 'f', 8: 'g', 9: 'h', 10: 'i', 11: 'j', 12: 'k', 13: 'l', 14: 'm', 15: 'n', 16: 'o', 17: 'p', 18: 'q', 19: 'r', 20: 's', 21: 't', 22: 'u', 23: 'v', 24: 'w', 25: 'x', 26: 'y', 27: 'z', 28: 'é', 29: 'ë', 30: 'í', 31: 'ó', 32: 'ú', 33: 'û', 34: '’', 0: '|'}


In [27]:
# building the dataset

block_size = 3 # context length: how many characters we take to predict the next one
dim_size = 2
X, Y = [], []

for w in words[:5]:
    print(w)
    context = [0] * block_size

    for ch in w + '|':
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        print(''.join(itos[i] for i in context), '--->', itos[ix])
        context = context[1:] + [ix] # crop and append

X = torch.tensor(X)
Y = torch.tensor(Y)

aego
||| ---> a
||a ---> e
|ae ---> g
aeg ---> o
ego ---> |
aelrindel
||| ---> a
||a ---> e
|ae ---> l
ael ---> r
elr ---> i
lri ---> n
rin ---> d
ind ---> e
nde ---> l
del ---> |
aerendyl
||| ---> a
||a ---> e
|ae ---> r
aer ---> e
ere ---> n
ren ---> d
end ---> y
ndy ---> l
dyl ---> |
aeson
||| ---> a
||a ---> e
|ae ---> s
aes ---> o
eso ---> n
son ---> |
afamrail
||| ---> a
||a ---> f
|af ---> a
afa ---> m
fam ---> r
amr ---> a
mra ---> i
rai ---> l
ail ---> |


In [28]:
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([39, 3]), torch.int64, torch.Size([39]), torch.int64)

In [29]:
h_neuron_count = 100
g = torch.Generator().manual_seed(412987498012)
C = torch.randn((lib_len, dim_size), generator=g)
W1 = torch.randn((block_size * dim_size, h_neuron_count), generator=g) # weights
b1 = torch.randn(h_neuron_count, generator=g) # biases
W2 = torch.randn((h_neuron_count, lib_len), generator=g) # weights for output layer
b2 = torch.randn(lib_len, generator=g) # biases for output layer
parameters = [C, W1, b1, W2, b2]

In [30]:
sum(p.nelement() for p in parameters) # parameter count

4202

In [38]:
# forward pass
emb = C[X]
h = torch.tanh(emb.view(-1, block_size * dim_size) @ W1 + b1) # hidden layer
logits = h @ W2 + b2
# counts = logits.exp()
# probs = counts / counts.sum(1, keepdims=True)
# loss = -probs[torch.arange(len(Y)), Y].log().mean()
loss = F.cross_entropy(logits, Y) # More readable and efficient way to calculate loss
print(loss)

tensor(15.6207)
