In [30]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt # for visualising

In [31]:
# read in all the words, one entry per line of data.txt
with open("data.txt", encoding="utf-8") as f:  # context manager closes the file handle
    words = f.read().splitlines()
# strip surrounding whitespace first and only then drop blanks, so that
# whitespace-only lines do not survive as empty words
words = [w for w in (line.strip() for line in words) if w]

In [32]:
len(words) # number of entries kept in `words` after loading data.txt

905

In [33]:
# build the character vocabulary and the char <-> integer lookup tables
chars = sorted(set(''.join(words))) # every distinct character, in sorted order
lib_len = len(chars) # number of distinct characters, counted before '|' is added below
stoi = dict(zip(chars, range(1, lib_len + 1))) # characters take indices 1..lib_len
stoi['|'] = 0 # index 0 is given to the '|' marker
itos = {index: symbol for symbol, index in stoi.items()} # inverse of stoi
print(itos)

{1: '-', 2: 'a', 3: 'b', 4: 'c', 5: 'd', 6: 'e', 7: 'f', 8: 'g', 9: 'h', 10: 'i', 11: 'j', 12: 'k', 13: 'l', 14: 'm', 15: 'n', 16: 'o', 17: 'p', 18: 'q', 19: 'r', 20: 's', 21: 't', 22: 'u', 23: 'v', 24: 'w', 25: 'x', 26: 'y', 27: 'z', 28: 'é', 29: 'ë', 30: 'í', 31: 'ó', 32: 'ú', 33: 'û', 34: '’', 0: '|'}


In [34]:
# building the dataset: (context window -> next character) pairs

block_size = 3 # context length: how many characters we take to predict the next one
dim_size = 2 # second dimension of the embedding table C created in a later cell
X_rows, Y_labels = [], [] # plain Python lists, converted to tensors after the loop

for w in words[:5]: # only the first five words, so the printed trace stays short
    print(w)
    context = [0] * block_size # start from a window filled with index 0 ('|')

    for ch in w + '|': # the appended '|' becomes the final target of the word
        ix = stoi[ch]
        X_rows.append(context)
        Y_labels.append(ix)
        window_text = ''.join(itos[i] for i in context)
        print(window_text, '--->', itos[ix])
        context = [*context[1:], ix] # slide the window: drop the oldest index, append the newest

X = torch.tensor(X_rows)
Y = torch.tensor(Y_labels)

aego
||| ---> a
||a ---> e
|ae ---> g
aeg ---> o
ego ---> |
aelrindel
||| ---> a
||a ---> e
|ae ---> l
ael ---> r
elr ---> i
lri ---> n
rin ---> d
ind ---> e
nde ---> l
del ---> |
aerendyl
||| ---> a
||a ---> e
|ae ---> r
aer ---> e
ere ---> n
ren ---> d
end ---> y
ndy ---> l
dyl ---> |
aeson
||| ---> a
||a ---> e
|ae ---> s
aes ---> o
eso ---> n
son ---> |
afamrail
||| ---> a
||a ---> f
|af ---> a
afa ---> m
fam ---> r
amr ---> a
mra ---> i
rai ---> l
ail ---> |


In [35]:
# sanity check: shapes and dtypes of the context tensor X and the target tensor Y
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([39, 3]), torch.int64, torch.Size([39]), torch.int64)

In [36]:
# Embedding lookup table: one dim_size-wide row per token index.
# stoi assigns indices 0..lib_len ('|' is 0, the characters are 1..lib_len), so
# the table needs lib_len + 1 rows. With fewer rows, C[X] raises an IndexError
# as soon as the highest-indexed characters show up in the dataset.
C = torch.randn((lib_len + 1, dim_size))

In [37]:
# index C with the integer tensor X: every index in X is replaced by its row of C,
# so the result has X's shape with one extra trailing dimension (the row width of C)
emb = C[X]
emb.shape

torch.Size([39, 3, 2])

In [38]:
# hidden layer parameters: block_size * dim_size inputs -> 100 units
W1 = torch.randn(block_size * dim_size, 100) # weights 1 (randomly initialised)
b1 = torch.randn((100,)) # bias 1

In [39]:
# flatten each example's block_size embeddings into a single row vector
emb_flat = emb.view(-1, block_size * dim_size)
h = (emb_flat @ W1 + b1).tanh() # hidden layer
h

tensor([[ 0.7911, -0.5798,  0.2342,  ...,  0.9349,  0.5874,  0.7574],
        [ 0.9508,  0.9424, -0.9995,  ...,  0.8405,  0.7504, -0.8916],
        [ 0.9863, -0.9988,  0.9836,  ...,  0.1378, -0.8979,  0.8807],
        ...,
        [ 0.9811,  0.7440, -0.9996,  ...,  0.9654,  0.8288, -0.8440],
        [ 1.0000, -0.8650,  0.9999,  ...,  0.9915, -0.9994, -0.0020],
        [-0.9995,  0.9997, -0.9977,  ..., -0.9263,  1.0000,  0.9986]])