In [1]:
import torch

words = open('names.txt', 'r').read().splitlines()
chars = sorted(list(set("".join(words))))
stoi  = {s:i+1 for i, s in enumerate(chars)}
stoi['.'] = 0
itos  = {i:s for s,i in stoi.items()}

In [2]:
# creating sample inputs and preds from raw data
context_window = 3

X, Y = [], []
for word in words[:5]:
    context = [0] * context_window
    for char in word + '.':
        itos_context= [itos[i] for i in context]
        stoi_char = stoi[char]
        X.append(context)
        Y.append(stoi_char)
        # print(f"for context {''.join(itos_context)} ===> {char}")
        context = context[1:] + [stoi[char]]
        # print(char)
        
X = torch.tensor(X)
Y = torch.tensor(Y)

print(f"shape for input: {X.shape} ===> Targe shape {Y.shape}")

shape for input: torch.Size([32, 3]) ===> Targe shape torch.Size([32])


In [3]:
## creating the embedding matrix. think of representing each letter with 2 values instead of 1 like one hot. here 2 is embedding size
embedding_size = 2
C = torch.randn((len(stoi), embedding_size))
C.shape

torch.Size([27, 2])

In [4]:
# multiple the input with the embedding matrix. effectively we are seeing input integet representing char in each input and querying for its embedding value.
# for 32 examples, with 3 chars/words each, and each char/words represented by 2 values, we get (32, 3, 2)

C[X].shape

torch.Size([32, 3, 2])

In [5]:
emb = C[X]

# we can stack each of the 3 char/words together. we can also flatten its embedding into a 1d vector. 
# so we are getting 3 1d vectors of length 2 and concating them into single vector giving us (32, 6)

emb = emb.view(-1, context_window * embedding_size)
emb.shape

torch.Size([32, 6])

In [7]:
# manually creating the weights and biases for the model [3 words each being represented by 2 values * 1 layer of 100 neurons * 1 output layer of 27 classes]

g = torch.Generator().manual_seed(2147483647)
embedding_size = 2
C = torch.randn((len(stoi), embedding_size))
W1 = torch.randn((embedding_size * context_window, 100), generator=g)
B1 = torch.randn((100), generator=g)
W2 = torch.randn((100, len(stoi)), generator=g)
B2 = torch.randn((len(stoi)), generator=g)

In [10]:
emb = C[X]
h = torch.tanh(emb.view(-1, context_window * embedding_size) @ W1 + B1)
logits = h @ W2 + B2
logits.shape

torch.Size([32, 27])

In [11]:
emb = C[X]
h = torch.tanh(emb.view(-1, context_window * embedding_size) @ W1 + B1)
logits = h @ W2 + B2
counts = logits.exp()
probs = counts / counts.sum(-1, keepdim=True)
probs.shape

torch.Size([32, 27])

In [23]:
probs[torch.arange(len(Y)), Y].shape

torch.Size([32])

In [24]:
# this is indexing into the probability of each predicted for the correct class.
# torch.range(1, len(Y)).long() this gives us a list from 1 to len(input) and Y gives us the correct class in form of integer for each input
# using the combination of above we can index into what the probabilities are for the correct class for each input in our models output
print("probs for 32 inputs at correct prediction ")
probs[torch.arange(len(Y)), Y]

probs for 32 inputs at correct prediction 


tensor([1.3125e-09, 5.3484e-14, 3.6140e-08, 3.9006e-08, 4.6471e-08, 8.7818e-14,
        6.2721e-07, 3.0474e-09, 9.0487e-09, 1.3605e-07, 2.0612e-01, 1.2362e-10,
        6.2226e-07, 6.2992e-09, 8.1582e-08, 8.0139e-10, 9.0715e-10, 2.8441e-11,
        5.5209e-05, 1.4619e-12, 5.5277e-11, 8.2960e-06, 5.6776e-06, 3.0789e-06,
        2.1748e-09, 1.3100e-10, 4.8137e-14, 6.3812e-10, 4.6483e-02, 8.0428e-03,
        6.2098e-04, 1.7615e-07])

In [25]:
loss= -probs[torch.arange(len(Y)), Y].log().mean()
loss

tensor(17.5882)