In [14]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read the list of names: each line of names.txt becomes one entry of `words`.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale encoding.
with open('names.txt', 'r', encoding='utf-8') as f:
    words = f.read().splitlines()

In [3]:
# Number of names read from the file
len(words)

32033

In [9]:
# Character vocabulary: every distinct character found in the names gets an
# integer id starting at 1; id 0 is assigned to the '.' character.
chars = sorted(set(''.join(words)))
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
# Reverse lookup: integer id -> character
itos = {idx: ch for ch, idx in stoi.items()}
print(itos)

{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [11]:
# Assemble the training examples: each row of X is a window of `block_size`
# character ids and the matching entry of Y is the id of the character that
# follows that window.
block_size = 3    # number of preceding characters used to predict the next one
X = []
Y = []

for w in words[:5]:
    print(f"word is: {w}")
    # every word starts from a window filled with id 0 (the '.' character)
    context = [0 for _ in range(block_size)]
    for ch in w + '.':
        ix = stoi[ch]
        X.append(context)
        Y.append(ix)
        print(f"X is: {X}::Y is: {Y}")

        window_text = ''.join([itos[i] for i in context])
        print(window_text, '--->', itos[ix])
        # Slide the window: drop the oldest id and append the newest one.
        # A new list is built, so the row already stored in X is not modified.
        context = [*context[1:], ix]
        print(f"Updated Context is: {context}")

word is: emma
X is: [[0, 0, 0]]::Y is: [5]
... ---> e
Updated Context is: [0, 0, 5]
X is: [[0, 0, 0], [0, 0, 5]]::Y is: [5, 13]
..e ---> m
Updated Context is: [0, 5, 13]
X is: [[0, 0, 0], [0, 0, 5], [0, 5, 13]]::Y is: [5, 13, 13]
.em ---> m
Updated Context is: [5, 13, 13]
X is: [[0, 0, 0], [0, 0, 5], [0, 5, 13], [5, 13, 13]]::Y is: [5, 13, 13, 1]
emm ---> a
Updated Context is: [13, 13, 1]
X is: [[0, 0, 0], [0, 0, 5], [0, 5, 13], [5, 13, 13], [13, 13, 1]]::Y is: [5, 13, 13, 1, 0]
mma ---> .
Updated Context is: [13, 1, 0]
word is: olivia
X is: [[0, 0, 0], [0, 0, 5], [0, 5, 13], [5, 13, 13], [13, 13, 1], [0, 0, 0]]::Y is: [5, 13, 13, 1, 0, 15]
... ---> o
Updated Context is: [0, 0, 15]
X is: [[0, 0, 0], [0, 0, 5], [0, 5, 13], [5, 13, 13], [13, 13, 1], [0, 0, 0], [0, 0, 15]]::Y is: [5, 13, 13, 1, 0, 15, 12]
..o ---> l
Updated Context is: [0, 15, 12]
X is: [[0, 0, 0], [0, 0, 5], [0, 5, 13], [5, 13, 13], [13, 13, 1], [0, 0, 0], [0, 0, 15], [0, 15, 12]]::Y is: [5, 13, 13, 1, 0, 15, 12, 9]
.ol 

In [15]:
# Convert the Python lists into tensors. torch.as_tensor builds the tensor
# from a list just like torch.tensor does, but returns an existing tensor
# unchanged, so re-running this cell (where X and Y are already tensors)
# neither copies the data nor triggers the copy-construct warning.
X = torch.as_tensor(X)
Y = torch.as_tensor(Y)

In [16]:
# Inspect the shape and dtype of the inputs X and the labels Y
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([32, 3]), torch.int64, torch.Size([32]), torch.int64)

## Learnings so far:

- Character set is extracted from all words. This serves as two lookups - string to index: stoi and index to string: itos
- A context window is decided upon. We are using 3. Given three characters, predict what is the next one
- This sliding context window data is created from the original names list. Alongside, for each context window, the label/output Y is also generated, which is the next character
- eg: for the first five words, 32 rows of such size-3 context windows are generated in this case. Hence, X.shape is [32, 3]
- Now, we want to represent these individual 27 characters in a lower-dimensional embedding space (27 characters -> 2 dimensions) for this example. C -> 27 rows and 2 columns
- We then will build a neural network which takes these Xs and predicts the Y
- 

In [17]:
# Embedding table C: one row per character (27 rows), each holding 2 randomly
# initialised values
C = torch.randn(27, 2)
C

tensor([[-1.6227, -0.2779],
        [-0.1320,  1.0068],
        [-0.2130, -0.3723],
        [-0.6357,  0.8559],
        [-0.1941,  0.3481],
        [-1.1056,  0.3212],
        [ 0.4088, -0.9490],
        [-0.1770, -0.0271],
        [-0.6720, -0.9016],
        [ 1.8713, -0.9765],
        [ 1.7681,  1.1848],
        [ 0.2720, -0.7867],
        [-0.5769, -0.5816],
        [ 0.7620,  0.4512],
        [-0.8475, -0.7943],
        [ 0.8001, -1.1426],
        [ 0.9964, -0.8215],
        [ 0.7685,  1.1401],
        [ 0.0064, -1.0963],
        [-0.1820,  0.6712],
        [-0.8513, -2.3924],
        [ 0.3257,  0.5558],
        [-0.3931,  0.6683],
        [ 0.5250, -0.9097],
        [ 0.1942,  0.1418],
        [-0.8426, -0.9767],
        [-0.8494, -0.0673]])

In [23]:
# There are two equivalent ways to fetch a character's embedding vector from
# the C matrix (character indices run from 0 to 26):
#
# 1. Directly -> character 'e' -> index 5 -> C[5]
# 2. Through one-hot encoding -> F.one_hot(torch.tensor(5), num_classes=27) is
#    a 27-dimensional vector of zeros with a single 1 at index 5; cast to float
#    and matrix-multiplied with C it picks out row 5.
# Both produce the same vector, as the printed output shows.
direct_row = C[5]
print(f"Direct C[5]:: {direct_row}")
one_hot_row = F.one_hot(torch.tensor(5), num_classes=27).float() @ C
print(f"Indirect OneHotEncoded:: {one_hot_row}")

Direct C[5]:: tensor([-1.1056,  0.3212])
Indirect OneHotEncoded:: tensor([-1.1056,  0.3212])


In [24]:
# X is looked up in the embedding matrix C -> to get the embeddings for the entire X in one go
# (indexing C with the integer tensor X replaces every index in X by its row of C)
emb = C[X]
emb.shape

torch.Size([32, 3, 2])

In [27]:
# Embeddings of the first example only; slicing with 0:1 keeps the leading dimension
emb[0:1]

tensor([[[-1.6227, -0.2779],
         [-1.6227, -0.2779],
         [-1.6227, -0.2779]]])

In [30]:
# Layer 1 parameters, randomly initialised.
# Key points:
# 1. Number of inputs = 3 x 2 = 6 -> three context characters, each with a
#    2-dimensional embedding
# 2. Number of neurons in the layer is a choice we have to make - using 100 here
W1 = torch.randn(6, 100)
b1 = torch.randn(100)

# The operation we now want to perform is emb @ W1 + b1.
# However, the dimensions of emb (32, 3, 2) do not match W1 (6, 100): the
# trailing (3, 2) of emb has to be turned into 6.
# E.g. the context [0, 0, 0] has been converted to its individual elements' embeddings
# [[-1.6227, -0.2779], [-1.6227, -0.2779], [-1.6227, -0.2779]]
# and those three pairs need to sit side by side in one row of 6 numbers.
# Pull out the embeddings at each of the three context positions
# (emb[:, 0, :], emb[:, 1, :], emb[:, 2, :]) and concatenate them along dimension 1.
per_position = [emb[:, k] for k in range(3)]
torch.cat(per_position, dim=1).shape


torch.Size([32, 6])