In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read in all the names (one per line) as a list of strings.
# The context manager closes the file handle as soon as the read finishes,
# instead of leaving it open until the object happens to be garbage collected.
# The encoding is stated explicitly so the result does not depend on the
# platform's default encoding.
with open("names.txt", "r", encoding="utf-8") as names_file:
    words = names_file.read().splitlines()
# Preview the first few names
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [3]:
# Report how many names were loaded
print(f"{len(words)} words")

32033 words


In [4]:
# Character vocabulary: every distinct character that appears in any name,
# in alphabetical order (joining all names into one string lets set() deduplicate them)
chars = sorted(set("".join(words)))

# String to integer: the characters found in the names are numbered from 1,
# which leaves index 0 free for the special "." character
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi["."] = 0
stoi

{'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26,
 '.': 0}

In [5]:
# Integer to string: the inverse lookup of `stoi`
itos = {idx: ch for ch, idx in stoi.items()}
itos

{1: 'a',
 2: 'b',
 3: 'c',
 4: 'd',
 5: 'e',
 6: 'f',
 7: 'g',
 8: 'h',
 9: 'i',
 10: 'j',
 11: 'k',
 12: 'l',
 13: 'm',
 14: 'n',
 15: 'o',
 16: 'p',
 17: 'q',
 18: 'r',
 19: 's',
 20: 't',
 21: 'u',
 22: 'v',
 23: 'w',
 24: 'x',
 25: 'y',
 26: 'z',
 0: '.'}

## Build the Dataset

In [7]:
# Context length: the number of preceding characters used to predict the next one
block_size = 3

# X collects the network inputs (contexts of `block_size` character indices),
# Y collects the matching labels (the index of the character that follows)
X, Y = [], []

# Slide a window of `block_size` characters over each name, emitting one
# (context -> next character) example per position
for word in words[:5]:
    print(word)

    # Every name starts from a context made entirely of "." padding
    context = [stoi["."] for _ in range(block_size)]
    # The trailing "." makes sure the last characters of the name
    # also appear in the dataset, with "." as the final label
    for ch in word + ".":
        # Index of the character to be predicted
        ix = stoi[ch]

        # Record the example
        X.append(context)
        Y.append(ix)

        # Show the example in readable form
        print(f"{''.join(itos[i] for i in context)} ---> {itos[ix]}")

        # Slide the window: drop the oldest character and append the newest one.
        # A new list is built each time, so the lists already stored in X are not modified.
        context = [*context[1:], ix]

# Convert the collected examples to tensors
X = torch.tensor(X)  # inputs
Y = torch.tensor(Y)  # labels

emma
... ---> e
..e ---> m
.em ---> m
emm ---> a
mma ---> .
olivia
... ---> o
..o ---> l
.ol ---> i
oli ---> v
liv ---> i
ivi ---> a
via ---> .
ava
... ---> a
..a ---> v
.av ---> a
ava ---> .
isabella
... ---> i
..i ---> s
.is ---> a
isa ---> b
sab ---> e
abe ---> l
bel ---> l
ell ---> a
lla ---> .
sophia
... ---> s
..s ---> o
.so ---> p
sop ---> h
oph ---> i
phi ---> a
hia ---> .


In [8]:
# One row per training example; each row holds `block_size` character indices
print(f"Input shape: {X.shape}, {X.dtype}")

Input shape: torch.Size([32, 3]), torch.int64


In [9]:
# One label per example: the integer index of the next character,
# which is one of 27 possible characters
print(f"Labels shape: {Y.shape}, {Y.dtype}")

Labels shape: torch.Size([32]), torch.int64


## Building the Embedding Table `C`
In the paper, 17,000 possible words are crammed into a 30-dimensional space.
We have 27 possible characters, so let's cram them into a 2-dimensional space.

In [14]:
# 27 possible characters, 2-dimensional space:
# each character (row index = its integer from `stoi`) has a 2-dimensional embedding.
# A dedicated, seeded generator makes the random initialisation reproducible,
# so re-running this cell (or Restart & Run All) always yields the same table
# and the outputs of the cells that index into `C` stay consistent with it.
emb_gen = torch.Generator().manual_seed(2147483647)
C = torch.randn((27, 2), generator=emb_gen)
C

tensor([[-0.0088, -1.1016],
        [ 0.7970, -0.3561],
        [-0.3655, -0.4275],
        [-1.1269, -0.0039],
        [ 0.0813, -1.8065],
        [-0.2585,  0.4642],
        [-1.1480, -1.4260],
        [-0.4930,  0.7735],
        [ 0.3500,  0.9715],
        [-0.9872,  0.2489],
        [ 0.0348,  0.7573],
        [ 0.0066,  0.3187],
        [-2.2436,  0.7938],
        [ 0.5505,  1.3333],
        [ 0.5966,  0.5793],
        [ 0.3108, -0.8955],
        [-0.7096, -0.2041],
        [-0.2271,  1.7375],
        [ 0.4156,  1.4305],
        [-0.4538, -0.0266],
        [-1.8510,  0.7650],
        [ 0.4639, -0.8264],
        [-0.8211,  0.0090],
        [ 0.6062,  0.6858],
        [-2.1856, -1.0730],
        [-0.0255,  1.4148],
        [-0.1103,  1.1912]])

In [11]:
# Look up the embedding of a single character:
# convert it to its integer index, then use that index to select a row of `C`
c = 'g'
c_index = stoi[c]
print(f"Character {c} maps to {c_index}")
C[c_index]

Character g maps to 7


tensor([-0.7620,  0.1511])

In [12]:
# Indexing into the embedding table `C` is the same as matrix multiplying
# the one-hot encoding of the input character with `C`:
# the single 1 in the one-hot vector picks out exactly one row of `C`.
# Check the dtype of the one-hot vector before multiplying.
v = F.one_hot(torch.tensor(c_index), num_classes=27)
v.dtype

torch.int64

In [13]:
# Need to cast the one-hot vector to float since the embedding table `C` contains floats.
# The product should match the row obtained by indexing with C[c_index].
v.float() @ C

tensor([-0.7620,  0.1511])

Embedding a single character with the embedding table `C` is easy: take the integer representation of that character and use it to index into `C`. But how do we simultaneously embed all of `X`, which has shape `[32,3]` (32 examples, each a context of 3 characters)?

In addition to integers, we can use lists to index into `C`.

In [15]:
# Indexing with a list of integers selects several rows at once:
# this gets the rows at index 2, 3, and 4
C[[2,3,4]]

tensor([[-0.3655, -0.4275],
        [-1.1269, -0.0039],
        [ 0.0813, -1.8065]])

In [20]:
# We can also index using a tensor of integers; it selects the same rows as the list
C[torch.tensor([2,3,4])]

tensor([[-0.3655, -0.4275],
        [-1.1269, -0.0039],
        [ 0.0813, -1.8065]])

In [21]:
# Indices may repeat: the same row is returned once per occurrence
C[[2,2,2]]

tensor([[-0.3655, -0.4275],
        [-0.3655, -0.4275],
        [-0.3655, -0.4275]])

In [26]:
# Recall: each row of X is one input context, stored as the integer indices of its characters
X

tensor([[ 0,  0,  0],
        [ 0,  0,  5],
        [ 0,  5, 13],
        [ 5, 13, 13],
        [13, 13,  1],
        [ 0,  0,  0],
        [ 0,  0, 15],
        [ 0, 15, 12],
        [15, 12,  9],
        [12,  9, 22],
        [ 9, 22,  9],
        [22,  9,  1],
        [ 0,  0,  0],
        [ 0,  0,  1],
        [ 0,  1, 22],
        [ 1, 22,  1],
        [ 0,  0,  0],
        [ 0,  0,  9],
        [ 0,  9, 19],
        [ 9, 19,  1],
        [19,  1,  2],
        [ 1,  2,  5],
        [ 2,  5, 12],
        [ 5, 12, 12],
        [12, 12,  1],
        [ 0,  0,  0],
        [ 0,  0, 19],
        [ 0, 19, 15],
        [19, 15, 16],
        [15, 16,  8],
        [16,  8,  9],
        [ 8,  9,  1]])

In [27]:
# We can index into the embedding table `C` using multi-dimensional tensors too.
# The character integers in X are used as the indices into C,
# so every entry of X is replaced by the corresponding row of C.
print("X.shape", X.shape)
C[X]

X.shape torch.Size([32, 3])


tensor([[[-0.0088, -1.1016],
         [-0.0088, -1.1016],
         [-0.0088, -1.1016]],

        [[-0.0088, -1.1016],
         [-0.0088, -1.1016],
         [-0.2585,  0.4642]],

        [[-0.0088, -1.1016],
         [-0.2585,  0.4642],
         [ 0.5505,  1.3333]],

        [[-0.2585,  0.4642],
         [ 0.5505,  1.3333],
         [ 0.5505,  1.3333]],

        [[ 0.5505,  1.3333],
         [ 0.5505,  1.3333],
         [ 0.7970, -0.3561]],

        [[-0.0088, -1.1016],
         [-0.0088, -1.1016],
         [-0.0088, -1.1016]],

        [[-0.0088, -1.1016],
         [-0.0088, -1.1016],
         [ 0.3108, -0.8955]],

        [[-0.0088, -1.1016],
         [ 0.3108, -0.8955],
         [-2.2436,  0.7938]],

        [[ 0.3108, -0.8955],
         [-2.2436,  0.7938],
         [-0.9872,  0.2489]],

        [[-2.2436,  0.7938],
         [-0.9872,  0.2489],
         [-0.8211,  0.0090]],

        [[-0.9872,  0.2489],
         [-0.8211,  0.0090],
         [-0.9872,  0.2489]],

        [[-0.8211,  0

In [28]:
# The result keeps the shape of the input X, [32, 3], and appends
# the embedding size (2) as a last dimension
C[X].shape

torch.Size([32, 3, 2])

In [33]:
# Get the character at position 2 of example 7.
# Both indices are zero-based, so this is the 3rd character of the 8th example.
example_index = 7
character_index = 2
X[example_index,character_index]

tensor(12)

In [37]:
# Get the character representation of the integer.
# .item() converts the single-element tensor to a plain Python int, which is the key type of `itos`.
itos[X[example_index,character_index].item()]

'l'

In [34]:
# Get the *embedding* of the character at zero-based position 2 of example 7
C[X][example_index,character_index]

tensor([-2.2436,  0.7938])

In [38]:
# X[7,2] holds the integer 12, so C[12] is equivalent to C[X[7,2]]
C[12]

tensor([-2.2436,  0.7938])

In [41]:
# Embed all the example inputs at once:
# emb[i, j] is the embedding of the character at position j of example i
emb = C[X]
emb.shape

torch.Size([32, 3, 2])

## Constructing the Hidden Layer

In [42]:
# Hidden-layer weights.
# Each input is 3 characters, and each character is represented by 2 floats (its embedding),
# so the embedding step yields 3*2 = 6 values per example
# and the hidden layer has to take in 6 inputs.
# The number of output nodes is a free choice; we arbitrarily pick 100 for now.
W1 = torch.randn(6, 100)
# Biases: one per hidden unit, so the size matches the hidden layer
b = torch.randn(100)