In [2]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Read in all the names, as a list of names (one name per line of the file)
# A context manager closes the file handle as soon as the contents are read,
# instead of leaving it open until garbage collection
with open("names.txt", "r") as f:
    words = f.read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [4]:
# Report how many names were loaded
print(f"{len(words)} words")

32033 words


In [5]:
# Build the character -> integer mapping
# Collect every distinct character that appears in any name, in alphabetical order
chars = sorted({ch for word in words for ch in word})

# String to integer: characters are numbered from 1, leaving 0 for the special "." token
stoi = {c: i for i, c in enumerate(chars, start=1)}
stoi['.'] = 0
stoi

{'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26,
 '.': 0}

In [6]:
# Integer to string: the inverse of `stoi`, obtained by swapping each (character, index) pair
itos = {index: char for char, index in stoi.items()}
itos

{1: 'a',
 2: 'b',
 3: 'c',
 4: 'd',
 5: 'e',
 6: 'f',
 7: 'g',
 8: 'h',
 9: 'i',
 10: 'j',
 11: 'k',
 12: 'l',
 13: 'm',
 14: 'n',
 15: 'o',
 16: 'p',
 17: 'q',
 18: 'r',
 19: 's',
 20: 't',
 21: 'u',
 22: 'v',
 23: 'w',
 24: 'x',
 25: 'y',
 26: 'z',
 0: '.'}

## Build the Dataset

In [7]:
# Context length: the number of preceding characters used to predict the next character
block_size = 3

# Plain Python lists collect the examples before they are converted to tensors
contexts = []  # inputs to the neural net: the previous `block_size` characters, as integers
targets = []   # labels: the integer of the character that follows each context

# Slide a window of `block_size` characters over each name
for word in words[:5]:
    print(word)

    # Every name starts from a context made up only of the "." token
    context = [stoi["."]] * block_size
    # Appending "." to the name adds a final example whose label is "."
    for w in word + ".":
        # Integer of the character to predict
        ix = stoi[w]

        # Show the example: readable context on the left, next character on the right
        readable_context = "".join(itos[i] for i in context)
        print(readable_context + " ---> " + itos[ix])

        # Record the example
        contexts.append(context)
        targets.append(ix)

        # Slide the window: drop the oldest character and append the newest one
        context = context[1:] + [ix]

# Inputs
X = torch.tensor(contexts)
# Labels
Y = torch.tensor(targets)

emma
... ---> e
..e ---> m
.em ---> m
emm ---> a
mma ---> .
olivia
... ---> o
..o ---> l
.ol ---> i
oli ---> v
liv ---> i
ivi ---> a
via ---> .
ava
... ---> a
..a ---> v
.av ---> a
ava ---> .
isabella
... ---> i
..i ---> s
.is ---> a
isa ---> b
sab ---> e
abe ---> l
bel ---> l
ell ---> a
lla ---> .
sophia
... ---> s
..s ---> o
.so ---> p
sop ---> h
oph ---> i
phi ---> a
hia ---> .


In [8]:
# X holds one row per input example; each row is a context of `block_size` character integers
print(f"Input shape: {X.shape}, {X.dtype}")

Input shape: torch.Size([32, 3]), torch.int64


In [9]:
# Y holds one label per example: the integer of the next character,
# which is one of the 27 possible characters in `stoi`
print(f"Labels shape: {Y.shape}, {Y.dtype}")

Labels shape: torch.Size([32]), torch.int64


## Building the Embedding Table `C`
In the paper, 17,000 possible words are crammed into a 30-dimensional space.
We have 27 possible characters, so let's cram them into a 2-dimensional space.

In [10]:
# Embedding table: one row for each of the 27 possible characters,
# and each row is that character's 2-dimensional embedding
# The rows start out as random samples from a standard normal distribution
C = torch.randn(27, 2)
C

tensor([[-1.1733,  2.0990],
        [-1.0517, -0.5137],
        [-0.4361, -0.3689],
        [ 1.6064, -0.9127],
        [ 0.8234, -0.9242],
        [-1.1432,  0.8409],
        [ 0.1157, -0.9679],
        [-0.8842,  1.2050],
        [-0.1721, -0.9472],
        [ 2.7344, -0.2924],
        [ 0.2427,  0.1372],
        [ 0.9499,  0.5008],
        [-1.3637, -0.6750],
        [ 0.9408, -1.1901],
        [ 1.3130, -0.9485],
        [-0.3181,  0.7576],
        [-0.5376,  0.5322],
        [-0.4070, -0.4677],
        [-1.9054, -0.1338],
        [ 0.3915, -1.3050],
        [-0.2548, -1.2440],
        [-0.8101, -1.1608],
        [ 0.5212, -0.2390],
        [ 0.9280, -0.2610],
        [-0.8942,  0.3725],
        [ 2.2526,  0.8406],
        [ 2.1230,  0.2043]])

In [11]:
# Look up the embedding of a single character:
# map the character to its integer, then use that integer to select a row of `C`
c = 'g'
c_index = stoi[c]
print(f"Character {c} maps to {c_index}")
C[c_index]

Character g maps to 7


tensor([-0.8842,  1.2050])

In [12]:
# Indexing into the embedding table `C` is the same as matrix multiplying
# `C` with the one-hot encoding representation of the input character
# `v` is a vector of 27 entries (one per character); check its dtype before multiplying
v = F.one_hot(torch.tensor(c_index), num_classes=27)
v.dtype

torch.int64

In [13]:
# Need to cast the vector to a float since the embedding table C contains floats
# The product picks out the row of C at `c_index`, i.e. the same values as C[c_index]
v.float() @ C

tensor([-0.8842,  1.2050])

Embedding a single character with the embedding table `C` is easy: take the integer representation of that character and index into `C`. But how do we simultaneously embed every integer in `X`, a tensor of shape `[32, 3]` (32 examples, each with 3 characters)?

In addition to integers, we can use lists to index into `C`.

In [14]:
# Indexing with a Python list selects several rows at once
# This gets the rows at index 2, 3, and 4
C[[2,3,4]]

tensor([[-0.4361, -0.3689],
        [ 1.6064, -0.9127],
        [ 0.8234, -0.9242]])

In [15]:
# We can also index using Tensors: an integer tensor selects the same rows as the equivalent list
C[torch.tensor([2,3,4])]

tensor([[-0.4361, -0.3689],
        [ 1.6064, -0.9127],
        [ 0.8234, -0.9242]])

In [16]:
# We can also get the same row multiple times by repeating its index
C[[2,2,2]]

tensor([[-0.4361, -0.3689],
        [-0.4361, -0.3689],
        [-0.4361, -0.3689]])

In [17]:
# Recall, X contains the input characters encoded as integers, one context per row
X

tensor([[ 0,  0,  0],
        [ 0,  0,  5],
        [ 0,  5, 13],
        [ 5, 13, 13],
        [13, 13,  1],
        [ 0,  0,  0],
        [ 0,  0, 15],
        [ 0, 15, 12],
        [15, 12,  9],
        [12,  9, 22],
        [ 9, 22,  9],
        [22,  9,  1],
        [ 0,  0,  0],
        [ 0,  0,  1],
        [ 0,  1, 22],
        [ 1, 22,  1],
        [ 0,  0,  0],
        [ 0,  0,  9],
        [ 0,  9, 19],
        [ 9, 19,  1],
        [19,  1,  2],
        [ 1,  2,  5],
        [ 2,  5, 12],
        [ 5, 12, 12],
        [12, 12,  1],
        [ 0,  0,  0],
        [ 0,  0, 19],
        [ 0, 19, 15],
        [19, 15, 16],
        [15, 16,  8],
        [16,  8,  9],
        [ 8,  9,  1]])

In [18]:
# We can index into the embedding table `C` using multi-dimensional tensors too
# The character integers in X are used as the indices into C,
# so every integer in X is replaced by the corresponding row of C
print("X.shape", X.shape)
C[X]

X.shape torch.Size([32, 3])


tensor([[[-1.1733,  2.0990],
         [-1.1733,  2.0990],
         [-1.1733,  2.0990]],

        [[-1.1733,  2.0990],
         [-1.1733,  2.0990],
         [-1.1432,  0.8409]],

        [[-1.1733,  2.0990],
         [-1.1432,  0.8409],
         [ 0.9408, -1.1901]],

        [[-1.1432,  0.8409],
         [ 0.9408, -1.1901],
         [ 0.9408, -1.1901]],

        [[ 0.9408, -1.1901],
         [ 0.9408, -1.1901],
         [-1.0517, -0.5137]],

        [[-1.1733,  2.0990],
         [-1.1733,  2.0990],
         [-1.1733,  2.0990]],

        [[-1.1733,  2.0990],
         [-1.1733,  2.0990],
         [-0.3181,  0.7576]],

        [[-1.1733,  2.0990],
         [-0.3181,  0.7576],
         [-1.3637, -0.6750]],

        [[-0.3181,  0.7576],
         [-1.3637, -0.6750],
         [ 2.7344, -0.2924]],

        [[-1.3637, -0.6750],
         [ 2.7344, -0.2924],
         [ 0.5212, -0.2390]],

        [[ 2.7344, -0.2924],
         [ 0.5212, -0.2390],
         [ 2.7344, -0.2924]],

        [[ 0.5212, -0

In [19]:
# [32, 3] is the shape of the input X, and each integer in X is replaced by its embedding of size 2
C[X].shape

torch.Size([32, 3, 2])

In [20]:
# Indices are 0-based: get the character at index 2 (the last position of the context)
# in the example at index 7
example_index = 7
character_index = 2
X[example_index,character_index]

tensor(12)

In [21]:
# Get the character representation of the integer
# .item() turns the single-element tensor into a plain Python int, which is what `itos` is keyed by
itos[X[example_index,character_index].item()]

'l'

In [22]:
# Get the *embedding* of the character at `character_index` in the example at `example_index`
C[X][example_index,character_index]

tensor([-1.3637, -0.6750])

In [23]:
# C[12] is equivalent to C[X[7,2]], because X[7,2] holds the integer 12
C[12]

tensor([-1.3637, -0.6750])

In [24]:
# We created our embedding table integrated with our example inputs!
# `emb` holds the embedding of every character of every example
emb = C[X]
emb.shape

torch.Size([32, 3, 2])

## Constructing the Hidden Layer

In [25]:
# Hidden-layer weights
# Each example feeds 3 characters * 2 embedding values = 6 numbers into the hidden layer,
# so W1 needs 6 rows (one per input)
# The number of hidden units (columns) is a free choice; 100 is picked arbitrarily for now
W1 = torch.randn(6, 100)
# Biases: one per hidden unit, so the size matches the hidden layer
b = torch.randn(100)

In [27]:
# The goal is to be able to do:
# emb @ W1 + b
# with `@` indicating matrix multiply
# and `emb` as our input
# We can't do this directly: emb is 3-dimensional, [32, 3, 2], while W1 is [6, 100]
# The goal is to get the input/embedding into shape [32, 6]
emb.shape

torch.Size([32, 3, 2])

In [28]:
# W1 expects 6 input values per example
W1.shape

torch.Size([6, 100])

In [30]:
# To do this, we can concatenate the embeddings of the three characters
# emb[:, 0, :] takes every example (first axis), picks the character at index 0 (second axis),
# and keeps all of that character's embedding values (third axis)
emb[:, 0, :]

tensor([[-1.1733,  2.0990],
        [-1.1733,  2.0990],
        [-1.1733,  2.0990],
        [-1.1432,  0.8409],
        [ 0.9408, -1.1901],
        [-1.1733,  2.0990],
        [-1.1733,  2.0990],
        [-1.1733,  2.0990],
        [-0.3181,  0.7576],
        [-1.3637, -0.6750],
        [ 2.7344, -0.2924],
        [ 0.5212, -0.2390],
        [-1.1733,  2.0990],
        [-1.1733,  2.0990],
        [-1.1733,  2.0990],
        [-1.0517, -0.5137],
        [-1.1733,  2.0990],
        [-1.1733,  2.0990],
        [-1.1733,  2.0990],
        [ 2.7344, -0.2924],
        [ 0.3915, -1.3050],
        [-1.0517, -0.5137],
        [-0.4361, -0.3689],
        [-1.1432,  0.8409],
        [-1.3637, -0.6750],
        [-1.1733,  2.0990],
        [-1.1733,  2.0990],
        [-1.1733,  2.0990],
        [ 0.3915, -1.3050],
        [-0.3181,  0.7576],
        [-0.5376,  0.5322],
        [-0.1721, -0.9472]])

In [33]:
# This gets the embeddings of all the first characters in each example:
# one row of 2 values per example
emb[:, 0, :].shape

torch.Size([32, 2])

In [34]:
# We have to concatenate the embeddings of all the characters
# Each emb[:, x, :] has shape [32, 2]
# and we want to concatenate across dim=1 to get [32, 6]
# Each context position (0, 1, 2) must appear exactly once: repeating an index would still
# produce shape [32, 6] but would silently leave out the last character's embedding
torch.cat([emb[:, 0, :], emb[:, 1, :], emb[:, 2, :]], dim=1).shape

torch.Size([32, 6])

In [35]:
# Hard-coding the concatenation wouldn't work for other block sizes
# So we generalize using torch.unbind
# For block_size 3, torch.unbind(emb, dim=1) == (emb[:, 0, :], emb[:, 1, :], emb[:, 2, :])
torch.unbind(emb, dim=1)

(tensor([[-1.1733,  2.0990],
         [-1.1733,  2.0990],
         [-1.1733,  2.0990],
         [-1.1432,  0.8409],
         [ 0.9408, -1.1901],
         [-1.1733,  2.0990],
         [-1.1733,  2.0990],
         [-1.1733,  2.0990],
         [-0.3181,  0.7576],
         [-1.3637, -0.6750],
         [ 2.7344, -0.2924],
         [ 0.5212, -0.2390],
         [-1.1733,  2.0990],
         [-1.1733,  2.0990],
         [-1.1733,  2.0990],
         [-1.0517, -0.5137],
         [-1.1733,  2.0990],
         [-1.1733,  2.0990],
         [-1.1733,  2.0990],
         [ 2.7344, -0.2924],
         [ 0.3915, -1.3050],
         [-1.0517, -0.5137],
         [-0.4361, -0.3689],
         [-1.1432,  0.8409],
         [-1.3637, -0.6750],
         [-1.1733,  2.0990],
         [-1.1733,  2.0990],
         [-1.1733,  2.0990],
         [ 0.3915, -1.3050],
         [-0.3181,  0.7576],
         [-0.5376,  0.5322],
         [-0.1721, -0.9472]]),
 tensor([[-1.1733,  2.0990],
         [-1.1733,  2.0990],
         [-1

In [39]:
# Concatenate the per-position slices along dim=1; no position index is hard-coded
torch.cat(torch.unbind(emb, dim=1), dim=1).shape

torch.Size([32, 6])

A more efficient way to concatenate the embeddings is to use `view()`

In [40]:
# A small example tensor holding the integers 0 through 17, used to demonstrate view()
a = torch.arange(18)
a

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17])

In [41]:
# One flat list of 18 numbers
a.shape

torch.Size([18])

In [42]:
# We can represent this tensor with different dimensions,
# as long as the total number of elements (18) stays the same
# Two lists of 9 numbers
a.view((2,9))

tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8],
        [ 9, 10, 11, 12, 13, 14, 15, 16, 17]])

In [44]:
# 3 lists of 3 lists of 2 numbers (3 * 3 * 2 = 18)
a.view([3,3,2])

tensor([[[ 0,  1],
         [ 2,  3],
         [ 4,  5]],

        [[ 6,  7],
         [ 8,  9],
         [10, 11]],

        [[12, 13],
         [14, 15],
         [16, 17]]])

`view()` is very efficient because of a tensor's `storage()`

In [47]:
# The numbers of `a` are stored one after another in a single flat storage
# tensor.view() only changes the view of the memory
# No memory is copied, moved, or changed
# NOTE(review): newer PyTorch releases deprecate Tensor.storage() in favor of
# Tensor.untyped_storage() — confirm against the installed version
a.storage()

 0
 1
 2
 3
 4
 5
 6
 7
 8
 9
 10
 11
 12
 13
 14
 15
 16
 17
[torch.storage.TypedStorage(dtype=torch.int64, device=cpu) of size 18]

In [48]:
# So we can call view() on our input/embedding to do the same concatenation as before
# but much more efficiently
# The number of examples is read from emb itself and -1 lets view() infer the flattened width,
# so nothing breaks when the dataset is built from more names or a different block_size
emb.view(emb.shape[0], -1)

tensor([[-1.1733,  2.0990, -1.1733,  2.0990, -1.1733,  2.0990],
        [-1.1733,  2.0990, -1.1733,  2.0990, -1.1432,  0.8409],
        [-1.1733,  2.0990, -1.1432,  0.8409,  0.9408, -1.1901],
        [-1.1432,  0.8409,  0.9408, -1.1901,  0.9408, -1.1901],
        [ 0.9408, -1.1901,  0.9408, -1.1901, -1.0517, -0.5137],
        [-1.1733,  2.0990, -1.1733,  2.0990, -1.1733,  2.0990],
        [-1.1733,  2.0990, -1.1733,  2.0990, -0.3181,  0.7576],
        [-1.1733,  2.0990, -0.3181,  0.7576, -1.3637, -0.6750],
        [-0.3181,  0.7576, -1.3637, -0.6750,  2.7344, -0.2924],
        [-1.3637, -0.6750,  2.7344, -0.2924,  0.5212, -0.2390],
        [ 2.7344, -0.2924,  0.5212, -0.2390,  2.7344, -0.2924],
        [ 0.5212, -0.2390,  2.7344, -0.2924, -1.0517, -0.5137],
        [-1.1733,  2.0990, -1.1733,  2.0990, -1.1733,  2.0990],
        [-1.1733,  2.0990, -1.1733,  2.0990, -1.0517, -0.5137],
        [-1.1733,  2.0990, -1.0517, -0.5137,  0.5212, -0.2390],
        [-1.0517, -0.5137,  0.5212, -0.2

In [50]:
# These are equal operations: flattening with view() gives the same values, element by element,
# as concatenating the per-position slices
# The batch size comes from emb (not a hard-coded 32) so the check holds for any dataset size
emb.view(emb.shape[0], -1) == torch.cat(torch.unbind(emb, dim=1), dim=1)

tensor([[True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, T

In [54]:
# Shape of the flattened input: one row per example, all character embeddings side by side
# (batch size taken from emb rather than hard-coded)
emb.view(emb.shape[0], -1).shape

torch.Size([32, 6])

In [52]:
# W1 maps 6 input values to 100 hidden units
W1.shape

torch.Size([6, 100])

In [53]:
# One bias per hidden unit
b.shape

torch.Size([100])

In [51]:
# Now we can do the matrix multiplication to get our hidden layer
# Flatten each example's character embeddings into one row; the batch size is read from emb
# (instead of a hard-coded 32) so this also works when the dataset has a different size
# `b` has one entry per hidden unit and is broadcast across all rows
h = emb.view(emb.shape[0], -1) @ W1 + b
h

tensor([[ 2.2113,  1.6906, -4.3554,  ..., -2.3010, -3.7414, -5.1612],
        [ 1.4776,  0.7002, -3.1253,  ..., -0.2176, -2.9236, -3.8324],
        [-1.7211, -1.5596, -0.3741,  ...,  4.1063,  1.1268, -0.3361],
        ...,
        [-0.3858, -0.4014, -0.5281,  ...,  3.1009,  0.5141,  1.0116],
        [-2.5479, -0.6621, -0.7186,  ...,  3.1786,  3.4945,  2.6015],
        [ 0.4771,  0.7597, -1.3006,  ...,  1.1065,  0.0914,  3.3237]])

In [55]:
# One row of 100 hidden values per example
h.shape

torch.Size([32, 100])