In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read in all the names, as a list of names
words = open("names.txt", "r").read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [3]:
print(f"{len(words)} words")

32033 words


In [4]:
# Build character to integer mapping
# First we combine all the names into one string, then create a set() (this keeps unique characters only)
# Then we sort alphabetically
chars = sorted(set("".join(words)))

# String to integer
stoi = {c:i+1 for (i, c) in enumerate(chars)}
stoi['.'] = 0
stoi

{'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26,
 '.': 0}

In [5]:
# Integer to string
itos = {i:c for (c,i) in stoi.items()}
itos

{1: 'a',
 2: 'b',
 3: 'c',
 4: 'd',
 5: 'e',
 6: 'f',
 7: 'g',
 8: 'h',
 9: 'i',
 10: 'j',
 11: 'k',
 12: 'l',
 13: 'm',
 14: 'n',
 15: 'o',
 16: 'p',
 17: 'q',
 18: 'r',
 19: 's',
 20: 't',
 21: 'u',
 22: 'v',
 23: 'w',
 24: 'x',
 25: 'y',
 26: 'z',
 0: '.'}

## Build the Dataset

In [6]:
# Context length: how many previous characters do we use to predict the next character
block_size = 3

# Inputs to the neural net (the previous characters seen)
X = []
# The labels of the inputs to the neural net (the next character)
Y = []

# Create a dataset, mapping `block_size` characters -> next character
for word in words[:5]:
    print(word)
    
    # Initialize the context with empty characters
    context = [stoi["."]] * block_size
    # We want the last characters of the name to be included in the dataset
    # which is why we append "." to the end of the name
    for w in word + ".":
        # Get the next character
        ix = stoi[w]
        
        # Save the input and output
        X.append(context)
        Y.append(ix)
            
        # Print the example
        print("".join(itos[i] for i in context) + " ---> " + itos[ix])
        
        # Remove the earliest character in the context, and append the newest character
        # as the latest character in the context
        context = context[1:] + [ix]

# Inputs
X = torch.tensor(X)
# Labels
Y = torch.tensor(Y)

emma
... ---> e
..e ---> m
.em ---> m
emm ---> a
mma ---> .
olivia
... ---> o
..o ---> l
.ol ---> i
oli ---> v
liv ---> i
ivi ---> a
via ---> .
ava
... ---> a
..a ---> v
.av ---> a
ava ---> .
isabella
... ---> i
..i ---> s
.is ---> a
isa ---> b
sab ---> e
abe ---> l
bel ---> l
ell ---> a
lla ---> .
sophia
... ---> s
..s ---> o
.so ---> p
sop ---> h
oph ---> i
phi ---> a
hia ---> .


In [7]:
# We have x many input examples, with each with a context size `block_size`
print(f"Input shape: {X.shape}, {X.dtype}")

Input shape: torch.Size([32, 3]), torch.int64


In [8]:
# The next character is one of 27 possible characters
print(f"Labels shape: {Y.shape}, {Y.dtype}")

Labels shape: torch.Size([32]), torch.int64


## Building the Embedding Table `C`
In the paper, 17,000 possible words are crammed into a 30-dimensional space.
We have 27 possible characters, so let's cram them into a 2-dimensional space.

In [9]:
# 27 possible characters, 2-dimensional space
# Each character has a 2-dimenional embedding
C = torch.randn((27, 2))
C

tensor([[-0.1067, -0.3255],
        [ 0.3376,  0.3419],
        [ 0.1550, -1.1830],
        [ 1.5495, -0.5934],
        [ 0.2971, -0.6370],
        [ 0.7099, -1.3830],
        [ 0.7070, -0.6605],
        [ 1.7801,  1.2121],
        [ 0.1998,  0.2748],
        [ 0.6895,  0.9553],
        [ 0.9003,  1.3806],
        [ 0.3174, -0.4722],
        [ 0.8947,  0.9249],
        [-0.2009, -0.5896],
        [ 0.5659, -0.1542],
        [ 0.6248,  0.4970],
        [-0.6143,  0.1434],
        [-0.4804,  0.7898],
        [-0.6984,  1.8530],
        [-0.4805,  0.5944],
        [-0.8023,  1.0666],
        [ 0.7912, -0.0591],
        [-0.3390,  1.2435],
        [ 1.0390,  2.1184],
        [-1.8662,  1.3436],
        [-0.1631,  0.7092],
        [ 0.4480,  1.0854]])

In [10]:
c = 'g'
c_index = stoi[c]
print(f"Character {c} maps to {c_index}")
C[c_index]

Character g maps to 7


tensor([1.7801, 1.2121])

In [11]:
# Indexing into the embedding table `C` is the same as matrix multiplying
# `C` with the one-hot encoding representation of the input character
v = F.one_hot(torch.tensor(c_index), num_classes=27)
v.dtype

torch.int64

In [12]:
# Need to cast the vector to a float since the embedding table C contains floats
v.float() @ C

tensor([1.7801, 1.2121])

Embedding a single character into the embedding table `C` is easy. Just use the integer representation of that character, and index into `C`. But how do we simultaneously embed `[32,3]` (32 examples, each of size 3, stored in array `X`) into `C`?

In addition to integers, we can use lists to index into `C`.

In [13]:
# This gets the rows at index 2, 3, and 4
C[[2,3,4]]

tensor([[ 0.1550, -1.1830],
        [ 1.5495, -0.5934],
        [ 0.2971, -0.6370]])

In [14]:
# We can also index using Tensors
C[torch.tensor([2,3,4])]

tensor([[ 0.1550, -1.1830],
        [ 1.5495, -0.5934],
        [ 0.2971, -0.6370]])

In [15]:
# We can also get the same row multiple times
C[[2,2,2]]

tensor([[ 0.1550, -1.1830],
        [ 0.1550, -1.1830],
        [ 0.1550, -1.1830]])

In [16]:
# Recall, X contains the characters as integers as input
X

tensor([[ 0,  0,  0],
        [ 0,  0,  5],
        [ 0,  5, 13],
        [ 5, 13, 13],
        [13, 13,  1],
        [ 0,  0,  0],
        [ 0,  0, 15],
        [ 0, 15, 12],
        [15, 12,  9],
        [12,  9, 22],
        [ 9, 22,  9],
        [22,  9,  1],
        [ 0,  0,  0],
        [ 0,  0,  1],
        [ 0,  1, 22],
        [ 1, 22,  1],
        [ 0,  0,  0],
        [ 0,  0,  9],
        [ 0,  9, 19],
        [ 9, 19,  1],
        [19,  1,  2],
        [ 1,  2,  5],
        [ 2,  5, 12],
        [ 5, 12, 12],
        [12, 12,  1],
        [ 0,  0,  0],
        [ 0,  0, 19],
        [ 0, 19, 15],
        [19, 15, 16],
        [15, 16,  8],
        [16,  8,  9],
        [ 8,  9,  1]])

In [17]:
# We can index into the embedding table `C` using multi-dimensional tensors too
# The character integers in X is used as the indices into C
print("X.shape", X.shape)
C[X]

X.shape torch.Size([32, 3])


tensor([[[-0.1067, -0.3255],
         [-0.1067, -0.3255],
         [-0.1067, -0.3255]],

        [[-0.1067, -0.3255],
         [-0.1067, -0.3255],
         [ 0.7099, -1.3830]],

        [[-0.1067, -0.3255],
         [ 0.7099, -1.3830],
         [-0.2009, -0.5896]],

        [[ 0.7099, -1.3830],
         [-0.2009, -0.5896],
         [-0.2009, -0.5896]],

        [[-0.2009, -0.5896],
         [-0.2009, -0.5896],
         [ 0.3376,  0.3419]],

        [[-0.1067, -0.3255],
         [-0.1067, -0.3255],
         [-0.1067, -0.3255]],

        [[-0.1067, -0.3255],
         [-0.1067, -0.3255],
         [ 0.6248,  0.4970]],

        [[-0.1067, -0.3255],
         [ 0.6248,  0.4970],
         [ 0.8947,  0.9249]],

        [[ 0.6248,  0.4970],
         [ 0.8947,  0.9249],
         [ 0.6895,  0.9553]],

        [[ 0.8947,  0.9249],
         [ 0.6895,  0.9553],
         [-0.3390,  1.2435]],

        [[ 0.6895,  0.9553],
         [-0.3390,  1.2435],
         [ 0.6895,  0.9553]],

        [[-0.3390,  1

In [18]:
# [32, 3] was the shape of the input X, then each input has an embedding of 2
C[X].shape

torch.Size([32, 3, 2])

In [19]:
# Get the 2nd character in the 7th example
example_index = 7
character_index = 2
X[example_index,character_index]

tensor(12)

In [20]:
# Get the character representation of the integer
itos[X[example_index,character_index].item()]

'l'

In [21]:
# Get the *embedding*of the 2nd character in the 7th example
C[X][example_index,character_index]

tensor([0.8947, 0.9249])

In [22]:
# C[12] is equivalent to C[X[7,2]]
C[12]

tensor([0.8947, 0.9249])

In [23]:
# We created our embedding table integrated with our example inputs!
emb = C[X]
emb.shape

torch.Size([32, 3, 2])

## Contructing the Hidden Layer

In [24]:
# Weights
# 3 characters per input, and each characters has two floats that represent it (it's embedding)
# 3*2 = 6 outputs in the first layer
# So the hidden layer has to take in 6 inputs
# The number of out put nodes is a variable, we arbitrary choose 100 for now
W1 = torch.randn((6,100))
# Biases, should match the size of the hidden layer
b1 = torch.randn(100)

In [25]:
# The goal is to be able to do:
# emb @ W1 + b
# with `@` indicating matrix multiply
# and `emb` as our input
# We can't do this directly since their shapes don't match correctly
# The goal is to get the input/embedding of shape [32,6]
emb.shape

torch.Size([32, 3, 2])

In [26]:
W1.shape

torch.Size([6, 100])

In [27]:
# To do this, we can concatenate the three characters and their embeddings
# This grabs all the examples, indexes that into index 0 (first character)
# then grabs all the embeddings of the first character for all the examples
emb[:, 0, :]

tensor([[-0.1067, -0.3255],
        [-0.1067, -0.3255],
        [-0.1067, -0.3255],
        [ 0.7099, -1.3830],
        [-0.2009, -0.5896],
        [-0.1067, -0.3255],
        [-0.1067, -0.3255],
        [-0.1067, -0.3255],
        [ 0.6248,  0.4970],
        [ 0.8947,  0.9249],
        [ 0.6895,  0.9553],
        [-0.3390,  1.2435],
        [-0.1067, -0.3255],
        [-0.1067, -0.3255],
        [-0.1067, -0.3255],
        [ 0.3376,  0.3419],
        [-0.1067, -0.3255],
        [-0.1067, -0.3255],
        [-0.1067, -0.3255],
        [ 0.6895,  0.9553],
        [-0.4805,  0.5944],
        [ 0.3376,  0.3419],
        [ 0.1550, -1.1830],
        [ 0.7099, -1.3830],
        [ 0.8947,  0.9249],
        [-0.1067, -0.3255],
        [-0.1067, -0.3255],
        [-0.1067, -0.3255],
        [-0.4805,  0.5944],
        [ 0.6248,  0.4970],
        [-0.6143,  0.1434],
        [ 0.1998,  0.2748]])

In [28]:
# This gets the embeddings of all the first characters in each example
emb[:, 0, :].shape

torch.Size([32, 2])

In [29]:
# We have to concatenate the embeddings of all the characters
# Each emb[:, x, :] has shape [32, 2]
# and we want to concatenate cross the dim=1 to get [32,6]
torch.cat([emb[:, 0, :], emb[:, 1, :], emb[:, 1, :]], dim=1).shape

torch.Size([32, 6])

In [30]:
# Hard-coding the concatenation wouldn't work for other block sizes
# So we generalize using torch.unbind
# For block_size 3, torch.unbind(emb, dim=1) == [emb[:, 0, :], emb[:, 1, :], emb[:, 1, :]]
torch.unbind(emb, dim=1)

(tensor([[-0.1067, -0.3255],
         [-0.1067, -0.3255],
         [-0.1067, -0.3255],
         [ 0.7099, -1.3830],
         [-0.2009, -0.5896],
         [-0.1067, -0.3255],
         [-0.1067, -0.3255],
         [-0.1067, -0.3255],
         [ 0.6248,  0.4970],
         [ 0.8947,  0.9249],
         [ 0.6895,  0.9553],
         [-0.3390,  1.2435],
         [-0.1067, -0.3255],
         [-0.1067, -0.3255],
         [-0.1067, -0.3255],
         [ 0.3376,  0.3419],
         [-0.1067, -0.3255],
         [-0.1067, -0.3255],
         [-0.1067, -0.3255],
         [ 0.6895,  0.9553],
         [-0.4805,  0.5944],
         [ 0.3376,  0.3419],
         [ 0.1550, -1.1830],
         [ 0.7099, -1.3830],
         [ 0.8947,  0.9249],
         [-0.1067, -0.3255],
         [-0.1067, -0.3255],
         [-0.1067, -0.3255],
         [-0.4805,  0.5944],
         [ 0.6248,  0.4970],
         [-0.6143,  0.1434],
         [ 0.1998,  0.2748]]),
 tensor([[-0.1067, -0.3255],
         [-0.1067, -0.3255],
         [ 0

In [31]:
torch.cat(torch.unbind(emb, dim=1), dim=1).shape

torch.Size([32, 6])

A more efficient way to concatenate the embeddings is to use `view()`

In [32]:
a = torch.arange(18)
a

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17])

In [33]:
# One list of 18 numbers
a.shape

torch.Size([18])

In [34]:
# We can represent this tensor with different dimensions
# Two lists of 9 numbers
a.view((2,9))

tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8],
        [ 9, 10, 11, 12, 13, 14, 15, 16, 17]])

In [35]:
# 3 lists of 3 lists of 2 numbers
a.view([3,3,2])

tensor([[[ 0,  1],
         [ 2,  3],
         [ 4,  5]],

        [[ 6,  7],
         [ 8,  9],
         [10, 11]],

        [[12, 13],
         [14, 15],
         [16, 17]]])

`view()` is very efficent because of a tensor's `storage()`

In [36]:
# All the numbers are always stored in consecutive memory
# tensor.view() only changes the view of the memory
# No memory is copied, moved, or changed
a.storage()

  a.storage()


 0
 1
 2
 3
 4
 5
 6
 7
 8
 9
 10
 11
 12
 13
 14
 15
 16
 17
[torch.storage.TypedStorage(dtype=torch.int64, device=cpu) of size 18]

In [37]:
# So we can call view() on our input/embedding to do the same concatenation as before
# but much more efficiently
emb.view(32, 6)

tensor([[-0.1067, -0.3255, -0.1067, -0.3255, -0.1067, -0.3255],
        [-0.1067, -0.3255, -0.1067, -0.3255,  0.7099, -1.3830],
        [-0.1067, -0.3255,  0.7099, -1.3830, -0.2009, -0.5896],
        [ 0.7099, -1.3830, -0.2009, -0.5896, -0.2009, -0.5896],
        [-0.2009, -0.5896, -0.2009, -0.5896,  0.3376,  0.3419],
        [-0.1067, -0.3255, -0.1067, -0.3255, -0.1067, -0.3255],
        [-0.1067, -0.3255, -0.1067, -0.3255,  0.6248,  0.4970],
        [-0.1067, -0.3255,  0.6248,  0.4970,  0.8947,  0.9249],
        [ 0.6248,  0.4970,  0.8947,  0.9249,  0.6895,  0.9553],
        [ 0.8947,  0.9249,  0.6895,  0.9553, -0.3390,  1.2435],
        [ 0.6895,  0.9553, -0.3390,  1.2435,  0.6895,  0.9553],
        [-0.3390,  1.2435,  0.6895,  0.9553,  0.3376,  0.3419],
        [-0.1067, -0.3255, -0.1067, -0.3255, -0.1067, -0.3255],
        [-0.1067, -0.3255, -0.1067, -0.3255,  0.3376,  0.3419],
        [-0.1067, -0.3255,  0.3376,  0.3419, -0.3390,  1.2435],
        [ 0.3376,  0.3419, -0.3390,  1.2

In [38]:
# These are equal operations
emb.view(32,6) == torch.cat(torch.unbind(emb, dim=1), dim=1)

tensor([[True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, T

In [39]:
emb.view(32,6).shape

torch.Size([32, 6])

In [40]:
W1.shape

torch.Size([6, 100])

In [41]:
b1.shape

torch.Size([100])

In [42]:
# Now we can do the matrix multiplication to get our hidden layer
# We can also do  emb.view(-1,6) and PyTorch will derive what that value should be
h = emb.view(emb.shape[0],6) @ W1 + b1
h

tensor([[ 0.1273,  1.2489, -0.9927,  ...,  0.2371,  0.3973,  1.9040],
        [ 4.0179,  2.2308, -0.1354,  ..., -0.7418,  1.6091,  1.5000],
        [ 2.8780,  0.0130, -1.5041,  ..., -0.7511,  1.6324,  2.4067],
        ...,
        [-1.5841,  2.7658, -0.9163,  ...,  0.3088, -0.2343,  4.0642],
        [-1.0090,  1.5656,  1.4986,  ...,  2.0962,  0.5058, -0.4029],
        [-0.3640,  1.1712,  0.8459,  ...,  0.4099,  0.4293,  1.6749]])

In [43]:
h.shape

torch.Size([32, 100])

In [44]:
# We apply a tanh so that the values are between -1 and 1
# Each example gets a tanh applied to it
h = torch.tanh(h)
h

tensor([[ 0.1266,  0.8480, -0.7585,  ...,  0.2327,  0.3776,  0.9566],
        [ 0.9994,  0.9772, -0.1346,  ..., -0.6303,  0.9230,  0.9051],
        [ 0.9937,  0.0130, -0.9059,  ..., -0.6358,  0.9264,  0.9839],
        ...,
        [-0.9192,  0.9921, -0.7242,  ...,  0.2993, -0.2301,  0.9994],
        [-0.7654,  0.9163,  0.9049,  ...,  0.9702,  0.4666, -0.3825],
        [-0.3487,  0.8247,  0.6889,  ...,  0.3884,  0.4047,  0.9322]])

In [45]:
# Recall that we arbitrarily chose 100 nodes for the hidden layer
h.shape

torch.Size([32, 100])

In [46]:
# We should always double-check that the broadcasting that occurs
# when the bias `b` is added is correct
W1.shape

torch.Size([6, 100])

In [47]:
b1.shape

torch.Size([100])

In [48]:
# 6, 100 <-- W1
# 1, 100 <-- b1

## Constructing the Final Output Layer

In [49]:
# W1 is a [6,100], meaning it has 100 outputs
# So W2 must take in 100 outputs as inputs, and return 27 possible values
# (since we have 27 possible characters)
W2 = torch.rand((100, 27))
b2 = torch.rand(27)

In [50]:
# `logits` are the outputs of this neural net
# Recall h = tanh(emb @ W1 + b1), where `emb` contains the inputs
logits = h @ W2 + b2

In [51]:
# For each input example (32 total), the output can be one of 27 values
# (a value is generated for each of the 27 characters)
logits.shape

torch.Size([32, 27])

In [52]:
# Exponentiate the logits to get "fake counts"
counts = logits.exp()

In [53]:
# Then normalize the counts to get a probability
# dim=1 because that is the final output
probs = counts / counts.sum(dim=1, keepdims=True)

In [54]:
probs.shape

torch.Size([32, 27])

In [55]:
probs

tensor([[3.2830e-02, 2.6477e-03, 2.5297e-02, 7.1397e-04, 2.3385e-02, 5.9174e-04,
         2.5189e-03, 1.9391e-02, 4.1260e-02, 7.0951e-02, 2.8402e-02, 5.5036e-03,
         3.3958e-03, 2.6606e-02, 1.3090e-02, 2.2776e-02, 2.6900e-03, 1.5114e-04,
         2.8795e-03, 3.9758e-03, 1.4960e-01, 3.4082e-01, 1.1522e-01, 5.4229e-02,
         7.4304e-03, 3.0238e-03, 6.2127e-04],
        [7.0526e-04, 1.5738e-04, 3.8278e-03, 1.3625e-02, 3.0200e-02, 2.9946e-03,
         2.9349e-02, 1.5566e-02, 2.3605e-02, 3.9144e-03, 8.7515e-03, 2.8959e-02,
         1.1130e-02, 2.3665e-03, 2.5662e-02, 1.8296e-01, 6.2588e-03, 4.6235e-04,
         2.4473e-03, 2.4237e-01, 1.2478e-02, 2.8679e-02, 6.9121e-02, 2.2280e-01,
         4.8036e-03, 2.4091e-02, 2.7105e-03],
        [1.0336e-02, 2.3779e-04, 4.5990e-04, 1.8581e-03, 2.0557e-01, 3.7531e-04,
         1.6629e-03, 7.5845e-04, 1.9920e-03, 1.4704e-04, 4.5243e-02, 7.5215e-04,
         1.2404e-03, 4.2718e-03, 1.1525e-02, 1.4547e-02, 1.1825e-03, 1.6374e-05,
         1.1558e-

In [56]:
# Each row in probs now sum to 1
probs[0].sum()

tensor(1.)

In [57]:
# The correct answer of the next character is provided by Y
# So for each row in probs, we need to get the probability that was generated for the output character
# of each example (a row in probs)
Y

tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9,  1,  0,  1, 22,  1,  0,  9, 19,
         1,  2,  5, 12, 12,  1,  0, 19, 15, 16,  8,  9,  1,  0])

In [58]:
# Creates a list of indices to index into probs
# We want to index into each example (32 examples total)
torch.arange(32)

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31])

In [59]:
# We then use Y as a list of indicies
# This automatically grabs the probabiltiy assigned to each output character in Y, for each example
probs[torch.arange(32), Y]

tensor([5.9174e-04, 2.3665e-03, 4.2718e-03, 1.6732e-03, 7.7971e-03, 2.2776e-02,
        2.1806e-03, 4.2491e-01, 2.4570e-02, 6.6700e-01, 4.8058e-01, 2.5227e-04,
        2.6477e-03, 2.6446e-02, 6.9888e-03, 5.0306e-04, 7.0951e-02, 1.1265e-02,
        4.3044e-03, 5.1735e-03, 1.8809e-04, 8.8046e-03, 2.3793e-04, 3.2037e-02,
        3.6939e-04, 3.9758e-03, 1.9754e-03, 9.6106e-04, 9.4357e-03, 1.9646e-01,
        2.6564e-03, 1.4407e-04])

In [60]:
# The ideal goal of training is to get all the probabilities of each correct character per example is 1 (100%)

In [61]:
# Now we calculate the negative log liklihood for loss
# This is the loss we'd like to minimize
loss = -probs[torch.arange(32), Y].log().mean()
loss

tensor(5.2129)

## Complete Neural Network

In [62]:
# Dataset
X.shape, Y.shape

(torch.Size([32, 3]), torch.Size([32]))

In [63]:
# For reproducibility
g = torch.Generator().manual_seed(2147483647)

In [64]:
# 27 characters, each character has an embedding/representation of two floats
C = torch.randn((27, 2), generator=g)
# 3 characters per input, 2 embeddings per character
# 100 is an arbitrary number of nodes
W1 = torch.randn((6,100), generator=g)
b1 = torch.randn(100, generator=g)

# 100 inputs from previous layer, 27 possible characters as outputs
W2 = torch.randn((100,27), generator=g)
b2 = torch.randn(27, generator=g)

parameters = [C, W1, b1, W2, b2]

In [65]:
# Number of parameters in total
sum(p.nelement() for p in parameters)

3481

In [71]:
# Another way to calculate the loss
# More efficient forward and backward pass, and handles extreme values in logits
# Examples, large numbers exponentiated (logits.exp()) can lead to infinity as a value
#     PyTorch would handle that case
F.cross_entropy(logits, Y)

tensor(17.7697)

In [75]:
# Computes the gradients
for p in parameters:
    p.requires_grad = True

In [76]:
for iter in range(1000):
    # Forward Pass

    # Embeddings of all the characters in the input X
    emb = C[X] # shape: (32, block_size, 2)
    # Hidden layer
    # We get the embedding to a shape that is compatible with the W1 matrix
    # Activations for each of our 32 examples
    h = torch.tanh(emb.view(-1, 6) @ W1 + b1) # shape: (32, 100)
    # Get the results for each example (a probability distribution across the possible next character)
    logits = h @ W2 + b2 # Shape: (32, 27)
    # counts = logits.exp()
    # Create a probability distribution
    # Each row in prob sum to 1.0
    # prob = counts / counts.sum(1, keepdims=True)
    # Get the probability of the actual next character (given by Y)
    # The goal of training is to get the probability of the correct character close to 1.0
    # Negative log likelihood loss
    # loss = -prob[torch.arange(32), Y].log().mean()
    loss = F.cross_entropy(logits, Y)

    # Backward Pass

    # Reset the gradients
    for p in parameters:
        p.grad = None

    # Calculate the gradients    
    loss.backward()
    if iter%100 == 0:
        print(f"Iteration {iter} loss: {loss.item()}")
    
    # Update
    learning_rate = 0.1
    for p in parameters:
        # Nudge the weights in the direction of the gradient
        # Negate since we are trying to minimize the loss
        p.data += -learning_rate * p.grad

Iteration 0 loss: 17.76971435546875
Iteration 100 loss: 0.3354485332965851
Iteration 200 loss: 0.2789476215839386
Iteration 300 loss: 0.2678886950016022
Iteration 400 loss: 0.26317593455314636
Iteration 500 loss: 0.2638612389564514
Iteration 600 loss: 0.26023462414741516
Iteration 700 loss: 0.2586209177970886
Iteration 800 loss: 0.25756219029426575
Iteration 900 loss: 0.2567654848098755


The loss is decreasing easily because we are overfitting. We have over 3k parameters for just 32 examples.
We can't get loss completely to zero because there are examples where the next character could be from a possible set of characters (ex. "..." can be followed by either "e" (emma), "o" (olivia), etc.)