In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read in all the names, as a list of names
words = open("names.txt", "r").read().splitlines()
words[:8]

['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']

In [3]:
print(f"{len(words)} words")

32033 words


In [4]:
# Build character to integer mapping
# First we combine all the names into one string, then create a set() (this keeps unique characters only)
# Then we sort alphabetically
chars = sorted(set("".join(words)))

# String to integer
stoi = {c:i+1 for (i, c) in enumerate(chars)}
stoi['.'] = 0
stoi

{'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26,
 '.': 0}

In [5]:
# Integer to string
itos = {i:c for (c,i) in stoi.items()}
itos

{1: 'a',
 2: 'b',
 3: 'c',
 4: 'd',
 5: 'e',
 6: 'f',
 7: 'g',
 8: 'h',
 9: 'i',
 10: 'j',
 11: 'k',
 12: 'l',
 13: 'm',
 14: 'n',
 15: 'o',
 16: 'p',
 17: 'q',
 18: 'r',
 19: 's',
 20: 't',
 21: 'u',
 22: 'v',
 23: 'w',
 24: 'x',
 25: 'y',
 26: 'z',
 0: '.'}

## Build the Dataset

In [6]:
# Context length: how many previous characters do we use to predict the next character
block_size = 3

# Inputs to the neural net (the previous characters seen)
X = []
# The labels of the inputs to the neural net (the next character)
Y = []

# Create a dataset, mapping `block_size` characters -> next character
for word in words[:5]:
    print(word)
    
    # Initialize the context with empty characters
    context = [stoi["."]] * block_size
    # We want the last characters of the name to be included in the dataset
    # which is why we append "." to the end of the name
    for w in word + ".":
        # Get the next character
        ix = stoi[w]
        
        # Save the input and output
        X.append(context)
        Y.append(ix)
            
        # Print the example
        print("".join(itos[i] for i in context) + " ---> " + itos[ix])
        
        # Remove the earliest character in the context, and append the newest character
        # as the latest character in the context
        context = context[1:] + [ix]

# Inputs
X = torch.tensor(X)
# Labels
Y = torch.tensor(Y)

emma
... ---> e
..e ---> m
.em ---> m
emm ---> a
mma ---> .
olivia
... ---> o
..o ---> l
.ol ---> i
oli ---> v
liv ---> i
ivi ---> a
via ---> .
ava
... ---> a
..a ---> v
.av ---> a
ava ---> .
isabella
... ---> i
..i ---> s
.is ---> a
isa ---> b
sab ---> e
abe ---> l
bel ---> l
ell ---> a
lla ---> .
sophia
... ---> s
..s ---> o
.so ---> p
sop ---> h
oph ---> i
phi ---> a
hia ---> .


In [7]:
# We have x many input examples, with each with a context size `block_size`
print(f"Input shape: {X.shape}, {X.dtype}")

Input shape: torch.Size([32, 3]), torch.int64


In [8]:
# The next character is one of 27 possible characters
print(f"Labels shape: {Y.shape}, {Y.dtype}")

Labels shape: torch.Size([32]), torch.int64


## Building the Embedding Table `C`
In the paper, 17,000 possible words are crammed into a 30-dimensional space.
We have 27 possible characters, so let's cram them into a 2-dimensional space.

In [9]:
# 27 possible characters, 2-dimensional space
# Each character has a 2-dimenional embedding
C = torch.randn((27, 2))
C

tensor([[-9.2731e-01,  1.4308e+00],
        [-3.6150e-01, -6.0263e-01],
        [ 1.6107e+00, -4.1672e-02],
        [-9.0850e-01, -7.9790e-01],
        [ 1.9119e-01, -1.4500e-01],
        [-5.9361e-01, -9.5892e-01],
        [ 1.0253e-01,  4.1094e-01],
        [-6.1766e-01, -4.0082e-01],
        [-5.4639e-01,  3.3811e-01],
        [ 5.2021e-01,  6.2468e-01],
        [ 8.3759e-02, -6.9934e-04],
        [ 7.8323e-01,  3.6488e-01],
        [-4.6051e-01,  4.7479e-02],
        [-2.3226e-01,  9.8454e-01],
        [-9.0988e-01,  1.1672e+00],
        [-7.4115e-02,  5.3692e-01],
        [ 6.8232e-01,  3.2136e-01],
        [-2.4315e+00, -2.1637e-01],
        [-8.2748e-01, -1.1725e-01],
        [-5.4597e-01, -2.4455e+00],
        [-1.0707e+00,  1.9573e-01],
        [-1.6855e+00,  1.1341e+00],
        [-1.8326e+00,  5.5144e-01],
        [-7.0988e-01,  5.1479e-01],
        [-1.0982e+00,  7.6035e-01],
        [ 2.0533e+00, -3.2587e-01],
        [ 6.2168e-01,  8.6846e-01]])

In [10]:
c = 'g'
c_index = stoi[c]
print(f"Character {c} maps to {c_index}")
C[c_index]

Character g maps to 7


tensor([-0.6177, -0.4008])

In [11]:
# Indexing into the embedding table `C` is the same as matrix multiplying
# `C` with the one-hot encoding representation of the input character
v = F.one_hot(torch.tensor(c_index), num_classes=27)
v.dtype

torch.int64

In [12]:
# Need to cast the vector to a float since the embedding table C contains floats
v.float() @ C

tensor([-0.6177, -0.4008])

Embedding a single character into the embedding table `C` is easy. Just use the integer representation of that character, and index into `C`. But how do we simultaneously embed `[32,3]` (32 examples, each of size 3, stored in array `X`) into `C`?

In addition to integers, we can use lists to index into `C`.

In [13]:
# This gets the rows at index 2, 3, and 4
C[[2,3,4]]

tensor([[ 1.6107, -0.0417],
        [-0.9085, -0.7979],
        [ 0.1912, -0.1450]])

In [14]:
# We can also index using Tensors
C[torch.tensor([2,3,4])]

tensor([[ 1.6107, -0.0417],
        [-0.9085, -0.7979],
        [ 0.1912, -0.1450]])

In [15]:
# We can also get the same row multiple times
C[[2,2,2]]

tensor([[ 1.6107, -0.0417],
        [ 1.6107, -0.0417],
        [ 1.6107, -0.0417]])

In [16]:
# Recall, X contains the characters as integers as input
X

tensor([[ 0,  0,  0],
        [ 0,  0,  5],
        [ 0,  5, 13],
        [ 5, 13, 13],
        [13, 13,  1],
        [ 0,  0,  0],
        [ 0,  0, 15],
        [ 0, 15, 12],
        [15, 12,  9],
        [12,  9, 22],
        [ 9, 22,  9],
        [22,  9,  1],
        [ 0,  0,  0],
        [ 0,  0,  1],
        [ 0,  1, 22],
        [ 1, 22,  1],
        [ 0,  0,  0],
        [ 0,  0,  9],
        [ 0,  9, 19],
        [ 9, 19,  1],
        [19,  1,  2],
        [ 1,  2,  5],
        [ 2,  5, 12],
        [ 5, 12, 12],
        [12, 12,  1],
        [ 0,  0,  0],
        [ 0,  0, 19],
        [ 0, 19, 15],
        [19, 15, 16],
        [15, 16,  8],
        [16,  8,  9],
        [ 8,  9,  1]])

In [17]:
# We can index into the embedding table `C` using multi-dimensional tensors too
# The character integers in X is used as the indices into C
print("X.shape", X.shape)
C[X]

X.shape torch.Size([32, 3])


tensor([[[-0.9273,  1.4308],
         [-0.9273,  1.4308],
         [-0.9273,  1.4308]],

        [[-0.9273,  1.4308],
         [-0.9273,  1.4308],
         [-0.5936, -0.9589]],

        [[-0.9273,  1.4308],
         [-0.5936, -0.9589],
         [-0.2323,  0.9845]],

        [[-0.5936, -0.9589],
         [-0.2323,  0.9845],
         [-0.2323,  0.9845]],

        [[-0.2323,  0.9845],
         [-0.2323,  0.9845],
         [-0.3615, -0.6026]],

        [[-0.9273,  1.4308],
         [-0.9273,  1.4308],
         [-0.9273,  1.4308]],

        [[-0.9273,  1.4308],
         [-0.9273,  1.4308],
         [-0.0741,  0.5369]],

        [[-0.9273,  1.4308],
         [-0.0741,  0.5369],
         [-0.4605,  0.0475]],

        [[-0.0741,  0.5369],
         [-0.4605,  0.0475],
         [ 0.5202,  0.6247]],

        [[-0.4605,  0.0475],
         [ 0.5202,  0.6247],
         [-1.8326,  0.5514]],

        [[ 0.5202,  0.6247],
         [-1.8326,  0.5514],
         [ 0.5202,  0.6247]],

        [[-1.8326,  0

In [18]:
# [32, 3] was the shape of the input X, then each input has an embedding of 2
C[X].shape

torch.Size([32, 3, 2])

In [19]:
# Get the 2nd character in the 7th example
example_index = 7
character_index = 2
X[example_index,character_index]

tensor(12)

In [20]:
# Get the character representation of the integer
itos[X[example_index,character_index].item()]

'l'

In [21]:
# Get the *embedding*of the 2nd character in the 7th example
C[X][example_index,character_index]

tensor([-0.4605,  0.0475])

In [22]:
# C[12] is equivalent to C[X[7,2]]
C[12]

tensor([-0.4605,  0.0475])

In [23]:
# We created our embedding table integrated with our example inputs!
emb = C[X]
emb.shape

torch.Size([32, 3, 2])

## Contructing the Hidden Layer

In [49]:
# Weights
# 3 characters per input, and each characters has two floats that represent it (it's embedding)
# 3*2 = 6 outputs in the first layer
# So the hidden layer has to take in 6 inputs
# The number of out put nodes is a variable, we arbitrary choose 100 for now
W1 = torch.randn((6,100))
# Biases, should match the size of the hidden layer
b1 = torch.randn(100)

In [25]:
# The goal is to be able to do:
# emb @ W1 + b
# with `@` indicating matrix multiply
# and `emb` as our input
# We can't do this directly since their shapes don't match correctly
# The goal is to get the input/embedding of shape [32,6]
emb.shape

torch.Size([32, 3, 2])

In [26]:
W1.shape

torch.Size([6, 100])

In [27]:
# To do this, we can concatenate the three characters and their embeddings
# This grabs all the examples, indexes that into index 0 (first character)
# then grabs all the embeddings of the first character for all the examples
emb[:, 0, :]

tensor([[-0.9273,  1.4308],
        [-0.9273,  1.4308],
        [-0.9273,  1.4308],
        [-0.5936, -0.9589],
        [-0.2323,  0.9845],
        [-0.9273,  1.4308],
        [-0.9273,  1.4308],
        [-0.9273,  1.4308],
        [-0.0741,  0.5369],
        [-0.4605,  0.0475],
        [ 0.5202,  0.6247],
        [-1.8326,  0.5514],
        [-0.9273,  1.4308],
        [-0.9273,  1.4308],
        [-0.9273,  1.4308],
        [-0.3615, -0.6026],
        [-0.9273,  1.4308],
        [-0.9273,  1.4308],
        [-0.9273,  1.4308],
        [ 0.5202,  0.6247],
        [-0.5460, -2.4455],
        [-0.3615, -0.6026],
        [ 1.6107, -0.0417],
        [-0.5936, -0.9589],
        [-0.4605,  0.0475],
        [-0.9273,  1.4308],
        [-0.9273,  1.4308],
        [-0.9273,  1.4308],
        [-0.5460, -2.4455],
        [-0.0741,  0.5369],
        [ 0.6823,  0.3214],
        [-0.5464,  0.3381]])

In [28]:
# This gets the embeddings of all the first characters in each example
emb[:, 0, :].shape

torch.Size([32, 2])

In [29]:
# We have to concatenate the embeddings of all the characters
# Each emb[:, x, :] has shape [32, 2]
# and we want to concatenate cross the dim=1 to get [32,6]
torch.cat([emb[:, 0, :], emb[:, 1, :], emb[:, 1, :]], dim=1).shape

torch.Size([32, 6])

In [30]:
# Hard-coding the concatenation wouldn't work for other block sizes
# So we generalize using torch.unbind
# For block_size 3, torch.unbind(emb, dim=1) == [emb[:, 0, :], emb[:, 1, :], emb[:, 1, :]]
torch.unbind(emb, dim=1)

(tensor([[-0.9273,  1.4308],
         [-0.9273,  1.4308],
         [-0.9273,  1.4308],
         [-0.5936, -0.9589],
         [-0.2323,  0.9845],
         [-0.9273,  1.4308],
         [-0.9273,  1.4308],
         [-0.9273,  1.4308],
         [-0.0741,  0.5369],
         [-0.4605,  0.0475],
         [ 0.5202,  0.6247],
         [-1.8326,  0.5514],
         [-0.9273,  1.4308],
         [-0.9273,  1.4308],
         [-0.9273,  1.4308],
         [-0.3615, -0.6026],
         [-0.9273,  1.4308],
         [-0.9273,  1.4308],
         [-0.9273,  1.4308],
         [ 0.5202,  0.6247],
         [-0.5460, -2.4455],
         [-0.3615, -0.6026],
         [ 1.6107, -0.0417],
         [-0.5936, -0.9589],
         [-0.4605,  0.0475],
         [-0.9273,  1.4308],
         [-0.9273,  1.4308],
         [-0.9273,  1.4308],
         [-0.5460, -2.4455],
         [-0.0741,  0.5369],
         [ 0.6823,  0.3214],
         [-0.5464,  0.3381]]),
 tensor([[-0.9273,  1.4308],
         [-0.9273,  1.4308],
         [-0

In [31]:
torch.cat(torch.unbind(emb, dim=1), dim=1).shape

torch.Size([32, 6])

A more efficient way to concatenate the embeddings is to use `view()`

In [32]:
a = torch.arange(18)
a

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17])

In [33]:
# One list of 18 numbers
a.shape

torch.Size([18])

In [34]:
# We can represent this tensor with different dimensions
# Two lists of 9 numbers
a.view((2,9))

tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8],
        [ 9, 10, 11, 12, 13, 14, 15, 16, 17]])

In [35]:
# 3 lists of 3 lists of 2 numbers
a.view([3,3,2])

tensor([[[ 0,  1],
         [ 2,  3],
         [ 4,  5]],

        [[ 6,  7],
         [ 8,  9],
         [10, 11]],

        [[12, 13],
         [14, 15],
         [16, 17]]])

`view()` is very efficent because of a tensor's `storage()`

In [36]:
# All the numbers are always stored in consecutive memory
# tensor.view() only changes the view of the memory
# No memory is copied, moved, or changed
a.storage()

  a.storage()


 0
 1
 2
 3
 4
 5
 6
 7
 8
 9
 10
 11
 12
 13
 14
 15
 16
 17
[torch.storage.TypedStorage(dtype=torch.int64, device=cpu) of size 18]

In [37]:
# So we can call view() on our input/embedding to do the same concatenation as before
# but much more efficiently
emb.view(32, 6)

tensor([[-0.9273,  1.4308, -0.9273,  1.4308, -0.9273,  1.4308],
        [-0.9273,  1.4308, -0.9273,  1.4308, -0.5936, -0.9589],
        [-0.9273,  1.4308, -0.5936, -0.9589, -0.2323,  0.9845],
        [-0.5936, -0.9589, -0.2323,  0.9845, -0.2323,  0.9845],
        [-0.2323,  0.9845, -0.2323,  0.9845, -0.3615, -0.6026],
        [-0.9273,  1.4308, -0.9273,  1.4308, -0.9273,  1.4308],
        [-0.9273,  1.4308, -0.9273,  1.4308, -0.0741,  0.5369],
        [-0.9273,  1.4308, -0.0741,  0.5369, -0.4605,  0.0475],
        [-0.0741,  0.5369, -0.4605,  0.0475,  0.5202,  0.6247],
        [-0.4605,  0.0475,  0.5202,  0.6247, -1.8326,  0.5514],
        [ 0.5202,  0.6247, -1.8326,  0.5514,  0.5202,  0.6247],
        [-1.8326,  0.5514,  0.5202,  0.6247, -0.3615, -0.6026],
        [-0.9273,  1.4308, -0.9273,  1.4308, -0.9273,  1.4308],
        [-0.9273,  1.4308, -0.9273,  1.4308, -0.3615, -0.6026],
        [-0.9273,  1.4308, -0.3615, -0.6026, -1.8326,  0.5514],
        [-0.3615, -0.6026, -1.8326,  0.5

In [38]:
# These are equal operations
emb.view(32,6) == torch.cat(torch.unbind(emb, dim=1), dim=1)

tensor([[True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, True, True],
        [True, True, True, True, T

In [39]:
emb.view(32,6).shape

torch.Size([32, 6])

In [40]:
W1.shape

torch.Size([6, 100])

In [50]:
b1.shape

torch.Size([100])

In [51]:
# Now we can do the matrix multiplication to get our hidden layer
# We can also do  emb.view(-1,6) and PyTorch will derive what that value should be
h = emb.view(emb.shape[0],6) @ W1 + b1
h

tensor([[-1.3747,  0.0097, -0.2228,  ..., -0.0455,  3.2644,  0.1557],
        [ 1.2660, -0.8733,  3.0157,  ..., -1.6933,  5.1096, -1.6967],
        [-2.5756, -0.6274, -0.3910,  ..., -4.0954, -1.4742,  2.1312],
        ...,
        [-2.3773, -0.3995,  1.9806,  ..., -1.8449,  1.0093, -0.2069],
        [-1.7430, -2.0088,  1.1688,  ..., -2.2488,  0.8505, -0.2493],
        [-0.7574, -0.5350,  3.0834,  ..., -2.5627,  2.2206, -1.1542]])

In [52]:
h.shape

torch.Size([32, 100])

In [53]:
# We apply a tanh so that the values are between -1 and 1
# Each example gets a tanh applied to it
h = torch.tanh(h)
h

tensor([[-0.8798,  0.0097, -0.2191,  ..., -0.0455,  0.9971,  0.1545],
        [ 0.8527, -0.7030,  0.9952,  ..., -0.9346,  0.9999, -0.9350],
        [-0.9885, -0.5563, -0.3722,  ..., -0.9994, -0.9004,  0.9722],
        ...,
        [-0.9829, -0.3795,  0.9626,  ..., -0.9513,  0.7655, -0.2040],
        [-0.9406, -0.9646,  0.8239,  ..., -0.9780,  0.6913, -0.2443],
        [-0.6395, -0.4892,  0.9958,  ..., -0.9882,  0.9767, -0.8192]])

In [47]:
# Recall that we arbitrarily chose 100 nodes for the hidden layer
h.shape

torch.Size([32, 100])

In [48]:
# We should always double-check that the broadcasting that occurs
# when the bias `b` is added is correct
W1.shape

torch.Size([6, 100])

In [54]:
b1.shape

torch.Size([100])

In [55]:
# 6, 100 <-- W1
# 1, 100 <-- b1

## Constructing the Final Output Layer

In [56]:
# W1 is a [6,100], meaning it has 100 outputs
# So W2 must take in 100 outputs as inputs, and return 27 possible values
# (since we have 27 possible characters)
W2 = torch.rand((100, 27))
b2 = torch.rand(27)

In [57]:
# `logits` are the outputs of this neural net
# Recall h = tanh(emb @ W1 + b1), where `emb` contains the inputs
logits = h @ W2 + b2

In [58]:
# For each input example (32 total), the output can be one of 27 values
# (a value is generated for each of the 27 characters)
logits.shape

torch.Size([32, 27])

In [59]:
# Exponentiate the logits to get "fake counts"
counts = logits.exp()

In [60]:
# Then normalize the counts to get a probability
# dim=1 because that is the final output
probs = counts / counts.sum(dim=1, keepdims=True)

In [62]:
probs.shape

torch.Size([32, 27])

In [63]:
probs

tensor([[3.9618e-04, 1.2329e-02, 1.2944e-01, 4.9393e-03, 2.0572e-01, 2.8765e-04,
         5.2524e-02, 4.9635e-03, 1.1636e-04, 1.5191e-02, 2.4567e-04, 2.0490e-02,
         2.5583e-01, 1.6568e-02, 5.1502e-03, 5.3125e-04, 2.5418e-03, 9.2257e-04,
         7.6164e-04, 1.0799e-01, 1.3748e-02, 1.4347e-03, 1.0768e-01, 1.5232e-02,
         1.4127e-02, 1.3523e-04, 1.0701e-02],
        [4.9310e-04, 5.9830e-02, 9.0055e-03, 2.1468e-03, 7.1099e-02, 1.2801e-02,
         7.2743e-03, 1.3667e-02, 8.6591e-03, 1.8418e-04, 4.1736e-03, 2.1978e-01,
         5.9718e-03, 1.1478e-01, 2.2109e-03, 6.6828e-03, 2.7476e-02, 6.4243e-03,
         2.6111e-03, 4.7357e-02, 8.1693e-03, 6.2603e-03, 1.1429e-01, 7.2644e-03,
         1.9630e-01, 7.3732e-04, 4.4357e-02],
        [3.0622e-04, 3.5605e-05, 1.2429e-01, 3.2011e-04, 2.5475e-01, 5.3287e-04,
         3.1936e-03, 6.3487e-03, 5.4305e-06, 1.6503e-03, 2.4819e-05, 4.9178e-03,
         9.9153e-02, 1.0857e-03, 5.2974e-04, 9.8552e-03, 2.2048e-02, 6.3814e-04,
         8.0811e-

In [64]:
# Each row in probs now sum to 1
probs[0].sum()

tensor(1.0000)

In [65]:
# The correct answer of the next character is provided by Y
# So for each row in probs, we need to get the probability that was generated for the output character
# of each example (a row in probs)
Y

tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9,  1,  0,  1, 22,  1,  0,  9, 19,
         1,  2,  5, 12, 12,  1,  0, 19, 15, 16,  8,  9,  1,  0])

In [66]:
# Creates a list of indices to index into probs
# We want to index into each example (32 examples total)
torch.arange(32)

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31])

In [67]:
# We then use Y as a list of indicies
# This automatically grabs the probabiltiy assigned to each output character in Y, for each example
probs[torch.arange(32), Y]

tensor([2.8765e-04, 1.1478e-01, 1.0857e-03, 4.7808e-05, 1.1000e-03, 5.3125e-04,
        4.7332e-02, 1.7001e-03, 1.4636e-02, 2.1056e-02, 1.8331e-03, 3.3086e-04,
        1.2329e-02, 1.2238e-01, 1.0410e-03, 5.7472e-03, 1.5191e-02, 1.1333e-01,
        4.1266e-01, 5.9662e-02, 3.2234e-04, 1.2041e-02, 4.2785e-04, 4.3027e-05,
        9.1726e-04, 1.0799e-01, 3.7572e-02, 7.0738e-03, 4.2445e-03, 6.2252e-03,
        1.7957e-03, 4.7157e-04])

In [68]:
# The ideal goal of training is to get all the probabilities of each correct character per example is 1 (100%)

In [70]:
# Now we calculate the negative log liklihood for loss
# This is the loss we'd like to minimize
loss = -probs[torch.arange(32), Y].log().mean()
loss

tensor(5.4410)

## Complete Neural Network

In [71]:
# Dataset
X.shape, Y.shape

(torch.Size([32, 3]), torch.Size([32]))

In [72]:
# For reproducibility
g = torch.Generator().manual_seed(2147483647)

In [77]:
# 27 characters, each character has an embedding/representation of two floats
C = torch.randn((27, 2), generator=g)
# 3 characters per input, 2 embeddings per character
# 100 is an arbitrary number of nodes
W1 = torch.randn((6,100), generator=g)
b1 = torch.randn(100, generator=g)

# 100 inputs from previous layer, 27 possible characters as outputs
W2 = torch.randn((100,27), generator=g)
b2 = torch.randn(27, generator=g)

parameters = [C, W1, b1, W2, b2]

In [78]:
# Number of parameters in total
sum(p.nelement() for p in parameters)

3481