In [1]:
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F

from karpathy_nn.makemore.data.load_data import load_names


In [2]:
# Load the list of names used throughout the notebook and peek at the first ten.
words = load_names()
words[:10]


['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [3]:
# Total number of names in the dataset.
len(words)


32033

In [4]:
# Length of the shortest name.
min(map(len, words))


2

In [5]:
# Length of the longest name.
max(map(len, words))


15

Now we are simply going to count how often any of the combinations of two characters occurs in the training set.

In [6]:
# Tally every pair of adjacent tokens over all names.
bigram_counter = {}

for word in words:
    # A name is a plain string; wrap its characters in start/end markers.
    # Angle brackets are the customary NLP notation for special tokens.
    tokens = ["<S>", *word, "<E>"]

    # Pair each token with its successor (the last token has none).
    for position in range(len(tokens) - 1):
        bigram = tokens[position], tokens[position + 1]
        if bigram in bigram_counter:
            bigram_counter[bigram] += 1
        else:
            bigram_counter[bigram] = 1


In [7]:
# Bigrams ordered from most to least frequent (ties keep their insertion order)
sorted(bigram_counter.items(), key=lambda item: -item[1])


[(('n', '<E>'), 6763),
 (('a', '<E>'), 6640),
 (('a', 'n'), 5438),
 (('<S>', 'a'), 4410),
 (('e', '<E>'), 3983),
 (('a', 'r'), 3264),
 (('e', 'l'), 3248),
 (('r', 'i'), 3033),
 (('n', 'a'), 2977),
 (('<S>', 'k'), 2963),
 (('l', 'e'), 2921),
 (('e', 'n'), 2675),
 (('l', 'a'), 2623),
 (('m', 'a'), 2590),
 (('<S>', 'm'), 2538),
 (('a', 'l'), 2528),
 (('i', '<E>'), 2489),
 (('l', 'i'), 2480),
 (('i', 'a'), 2445),
 (('<S>', 'j'), 2422),
 (('o', 'n'), 2411),
 (('h', '<E>'), 2409),
 (('r', 'a'), 2356),
 (('a', 'h'), 2332),
 (('h', 'a'), 2244),
 (('y', 'a'), 2143),
 (('i', 'n'), 2126),
 (('<S>', 's'), 2055),
 (('a', 'y'), 2050),
 (('y', '<E>'), 2007),
 (('e', 'r'), 1958),
 (('n', 'n'), 1906),
 (('y', 'n'), 1826),
 (('k', 'a'), 1731),
 (('n', 'i'), 1725),
 (('r', 'e'), 1697),
 (('<S>', 'd'), 1690),
 (('i', 'e'), 1653),
 (('a', 'i'), 1650),
 (('<S>', 'r'), 1639),
 (('a', 'm'), 1634),
 (('l', 'y'), 1588),
 (('<S>', 'l'), 1572),
 (('<S>', 'c'), 1542),
 (('<S>', 'e'), 1531),
 (('j', 'a'), 1473),
 (

We will store this data as a matrix, in which the rows correspond to the first character, the columns correspond to the second one, and the entry at that location gives how often the second character follows the first one.

In [8]:
# Derive the alphabet from the data instead of hard-coding it: a very small
# dataset is not guaranteed to contain every letter.
chars = sorted(set("".join(words)))

# Letters get indices 0..len(chars)-1; the two special tokens come right after.
string_to_integer = dict(zip(chars, range(len(chars))))
string_to_integer["<S>"] = len(chars)
string_to_integer["<E>"] = len(chars) + 1

# Reverse lookup: index -> token.
integer_to_string = {index: token for token, index in string_to_integer.items()}

# One row and one column per token, including the two special tokens.
co_occurrence_matrix = torch.zeros((len(chars) + 2, len(chars) + 2), dtype=torch.int32)

for word in words:
    # Translate the wrapped name into indices, then count each adjacent pair.
    indices = [string_to_integer[token] for token in ["<S>", *word, "<E>"]]
    for idx1, idx2 in zip(indices[:-1], indices[1:]):
        co_occurrence_matrix[idx1, idx2] += 1


In [9]:
# Draw the count matrix as a heat map and annotate every cell with its
# bigram and its count.
fig = plt.figure(figsize=(16, 16))
plt.imshow(co_occurrence_matrix, cmap="Blues")

# Shared text styling: horizontally centred, gray.
label_style = dict(ha="center", color="gray")

num_rows, num_cols = co_occurrence_matrix.shape
for i in range(num_rows):
    first_token = integer_to_string[i]
    for j in range(num_cols):
        # The two labels share the anchor (j, i) and differ only in their
        # vertical alignment: "bottom" for the bigram, "top" for the count.
        plt.text(j, i, first_token + integer_to_string[j], va="bottom", **label_style)
        plt.text(j, i, co_occurrence_matrix[i, j].item(), va="top", **label_style)

plt.axis("off")
plt.savefig("bigram_visualization.pdf")
plt.close(fig)


The last row is a row of zeros, because the end token will never be followed by any other token. The penultimate column is also a column of zeros, because the start token will never follow any other token. Let's make our plot prettier. Instead of having a separate start of word and end of word token, we will use the "." token for both of these.

In [10]:
# Derive the alphabet from the data instead of hard-coding it: a very small
# dataset is not guaranteed to contain every letter.
chars = sorted(set("".join(words)))

# Letters are shifted up by one so that index 0 is free for the "." token.
string_to_integer = {char: position for position, char in enumerate(chars, start=1)}
string_to_integer["."] = 0

# Reverse lookup: index -> token.
integer_to_string = {index: token for token, index in string_to_integer.items()}

# Only a single special token (".") is added on top of the letters here.
co_occurrence_matrix = torch.zeros((len(chars) + 1, len(chars) + 1), dtype=torch.int32)
num_tokens = co_occurrence_matrix.shape[0]

for word in words:
    # "." marks both the beginning and the end of a name.
    indices = [string_to_integer[token] for token in [".", *word, "."]]
    for idx1, idx2 in zip(indices[:-1], indices[1:]):
        co_occurrence_matrix[idx1, idx2] += 1


In [11]:
# Same visualization as before, now for the matrix built with the "." token.
fig = plt.figure(figsize=(16, 16))
plt.imshow(co_occurrence_matrix, cmap="Blues")

for i, row_counts in enumerate(co_occurrence_matrix):
    for j, count in enumerate(row_counts):
        bigram_string = integer_to_string[i] + integer_to_string[j]

        # Both labels are anchored at (j, i), centred horizontally; they differ
        # only in vertical alignment ("bottom" for the bigram, "top" for the count).
        plt.text(j, i, bigram_string, ha="center", va="bottom", color="gray")
        plt.text(j, i, count.item(), ha="center", va="top", color="gray")

plt.axis("off")
plt.savefig("bigram_visualization.pdf")
plt.close(fig)


.. never happens because we don't have empty words. First row corresponds to word beginnings, first column corresponds to word endings.

This matrix has all the information necessary for us to sample from the bigram model.

In [12]:
g_cpu = torch.Generator().manual_seed(2147483647)

# Convert the integer counts to float once, before sampling.
co_occurrence_matrix_float = co_occurrence_matrix.float()

for _ in range(20):
    out_list = []
    idx = 0  # index 0 is the "." token, i.e. the start of a name
    while True:
        # The row of raw counts is passed directly as sampling weights, so no
        # explicit normalization is done here. If we needed probabilities we could do
        # >>> probability_matrix = co_occurrence_matrix / co_occurrence_matrix.sum(dim=1, keepdims=True)
        # (the division converts to float automatically).
        #
        # Side note: by construction, every row sum equals the matching column sum,
        # because a token is the first element of exactly as many bigrams as it is
        # the second element of. This also holds for "." since start and end share
        # the same token.
        idx = torch.multinomial(
            co_occurrence_matrix_float[idx],
            num_samples=1,
            replacement=True,
            generator=g_cpu,
        ).item()

        # Drawing "." again means the name is finished.
        if idx == 0:
            break
        out_list.append(integer_to_string[idx])

    print("".join(out_list))


mor
axx
minaymoryles
kondlaisah
anchshizarie
odaren
iaddash
h
jhinatien
egushl
h
br
a
jayn
ilemannariaenien
be
f
akiinela
trttanakeroruceyaaxatona
lamoynayrkiedengin


As we can see, the bigram language model is pretty terrible. We can also see that "h" is both a likely beginning and ending of a name (and that's exactly why we get multiple generated "h"s).

To see that the bigram model is still a bit more plausible than sampling random characters, we can compare to the output below.

In [13]:
g_cpu = torch.Generator().manual_seed(2147483647)

# Uniform weights over all tokens (left unnormalized). The vector never
# changes, so it is built once instead of on every sampling step.
probability_vector = torch.ones(num_tokens)

for _ in range(20):
    out_list = []
    idx = 0
    while True:
        # Every token, including the terminating ".", is equally likely.
        idx = torch.multinomial(
            input=probability_vector, num_samples=1, replacement=True, generator=g_cpu
        ).item()

        # Index 0 is the "." token: the name is finished.
        if idx == 0:
            break
        out_list.append(integer_to_string[idx])

    print("".join(out_list))


qvsaayxbqrqmyqwuznivanukotdjvdhd
qnoymtzduqkatdetkpfjdgigvlejfkrsqlwnirghhzwlu
idcx
cekmzucjnjoeovjvrggqrjr
cfbhabkslpokc
xtxwbpmknuusxdgzfexhwqpldpdnwzvkyxsqjforqqpfxstwkfoufhvwfhmsuyyotvcvvqpfcbydjcouhkajkhqnnpqmmllaordqy
gszpw
zlgijinangzzuulsyvqrufuawavsdbnwvlmrypvgrsfgpshgnmwafqmsjdvbhngvoiigxhkwdltrdkwnagzyknqv
lfstdqigvncdoidetsukgdp
cfpjsxeqjcsmjwguzes
woflfjxflylgbegpjdpovdtw
dlzysqtrbhxhcdneiuum
xtyslfbmaboaanyjpojuujflcsaucqcgtjmlzqtbaisvxrtgupkppigxudejdzsroqeigovuxmvt
jlxfolkozci
tkhdivkdifaxcevlpktkwwvuxlymtwylgpzauwdvxfvbooflddphmjeomjgjcqeqwt

wlxclcjbm
quuyijtnzmycshclormjyrerqslomdrlbuwqnlmitbrmqhtbdwbyvlsmwnborwcdhjotezwnsxuvffvinrmedelubhdfgtavxqfgmnyqrygyevxaapbjtnwfnwewqxerdytttvfo
iauarz
tynoqkyp


Much worse, right? But the bigram model is still quite terrible. Btw, the *parameters* of our bigram language model are stored in the ``co_occurrence_matrix``. The training was to compute these co-occurrences. Inference is just the loop we implemented above.

Now we want to evaluate the quality of the model. We want to summarize the model's performance into a single number. How good is our model at predicting the training set, for example? This will give us the training loss.

Now we actually need the normalized probabilities, so I wasn't that smart with the optimization. :)

In [14]:
# Divide every row of counts by its total so that each row sums to one.
row_totals = co_occurrence_matrix.sum(dim=1, keepdim=True)
probability_matrix = co_occurrence_matrix / row_totals

In [15]:
# Print the conditional probability p(token2 | token1) for every bigram
# of the first three names.
for word in words[:3]:
    tokens = [".", *word, "."]

    # Pair every token with the one that follows it.
    for token1, token2 in zip(tokens[:-1], tokens[1:]):
        row = string_to_integer[token1]
        col = string_to_integer[token2]
        bigram_probability = probability_matrix[row, col]

        print(f"{token1}{token2}: {bigram_probability:.4f}")

.e: 0.0478
em: 0.0377
mm: 0.0253
ma: 0.3899
a.: 0.1960
.o: 0.0123
ol: 0.0780
li: 0.1777
iv: 0.0152
vi: 0.3541
ia: 0.1381
a.: 0.1960
.a: 0.1377
av: 0.0246
va: 0.2495
a.: 0.1960


If we sampled uniformly at random, we would have $1/27$ as all of the probabilities above, which is roughly $4\%$. We can see that the probabilities above are mostly higher than that, thus, our model is doing a better job than random guessing. If we had a very good model, we would expect these probabilities to be near 1, because then the model is correctly predicting what is next on the training set.

How to summarize the probabilities into a single number? We can take the product of all the probabilities, as that is the likelihood of the entire dataset under the model. (This is because for this model, we assume independence between bigrams.) Therefore, under a good model, we expect to have a high likelihood for the dataset.

Of course, working with the product of many probabilities is never a good idea considering numerical stability. It is much more stable to consider the log-likelihood of the dataset, which allows us to work with the sum of log-probabilities
of the form $\log p(\texttt{idx2} \mid \texttt{idx1})$.

In [16]:
# Sum the log-probabilities of all bigrams in the dataset and count them.
log_likelihood = 0
n = 0
for word in words:
    indices = [string_to_integer[token] for token in [".", *word, "."]]

    # Each consecutive index pair addresses one entry p(idx2 | idx1).
    for idx1, idx2 in zip(indices[:-1], indices[1:]):
        log_likelihood += torch.log(probability_matrix[idx1, idx2])
        n += 1

print(f"{log_likelihood = }")

# Negating the log-likelihood gives a quantity to minimize.
nll = -log_likelihood

print(f"{nll = }")
print(f"{nll / n = }")

log_likelihood = tensor(-559891.7500)
nll = tensor(559891.7500)
nll / n = tensor(2.4541)


Our goal is to maximize the likelihood of the data wrt. the model parameters (defined by the co-occurrence matrix in the bigram model case). The probabilities will not be stored explicitly later on, they will rather be calculated by a neural network whose weights we want to tweak in order to maximize the data likelihood. Maximizing the data likelihood is equivalent to minimizing a scaled version of the negative log-likelihood (average instead of sum), which is what people most often do.

In [17]:
# Score a single made-up name and show the log-probability of each of its bigrams.
log_likelihood = 0
n = 0
for word in ["andrejq"]:
    tokens = [".", *word, "."]

    for position in range(len(tokens) - 1):
        token1, token2 = tokens[position], tokens[position + 1]
        bigram_log_probability = torch.log(
            probability_matrix[string_to_integer[token1], string_to_integer[token2]]
        )

        log_likelihood += bigram_log_probability
        n += 1

        print(f"{token1}{token2}: {bigram_log_probability:.4f}")

print(f"{log_likelihood = }")

# Negating the log-likelihood gives a quantity to minimize.
nll = -log_likelihood

print(f"{nll = }")
print(f"{nll / n = }")


.a: -1.9829
an: -1.8296
nd: -3.2594
dr: -2.5620
re: -2.0127
ej: -5.9171
jq: -inf
q.: -2.2736
log_likelihood = tensor(-inf)
nll = tensor(inf)
nll / n = tensor(inf)


Here jq is infinitely unlikely according to our model (because the training data did not contain that bigram), therefore we get an infinite loss. People usually don't like that, so they use *model smoothing*. This adds some fake counts to make all bigrams at least a tiny bit plausible. The usual choice is a count of one added to each combination, but we can add as many counts as we like. The more we add, the more uniform the model becomes.

In [18]:
# Smoothing: add one fake count to every bigram, then renormalize each row.
co_occurrence_matrix_p1 = co_occurrence_matrix + 1
smoothed_row_totals = co_occurrence_matrix_p1.sum(dim=1, keepdim=True)
probability_matrix = co_occurrence_matrix_p1 / smoothed_row_totals


In [19]:
# Score the same made-up name again, now with whatever `probability_matrix`
# currently holds.
log_likelihood = 0
n = 0
for word in ["andrejq"]:
    tokens = [".", *word, "."]
    indices = [string_to_integer[token] for token in tokens]

    # Walk over the tokens and their indices in lockstep, one bigram at a time.
    for token1, token2, idx1, idx2 in zip(tokens, tokens[1:], indices, indices[1:]):
        bigram_log_probability = torch.log(probability_matrix[idx1, idx2])

        log_likelihood += bigram_log_probability
        n += 1

        print(f"{token1}{token2}: {bigram_log_probability:.4f}")

print(f"{log_likelihood = }")

# Negating the log-likelihood gives a quantity to minimize.
nll = -log_likelihood

print(f"{nll = }")
print(f"{nll / n = }")


.a: -1.9835
an: -1.8302
nd: -3.2594
dr: -2.5646
re: -2.0143
ej: -5.9004
jq: -7.9817
q.: -2.3331
log_likelihood = tensor(-27.8672)
nll = tensor(27.8672)
nll / n = tensor(3.4834)


With this modification, jq is still very unlikely but not totally unlikely.

Now, we arrived at this model by doing something that felt sensible: counting and normalizing counts. We will now cast the problem of bigram character level language modeling into the neural network framework. We are going to approach things slightly differently but end up in a very similar spot.

Our neural network will still be a bigram character level language model, so it receives a single character as input and it outputs the probability distribution over the next character in the sequence. We will optimize the model's parameters on the dataset using gradient-based optimization such that it assigns high probability to the tokens that actually follow the given token in the dataset.

In [20]:
# Build the training set of bigrams as (input, label) index pairs.
#
# A single name already yields several examples; "emma" gives five:
#   . -> e,  e -> m,  m -> m,  m -> a,  a -> .
# Both the input and the label are token categories stored as integers,
# and the examples can be stacked into mini-batches.
xs, ys = [], []

for word in words[:1]:  # Only the first name, to keep things small
    indices = [string_to_integer[token] for token in [".", *word, "."]]

    # Inputs are all tokens but the last; labels are all tokens but the first.
    xs.extend(indices[:-1])
    ys.extend(indices[1:])

xs = torch.tensor(xs)
ys = torch.tensor(ys)
        

Note: ``torch.tensor`` infers the dtype automatically, while ``torch.Tensor`` returns a ``torch.FloatTensor``. The recommendation is to stick to ``torch.tensor``, whose dtype we can explicitly modify when needed.

It doesn't make sense to have a neuron act directly on an integer input. The integer value itself doesn't have any meaning: there is no ordering between the tokens. A common way of encoding categories is the one-hot encoding. Here, both the inputs and the outputs will be one-hot encoded.

In [21]:
# Randomly initialize the weights of num_tokens neurons.
# Each neuron receives num_tokens inputs; the neurons are independent of
# each other in the forward pass.
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((num_tokens, num_tokens), generator=g)

# One-hot encode the input tokens. F.one_hot has no dtype argument,
# so the result is cast to float afterwards.
xenc = F.one_hot(xs, num_classes=num_tokens).to(torch.float32)

# Each column of W holds one neuron's weights, so the matrix product computes
# the dot product of every input example with every neuron's weight vector
# in one go.
logits = torch.matmul(xenc, W)  # Interpreted as log-counts (unnormalized log-probabilities)
counts = torch.exp(logits)  # Plays the role of the counts (unnormalized probabilities)

# Normalize each row to get a probability distribution over the next character.
probs = counts / counts.sum(dim=1, keepdim=True)

# Exponentiating and then normalizing the logits is the softmax operation.


In [22]:
# Walk through every training example, print what the net predicts for it,
# and collect the per-example negative log-likelihoods.
nlls = torch.zeros(len(xs))

# Iterate over all examples rather than a hard-coded count, so the loop always
# matches the size of `nlls` (a mismatch would either raise an IndexError or
# leave zeros in `nlls` that skew the mean).
for i in range(len(xs)):
    # ith bigram:
    x = xs[i].item()  # Input character index
    y = ys[i].item()  # Label character index
    print("-" * 8)
    print(
        f"Bigram example {i + 1}: {integer_to_string[x]}{integer_to_string[y]} "
        f"(indices {x}, {y})"
    )
    print("Input to the neural net:", x)
    print("Output probabilities from the neural net:", probs[i])
    print("Label (actual next character):", y)

    # Probability the net assigned to the character that actually came next.
    prob = probs[i, y]
    print("Probability assigned by the net to the correct character:", prob.item())
    log_prob = prob.log()
    print("Log-likelihood:", log_prob.item())
    nll = -log_prob
    print("Negative log-likelihood (duh):", nll.item())
    nlls[i] = nll

print("=" * 8)
print("Average per-sample negative log-likelihood, i.e., loss = ", nlls.mean().item())

--------
Bigram example 1: .e (indices 0, 5)
Input to the neural net: 0
Output probabilities from the neural net: tensor([0.0607, 0.0100, 0.0123, 0.0042, 0.0168, 0.0123, 0.0027, 0.0232, 0.0137,
        0.0313, 0.0079, 0.0278, 0.0091, 0.0082, 0.0500, 0.2378, 0.0603, 0.0025,
        0.0249, 0.0055, 0.0339, 0.0109, 0.0029, 0.0198, 0.0118, 0.1537, 0.1459])
Label (actual next character): 5
Probability assigned by the net to the correct character: 0.012286253273487091
Log-likelihood: -4.3992743492126465
Negative log-likelihood (duh): 4.3992743492126465
--------
Bigram example 2: em (indices 5, 13)
Input to the neural net: 5
Output probabilities from the neural net: tensor([0.0290, 0.0796, 0.0248, 0.0521, 0.1989, 0.0289, 0.0094, 0.0335, 0.0097,
        0.0301, 0.0702, 0.0228, 0.0115, 0.0181, 0.0108, 0.0315, 0.0291, 0.0045,
        0.0916, 0.0215, 0.0486, 0.0300, 0.0501, 0.0027, 0.0118, 0.0022, 0.0472])
Label (actual next character): 13
Probability assigned by the net to the correct character:

Let's optimize W to decrease the NLL loss value as much as possible.

In [23]:
# Build the training set from all names: inputs are the first tokens of the
# bigrams, labels the second tokens.
xs, ys = [], []

for word in words:
    indices = [string_to_integer[token] for token in [".", *word, "."]]
    xs.extend(indices[:-1])
    ys.extend(indices[1:])

xs = torch.tensor(xs)
ys = torch.tensor(ys)
num_elements = xs.numel()
print("Number of examples:", num_elements)

# Initialize the "network": a single weight matrix that tracks gradients.
g = torch.Generator().manual_seed(2147483647)
W = torch.randn((num_tokens, num_tokens), generator=g, requires_grad=True)


Number of examples: 228146


In [24]:
num_epochs = 500
learning_rate = 50  # !!! Unusually large, yet the printed losses keep decreasing

# The inputs never change between epochs, so encode them (and build the
# row-index tensor used to pick the label probabilities) once up front.
xenc = F.one_hot(xs, num_classes=num_tokens).float()
example_indices = torch.arange(len(xs))

for epoch in range(num_epochs):
    # Forward pass: softmax over the logits, then the mean negative
    # log-probability assigned to the correct next tokens.
    logits = xenc @ W
    counts = logits.exp()
    probs = counts / counts.sum(dim=1, keepdims=True)
    loss = -probs[example_indices, ys].log().mean()  # + 0.01 * (W**2).mean()
    print(loss.item())

    # Backward pass
    # Resetting the gradient to None has the same effect as zeroing it,
    # without the extra fill.
    W.grad = None
    loss.backward()

    # Update: do the in-place step under no_grad so autograd does not track it
    # (preferred over writing through `W.data`).
    with torch.no_grad():
        W -= learning_rate * W.grad


3.758953332901001
3.371100664138794
3.154043197631836
3.020373821258545
2.927711248397827
2.8604021072387695
2.8097290992736816
2.7701022624969482
2.7380728721618652
2.711496591567993
2.6890032291412354
2.6696884632110596
2.65293025970459
2.638277292251587
2.6253881454467773
2.613990545272827
2.60386323928833
2.5948216915130615
2.5867116451263428
2.579403877258301
2.572789192199707
2.5667760372161865
2.5612881183624268
2.5562589168548584
2.551633596420288
2.547365665435791
2.5434155464172363
2.5397486686706543
2.5363364219665527
2.5331544876098633
2.5301806926727295
2.5273966789245605
2.5247862339019775
2.522334575653076
2.520029067993164
2.517857789993286
2.515810489654541
2.513878345489502
2.512051820755005
2.510324001312256
2.5086867809295654
2.5071349143981934
2.5056610107421875
2.5042612552642822
2.5029289722442627
2.5016608238220215
2.5004520416259766
2.4992988109588623
2.498197317123413
2.497144937515259
2.496137857437134
2.495173215866089
2.4942493438720703
2.49336314201355
2.4

Remember: When we optimized just by counting, our loss was originally 2.454. Here we expect a very similar loss, as we are doing something similar, just by gradient-based optimization (the actual job of W doesn't have to be counting, only in this abstract way).

Note that we are not taking any additional information compared to the first setting, we just want to predict the next character based on the previous one. Instead of counting and normalizing, we are doing this prediction based on gradient-based learning. It just so happens that the original explicit approach optimizes the loss function very well, without any need for gradient-based optimization. This is because the setup for bigram language models is so simple, so we can just estimate the joint probabilities in a table. The gradient-based approach is significantly more flexible, though. We can make the neural net much more complex, and we can also take in multiple previous tokens. But the outputs of the NN will always be logits, and the loss will also stay the same. As we complexify the neural nets and work our way up to transformers, no fundamental change is being made.

For the original bigram model, it's not so straightforward how we could have extended it to multiple input tokens, because eventually the tables would get way too large (as there are too many combinations of what previous characters could be: the size of the probability tensor grows exponentially in the context size). This is an unscalable approach, whereas the NN-based bigram approach is very much scalable.
- Take one token as the context: 27 possibilities (26 letters plus the "." token).
- Take two tokens as the context: 27<sup>2</sup> = 729 possibilities. The matrix would have 729 rows (or a new axis).
- Take three tokens as the context: 27<sup>3</sup> = 19683 possibilities. The matrix would have this many rows, which clearly doesn't scale well with the context size. It also doesn't work very well.
- With NNs, however, we can get very expressive models that predict correct conditionals based on a longer context.

Note: ``xenc`` is a one-hot vector. Multiplying it with ``W`` just selects the ith row of ``W``, i.e., only the ith weight of each neuron matters for calculating the pre-activations. We then simply call softmax on this row of ``W``. But that's exactly what happened before in the first bigram formulation. The first character of the bigram was used as a lookup into the ``probability_matrix`` to get the probability distribution we need. ``W.exp()`` plays the role of ``co_occurrence_matrix`` (up to a per-row scaling factor, which the normalization removes): ``W`` contains the pseudo log-counts. ``co_occurrence_matrix`` was filled in by counting, whereas the ``W`` matrix was filled in by gradient-based optimization. This is also why we obtain roughly the same loss value at the end of training.

Note: At the smoothing step of the first approach we added fake counts to make the distributions more uniform. The gradient-based framework has an equivalent to smoothing: When ``W`` elements are all equal to each other, the probabilities become completely uniform. Trying to incentivize ``W`` to be constant is equivalent to label smoothing. The more we incentivize it in the loss function, the more we smooth the labels. We can pick that constant to be 0, and we can force the ``W`` matrix to be close to the 0 matrix. This is what *regularization* is for. We can augment the loss function by a small component that we call the regularization term. The $L_2$ regularization takes all elements of ``W``, squares them and sums them together. A hyperparameter $\lambda$ can control the strength of the regularization. The regularization strength is analogous to the number of fake counts we add.

Finally, we want to sample from this "neural net".

In [25]:
g = torch.Generator().manual_seed(2147483647)

# Sampling is pure inference: W requires grad, so without no_grad every
# step would build an autograd graph that is never used.
with torch.no_grad():
    for _ in range(5):
        out_list = []
        idx = 0  # index 0 is the "." token, i.e. the start of a name

        while True:
            # Before: prob = probability_matrix[idx]
            # Now: run the current token through the net (one-hot -> logits -> softmax).
            xenc = F.one_hot(torch.tensor([idx]), num_classes=num_tokens).float()
            logits = xenc @ W
            counts = logits.exp()
            prob = counts / counts.sum(dim=1, keepdims=True)

            idx = torch.multinomial(
                prob, num_samples=1, replacement=True, generator=g
            ).item()

            # Drawing "." again means the name is finished.
            if idx == 0:
                break

            out_list.append(integer_to_string[idx])

        print("".join(out_list))

mor
axx
minaymoryles
kondlaisah
anchshizarie


This is exactly the same result as the first approach gave! This just shows that the two approaches are very similar, and our hand-crafted matrix is just as good as the optimized matrix. If we train the gradient-based approach for fewer epochs, we can see deviations.