E01: train a trigram language model, i.e. take two characters as an input to predict the 3rd one. Feel free to use either counting or a neural net. Evaluate the loss; Did it improve over a bigram model?



In [140]:
#Using counting first, NN approach is implemented for E04

import torch
import numpy as np
import matplotlib.pyplot as plt
import torch.nn.functional as F
import random
%matplotlib inline
words = open('names.txt','r').read().splitlines()

In [141]:
N = torch.zeros((27,27,27), dtype=torch.int32)
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}

In [142]:
for w in words:
  chs = ['.']+ list(w) + ['.']
  for ch1, ch2,ch3 in zip(chs,chs[1:],chs[2:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    ix3 = stoi[ch3]
    N[ix1,ix2,ix3] +=1


In [143]:
P = (N+1).float()
P = P / P.sum(1,keepdim=True)

In [144]:
g = torch.Generator()


In [145]:
for i in range(50):
    out = []
    ix1, ix2 = 0, 0
    while True:
        p = N[ix1, ix2].float()
        if p.sum() == 0:
            p = torch.ones_like(p)
        p = p / p.sum()

        ix3 = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()

        out.append(itos[ix3])

        if ix3 == 0:
            break

        ix1, ix2 = ix2, ix3

    print(''.join(out))


cy.
intemili.
hyn.
yea.
tuliaayertlierryiah.
viele.
sta.
jayahlor.
ulaber.
liann.
gra.
novianib.
payukeianey.
evyaarsha.
morie.
xiella.
bra.
cataelia.
gus.
na.
prannustagus.
hon.
niya.
gwestes.
fosefor.
gerseon.
fiahen.
gre.
peria.
rekshayris.
iya.
zairo.
ovaniyangabenni.
ya.
pem.
den.
fane.
da.
fady.
vaya.
pren.
woluleimukenna.
phe.
praelliasian.
arivany.
la.
jiollucamr.
dalinet.
bre.
ni.


In [146]:
nlls = []
num_examples = 5

for i in range(num_examples):
    out = []
    ix1, ix2 = 0, 0
    while True:
        p = N[ix1, ix2].float()
        if p.sum() == 0:
            p = torch.ones_like(p)
        p = p / p.sum()

        ix3 = torch.multinomial(p, num_samples=1, replacement=True).item()

        out.append(itos[ix3])

        if ix3 == 0:
            break

        prob_correct = p[ix3]
        logp = torch.log(prob_correct)
        nll = -logp
        nlls.append(nll.item())

        ix1, ix2 = ix2, ix3

    print(f"Generated sequence {i+1}: {''.join(out)}")

nlls = torch.tensor(nlls)
print('=========')
print('Average negative log likelihood (loss):', nlls.mean().item())


Generated sequence 1: vignalisammon.
Generated sequence 2: deqiya.
Generated sequence 3: moran.
Generated sequence 4: lid.
Generated sequence 5: grayzarce.
Average negative log likelihood (loss): 2.5011322498321533


E02: split up the dataset randomly into 80% train set, 10% dev set, 10% test set. Train the bigram and trigram models only on the training set. Evaluate them on dev and test splits. What can you see?


In [147]:
# First, we split the datset randomly

random.seed(21)
random.shuffle(words)

n = len(words)
train_split = int(0.8 * n)
dev_split = int(0.9 * n)

train_words = words[:train_split]
dev_words = words[train_split:dev_split]
test_words = words[dev_split:]

print(f"Train size: {len(train_words)}, Dev size: {len(dev_words)}, Test size: {len(test_words)}")


Train size: 25626, Dev size: 3203, Test size: 3204


In [148]:
# Next, train the bigram

bigram_counts = torch.zeros((27, 27), dtype=torch.float32)
for word in train_words:
    chs = ['.'] + list(word) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        bigram_counts[ix1, ix2] += 1

bigram_probs = bigram_counts / bigram_counts.sum(dim=1, keepdim=True)


In [149]:
# Next, train the trigram

trigram_counts = torch.zeros((27, 27, 27), dtype=torch.float32)
for word in train_words:
    chs = ['.'] + list(word) + ['.']
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        ix3 = stoi[ch3]
        trigram_counts[ix1, ix2, ix3] += 1

trigram_probs = trigram_counts / trigram_counts.sum(dim=2, keepdim=True)


In [150]:
# Finally, we evaluate the models with negative log liklihood

def compute_loss(words, model_probs, ngram_size):
    total_loss = 0
    total_chars = 0

    for word in words:
        chs = ['.'] + list(word) + ['.']
        for i in range(len(chs) - ngram_size + 1):
            indices = [stoi[chs[j]] for j in range(i, i + ngram_size)]
            prob = model_probs[tuple(indices)].item()
            if prob > 0:
                total_loss += -np.log(prob)
            total_chars += 1

    return total_loss / total_chars

bigram_train_loss = compute_loss(train_words, bigram_probs, 2)
bigram_dev_loss = compute_loss(dev_words, bigram_probs, 2)
bigram_test_loss = compute_loss(test_words, bigram_probs, 2)

trigram_train_loss = compute_loss(train_words, trigram_probs, 3)
trigram_dev_loss = compute_loss(dev_words, trigram_probs, 3)
trigram_test_loss = compute_loss(test_words, trigram_probs, 3)

print(f"Bigram Model - Train Loss: {bigram_train_loss:.4f}, Dev Loss: {bigram_dev_loss:.4f}, Test Loss: {bigram_test_loss:.4f}")
print(f"Trigram Model - Train Loss: {trigram_train_loss:.4f}, Dev Loss: {trigram_dev_loss:.4f}, Test Loss: {trigram_test_loss:.4f}")

Bigram Model - Train Loss: 2.4542, Dev Loss: 2.4518, Test Loss: 2.4533
Trigram Model - Train Loss: 2.0567, Dev Loss: 2.0555, Test Loss: 2.0567


E03: use the dev set to tune the strength of smoothing (or regularization) for the trigram model - i.e. try many possibilities and see which one works best based on the dev set loss. What patterns can you see in the train and dev set loss as you tune this strength? Take the best setting of the smoothing and evaluate on the test set once and at the end. How good of a loss do you achieve?



In [151]:
def compute_trigram_probs_with_smoothing(counts, alpha):
    smoothed_counts = counts + alpha
    smoothed_probs = smoothed_counts / smoothed_counts.sum(dim=2, keepdim=True)
    return smoothed_probs


In [152]:

def compute_loss(words, model_probs, ngram_size):
    total_loss = 0
    total_chars = 0

    for word in words:
        chs = ['.'] + list(word) + ['.']
        for i in range(len(chs) - ngram_size + 1):
            indices = [stoi[chs[j]] for j in range(i, i + ngram_size)]
            prob = model_probs[tuple(indices)].item()  # Probability of the next character
            if prob > 0:
                total_loss += -np.log(prob)  # Negative log-likelihood
            total_chars += 1

    return total_loss / total_chars  # Average loss per character


In [153]:
alphas = np.arange(0.0,10,0.5)

train_losses = []
dev_losses = []

for alpha in alphas:
    smoothed_trigram_probs = compute_trigram_probs_with_smoothing(trigram_counts, alpha)

    train_loss = compute_loss(train_words, smoothed_trigram_probs, 3)
    dev_loss = compute_loss(dev_words, smoothed_trigram_probs, 3)

    train_losses.append(train_loss)
    dev_losses.append(dev_loss)

    print(f"Alpha: {alpha:.1f} | Train Loss: {train_loss:.4f}, Dev Loss: {dev_loss:.4f}")

best_alpha_idx = np.argmin(dev_losses)
best_alpha = alphas[best_alpha_idx]

best_trigram_probs = compute_trigram_probs_with_smoothing(trigram_counts, best_alpha)
test_loss = compute_loss(test_words, best_trigram_probs, 3)

print(f"\nBest Alpha: {best_alpha:.1f}")
print(f"Test Loss with Best Alpha: {test_loss:.4f}")

Alpha: 0.0 | Train Loss: 2.0567, Dev Loss: 2.0555
Alpha: 0.5 | Train Loss: 2.0785, Dev Loss: 2.1188
Alpha: 1.0 | Train Loss: 2.0948, Dev Loss: 2.1297
Alpha: 1.5 | Train Loss: 2.1091, Dev Loss: 2.1406
Alpha: 2.0 | Train Loss: 2.1219, Dev Loss: 2.1512
Alpha: 2.5 | Train Loss: 2.1338, Dev Loss: 2.1612
Alpha: 3.0 | Train Loss: 2.1449, Dev Loss: 2.1709
Alpha: 3.5 | Train Loss: 2.1554, Dev Loss: 2.1801
Alpha: 4.0 | Train Loss: 2.1653, Dev Loss: 2.1889
Alpha: 4.5 | Train Loss: 2.1748, Dev Loss: 2.1974
Alpha: 5.0 | Train Loss: 2.1838, Dev Loss: 2.2056
Alpha: 5.5 | Train Loss: 2.1925, Dev Loss: 2.2136
Alpha: 6.0 | Train Loss: 2.2008, Dev Loss: 2.2212
Alpha: 6.5 | Train Loss: 2.2089, Dev Loss: 2.2287
Alpha: 7.0 | Train Loss: 2.2166, Dev Loss: 2.2359
Alpha: 7.5 | Train Loss: 2.2242, Dev Loss: 2.2429
Alpha: 8.0 | Train Loss: 2.2315, Dev Loss: 2.2497
Alpha: 8.5 | Train Loss: 2.2386, Dev Loss: 2.2564
Alpha: 9.0 | Train Loss: 2.2454, Dev Loss: 2.2629
Alpha: 9.5 | Train Loss: 2.2521, Dev Loss: 2.2692


In [154]:
# Tried many alphas, any smoothing makes loss worse

E04: we saw that our 1-hot vectors merely select a row of W, so producing these vectors explicitly feels wasteful. Can you delete our use of F.one_hot in favor of simply indexing into rows of W?


In [155]:
xs, ys = [], []
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        xs.append(ix1)
        ys.append(ix2)
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()
print('number of examples: ', num)


number of examples:  228146


In [180]:
g = torch.Generator()
W = torch.randn((27, 27), generator=g, requires_grad=True)


In [181]:
# Gradient descent
for k in range(500):
    logits = W[xs]
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdims=True)
    loss = -probs[torch.arange(num), ys].log().mean() + 0.01 * (W**2).mean()

    W.grad = None
    loss.backward()

    W.data += -50 * W.grad
print(loss.item())


2.4807074069976807


In [184]:
# Sampling
g = torch.Generator()

for i in range(50):
    out = []
    ix = 0
    while True:
        logits = W[ix]  # Directly access the row of W for the current index
        counts = logits.exp()
        probs = counts / counts.sum()
        ix = torch.multinomial(probs, num_samples=1, replacement=True, generator=g).item()
        out.append(itos[ix])
        if ix == 0:
            break
    print(''.join(out))


ce.
iatelfigghyn.
yla.
tuliaayerttierarie.
maieyn.
stanira.
klon.
mi.
bl.
khialolerar.
zananibrig.
bleianey.
e.
vaamale.
morina.
igh.
a.
bra.
ca.
ahanaroushie.
rananusengua.
lonaniyl.
reren.
sxfosufoh.
mmyssonn.
safelya.
eri.
giaereksaayrilaia.
dilieo.
k.
ckiykagammynisha.
kemad.
coc.
ne.
kl.
jade.
kayaman.
n.
tolelosmay.
katten.
.
kalellirsiamar.
mardy.
a.
cevoleusann.
diestelli.
t.
dishlinatararobariyan.
e.
jaileshalamh.


E05: look up and use F.cross_entropy instead. You should achieve the same result. Can you think of why we'd prefer to use F.cross_entropy instead?


In [195]:

xs, ys = [], []
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        xs.append(ix1)
        ys.append(ix2)
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()

g = torch.Generator().manual_seed(99)
W = torch.randn((27, 27), generator=g, requires_grad=True)


In [196]:

for k in range(50):
    logits = W[xs]
    loss = F.cross_entropy(logits, ys) + 0.01 * (W**2).mean()

    W.grad = None
    loss.backward()

    W.data += -50 * W.grad

print(loss.item())

2.5087413787841797


E06: meta-exercise! Think of a fun/interesting exercise and complete it.


In [None]:
# Idea: Fixed characters for start/stop, that is, given i, j, generates names of the form "i-----j"
# Issue: This turned out to generate nonsense that was too long, any length contrainsts also sucked
# Solution - added a bias \alpha towards the stop character
# \alpha is arbitrary, and depends too heavily on the particular start/stop, maybe fix later

In [216]:
#First, with bigram counts, then with a NN

def interpolate_name(model_probs, start_char, end_char, max_len=20, alpha=0.5):
    start_idx = stoi[start_char]
    end_idx = stoi[end_char]
    current_idx = start_idx
    name = [start_char]
    steps = 0

    while steps < max_len and current_idx != end_idx:
        probs = model_probs[current_idx].float()

        # Zero out probabilities for start/stop sequence
        probs[stoi['.']] = 0
        probs = probs / probs.sum()

        # Interpolate toward the end character
        bias = torch.zeros_like(probs)
        bias[end_idx] = 1.0  # Full bias toward the end character
        probs = (1 - alpha) * probs + alpha * bias  # Weighted combination
        probs = probs / probs.sum()

        current_idx = torch.multinomial(probs, num_samples=1).item()
        name.append(itos[current_idx])

        steps += 1

    return ''.join(name)


In [217]:
# Train the bigram model
bigram_counts = torch.zeros((27, 27), dtype=torch.float32)
for word in words:
    chs = ['.'] + list(word) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        bigram_counts[ix1, ix2] += 1

bigram_probs = bigram_counts / bigram_counts.sum(dim=1, keepdim=True)


In [231]:
# Generate names starting from 'a' and stopping at 'h'
start_char = 'a'
end_char = 'h'
for i in range(10):
    alpha = 0.15  # Hand chosen :/
    name = interpolate_name(bigram_probs, start_char, end_char, alpha=alpha)
    print(f"{name}")

anstih
aleth
aliirawanah
ah
aieladolh
atlesh
alynih
anysadenvalanih
assomiaderh
ah


In [249]:
## Neural Net approach

def interpolate_name_nn(W, start_char, end_char, max_len=20, alpha=0.5):

    start_idx = stoi[start_char]
    end_idx = stoi[end_char]
    current_idx = start_idx
    name = [start_char]
    steps = 0

    while steps < max_len and current_idx != end_idx:
        logits = W[current_idx].clone()

        logits[stoi['.']] = float('-inf')
        logits = logits - logits.max()

        probs = logits.exp()
        probs = probs / probs.sum()

        bias = torch.zeros_like(probs)
        bias[end_idx] = 1.0
        probs = (1 - alpha) * probs + alpha * bias
        probs = probs / probs.sum()

        current_idx = torch.multinomial(probs, num_samples=1).item()
        name.append(itos[current_idx])

        steps += 1

    return ''.join(name)

xs, ys = [], []
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        xs.append(ix1)
        ys.append(ix2)
xs = torch.tensor(xs)
ys = torch.tensor(ys)
num = xs.nelement()
print('number of examples: ', num)


number of examples:  228146


In [250]:
g = torch.Generator()
W = torch.randn((27, 27), generator=g, requires_grad=True)

In [251]:
for k in range(50):
    logits = W[xs]
    loss = F.cross_entropy(logits, ys) + 0.01 * (W**2).mean()
    W.grad = None
    loss.backward()
    W.data += -50 * W.grad
print(loss.item())

2.5101189613342285


In [255]:
# Generate names between 'a' and 'h'
start_char = 'a'
end_char = 'h'
for i in range(10):
    alpha = 0.2  # Random interpolation strength
    name = interpolate_name_nn(W, start_char, end_char, alpha=alpha)
    print(f"{name}")

anndh
avynsantcarh
airah
angineleh
ah
aynalecenadeh
alleleynnh
ah
annah
ananapbllalh
