In [16]:
# Split up the dataset randomly into 80% train set, 10% dev set, 10% test set.
# Train the bigram and trigram models only on the training set. Evaluate them on dev and test splits. What can you see?
import random # shuffle the list of words to get an even distribution

words = open('../names.txt', 'r').read().splitlines()
random.seed(230)
random.shuffle(words)
train_set = words[:int(len(words)*0.8)]
dev_set = words[int(len(words)*0.8):int(len(words)*0.9)]
test_set = words[int(len(words)*0.9):]


In [15]:
# Map the words to indexes
chars = sorted(list(set(''.join(words)))) # get the unique characters through the set() method
stoi = {s:i +1 for i,s in enumerate(chars)} # string to index
stoi['.'] = 0 # end character

itos = {i:s for s,i in stoi.items()} # index to string
itos

{1: 'a',
 2: 'b',
 3: 'c',
 4: 'd',
 5: 'e',
 6: 'f',
 7: 'g',
 8: 'h',
 9: 'i',
 10: 'j',
 11: 'k',
 12: 'l',
 13: 'm',
 14: 'n',
 15: 'o',
 16: 'p',
 17: 'q',
 18: 'r',
 19: 's',
 20: 't',
 21: 'u',
 22: 'v',
 23: 'w',
 24: 'x',
 25: 'y',
 26: 'z',
 0: '.'}

## Let's now implement the Bigram Model & see how our model performs

In [30]:
import torch

xs, ys = [], []

for w in train_set:
    with_breaks = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(with_breaks, with_breaks[1:]):
        index_one = stoi[ch1]
        index_two = stoi[ch2]

        xs.append(index_one) # this is the training set for the neural network
        ys.append(index_two)

xs = torch.tensor(xs) # convert to tensor
ys = torch.tensor(ys)

batch_size = xs.nelement()

In [33]:
import torch.nn.functional as F

xenc = F.one_hot(xs).float() # one hot encode the inputs
batch_size

182699

In [36]:
import torch.nn.functional as F

g = torch.Generator().manual_seed(2147483647)
weights = torch.randn(27, 27, requires_grad=True, generator=g) # create a random weight matrix

for k in range(40):
    #forward pass
    logits = xenc @ weights # matrix multiplication, give us the log counts
    counts = logits.exp() # convert to counts, get rid of the negatives
    probs = counts / counts.sum(1, keepdims=True)
    loss = -probs[torch.arange(batch_size), ys].log().mean() + 0.01*(weights**2).sum() # L2 regularization. The weights want to be zero, when it's not there is loss so you want to squash
    print(loss.item())

    # backward pass
    weights.grad = None
    loss.backward()

    # update
    weights.data += -50 * weights.grad


10.805068016052246
3.2928354740142822
3.1858861446380615
3.181519031524658
3.1809442043304443
3.180788993835449
3.1807479858398438
3.180734395980835
3.180729866027832
3.1807281970977783
3.180727958679199
3.180727958679199
3.180727958679199
3.180727958679199
3.18072772026062
3.180727481842041
3.18072772026062
3.180727481842041
3.18072772026062
3.180727481842041
3.180727481842041
3.18072772026062
3.18072772026062
3.18072772026062
3.18072772026062
3.18072772026062
3.18072772026062
3.180727958679199
3.18072772026062
3.180727481842041
3.18072772026062
3.180727481842041
3.180727958679199
3.18072772026062
3.180727958679199
3.18072772026062
3.180727958679199
3.18072772026062
3.180727958679199
3.180727481842041


In [39]:
# Convert my dev set to indices so that I can evaluate the model
dev_xs, dev_ys = [], []

for w in dev_set:
    with_breaks = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(with_breaks, with_breaks[1:]):
        index_one = stoi[ch1]
        index_two = stoi[ch2]

        dev_xs.append(index_one) # this is the training set for the neural network
        dev_ys.append(index_two)

dev_xs = torch.tensor(dev_xs) # convert to tensor
dev_ys = torch.tensor(dev_ys)

dev_batch_size = xs.nelement()

In [50]:
dev_xs # contains the indices of the dev set

tensor([ 0,  5, 13,  ..., 12,  9,  1])

In [61]:
# Now let's sample from the neural net using the dev set
# We'll develop an accuracy metric

correct_predictions = 0
total_predictions = 0

# Assuming dev_xs and dev_ys contain corresponding character indices from the dev set
for i in range(len(dev_xs)):
    current_index = dev_xs[i].item()
    next_true_index = dev_ys[i].item()

    # Encode the current index
    xenc = F.one_hot(torch.tensor([current_index]), num_classes=27).float()

    # Do a forward pass
    logits = xenc @ weights
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdims=True)

    # Get the predicted next character index
    predicted_index = torch.argmax(probs, dim=1).item()  # Using argmax for deterministic prediction here

    # Update accuracy counters
    correct_predictions += (predicted_index == next_true_index)
    total_predictions += 1

# Calculate and print accuracy
accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
print(f"Accuracy: {accuracy:.4f}")


Accuracy: 0.2273
