In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt



# Getting the data

In [2]:
# download the names.txt file from github

import requests

url = "https://raw.githubusercontent.com/karpathy/makemore/master/names.txt"

In [3]:
response = requests.get(url)

if response.status_code == 200:
    with open("names.txt", "w") as file:
        file.write(response.text)
    print("Le fichier a été téléchargé avec succès.")
else:
    print("Le téléchargement du fichier a échoué")

Le fichier a été téléchargé avec succès.


In [4]:
# response.text.splitlines() allows us to directly get the list but in the case I'm working offline, I download locally the file

In [5]:
#with open("names.txt", "r") as file:
#    raw_names = file.read()

raw_names = open('names.txt', 'r').read()

In [6]:
names = raw_names.splitlines()
print(len(names))
names[:5]

32033


['emma', 'olivia', 'ava', 'isabella', 'sophia']

# Building the vocabulary of characters and mapping it

In [7]:
characters = sorted(list(set(''.join(name for name in names))))
char_to_int = {char:i+1 for i, char in enumerate(characters)}
char_to_int["."] = 0 #special character to begin of put an end to a sequence
int_to_char = {i:char for char, i in char_to_int.items()}

# Building the dataset

In [8]:
context_size = 4 # number of precedents char to take into account

def build_dataset(words, context_size):
    X = []
    y = []

    for name in names:
        context = [0] * context_size

        for char in name + '.':
            idx = char_to_int[char]
            X.append(context)
            y.append(idx)

            #print(f"{''.join(int_to_char[i] for i in context)}")

            context = context[1:] + [idx]

    X, y = torch.tensor(X), torch.tensor(y)
    print(f"X.shape : {X.shape}, y.shape : {y.shape}")

    return X, y

In [9]:
X, y = build_dataset(names, context_size)

X.shape : torch.Size([228146, 4]), y.shape : torch.Size([228146])


In [10]:
import random
random.seed(42)
random

n1 = int(0.8 * X.shape[0])
n2 = int(0.9 * X.shape[0])

Xtr, Xval, Xtst = X.tensor_split((n1, n2), dim=0)
ytr, yval, ytst = y.tensor_split((n1, n2), dim=0)

In [11]:
print(f"Xtr.shape : {Xtr.shape}, Xval.shape : {Xval.shape}, Xtst.shape : {Xtst.shape}")

Xtr.shape : torch.Size([182516, 4]), Xval.shape : torch.Size([22815, 4]), Xtst.shape : torch.Size([22815, 4])


# Designing the MLP

In [53]:
g = torch.Generator().manual_seed(42) # for reproductibility

In [54]:
C = torch.rand([27, 2], generator=g) # characters to vectors
W1 = torch.rand([4*2, 100], generator=g) # putting in the MLP 4 characters and each one of them has 2 dimensionnal vector associated
b1 = torch.rand(100, generator=g)
W2 = torch.rand([100, 27], generator=g)
b2 = torch.rand(27, generator=g)

# Train the model

In [55]:
emb = C[X]
print(emb.shape)
emb.view(-1, 8).shape

torch.Size([228146, 4, 2])


torch.Size([228146, 8])

In [47]:
step1 = torch.tanh(emb.view(-1, 8) @ W1 + b1)
step1.shape

torch.Size([228146, 100])

In [50]:
step2 = step1 @ W2 + b2
step2.shape # just before applying softmax

torch.Size([228146, 27])

In [51]:
logits = step2

In [52]:
logits

tensor([[49.3166, 51.0630, 48.5743,  ..., 56.3222, 50.4795, 47.2554],
        [49.0209, 50.7765, 48.2821,  ..., 56.0272, 50.1285, 46.9653],
        [48.9999, 50.6780, 48.2724,  ..., 55.9436, 50.1138, 46.9002],
        ...,
        [47.5051, 49.1143, 46.9756,  ..., 54.2297, 48.5662, 45.5596],
        [45.6855, 47.2214, 45.2956,  ..., 52.3446, 46.6939, 43.9058],
        [47.6967, 49.4407, 47.1371,  ..., 54.5449, 48.8628, 45.8645]])

In [56]:
loss = F.cross_entropy(logits, y)

In [57]:
loss

tensor(7.6273)