## Créer des embeddings de tokens


La dernière étape de préparation du texte d'entrée pour entraîner un LLM consiste à convertir les identifiants de tokens en vecteurs d'embedding.


In [1]:
import torch
from gptlight.datasets import fetch_verdict_text
from gptlight.tokenizer import GPTTokenizer
from gptlight.utils import create_dataloader

In [2]:
# Load the text that the dataloader cell later consumes as `txt`
# (presumably the raw "The Verdict" corpus, going by the helper's name — confirm).
raw_text = fetch_verdict_text()

# Instantiate the project's GPT tokenizer.
tokenizer = GPTTokenizer()

In [3]:
# Seed the global RNG first so the randomly initialised weights are reproducible.
torch.manual_seed(123)

# Toy lookup table: 12 possible ids, each mapped to a 5-dimensional vector.
embedding_layer = torch.nn.Embedding(12, 5)

# The weight matrix has one row per id (12 rows) and one column per dimension (5).
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.3035, -0.5880,  0.3486],
        [ 0.6603, -0.2196, -0.3792,  0.7671, -1.1925],
        [ 0.6984, -1.4097,  0.1794,  1.8951,  0.4954],
        [ 0.2692, -0.0770, -1.0205, -0.1690,  0.9178],
        [ 1.5810,  1.3010,  1.2753, -0.2010,  0.4965],
        [-1.5723,  0.9666, -1.1481, -1.1589,  0.3255],
        [-0.6315, -2.8400, -1.3250,  0.1784, -2.1338],
        [ 1.0524, -0.3885, -0.9343, -0.4991, -1.0867],
        [ 0.8805,  1.5542,  0.6266, -0.1755,  1.3111],
        [-0.2199,  0.2190,  0.2045,  0.5146,  0.9938],
        [-0.2587, -1.0826,  0.1036, -2.1996, -0.0885],
        [-0.5612,  0.6716,  0.6933, -0.9487, -0.0765]], requires_grad=True)


In [4]:
# Applying the layer to ids 2, 3 and 4 returns rows 2, 3 and 4 of its weight
# matrix: an embedding layer is a row lookup.
lookup_ids = torch.tensor([2, 3, 4])
print(embedding_layer(lookup_ids))

tensor([[ 0.6984, -1.4097,  0.1794,  1.8951,  0.4954],
        [ 0.2692, -0.0770, -1.0205, -0.1690,  0.9178],
        [ 1.5810,  1.3010,  1.2753, -0.2010,  0.4965]],
       grad_fn=<EmbeddingBackward0>)


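- Encoder les positions des tokens : la couche d'embedding associe le même vecteur à un identifiant de token quelle que soit sa position dans la séquence ; on ajoute donc à chaque embedding de token un embedding de position.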


In [5]:
# Re-seed so the token embedding weights are reproducible across runs.
torch.manual_seed(123)

# vocab_size: number of distinct token ids the layer can look up
#   (presumably the GPT-2 BPE vocabulary size — confirm against GPTTokenizer).
# output_dim: size of each embedding vector.
vocab_size, output_dim = 50257, 256
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [6]:
# Number of token ids per input sequence.
max_length = 4

# The stride equals the window length; in the recorded output each row starts
# right where the previous one ended, i.e. the windows do not overlap.
dataloader = create_dataloader(
    txt=raw_text, batch_size=8, max_length=max_length, stride=max_length, shuffle=False
)

# Pull only the first batch for inspection. In the recorded output, `targets`
# is `inputs` shifted one token to the left.
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

print(f"Inputs : \n {inputs}")
print(f"Targets : \n {targets}")
print(f"Inputs shape : {inputs.shape}")


Inputs : 
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
Targets : 
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])
Inputs shape : torch.Size([8, 4])


In [7]:
# Look up one embedding vector per token id in the batch. The result keeps the
# shape of `inputs` and appends the embedding dimension (see the printed shape).
token_embeddings = token_embedding_layer(inputs)
print(f"Tokens Embeddings Shape : {token_embeddings.shape}")

Tokens Embeddings Shape : torch.Size([8, 4, 256])


In [8]:
# Within a batch, an input sequence holds at most `max_length` tokens (the
# DataLoader's window size), so that many positions need an embedding.
context_length = max_length

# One learnable vector per position, with the same width as the token embeddings
# so the two can be added together.
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

# Embed every position index 0 .. context_length - 1.
position_ids = torch.arange(context_length)
pos_embeddings = pos_embedding_layer(position_ids)
print(pos_embeddings.shape)

torch.Size([4, 256])


In [9]:
# Add the positional embeddings to the token embeddings. Per the recorded
# outputs, `pos_embeddings` is (4, 256) and `token_embeddings` is (8, 4, 256),
# so the positional vectors are broadcast across the leading batch dimension.
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])
