# Creating token embeddings

Illustration of how the token ID to embedding vector conversion works with a hands on example.

Suppose we have the following four tokens with IDs 2, 3, 5 and 1.

In [21]:
import torch
import tiktoken

In [22]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

For the sake of simplicity we are going to use a small vocabulary of 6 words(instead of 50,257 words in the BPE tokenizer vocabulary), and we want to create embeddings of size 3(in GPT-3 the embedding size is 12,288 dimentions).


vocab: Each of these 6 words will be mapped in R3

quick-->4

fox-->0

is-->3

in-->2

the-->5

house-->1

torch.nn.Embedding return a simple lookup table that stores embeddings of a fixed dictionary and size.

In [23]:
vocab_size=6
output_dim=3
input_ids = torch.tensor([2,3,5,1])

torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim) #creates a dictionary
#this initialize the weights of the embedding matrix in a random manner

In [24]:
print(embedding_layer.weight)
#these are the initial weights which needs to be optimized during LLM training as part of the LLM optmization itself.

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [25]:
print(embedding_layer(torch.tensor([3])))

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)


In [19]:
print(embedding_layer(input_ids))

# a look up operation that retrieves rows from the embedding layer weight matrix using a token ID.

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)


## Positional embeddings (Encoding word positions):

We encode the input tokens into 256-dimensional vector representation.

Assumption: The token IDs were created by the BPE tokenizer which has a vocabulary size of 50,257.

In [26]:
from torch.utils.data import Dataset, DataLoader
class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride): #max length = context size
        self.input_ids = []
        self.target_ids = []
        #tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftxt|>"})
        #using a sliding window to chunk the book into overlapping sequences of max_length
        for i in range(0, len(token_ids)-max_length, stride):
            input_chunk = token_ids[i:i+max_length]
            target_chunk = token_ids[i+1: i+max_length+1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))
            
    def __len__(self):
        return len(self.input_ids)
        
    #method which will be used by the data loader
    def __getitem__(self, idx): #idx=index
        return self.input_ids[idx], self.target_ids[idx]

In [27]:
def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
    #initialize the tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")
    #create dataset
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)
    #create dataloader, this function will check the get item method in above function and it will return the input output pairs based on what
    #is mentioned in the get item.
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers)
    return dataloader

In [28]:
vocab_size=50257
optput_dim=256 #GPT-3 has the embedding size of 12,288
token_embedding_layers = torch.nn.Embedding(vocab_size, output_dim)

In [29]:
max_length=4
dataloader=create_dataloader_v1(raw_text,batch_size=8,max_length=max_length,stride=max_length,shuffle=False)
data_iter=iter(dataloader)
inputs, targets=next(data_iter)

In [30]:
print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)

Token IDs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Inputs shape:
 torch.Size([8, 4])


In [33]:
token_embeddings=token_embedding_layers(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 3])
