# Intuition behind Word Embeddings and Positional Embeddings in GPT-like Models

In [2]:
# Imports
import torch
import torch.nn


### To build intuition for embeddings, we will use vocab_size = 6, word_embedding_size = 4, and pos_embedding_size = 4

In [None]:
# Create the word (token) embedding table: one trainable row per vocabulary entry.
# nn.Embedding fills the table with random values, so seed the RNG first to make
# the values identical on every Restart & Run All.
SEED = 0
torch.manual_seed(SEED)

vocab_size = 6           # number of distinct tokens in the toy vocabulary
word_embedding_size = 4  # length of each token's embedding vector
word_embed = torch.nn.Embedding(vocab_size, word_embedding_size)

In [6]:
# nn.Embedding initializes its weight table with random numbers.
# The table is a trainable parameter (requires_grad=True), so it gets updated during training.

# Initial embeddings
word_embed.weight

Parameter containing:
tensor([[ 0.8803,  0.9265, -0.2485, -0.7905],
        [ 0.1671, -0.4548, -1.1456,  0.5138],
        [-0.9792,  0.3675,  0.9125, -0.9021],
        [ 1.7908,  2.7524,  0.6820, -1.3109],
        [-0.5740,  0.0056, -1.2501,  1.0403],
        [ 1.7969,  1.1708, -1.1153,  0.3860]], requires_grad=True)

In [8]:
# rows = vocab_size, columns = word_embedding_size
word_embed.weight.size()

torch.Size([6, 4])

In [9]:
# embedding vector of the first vocabulary entry (token id 0): row 0 of the table
word_embed.weight[0]

tensor([ 0.8803,  0.9265, -0.2485, -0.7905], grad_fn=<SelectBackward0>)

### Positional embeddings

In [11]:
# The positional table is indexed by a token's position in the sequence (0, 1, 2, ...),
# not by its token id, so its row count is the maximum sequence length the model
# supports (the context length) and is independent of the vocabulary size.
# pos_embedding_size must match the word embedding size,
# because final embeddings = word embeddings + positional embeddings (element-wise sum).
# The table holds trainable parameters, which get updated during training.

context_length = 6      # maximum number of positions; 6 keeps this toy table at 6 x 4
pos_embedding_size = 4  # must equal the word embedding size
pos_embed = torch.nn.Embedding(context_length, pos_embedding_size)

In [12]:
# Initial positional embedding table (trainable, requires_grad=True)
pos_embed.weight

Parameter containing:
tensor([[-0.1031, -0.9458,  1.0143,  0.8707],
        [-1.2417,  0.0516,  0.9171, -0.1363],
        [ 1.3295, -0.3977, -1.3267, -0.7087],
        [ 1.4948, -0.9895,  1.0575,  0.6096],
        [-0.3655, -0.7516,  0.5253,  0.2481],
        [-0.1067,  0.1675,  0.7256,  0.7739]], requires_grad=True)

In [13]:
# (rows in the positional table, pos_embedding_size)
pos_embed.weight.size()

torch.Size([6, 4])

### Final embedding

In [27]:
# We need positional embeddings because the attention mechanism processes all tokens
# in parallel and therefore has no built-in notion of token order.
# Let's consider a sequence of only 1 token to generate the final embedding.

# Token ids index the word table; positions index the positional table.
# They are different things, so keep them in separate tensors.
token_ids = torch.tensor([0])         # the sequence: a single token with id 0
num_tokens = token_ids.shape[0]       # sequence length (1 here)
positions = torch.arange(num_tokens)  # 0, 1, ..., num_tokens - 1

# A position beyond the table's last row would raise an index error in the lookup.
assert num_tokens <= pos_embed.num_embeddings, "sequence is longer than the positional table"

t_embed = word_embed(token_ids)
p_embed = pos_embed(positions)

# Same shape on both sides, so this is a plain element-wise sum.
final_embed = t_embed + p_embed
t_embed, p_embed, final_embed

(tensor([[ 0.8803,  0.9265, -0.2485, -0.7905]], grad_fn=<EmbeddingBackward0>),
 tensor([[-0.1031, -0.9458,  1.0143,  0.8707]], grad_fn=<EmbeddingBackward0>),
 tensor([[ 0.7772, -0.0193,  0.7658,  0.0802]], grad_fn=<AddBackward0>))