In [None]:
import numpy as np

import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel

# Sinusoidal Positional Encoding

The positional encoding function used in the original Transformer paper is

$$
\begin{align*}
PE_{pos, 2i} &= \sin\left(\frac{pos}{10000^{2i/d_{model}}}\right)\\
PE_{pos, 2i+1} &= \cos\left(\frac{pos}{10000^{2i/d_{model}}}\right)
\end{align*}
$$

This was chosen based on the following desirable properties:

1. The positional encodings are unique for different positions.
2. The encoding function is continuous and differentiable.
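3. For any fixed offset $k$, the encoding of position $pos + k$ is a linear function of the encoding of position $pos$, so relative positions are easy for the model to represent.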
4. The encodings should generalize to out-of-training sequence lengths.

In [2]:
def sinusoidal_pos_encoding(n_position, dim):
    """Build the sinusoidal positional-encoding table.

    Entry ``[pos, j]`` is ``sin(pos / 10000 ** (2 * (j // 2) / dim))`` when
    ``j`` is even and ``cos`` of the same angle when ``j`` is odd, so each
    pair of columns ``(2i, 2i + 1)`` shares one frequency.

    Args:
        n_position: Number of rows; positions run from 0 to ``n_position - 1``.
            Zero is allowed and yields an empty ``(0, dim)`` table.
        dim: Number of columns (encoding width). Odd values are allowed; the
            last column is then a sine column without a cosine partner.

    Returns:
        A ``torch.float64`` tensor of shape ``(n_position, dim)``.

    Raises:
        ValueError: If ``n_position`` or ``dim`` is negative.
    """
    if n_position < 0 or dim < 0:
        raise ValueError(
            f"n_position and dim must be non-negative, got {n_position} and {dim}"
        )

    # Column vector of positions so it broadcasts against the per-column
    # denominators below.
    positions = torch.arange(n_position, dtype=torch.float64).unsqueeze(1)

    # Columns 2i and 2i+1 use the same exponent 2i/dim, hence the floor
    # division of the column index by 2.
    pair_index = torch.div(torch.arange(dim), 2, rounding_mode="floor")
    exponents = 2 * pair_index.to(torch.float64) / dim
    denominators = torch.pow(10000.0, exponents)

    angles = positions / denominators  # shape (n_position, dim)

    encoding = torch.empty_like(angles)
    encoding[:, 0::2] = torch.sin(angles[:, 0::2])  # dim 2i
    encoding[:, 1::2] = torch.cos(angles[:, 1::2])  # dim 2i+1
    return encoding

In [None]:
# Load the tokenizer and the base model from the same checkpoint.
model_id = "Qwen/Qwen2.5-0.5B"
tok = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id)

# Two sentences sharing a common prefix; the second one is longer.
sequence1 = "Naomi went to the store."
sequence2 = "Naomi went to the store to buy some reaction mass pellets."

# Tokenize both sentences into PyTorch tensors of input ids.
tokens1, tokens2 = (
    tok(text, return_tensors="pt")["input_ids"]
    for text in (sequence1, sequence2)
)

# Look up the token embeddings for each id tensor via the model's embedding layer.
embeddings1, embeddings2 = (
    model.embed_tokens(ids)
    for ids in (tokens1, tokens2)
)

In [None]:
print(tokens1.shape, embeddings1.shape)
print(tokens2.shape, embeddings2.shape)

# Generate position encodings for each sequence
pos_enc1 = sinusoidal_pos_encoding(tokens1.shape[1], model.config.hidden_size)
pos_enc2 = sinusoidal_pos_encoding(tokens2.shape[1], model.config.hidden_size)

print(pos_enc1.shape, pos_enc2.shape)

# Compare the positional encodings between the two sequences. Only positions
# that exist in both tables can be compared, so stop at the shorter one.
n_shared = min(pos_enc1.shape[0], pos_enc2.shape[0])
for i in range(n_shared):
    print(f"pos {i} same: ", torch.allclose(pos_enc1[i], pos_enc2[i]))


def print_consecutive_distances(pos_enc, n_pairs=6):
    """Print the distance between rows i and i+1 of ``pos_enc``.

    Uses ``torch.dist`` with its default norm. At most ``n_pairs`` adjacent
    pairs are printed (6 pairs cover the first 7 positions); fewer are printed
    when the table has fewer rows, instead of indexing past the end.
    """
    for i in range(min(n_pairs, pos_enc.shape[0] - 1)):
        print(f"pos {i} diff: ", torch.dist(pos_enc[i], pos_enc[i + 1]))


# Show distances between i and i+1 for the first 7 positions of each encoding
print("Distances between consecutive positions for encoding 1")
print_consecutive_distances(pos_enc1)

print("Distances between consecutive positions for encoding 2")
print_consecutive_distances(pos_enc2)