Transformer-based embedding model for text typically involves:

1) Embedding + Positional Encoding to convert tokens into a learnable representation with positional information.


2) A stack of self-attention + feed-forward layers (the transformer encoder) to build context-aware embeddings.

3) A pooling mechanism (often [CLS] or mean pooling) to derive a single vector per sentence or document if needed.

4) Pretraining on large-scale corpora (often with masked language modeling),

5) Fine-tuning on specific tasks.

These steps yield high-quality text embeddings that capture semantic relationships and context making transformers the top choice for modern NLP tasks.


In [9]:
# Full Toy Example Python

# Example small vocabulary (toy example)
vocab = {
    "I": 0,
    "love": 1,
    "cats": 2,
    "dogs": 3
}

# Let's define an embedding dimension:
EMBED_DIM = 6

# Manually define an embedding table for demonstration
# This is a list of 'vocab_size' lists, each inner list has length EMBED_DIM.
# In a real scenario, these would be learned parameters.
EMBEDDING_TABLE = [
    [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],  # embedding for vocab ID 0
    [0.7, 0.8, 0.9, 0.1, 0.2, 0.3],  # embedding for vocab ID 1
    [0.4, 0.3, 0.2, 0.1, 0.9, 0.8],  # embedding for vocab ID 2
    [0.1, 0.5, 0.4, 0.4, 0.4, 0.2]   # embedding for vocab ID 3
]

def embed_tokens(token_ids):
    """
    Given a list of token IDs, return a list of embedding vectors.
    Each vector is length EMBED_DIM.
    """
    embedded_sequence = []
    for tid in token_ids:
        embedded_sequence.append(EMBEDDING_TABLE[tid])  # simple lookup
    return embedded_sequence  # shape: (sequence_length x EMBED_DIM)

# Example sentence
sentence_tokens = ["I", "love", "cats"]

# 1. Convert tokens to IDs
token_ids = [vocab[tok] for tok in sentence_tokens]  # e.g. [0, 1, 2]

# 2. Embed these tokens
embedded_seq = embed_tokens(token_ids)
# embedded_seq might look like:
# [
#   [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
#   [0.7, 0.8, 0.9, 0.1, 0.2, 0.3],
#   [0.4, 0.3, 0.2, 0.1, 0.9, 0.8]
# ]
embedded_seq

[[0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
 [0.7, 0.8, 0.9, 0.1, 0.2, 0.3],
 [0.4, 0.3, 0.2, 0.1, 0.9, 0.8]]

In [31]:
import math

# We loop over positions 0 to max_seq_len - 1.
# For each dimension pair (2i, 2i+1), we compute sine and cosine.
# Note that in practice,
# many frameworks vectorize this calculation for speed,
# but the logic remains the same.


def create_positional_encodings(max_seq_len, embedding_dim):
    """
    Generates positional encodings for positions [0..max_seq_len-1].
    Returns a list of size `max_seq_len`, where each item is
    a list of length `embedding_dim`.
    """

    position_encodings = []
    for pos in range(max_seq_len):
        encoding = [0.0]*embedding_dim
        for i in range(0,embedding_dim,2):
            angle_rate = (1.0/math.pow(10000, (2*i)/embedding_dim))
            encoding[i] = math.sin(pos*angle_rate)
            if i+1 < embedding_dim:
                encoding[i+1] = math.cos(pos*angle_rate)
        position_encodings.append(encoding)

    return position_encodings

# 3. Create positional encodings up to max_seq_len = 10 (arbitrary)
pos_enc = create_positional_encodings(10, EMBED_DIM)
# pos_enc is a list with 100 items, each item is a 6-element list

for idx,p in enumerate(pos_enc):
    print(idx,list(f"{x:.2f}" for x in p))

0 ['0.00', '1.00', '0.00', '1.00', '0.00', '1.00']
1 ['0.84', '0.54', '0.00', '1.00', '0.00', '1.00']
2 ['0.91', '-0.42', '0.00', '1.00', '0.00', '1.00']
3 ['0.14', '-0.99', '0.01', '1.00', '0.00', '1.00']
4 ['-0.76', '-0.65', '0.01', '1.00', '0.00', '1.00']
5 ['-0.96', '0.28', '0.01', '1.00', '0.00', '1.00']
6 ['-0.28', '0.96', '0.01', '1.00', '0.00', '1.00']
7 ['0.66', '0.75', '0.02', '1.00', '0.00', '1.00']
8 ['0.99', '-0.15', '0.02', '1.00', '0.00', '1.00']
9 ['0.41', '-0.91', '0.02', '1.00', '0.00', '1.00']


In [29]:
#Combining Token Embeddings with Positional Encodings
#To get the final input to the transformer’s encoder (for a single sequence), we sum:

# 𝐸token +𝐸 positional

#where each is a list (or array) of size EMBED_DIM. Here’s a simple example:

def add_positional_encodings(embedded_sequence, positional_encodings):
    """
    Sums the token embeddings with the positional encodings.
    Both are lists of lists of length `embedding_dim`.

    embedded_sequence: shape (seq_len x EMBED_DIM)
    positional_encodings: shape (max_seq_len x EMBED_DIM)
    """
    assert len(embedded_sequence[0]) == len(positional_encodings[0])

    seq_len = len(embedded_sequence)
    encoded_sequence = list()

    for p in range(seq_len):
        token_vec = embedded_sequence[p]
        pos_vec = positional_encodings[p]

        combined_vec = [token_vec[i] + pos_vec[i] for i in range(len(token_vec))]
        encoded_sequence.append(combined_vec)

    assert len(encoded_sequence) == len(embedded_sequence)

    return encoded_sequence



In [32]:
# 4. Combine the embeddings with the corresponding positional vectors
final_encoded_seq = add_positional_encodings(embedded_seq, pos_enc)
# final_encoded_seq is also shape (3 x 6), representing the input to the first Transformer layer.


print("Embedded sequence (no position):")
for idx, row in enumerate(embedded_seq):
    print(idx, list(f"{x:.2f}" for x in row))

print("\nPositional encodings (first 3 positions):")
for idx in range(len(sentence_tokens)):
    p = pos_enc[idx]
    print(idx, list(f"{x:.2f}" for x in p))

print("\nCombined (token + positional):")
for idx, row in enumerate(final_encoded_seq):
    print(idx, list(f"{x:.2f}" for x in row))

Embedded sequence (no position):
0 ['0.10', '0.20', '0.30', '0.40', '0.50', '0.60']
1 ['0.70', '0.80', '0.90', '0.10', '0.20', '0.30']
2 ['0.40', '0.30', '0.20', '0.10', '0.90', '0.80']

Positional encodings (first 3 positions):
0 ['0.00', '1.00', '0.00', '1.00', '0.00', '1.00']
1 ['0.84', '0.54', '0.00', '1.00', '0.00', '1.00']
2 ['0.91', '-0.42', '0.00', '1.00', '0.00', '1.00']

Combined (token + positional):
0 ['0.10', '1.20', '0.30', '1.40', '0.50', '1.60']
1 ['1.54', '1.34', '0.90', '1.10', '0.20', '1.30']
2 ['1.31', '-0.12', '0.20', '1.10', '0.90', '1.80']


In [35]:
import math
import numpy as np

def create_embedding_matrix(vocab_size, embedding_dim, seed=42):
    rng = np.random.default_rng(seed)
    return rng.normal(loc=0.0, scale=0.1, size=(vocab_size, embedding_dim))

def embed_tokens(token_ids, embedding_matrix):
    batch_size, seq_len = token_ids.shape
    embed_dim = embedding_matrix.shape[1]
    output = np.zeros((batch_size, seq_len, embed_dim), dtype=np.float32)
    for b in range(batch_size):
        for p in range(seq_len):
            tid = token_ids[b, p]
            output[b, p, :] = embedding_matrix[tid]
    return output

def create_positional_encodings(max_len, d_model):
    pos_enc = np.zeros((max_len, d_model), dtype=np.float32)
    for pos in range(max_len):
        for i in range(0, d_model, 2):
            angle_rate = 1.0 / (10000.0 ** ((2.0 * i) / d_model))
            pos_enc[pos, i] = math.sin(pos * angle_rate)
            if i + 1 < d_model:
                pos_enc[pos, i + 1] = math.cos(pos * angle_rate)
    return pos_enc

def add_positional_encoding(embedded_tokens, pos_enc):
    batch_size, seq_len, embed_dim = embedded_tokens.shape
    for p in range(seq_len):
        embedded_tokens[:, p, :] += pos_enc[p]
    return embedded_tokens

# ---- Example usage ----

# 1) Create random embedding weights for a small vocab
vocab_size = 4
embed_dim = 6
embedding_matrix = create_embedding_matrix(vocab_size, embed_dim)

# 2) Prepare a mini-batch of token IDs: shape (batch_size=2, seq_len=3)
#    Example:
#    Sequence 1 -> [0, 1, 2]
input_ids = np.array([
    [0, 1, 2],
], dtype=np.int64)  # shape: (2, 3)

# 3) Embed the tokens
embedded_tokens = embed_tokens(input_ids, embedding_matrix)
# shape: (2, 3, 6)

# 4) Create positional encodings up to max_len=10 (just to be safe)
pos_enc = create_positional_encodings(max_len=10, d_model=embed_dim)
# shape: (10, 6)

# 5) Add positional encodings to the token embeddings
embedded_with_pos = add_positional_encoding(embedded_tokens, pos_enc)
# shape remains (1, 3, 6)

print("Final shape:", embedded_with_pos.shape)
print(embedded_with_pos)


Final shape: (1, 3, 6)
[[[ 3.0471709e-02  8.9600158e-01  7.5045116e-02  1.0940565e+00
   -1.9510353e-01  8.6978203e-01]
  [ 8.5425502e-01  5.0867802e-01  4.7431723e-04  9.1469330e-01
    8.7944441e-02  1.0777792e+00]
  [ 9.1590047e-01 -3.0342272e-01  5.1059790e-02  9.1406143e-01
    3.6884360e-02  9.0411174e-01]]]
