# L4 - MUVERA Embeddings

<p style="background-color:#fff6e4; padding:15px; border-width:3px; border-color:#f5ecda; border-style:solid; border-radius:6px"> ⏳ <b>Note <code>(Kernel Starting)</code>:</b> This notebook takes about 30 seconds to be ready to use. You may start and watch the video while you wait.</p>

<div style="background-color:#fff6ff; padding:13px; border-width:3px; border-color:#efe6ef; border-style:solid; border-radius:6px">
<p> 💻 &nbsp; <b>Access <code>requirements.txt</code> and <code>helper.py</code> files:</b> 1) click on the <em>"File"</em> option on the top menu of the notebook and then 2) click on <em>"Open"</em>.</p>

<p> ⬇ &nbsp; <b>Download Notebooks:</b> 1) click on the <em>"File"</em> option on the top menu of the notebook and then 2) click on <em>"Download as"</em> and select <em>"Notebook (.ipynb)"</em>.</p>

<p> 📒 &nbsp; For more help, please see the <em>"Appendix – Tips, Help, and Download"</em> Lesson.</p>

</div>

The following cell is not in the video and just ensures output later in this notebook will render properly.

In [1]:
import plotly.io as pio
# Select Plotly's "notebook" renderer as the default so that figure output
# produced later in this notebook renders properly.
pio.renderers.default = "notebook"

#### Loading MUVERA

In [2]:
from fastembed.postprocess.muvera import Muvera

# Configure the MUVERA post-processor that turns a multivector (one vector
# per token) into a single fixed-dimensional encoding (FDE).
# Resulting FDE length: 64 clusters x dim_proj 16 x r_reps 20 = 20480.
muvera = Muvera(
    # ColPali individual token embedding dimensionality
    dim=128,
    # k_sim=6 gives 2**6 = 64 clusters
    k_sim=6,
    # Reduce the dimensionality of each token embedding from 128 to 16
    # with Random Projection
    dim_proj=16,
    # Repeat the process 20 times and concatenate
    # the individual results
    r_reps=20,
    # Random seed to make sure the results
    # are reproducible (default: 42)
    random_seed=42,
)

In [3]:
# Forwarded as the `load_precomputed` argument to the helper loaders used in
# the cells below. Presumably True means stored embeddings are loaded instead
# of being recomputed -- confirm against helper.py.
LOAD_PRECOMPUTED = True

#### Loading Sample Image Embeddings

In [4]:
from helper import load_sample_image_embeddings

# The helper returns only a sample of the document pages; LOAD_PRECOMPUTED
# is forwarded as its `load_precomputed` flag.
images_df = load_sample_image_embeddings(load_precomputed=LOAD_PRECOMPUTED)

# Report how many pages were loaded, then preview the first three rows.
print(f"Loaded {len(images_df)} document pages")
images_df.head(3)

Loaded 100 document pages


Unnamed: 0,image_path,image_embedding
0,ro_shared_data/pdfs/screenshots/AI4E_W1-page-0...,"[[-0.1533203125, 0.030517578125, 0.1357421875,..."
1,ro_shared_data/pdfs/screenshots/AI4E_W1-page-0...,"[[-0.04150390625, 0.1640625, 0.1708984375, 0.0..."
2,ro_shared_data/pdfs/screenshots/AI4E_W1-page-0...,"[[-0.1240234375, -0.025634765625, 0.0668945312..."


#### Generating MUVERA Embeddings

In [5]:
from tqdm import tqdm

# Compress every document's multivector into a MUVERA FDE.
# Iterate the embedding column directly: iterrows() would build a whole
# Series for each row only to read this single field back out of it.
muvera_embeddings = [
    muvera.process_document(document_embedding)
    for document_embedding in tqdm(
        images_df["image_embedding"],
        total=len(images_df),
        desc="MUVERA embeddings",
    )
]

# Add to dataframe (one FDE per row, in row order)
images_df["muvera_embedding"] = muvera_embeddings

# .iloc[0] reads the first row by position, so the shape report does not
# depend on the dataframe happening to carry an index label equal to 0.
print(f"\nOriginal shape: {images_df['image_embedding'].iloc[0].shape}")
print(f"MUVERA FDE shape: {images_df['muvera_embedding'].iloc[0].shape}")

MUVERA embeddings: 100%|██████████| 100/100 [00:01<00:00, 56.85it/s]


Original shape: (1031, 128)
MUVERA FDE shape: (20480,)





#### Generating Query Embeddings

In [6]:
from helper import load_or_compute_query_embeddings

# The helper hands back one dataframe holding the query texts ("query")
# alongside their embeddings ("query_embedding").
queries_df = load_or_compute_query_embeddings(
    load_precomputed=LOAD_PRECOMPUTED
)

# Keep both columns as plain lists; later cells index them by query position.
query_embeddings = queries_df["query_embedding"].tolist()
queries = queries_df["query"].tolist()

queries

['coffee mug', 'size vs performance tradeoff', 'one learning algorithm']

In [7]:
import numpy as np

# Build one MUVERA query encoding per query: stack the query's embeddings
# into a single array and pass that array to process_query.
muvera_query_embeddings = [
    muvera.process_query(np.stack(query_vectors))
    for query_vectors in query_embeddings
]

# Show the shape before and after MUVERA for the first query.
print(f"Original query shape: {np.stack(query_embeddings[0]).shape}")
print(f"MUVERA query FDE shape: {muvera_query_embeddings[0].shape}")

Original query shape: (18, 128)
MUVERA query FDE shape: (20480,)


#### Creating Qdrant Collection and Adding Vectors

In [8]:
from qdrant_client import QdrantClient, models

collection_name = "colpali-optimizations"

# Connect to Qdrant (expects a server listening on localhost:6333)
client = QdrantClient("http://localhost:6333")

# Delete if exists, so re-running this cell always starts from an
# empty collection
if client.collection_exists(collection_name):
    client.delete_collection(collection_name)

# Create collection with dual (named) vectors per point
client.create_collection(
    collection_name,
    vectors_config={
        # Original ColPali multivectors: many 128-dim vectors per point,
        # compared with MAX_SIM
        "colpali_original": models.VectorParams(
            size=128,
            distance=models.Distance.DOT,
            multivector_config=models.MultiVectorConfig(
                comparator=models.MultiVectorComparator.MAX_SIM,
            ),
            # NOTE(review): m=0 presumably turns off HNSW graph building
            # for this vector -- confirm against the Qdrant docs
            hnsw_config=models.HnswConfigDiff(m=0),
            on_disk=True,
        ),
        # MUVERA fixed-dimensional encodings
        "muvera_fde": models.VectorParams(
            size=20480,  # 64 × 16 × 20 (clusters × dim_proj × r_reps)
            distance=models.Distance.DOT,
            on_disk=True,
            # No multivector config - single vector with HNSW
        ),
    },
)

True

In [None]:
# Expect this cell may take several minutes to finish
from tqdm import tqdm
from helper import yield_muvera_embeddings

# Count inserted points in a variable that exists even when the generator
# yields nothing; reading the loop variable `i` after an empty loop would
# raise NameError in the final print.
inserted_count = 0

# Stream through embeddings and upsert into Qdrant collection, one point
# per document, using the running index as the point id
for i, (image_path, vectors) in enumerate(
    tqdm(
        yield_muvera_embeddings(
            muvera=muvera,
            load_precomputed=LOAD_PRECOMPUTED,
        ),
        desc="Processing and inserting documents",
    )
):
    client.upsert(
        collection_name,
        points=[
            models.PointStruct(
                id=i,
                # Both named vectors declared on the collection are
                # stored on the same point
                vector={
                    "colpali_original": vectors["colpali_original"],
                    "muvera_fde": vectors["muvera_fde"],
                },
                payload={
                    "image_path": image_path,
                },
            )
        ],
    )
    inserted_count = i + 1

print(f"\nInserted {inserted_count} documents into {collection_name}")

Processing and inserting documents: 0it [00:00, ?it/s]

In [None]:
from time import monotonic, sleep

# Seconds between two status checks.
POLL_INTERVAL_SECONDS = 5.0
# Upper bound on the total wait, so a collection that never turns green
# fails loudly instead of hanging the notebook forever.
MAX_WAIT_SECONDS = 30 * 60

deadline = monotonic() + MAX_WAIT_SECONDS

# Wait once before the first check (presumably to give the server time to
# start processing the freshly inserted points -- confirm).
sleep(POLL_INTERVAL_SECONDS)
while True:
    collection_info = client.get_collection(collection_name)
    if collection_info.status == models.CollectionStatus.GREEN:
        break
    if monotonic() >= deadline:
        raise TimeoutError(
            f"Collection {collection_name!r} did not reach GREEN status "
            f"within {MAX_WAIT_SECONDS} seconds "
            f"(last status: {collection_info.status})"
        )
    sleep(POLL_INTERVAL_SECONDS)

print("Collection has indexed all the data points")

#### Creating ColPali and MUVERA Search Helper Functions

In [None]:
import time

def search_colpali(query_embedding, limit=5):
    """Search using original ColPali multivectors.

    Args:
        query_embedding: Query passed as ``query`` to
            ``client.query_points`` against the ``"colpali_original"``
            named vector.
        limit: Maximum number of points to request (default 5).

    Returns:
        Tuple ``(points, search_time)``: ``points`` is ``results.points``
        from the response, ``search_time`` is the elapsed time in seconds
        around the ``query_points`` call.
    """
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # short intervals; time.time() is wall-clock and may jump if the system
    # clock is adjusted.
    start = time.perf_counter()
    results = client.query_points(
        collection_name=collection_name,
        query=query_embedding,
        using="colpali_original",
        limit=limit,
        with_payload=True,
    )
    search_time = time.perf_counter() - start
    return results.points, search_time

In [None]:
def search_muvera(query_embedding, limit=5):
    """Search using MUVERA compressed vectors.

    Args:
        query_embedding: Query passed as ``query`` to
            ``client.query_points`` against the ``"muvera_fde"`` named
            vector.
        limit: Maximum number of points to request (default 5).

    Returns:
        Tuple ``(points, search_time)``: ``points`` is ``results.points``
        from the response, ``search_time`` is the elapsed time in seconds
        around the ``query_points`` call.
    """
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # short intervals; time.time() is wall-clock and may jump if the system
    # clock is adjusted.
    start = time.perf_counter()
    results = client.query_points(
        collection_name=collection_name,
        query=query_embedding,
        using="muvera_fde",
        limit=limit,
        with_payload=True,
    )
    search_time = time.perf_counter() - start
    return results.points, search_time

#### Comparing ColPali and MUVERA Search Performance

In [None]:
from helper import compare_search_methods

# Query 1: "coffee mug"
query_idx = 0

# ColPali multivector search is the baseline; MUVERA single-vector search is
# the method under comparison. The helper runs the searches 10 times
# internally (n_runs).
result_q1 = compare_search_methods(
    query_text=queries[query_idx],
    baseline_name="ColPali",
    baseline_search_fn=lambda: search_colpali(
        query_embedding=query_embeddings[query_idx], limit=5
    ),
    comparison_name="MUVERA",
    comparison_search_fn=lambda: search_muvera(
        query_embedding=muvera_query_embeddings[query_idx], limit=5
    ),
    limit=5,
    n_runs=10,
)

In [None]:
# Query 2: "size vs performance tradeoff"
query_idx = 1

# ColPali multivector search is the baseline; MUVERA single-vector search is
# the method under comparison. The helper runs the searches 10 times
# internally (n_runs).
result_q2 = compare_search_methods(
    query_text=queries[query_idx],
    baseline_name="ColPali",
    baseline_search_fn=lambda: search_colpali(
        query_embedding=query_embeddings[query_idx], limit=5
    ),
    comparison_name="MUVERA",
    comparison_search_fn=lambda: search_muvera(
        query_embedding=muvera_query_embeddings[query_idx], limit=5
    ),
    limit=5,
    n_runs=10,
)

In [None]:
# Query 3: "one learning algorithm"
query_idx = 2

# ColPali multivector search is the baseline; MUVERA single-vector search is
# the method under comparison. The helper runs the searches 10 times
# internally (n_runs).
result_q3 = compare_search_methods(
    query_text=queries[query_idx],
    baseline_name="ColPali",
    baseline_search_fn=lambda: search_colpali(
        query_embedding=query_embeddings[query_idx], limit=5
    ),
    comparison_name="MUVERA",
    comparison_search_fn=lambda: search_muvera(
        query_embedding=muvera_query_embeddings[query_idx], limit=5
    ),
    limit=5,
    n_runs=10,
)

In [None]:
# Gather the three per-query comparison results
results = [result_q1, result_q2, result_q3]

# Each summary figure is the mean of one per-query metric; note that
# median_speedup is therefore the mean of the per-query median speedups.
avg_speedup, median_speedup, avg_precision = (
    np.mean([r[metric] for r in results])
    for metric in ("avg_speedup", "median_speedup", "precision")
)

# One print call, one argument per output line
print(
    "\n" + "=" * 50,
    "AVERAGE PERFORMANCE (across 3 queries)",
    "=" * 50,
    f"Average speedup (mean): {avg_speedup:.1f}x faster",
    f"Average speedup (median): {median_speedup:.1f}x faster",
    f"Average precision@5: {avg_precision:.1%}",
    sep="\n",
)

#### Comparing Two-Stage Retrieval and ColPali

In [None]:
def two_stage_retrieval(
    query_colpali, query_muvera, limit=5, prefetch_multiplier=10
):
    """
    Two-stage retrieval using prefetch:
    1. Fast MUVERA search for candidates
    2. Rerank with ColPali for accuracy

    Args:
        query_colpali: Query used for the final ``"colpali_original"``
            stage.
        query_muvera: Query used for the ``"muvera_fde"`` prefetch stage.
        limit: Number of final points to request (default 5).
        prefetch_multiplier: The prefetch stage requests
            ``limit * prefetch_multiplier`` candidates (default 10, the
            previously hard-coded factor).

    Returns tuple of (results, search_time) like other search functions;
    search_time is the elapsed time in seconds around the single
    ``query_points`` call.
    """

    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # short intervals; time.time() is wall-clock and may jump if the system
    # clock is adjusted.
    start = time.perf_counter()

    # Single API call with prefetch mechanism
    final_results = client.query_points(
        prefetch=[
            models.Prefetch(
                query=query_muvera,
                using="muvera_fde",
                # Oversample so the second stage has candidates to reorder
                limit=limit * prefetch_multiplier,
            )
        ],
        collection_name=collection_name,
        query=query_colpali,
        using="colpali_original",
        limit=limit,
        with_payload=True,
    )

    total_time = time.perf_counter() - start

    return final_results.points, total_time

In [None]:
# Two-stage retrieval, first query
query_idx = 0
print(f'Query: "{queries[query_idx]}"')
print("=" * 60)

# ColPali search is the baseline, so the helper reports how the two-stage
# results compare against it.
two_stage_result_q1 = compare_search_methods(
    query_text=queries[query_idx],
    baseline_name="ColPali",
    baseline_search_fn=lambda: search_colpali(
        query_embedding=query_embeddings[query_idx], limit=5
    ),
    comparison_name="Two-stage",
    comparison_search_fn=lambda: two_stage_retrieval(
        query_colpali=query_embeddings[query_idx],
        query_muvera=muvera_query_embeddings[query_idx],
        limit=5,
    ),
    limit=5,
    n_runs=10,
)

In [None]:
# Two-stage retrieval, second query
query_idx = 1
print(f'Query: "{queries[query_idx]}"')
print("=" * 60)

# ColPali search is the baseline, so the helper reports how the two-stage
# results compare against it.
two_stage_result_q2 = compare_search_methods(
    query_text=queries[query_idx],
    baseline_name="ColPali",
    baseline_search_fn=lambda: search_colpali(
        query_embedding=query_embeddings[query_idx], limit=5
    ),
    comparison_name="Two-stage",
    comparison_search_fn=lambda: two_stage_retrieval(
        query_colpali=query_embeddings[query_idx],
        query_muvera=muvera_query_embeddings[query_idx],
        limit=5,
    ),
    limit=5,
    n_runs=10,
)

In [None]:
# Two-stage retrieval, third query
query_idx = 2
print(f'Query: "{queries[query_idx]}"')
print("=" * 60)

# ColPali search is the baseline, so the helper reports how the two-stage
# results compare against it.
two_stage_result_q3 = compare_search_methods(
    query_text=queries[query_idx],
    baseline_name="ColPali",
    baseline_search_fn=lambda: search_colpali(
        query_embedding=query_embeddings[query_idx], limit=5
    ),
    comparison_name="Two-stage",
    comparison_search_fn=lambda: two_stage_retrieval(
        query_colpali=query_embeddings[query_idx],
        query_muvera=muvera_query_embeddings[query_idx],
        limit=5,
    ),
    limit=5,
    n_runs=10,
)

In [None]:
# Collect all two-stage results
two_stage_results = [
    two_stage_result_q1,
    two_stage_result_q2,
    two_stage_result_q3,
]

# Per-query average time of the comparison (two-stage) method, in seconds
per_query_times = [r["comparison_avg_time"] for r in two_stage_results]

# Calculate averages.
# NOTE: this rebinds `avg_precision`, which the MUVERA summary cell also
# assigns; after this cell it holds the two-stage value.
avg_two_stage_time = np.mean(per_query_times)
avg_precision = np.mean([r["precision"] for r in two_stage_results])

print("\n" + "=" * 60)
print("TWO-STAGE RETRIEVAL SUMMARY")
print("=" * 60)
print(f"\nAverage two-stage time: {avg_two_stage_time * 1000:.2f}ms")
# One line per query, numbered from 1; follows the length of
# two_stage_results instead of hard-coding three indices.
for query_number, query_time in enumerate(per_query_times, start=1):
    print(f"  Query {query_number}: {query_time * 1000:.2f}ms")
print(f"\nAverage precision@5 vs ColPali: {avg_precision:.1%}")