# Qdrant Collection Visualization with Embedding Atlas

This notebook connects to [Qdrant](https://qdrant.tech/), restores a collection from a snapshot, scrolls through all of its points, and visualizes them using Embedding Atlas.


In [None]:
# Install the Qdrant client and Embedding Atlas packages used by this notebook.
%pip install qdrant_client embedding_atlas

In [2]:
from typing import Any, Dict, List, Optional, Sequence

import numpy as np
import pandas as pd
from embedding_atlas.widget import EmbeddingAtlasWidget
from qdrant_client import QdrantClient
from tqdm import tqdm

Run Qdrant locally with Docker:

> docker run -p 6333:6333 qdrant/qdrant

You can access the dashboard at http://localhost:6333/dashboard

In [1]:
# Base URL of the Qdrant instance the client connects to.
QDRANT_URL = "http://localhost:6333"
# Name of the collection that is restored (if missing) and then visualized.
COLLECTION_NAME = "midlib"
# Snapshot location used to restore the collection when it does not exist yet.
SNAPSHOT = "https://snapshots.qdrant.io/midlib.snapshot"

In [None]:
# Open a client against the configured Qdrant instance.
client = QdrantClient(url=QDRANT_URL)

# Restore the collection (image vectors from the midlibrary dataset) from its
# snapshot, unless it is already present on the server.
if client.collection_exists(COLLECTION_NAME):
    print(f"Collection {COLLECTION_NAME} already exists")
else:
    client.recover_snapshot(collection_name=COLLECTION_NAME, location=SNAPSHOT)
    print(f"Collection {COLLECTION_NAME} restored")

# Report the vector size and the number of stored points as a quick sanity check.
collection_info = client.get_collection(COLLECTION_NAME)
print(
    f"Vector size: {collection_info.config.params.vectors.size} | Total entries: {collection_info.points_count}"
)

In [41]:
def scroll_all_points(
    client: QdrantClient, collection_name: str, batch_size: int = 100
) -> List[Dict[str, Any]]:
    """Fetch every point of a Qdrant collection as a list of flat records.

    Pages through the collection with ``client.scroll`` (payloads and vectors
    included) until a page comes back empty or no next-page offset is
    returned, updating a tqdm progress bar after each page.

    Args:
        client: Qdrant client used to issue the scroll requests.
        collection_name: Name of the collection to read.
        batch_size: Number of points requested per scroll call.

    Returns:
        One dict per point. ``"id"`` and ``"vector"`` always hold the point's
        own id and vector; every payload field is merged in next to them. A
        payload field that is itself named ``"id"`` or ``"vector"`` is stored
        as ``"payload_id"`` / ``"payload_vector"`` so it cannot replace the
        point's identity.
    """
    reserved_keys = ("id", "vector")
    all_points = []
    offset = None

    print("Starting to scroll through collection...")

    with tqdm(desc="Fetching points") as pbar:
        while True:
            points, next_offset = client.scroll(
                collection_name=collection_name,
                limit=batch_size,
                offset=offset,
                with_payload=True,
                with_vectors=True,
            )

            if not points:
                break

            for point in points:
                point_data = {"id": point.id, "vector": point.vector}
                # The payload is optional on a record; treat a missing one as
                # empty instead of failing on ``**None``.
                for key, value in (point.payload or {}).items():
                    # Keep the point's own id/vector authoritative: a clashing
                    # payload field is preserved under a prefixed key.
                    column = f"payload_{key}" if key in reserved_keys else key
                    point_data[column] = value
                all_points.append(point_data)

            pbar.update(len(points))

            # No further offset means the last page has been consumed.
            if next_offset is None:
                break

            offset = next_offset

    print(f"Successfully fetched {len(all_points)} points")
    return all_points

In [42]:
# Number of points requested per scroll call.
BATCH_SIZE = 100
points_data = scroll_all_points(
    client=client,
    collection_name=COLLECTION_NAME,
    batch_size=BATCH_SIZE,
)

Starting to scroll through collection...


Fetching points: 5417it [00:00, 6166.13it/s]

Successfully fetched 5417 points





In [43]:
# One row per fetched point; each key of the point records becomes a column.
df = pd.DataFrame(data=points_data)

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,id,vector,file_name,image_url,name,url
0,0,"[0.056603346, -0.017431192, 0.022566801, -0.03...",662a3bac7847574f6a510569_Chris_Dyer_V6_p.jpeg,https://storage.googleapis.com/demo-midjourney...,Chris Dyer,/styles/chris-dyer
1,1,"[0.043383807, -0.06374442, -0.013710048, -0.03...",662a30022c88ebf016f172a8_Catherine_Hyde_V6_p.jpeg,https://storage.googleapis.com/demo-midjourney...,Catherine Hyde,/styles/catherine-hyde
2,2,"[-0.05074604, 0.040631093, 0.0011827358, 0.011...",662c577775a44fc22d66d4da_Xavier_Dolan_V6_p.jpeg,https://storage.googleapis.com/demo-midjourney...,Xavier Dolan,/styles/xavier-dolan
3,3,"[0.07768102, -0.023939794, -0.013983787, -0.01...",662b5e881710bd3b3fcd94cd_Peter_Paul_Rubens_V6_...,https://storage.googleapis.com/demo-midjourney...,Peter Paul Rubens,/styles/peter-paul-rubens
4,4,"[-0.019566944, -0.015979603, -0.009070171, -0....",662b95553c241a20a16f0374_Robert_Crumb_V6_p.jpeg,https://storage.googleapis.com/demo-midjourney...,Robert Crumb,/styles/robert-crumb


In [44]:
from umap import UMAP

# Collect the per-point vectors into a single 2-D array (one row per point).
vectors = np.array(df["vector"].tolist())
print(f"Vector matrix shape: {vectors.shape}")

# Reduce the vectors to two dimensions with UMAP, using cosine distance and a
# fixed random_state.
print("Computing UMAP projection")
umap_reducer = UMAP(
    n_neighbors=15,
    min_dist=0.1,
    metric="cosine",
    random_state=42,
)
embedding_2d = umap_reducer.fit_transform(vectors)

# Store each projected axis as its own column for the widget to read.
df["projection_x"], df["projection_y"] = embedding_2d[:, 0], embedding_2d[:, 1]

print("2D projection completed")

Vector matrix shape: (5417, 512)
Computing UMAP projection


  warn(


2D projection completed


In [None]:
def compute_nearest_neighbors(
    vectors: np.ndarray, k: int = 10, ids: Optional[Sequence[Any]] = None
) -> List[Dict[str, Any]]:
    """Compute the k nearest neighbors of every vector by cosine similarity.

    Builds the full pairwise cosine-similarity matrix, then for each row keeps
    the ``k`` most similar other rows (the row itself is excluded).

    Args:
        vectors: 2-D array with one vector per row.
        k: Number of neighbors to keep per row.
        ids: Optional row identifiers, one per row of ``vectors`` and in the
            same order. When given, neighbors are reported by these
            identifiers; when omitted, by positional row index.

    Returns:
        One dict per row with ``"ids"`` (neighbors ordered from most to least
        similar) and ``"distances"`` (``1 - cosine similarity``, clamped at 0).

    Raises:
        ValueError: If ``ids`` is given but its length differs from the number
            of vectors.
    """
    from sklearn.metrics.pairwise import cosine_similarity

    # Materialize once so lookups are positional regardless of the input type
    # (e.g. a pandas Series would otherwise be indexed by label).
    id_lookup = None if ids is None else list(ids)
    if id_lookup is not None and len(id_lookup) != len(vectors):
        raise ValueError(
            f"ids has {len(id_lookup)} entries but vectors has {len(vectors)} rows"
        )

    print(f"Computing {k}-nearest neighbors...")

    similarity_matrix = cosine_similarity(vectors)

    neighbors_list = []

    for i in tqdm(range(len(vectors)), desc="Processing neighbors"):
        similarities = similarity_matrix[i]

        # Take k + 1 candidates so that k remain after dropping the row itself.
        neighbor_indices = np.argsort(similarities)[::-1][: k + 1]
        neighbor_indices = neighbor_indices[neighbor_indices != i][:k]

        neighbor_similarities = similarities[neighbor_indices]
        # Rounding can push a similarity just above 1; clamp so that no
        # distance is reported as negative.
        neighbor_distances = np.maximum(1 - neighbor_similarities, 0.0)

        if id_lookup is None:
            neighbor_ids = neighbor_indices.tolist()
        else:
            neighbor_ids = [id_lookup[j] for j in neighbor_indices]

        neighbors = {
            "ids": neighbor_ids,
            "distances": neighbor_distances.tolist(),
        }

        neighbors_list.append(neighbors)

    return neighbors_list


# Report neighbors by the same identifiers the widget uses as row ids, rather
# than by row position (the two only coincide when ids happen to be 0..n-1).
neighbors = compute_nearest_neighbors(vectors, k=10, ids=df["id"].tolist())
df["neighbors"] = neighbors
print("Nearest neighbors computed")

Computing 10-nearest neighbors...


Processing neighbors: 100%|██████████| 5417/5417 [00:01<00:00, 2762.68it/s]

Nearest neighbors computed





In [50]:
# Column roles handed to the widget: 2-D coordinates, row identifier, label
# text and the precomputed neighbor lists.
atlas_columns = {
    "x": "projection_x",
    "y": "projection_y",
    "row_id": "id",
    "text": "name",
    "neighbors": "neighbors",
}
w = EmbeddingAtlasWidget(df, **atlas_columns)
print("Widget created!")

Widget created!


Let's take a look at our visualization!

In [None]:
# Leave the widget as the cell's last expression so the notebook displays it.
w