In [0]:
%pip install hnswlib

In [0]:
# Restart the Python process so the package installed by the %pip cell above
# can be imported by the cells that follow.
dbutils.library.restartPython() 


In this notebook we’ll:
  1. Load our cleaned, chunked Wikipedia embeddings from Delta  
  2. Build a fast HNSW index in Python (using `hnswlib`)  
  3. Persist the index to DBFS  
  4. Define and demo a simple search function 

In [0]:
%run ./9-Common-Code

# Imports and Configuration

In [0]:
import os
import numpy as np
import pandas as pd
import hnswlib

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Adjust these to match your workspace
# NOTE(review): the "Load Embeddings" cell reassigns TABLE from CATALOG_NAME /
# SCHEMA_NAME, which are not defined in this notebook (presumably they come
# from ./9-Common-Code — confirm). In the cells visible here, CATALOG, SCHEMA
# and the TABLE value below are therefore never the ones actually read from.
CATALOG = "book_ai_ml_lakehouse"
SCHEMA  = "rag"
TABLE   = f"{CATALOG}.{SCHEMA}.lab_wikipedia_text_embeddings"
# File the serialized HNSW index is written to and loaded from.
# Presumably /dbfs is the DBFS FUSE mount on the driver — confirm.
INDEX_PATH = "/dbfs/vector_index/wiki_hnsw.bin"

# Load Embeddings from the Delta Table

In [0]:
# Build the fully-qualified table name (this replaces any TABLE assigned earlier)
TABLE = f"{CATALOG_NAME}.{SCHEMA_NAME}.lab_wikipedia_text_embeddings"

# Keep only the key, the display metadata and the embedding vector
df = spark.table(TABLE).select(
    ["id", "article_title", "url", "content", "embedding"]
)

# Preview the first rows
df.show()

# Convert to Pandas for in-memory indexing

In [0]:
# Pull the selected columns onto the driver; the HNSW index is built in memory.
pdf = df.toPandas()

# Fail early with a clear message instead of an opaque np.vstack error.
if pdf.empty:
    raise ValueError(f"No rows found in {TABLE}; cannot build an index.")

# The id column is used both as the HNSW label and as the metadata lookup key,
# so duplicates would make search results ambiguous.
if not pdf["id"].is_unique:
    raise ValueError(f"Column 'id' in {TABLE} contains duplicate values.")

ids = pdf["id"].to_numpy(dtype=int)
# hnswlib works on float32 vectors, so convert once here rather than carrying
# a wider array around on the driver.
vectors = np.vstack(pdf["embedding"].values).astype(np.float32, copy=False)
# Metadata indexed by id so a search hit can be mapped back to its article.
metadata_df = pdf[["id", "article_title", "url", "content"]].set_index("id")

print(f"Loaded {len(ids)} vectors, dim={vectors.shape[1]}")

# Build the HNSW Index

In [0]:
# Size the index from the loaded data: one slot per vector, one axis per
# embedding component.
max_elements = len(ids)
dim = vectors.shape[1]

# Create an empty cosine-distance HNSW index and allocate its capacity
index = hnswlib.Index(space='cosine', dim=dim)
index.init_index(
    max_elements=max_elements,
    ef_construction=200,
    M=16,
)

# Insert every vector, labelled with its row id
index.add_items(vectors, ids)

# Query-time accuracy/speed trade-off
index.set_ef(50)

print("HNSW index built.")

# Persist the index

In [0]:
# Make sure the target folder exists, then write the index file into it
index_dir = os.path.dirname(INDEX_PATH)
os.makedirs(index_dir, exist_ok=True)

index.save_index(INDEX_PATH)
print(f"Index saved to {INDEX_PATH}")

# Define a Search Function

In [0]:
# Loaded indexes keyed by path -> (file modification time, index). Lets
# repeated searches reuse the in-memory index instead of re-reading the file.
_INDEX_CACHE = {}


def load_index(path=INDEX_PATH):
    """Load the persisted HNSW index from ``path``, reusing a cached copy.

    The file is read again only when its modification time differs from the
    one recorded at the previous load, so a re-saved index is picked up
    automatically. The returned object is shared between callers.

    Relies on the notebook-level ``dim`` (the embedding dimensionality set
    when the index was built) being defined.

    Args:
        path: Location of the file written by ``index.save_index``.

    Returns:
        An ``hnswlib.Index`` in cosine space with query-time ``ef`` set to 50.

    Raises:
        OSError: If ``path`` does not exist or cannot be stat'ed.
    """
    mtime = os.path.getmtime(path)
    cached = _INDEX_CACHE.get(path)
    if cached is not None and cached[0] == mtime:
        return cached[1]

    idx = hnswlib.Index(space='cosine', dim=dim)
    idx.load_index(path)
    idx.set_ef(50)
    _INDEX_CACHE[path] = (mtime, idx)
    return idx

def embed_query(text: str, endpoint: str = "databricks-bge-large-en") -> np.ndarray:
    """Embed a query string via a Databricks model-serving endpoint.

    Replace this stub with your real embedding call if you use a different
    provider (e.g. the OpenAI embedding API).

    Args:
        text: The query text to embed.
        endpoint: Name of the serving endpoint to call. It presumably has to
            be the same model that produced the indexed embeddings, otherwise
            the vector dimensionality will not match the index — confirm.

    Returns:
        A 1-D ``float32`` array holding the embedding of ``text``.
    """
    # Function-scope import, kept from the original stub.
    from mlflow.deployments import get_deploy_client

    client = get_deploy_client("databricks")
    response = client.predict(endpoint=endpoint, inputs={"input": [text]})
    # Subscript access works both for a plain dict response and for a dict
    # subclass that also offers attribute access.
    return np.array(response["data"][0]["embedding"], dtype=np.float32)

def search(text: str, k: int = 5):
    """Return the ``k`` indexed chunks most similar to ``text``.

    Args:
        text: Natural-language query; embedded with ``embed_query``.
        k: Maximum number of hits to return. Values larger than the number of
            indexed items are clamped to the index size.

    Returns:
        A pandas DataFrame with columns ``id``, ``score`` (1 - cosine
        distance), ``title``, ``url`` and ``snippet`` (first 200 characters of
        the content, with an ellipsis only when the content was truncated).
        The frame is empty, but keeps these columns, when nothing can be
        returned.
    """
    columns = ["id", "score", "title", "url", "snippet"]

    idx = load_index()
    # hnswlib raises when asked for more neighbours than the index holds.
    k = min(k, idx.get_current_count())
    if k <= 0:
        return pd.DataFrame(columns=columns)

    qv = embed_query(text)
    labels, distances = idx.knn_query(qv, k=k)

    results = []
    for lbl, dist in zip(labels[0], distances[0]):
        row = metadata_df.loc[int(lbl)]
        # Missing content (None/NaN) is treated as an empty snippet.
        content = row["content"] if isinstance(row["content"], str) else ""
        # Only mark the snippet as truncated when text was actually cut off.
        snippet = content[:200] + ("…" if len(content) > 200 else "")
        results.append({
            "id": int(lbl),
            "score": float(1 - dist),  # cosine similarity
            "title": row["article_title"],
            "url": row["url"],
            "snippet": snippet,
        })
    return pd.DataFrame(results, columns=columns)


# Demo Query

In [0]:
# Run one end-to-end retrieval and render the top hits as a table
query = "What is reinforcement learning?"
df_results = search(text=query, k=5)
display(df_results)