<div style="display:flex; align-items:flex-start; margin-bottom:1rem;">
  <!-- Left: Book cover -->
  <img
    src="https://adb-1376134742576436.16.azuredatabricks.net/files/Images/book_cover.JPG"
    style="width:35%; margin-right:1rem; border-radius:4px; box-shadow:0 2px 6px rgba(0,0,0,0.1);"
    alt="Book Cover"/>
  <!-- Right: Metadata -->
  <div style="flex:1;">
    <!-- O'Reilly logo above title -->
    <div style="display:flex; flex-direction:column; align-items:flex-start; margin-bottom:0.75rem;">
      <img
        src="https://cdn.oreillystatic.com/images/sitewide-headers/oreilly_logo_mark_red.svg"
        style="height:2rem; margin-bottom:0.25rem;"
        alt="O'Reilly"/>
      <span style="font-size:1.75rem; font-weight:bold; line-height:1.2;">
        AI, ML and GenAI in the Lakehouse
      </span>
    </div>
    <!-- Details, now each on its own line -->
    <div style="font-size:0.9rem; color:#555; margin-bottom:1rem; line-height:1.4;">
      <div><strong>Name:</strong> 09-03-Vector Search</div>
      <div><strong>Author:</strong> Bennie Haelen</div>
      <div><strong>Date:</strong> 7-26-2025</div>
    </div>
    <!-- Purpose -->
    <div style="font-weight:600; margin-bottom:0.75rem;">
      Purpose: This notebook demonstrates how to leverage a Mosaic Vector Search Endpoint in RAG
    </div>
    <!-- Outline -->
    <div style="margin-top:0;">
      <h3 style="margin:0 0 0.25rem;">Table of Contents</h3>
      <ol style="padding-left:1.25rem; margin:0; color:#333;">
        <li>Fetch Wikipedia articles and load them into a DataFrame</li>
        <li>Extract and clean the text content, then split it into manageable chunks</li>
        <li>Calculate the embeddings</li>
        <li>Store the embeddings in a Delta file</li>
      </ol>
    </div>
  </div>
</div>


# Pre-Requisites

## Install our required libraries

In [0]:
%pip install -U -qq databricks-vectorsearch databricks-sdk flashrank
# NOTE(review): the packages above are installed without version pins, so a
# re-run at a later date may resolve to different releases — consider pinning.
# NOTE(review): a later cell calls w.serving_endpoints.get_open_ai_client();
# confirm the cluster runtime already provides the `openai` package, since it
# is not installed by this cell.
# Restart the Python process after the install so that later imports pick up
# the freshly installed package versions.
dbutils.library.restartPython()

## Run our Common Code Notebook for this chapter

In [0]:
%run "./9-00-Common-Code"

# Create our Vector Search Endpoint

## Create the VectorSearchClient

In [0]:
from databricks.vector_search.client import VectorSearchClient
# Vector Search client shared by the cells below: it is passed to the endpoint
# and index helper functions and is used directly to look up the index for queries.
# disable_notice=True presumably silences the client's informational notice —
# confirm against the databricks-vectorsearch documentation.
client = VectorSearchClient(disable_notice=True)

## Create the endpoint

In [0]:
# Arguments for the endpoint helper. It receives the current user's name and,
# judging by its name, falls back to another naming scheme when the
# username-based one cannot be used.
endpoint_options = {
    "client": client,
    "username": USER_NAME,
    "endpoint_type": EndpointType.STANDARD,
    "wait_for_ready": True,
}
endpoint, endpoint_name = create_endpoint_with_fallback(**endpoint_options)

# Keep the resolved name under the variable that the index and query cells read.
full_endpoint_name = endpoint_name

print(f"Created endpoint: {endpoint_name}")

# Create a Vector Search Index

In [0]:
 # Configuration parameters (replace with your actual values)
# Endpoint that will host the index (resolved by the endpoint-creation cell).
ENDPOINT_NAME = full_endpoint_name

# Source table that holds the `embedding` column to be indexed.
source_table_name = "lab_wikipedia_text_embeddings"

# Name under which the index is stored inside CATALOG_NAME.SCHEMA_NAME.
vs_index_name = "lab_wikipedia-1"

print("="*80)
print("OPTIMIZED VECTOR INDEX MANAGEMENT EXAMPLE")
print("="*80)

try:
    index = create_or_sync_vector_index(
        client=client,
        catalog_name=CATALOG_NAME,
        schema_name=SCHEMA_NAME,
        table_name=source_table_name,
        index_name=vs_index_name,
        endpoint_name=ENDPOINT_NAME,
        primary_key="id",
        # Must equal the length of the vectors stored in the `embedding` column.
        # NOTE(review): the original comment referred to "gte", while the query
        # cell calls the "databricks-bge-large-en" endpoint — confirm which
        # model produced the stored embeddings.
        embedding_dimension=1024,
        embedding_vector_column="embedding",
        pipeline_type=IndexPipelineType.TRIGGERED,
        wait_for_ready=True,
        max_wait_time=3600,  # 1 hour timeout
    )
except Exception as e:
    # Report the failure, then re-raise: every later cell needs a usable index,
    # so continuing a "Run All" past this point would only produce confusing
    # secondary errors in the query cells.
    print(f"✗ Index management failed: {e}")
    raise
else:
    print("✓ Index is ready for use!")

# Issuing Queries against the Vector Search Index

## Create an embedding vector for our question

In [0]:
# MLflow deployments module, used here to call a Databricks model serving endpoint.
import mlflow.deployments

# Deployment client targeting the "databricks" platform.
databricks_client = mlflow.deployments.get_deploy_client("databricks")

# Natural-language question that is embedded here and used for retrieval,
# reranking and answer generation in the cells below.
user_query = "Is Deep Learning the basis for Generative AI"

# Serving endpoint that turns the question into an embedding vector.
# NOTE(review): the query has to be embedded with the same model that produced
# the stored `embedding` column (the index is created with
# embedding_dimension=1024) — confirm that this endpoint is that model.
embedding_model_endpoint = "databricks-bge-large-en"

# The endpoint is called with a list of input strings under the "input" key.
model_input_payload = {"input": [user_query]}

embedding_response = databricks_client.predict(
    endpoint=embedding_model_endpoint,
    inputs=model_input_payload,
)

# The response's `data` field is read as a list of {"embedding": [...]} entries
# (presumably one per input string); keep only the vectors.
query_embeddings = [item["embedding"] for item in embedding_response.data]

# Fail here with a clear message instead of an IndexError in the search cell,
# which reads query_embeddings[0].
if not query_embeddings:
    raise ValueError(
        f"Embedding endpoint '{embedding_model_endpoint}' returned no embeddings"
    )

# Print a short preview only: dumping every component of the vector floods the
# cell output without telling the reader anything useful.
PREVIEW_VALUES = 5
print(f"Generated {len(query_embeddings)} embedding vector(s)")
print(f"Embedding dimension: {len(query_embeddings[0])}")
print(f"First {PREVIEW_VALUES} values: {query_embeddings[0][:PREVIEW_VALUES]}")

In [0]:
# Sanity check: show the resolved endpoint name that the search cell below
# passes to client.get_index().
print(full_endpoint_name)

In [0]:
import pprint
import textwrap

# Fully qualified index name. It reuses vs_index_name from the index-creation
# cell so the name is defined in exactly one place.
full_index_name = f"{CATALOG_NAME}.{SCHEMA_NAME}.{vs_index_name}"
print(f"Full index name: {full_index_name}")

# Retrieve the 5 entries closest to the query embedding, returning only the
# title and content columns.
results = client.get_index(full_endpoint_name, full_index_name).similarity_search(
    query_vector=query_embeddings[0],
    columns=["title", "content"],
    num_results=5,
)

# Matching rows are read from result -> data_array; each row is treated as a
# list whose first two items are the requested title and content.
rows = results.get("result", {}).get("data_array", [])

# Print the header once, using the number of rows actually returned.
print(f"\nTop {len(rows)} results for “{user_query}”")
print("=" * 60)

for rank, row in enumerate(rows, start=1):
    # Guard against short rows so a malformed entry does not abort the listing.
    title = row[0] if len(row) > 0 else "<no title>"
    # A missing or null content value becomes an empty snippet instead of
    # making textwrap.shorten() raise.
    content = (row[1] if len(row) > 1 else "") or ""
    snippet = textwrap.shorten(content, width=180, placeholder="…")

    print(f"\n{rank:>2}. {title}")
    print(textwrap.indent(snippet, "    "))

In [0]:
from flashrank import Ranker, RerankRequest

# Load the "rank-T5-flan" reranking model through FlashRank (default cache location).
reranker = Ranker(model_name="rank-T5-flan")

# Turn each search row into the dict handed to the reranker:
#   "text" -> the article content that gets scored (row[1])
#   "file" -> the article title, carried along as metadata (row[0])
# Rows that lack a content item are left out.
passages = []
for row in rows:
    if len(row) <= 1:
        continue
    passages.append({"text": row[1], "file": row[0]})

# Score the passages against the user's question; the reranker hands back the
# passages with a "score" attached, which the loop below treats as ordered by relevance.
request = RerankRequest(query=user_query, passages=passages)
reranked_results = reranker.rerank(request)

# Report the three best-scoring passages.
for position, hit in enumerate(reranked_results[:3], start=1):
    relevance, passage_text = hit["score"], hit["text"]
    origin = hit.get("file", "<unknown source>")

    print(f"Result #{position}")
    print(f" Source: {origin}")
    print(f" Score : {relevance:.4f}")
    print(f" Text  : {passage_text}\n")


In [0]:
from databricks.sdk import WorkspaceClient

# Workspace client; relies on authentication already available in the environment.
w = WorkspaceClient()

# OpenAI-compatible client obtained from the workspace's serving endpoints API.
openai_client = w.serving_endpoints.get_open_ai_client()

# Number of reranked passages to place in the prompt.
# Requires reranked_results and user_query from the earlier cells.
TOP_K = 3

# One "Source: <title>\n<text>" block per selected passage, separated by blank lines.
context_blocks = [
    f"Source: {hit.get('file', '<unknown source>')}\n{hit.get('text', '')}"
    for hit in reranked_results[:TOP_K]
]
enriched_context = "\n\n".join(context_blocks)

# Prompt layout: instructions, the context passages, the question, then an answer cue.
instructions = (
    "You are an expert assistant. Using only the provided context passages, "
    "answer the user’s question accurately and concisely.\n\n"
)
generation_prompt = (
    instructions
    + f"{enriched_context}\n\n"
    + f"Question: {user_query}\n\n"
    + "Answer:"
)

# Send the prompt as a single user message to the named serving endpoint
# through the OpenAI-compatible chat completions API.
chat_messages = [{"role": "user", "content": generation_prompt}]
response = openai_client.chat.completions.create(
    model="databricks-claude-3-7-sonnet",
    messages=chat_messages,
)

# The answer text is the content of the first choice's message.
answer_message = response.choices[0].message
final_answer = answer_message.content.strip()
print("Final Answer:\n", final_answer)