# MOBI Docs Vector Search
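This notebook demonstrates how to manually create a Databricks Vector Search endpoint and a Delta Sync index over the `silver_site` table, and how to query that index from SQL.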

In [None]:
%pip install databricks-vectorsearch mlflow requests
%restart_python

In [None]:
# Setup: make the local "src" directory (under the current working directory)
# importable from this notebook.
import sys
from pathlib import Path

src_path = Path.cwd() / "src"

# Prepend only when absent so re-running the cell does not add duplicate entries.
if not any(entry == str(src_path) for entry in sys.path):
    sys.path.insert(0, str(src_path))


In [None]:
import mlflow

# Load notebook settings from config.yaml and pull out the catalog and schema
# names that every later cell interpolates into table, index and function names.
CONFIG = mlflow.models.ModelConfig(development_config="config.yaml")
CATALOG, SCHEMA = (CONFIG.get(key) for key in ("catalog", "schema"))

print(f"Using catalog.schema: {CATALOG}.{SCHEMA}")



In [None]:
# Show ten rows of the silver_site table we already produced

display(spark.table(f"`{CATALOG}`.`{SCHEMA}`.silver_site").limit(10))

In [None]:
# Turn on the Delta Change Data Feed table property for silver_site, which is
# the source table of the Delta Sync index created further down in this notebook.
sql = (
    "ALTER TABLE "
    f"`{CATALOG}`.`{SCHEMA}`.`silver_site`"
    " SET TBLPROPERTIES ('delta.enableChangeDataFeed' = 'true');"
)
spark.sql(sql)

## Create Vector Search Endpoint

In [0]:
from databricks.vector_search.client import VectorSearchClient

ENDPOINT_NAME = "mobi_vs_endpoint"

# Initialize client first
client = VectorSearchClient(disable_notice=True)

# Treat a missing or empty "endpoints" entry in the response as "no endpoints".
resp = client.list_endpoints()
endpoints = resp.get("endpoints") or []

# Start from a clean slate: delete an endpoint with the same name if one exists.
if any(ep.get("name") == ENDPOINT_NAME for ep in endpoints):
    client.delete_endpoint(name=ENDPOINT_NAME)
    print(f"Existing endpoint {ENDPOINT_NAME} deleted")

# Create the endpoint and block until it is ready, so the index-creation cell
# that follows does not run against an endpoint that is still provisioning.
client.create_endpoint_and_wait(
    name=ENDPOINT_NAME,
    endpoint_type="STANDARD",
)
print(f"Endpoint {ENDPOINT_NAME} created")


## Create Vector Search Index

In [0]:
# Create a Delta Sync index over silver_site on the endpoint created above.
# The endpoint is referenced through ENDPOINT_NAME rather than a repeated string
# literal so the two cells cannot drift apart.
index = client.create_delta_sync_index(
    endpoint_name=ENDPOINT_NAME,
    source_table_name=f"`{CATALOG}`.`{SCHEMA}`.silver_site",
    index_name=f"{CATALOG}.{SCHEMA}.mobi_site_index",
    pipeline_type="TRIGGERED",
    primary_key="site_page_id",                # Must be present in your table
    embedding_source_column="content_md",  # Text column for embedding
    embedding_model_endpoint_name="databricks-gte-large-en" # or any available model
)

# The next cells query the index right away, so wait here until it is ready.
index.wait_until_ready()

In [0]:
# Run a hybrid vector_search for "What is Mobi?" against the site index, asking
# for up to 50 results. The outer ORDER BY re-sorts the rows by site_page_id
# (descending) rather than leaving them in the order vector_search returned them.
# NOTE(review): the derived column is named site_page_id_bin_10 but the
# expression divides by 5 — confirm which bin width is intended.
query = f"""
SELECT
  *,
  floor(site_page_id / 5) AS site_page_id_bin_10
FROM vector_search(
  index => "{CATALOG}.{SCHEMA}.mobi_site_index",
  query_text => "What is Mobi?",
  num_results => 50,
  query_type => "hybrid"
)
ORDER BY site_page_id DESC
"""

df = spark.sql(query)
display(df)


In [0]:
# Register a SQL table function, site_search(description), that runs a hybrid
# vector_search over the site index and returns the top three matches.
# The catalog and schema are backtick-quoted here, matching how the function is
# called and how silver_site is referenced elsewhere in this notebook.
# NOTE(review): the body is SELECT *, so the columns vector_search returns must
# line up with the four columns declared in RETURNS TABLE — confirm against the
# silver_site schema.
query = f"""
CREATE OR REPLACE FUNCTION `{CATALOG}`.`{SCHEMA}`.site_search(
  description STRING COMMENT 'A search of mobi documents'
)
RETURNS TABLE (
  site_page_id INTEGER,
  title STRING,
  value STRING,
  search_score STRING
)
COMMENT 'Returns the top three documents matching semantic search.'
RETURN
SELECT *
FROM vector_search(
  index=>'{CATALOG}.{SCHEMA}.mobi_site_index',
  query_text=>description,
  num_results=>3,
  query_type=>'hybrid'
)
"""
df = spark.sql(query)
display(df)

In [0]:
# Smoke-test the site_search SQL function by calling it with the sample
# search text 'Trip Fares' and displaying whatever rows it returns.
query = f"""
SELECT * FROM `{CATALOG}`.`{SCHEMA}`.site_search('Trip Fares')
"""

df = spark.sql(query)
display(df)