
Set up a Vector Search index and MCP server for the gold_daily_customer_kwh_summary table.
Run this in a Databricks notebook.


In [0]:

%sql
-- Build the "_ne" working copy of the gold summary table that the rest of this
-- notebook modifies and indexes.
-- CREATE OR REPLACE keeps the cell re-runnable: a plain CREATE TABLE fails as soon
-- as the copy exists from a previous run.
CREATE OR REPLACE TABLE `na-dbxtraining`.biju_gold.gold_daily_customer_kwh_summary_ne
AS SELECT * FROM `na-dbxtraining`.biju_gold.gold_daily_customer_kwh_summary;

In [0]:

# Step 6: switch on Delta Change Data Feed for the working table
# (setting the property again when it is already 'true' is harmless).
banner = "=" * 80
print("\n" + banner)
print("Step 6: Enable Change Data Feed (if needed)")
print(banner)

# Change Data Feed is what lets the index stay synchronized with the table.
enable_cdf_statement = """
ALTER TABLE `na-dbxtraining`.biju_gold.gold_daily_customer_kwh_summary_ne
SET TBLPROPERTIES ('delta.enableChangeDataFeed' = 'true')
"""
spark.sql(enable_cdf_statement)

In [0]:

%sql
-- Add the free-text column that is later used as the embedding source.
-- Note: ADD COLUMN errors if search_text already exists, so this statement is
-- only re-runnable after the working table has been rebuilt.
ALTER TABLE `na-dbxtraining`.biju_gold.gold_daily_customer_kwh_summary_ne
ADD COLUMN search_text STRING;

-- Compose one descriptive sentence per row.
-- CONCAT returns NULL if ANY argument is NULL, so every piece is wrapped in
-- COALESCE, including the two numeric casts; otherwise a single missing reading
-- or cost would leave the whole search_text NULL.
UPDATE `na-dbxtraining`.biju_gold.gold_daily_customer_kwh_summary_ne
SET search_text = CONCAT(
    COALESCE(first_name, ''),
    ' ',
    COALESCE(last_name, ''),
    ' customer in ',
    COALESCE(city, ''),
    ' ',
    COALESCE(state, ''),
    ' on ',
    COALESCE(plan_name, ''),
    ' plan using ',
    COALESCE(CAST(total_kwh_daily AS STRING), 'unknown'),
    ' kWh daily costing $',
    COALESCE(CAST(calculated_cost_daily AS STRING), 'unknown')
  );

In [0]:
%sql
-- Preview five rows of the working table.
SELECT *
FROM `na-dbxtraining`.biju_gold.gold_daily_customer_kwh_summary_ne
LIMIT 5

In [0]:

# Step 1: Verify table exists and check structure
# This cell only PRINTS the SQL to run; it does not execute anything itself.
print("=" * 80)
print("Step 1: Verifying table structure")
print("=" * 80)

# Single source of truth for the table under inspection, so both suggested
# queries point at the same "_ne" working copy (the preview hint previously
# named the original table instead).
table_name = "`na-dbxtraining`.biju_gold.gold_daily_customer_kwh_summary_ne"

# Statement that shows the table's columns and extended metadata.
sql_query = f"""
DESCRIBE TABLE EXTENDED {table_name}
"""

# Statement that shows a handful of rows.
preview_query = f"SELECT * FROM {table_name} LIMIT 5"

print("Run this SQL to check table structure:")
print(sql_query)
print("\nOr check a few rows:")
print(preview_query)

In [0]:
%sql
-- Show a five-row sample of the working table.
SELECT *
FROM `na-dbxtraining`.biju_gold.gold_daily_customer_kwh_summary_ne
LIMIT 5

In [0]:

%pip install databricks-vectorsearch
# Restart the Python process after the install, presumably so the freshly
# installed package is importable in the cells below.
# NOTE(review): a restart normally clears Python variables defined in earlier
# cells; confirm nothing below relies on them. The package version is not pinned.
dbutils.library.restartPython()





In [0]:
# Step 3: Create Vector Search Endpoint
print("\n" + "=" * 80)
print("Step 3: Create Vector Search Endpoint")
print("=" * 80)


from databricks.vector_search.client import VectorSearchClient

# Initialize client
vsc = VectorSearchClient(disable_notice=True)

# Endpoint name for gold table
endpoint_name = "customer_kwh_endpoint"

try:
    vsc.create_endpoint(
        name=endpoint_name,
        endpoint_type="STANDARD"
    )
    print(f"✅ Endpoint '{endpoint_name}' created successfully")
except Exception as e:
    message = str(e)
    # An existing endpoint is fine: it makes this cell safe to re-run.
    if "already exists" in message.lower() or "RESOURCE_ALREADY_EXISTS" in message:
        print(f"ℹ️  Endpoint '{endpoint_name}' already exists")
    else:
        # Any other failure means there is no usable endpoint; report it and stop
        # here instead of letting later cells fail against a missing endpoint.
        print(f"⚠️  Error: {e}")
        raise


In [0]:
%sql
-- Spot-check the working table before indexing it.
-- Bounded like the other preview cells: an unbounded SELECT * renders a large
-- result full of customer names into the notebook output.
SELECT *
FROM `na-dbxtraining`.biju_gold.gold_daily_customer_kwh_summary_ne
LIMIT 5

In [0]:
# Step 5: Create Vector Search Index
print("\n" + "=" * 80)
print("Step 5: Create Vector Search Index")
print("=" * 80)

from databricks.vector_search.client import VectorSearchClient

vsc = VectorSearchClient(disable_notice=True)

# Configuration
endpoint_name = "customer_kwh_endpoint"
index_name = "na-dbxtraining.biju_gold.customer_kwh_embeddingsindex"
source_table = "na-dbxtraining.biju_gold.gold_daily_customer_kwh_summary_ne"

# NOTE(review): the table name and its reading_date column suggest one row per
# customer per day; if so, customer_id alone is not unique and may not be a valid
# primary key for the index. Confirm against the data.
try:
    vsc.create_delta_sync_index(
        pipeline_type="TRIGGERED",
        endpoint_name=endpoint_name,
        index_name=index_name,
        primary_key="customer_id",
        source_table_name=source_table,
        embedding_source_column="search_text",
        embedding_model_endpoint_name="databricks-bge-large-en"
    )
    print(f"✅ Vector Search Index '{index_name}' created successfully")
    print("   This may take a few minutes to sync...")
except Exception as e:
    message = str(e)
    # Same "already exists" detection as the endpoint-creation cell, so this cell
    # can be re-run without failing once the index is in place.
    if "already exists" in message.lower() or "RESOURCE_ALREADY_EXISTS" in message:
        print(f"ℹ️  Vector Search Index '{index_name}' already exists")
    else:
        raise

In [0]:
# Step 7: Test the Vector Search Index
print("\n" + "=" * 80)
print("Step 7: Test Vector Search Index")
print("=" * 80)


from databricks.vector_search.client import VectorSearchClient

vsc = VectorSearchClient(disable_notice=True)

endpoint_name = "customer_kwh_endpoint"
index_name = "na-dbxtraining.biju_gold.customer_kwh_embeddingsindex"

# Get the index
index = vsc.get_index(
    endpoint_name=endpoint_name,
    index_name=index_name
)

# Columns requested from the index. Each returned row is read back by these
# names instead of by bare positional indices.
result_columns = [
    "customer_id", "first_name", "last_name", "city", "state",
    "plan_name", "total_kwh_daily", "calculated_cost_daily", "reading_date",
]

# Test search - query for high energy usage customers
results = index.similarity_search(
    query_text="high energy usage customer in California",
    columns=result_columns,
    num_results=5
)

# A missing key and an empty data_array are both treated as "no results".
rows = results.get("result", {}).get("data_array") or []

print("🔍 Search Results:")
if rows:
    for i, row in enumerate(rows, 1):
        # zip stops at the shorter sequence, so any extra trailing value in a row
        # (presumably the similarity score) is ignored.
        hit = dict(zip(result_columns, row))
        # "\n" (not "\\n") so a blank line is printed between hits rather than a
        # literal backslash-n.
        print(f"\n{i}. Customer: {hit['first_name']} {hit['last_name']}")
        print(f"   Location: {hit['city']}, {hit['state']}")
        print(f"   Plan: {hit['plan_name']}")
        print(f"   Daily Usage: {hit['total_kwh_daily']:.2f} kWh")
        print(f"   Daily Cost: ${hit['calculated_cost_daily']:.2f}")
        print(f"   Date: {hit['reading_date']}")
else:
    print("No results found")



In [0]:
# Step 7: Test the Vector Search Index
# NOTE(review): this cell repeats the preceding Step 7 cell verbatim; consider
# removing one of the two.
print("\n" + "=" * 80)
print("Step 7: Test Vector Search Index")
print("=" * 80)


from databricks.vector_search.client import VectorSearchClient

vsc = VectorSearchClient(disable_notice=True)

endpoint_name = "customer_kwh_endpoint"
index_name = "na-dbxtraining.biju_gold.customer_kwh_embeddingsindex"

# Get the index
index = vsc.get_index(
    endpoint_name=endpoint_name,
    index_name=index_name
)

# Columns requested from the index. Each returned row is read back by these
# names instead of by bare positional indices.
result_columns = [
    "customer_id", "first_name", "last_name", "city", "state",
    "plan_name", "total_kwh_daily", "calculated_cost_daily", "reading_date",
]

# Test search - query for high energy usage customers
results = index.similarity_search(
    query_text="high energy usage customer in California",
    columns=result_columns,
    num_results=5
)

# A missing key and an empty data_array are both treated as "no results".
rows = results.get("result", {}).get("data_array") or []

print("🔍 Search Results:")
if rows:
    for i, row in enumerate(rows, 1):
        # zip stops at the shorter sequence, so any extra trailing value in a row
        # (presumably the similarity score) is ignored.
        hit = dict(zip(result_columns, row))
        # "\n" (not "\\n") so a blank line is printed between hits rather than a
        # literal backslash-n.
        print(f"\n{i}. Customer: {hit['first_name']} {hit['last_name']}")
        print(f"   Location: {hit['city']}, {hit['state']}")
        print(f"   Plan: {hit['plan_name']}")
        print(f"   Daily Usage: {hit['total_kwh_daily']:.2f} kWh")
        print(f"   Daily Cost: ${hit['calculated_cost_daily']:.2f}")
        print(f"   Date: {hit['reading_date']}")
else:
    print("No results found")


In [0]:
# Step 8: MCP Server URL
print("\n" + "=" * 80)
print("Step 8: MCP Server Configuration")
print("=" * 80)

# Building blocks of the MCP coordinates, kept separate so the URL and the tool
# name cannot drift apart.
workspace_host = "adb-1952652121322753.13.azuredatabricks.net"
catalog = "na-dbxtraining"
schema = "biju_gold"
index_short_name = "customer_kwh_embeddingsindex"

# The vector-search MCP path is /{catalog}/{schema}. The previous URL ended in
# "customer_kwh", which is neither the schema nor the index name.
mcp_url = f"https://{workspace_host}/api/2.0/mcp/vector-search/{catalog}/{schema}"

# Tool name is catalog, schema and index joined with double underscores.
mcp_tool_name = f"{catalog}__{schema}__{index_short_name}"

mcp_info = f"""
{mcp_url}

MCP Tool Name:
{mcp_tool_name}

This will be available after the index is created and synced.
"""

print(mcp_info)

print("\n" + "=" * 80)
print("Setup Complete!")
print("=" * 80)