
Set up a Vector Search index and MCP server for the gold_daily_customer_kwh_summary table.
Run this in a Databricks notebook.


In [0]:
%sql
-- Preview every row and column of the `_ne` working copy of the gold summary table.
-- NOTE(review): this cell is listed before the CREATE TABLE cell; it can only
-- succeed once that table exists.
SELECT *
FROM `na-dbxtraining`.biju_gold.gold_daily_customer_kwh_summary_ne

In [0]:

%sql
-- Create the `_ne` working copy as a full snapshot (CTAS) of the gold summary table.
-- Later cells add the record_id and search_text columns to this copy.
CREATE TABLE `na-dbxtraining`.biju_gold.gold_daily_customer_kwh_summary_ne AS
SELECT *
FROM `na-dbxtraining`.biju_gold.gold_daily_customer_kwh_summary;

In [0]:
# Add a STRING column that will hold the synthetic key used as the
# Vector Search index primary key.
add_record_id_ddl = """
ALTER TABLE `na-dbxtraining`.biju_gold.gold_daily_customer_kwh_summary_ne
ADD COLUMN record_id STRING
"""
spark.sql(add_record_id_ddl)

In [0]:
# Fill record_id for every row with "<customer_id>_<reading_date as yyyyMMdd>".
# NOTE(review): this is only unique if there is at most one row per customer
# per reading_date — confirm against the table's grain.
populate_record_id_sql = """
UPDATE `na-dbxtraining`.biju_gold.gold_daily_customer_kwh_summary_ne
SET record_id = CONCAT(customer_id, '_', DATE_FORMAT(reading_date, 'yyyyMMdd'))
"""
spark.sql(populate_record_id_sql)

In [0]:
%sql
-- Add the STRING column that will hold the natural-language description of
-- each row; it is used as the embedding source column for the index.
ALTER TABLE `na-dbxtraining`.biju_gold.gold_daily_customer_kwh_summary_ne
  ADD COLUMN search_text STRING;

In [0]:
# Build the natural-language description that gets embedded for each row.
#
# CONCAT returns NULL as soon as ANY argument is NULL, so every column
# reference is NULL-guarded; otherwise a single missing value would blank the
# whole search_text for that row. Non-string columns are cast to STRING
# *before* COALESCE so the fallback literal never has to be coerced to the
# column's type.
spark.sql("""
UPDATE `na-dbxtraining`.biju_gold.gold_daily_customer_kwh_summary_ne
SET search_text = CONCAT(
    'Customer: ', COALESCE(first_name, ''), ' ', COALESCE(last_name, ''), '. ',
    'Location: ', COALESCE(city, ''), ', ', COALESCE(state, ''), '. ',
    'Energy plan: ', COALESCE(plan_name, ''), ' (Plan ID: ', COALESCE(CAST(plan_id AS STRING), ''), '). ',
    CASE
        -- Without this branch a NULL usage would fall through to 'Low'.
        WHEN total_kwh_daily IS NULL THEN 'Unknown energy usage: '
        WHEN total_kwh_daily > 20 THEN 'High energy usage: '
        WHEN total_kwh_daily > 10 THEN 'Medium energy usage: '
        ELSE 'Low energy usage: '
    END,
    COALESCE(CAST(total_kwh_daily AS STRING), 'unknown'), ' kWh per day. ',
    'Daily cost: $', COALESCE(CAST(calculated_cost_daily AS STRING), 'unknown'),
    ' at rate $', COALESCE(CAST(rate_per_kwh AS STRING), 'unknown'), ' per kWh. ',
    'Reading date: ', COALESCE(CAST(reading_date AS STRING), 'unknown'), '.'
)
""")


In [0]:

# Step 6: Turn on Delta Change Data Feed for the source table.
print("\n" + "=" * 80, "Step 6: Enable Change Data Feed (if needed)", "=" * 80, sep="\n")

# Setting the table property is what enables Change Data Feed; per the original
# author's note it is needed for automatic index synchronization.
spark.sql("""
ALTER TABLE `na-dbxtraining`.biju_gold.gold_daily_customer_kwh_summary_ne
SET TBLPROPERTIES ('delta.enableChangeDataFeed' = 'true')
""")

In [0]:

%pip install databricks-vectorsearch
# Restart the Python process so the freshly installed package can be imported.
# NOTE(review): restarting presumably discards Python variables defined in
# earlier cells — confirm before relying on state from above this point.
dbutils.library.restartPython()





In [0]:
# Step 3: Create Vector Search Endpoint
# Creates the endpoint that will host the index. An "already exists" error is
# treated as success so the cell can be re-run; any other failure is reported
# and re-raised, because the cells below cannot work without the endpoint.
print("\n" + "=" * 80)
print("Step 3: Create Vector Search Endpoint")
print("=" * 80)


from databricks.vector_search.client import VectorSearchClient

# Initialize client
vsc = VectorSearchClient(disable_notice=True)

# Endpoint name for gold table
endpoint_name = "customer_kwh_endpoint"

try:
    vsc.create_endpoint(
        name=endpoint_name,
        endpoint_type="STANDARD"
    )
    print(f"‚úÖ Endpoint '{endpoint_name}' created successfully")
except Exception as e:
    error_text = str(e)
    # Two spellings are checked: a human-readable message and the error code.
    if "already exists" in error_text.lower() or "RESOURCE_ALREADY_EXISTS" in error_text:
        print(f"‚ÑπÔ∏è  Endpoint '{endpoint_name}' already exists")
    else:
        print(f"‚ö†Ô∏è  Error: {e}")
        # Do not continue silently: index creation below would fail anyway,
        # with a less helpful error.
        raise


In [0]:
# ============================================================================
# PART 3: RECREATE VECTOR SEARCH INDEX WITH CORRECT PRIMARY KEY
# ============================================================================
# Drops any existing index of the same name, pauses briefly, then creates a
# Delta Sync index over the source table keyed on record_id.
print("\n" + "="*80, "PART 3: Recreating Vector Search Index", "="*80, sep="\n")

import time

from databricks.vector_search.client import VectorSearchClient

vsc = VectorSearchClient(disable_notice=True)

# Names of the endpoint, the index to (re)build and the Delta table behind it
endpoint_name = "customer_kwh_endpoint"
index_name = "na-dbxtraining.biju_gold.customer_kwh_embeddingsindex"
source_table = "na-dbxtraining.biju_gold.gold_daily_customer_kwh_summary_ne"

# Best-effort removal of a previous index: any failure (for example the index
# not existing yet) is reported as a note and otherwise ignored.
print("\nüóëÔ∏è Deleting existing index (if exists)...")
try:
    vsc.delete_index(endpoint_name=endpoint_name, index_name=index_name)
except Exception as e:
    print(f"‚ÑπÔ∏è Note: {e}")
else:
    print("‚úÖ Existing index deleted successfully")

# Fixed pause to give the deletion time to finish before re-creating
print("‚è≥ Waiting for deletion to complete...")
time.sleep(10)

# Settings for the new index: record_id is the primary key and search_text is
# the column whose text is embedded with the named model endpoint.
index_settings = dict(
    pipeline_type="TRIGGERED",
    endpoint_name=endpoint_name,
    index_name=index_name,
    primary_key="record_id",
    source_table_name=source_table,
    embedding_source_column="search_text",
    embedding_model_endpoint_name="databricks-bge-large-en",
)

print("\nüîß Creating new index with primary_key='record_id'...")
try:
    vsc.create_delta_sync_index(**index_settings)
except Exception as e:
    # Report, then propagate: later cells depend on this index existing.
    print(f"‚ùå Error creating index: {e}")
    raise
else:
    print(f"‚úÖ Vector Search Index '{index_name}' created successfully!")
    print("   Primary Key: record_id (customer_id + date)")
    print("   Embedding Column: search_text")
    print("   Model: databricks-bge-large-en")

In [0]:
# ============================================================================
# PART 4: WAIT FOR INDEX SYNCHRONIZATION
# ============================================================================
# Section banner: blank line, rule, title, rule.
print("\n" + "="*80, "PART 4: Waiting for Index Synchronization", "="*80, sep="\n")

def wait_for_index_sync(vsc, endpoint_name, index_name, timeout=900, poll_interval=30):
    """Poll a Vector Search index until it reports ready or the timeout expires.

    Each poll fetches the index via ``vsc.get_index(...)`` and reads the
    ``status`` entry of ``index.describe()``; the index counts as ready when
    ``status['ready']`` is truthy. Progress is printed on every poll.

    Args:
        vsc: Client object exposing ``get_index(endpoint_name=..., index_name=...)``.
        endpoint_name: Name of the endpoint hosting the index.
        index_name: Name of the index to wait for.
        timeout: Maximum number of seconds to wait overall.
        poll_interval: Seconds to pause between polls (default 30).

    Returns:
        True as soon as the index reports ready, False if ``timeout`` elapses
        first. Errors raised while polling are printed and retried, never
        propagated.
    """
    # Local import so the function does not rely on another cell having
    # imported ``time`` first.
    import time

    # Monotonic clock: immune to wall-clock adjustments during a long wait.
    start_time = time.monotonic()
    attempts = 0

    while time.monotonic() - start_time < timeout:
        attempts += 1
        try:
            index = vsc.get_index(endpoint_name=endpoint_name, index_name=index_name)
            status = index.describe().get('status') or {}
            ready = status.get('ready', False)
            message = status.get('message', 'Indexing in progress')
        except Exception as e:
            # The index may not be queryable yet; show the reason instead of
            # discarding it so persistent failures are diagnosable.
            print(f"‚è≥ [{attempts}] Waiting for index to be available... ({e})")
        else:
            elapsed = int(time.monotonic() - start_time)
            if ready:
                print(f"\n‚úÖ Index is ready after {elapsed}s!")
                print(f"   Status: {message}")
                return True
            print(f"‚è≥ [{elapsed}s] Index syncing... {message}")

        # Never sleep past the deadline: cap the pause at the time remaining.
        remaining = timeout - (time.monotonic() - start_time)
        if remaining <= 0:
            break
        time.sleep(max(0, min(poll_interval, remaining)))

    print(f"\n‚ö†Ô∏è Timeout after {timeout}s - Index may still be syncing")
    return False

# Block for up to 15 minutes (900 s) until the index reports ready
index_ready = wait_for_index_sync(vsc, endpoint_name, index_name, timeout=900)

if not index_ready:
    # Timed out: tell the user how to check the status manually later
    print(
        "\n‚ö†Ô∏è Index is still syncing. You can proceed with testing later.",
        "   Check status with: vsc.get_index(endpoint_name, index_name).describe()",
        sep="\n",
    )

# ============================================================================
# PART 5: TEST VECTOR SEARCH WITH MULTIPLE QUERIES
# ============================================================================
print("\n" + "="*80, "PART 5: Testing Vector Search", "="*80, sep="\n")

if index_ready:
    index = vsc.get_index(endpoint_name=endpoint_name, index_name=index_name)

    # Columns requested by every test query. The row formatters below index
    # into each result row by position, so this order matters.
    result_columns = ["customer_id", "first_name", "last_name", "city", "state",
                      "plan_name", "total_kwh_daily", "calculated_cost_daily", "reading_date"]

    def run_search_test(title, query_text, format_row, filters=None,
                        empty_message="‚ùå No results found"):
        """Run one similarity search against ``index`` and print its results.

        Args:
            title: Heading printed between two dashed rules.
            query_text: Natural-language query passed to ``similarity_search``.
            format_row: Callable ``(position, row) -> str`` rendering one result row.
            filters: Optional filter dict; omitted from the call when None.
            empty_message: Text printed when the response carries no data array.

        Any exception raised by the search or by formatting is printed rather
        than propagated, so one failing test does not stop the others.
        """
        print("\n" + "-"*80)
        print(title)
        print("-"*80)

        search_kwargs = {
            "query_text": query_text,
            "columns": result_columns,
            "num_results": 5,
        }
        if filters is not None:
            search_kwargs["filters"] = filters

        try:
            results = index.similarity_search(**search_kwargs)

            if 'result' in results and 'data_array' in results['result']:
                rows = results['result']['data_array']
                print(f"\n‚úÖ Found {len(rows)} results")
                for i, row in enumerate(rows, 1):
                    print(format_row(i, row))
            else:
                print(empty_message)
        except Exception as e:
            print(f"‚ùå Error in search: {e}")

    def format_detailed(i, row):
        """Multi-line rendering: name, location, plan, usage, cost, date."""
        return (f"\n{i}. Customer: {row[1]} {row[2]}\n"
                f"   Location: {row[3]}, {row[4]}\n"
                f"   Plan: {row[5]}\n"
                f"   Daily Usage: {row[6]:.2f} kWh\n"
                f"   Daily Cost: ${row[7]:.2f}\n"
                f"   Date: {row[8]}")

    # Test 1: High energy customers in California
    run_search_test(
        title="TEST 1: High Energy Customers in California (with filter)",
        query_text="customers with high daily energy usage over 15 kWh",
        format_row=format_detailed,
        filters={"state": "CA"},
    )

    # Test 2: Medium energy usage (no state filter)
    run_search_test(
        title="TEST 2: Medium Energy Usage Customers (no filter)",
        query_text="customer with medium energy consumption around 10-15 kWh per day",
        format_row=lambda i, row: (f"\n{i}. {row[1]} {row[2]} | {row[3]}, {row[4]} | "
                                   f"{row[5]} | {row[6]:.2f} kWh | ${row[7]:.2f}"),
    )

    # Test 3: Specific plan search
    run_search_test(
        title="TEST 3: Fixed Rate 12 Plan Customers",
        query_text="customers on Fixed Rate 12 energy plan",
        format_row=lambda i, row: (f"\n{i}. {row[1]} {row[2]} | {row[3]}, {row[4]} | "
                                   f"{row[6]:.2f} kWh | ${row[7]:.2f}"),
        filters={"plan_name": "Fixed Rate 12"},
    )

    # Test 4: Usage-based filter.
    # For dict filters the comparison operator is part of the KEY
    # ("column >": value); a (">", 20.0) tuple as the value is not a
    # comparison filter.
    run_search_test(
        title="TEST 4: Customers Using More Than 20 kWh Daily",
        query_text="high energy consumption customer",
        format_row=lambda i, row: (f"\n{i}. {row[1]} {row[2]} | {row[3]}, {row[4]} | "
                                   f"{row[6]:.2f} kWh (>${row[7]:.2f})"),
        filters={"total_kwh_daily >": 20.0},
        empty_message="‚ùå No results found - may not have customers over 20 kWh",
    )

else:
    print("\n‚ö†Ô∏è Skipping tests - Index is not ready yet")
    print("   Re-run this section after the index has synced")

In [0]:
# Step 7: Test the Vector Search Index
# Standalone smoke test: re-creates the client, fetches the index and runs a
# single unfiltered similarity search, printing the top 5 matches.
print("\n" + "=" * 80)
print("Step 7: Test Vector Search Index")
print("=" * 80)


from databricks.vector_search.client import VectorSearchClient

vsc = VectorSearchClient(disable_notice=True)

endpoint_name = "customer_kwh_endpoint"
index_name = "na-dbxtraining.biju_gold.customer_kwh_embeddingsindex"

# Get the index
index = vsc.get_index(
    endpoint_name=endpoint_name,
    index_name=index_name
)

# Test search - query for high energy usage customers.
# Result rows are positional, in the order of `columns` below.
results = index.similarity_search(
    query_text="high energy usage customer in California",
    columns=["customer_id", "first_name", "last_name", "city", "state", "plan_name", "total_kwh_daily", "calculated_cost_daily", "reading_date"],
    num_results=5
)

print("üîç Search Results:")
if 'result' in results and 'data_array' in results['result']:
    for i, result in enumerate(results['result']['data_array'], 1):
        # "\n" (not "\\n") so a blank line separates results instead of a
        # literal backslash-n being printed.
        print(f"\n{i}. Customer: {result[1]} {result[2]}")
        print(f"   Location: {result[3]}, {result[4]}")
        print(f"   Plan: {result[5]}")
        print(f"   Daily Usage: {result[6]:.2f} kWh")
        print(f"   Daily Cost: ${result[7]:.2f}")
        print(f"   Date: {result[8]}")
else:
    print("No results found")



In [0]:
# Step 7: Test the Vector Search Index
# NOTE(review): this cell repeats the preceding "Step 7" cell (same query,
# same columns); consider keeping only one of them.
print("\n" + "=" * 80)
print("Step 7: Test Vector Search Index")
print("=" * 80)


from databricks.vector_search.client import VectorSearchClient

vsc = VectorSearchClient(disable_notice=True)

endpoint_name = "customer_kwh_endpoint"
index_name = "na-dbxtraining.biju_gold.customer_kwh_embeddingsindex"

# Get the index
index = vsc.get_index(
    endpoint_name=endpoint_name,
    index_name=index_name
)

# Test search - query for high energy usage customers.
# Result rows are positional, in the order of `columns` below.
results = index.similarity_search(
    query_text="high energy usage customer in California",
    columns=["customer_id", "first_name", "last_name", "city", "state", "plan_name", "total_kwh_daily", "calculated_cost_daily", "reading_date"],
    num_results=5
)

print("üîç Search Results:")
if 'result' in results and 'data_array' in results['result']:
    for i, result in enumerate(results['result']['data_array'], 1):
        # "\n" (not "\\n") so a blank line separates results instead of a
        # literal backslash-n being printed.
        print(f"\n{i}. Customer: {result[1]} {result[2]}")
        print(f"   Location: {result[3]}, {result[4]}")
        print(f"   Plan: {result[5]}")
        print(f"   Daily Usage: {result[6]:.2f} kWh")
        print(f"   Daily Cost: ${result[7]:.2f}")
        print(f"   Date: {result[8]}")
else:
    print("No results found")


In [0]:
# ============================================================================
# MCP SERVER URL GENERATION 
# ============================================================================
# Derives the managed MCP server URL and tool name for the vector search index
# and prints a ready-to-paste client configuration snippet. Nothing is created
# or called here; the cell only builds and prints strings.

import json

# Step 8: MCP Server URL Generation
print("\n" + "=" * 80)
print("Step 8: MCP Server Configuration")
print("=" * 80)

# Configuration
DATABRICKS_HOST = "https://adb-1952652121322753.13.azuredatabricks.net"
CATALOG = "na-dbxtraining"
SCHEMA = "biju_gold"
INDEX_NAME = "customer_kwh_embeddingsindex"
# Local label for the server entry in the client config; NOT part of the URL.
MCP_SERVER_NAME = "customer_kwh"

# Generate MCP Server URL.
# The managed vector-search MCP endpoint is addressed by catalog and SCHEMA
# (.../mcp/vector-search/{catalog}/{schema}); the arbitrary local server label
# must not be used as the last path segment.
MCP_SERVER_URL = f"{DATABRICKS_HOST}/api/2.0/mcp/vector-search/{CATALOG}/{SCHEMA}"
MCP_TOOL_NAME = f"{CATALOG}__{SCHEMA}__{INDEX_NAME}"

print(f"\n‚úÖ MCP Server URL:\n   {MCP_SERVER_URL}")
print(f"\nüîß MCP Tool Name:\n   {MCP_TOOL_NAME}")

# Claude Desktop Configuration
claude_config = {
    "mcpServers": {
        MCP_SERVER_NAME: {
            "type": "url",
            "url": MCP_SERVER_URL,
            "name": MCP_SERVER_NAME
        }
    }
}

print(f"\nüìù Add to claude_desktop_config.json:")
print(json.dumps(claude_config, indent=2))

print("\n" + "=" * 80)
print("MCP Server Ready!")
print("=" * 80)