In [0]:
%pip install -q -r requirements.txt
# Restart the Python process after the install (presumably so that the cells
# below import the package versions that were just installed - TODO confirm).
dbutils.library.restartPython()

In [0]:
import os 
import yaml
from dbruntime.databricks_repl_context import get_context

# For development purposes only: publish the current notebook context's API
# token and workspace host name through the DATABRICKS_TOKEN and
# DATABRICKS_HOST environment variables.
os.environ["DATABRICKS_TOKEN"] = get_context().apiToken
os.environ["DATABRICKS_HOST"] = "https://" + get_context().browserHostName

# Parse the smoke-test settings; the path is relative to the working directory.
with open('./smoke_test_config.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

# Catalog and schema names that the later cells create and write into.
databricks_config = config["databricks_config"]
catalog, schema = (databricks_config[key] for key in ("catalog", "schema"))

# Vector search settings: source table, endpoint, index and embedding model.
vs_config = config["vector_search_config"]
docs_table_name, vs_endpoint_name, vs_index_name, embedding_model = (
    vs_config[key]
    for key in (
        "document_table_name",
        "vector_search_endpoint_name",
        "vector_search_index_name",
        "embedding_model_endpoint_name",
    )
)

In [0]:
# Make sure the configured catalog exists before anything is created inside it.
try:
  spark.sql(f"CREATE CATALOG IF NOT EXISTS {catalog}")
except Exception as e:
  # Name the catalog (the object this cell actually creates) in the message,
  # then re-raise unchanged so the notebook run stops here.
  print(f"Failed to create catalog {catalog} with exception {str(e)}")
  raise

In [0]:
# Create the configured schema inside the catalog if it is not there yet.
try:
  create_schema_statement = f"CREATE SCHEMA IF NOT EXISTS {catalog}.{schema}"
  spark.sql(create_schema_statement)
except Exception as e:
  # Log the failure and let the same exception propagate.
  print(f"Failed to create schema {schema} with exception {str(e)}")
  raise

In [0]:
import os
try:
  # Fully qualified table name, shared by the write and the ALTER TABLE so
  # both always target the table named in the config.
  docs_table_full_name = f"{catalog}.{schema}.{docs_table_name}"

  # Load the sample documents from the parquet files under ./data and
  # (re)create the table from them.
  input_example = spark.read.parquet(f"file://{os.getcwd()}/data/*.parquet")
  input_example.write.mode("overwrite").saveAsTable(docs_table_full_name)

  # Enable change data feed on the table that was just written; this table is
  # later used as the source of the delta sync index.
  spark.sql(f"ALTER TABLE {docs_table_full_name} SET TBLPROPERTIES (delta.enableChangeDataFeed = true)")
except Exception as e:
  print(f"Failed to create test document table with exception {str(e)}")
  raise

In [0]:
import time
from datetime import timedelta
from databricks.vector_search.client import VectorSearchClient

vsc = VectorSearchClient(disable_notice=True)

# Step 1: request the endpoint. An "already exists" error is tolerated so the
# notebook can be re-run; any other error is reported and re-raised.
try:
    vsc.create_endpoint(name=vs_endpoint_name,
                        endpoint_type="STANDARD")
    # Short pause after the create request before polling the endpoint status.
    time.sleep(5)
except Exception as e:
    if "already exists" in str(e):
        print(f"Endpoint named {vs_endpoint_name} already exists.")
    else:
        print(f"Failed to create vector search endpoint with error: {str(e)}")
        raise

# Step 2: wait until the endpoint is ready, whether it was just created or
# already existed (an existing endpoint may still be provisioning, in which
# case the index creation in the next cell would run too early).
try:
    vsc.wait_for_endpoint(name=vs_endpoint_name,
                          timeout=timedelta(minutes=60),
                          verbose=True)
    print(f"Endpoint named {vs_endpoint_name} is ready.")
    vs_endpoint = vsc.get_endpoint(name=vs_endpoint_name)
except Exception as e:
    print(f"Failed while waiting for vector search endpoint {vs_endpoint_name} with error: {str(e)}")
    raise

In [0]:
# Fully qualified index name, used both to create the index and to look it up
# again when it already exists.
vs_index_full_name = f"{catalog}.{schema}.{vs_index_name}"

try:
  # Create a delta sync index over the documents table and block until the
  # call returns.
  vector_search_index = vsc.create_delta_sync_index_and_wait(
    endpoint_name=vs_endpoint_name,
    index_name=vs_index_full_name,
    source_table_name=f"{catalog}.{schema}.{docs_table_name}",
    primary_key="chunk_id",
    embedding_source_column="content_chunked",
    embedding_model_endpoint_name=embedding_model,
    verbose=True,
    pipeline_type="TRIGGERED"
  )
except Exception as e:
  if "already exists" in str(e):
    print(f"Index named {vs_index_name} already exists.")
    # Reuse the existing index: look it up on the configured endpoint under
    # the same fully qualified name that was used to create it.
    vector_search_index = vsc.get_index(
      endpoint_name=vs_endpoint_name,
      index_name=vs_index_full_name,
    )
  else:
    print(f"Failed to create vector search index with error: {str(e)}")
    raise

In [0]:
# Final confirmation once every cell above has completed without raising.
success_message = f"VS Index {catalog}.{schema}.{vs_index_name} created successfully!"
print(success_message)