In [0]:
%load_ext autoreload
%autoreload 2
# Enables autoreload so edits to imported modules are picked up without re-importing by hand;
# learn more at https://docs.databricks.com/en/files/workspace-modules.html#autoreload-for-python-modules
# To disable autoreload, run %autoreload 0

In [0]:
# %pip install databricks-sdk
# Upgrade the Vector Search client package and pydantic, then restart Python,
# presumably so the upgraded versions are the ones imported by later cells.
# NOTE(review): versions are not pinned; consider pinning them for reproducible runs.
%pip install --upgrade databricks-vectorsearch pydantic
%restart_python

In [0]:
# Standard library
import os
import sys
import time
from datetime import timedelta

# Third-party: client class instantiated later in this notebook as `vsc`.
# This import was previously commented out, which made that cell fail with
# NameError on a fresh run.
from databricks.vector_search.client import VectorSearchClient

# Make the parent directory importable so the local `configs` package resolves;
# this has to run before the project import on the next line.
sys.path.append(os.path.abspath('..'))
from configs.project import ProjectConfig


In [0]:
import yaml

# Parse the project YAML (path is relative to this notebook's directory).
# safe_load is used, so only plain YAML data types are constructed.
with open("../configs/project.yml") as file:
    data = yaml.safe_load(file)

# Build the config object from the parsed top-level mapping.
projectConfig = ProjectConfig(**data)

# Last expression of the cell: display the config as a plain dict.
projectConfig.model_dump()

In [0]:
# Sanity check: show the class of the object built from the YAML config.
type(projectConfig)

In [0]:
# Print every top-level config field with its value.
# model_dump() returns a dict, so iterate .items() to get (key, value) pairs;
# iterating the dict directly yields only the keys and the two-name unpacking fails.
for field_name, field_value in projectConfig.model_dump().items():
    print(field_name, field_value)

In [0]:
# Echo the catalog / schema / volume settings read from the project config.
print("projectConfig.uc_catalog:", projectConfig.uc_catalog)
print("projectConfig.uc_schema:", projectConfig.uc_schema)
print("projectConfig.raw_data_volume:", projectConfig.raw_data_volume)

# List each configured vector search index together with its attributes.
# model_dump() replaces the deprecated pydantic v1-style .dict(), matching how
# the top-level config object is dumped elsewhere in this notebook.
for index_key, index_attrs in projectConfig.vector_search_attributes.items():
    print("vector search index:", index_key)
    for attr_name, attr_value in index_attrs.model_dump().items():
        print("\t", attr_name, attr_value)

In [0]:
# Display the raw vector_search_attributes mapping from the config.
projectConfig.vector_search_attributes

In [0]:
# Client used by the following cells to create / look up the endpoint and the index.
vsc = VectorSearchClient(disable_notice=True)

In [0]:
# Short local aliases for the two endpoint names read from the project config.
vector_search_endpoint_name = projectConfig.vector_search_endpoint_name
embedding_endpoint_name = projectConfig.embedding_endpoint_name


In [0]:
# Preview the table that is later passed as the index's source_table_name.
display(spark.table(projectConfig.table_sec_rag_docs_pages))

In [0]:
# Create the vector search endpoint if it does not exist yet, then wait until
# it is ready and fetch its details into `ep`.
try:
    vsc.create_endpoint(
        name=vector_search_endpoint_name,
        endpoint_type="STANDARD",
    )
except Exception as e:
    # A name clash is detected via the error message; anything else is
    # unexpected, so re-raise it with its original traceback.
    if "already exists" not in str(e):
        raise
    print(f"Endpoint named {vector_search_endpoint_name} already exists.")
else:
    # Brief pause after a fresh create request before polling its status.
    time.sleep(5)

# Wait in both cases: a pre-existing endpoint may still be provisioning, and
# the index creation further down needs a usable endpoint.
vsc.wait_for_endpoint(
    name=vector_search_endpoint_name,
    timeout=timedelta(minutes=60),
    verbose=True,
)
print(f"Endpoint named {vector_search_endpoint_name} is ready.")

# Endpoint details; ep["id"] is read by the permissions cell.
ep = vsc.get_endpoint(name=vector_search_endpoint_name)


In [0]:
from databricks.sdk import WorkspaceClient
from databricks.sdk.service import iam

# Set the endpoint's access control list: the "users" group gets CAN_MANAGE
# on the vector search endpoint identified by ep["id"].
w = WorkspaceClient()
w.permissions.set(
    request_object_type="vector-search-endpoints",
    request_object_id=ep["id"],
    access_control_list=[
        iam.AccessControlRequest(
            group_name="users",
            permission_level=iam.PermissionLevel.CAN_MANAGE,
        )
    ],
)

Test Embedding Endpoint

In [0]:
import mlflow
import mlflow.deployments

# Deployments client for the "databricks" target; the next cells use it to
# list endpoints and to send a test prediction request.
client = mlflow.deployments.get_deploy_client("databricks")


In [0]:
# Look up the configured embedding endpoint among the listed endpoints;
# an empty list means no endpoint with that name was returned.
[
    endpoint
    for endpoint in client.list_endpoints()
    if endpoint["name"] == embedding_endpoint_name
]


In [0]:
# Smoke-test the embedding endpoint with a single sample input.
# Uses the configured endpoint name (the same one passed to the index creation
# call) instead of a hard-coded name, so the test exercises the endpoint that
# the index will actually use.
client.predict(
    endpoint=embedding_endpoint_name,
    inputs={"input": ["What is Apache Spark?"]},
)


# Create Vector Search Index

In [0]:
# Show the configured name of the vector search index used in the next cell.
projectConfig.index_sec_rag_docs_pages

In [0]:

# Create a Delta Sync index over the source table and wait for the call to
# return; if the index already exists, fetch a handle to it instead.
try:
    vector_search_index = vsc.create_delta_sync_index_and_wait(
        endpoint_name=vector_search_endpoint_name,
        index_name=projectConfig.index_sec_rag_docs_pages,
        source_table_name=projectConfig.table_sec_rag_docs_pages,
        primary_key=projectConfig.pk_sec_rag_docs_pages,
        embedding_source_column=projectConfig.source_column_sec_rag_docs_pages,
        embedding_model_endpoint_name=embedding_endpoint_name,
        pipeline_type="TRIGGERED",
        verbose=True,
    )
except Exception as e:
    # A name clash is detected via the error message; anything else is
    # unexpected, so re-raise it with its original traceback.
    if "already exists" not in str(e):
        raise
    # Report the index name here (the message previously printed the endpoint name).
    print(f"Index named {projectConfig.index_sec_rag_docs_pages} already exists.")
    vector_search_index = vsc.get_index(
        endpoint_name=vector_search_endpoint_name,
        index_name=projectConfig.index_sec_rag_docs_pages,
    )

In [0]:
# Fetch the index handle by name. The previous right-hand side,
# `covid_trial_title_index`, is not defined in any preceding cell and raised
# NameError on a fresh run.
vector_search_index = vsc.get_index(
    endpoint_name=vector_search_endpoint_name,
    index_name=projectConfig.index_sec_rag_docs_pages,
)