In [0]:
%pip install -q databricks-vectorsearch
# The line above installs the databricks-vectorsearch package (-q keeps pip quiet).
# Restart the Python process so later cells import the freshly installed package.
# NOTE(review): the restart presumably clears Python state, so keep this cell first.
dbutils.library.restartPython()

In [0]:
import yaml

# Parse the ingestion settings from the YAML file that sits next to this notebook.
with open("ingestion_config.yml", "r") as config_file:
    config = yaml.safe_load(config_file)

# Top-level sections of the config file.
databricks_config = config["databricks_config"]
entity_resolution_config = config["entity_resolution_config"]

# Unity Catalog location of the entity table.
catalog = databricks_config["catalog"]
schema = databricks_config["schema"]
entities_table = entity_resolution_config["gold_entity_table"]

# Vector search settings: endpoint, index name, embedding model and the
# columns used as the primary key and as the text to embed.
vector_search_endpoint = entity_resolution_config["vector_search_endpoint"]
vector_search_index = entity_resolution_config["vector_search_index"]
embedding_model = entity_resolution_config["embedding_model"]
vector_search_id_column = entity_resolution_config["vector_search_id_column"]
embedding_source_column = entity_resolution_config["embedding_source_column"]

In [0]:
import os

# Read the company-name CSV from the notebook's working directory.
# The "file:" prefix makes Spark read from the driver's local filesystem path.
file_path = os.getcwd() + "/sec_companies.csv"
company_names = (
    spark.read
    .format("csv")
    .option("header", "true")  # first CSV row holds the column names
    .load("file:" + file_path)
)
display(company_names)

In [0]:
# Pull the company_name column back to the driver as a plain Python list.
# DataFrame.collect() is used instead of the RDD API (`.rdd.flatMap`), which is
# not available on Unity Catalog standard/shared access mode or serverless compute.
company_names_list = [
    row["company_name"]
    for row in company_names.select("company_name").collect()
]
company_names_list

In [0]:
# Persist the company names as the entity table named in the config,
# replacing any existing contents of that table.
(
    company_names.write
    .mode("overwrite")
    .saveAsTable(f"{catalog}.{schema}.{entities_table}")
)

In [0]:
from databricks.vector_search.client import VectorSearchClient

#create vector search index
# Fully qualified Unity Catalog names, derived from the loaded config so that
# every statement in this cell targets the same table and index.
source_table_name = f"{catalog}.{schema}.{entities_table}"
index_name = f"{catalog}.{schema}.{vector_search_index}"

# Enable Change Data Feed on the source table before creating the index
# (Databricks documents this as a prerequisite for Delta Sync indexes).
# The table name was previously hard-coded and could differ from the table
# the index is created on; it is now built from the same config values.
spark.sql(
    f"ALTER TABLE `{catalog}`.`{schema}`.`{entities_table}` "
    "SET TBLPROPERTIES (delta.enableChangeDataFeed = true)"
)

client = VectorSearchClient()

# Create a Delta Sync index over the entity table. Embeddings are computed
# from `embedding_source_column` by the configured model serving endpoint;
# the TRIGGERED pipeline type is passed through unchanged from the original.
index = client.create_delta_sync_index(
  endpoint_name=vector_search_endpoint,
  source_table_name=source_table_name,
  index_name=index_name,
  pipeline_type="TRIGGERED",
  primary_key=vector_search_id_column,
  embedding_source_column=embedding_source_column,
  embedding_model_endpoint_name=embedding_model
)