In [0]:
# Optional one-time setup (disabled): uncomment the lines below to install the
# listed packages and then restart the Python process.
# %pip install databricks-langchain
# %pip install databricks-vectorsearch
# %pip install "mlflow[databricks]"
# %pip install tiktoken
# dbutils.library.restartPython()

In [0]:
# Optional one-time setup (disabled): creates the target catalog and schema and
# selects them. To use it, remove this note and uncomment the lines below so that
# %sql is the first line of the cell.
# %sql
# CREATE CATALOG IF NOT EXISTS my_rag_catalog;
# CREATE SCHEMA IF NOT EXISTS my_rag_catalog.rag_schema;
# USE CATALOG my_rag_catalog;
# USE SCHEMA rag_schema;

In [0]:
%sql
-- List the tables in the source documentation dataset schema.
SHOW TABLES IN databricks_databricks_documentation_dataset.v01;

# Required packages

In [0]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import re
from utils.preprocess import perform_section_chunking,summarize_document
from tqdm.auto import tqdm

# Preprocess Data
- Reads the raw documentation, performs semantic + recursive chunking, and stores the processed chunks in Unity Catalog as an intermediate table.

In [0]:
spark = SparkSession.builder.getOrCreate()

# 🔹 Configure your Unity Catalog target
catalog_name = "my_rag_catalog"
schema_name = "rag_schema"
target_table = f"{catalog_name}.{schema_name}.docs_chunks_intermediate"

# 🔹 Read your source dataset
source_table = "databricks_databricks_documentation_dataset.v01.docs"
source_df = spark.table(source_table)

# 🔹 Optional cap on how many documents are pulled onto the driver.
# None (the default) loads the whole table, exactly as before; set a small
# integer for quick experiments or when the table does not fit in driver memory.
max_docs = None
docs_to_chunk_df = source_df if max_docs is None else source_df.limit(max_docs)

# Convert to Pandas for chunking. toPandas() collects every selected row into
# driver memory, which is why max_docs exists.
docs_pd = docs_to_chunk_df.toPandas()

print(f"Loaded {len(docs_pd)} documents for chunking")


In [0]:
# Accumulates the chunk records produced for every document.
all_chunks = []

# iterrows() is a generator without a length, so tqdm needs an explicit total
# to show a real progress bar and ETA.
for _, row in tqdm(docs_pd.iterrows(), total=len(docs_pd), desc="Chunking documents"):
    url = row["url"]
    doc_id = row["id"]
    content = row["content"]

    # Step 1: Document-level summary (for embedding)
    summary = summarize_document(content)

    # Step 2: Chunking for fine-grained retrieval
    chunks = perform_section_chunking(content, url, doc_id)

    # Attach summary to each chunk for easier join later
    # (assumes each chunk is a dict-like record — TODO confirm in utils.preprocess)
    for c in chunks:
        c["doc_summary"] = summary

    all_chunks.extend(chunks)

print(f"✅ Generated {len(all_chunks)} chunks across {len(docs_pd)} documents.")

# Spark cannot infer a schema from an empty list, so fail early with a clear
# message instead of an opaque schema-inference error.
if not all_chunks:
    raise ValueError(
        f"No chunks were generated from {len(docs_pd)} documents; "
        f"{target_table} was left untouched."
    )

# -------------------------------------------------------------------------
# 🔹 Save to Unity Catalog
# -------------------------------------------------------------------------
chunks_df = spark.createDataFrame(all_chunks)
# mode("overwrite") replaces any previous contents of the intermediate table.
chunks_df.write.mode("overwrite").saveAsTable(target_table)

print(f"✅ Successfully saved intermediate chunk table to: {target_table}")

In [0]:
# Read the saved intermediate chunk table back as a Spark DataFrame.
chunked_df = spark.table(target_table)


In [0]:
# Render the chunk table for inspection via the DataFrame's display() method.
chunked_df.display()

In [0]:
# Render the same DataFrame through the notebook's display() helper.
# NOTE(review): this repeats the chunked_df.display() cell above — confirm whether both are needed.
display(chunked_df)

# Creating the vector search endpoint

In [0]:
from databricks.vector_search.client import VectorSearchClient

vector_search_endpoint_name = "my_vector_search_endpoint"

# True keeps the original behaviour of starting from a brand-new endpoint on
# every run; set to False to reuse an endpoint that already exists.
recreate_endpoint = True

vsc = VectorSearchClient()

# List existing endpoints
endpoints = vsc.list_endpoints()
display(endpoints)

# The listing is a dict whose "endpoints" entry may be absent when the
# workspace has no endpoints yet.
existing_endpoint_names = {ep["name"] for ep in endpoints.get("endpoints", [])}
endpoint_exists = vector_search_endpoint_name in existing_endpoint_names

# Delete only when there is something to delete: an unconditional delete
# raises on a fresh workspace and stops "Run All" before anything is created.
if endpoint_exists and recreate_endpoint:
    vsc.delete_endpoint(name=vector_search_endpoint_name)
    endpoint_exists = False

# Now create the endpoint (skipped when an existing one is being reused)
if not endpoint_exists:
    vsc.create_endpoint(name=vector_search_endpoint_name)

In [0]:
%sql
-- Enable Delta Change Data Feed on the intermediate chunk table
-- (the table that the Delta Sync index below uses as its source).
ALTER TABLE my_rag_catalog.rag_schema.docs_chunks_intermediate SET TBLPROPERTIES (delta.enableChangeDataFeed = true);

In [0]:
# Derive the index name from the source table so the two cannot drift apart
# (evaluates to "my_rag_catalog.rag_schema.docs_chunks_intermediate_index").
index_name = f"{target_table}_index"

# Names of the indexes already hosted on the endpoint; the "vector_indexes"
# entry may be absent when the endpoint has no indexes yet.
existing_index_names = {
    existing["name"]
    for existing in vsc.list_indexes(name=vector_search_endpoint_name).get("vector_indexes", [])
}

if index_name in existing_index_names:
    # Re-run: creating the index again would fail. A TRIGGERED pipeline only
    # refreshes on request, so ask for a sync to pick up the rewritten table.
    vsc.get_index(
        endpoint_name=vector_search_endpoint_name,
        index_name=index_name,
    ).sync()
else:
    # First run: build a Delta Sync index that embeds chunk_text with the
    # hosted embedding model and is keyed by chunk_id.
    vsc.create_delta_sync_index(
        endpoint_name=vector_search_endpoint_name,
        source_table_name=target_table,
        index_name=index_name,
        pipeline_type="TRIGGERED",
        primary_key="chunk_id",
        embedding_source_column="chunk_text",
        embedding_model_endpoint_name="databricks-gte-large-en"
    )

# Querying the vector db

In [0]:
# Re-create the Vector Search client for the querying section.
# NOTE(review): this repeats the import and client from the endpoint-creation cell,
# presumably so this section can be run without the setup cells — confirm.
from databricks.vector_search.client import VectorSearchClient
vsc = VectorSearchClient()


In [0]:
# Endpoint and index to query. These repeat the literal values used by the
# creation cells above and must be kept in sync with them.
vector_search_endpoint_name = "my_vector_search_endpoint"
index_name = "my_rag_catalog.rag_schema.docs_chunks_intermediate_index"


In [0]:
# Obtain a handle to the index, then print the "status" entry of its description.
index = vsc.get_index(endpoint_name=vector_search_endpoint_name, index_name=index_name)
index_description = index.describe()
print(index_description["status"])

In [0]:
# Free-text query sent to the index.
query_text = "Set up and manage Unity Catalog"

# Columns to return with each hit (include your metadata columns here).
result_columns = ["chunk_text", "url", "chunk_id"]

# Ask the index for the 10 closest matches to the query text.
results = index.similarity_search(
    query_text=query_text, columns=result_columns, num_results=10
)

display(results)


In [0]:
# Inspect the row at position 2 of the returned data_array.
results["result"]["data_array"][2]

In [0]:
# Inspect the row at position 1 of the returned data_array.
results["result"]["data_array"][1]

In [0]:
# Inspect the first row (position 0) of the returned data_array.
results["result"]["data_array"][0]

In [0]:
# Inspect the row at position 4 of the returned data_array.
results["result"]["data_array"][4]

In [0]:
# Inspect the first row (position 0) of the returned data_array.
# NOTE(review): same expression as an earlier inspection cell — confirm whether this repeat is needed.
results["result"]["data_array"][0]