In [0]:
%pip install sentence-transformers

In [0]:
%restart_python

In [0]:
# Databricks Notebook: 03_build_vector_index.py

import json
import os

import mlflow
import numpy as np
import pandas as pd
from pyspark.sql.functions import col, pandas_udf, udf
from pyspark.sql.types import ArrayType, FloatType, StringType
from sentence_transformers import SentenceTransformer

In [0]:
# Gold-layer source table and the sentence-transformers checkpoint name
CHUNKS_TABLE = "`docai-dbx`.gold.doc_chunks"
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

# Read the chunked text into a Spark DataFrame
df_chunks = spark.read.table(CHUNKS_TABLE)

# Instantiate the embedding model used by the UDF below
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Define UDF to generate embeddings
@pandas_udf(ArrayType(FloatType()))
def generate_embeddings_udf(text_series: pd.Series) -> pd.Series:
    """Encode a batch of text chunks into embedding vectors.

    Uses the module-level ``model`` to encode the texts.

    Args:
        text_series: Batch of chunk texts; entries may be null.

    Returns:
        A Series aligned position-by-position with ``text_series``. Each
        entry is the embedding as a list of Python floats, or ``None`` where
        the input text was null.
    """
    texts = text_series.tolist()
    # Only real strings are handed to model.encode; null chunks are skipped
    # and come back as NULL embeddings rather than failing the whole batch.
    valid_positions = [pos for pos, text in enumerate(texts) if pd.notna(text)]
    vectors = [None] * len(texts)
    if valid_positions:
        encoded = model.encode(
            [texts[pos] for pos in valid_positions], convert_to_numpy=True
        )
        for pos, vector in zip(valid_positions, encoded):
            vectors[pos] = vector.tolist()
    # dtype=object keeps each list as a single cell, even for an empty or
    # all-null batch.
    return pd.Series(vectors, dtype=object)

# UDF to convert embedding array to string
def embedding_to_string(embedding):
    """Serialize an embedding to a JSON array string.

    Args:
        embedding: JSON-serializable sequence of floats, or ``None``.

    Returns:
        The JSON text of ``embedding``, or ``None`` when ``embedding`` is
        ``None`` so that a missing embedding stays null instead of becoming
        the literal string "null".
    """
    if embedding is None:
        return None
    return json.dumps(embedding)

# Wrap the JSON serializer as a Spark UDF that returns a string column
embedding_to_string_udf = udf(embedding_to_string, StringType())

# Derive the embedding column and its stringified form in one chained expression
embedding_col = generate_embeddings_udf(col("text_chunk"))
df_embed = (
    df_chunks
    .withColumn("embedding", embedding_col)
    .withColumn("embedding_str", embedding_to_string_udf(col("embedding")))
)

# COMMAND ----------
# Overwrite the Delta table with the embedded chunks
embeddings_writer = (
    df_embed.write
    .format("delta")
    .option("mergeSchema", "true")
    .mode("overwrite")
)
embeddings_writer.saveAsTable("`docai-dbx`.gold.doc_embeddings")

# COMMAND ----------
# Completion message shown once the write has returned
print("✅ Embedding generation completed. Ready for vector search or LLM Q&A.")


In [0]:
%sql
-- Inspect the rows written to the embeddings table
SELECT *
FROM `docai-dbx`.gold.doc_embeddings