In [0]:
%pip install transformers

In [0]:
%restart_python

In [0]:
# Databricks Notebook: 02_generate_embeddings.py

from pyspark.sql.functions import col, explode, monotonically_increasing_id, udf, length, lower, trim
from pyspark.sql.types import ArrayType, StringType
from transformers import AutoTokenizer

In [0]:
# Fully qualified Delta table names. The catalog name contains a hyphen and is
# therefore backtick-quoted.
BRONZE_DOCUMENTS_TABLE = "`docai-dbx`.bronze.documentdata"
SILVER_DOCUMENTS_TABLE = "`docai-dbx`.silver.clean_documents"

# Load raw documents from bronze layer
df_raw = spark.read.format("delta").table(BRONZE_DOCUMENTS_TABLE)

# COMMAND ----------
# Create Silver layer: clean and enrich documents
#   clean_text : raw_text lower-cased, then trimmed of leading/trailing whitespace
#   char_count : length of the *untouched* raw_text (not of clean_text)
df_silver = (
    df_raw
    .withColumn("clean_text", trim(lower(col("raw_text"))))
    .withColumn("char_count", length("raw_text"))
)

# Save to Silver layer (replaces the table contents on every run)
(df_silver.write
 .format("delta")
 .mode("overwrite")
 .saveAsTable(SILVER_DOCUMENTS_TABLE))

# Tokenizer to split into approximate chunks
# NOTE(review): nothing in the visible cells references `tokenizer`; the chunking
# UDF below splits on whitespace instead. Confirm a later cell needs it before
# paying for the model download here.
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

# Chunking function
@udf(ArrayType(StringType()))
def chunk_text(text):
    """Split ``text`` into consecutive chunks of whitespace-separated words.

    Args:
        text: Document text, or ``None``.

    Returns:
        A list of strings. Each string is up to 500 words re-joined with single
        spaces. Chunks of 20 characters or fewer are discarded. ``None`` input
        yields an empty list.

    NOTE(review): the window is counted in whitespace-delimited words, not in
    model tokens -- confirm 500 words fits the downstream embedding model.
    """
    words_per_chunk = 500
    min_chunk_chars = 20

    if text is None:
        return []

    tokens = text.split()
    # Words produced by split() carry no surrounding whitespace, so each joined
    # piece is already stripped.
    pieces = (
        " ".join(tokens[start:start + words_per_chunk])
        for start in range(0, len(tokens), words_per_chunk)
    )
    return [piece for piece in pieces if len(piece) > min_chunk_chars]

# Keep only documents whose raw_text is longer than 100 characters. This only
# narrows the in-memory DataFrame; the Silver table was already written.
df_silver = df_silver.where(length(col("raw_text")) > 100)

# Apply chunking on Silver data: one output row per chunk.
exploded_chunks = (
    df_silver
    .withColumn("chunks", chunk_text(col("clean_text")))
    .select("file_name", "doc_id", explode(col("chunks")).alias("text_chunk"))
)

# Enrich each chunk row.
#   chunk_id   : unique 64-bit id from monotonically_increasing_id(); values are
#                not consecutive and are not stable across runs/partitionings
#   source_doc : copy of file_name
#   chunk_len  : character length of the chunk text
df_chunks = exploded_chunks.select(
    "*",
    monotonically_increasing_id().alias("chunk_id"),
    col("file_name").alias("source_doc"),
    length(col("text_chunk")).alias("chunk_len"),
)


In [0]:
# Save to Gold Delta table
# The catalog name contains a hyphen and is therefore backtick-quoted.
GOLD_CHUNKS_TABLE = "`docai-dbx`.gold.doc_chunks"

gold_writer = (
    df_chunks.write
    .format("delta")
    .mode("overwrite")  # replaces the table contents on every run
    # Delta schema-evolution option: tolerate columns in df_chunks that the
    # existing table does not have yet.
    .option("mergeSchema", "true")
)
gold_writer.saveAsTable(GOLD_CHUNKS_TABLE)

print("✅ Chunking and preprocessing completed. Ready for embeddings.")