In [0]:
%pip install -q pyyaml

In [0]:
import yaml

# Read the ingestion settings from the YAML file next to this notebook.
with open('ingestion_config.yml', 'r') as config_file:
    config = yaml.safe_load(config_file)

# Top-level configuration sections
databricks_config = config['databricks_config']
chunk_extraction_config = config['chunk_extraction_config']

# Catalog / schema and the names of the silver and gold chunk tables
catalog = databricks_config['catalog']
schema = databricks_config['schema']
silver_chunk_table = chunk_extraction_config["silver_table"]["name"]

gold_table_config = chunk_extraction_config["gold_table"]
gold_chunk_table_name = gold_table_config["name"]
gold_chunk_table_schema = gold_table_config["schema"]

# LLM endpoint and prompt, read from the gold table's summarization_llm_config
summarization_llm_config = gold_table_config["summarization_llm_config"]
scoring_llm_endpoint = summarization_llm_config["llm_endpoint_name"]
scoring_llm_prompt = summarization_llm_config["prompt"]

In [0]:
from pyspark.sql.functions import col, lower
# Load the silver chunk table and keep only rows whose quality_score equals "1".
silver_table_fqn = f"{catalog}.{schema}.{silver_chunk_table}"
silver_doc_chunks = spark.table(silver_table_fqn).where(col("quality_score") == "1")

In [0]:
import uuid
def generate_uuid(text):
    """Return the version-5 (name-based) UUID of ``text`` as a string.

    The UUID is computed in the DNS namespace, so the same ``text`` always
    yields the same identifier.
    """
    name_based_id = uuid.uuid5(uuid.NAMESPACE_DNS, text)
    return str(name_based_id)

# Import `udf` explicitly: no cell in this notebook imports it, so the original
# line only works when the notebook runtime happens to pre-define the name.
from pyspark.sql.functions import udf

# Spark UDF wrapper around generate_uuid. The return type is spelled out
# ("string" is also what udf() uses when no return type is given).
generate_uuid_udf = udf(generate_uuid, returnType="string")

In [0]:
from pyspark.sql.functions import lit, concat, substring

# Build the gold chunk table:
#   1. ask the LLM endpoint to summarize each chunk (prompt + doc_content),
#   2. rewrite doc_content as a header (company, document type, year, summary)
#      followed by the original chunk text,
#   3. derive chunk_id from the summary and keep only the configured columns.
#
# The prompt is attached as a column literal instead of being spliced into the
# SQL text. Previously a prompt containing a single quote (e.g. "don't") ended
# the SQL string literal early and broke the ai_query expression, and any
# backslashes in the prompt were re-interpreted by the SQL parser.
# NOTE(review): the endpoint name is still interpolated into the SQL text;
# confirm it can never contain a quote character.
gold_chunk_table = (
    silver_doc_chunks.withColumn(
        # Temporary column holding the full request sent to the endpoint.
        "__summarization_request",
        concat(lit(scoring_llm_prompt), col("doc_content")),
    )
    .selectExpr(
        "*",
        f"ai_query('{scoring_llm_endpoint}', __summarization_request) as doc_summarization",
    )
    # Drop the helper column so the column set matches what later steps expect.
    .drop("__summarization_request")
    .withColumn(
        "doc_content",
        concat(
            lit("COMPANY: "),
            col("resolved_company"),
            lit("\nDOCUMENT TYPE: "),
            col("document_type"),
            lit("\nDOCUMENT YEAR: "),
            col("year"),
            lit("\nDOC SUMMARY:\n"),
            col("doc_summarization"),
            lit("\n\nDOC CONTENT:\n"),
            col("doc_content"),
        ),
    )
    .withColumn(
        # NOTE(review): the id is a UUID of only the first 200 characters of the
        # summary, so chunks whose summaries share that prefix get the same
        # chunk_id — confirm this is acceptable.
        "chunk_id", generate_uuid_udf(substring(col("doc_summarization"), 1, 200))
    )
    # gold_chunk_table_schema comes from the config; presumably the list of
    # output column names — verify against ingestion_config.yml.
    .select(gold_chunk_table_schema)
)

In [0]:
# Show a 10-row preview of the gold chunk table.
gold_chunk_preview = gold_chunk_table.limit(10)
display(gold_chunk_preview)

In [0]:
# Save the gold chunks as a table, overwriting any existing contents.
gold_table_fqn = f"{catalog}.{schema}.{gold_chunk_table_name}"
overwrite_writer = gold_chunk_table.write.mode("overwrite")
overwrite_writer.saveAsTable(gold_table_fqn)

In [0]:
# Set the delta.enableChangeDataFeed table property on the gold table.
cdf_target_table = f"{catalog}.{schema}.{gold_chunk_table_name}"
enable_cdf_sql = f"ALTER TABLE {cdf_target_table} SET TBLPROPERTIES (delta.enableChangeDataFeed = true)"
spark.sql(enable_cdf_sql)