In [0]:
import yaml

# Load the ingestion settings. The encoding is pinned so that parsing does not
# depend on the platform's default locale encoding.
with open('ingestion_config.yml', 'r', encoding='utf-8') as file:
    config = yaml.safe_load(file)

# Top-level sections of the YAML file; a missing section raises KeyError here.
databricks_config = config['databricks_config']
llm_config = config['llm_config']

# Catalog / schema / table name of the silver table, as given in the config.
catalog = databricks_config['catalog']
schema = databricks_config['schema']
silver_table = databricks_config['silver_table']

In [0]:
from pyspark.sql.functions import col, lower

# Build the table name from the values read from ingestion_config.yml instead
# of a hard-coded "doan.alphaledger.sec_docs_silver", so the notebook follows
# whatever catalog/schema/table the config points at.
silver_table_name = f"{catalog}.{schema}.{silver_table}"

# Expand the fields of `extracted_data` into top-level columns, drop the
# original nested column, and lower-case `company` so that differently-cased
# spellings of one company fall into the same group.
silver = (
    spark.table(silver_table_name)
    .select("*", "extracted_data.*")
    .drop("extracted_data")
    .withColumn("company", lower(col("company")))
)
display(silver)

In [0]:
# Show row counts per document type, then per company.
for grouping_column in ("document_type", "company"):
    display(silver.groupBy(grouping_column).count())

In [0]:
# Manually clean up some company names.
from pyspark.sql.functions import when, col

# (variant spelling, canonical spelling) pairs, applied in this order.
company_name_fixes = [
    ("activsionblizzard", "activisionblizzard"),
    ("johnson johnson", "johnsonjohnson"),
    ("johnson_johnson", "johnsonjohnson"),
    ("pg_e", "pg&e"),
]

# Each rule rewrites only exact matches; every other row keeps its current value.
for variant, canonical in company_name_fixes:
    silver = silver.withColumn(
        "company",
        when(col("company") == variant, canonical).otherwise(col("company")),
    )

In [0]:
# Per-company row counts, to inspect the company names at this point.
company_counts = silver.groupBy("company").count()
display(company_counts)

In [0]:
# Instruction text shared by every per-chunk summarization prompt. The string
# ends with a blank line, so whatever is concatenated after it starts on its
# own line.
summarization_prompt = """The following document chunk was extracted from an SEC filing (10k, 8k, 10q, etc.). Summarize and situate the chunk in the larger context of an SEC doc. You should return a concise but descriptive summarization. Additionally, if the chunk contains any financial information, you should clearly highlight the financial metrics, data, and figures in the chunk. Additionally, you should identify the SEC filing items and section that the chunk falls into, if possible, in your summarization. RETURN ONLY THE SUMMARIZATION AND NOTHING ELSE. Below is some additional information to aid in your summarization.

"""

In [0]:
from pyspark.sql.functions import concat, lit

# Assemble the per-row prompt: the shared instruction text, then the chunk's
# metadata (company, document type, year), then the chunk text itself.
# NOTE(review): Spark's concat is expected to return NULL when any input is
# NULL, so a row missing one of these columns would get a NULL prompt -- confirm
# whether such rows exist and how they should be handled.
silver_with_summarization_prompt = silver.withColumn(
  "summarization_prompt",
  concat(
    lit(summarization_prompt),
    # Label spelling fixed (was "Comapny Name: "); this text is part of the prompt.
    lit("Company Name: "),
    col("company"),
    lit("\nSEC Document Type: "),
    col("document_type"),
    lit("\nDocument Year: "),
    # Explicit cast so the concat does not rely on implicit type coercion;
    # this is a no-op if `year` is already a string column.
    col("year").cast("string"),
    lit("\n\nDOCUMENT CHUNK:\n"),
    col("page_content")
  )
)

display(silver_with_summarization_prompt)

In [0]:
# Preview the summarization on the first 10 rows only.
preview_rows = silver_with_summarization_prompt.limit(10)
preview_with_summary = preview_rows.selectExpr(
    "*",
    "ai_summarize(summarization_prompt, 500) as ai_summarization",
)
display(preview_with_summary)

In [0]:
# Columns kept in the gold DataFrame, in output order.
gold_columns = [
    "chunk_id",
    "path",
    "page_content",
    "document_type",
    "year",
    "company",
    "ai_summarization",
]

# Summarize the first 10 prompt rows and keep only the gold columns.
sec_docs_gold = (
    silver_with_summarization_prompt
    .limit(10)
    .selectExpr("*", "ai_summarize(summarization_prompt, 500) as ai_summarization")
    .select(*gold_columns)
)