# OA Parse Raw
Parses Operational Agreement documents from `md_file_history` using `ai_parse_document()` and writes the parsed content to a Bronze table.

**Input**: `bronze_md.md_file_history` (documents tagged with OPERATIONAL_AGREEMENT)

**Output**: `bronze_md.md_oa_file_raw`


In [None]:
# Cell 0: Imports
# json decodes the JSON-encoded task values read from the "setup" task.
import json
# F supplies the Spark column functions used to build filters and expressions.
from pyspark.sql import functions as F
# Framework helper that writes a DataFrame to a table; this notebook passes it
# the principal / job / run identifiers alongside the data.
from clinical_data_standards_framework.utils import save_with_audit

# Reaching this line means every import above resolved.
print("✅ Framework loaded successfully")


In [None]:
# Cell 1: Read config & globals from setup task
# Every value below is read from the job task values of the upstream "setup" task.

globals_dict = json.loads(
    dbutils.jobs.taskValues.get(taskKey="setup", key="globals")
)
services_dict = json.loads(
    dbutils.jobs.taskValues.get(taskKey="setup", key="services")
)

# Debug print: show we really got something
print("Globals from YAML:", globals_dict)
print("Services from YAML:", list(services_dict.keys()))

flow_name = dbutils.jobs.taskValues.get(taskKey="setup", key="flow_name")
print(f"Flow from setup: {flow_name}")

# Audit globals (passed to save_with_audit when the output table is written)
created_by_principal = dbutils.jobs.taskValues.get(taskKey="setup", key="created_by_principal")
databricks_job_id    = dbutils.jobs.taskValues.get(taskKey="setup", key="databricks_job_id")
databricks_job_name  = dbutils.jobs.taskValues.get(taskKey="setup", key="databricks_job_name")
databricks_run_id    = dbutils.jobs.taskValues.get(taskKey="setup", key="databricks_run_id")

# Try to get pipeline_config; fail with clear message if missing
try:
    pipeline_config_raw = dbutils.jobs.taskValues.get(taskKey="setup", key="pipeline_config")
    # Check for a missing/empty value BEFORE slicing it for the preview print;
    # slicing None would otherwise hide the real problem behind a TypeError.
    if not pipeline_config_raw:
        raise ValueError("pipeline_config in taskValues is empty")
    print(f"pipeline_config (raw from taskValues): {pipeline_config_raw[:200]}...")
    pipeline_config = json.loads(pipeline_config_raw)
    # Later cells index pipeline_config by key, so anything but a JSON object is unusable.
    if not isinstance(pipeline_config, dict):
        raise ValueError(
            f"pipeline_config must be a JSON object, got {type(pipeline_config).__name__}"
        )
except Exception as e:
    # Broad catch is deliberate: whatever went wrong, re-raise with remediation
    # steps and keep the original exception attached as the explicit cause.
    raise ValueError(
        "pipeline_config task value is missing or invalid. "
        "Make sure:\n"
        "  1) The 'setup' task ran successfully, and\n"
        "  2) flow_name='operational_agreement_processor' exists in your YAML under 'pipelines', and\n"
        "  3) job_populate_config_cache has been run after YAML changes.\n"
        f"Root cause: {e}"
    ) from e

print(f"Top-level keys in pipeline_config: {list(pipeline_config.keys())}")


In [None]:
# Cell 2: Resolve OA pipeline configuration
# Combines the job-wide globals with the operational-agreement document config
# to determine which table to read, which rows to keep, and where to write.

docs_cfg = pipeline_config["documents"]["operational_agreement"]
source_cfg, output_cfg = docs_cfg["source"], docs_cfg["output"]

# Job-wide locations (from globals)
catalog = globals_dict["catalog"]              # e.g. "dta_poc"
bronze_schema = globals_dict["bronze_schema"]  # e.g. "bronze_md"

# Source selection (from pipeline-specific config); optional keys fall back to defaults
source_table_name = source_cfg["source_table"]                 # e.g. "md_dta_history"
filter_tags = source_cfg.get("filter_tags", [])                # list of tags
filter_active = source_cfg.get("filter_active", True)
filter_status = source_cfg.get("filter_status")                # e.g. "READY_FOR_PROCESSING"
file_extensions = source_cfg.get("file_extensions", [".docx"])

# Output target
raw_table_name = output_cfg["raw_table_name"]                  # e.g. "md_oa_file_raw"

# Fully qualified three-part table names
source_table_full = f"{catalog}.{bronze_schema}.{source_table_name}"
raw_table_full = f"{catalog}.{bronze_schema}.{raw_table_name}"

# Echo the resolved settings as one report
print(
    "\n".join(
        [
            "\nResolved OA config:",
            f"  catalog            = {catalog}",
            f"  bronze_schema      = {bronze_schema}",
            f"  source_table_full  = {source_table_full}",
            f"  raw_table_full     = {raw_table_full}",
            f"  filter_tags        = {filter_tags}",
            f"  filter_active      = {filter_active}",
            f"  filter_status      = {filter_status}",
            f"  file_extensions    = {file_extensions}",
        ]
    )
)


In [None]:
# Cell 3: Runtime widgets (override mode/tags if needed)
# Two notebook widgets let a run change the write mode and replace the
# configured tag filter without editing the pipeline config.

dbutils.widgets.dropdown("write_mode", "append", ["overwrite", "append"], "Write mode")
dbutils.widgets.text("tag_filter_override", "", "Tag filter override (comma-separated, optional)")

write_mode = dbutils.widgets.get("write_mode")
tag_filter_override = dbutils.widgets.get("tag_filter_override").strip()

# A non-blank override replaces the configured tags; blank entries
# (e.g. from "a,,b" or a lone ",") are dropped, and an override that
# contains no real tags leaves the configured filter untouched.
if tag_filter_override and (
    override_tags := [
        tag for tag in map(str.strip, tag_filter_override.split(",")) if tag
    ]
):
    print(f"\nOverriding config filter_tags with: {override_tags}")
    filter_tags = override_tags

print(
    "\n".join(
        [
            f"\nRuntime settings:",
            f"  write_mode         = {write_mode}",
            f"  effective_tags     = {filter_tags}",
        ]
    )
)


In [None]:
# Cell 4: Utility: catalog & schema

def use_catalog_and_schema(catalog: str, schema: str):
    """Make `catalog`.`schema` the session default, creating the schema if absent.

    Runs three SQL statements in order: switch catalog, create the schema when
    it does not exist yet, then switch to that schema. Both names are wrapped
    in backticks as-is.
    """
    statements = (
        f"USE CATALOG `{catalog}`",
        f"CREATE SCHEMA IF NOT EXISTS `{schema}`",
        f"USE SCHEMA `{schema}`",
    )
    for statement in statements:
        spark.sql(statement)


# Point the session at the Bronze schema resolved from config.
use_catalog_and_schema(catalog, bronze_schema)


In [None]:
# Cell 5: Generic function: history table → md_oa_file_raw

def _publish_oa_parse_result(status: str, row_count: int) -> None:
    """Publish the parse outcome as job task values for downstream tasks."""
    dbutils.jobs.taskValues.set(key="oa_parse_status", value=status)
    # The count is published as a string (e.g. "0").
    dbutils.jobs.taskValues.set(key="oa_parse_count", value=str(row_count))


def process_history_to_oa_raw(
    catalog: str,
    schema: str,
    source_table_full: str,
    raw_table_full: str,
    filter_tags: list,
    file_extensions: list,
    filter_active: bool = True,
    filter_status: str | None = None,
    parse_version: str = "2.0",
    write_mode: str = "append",
):
    """
    Parse candidate documents from a history table and write them to the raw table.

    Steps:
    1) Reads candidate docs from source_table_full.
    2) Applies generic filters (all combined with AND):
       - file_extension IN file_extensions (skipped when the list is empty)
       - document_tags contains ANY of filter_tags (skipped when the list is empty)
       - active == True (only when filter_active is truthy)
       - is_current == True (always applied)
       - status == filter_status (only when filter_status is provided)
    3) Uses ai_parse_document(unbase64(content_base64), map('version', parse_version)).
    4) Builds a 'content' string from parsed.document.elements, joining the
       element texts with blank lines and prefixing checkbox elements with
       '☒ ' (checked) or '□ ' (unchecked).
    5) Drops rows whose content is NULL or the empty string.
    6) Writes the result to raw_table_full using save_with_audit.

    Args:
        catalog: Catalog to switch the session to before reading.
        schema: Schema to create if needed and switch the session to.
        source_table_full: Fully qualified name of the table to read.
        raw_table_full: Fully qualified name of the table to write.
        filter_tags: Tags of which a document must carry at least one.
        file_extensions: Allowed values of the file_extension column.
        filter_active: Whether to keep only rows with active == True.
        filter_status: Required value of the status column, if any.
        parse_version: Value passed as the 'version' option of ai_parse_document.
        write_mode: Mode forwarded to save_with_audit.

    Returns:
        None. The outcome is published as task values: oa_parse_status is
        "NO_DATA" or "SUCCESS", and oa_parse_count is the written row count
        as a string.

    Note:
        The audit values (created_by_principal, databricks_job_id,
        databricks_job_name, databricks_run_id) are read from notebook-level
        globals, not from parameters.
    """
    # Function-local imports keep this cell self-contained.
    from functools import reduce
    import operator

    use_catalog_and_schema(catalog, schema)

    print(f"\n[process_history_to_oa_raw] Reading from: {source_table_full}")

    df = spark.table(source_table_full)

    # --- Build filters generically ---
    conds = []

    # File extensions
    if file_extensions:
        conds.append(F.col("file_extension").isin(file_extensions))

    # Tags: ANY of filter_tags in document_tags (OR-fold of one test per tag)
    if filter_tags:
        tag_conds = [F.array_contains(F.col("document_tags"), tag) for tag in filter_tags]
        conds.append(reduce(operator.or_, tag_conds))

    # Active flag
    if filter_active:
        conds.append(F.col("active") == True)  # noqa: E712 - Spark column comparison

    # Always current; this guarantees conds is never empty
    conds.append(F.col("is_current") == True)  # noqa: E712 - Spark column comparison

    # Status filter
    if filter_status:
        conds.append(F.col("status") == filter_status)

    # Apply all conditions (AND-fold)
    df_raw = df.filter(reduce(operator.and_, conds))

    src_count = df_raw.count()
    print(f"[process_history_to_oa_raw] Filtered history rows: {src_count}")

    if src_count == 0:
        print("[process_history_to_oa_raw] No matching rows to process; exiting early.")
        _publish_oa_parse_result("NO_DATA", 0)
        return

    # --- Parsing ---
    # Note: file_name, file_extension, file_version removed - available in md_file_history
    df_parsed = (
        df_raw
        .withColumn(
            "parsed",
            F.expr(
                f"ai_parse_document("
                f"  unbase64(content_base64), "
                f"  map('version','{parse_version}')"
                f")"
            )
        )
    )

    # --- Build 'content' from elements (generic, checkbox-aware) ---
    # Parent Document ID Logic:
    # - For files extracted from a ZIP: parent_document_id = ZIP's document_id (use it)
    # - For standalone uploads: parent_document_id = NULL, use document_id as fallback
    df_clean = (
        df_parsed
        .select(
            "document_id",
            # Use parent_document_id if from ZIP, else fallback to document_id
            F.coalesce(F.col("parent_document_id"), F.col("document_id")).alias("parent_document_id"),
            "extracted_path",
            F.expr(
                """
                concat_ws(
                  '\\n\\n',
                  transform(
                    try_cast(parsed:document:elements AS ARRAY<VARIANT>),
                    e ->
                      CASE
                        -- Checkbox-like element
                        WHEN lower(try_cast(e:type AS STRING)) = 'checkbox' THEN
                          (CASE
                             WHEN coalesce(try_cast(e:checked AS BOOLEAN), false) THEN '☒ '
                             ELSE '□ '
                           END)
                          ||
                          coalesce(
                            try_cast(e:label   AS STRING),
                            try_cast(e:text    AS STRING),
                            try_cast(e:content AS STRING),
                            ''
                          )

                        -- Fallback: normal text element
                        ELSE coalesce(
                               try_cast(e:content AS STRING),
                               try_cast(e:text    AS STRING),
                               ''
                             )
                      END
                  )
                )
                """
            ).alias("content")
        )
        .where(F.col("content").isNotNull() & (F.col("content") != ""))
    )

    # NOTE(review): this count is a separate Spark action from the write below,
    # so the parse may be evaluated twice - confirm whether that cost matters.
    clean_count = df_clean.count()
    print(f"[process_history_to_oa_raw] Clean rows: {clean_count}")

    if clean_count == 0:
        print("[process_history_to_oa_raw] No non-empty parsed content rows; nothing to write.")
        _publish_oa_parse_result("NO_DATA", 0)
        return

    # --- WRITE using save_with_audit ---
    save_with_audit(
        df=df_clean,
        table_name=raw_table_full,
        created_by_principal=created_by_principal,
        databricks_job_id=databricks_job_id,
        databricks_job_name=databricks_job_name,
        databricks_run_id=databricks_run_id,
        mode=write_mode,
    )

    # raw_table_full is already fully qualified, so it is logged without a catalog prefix.
    print(
        f"[process_history_to_oa_raw] Saved {clean_count} row(s) to "
        f"{raw_table_full} with save_with_audit (mode={write_mode})"
    )

    # Set task values for downstream
    _publish_oa_parse_result("SUCCESS", clean_count)


In [None]:
# Cell 6: MAIN - run parse-only step
# Feeds the settings resolved in the earlier cells into the parse function.
# Locations and selection lists go positionally (in signature order); the
# optional switches are named for readability.

process_history_to_oa_raw(
    catalog,
    bronze_schema,
    source_table_full,
    raw_table_full,
    filter_tags,
    file_extensions,
    filter_active=filter_active,
    filter_status=filter_status,
    parse_version="2.0",
    write_mode=write_mode,
)

# Printed whenever the call above returns without raising.
print("\n✅ Operational Agreement history → md_oa_file_raw pipeline completed (parse-only, with audits).")
