# Approve DTA Version (Create DTA Major)

Promotes the current draft from Silver to DTA Major in Gold library for ALL configured library types.

**Operation:** `APPROVE_DTA` / `CREATE_DTA_APPROVED`

**Modes:**
- `HISTORICAL`: Promotes draft to DTA Major (auto-approval for historical imports)
- `UI`: Promotes draft to DTA Major (triggered by workflow approval)

**What it does:**
1. Gets DTA ID(s), `source`, and `library_type` from the `create_dta_instance` task values
2. Iterates over ALL configured library types (transfer_variables, test_concepts, etc.)
3. For each library type with records:
   - Reads draft records from Silver
   - Creates DTA Major version in Gold library (copies records with `version` = major version)
   - Updates Silver records (and any configured related tables): `version_status=APPROVED`, `is_current_draft=false`
   - Registers DTA_APPROVED version in registry
4. Updates DTA entity: `version`, `status=ACTIVE`, `workflow_state=APPROVED` (`status` aligns 1:1 with md_version_registry.status)

**Version handling:**
- **Silver**: Keeps draft version tag (e.g., `1.0-DTA001-draft1`) with `version_status=APPROVED`
- **Gold**: Gets major version in `version` column (e.g., `1.0-DTA001-v1.0`)

**Supported Library Types:**
- `transfer_variables` - md_dta_transfer_variables
- `test_concepts` - md_dta_vendor_test_concepts

**Parameters:**
- `dta_id` - Single DTA identifier (for UI/API call)
- `source` - HISTORICAL or UI
- `library_type` - Optional filter (empty = process ALL library types)


In [None]:
# Cell 1: Imports
import json
from datetime import datetime
from pyspark.sql import functions as F
from clinical_data_standards_framework.versioning import get_version_registry_schema
from clinical_data_standards_framework.utils import log_dta_version_event, log_dta_workflow_event, compute_definition_hash_spark

# Notebook banner: rule, title, rule (one line each).
print("="*80, "APPROVE DTA VERSION (Create DTA Major)", "="*80, sep="\n")


In [None]:
# Cell 2: Get Configuration from Setup Task
def _from_setup(key):
    """Return the task value `key` published by the upstream `setup` task."""
    return dbutils.jobs.taskValues.get(taskKey="setup", key=key)


# JSON-encoded config blobs first, then the plain audit values.
globals_dict = json.loads(_from_setup("globals"))
versioning_dict = json.loads(_from_setup("versioning"))
created_by_principal = _from_setup("created_by_principal")
databricks_job_id = _from_setup("databricks_job_id")
databricks_job_name = _from_setup("databricks_job_name")
databricks_run_id = _from_setup("databricks_run_id")

catalog, silver_schema, gold_schema = (
    globals_dict[name] for name in ('catalog', 'silver_schema', 'gold_schema')
)

# Versioning configuration from YAML
library_tables = versioning_dict.get('library_tables', [])
registry_table_name = versioning_dict.get('registry_table', 'md_version_registry')

# Definition hash configuration (all from the versioning section).
# The common fields are appended to the hash fields of EVERY library type.
definition_hash_common_fields = versioning_dict.get(
    'definition_hash_common_fields',
    ['data_provider_name', 'data_stream_type'],
)

# Per-library-type hash fields: a shallow copy of the configured mapping.
definition_hash_fields_config = versioning_dict.get('definition_hash_fields_by_type', {})
definition_hash_fields_by_type = dict(definition_hash_fields_config)

# Single pass over the library config builds both lookup maps:
#   gold_columns_by_type:   {library_type: [column_names]}
#   related_tables_by_type: {library_type: [{silver_table, gold_table, gold_columns}, ...]}
# Entries without a library_type are ignored; related tables are only
# recorded when the list is non-empty.
gold_columns_by_type = {}
related_tables_by_type = {}
for lib_config in library_tables:
    lib_type = lib_config.get('library_type')
    if not lib_type:
        continue
    gold_columns_by_type[lib_type] = lib_config.get('gold_columns', [])
    related_tables = lib_config.get('related_tables', [])
    if related_tables:
        related_tables_by_type[lib_type] = related_tables

print(f"Catalog: {catalog}")
print(f"Silver Schema: {silver_schema}")
print(f"Gold Schema: {gold_schema}")
print(f"Created by: {created_by_principal}")
print(f"Definition hash common fields: {definition_hash_common_fields}")
print(f"Definition hash fields by type: {list(definition_hash_fields_by_type.keys())}")
print(f"Gold columns configured for: {list(gold_columns_by_type.keys())}")
print(f"Related tables configured for: {list(related_tables_by_type.keys())}")


In [None]:
# Cell 3: Get Parameters from Task Values (set by create_dta_instance)
# This notebook runs as part of job_cdm_dta_create after save_draft

# Get DTA IDs and parameters from create_dta_instance task values.
# Only the task-value reads live in the try body; the original failure is
# chained onto the ValueError so the root cause stays visible in the traceback.
try:
    dta_ids = json.loads(dbutils.jobs.taskValues.get(taskKey="create_dta_instance", key="created_dta_ids"))
    source = dbutils.jobs.taskValues.get(taskKey="create_dta_instance", key="source")
    library_type_param = dbutils.jobs.taskValues.get(taskKey="create_dta_instance", key="library_type")
except Exception as e:
    raise ValueError(
        f"Could not get task values from create_dta_instance: {e}. "
        "This notebook must run after create_dta_instance task in the job pipeline."
    ) from e
else:
    print("Mode: Reading from create_dta_instance task values")

# Validate we have at least one DTA ID
if not dta_ids:
    raise ValueError("No DTAs created by create_dta_instance task")

print(f"Source: {source}")
print(f"DTAs to process: {len(dta_ids)}")
print(f"DTA IDs: {dta_ids}")

# Determine which library types to process:
#   - library_type given  -> only the config entries with that library_type
#   - library_type empty  -> ALL configured library types
# Either way an empty result is a configuration error and aborts the run.
if library_type_param:
    library_types_to_process = [lib for lib in library_tables if lib.get('library_type') == library_type_param]
    if not library_types_to_process:
        raise ValueError(
            f"library_type '{library_type_param}' not found in config. "
            f"Available types: {[lib.get('library_type') for lib in library_tables]}"
        )
else:
    # Process ALL configured library types
    if not library_tables:
        raise ValueError(
            "No library_tables configured in versioning config. "
            "Check that md_config_cache is populated correctly."
        )
    library_types_to_process = library_tables

print(f"Library types to process: {[lib.get('library_type') for lib in library_types_to_process]}")


In [None]:
# Cell 4: Configure Table Names
def _qualified(schema, name):
    """Return `<catalog>.<schema>.<name>` using the notebook-level catalog."""
    return f"{catalog}.{schema}.{name}"


dta_table = f"{catalog}.{gold_schema}.dta"
registry_table = f"{catalog}.{gold_schema}.{registry_table_name}"

# library_type -> fully qualified silver (draft) table
silver_table_map = {}
# library_type -> fully qualified gold library table
library_table_map = {}

for lib_config in library_types_to_process:
    lib_type = lib_config.get('library_type')

    # Silver table is always recorded, even when library_type or the table
    # name is not configured.
    # NOTE(review): the fallback silver table is the transfer-variables draft
    # table for EVERY library type — confirm this default is intended.
    silver_table_map[lib_type] = _qualified(
        lib_config.get('silver_schema', silver_schema),
        lib_config.get('silver_table', 'md_dta_transfer_variables_draft'),
    )

    # Gold table is recorded only when both library_type and name are set.
    gold_name = lib_config.get('name')
    gold_sch = lib_config.get('schema', gold_schema)
    if lib_type and gold_name:
        library_table_map[lib_type] = _qualified(gold_sch, gold_name)

print(f"\nDTA Table: {dta_table}")
print(f"Registry Table: {registry_table}")
for heading, table_map in (
    ("\nSilver Tables:", silver_table_map),
    ("\nGold Library Tables:", library_table_map),
):
    print(heading)
    for lib_type, table in table_map.items():
        print(f"  {lib_type}: {table}")


In [None]:
# Cell 5: Load DTA Metadata
# `now` and `base_template_version` are shared by the later cells
# (version tags, gold effective_start_ts, registry timestamps).
now = datetime.now()
base_template_version = "1.0"

# One metadata dict per requested DTA, keyed by dta_id.
df_dta = spark.table(dta_table).filter(F.col("dta_id").isin(dta_ids))
dta_metadata = {row["dta_id"]: row.asDict() for row in df_dta.collect()}
print(f"\nLoaded metadata for {len(dta_metadata)} DTAs")

# Get draft version tags from save_draft; fall back to the DTA metadata when
# that task value cannot be read or parsed. Catch Exception (not a bare
# except) so KeyboardInterrupt/SystemExit still propagate, and report why the
# fallback was taken instead of swallowing the error silently.
try:
    draft_versions = json.loads(dbutils.jobs.taskValues.get(taskKey="save_draft", key="draft_versions"))
except Exception as e:
    print(f"  ⚠ Could not read draft_versions from save_draft ({e}); using current_draft_version from DTA metadata")
    draft_versions = {dta_id: dta_metadata.get(dta_id, {}).get("current_draft_version") for dta_id in dta_ids}


In [None]:
# Cell 6: Create DTA Major Versions - Update Silver and Prepare Major Versions
# For every DTA and every configured silver table: count the DRAFT rows, flip
# them (and the DRAFT rows of any related tables) to APPROVED, and remember
# the draft count so the gold copy step knows what to promote.
print(f"\n{'='*80}")
print("Creating DTA Major versions (ALL library types)...")
print(f"{'='*80}")

# dta_id -> major version tag
major_versions = {}
# dta_id -> {library_type: {"draft_count": int, "gold_count": int}}
major_versions_by_dta = {}

for dta_id in dta_ids:
    meta = dta_metadata.get(dta_id, {})
    dta_number = meta.get("dta_number", "DTA000")

    # Tags are derived from the base template version and the DTA number;
    # the draft tag is only used for the progress output below.
    major_version = f"{base_template_version}-{dta_number}-v1.0"
    draft_version = draft_versions.get(dta_id, f"{base_template_version}-{dta_number}-draft1")

    counts_by_type = {}
    major_versions[dta_id] = major_version
    major_versions_by_dta[dta_id] = counts_by_type

    print(f"\nProcessing DTA: {dta_number}")
    print(f"  Draft: {draft_version} → Major: {major_version}")

    for library_type, silver_table in silver_table_map.items():
        print(f"\n  [{library_type}]")

        # A missing silver table counts as "nothing to promote".
        if not spark.catalog.tableExists(silver_table):
            print(f"    ⚠ Table does not exist, skipping")
            counts_by_type[library_type] = {"draft_count": 0, "gold_count": 0}
            continue

        # Count the DRAFT rows of this DTA in the silver table.
        is_dta_draft = (F.col("dta_id") == dta_id) & (F.col("version_status") == "DRAFT")
        draft_count = spark.table(silver_table).filter(is_dta_draft).count()
        print(f"    Silver draft records: {draft_count}")

        if not draft_count:
            print(f"    ⚠ No draft records found, skipping")
            counts_by_type[library_type] = {"draft_count": 0, "gold_count": 0}
            continue

        # Mark the silver rows APPROVED; the draft version tag is left as-is.
        spark.sql(f"""
            UPDATE {silver_table}
            SET version_status = 'APPROVED',
                is_current_draft = false
            WHERE dta_id = '{dta_id}' AND version_status = 'DRAFT'
        """)
        print(f"    ✓ Updated silver records to APPROVED")

        # Apply the same status flip to each configured related silver table.
        for rel_config in related_tables_by_type.get(library_type, []):
            rel_silver_name = rel_config.get('silver_table')
            if not rel_silver_name:
                continue

            rel_silver_table = f"{catalog}.{silver_schema}.{rel_silver_name}"
            if not spark.catalog.tableExists(rel_silver_table):
                print(f"    ⚠ Related table {rel_silver_name} does not exist, skipping update")
                continue

            spark.sql(f"""
                UPDATE {rel_silver_table}
                SET version_status = 'APPROVED',
                    is_current_draft = false
                WHERE dta_id = '{dta_id}' AND version_status = 'DRAFT'
            """)
            print(f"    ✓ Updated related table {rel_silver_name} to APPROVED")

        # gold_count stays 0 here; it is filled in by the gold copy step.
        counts_by_type[library_type] = {"draft_count": draft_count, "gold_count": 0}


In [None]:
# Cell 7: Copy Records to Gold Library (All Library Types)
# definition_hash is computed here, during promotion (silver has no hash column).
# Column lists come from gold_columns_by_type, which Cell 2 builds from config.
print(f"\n{'='*80}", "Copying DTA Major records to Gold library...", f"{'='*80}", sep="\n")

# Helper function to copy records from silver to gold
def copy_to_gold_table(df_source, gold_columns, hash_fields, major_version, library_table, include_hash=True):
    """
    Copy records from silver to a gold table with version metadata and an optional definition_hash.

    Besides its arguments, this helper reads the notebook-level names
    ``base_template_version`` (written to ``parent_version``) and ``now``
    (written to ``effective_start_ts``).

    Configured gold columns that are not present in the source are reported and
    left out. If none of the configured columns exist in the source, nothing is
    written: appending rows that carry only version metadata would pollute the
    gold table.

    Args:
        df_source: Source DataFrame (filtered to approved records)
        gold_columns: List of columns to copy from silver
        hash_fields: List of fields for definition_hash computation
        major_version: Version string for gold records
        library_table: Target gold table name
        include_hash: Whether to compute definition_hash (True for main tables, False for related)

    Returns:
        Number of records written (0 when no configured column is available
        in the source and the write is skipped)
    """
    # Split the configured columns into those the source has and those it lacks
    source_cols = set(df_source.columns)
    available_cols = [c for c in gold_columns if c in source_cols]
    missing_cols = [c for c in gold_columns if c not in source_cols]

    if missing_cols:
        print(f"      ⚠ Configured gold columns not found in source (not copied): {missing_cols}")
    if not available_cols:
        print(f"      ⚠ No configured gold columns available in source, nothing written to {library_table}")
        return 0

    # Select and add gold-specific columns
    df_gold = df_source.select(
        *[F.col(c) for c in available_cols]
    ).withColumn("version", F.lit(major_version)
    ).withColumn("is_major_version", F.lit(True)
    ).withColumn("is_dta_major", F.lit(True)
    ).withColumn("parent_version", F.lit(base_template_version)
    ).withColumn("effective_start_ts", F.lit(now)
    ).withColumn("effective_end_ts", F.lit(None).cast("timestamp")
    ).withColumn("is_current", F.lit(True))

    # Compute definition_hash for main library tables (related tables don't have hash)
    if include_hash and hash_fields:
        # Only fields that made it into the gold DataFrame can take part in the hash
        hash_fields_available = [field for field in hash_fields if field in df_gold.columns]
        if hash_fields_available:
            df_gold = compute_definition_hash_spark(df_gold, hash_fields_available, "definition_hash")
            print(f"      → Computed definition_hash using fields: {hash_fields_available}")
        else:
            df_gold = df_gold.withColumn("definition_hash", F.lit(None).cast("string"))
            print(f"      ⚠ No hash fields available, definition_hash set to NULL")
    elif include_hash:
        df_gold = df_gold.withColumn("definition_hash", F.lit(None).cast("string"))
        print(f"      ⚠ No hash fields configured, definition_hash set to NULL")

    # Append to gold library with schema evolution
    df_gold.write.format("delta").mode("append").option("mergeSchema", "true").saveAsTable(library_table)
    # NOTE: count() is a separate Spark action run after the write, so the
    # returned number comes from re-evaluating df_gold, not from the write itself.
    return df_gold.count()

# Promote the APPROVED silver rows of every DTA into the gold library tables,
# main table first, then any related tables configured for the library type.
for dta_id in dta_ids:
    meta = dta_metadata.get(dta_id, {})
    major_version = major_versions[dta_id]
    lib_type_data = major_versions_by_dta.get(dta_id, {})
    
    print(f"\n  DTA: {meta.get('dta_number')}")
    
    # Copy to each library type
    for library_type, silver_table in silver_table_map.items():
        # Get library table for this type
        library_table = library_table_map.get(library_type)
        if not library_table:
            print(f"    [{library_type}] No gold table configured, skipping")
            continue
        
        # Only library types that had draft rows in the approval step are copied
        type_info = lib_type_data.get(library_type, {})
        if type_info.get("draft_count", 0) == 0:
            print(f"    [{library_type}] No records to copy, skipping")
            continue
        
        print(f"    [{library_type}]")
        
        # Check if silver table exists
        if not spark.catalog.tableExists(silver_table):
            print(f"      ⚠ Silver table does not exist, skipping")
            continue
        
        # Read approved records from Silver
        df_approved = spark.table(silver_table).filter(
            (F.col("dta_id") == dta_id) & (F.col("version_status") == "APPROVED")
        )
        
        # Get columns for this library type from config
        gold_columns = gold_columns_by_type.get(library_type, [])
        if not gold_columns:
            print(f"      ⚠ No gold_columns configured for {library_type}, skipping")
            continue
        
        # Get hash fields for this library type (type-specific + common fields)
        type_hash_fields = definition_hash_fields_by_type.get(library_type, [])
        all_hash_fields = type_hash_fields + definition_hash_common_fields
        
        # Copy main table using helper function
        gold_count = copy_to_gold_table(
            df_source=df_approved,
            gold_columns=gold_columns,
            hash_fields=all_hash_fields,
            major_version=major_version,
            library_table=library_table,
            include_hash=True
        )
        
        # Record the promoted row count; the registry step skips types with 0
        major_versions_by_dta[dta_id][library_type]["gold_count"] = gold_count
        print(f"      ✓ Added {gold_count} records to {library_table}")
        
        # Process related tables if configured (e.g., OA attributes, options, other)
        related_tables = related_tables_by_type.get(library_type, [])
        for rel_config in related_tables:
            rel_silver_name = rel_config.get('silver_table')
            rel_gold_name = rel_config.get('gold_table')
            rel_gold_columns = rel_config.get('gold_columns', [])
            
            if not rel_silver_name or not rel_gold_name:
                continue
            
            # Build full table names
            rel_silver_table = f"{catalog}.{silver_schema}.{rel_silver_name}"
            rel_gold_table = f"{catalog}.{gold_schema}.{rel_gold_name}"
            
            print(f"      [{rel_silver_name} → {rel_gold_name}]")
            
            # Same guard as the main table: without configured columns the copy
            # would append rows holding nothing but version metadata.
            if not rel_gold_columns:
                print(f"        ⚠ No gold_columns configured for {rel_silver_name}, skipping")
                continue
            
            # Check if silver table exists
            if not spark.catalog.tableExists(rel_silver_table):
                print(f"        ⚠ Silver table does not exist, skipping")
                continue
            
            # Read approved records from silver related table
            df_rel_approved = spark.table(rel_silver_table).filter(
                (F.col("dta_id") == dta_id) & (F.col("version_status") == "APPROVED")
            )
            
            rel_count = df_rel_approved.count()
            if rel_count == 0:
                print(f"        ⚠ No approved records found, skipping")
                continue
            
            # Copy related table (no definition_hash for related tables)
            rel_gold_count = copy_to_gold_table(
                df_source=df_rel_approved,
                gold_columns=rel_gold_columns,
                hash_fields=[],  # Related tables don't have definition_hash
                major_version=major_version,
                library_table=rel_gold_table,
                include_hash=False
            )
            print(f"        ✓ Added {rel_gold_count} records to {rel_gold_table}")


In [None]:
# Cell 8: Register Versions and Update DTA Entities
# Per DTA: archive the draft registry rows, queue one DTA_APPROVED registry row
# per library type that received gold records, update the DTA entity, and log
# the activity. The queued registry rows are appended in one write at the end.
print(f"\n{'='*80}")
print("Registering versions and updating DTA entities...")
print(f"{'='*80}")

registry_records = []

for dta_id, major_version in major_versions.items():
    meta = dta_metadata.get(dta_id, {})
    lib_type_data = major_versions_by_dta.get(dta_id, {})
    draft_version = draft_versions.get(dta_id)

    print(f"\n  DTA: {meta.get('dta_number')}")

    # Close out the draft registry entries (all library types) when a draft tag is known.
    if draft_version:
        spark.sql(f"""
            UPDATE {registry_table}
            SET status = 'ARCHIVED', 
                effective_end_ts = current_timestamp(),
                last_updated_ts = current_timestamp()
            WHERE version = '{draft_version}' AND dta_id = '{dta_id}'
        """)
        print(f"    ✓ Marked draft versions as ARCHIVED")

    # One DTA_APPROVED registry row per library type that actually got gold records.
    for library_type, type_info in lib_type_data.items():
        gold_count = type_info.get("gold_count", 0)

        if gold_count != 0:
            registry_records.append(dict(
                version=major_version,
                library_type=library_type,
                version_type="DTA_APPROVED",
                dta_id=dta_id,
                parent_version=base_template_version,
                record_count=gold_count,
                status="ACTIVE",
                data_provider_name=meta.get("data_provider_name"),
                data_stream_type=meta.get("data_stream_type"),
                included_dta_ids=None,  # Not applicable for DTA_APPROVED
                created_by_principal=created_by_principal,
                created_ts=now,
                last_updated_by_principal=created_by_principal,
                last_updated_ts=now,
                databricks_job_id=databricks_job_id,
                databricks_job_name=databricks_job_name,
                databricks_run_id=databricks_run_id,
            ))
            print(f"    [{library_type}] Registered: {major_version} ({gold_count} records)")
        else:
            print(f"    [{library_type}] Skipped (0 records)")

    # DTA entity is updated once per DTA; status aligns 1:1 with md_version_registry.
    spark.sql(f"""
        UPDATE {dta_table}
        SET version = '{major_version}',
            status = 'ACTIVE',
            workflow_state = 'APPROVED',
            last_updated_ts = current_timestamp()
        WHERE dta_id = '{dta_id}'
    """)
    print(f"    ✓ Updated DTA entity")

    # Activity log: DTA Major created (parent is the draft it was promoted from).
    log_dta_version_event(
        spark=spark,
        catalog=catalog,
        dta_id=dta_id,
        activity_type="DTA_APPROVED_CREATED",
        version=major_version,
        performed_by=created_by_principal,
        parent_version=draft_version
    )

    # Historical imports additionally get a system-generated approval event.
    if source == "HISTORICAL":
        log_dta_workflow_event(
            spark=spark,
            catalog=catalog,
            dta_id=dta_id,
            activity_type="APPROVED",
            performed_by=created_by_principal,
            workflow_iteration=1,
            approver_role="SYSTEM",
            approver_name="Historical Import",
            comment="Auto-approved from historical DTA import"
        )

    print(f"    ✓ Activity logged")

# Append all queued registry rows in a single write (skipped when nothing was promoted).
if registry_records:
    (
        spark.createDataFrame(registry_records, schema=get_version_registry_schema())
        .write.format("delta")
        .mode("append")
        .saveAsTable(registry_table)
    )
    print(f"\n✓ Registered {len(registry_records)} DTA_APPROVED version(s) in registry")


In [None]:
# Cell 9: Set Task Values and Summary
# Publish the results for downstream tasks: three JSON-encoded payloads,
# then the plain action marker.
for output_key, payload in (
    ("major_versions", major_versions),
    ("major_versions_by_dta", major_versions_by_dta),
    ("dta_ids", dta_ids),
):
    dbutils.jobs.taskValues.set(key=output_key, value=json.dumps(payload))
dbutils.jobs.taskValues.set(key="action", value="APPROVE_DTA")

print(f"\n{'='*80}", "SUMMARY", f"{'='*80}", sep="\n")
print(f"Source: {source}")
print(f"Library Types: {list(library_table_map.keys())}")
print(f"DTAs approved: {len(major_versions)}")

# Per DTA: its major version tag, then the gold record count per library type.
for dta_id, version in major_versions.items():
    meta = dta_metadata.get(dta_id, {})
    print(f"\n  {meta.get('dta_number')}: {version}")
    lib_type_data = major_versions_by_dta.get(dta_id, {})
    for lib_type, type_info in lib_type_data.items():
        print(f"    - {lib_type}: {type_info.get('gold_count', 0)} records")

print(f"\n✅ DTA Major versions created successfully!")
