In [0]:
%sql
-- Make `nokia-assginment-catalog` the current Unity Catalog for this session so that
-- unqualified schema names used later (e.g. patent_data) resolve inside it.
USE CATALOG `nokia-assginment-catalog`;
-- Manual reset, intentionally left disabled: drop schema patent_data cascade;

In [0]:
# Widget that controls whether the patent_data schema is dropped before processing.
# If the widget cannot be created or read (e.g. job mode, or dbutils is unavailable)
# fall back to the safe default of NOT dropping anything.
try:
    dbutils.widgets.dropdown("drop_patent_data_schema", "false", ["true", "false"], "Drop schema patent_data cascade")
    drop_patent_data_schema = dbutils.widgets.get("drop_patent_data_schema") == "true"
except Exception:
    # `except Exception` (not a bare `except:`) so KeyboardInterrupt / SystemExit
    # still propagate and a cancelled run is not silently turned into "don't drop".
    drop_patent_data_schema = False

print(f"Drop patent_data schema setting: {drop_patent_data_schema}")

# Execute SQL to drop schema if requested. Failures are reported but do not stop
# the notebook (best-effort cleanup).
if drop_patent_data_schema:
    try:
        print("Dropping schema patent_data cascade...")
        spark.sql("DROP SCHEMA IF EXISTS patent_data CASCADE")
        print("Schema patent_data successfully dropped")
    except Exception as e:
        print(f"Error dropping schema: {str(e)}")

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, lit, current_timestamp
from pyspark.sql.types import StringType, TimestampType, StructType, StructField
import pandas as pd
import os
from datetime import datetime

def initialize_spark():
    """Return the Spark session for the silver layer job, creating it if needed.

    The builder is named "Patent Silver Layer Processor" and is given the Delta
    Lake SQL extension and catalog plus driver/executor sizing options before
    `getOrCreate()` is called.
    """
    session_options = (
        ("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"),
        ("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"),
        ("spark.driver.memory", "8g"),
        ("spark.executor.memory", "8g"),
        ("spark.executor.cores", "4"),
    )

    builder = SparkSession.builder.appName("Patent Silver Layer Processor")
    # Apply the options in the order listed above.
    for option_key, option_value in session_options:
        builder = builder.config(option_key, option_value)
    return builder.getOrCreate()

def get_nested_fields(schema, prefix="", max_depth=None, current_depth=0):
    """Flatten a schema into dotted leaf paths (e.g. "parent.child").

    Args:
        schema: Object exposing `.fields`; each field has `.name` and `.dataType`.
        prefix: Dotted path of the enclosing field; empty at the top level.
        max_depth: Maximum nesting level to descend into; None means unlimited.
        current_depth: Nesting level of `schema` (0 for the top-level call).

    Returns:
        List of dotted field paths in schema order. A field whose data type has
        a `fields` attribute is expanded recursively while the depth limit
        allows it; otherwise the field itself is listed.
    """
    # The depth check is the same for every field at this level.
    may_descend = max_depth is None or current_depth < max_depth

    paths = []
    for member in schema.fields:
        if prefix:
            qualified = f"{prefix}.{member.name}"
        else:
            qualified = member.name

        # Anything with a `fields` attribute is treated as a nested struct.
        is_struct = hasattr(member.dataType, "fields")
        if not (is_struct and may_descend):
            paths.append(qualified)
            continue

        paths += get_nested_fields(member.dataType, qualified, max_depth, current_depth + 1)
    return paths

def analyze_field_occurrence(df, max_depth=None):
    """Report, per (possibly nested) field, the percentage of rows where it is populated.

    Args:
        df: Spark DataFrame to profile.
        max_depth: Forwarded to get_nested_fields to limit how deep structs are
            flattened; None means no limit.

    Returns:
        Tuple `(hierarchical_df, col_counts)`: a Spark DataFrame and the pandas
        DataFrame it was built from. Both have the columns "Tag" (field path
        with " → " between levels) and "Occurrence (%)", ordered by occurrence
        descending.
    """
    row_total = df.count()
    field_paths = get_nested_fields(df.schema, max_depth=max_depth)

    # One aggregate expression per field path, aliased with the dotted path.
    pct_exprs = []
    for path in field_paths:
        pct_exprs.append((count(col(path)) / row_total * 100).alias(path))

    # The result is a single wide row; pivot it into one row per field.
    wide_pdf = df.select(pct_exprs).toPandas()
    occurrence_pdf = wide_pdf.transpose().reset_index()
    occurrence_pdf.columns = ["Tag", "Occurrence (%)"]

    # Display-friendly paths, then most frequently populated fields first.
    occurrence_pdf["Tag"] = [tag.replace(".", " → ") for tag in occurrence_pdf["Tag"]]
    occurrence_pdf = occurrence_pdf.sort_values("Occurrence (%)", ascending=False)

    hierarchical_df = df.sparkSession.createDataFrame(occurrence_pdf)
    return hierarchical_df, occurrence_pdf

def transform_and_rename_fields(df, occurrence_threshold=80):
    """Transform the bronze dataframe into silver with selected fields and column renaming.

    Fields (flattened with get_nested_fields) that are non-null in at least
    `occurrence_threshold` percent of rows are selected. If exactly as many
    fields are selected as there are entries in the rename plan below, they are
    selected and renamed positionally. Otherwise the whole DataFrame is kept and
    only top-level columns whose name appears in the plan are renamed. In both
    cases a `silver_ingestion_date` column is added.

    Args:
        df: Bronze Spark DataFrame.
        occurrence_threshold: Minimum percentage (inclusive) of non-null rows
            for a field to be selected.

    Returns:
        Tuple `(silver_df, selected_columns)` where `selected_columns` is the
        list of dotted field paths that met the threshold (empty when `df` has
        no rows or no fields).
    """
    # Ordered (source leaf name, silver column name) pairs. This has to be a
    # sequence, not a dict: leaf names such as 'date' or 'country' repeat, and
    # a dict literal with repeated keys silently keeps only the last value.
    rename_plan = [
        # Abstract, claims, and description
        ('_VALUE', 'abstract_text'),
        ('claim', 'claims'),
        ('_VALUE', 'description_text'),
        ('heading', 'description_sections'),

        # Application reference
        ('country', 'application_country'),
        ('date', 'application_date'),
        ('doc-number', 'application_number'),

        # CPC classifications
        ('classification-cpc', 'cpc_secondary'),
        ('date', 'cpc_action_date'),
        ('class', 'cpc_class'),
        ('classification-data-source', 'cpc_data_source'),
        ('classification-status', 'cpc_status'),
        ('classification-value', 'cpc_value'),
        ('date', 'cpc_version_date'),
        ('country', 'cpc_office_country'),
        ('main-group', 'cpc_main_group'),
        ('scheme-origination-code', 'cpc_scheme_origin'),
        ('section', 'cpc_section'),
        ('subclass', 'cpc_subclass'),
        ('subgroup', 'cpc_subgroup'),
        ('symbol-position', 'cpc_symbol_position'),

        # IPCR classification
        ('classification-ipcr', 'ipc_classification'),

        # Publication reference
        ('invention-title', 'invention_title'),
        ('country', 'publication_country'),
        ('date', 'publication_date'),
        ('doc-number', 'publication_number'),
        ('kind', 'publication_kind'),

        # Other fields
        ('us-application-series-code', 'application_series_code'),
        ('inventor', 'inventors'),
        ('us-applicant', 'applicants'),
        ('source_file', 'source_file'),
        ('ingestion_date', 'ingestion_date'),
    ]

    # Positional target names, derived from the plan so the two cannot drift apart.
    new_column_names = [silver_name for _, silver_name in rename_plan]

    # Name-based lookup used only by the fallback path. For repeated leaf names
    # the last pair wins (e.g. 'date' -> 'publication_date').
    column_mapping = dict(rename_plan)

    # Calculate total number of records
    total_count = df.count()

    # Get hierarchical field paths
    nested_fields = get_nested_fields(df.schema)

    if total_count > 0 and nested_fields:
        # Compute occurrence percentage of each column
        col_counts = (
            df.select([(count(col(c)) / total_count * 100).alias(c) for c in nested_fields])
            .toPandas()
            .transpose()
            .reset_index()
        )
        col_counts.columns = ["Tag", "Occurrence (%)"]

        # Select columns with at least threshold% occurrence
        selected_columns = col_counts[col_counts["Occurrence (%)"] >= occurrence_threshold]["Tag"].tolist()
    else:
        # No rows would mean dividing by zero, and no fields would leave nothing
        # to label; nothing can meet the threshold, so use the fallback below.
        selected_columns = []

    # Detect all column names in the DataFrame
    all_columns = df.columns
    print(f"Available columns: {all_columns}")

    try:
        print(f"Number of selected columns: {len(selected_columns)}")
        print(f"Number of new column names: {len(new_column_names)}")

        # Make sure number of columns match
        if len(selected_columns) != len(new_column_names):
            print("WARNING: Column count mismatch - using simpler approach")
            # Safer fallback that doesn't rely on exact column matching:
            # keep every column and rename only matching top-level names.
            silver_df = df
            for old_col, new_col in column_mapping.items():
                if old_col in df.columns:
                    silver_df = silver_df.withColumnRenamed(old_col, new_col)
        else:
            # The rename below is purely positional. Equal counts do not prove
            # the selected fields are the expected ones, so report any field
            # whose leaf name differs from the plan instead of staying silent.
            unexpected = [
                path
                for path, (expected_leaf, _) in zip(selected_columns, rename_plan)
                if path.split(".")[-1] != expected_leaf
            ]
            if unexpected:
                print(
                    "WARNING: positional rename may mislabel columns; "
                    f"unexpected source fields: {unexpected}"
                )

            selected_df = df.select(*selected_columns)
            # Create the renamed DataFrame
            silver_df = selected_df.toDF(*new_column_names)

        # Add ingestion metadata
        silver_df = silver_df.withColumn("silver_ingestion_date", current_timestamp())
    except Exception as e:
        print(f"Error in transform operation: {str(e)}")
        # Fallback to simpler approach if column mapping fails
        silver_df = df.withColumn("silver_ingestion_date", current_timestamp())

    return silver_df, selected_columns

def check_checkpoint_exists(spark, file_name, checkpoint_location):
    """Check if a checkpoint exists for the given file name.

    Args:
        spark: Unused; kept so existing callers keep working.
        file_name: Name of the processed item; appended to `checkpoint_location`.
        checkpoint_location: Base path under which checkpoints are stored.

    Returns:
        True if `dbutils.fs.ls` can list `<checkpoint_location>/<file_name>`,
        False if that call raises for any reason.
    """
    checkpoint_path = f"{checkpoint_location}/{file_name}"

    try:
        dbutils.fs.ls(checkpoint_path)
    except Exception:
        # Listing failed, so treat the checkpoint as absent. `Exception` rather
        # than a bare `except:` lets KeyboardInterrupt / SystemExit propagate
        # instead of being reported as "no checkpoint".
        return False
    return True

def create_checkpoint_file(spark, checkpoint_path, dir_name):
    """Write a one-row Delta marker recording that `dir_name` was processed.

    Args:
        spark: Spark session used to build the marker DataFrame.
        checkpoint_path: Destination path; written in Delta format with overwrite mode.
        dir_name: Value stored in the `file_name` column.

    Returns:
        True when the marker was written; False if any step raised (the error
        and its traceback are printed, not re-raised).
    """
    try:
        # Explicit schema so Spark does not have to infer the column types.
        marker_schema = StructType([
            StructField("file_name", StringType(), False),
            StructField("processed_timestamp", TimestampType(), False),
        ])
        marker_row = (dir_name, datetime.now())
        marker_df = spark.createDataFrame([marker_row], schema=marker_schema)

        writer = marker_df.write.format("delta").mode("overwrite")
        writer.save(checkpoint_path)
        print(f"Created checkpoint at {checkpoint_path}")
    except Exception as err:
        import traceback
        print(f"Warning: Could not create checkpoint file: {str(err)}")
        print(f"Checkpoint error details: {traceback.format_exc()}")
        return False
    return True

def silver_layer_processing():
    """Process bronze Parquet files into silver Delta tables using incremental loading.

    Every entry in the bronze directory whose name does not end in `.parquet`
    is treated as one batch: it is read as Parquet, passed through
    transform_and_rename_fields, written as a Delta table under the silver
    output path, and then marked with a checkpoint. Batches that already have
    a checkpoint are skipped unless the `force_reprocess` widget is "true", in
    which case the checkpoint and output directories are cleared first.

    A failure in one batch is printed and the loop continues with the next one.

    Returns:
        Tuple `(success, stats_df)`:
        - `(False, None)` if the bronze directory cannot be listed, contains no
          batch directories, or an unexpected error occurs outside the
          per-batch loop.
        - `(True, stats_df)` otherwise, where `stats_df` is the union of the
          per-batch field-occurrence statistics, or None if no batch was
          processed in this run.
    """
    import traceback  # stdlib; imported once here instead of inside each handler

    print("Starting silver layer processing")

    # Try to create a widget to control reprocessing (only works in interactive mode)
    try:
        dbutils.widgets.dropdown("force_reprocess", "false", ["true", "false"], "Force Reprocessing")
        force_reprocess = dbutils.widgets.get("force_reprocess") == "true"
    except Exception:
        # Default to incremental processing in job mode
        force_reprocess = False

    print(f"Force reprocessing mode: {force_reprocess}")

    spark = initialize_spark()

    # Unity Catalog paths
    bronze_path = "/Volumes/nokia-assginment-catalog/bronze/raw_data"
    silver_path = "/Volumes/nokia-assginment-catalog/silver"
    checkpoint_location = "/Volumes/nokia-assginment-catalog/checkpoints/checkpoints_data/silver_autoloader/"

    # Use a clear path for Delta output
    delta_output_path = f"{silver_path}/patent_data"

    try:
        # Check bronze directory to confirm files exist
        print(f"Checking bronze directory: {bronze_path}")
        try:
            bronze_items = dbutils.fs.ls(bronze_path)
            patent_dirs = [d for d in bronze_items if not d.name.endswith('.parquet')]
            print(f"Found {len(patent_dirs)} patent directories in bronze layer")
            for dir_item in patent_dirs[:5]:  # List first 5 directories
                print(f"  {dir_item.name}")

            if len(patent_dirs) == 0:
                print("ERROR: No patent directories found in bronze layer!")
                return False, None
        except Exception as e:
            print(f"Error listing bronze files: {str(e)}")
            return False, None

        # Handle checkpoint directory based on force_reprocess flag
        if force_reprocess:
            print(f"Force reprocessing requested, clearing checkpoint location: {checkpoint_location}")
            try:
                dbutils.fs.rm(checkpoint_location, True)
                print("Checkpoint directory cleared")
            except Exception:
                print("No checkpoint directory to clear")

            dbutils.fs.mkdirs(checkpoint_location)
            print("Created new checkpoint directory")
        else:
            print("Using existing checkpoint directory for incremental processing")
            # Just ensure the directory exists
            try:
                dbutils.fs.ls(checkpoint_location)
                print("Checkpoint directory exists")
            except Exception:
                dbutils.fs.mkdirs(checkpoint_location)
                print("Created checkpoint directory")

        # Handle output directory based on force_reprocess flag
        if force_reprocess:
            print(f"Force reprocessing requested, clearing output directory: {delta_output_path}")
            try:
                dbutils.fs.rm(delta_output_path, True)
                print("Removed existing Delta directory")
            except Exception:
                print("No existing Delta directory to remove")

            dbutils.fs.mkdirs(delta_output_path)
            print("Created Delta output directory")
        else:
            print("Using existing output directory for incremental files")
            # Just ensure the directory exists
            try:
                dbutils.fs.ls(delta_output_path)
                print("Delta output directory exists")
            except Exception:
                dbutils.fs.mkdirs(delta_output_path)
                print("Created Delta output directory")

        # Process each patent directory incrementally
        total_processed = 0
        stats_df = None

        for dir_index, dir_item in enumerate(patent_dirs):
            dir_path = dir_item.path
            dir_name = dir_item.name

            # Use checkpoint location for tracking progress
            file_checkpoint_path = f"{checkpoint_location}/{dir_name}"

            # Check if directory was already processed using checkpoint (instead of output directory)
            if not force_reprocess and check_checkpoint_exists(spark, dir_name, checkpoint_location):
                print(f"Skipping already processed directory (checkpoint found): {dir_name}")
                continue

            print(f"Processing directory {dir_index+1}/{len(patent_dirs)}: {dir_name}")

            try:
                # Read the Parquet files from bronze layer
                print(f"Reading Parquet files from {dir_path}")
                bronze_df = spark.read.parquet(dir_path)

                print(f"Loaded {bronze_df.count()} records from bronze layer")

                # Transform and rename fields
                print("Transforming and renaming fields")
                silver_df, selected_columns = transform_and_rename_fields(bronze_df)

                # Write to Delta table
                delta_table_path = f"{delta_output_path}/{dir_name}"
                print(f"Writing to Delta table at {delta_table_path}")

                (silver_df.write
                    .format("delta")
                    .mode("overwrite")
                    .option("overwriteSchema", "true")
                    .save(delta_table_path))

                # Mark the batch as done only after the Delta write succeeded.
                # A checkpoint failure is reported but does not fail the batch.
                try:
                    # Ensure checkpoint directory exists
                    dbutils.fs.mkdirs(os.path.dirname(file_checkpoint_path))

                    # Create checkpoint with proper schema
                    create_checkpoint_file(spark, file_checkpoint_path, dir_name)
                except Exception as checkpoint_error:
                    print(f"Warning: Could not create checkpoint file: {str(checkpoint_error)}")

                # Get statistics on this batch
                batch_stats_df, _ = analyze_field_occurrence(silver_df, max_depth=None)  # No depth limit

                # Store or merge statistics
                if stats_df is None:
                    stats_df = batch_stats_df
                else:
                    # Plain union: a field appears once per processed batch.
                    stats_df = stats_df.union(batch_stats_df)

                total_processed += 1
                print(f"Successfully processed {dir_name} to Delta format")

            except Exception as e:
                # Keep going with the remaining directories.
                print(f"Error processing {dir_name}: {str(e)}")
                print(f"Exception traceback: {traceback.format_exc()}")

        print(f"Completed silver layer processing. Total directories processed: {total_processed}")

        # Return success and the stats dataframe
        return True, stats_df

    except Exception as e:
        print(f"Error in silver layer processing: {str(e)}")
        print(f"Exception traceback: {traceback.format_exc()}")
        return False, None

# Run the silver layer job. stats_df is None when no directory was processed.
success, stats_df = silver_layer_processing()

# Show statistics if available
if success and stats_df is not None:
    print("\nField Occurrence Statistics:")
    stats_df.show(truncate=False)

    # Convert to pandas for more detailed analysis if needed
    stats_pd = stats_df.toPandas()

    # stats_df is a union of per-directory statistics, so the same field shows
    # up once per processed directory. Count distinct tags so the number
    # really is the number of unique fields.
    print(f"\nTotal unique fields analyzed: {stats_pd['Tag'].nunique()}")

    # NOTE: the two counts below are per row, i.e. a field processed in several
    # directories is counted once for each directory where the condition holds.
    print(f"Fields with >80% occurrence: {len(stats_pd[stats_pd['Occurrence (%)'] > 80])}")
    print(f"Fields with <20% occurrence: {len(stats_pd[stats_pd['Occurrence (%)'] < 20])}")