In [0]:
%pip install beautifulsoup4 -U --quiet
%pip install lxml -U --quiet



%restart_python

In [0]:
%fs ls '/Volumes/nokia-assginment-catalog/bronze/cleaned_raw_xml_data'

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_timestamp, lit
import re
import os
from io import BytesIO
import json

# Formatting / layout tags to strip from the patent XML. Only the markup
# itself is removed; any text between an opening and closing tag is kept.
html_tags = {
    "b", "i", "u", "o",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "p", "span", "div", "br", "strong", "em", "sub", "sup",
    "drawings", "figure", "figref", "img",
    "ol", "ul", "li",
    "table", "tr", "td", "th", "tbody", "thead", "tfoot",
    "us-math", "us-chemistry",
    "us-sequence-list-doc", "sequence-list", "us-claim-statement",
    "description-of-drawings", "summary-of-invention",
    "brief-description-of-drawings",
}


def _build_tag_pattern(tags):
    """Return one compiled regex matching every markup form of *tags*.

    The pattern matches, for each exact tag name:
      * opening tags, with or without attributes, including self-closing
        forms such as ``<br/>`` and ``<img src="x"/>``
      * closing tags, tolerating whitespace before ``>``
      * processing instructions such as ``<?tag attr="..."?>``

    Returns ``None`` when *tags* is empty, because an empty alternation
    would otherwise match unrelated markup like ``< foo>``.
    """
    if not tags:
        return None
    # Sort for a deterministic pattern (set iteration order is arbitrary).
    # Exact matching is guaranteed by what must follow the name (whitespace,
    # "/", ">" or "?"), so "<body>" is never mistaken for "<b>".
    names = "|".join(
        re.escape(name)
        for name in sorted(tags, key=lambda name: (-len(name), name))
    )
    return re.compile(
        rf'<(?:{names})(?:\s+[^>]*)?/?>'       # <tag>, <tag a="1">, <tag/>
        rf'|</(?:{names})\s*>'                 # </tag>
        rf'|<\?(?:{names})(?:\s+[^>]*)?\?>'    # <?tag ...?>
    )


def clean_html_tags_lxml(xml_content):
    """Strip the tags listed in ``html_tags`` from *xml_content*, keeping their text.

    Despite the name, this is a regex-based cleaner and does not use lxml.
    All tag forms are removed in a single pass over the string.

    Args:
        xml_content: XML text (``str``) of one patent application.

    Returns:
        The cleaned text. If cleaning raises for any reason (for example the
        argument is not a string), the error is printed and the *original*,
        unmodified argument is returned.
    """
    try:
        # Sizes are character counts of the string (the log label says bytes).
        print(f"XML content before cleaning: {len(xml_content)} bytes")

        # Built per call so later changes to html_tags are honoured; the re
        # module caches the compiled pattern, so this is cheap.
        pattern = _build_tag_pattern(html_tags)
        cleaned = xml_content if pattern is None else pattern.sub('', xml_content)

        print(f"XML content after cleaning: {len(cleaned)} bytes")

        # Sanity check: the record wrapper must survive the cleaning.
        if "<us-patent-application" not in cleaned or "</us-patent-application>" not in cleaned:
            print("WARNING: Cleaned XML may be missing required us-patent-application tags!")

        return cleaned

    except Exception as e:
        print(f"Error in XML cleaning: {str(e)}")
        # xml_content is never reassigned above, so this really is the
        # untouched input rather than a half-cleaned string.
        return xml_content

def process_xml_file(input_path, output_path):
    """Clean every patent application in an XML file and write the result.

    The whole input file is read into memory. Each
    ``<us-patent-application ...>...</us-patent-application>`` record is
    passed through ``clean_html_tags_lxml`` and written to *output_path*,
    one record after another. The leading XML declaration (if any) is
    preserved, and the records are wrapped in ``<us-patent-applications>``
    when the input contains that root tag.

    Args:
        input_path: Path of the UTF-8 XML file to read.
        output_path: Path of the cleaned file to write; its parent
            directory is created when missing.
    """
    # os.makedirs('') raises, so only create a directory when there is one.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(input_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Preserve the XML declaration when the file starts with one.
    xml_declaration_match = re.match(r'(<\?xml[^>]*\?>)', content)
    xml_declaration = xml_declaration_match.group(1) if xml_declaration_match else ""

    has_root = '<us-patent-applications>' in content

    # The lookahead keeps the plural root tag <us-patent-applications> from
    # being mistaken for the start of a record (it shares the same prefix),
    # which would otherwise copy the root tag into the first record.
    start_pattern = re.compile(r'<us-patent-application(?=[\s>])')
    end_tag = '</us-patent-application>'

    applications_processed = 0

    with open(output_path, 'w', encoding='utf-8') as outfile:
        if xml_declaration:
            outfile.write(xml_declaration + '\n')

        if has_root:
            outfile.write('<us-patent-applications>\n')

        start_match = start_pattern.search(content)
        while start_match:
            pos = start_match.start()

            # Test the raw find() result: adding len(end_tag) first would
            # hide the -1 "not found" value.
            end_index = content.find(end_tag, pos)
            if end_index == -1:
                print(f"WARNING: No closing tag for application starting at position {pos}; stopping")
                break
            end_pos = end_index + len(end_tag)

            patent_xml = content[pos:end_pos]
            patent_size = len(patent_xml)
            print(f"Processing application {applications_processed+1} at position {pos}-{end_pos}")
            print(f"Patent XML size: {patent_size} bytes")

            cleaned_patent = clean_html_tags_lxml(patent_xml)
            cleaned_size = len(cleaned_patent)
            print(f"Cleaned patent size: {cleaned_size} bytes")
            # patent_size is never 0 here: the slice holds at least both tags.
            print(f"Size reduction: {patent_size - cleaned_size} bytes ({(patent_size - cleaned_size) / patent_size * 100:.2f}%)")

            outfile.write(cleaned_patent + '\n')
            applications_processed += 1

            # Continue searching after the record just written.
            start_match = start_pattern.search(content, end_pos)

        if has_root:
            outfile.write('</us-patent-applications>')

        print(f"Processed {applications_processed} patent applications")

    # Measure only after the file is closed so buffered data has been flushed.
    if os.path.exists(output_path):
        outfile_size = os.path.getsize(output_path)
        print(f"Output file size: {outfile_size} bytes")

def initialize_spark():
    """Build (or fetch the already-running) Spark session for the XML job.

    Applies the application name and the settings listed below to the
    session builder, in order, then calls ``getOrCreate()``.
    """
    settings = {
        "spark.jars.packages": "com.databricks:spark-xml_2.12:0.15.0",
        "spark.driver.memory": "8g",
        "spark.executor.memory": "8g",
        "spark.executor.cores": "4",
        "spark.sql.legacy.allowUntypedScalaUDF": "true",
        "spark.executor.memoryOverhead": "2g",
        "spark.dynamicAllocation.enabled": "true",
        "spark.shuffle.service.enabled": "true",
    }

    builder = SparkSession.builder.appName("XML Patent Processor")
    for key, value in settings.items():
        builder = builder.config(key, value)

    return builder.getOrCreate()

def bronze_layer_processing():
    """Ingest raw patent XML files into the bronze layer.

    Steps, as implemented below:
      1. List the input volume and stop early when it holds no ``.xml`` files.
      2. Reset the Auto Loader checkpoint and the Parquet output directory so
         every run reprocesses all files.
      3. Discover files with Auto Loader (``binaryFile`` format) and, inside
         ``foreachBatch``, handle them one at a time on the driver: copy to
         local temp storage, clean with ``process_xml_file``, upload the
         cleaned XML, parse it with the Spark XML reader and write one
         Parquet directory per source file.
      4. Count the Parquet files that were produced.

    Relies on the Databricks-provided ``dbutils`` global.

    Returns:
        bool: ``True`` when the run completed and no file failed; ``False``
        when no input files were found, any file failed, or an unexpected
        error aborted the run.
    """
    import shutil
    import traceback

    print("Starting bronze layer processing")

    spark = initialize_spark()

    # Unity Catalog paths
    input_path = "/Volumes/nokia-assginment-catalog/assignment_data/xml_raw_data"
    bronze_path = "/Volumes/nokia-assginment-catalog/bronze"
    temp_processing_dir = "/tmp/xml_processing/"
    checkpoint_location = "/Volumes/nokia-assginment-catalog/checkpoints/checkpoints_data/xml_autoloader/"

    # Use a separate path for Parquet output to avoid Delta conflicts
    parquet_output_path = f"{bronze_path}/raw_data"
    cleaned_xml_dir = f"{bronze_path}/cleaned_raw_xml_data"

    # Names of files that could not be fully processed; drives the return value.
    failed_files = []

    # Start from an empty scratch directory (no shell involved).
    shutil.rmtree(temp_processing_dir, ignore_errors=True)
    print("Cleared temp processing directory")

    try:
        # Check input directory to confirm files exist
        print(f"Checking input directory: {input_path}")
        try:
            input_files = dbutils.fs.ls(input_path)
            xml_files = [f for f in input_files if f.name.endswith('.xml')]
            print(f"Found {len(xml_files)} XML files in input directory")
            for xml_file in xml_files[:5]:  # List first 5 files
                print(f"  {xml_file.name} ({xml_file.size} bytes)")

            if not xml_files:
                print("ERROR: No XML files found in input directory!")
                return False
        except Exception as e:
            # Best effort: a failed listing is reported and Auto Loader is
            # still given the chance to read the path.
            print(f"Error listing input files: {str(e)}")

        # Clear checkpoint to force reprocessing
        print(f"Clearing checkpoint location: {checkpoint_location}")
        try:
            dbutils.fs.rm(checkpoint_location, True)
            print("Checkpoint directory cleared")
        except Exception:
            print("No checkpoint directory to clear")

        dbutils.fs.mkdirs(checkpoint_location)
        print("Created new checkpoint directory")

        # Create necessary local scratch directories
        os.makedirs(os.path.join(temp_processing_dir, "input"), exist_ok=True)
        os.makedirs(os.path.join(temp_processing_dir, "cleaned"), exist_ok=True)

        # Create output directory for Parquet files
        print(f"Setting up Parquet output directory: {parquet_output_path}")
        try:
            dbutils.fs.rm(parquet_output_path, True)
            print("Removed existing Parquet directory")
        except Exception:
            print("No existing Parquet directory to remove")

        dbutils.fs.mkdirs(parquet_output_path)
        print("Created Parquet output directory")

        # Make sure the directory for cleaned XML exists
        try:
            dbutils.fs.ls(cleaned_xml_dir)
            print("Bronze cleaned directory exists")
        except Exception:
            print("Creating bronze cleaned directory")
            dbutils.fs.mkdirs(cleaned_xml_dir)

        # Set up Auto Loader; binaryFile is used only to discover file paths.
        print("Setting up Auto Loader")
        autoloader_df = (spark.readStream
            .format("cloudFiles")
            .option("cloudFiles.format", "binaryFile")
            .option("cloudFiles.schemaLocation", checkpoint_location)
            .option("pathGlobFilter", "*.xml")
            .load(input_path)
        )

        def process_batch(batch_df, batch_id):
            """Clean, upload and convert every file of one micro-batch."""
            # Collect once; the number of rows is the batch size, so no
            # separate count() action is needed.
            file_list = batch_df.select("path").collect()
            batch_count = len(file_list)
            print(f"Processing batch {batch_id} with {batch_count} files")

            if not file_list:
                print("No files to process in this batch")
                return

            for file_index, file_row in enumerate(file_list, start=1):
                file_path = file_row.path
                file_name = os.path.basename(file_path)
                # Strip only the real extension, not every ".xml" in the name.
                file_stem = os.path.splitext(file_name)[0]

                print(f"Processing file {file_index}/{batch_count}: {file_name}")

                # Local paths for processing
                local_input_path = os.path.join(temp_processing_dir, "input", file_name)
                local_output_path = os.path.join(temp_processing_dir, "cleaned", file_name)

                try:
                    # Download file from Unity Catalog volume to local temp storage
                    print(f"Downloading file from {file_path}")
                    dbutils.fs.cp(file_path, f"file:{local_input_path}")

                    # Clean the XML file
                    print(f"Processing {file_name}...")
                    process_xml_file(local_input_path, local_output_path)

                    # Verify the cleaned file has content
                    if os.path.exists(local_output_path) and os.path.getsize(local_output_path) > 0:
                        print(f"Cleaned file created successfully, size: {os.path.getsize(local_output_path)} bytes")
                    else:
                        print("Warning: Cleaned file is empty or doesn't exist")
                        failed_files.append(file_name)
                        continue

                    # Upload cleaned file to bronze layer
                    bronze_file_path = f"{cleaned_xml_dir}/{file_stem}_cleaned.xml"
                    print(f"Uploading cleaned file to {bronze_file_path}")
                    dbutils.fs.cp(f"file:{local_output_path}", bronze_file_path)

                    # Read the cleaned XML file using Spark
                    print("Reading with Spark XML reader")
                    try:
                        df = (
                            spark.read.format("xml")
                            .option("rowTag", "us-patent-application")
                            .option("charset", "UTF-8")
                            .option("ignoreSurroundingSpaces", "true")
                            .option("mode", "PERMISSIVE")
                            .option("excludeAttribute", "true")
                            .option("includeMetadata", "true")
                            .option("valueTag", "_VALUE")
                            .load(bronze_file_path)
                            .withColumn("source_file", lit(file_name))
                            .withColumn("ingestion_date", current_timestamp())
                        ).drop('_VALUE')

                        row_count = df.count()
                        print(f"Dataframe created with {row_count} rows")

                        if row_count > 0:
                            # One Parquet directory per source file
                            file_output_path = f"{parquet_output_path}/{file_stem}"
                            print(f"Writing to Parquet files at {file_output_path}")

                            (df.write
                               .format("parquet")
                               .mode("overwrite")
                               .save(file_output_path))

                            print(f"Added {row_count} records from {file_name} to Parquet output")
                        else:
                            print(f"WARNING: No records found in {file_name} after XML parsing")

                            # Try with different rowTag as a diagnostic
                            try:
                                alt_df = (spark.read.format("xml")
                                    .option("rowTag", "patent-application")
                                    .option("mode", "PERMISSIVE")
                                    .load(bronze_file_path))
                                alt_count = alt_df.count()
                                print(f"Alternative rowTag found {alt_count} records")
                            except Exception as diag_error:
                                print(f"Diagnostic error: {str(diag_error)}")

                    except Exception as spark_error:
                        print(f"Error processing with Spark: {str(spark_error)}")
                        failed_files.append(file_name)

                except Exception as e:
                    # One bad file must not abort the rest of the batch.
                    print(f"Error processing {file_name}: {str(e)}")
                    failed_files.append(file_name)

        # Use Structured Streaming to run Auto Loader in batch mode.
        # NOTE(review): newer Databricks/Spark docs recommend
        # trigger(availableNow=True) over trigger(once=True) - confirm against
        # the target runtime before switching.
        print("Starting Auto Loader stream")
        stream = (autoloader_df.writeStream
            .foreachBatch(process_batch)
            .option("checkpointLocation", checkpoint_location)
            .trigger(once=True)  # Run once and stop
            .start())

        print("Waiting for stream to complete")
        stream.awaitTermination()
        print("Stream completed")

        # Check if output was created
        try:
            files = dbutils.fs.ls(parquet_output_path)
            print(f"Found {len(files)} items in Parquet output directory")

            total_parquet_files = 0
            for item in files:
                if item.name.endswith('.parquet'):
                    total_parquet_files += 1
                else:
                    # Check subdirectories
                    try:
                        subfiles = dbutils.fs.ls(item.path)
                        parquet_in_dir = [f for f in subfiles if f.name.endswith('.parquet')]
                        total_parquet_files += len(parquet_in_dir)
                        print(f"  Directory {item.name}: {len(parquet_in_dir)} Parquet files")
                    except Exception as list_error:
                        print(f"  Could not list {item.name}: {str(list_error)}")

            print(f"Found a total of {total_parquet_files} Parquet files")
            if total_parquet_files == 0:
                print("WARNING: No Parquet files were created!")
        except Exception as e:
            print(f"Error checking output directory: {str(e)}")

        if failed_files:
            print(f"WARNING: {len(failed_files)} file(s) failed to process: {failed_files}")

        return not failed_files

    except Exception as e:
        print(f"Error in bronze layer processing: {str(e)}")
        print(f"Exception traceback: {traceback.format_exc()}")
        return False

    finally:
        # Always remove scratch files, including on early return or error.
        shutil.rmtree(temp_processing_dir, ignore_errors=True)
        print("Cleaned up temporary directory")

# Run the bronze-layer pipeline when this cell executes; the boolean
# result it returns is not captured here.
bronze_layer_processing()