In [0]:
import xml.etree.ElementTree as ET
from datetime import datetime
import os
import time
from pyspark.sql.functions import split, first, collect_list, concat_ws, lit
from pyspark.sql import SparkSession

# Reuse the notebook's active Spark session, or create one named "Merge Example"
spark = (
    SparkSession.builder
    .appName("Merge Example")
    .getOrCreate()
)

# Authenticate storage access with a custom access token; the provider class is
# whatever the cluster reports under its
# spark.databricks.passthrough.adls.gen2.tokenProviderClassName setting.
spark.conf.set("fs.azure.account.auth.type", "CustomAccessToken")
spark.conf.set(
    "fs.azure.account.custom.token.provider.class",
    spark.conf.get("spark.databricks.passthrough.adls.gen2.tokenProviderClassName"),
)

# Storage account and container that hold the input files
sa = "stlpdel01dev"
container_name = "codeconverter"

# Root folder whose DF* subdirectories are scanned by main()
base_path = f"abfss://{container_name}@{sa}.dfs.core.windows.net/AcceleratorSAPFiles/InputFiles/"

# Function to list all directories (DF1, DF2, ..., DF48)
def list_subdirectories(base_path):
    """Return the paths of the "DF"-prefixed directories directly under ``base_path``.

    Args:
        base_path: Location handed to ``dbutils.fs.ls``.

    Returns:
        A list of ``path`` values, in listing order, for every entry that is a
        directory and whose ``name`` starts with "DF". Files and directories
        with any other prefix are left out.
    """
    return [
        entry.path
        for entry in dbutils.fs.ls(base_path)
        if entry.isDir() and entry.name.startswith("DF")
    ]

# Function to list all files in a given directory (no filtering by extension)
def list_all_files(directory_path):
    """Return the path of every file found under ``directory_path``, at any depth.

    The tree is walked depth-first with ``dbutils.fs.ls``: entries are visited
    in listing order, and a subdirectory's files appear at the position where
    that subdirectory was listed. No filtering by name or extension is applied.

    Args:
        directory_path: Directory to start from.

    Returns:
        A list of file paths; directories themselves are not included.
    """
    def _walk(path):
        # Yield files as they are met and descend into directories in place,
        # so the output order matches a recursive, in-order traversal.
        for entry in dbutils.fs.ls(path):
            if entry.isDir():
                yield from _walk(entry.path)
            else:
                yield entry.path

    return list(_walk(directory_path))

# Function to extract 'defaultDescription' from XML in the file using Spark DataFrame
def extract_description(file_path):
    """Read an XML file through Spark and return its first ``defaultDescription``.

    The file is loaded line by line with ``spark.read.text``, the lines are
    collected to the driver, joined with newlines and parsed with ElementTree.
    Only the first ``<descriptions>`` element below the root is consulted.

    Args:
        file_path: Path of the file to read.

    Returns:
        * the ``defaultDescription`` attribute of the first ``<descriptions>``
          element (``None`` if that element lacks the attribute);
        * the string "Descriptions tag not found" when there is no such element;
        * a string starting with "Error processing file: " when reading or
          parsing raises. Failures are reported as text, never raised.
    """
    try:
        # One row per line of the file; bring the text of every row to the driver
        lines = spark.read.text(file_path).rdd.map(lambda record: record[0]).collect()

        # Rebuild the document and parse it
        root = ET.fromstring("\n".join(lines))

        # find() yields the first match in document order, or None
        first_match = root.find('.//descriptions')
        if first_match is None:
            return "Descriptions tag not found"
        return first_match.get('defaultDescription')

    except Exception as e:
        return f"Error processing file: {e}"

def _source_system_for(file_name):
    """Map a file name to its SourceSystem label by suffix (case-insensitive)."""
    lowered = file_name.lower()
    if lowered.endswith('calculationview'):
        return "HANA"
    if lowered.endswith('.html'):
        return "BW"
    return "Other"


# Main logic to process all directories and files
def main():
    """Scan every DF* directory under ``base_path`` and describe each file found.

    For each file the row holds the name of its DF directory, the file's base
    name, whatever ``extract_description`` returns for it, and a SourceSystem
    label derived from the file-name suffix. A ``DataBricksDescription`` column
    is then derived from ``Description``, and the DataFrame is shown.

    Returns:
        A Spark DataFrame with columns DataFlow, FileName, Description,
        SourceSystem and DataBricksDescription. When no files are found the
        DataFrame is empty but has the same columns, so callers can use the
        result unconditionally.
    """
    file_data = []

    for directory in list_subdirectories(base_path):
        print(f"Processing directory: {directory}")

        # Directory paths may end with '/', which would make basename() return ''
        subdirectory_name = os.path.basename(directory.rstrip('/'))

        for file_path in list_all_files(directory):
            print(f"Processing file: {file_path}")

            description = extract_description(file_path)
            file_name = os.path.basename(file_path)

            file_data.append(
                (subdirectory_name, file_name, description, _source_system_for(file_name))
            )

    # An explicit schema keeps the column types fixed: schema inference cannot
    # type an empty list (or a Description column holding only None), and the
    # DataFrame must exist even when nothing was found so it can be returned.
    df = spark.createDataFrame(
        file_data,
        "DataFlow string, FileName string, Description string, SourceSystem string",
    )

    # Element [1] of the split is the text that follows the first "CV_" in the
    # description. Presumably null when the description contains no "CV_" --
    # TODO confirm the behaviour with ANSI mode enabled.
    df = df.withColumn("DataBricksDescription", split(df["Description"], "CV_")[1])

    # Show the DataFrame
    df.show()

    return df

# main() is intentionally not called in this cell: the next cell runs it and
# keeps the result in `df`. Calling it here as well would scan every input
# file a second time and throw the first result away.


In [0]:
# Run the scan and keep the resulting DataFrame; the next cell merges it into the config table
df = main()

In [0]:
# `df` is the DataFrame returned by main() in the previous cell
df1 = df  # second name for the same DataFrame object (plain assignment, not a copy)

# Expose the scanned file metadata to SQL under the name temp_table
df1.createOrReplaceTempView("temp_table")

# Define flowtableinfo_temp as a temporary VIEW that selects everything from the
# config table.
# NOTE(review): a view is a stored query, not a materialized copy of the table.
# Confirm how MERGE INTO this view behaves -- if it is applied to the underlying
# table, the second MERGE below merges the table into itself and is redundant.
spark.sql("""
CREATE OR REPLACE TEMPORARY VIEW flowtableinfo_temp AS
SELECT * FROM codeconverter_config.dataflowtableinfo
""")

# First MERGE: target flowtableinfo_temp, source temp_table, matched on
# (DataFlow, SAPFileName = FileName). Matches get their table names and
# SourceSystem overwritten; new keys are inserted with a NULL group_id.
# NOTE(review): FileName is a base name and main() collects files recursively,
# so two same-named files under one DF directory would produce duplicate source
# keys -- verify this cannot happen or is handled.
merge_query_temp = """
MERGE INTO flowtableinfo_temp AS tv
USING temp_table AS tt
ON tv.DataFlow = tt.DataFlow AND tv.SAPFileName = tt.FileName
WHEN MATCHED THEN
    UPDATE SET
        tv.DataBricksTableName = tt.DataBricksDescription,
        tv.SAPTableName = tt.Description,
        tv.SourceSystem = tt.SourceSystem  -- Added SourceSystem update
WHEN NOT MATCHED THEN
    INSERT (DataFlow, SAPFileName, SAPTableName, DataBricksTableName, SourceSystem, group_id)
    VALUES (tt.DataFlow, tt.FileName, tt.Description, tt.DataBricksDescription, tt.SourceSystem, NULL);
"""
spark.sql(merge_query_temp)

# Second MERGE: target is the config table itself, source is flowtableinfo_temp,
# matched on (DataFlow, SAPFileName). Matches get every non-key column
# overwritten from the view; unmatched view rows are inserted.
update_original_query = """
MERGE INTO codeconverter_config.dataflowtableinfo AS orig
USING flowtableinfo_temp AS temp
ON orig.DataFlow = temp.DataFlow AND orig.SAPFileName = temp.SAPFileName
WHEN MATCHED THEN
    UPDATE SET
        orig.SAPTableName = temp.SAPTableName,
        orig.DataBricksTableName = temp.DataBricksTableName,
        orig.SchemaName = temp.SchemaName,
        orig.SourceSystem = temp.SourceSystem,  -- Added SourceSystem update
        orig.group_id = temp.group_id
WHEN NOT MATCHED THEN
    INSERT (DataFlow, SAPFileName, SAPTableName, DataBricksTableName, SchemaName, SourceSystem, group_id)
    VALUES (temp.DataFlow, temp.SAPFileName, temp.SAPTableName, temp.DataBricksTableName, temp.SchemaName, temp.SourceSystem, temp.group_id);
"""
spark.sql(update_original_query)

# Read the config table back and print it (show() prints only the first rows)
result_df = spark.sql("SELECT * FROM codeconverter_config.dataflowtableinfo")
result_df.show()
