In [0]:
%run ./init

In [0]:
from datetime import datetime
import os
import re

# Storage settings: SAS token, storage account name and container name.
# NOTE(review): the token below is a literal placeholder; consider loading the
# real value from a secret store rather than hard-coding it in the notebook.
sasToken = "placeholder"  # Replace with your actual SAS token
sa = "erdccalearning"  # Replace with your actual storage account name
container_name = "nndemo"  # Replace with your actual container name

# SAS settings keyed on the account's dfs endpoint.
# NOTE(review): every path below uses wasbs:// on the blob endpoint, so these
# two settings may not be what authorises those paths -- confirm they are needed.
spark.conf.set(f"fs.azure.account.auth.type.{sa}.dfs.core.windows.net", "SAS")
spark.conf.set(f"fs.azure.sas.token.provider.type.{sa}.dfs.core.windows.net", "org.apache.hadoop.fs.azurebfs.sas.FixedSASTokenProvider")
# Container-scoped SAS token for the blob endpoint. The key is built from
# container_name (not a hard-coded container) so it always matches the
# container referenced by base_url.
spark.conf.set(f"fs.azure.sas.{container_name}.{sa}.blob.core.windows.net", sasToken)

# Set the base URL for the container and directories
base_url = f"wasbs://{container_name}@{sa}.blob.core.windows.net/AcceleratorSAPFiles/"

source_directory_path = f"{base_url}ValidatedFiles/"
cleaned_directory_path = f"{base_url}CleanedSQLFiles/"
archive_directory_path = f"{base_url}ArchivedValidatedFiles/"

# Create an archive folder named after the current date (YYYY-MM-DD).
# The date comes from datetime.now(), i.e. the local clock of the driver.
current_date = datetime.now().strftime('%Y-%m-%d')
archive_date_folder = f"{archive_directory_path.rstrip('/')}/{current_date}/"
dbutils.fs.mkdirs(archive_date_folder)

# List the entries of the source directory; they are processed further below.
files = dbutils.fs.ls(source_directory_path)


# Post-processing for the model's output: strips Markdown code-fence markers and any remaining backticks so that only the SQL text is kept.

def cleanse_sql_output(output: str) -> str:
    """Strip Markdown code-fence markers and backticks from model output.

    The fence opener and its language tag (three backticks followed by
    "sql", in any letter case) are removed first, then every remaining run
    of backticks. Backticks that quote identifiers inside the query are
    removed as well. Surrounding whitespace and newlines are left untouched.

    Args:
        output: Raw text returned by the model.

    Returns:
        The text with fence markers and all backticks removed.
    """
    # Drop the fence opener together with its language tag. The match is
    # case-insensitive: with a case-sensitive match, "```SQL" would only lose
    # its backticks in the next step and leave a stray "SQL" token behind.
    without_fence_tag = re.sub(r'```sql', '', output, flags=re.IGNORECASE)

    # Remove every remaining backtick (closing fences, quoted identifiers).
    return re.sub(r'`+', '', without_fence_tag)


def read_file_content(file_path):
    """Load a text file through Spark and return its rows as a list of strings.

    The file is read with spark.read.text and the first column of every row
    is gathered into a Python list. If anything goes wrong, the error is
    printed and None is returned instead of raising.
    """
    try:
        text_frame = spark.read.text(file_path)
        # Pull the first column out of each collected row.
        lines = [row[0] for row in text_frame.collect()]
    except Exception as e:
        print(f"Error reading file: {file_path}")
        print(f"Exception: {e}")
        return None
    return lines

# Process each entry of the source directory: clean its SQL with the model,
# store the result in the cleaned directory, then archive the original file.
for file_info in files:
    file_path = file_info.path

    # Only files are processed; anything else is reported and skipped.
    if not file_info.isFile():
        print(f"Skipping directory: {file_path}")
        continue

    print(f"Reading file: {file_path}")

    # Errors are contained per file so that one failure does not stop the run.
    # A file that fails is neither written to the cleaned directory nor archived.
    try:
        # Read file content; None means the read failed and was already reported.
        file_content = read_file_content(file_path)
        if file_content is None:
            continue

        # Convert file content to a single string
        file_content_as_string = "\n".join(file_content)

        # First pass: ask the model to clean the SQL code
        prompt = f"Clean the following SQL code by removing comments and unwanted characters, making it ready to run automatically by the engine without altering the logic. Please clean the following query by removing all backticks (`) from the text and keep only the last version of the sql squery. :\n{file_content_as_string}"
        model_output = call_model_o1(deployment_name="o1", prompt=prompt)

        # Second pass: ask the model to reassess the cleaned SQL against the original
        reassess_prompt = (f"Please confirm that the following SQL code has been cleaned correctly, without altering its logic, and is ready for automatic execution by the engine. Please make sure that the following query has all backticks (`) removed from the text and has only the last version of the sql query. When you write the output make sure to keep only the query:\n\nOriginal SQL code:\n{file_content_as_string}\n\n"
                           f"Cleaned SQL code:\n{model_output}")
        reassessed_output = call_model_o1(deployment_name="o1", prompt=reassess_prompt)

        # Strip any code-fence markers and backticks left in the model's answer
        reassessed_output = cleanse_sql_output(reassessed_output)

        # Build both destination paths from the file's base name. Deriving the
        # output path with str.replace on the source prefix would leave the
        # path unchanged whenever the listed path does not contain that prefix
        # verbatim, and the put below would then overwrite the source file.
        file_name = os.path.basename(file_path)
        output_file_path = f"{cleaned_directory_path}{file_name}"

        # Save the reassessed output to the cleaned directory
        dbutils.fs.put(output_file_path, reassessed_output, overwrite=True)
        print(f"Successfully processed and saved cleaned SQL code for file: {file_path}")

        # Archive the original validated file to the dated archive directory
        archive_file_path = f"{archive_date_folder}{file_name}"
        dbutils.fs.mv(file_path, archive_file_path)
        print(f"Successfully archived the original validated file to: {archive_file_path}")

    except Exception as e:
        print(f"Error processing file {file_path}")
        print(f"Exception: {e}")