In [0]:
%run ./init

In [0]:
from datetime import datetime
import os
import re

# Create a text input widget to select the folder
dbutils.widgets.text("insert_data_flow_folder", "", "Enter the name of the DataFlow source folder to process (e.g., DF1, DF2..DFn, etc)")

# Create a dropdown widget for selecting model type (o1 or o1 mini)
dbutils.widgets.dropdown("model_choice", "o1", ["o1", "o1-mini"], "Choose model type")

# Retrieve the selected folder from the widget. Surrounding whitespace is
# trimmed BEFORE validation so that a blank-looking value (e.g. "   ") is
# rejected here instead of producing an invalid storage path later on.
# The name is upper-cased to match the folder naming used in the prompt text
# (DF1, DF2, ...).
selected_folder = dbutils.widgets.get("insert_data_flow_folder").strip().upper()

if not selected_folder:
    raise ValueError("Please provide a folder name in the widget.")

# Retrieve the selected model type
selected_model = dbutils.widgets.get("model_choice")

# # Set your storage account SAS token and name
# sasToken = "placeholder"  # Replace with your actual SAS token
# sa = "erdccalearning"  # Replace with your actual storage account name
# container_name = "nndemo"  # Replace with your actual container name
# spark.conf.set(f"fs.azure.account.auth.type.{sa}.dfs.core.windows.net", "SAS")
# spark.conf.set(f"fs.azure.sas.token.provider.type.{sa}.dfs.core.windows.net", "org.apache.hadoop.fs.azurebfs.sas.FixedSASTokenProvider")
# spark.conf.set(f"fs.azure.sas.nndemo.{sa}.blob.core.windows.net", f"{sasToken}")

# # Set the base URL for the container and directories
# base_url = f"wasbs://{container_name}@{sa}.blob.core.windows.net/AcceleratorSAPFiles"

# Storage account and container that hold the accelerator files
sa = "stlpdel01dev"
container_name = "codeconverter" # Replace with your actual container name

# Switch ABFS authentication to a custom access token, using the provider
# class name that the cluster exposes under the
# spark.databricks.passthrough.adls.gen2.tokenProviderClassName setting.
spark.conf.set("fs.azure.account.auth.type", "CustomAccessToken")
_token_provider_class = spark.conf.get("spark.databricks.passthrough.adls.gen2.tokenProviderClassName")
spark.conf.set("fs.azure.account.custom.token.provider.class", _token_provider_class)

# Root location of all input/output folders used by this notebook
base_path = f"abfss://{container_name}@{sa}.dfs.core.windows.net/AcceleratorSAPFiles/"

# Sub-folders of base_path: input files, cleaned output, archived originals
validated_files_directory, cleaned_files_directory, archived_files_directory = (
    "ValidatedFiles",
    "CleansedSQLFiles",
    "ArchivedValidatedFiles",
)


def list_all_files(directory_path):
    """Return the paths of every file found under ``directory_path``.

    Entries are visited in the order ``dbutils.fs.ls`` yields them. A
    sub-directory is expanded recursively and its files are placed at the
    position where the sub-directory itself appeared in the listing.
    """
    collected = []
    for entry in dbutils.fs.ls(directory_path):
        # A file contributes its own path; anything else is descended into.
        found = [entry.path] if entry.isFile() else list_all_files(entry.path)
        collected += found
    return collected

# Matches an opening Markdown code fence labelled as SQL. The label is matched
# case-insensitively so that "```SQL" / "```Sql" do not leave a stray "SQL"
# token at the top of the cleaned query.
_SQL_FENCE_RE = re.compile(r'```sql', re.IGNORECASE)
# Matches any remaining run of backticks (closing fences, quoted identifiers).
_BACKTICK_RUN_RE = re.compile(r'`+')


def cleanse_sql_output(output: str) -> str:
    """Strip Markdown code-fence markers and all backticks from model output.

    Two passes are applied:
      1. remove every "```sql" fence opener (any letter case), including its
         language label;
      2. remove every remaining backtick.

    Surrounding whitespace and newlines are left untouched.

    Args:
        output: Raw text returned by the model.

    Returns:
        The text with fence openers and backticks removed.
    """
    without_fence_label = _SQL_FENCE_RE.sub('', output)
    return _BACKTICK_RUN_RE.sub('', without_fence_label)

def read_file_content(file_path):
    """Read a text file through Spark and return its lines.

    The file is loaded with ``spark.read.text`` (one row per line) and the
    rows are collected directly from the DataFrame. Going through
    ``DataFrame.collect()`` avoids the detour via ``df.rdd`` and a Python
    lambda, which adds serialization overhead and relies on the RDD API
    (NOTE(review): some Databricks cluster access modes restrict that API —
    confirm for the target cluster).

    Args:
        file_path: Location of the file to read.

    Returns:
        A list with one string per line of the file, or ``None`` if the read
        failed (the error is printed; this is deliberately best-effort so the
        caller can skip the file and continue).
    """
    try:
        rows = spark.read.text(file_path).collect()
    except Exception as e:
        print(f"Error reading file: {file_path}")
        print(f"Exception: {e}")
        return None
    # Each Row has a single column holding the line text.
    return [row[0] for row in rows]

# Function to call the appropriate model based on selected type
def call_model(selected_model, prompt):
    """Send ``prompt`` to the deployment named by ``selected_model``.

    Args:
        selected_model: Either "o1" or "o1-mini" (the values offered by the
            ``model_choice`` widget).
        prompt: The prompt text to send.

    Returns:
        Whatever ``call_model_o1`` returns for the given deployment.

    Raises:
        ValueError: If ``selected_model`` is not one of the supported values.
    """
    if selected_model not in ("o1", "o1-mini"):
        raise ValueError("Invalid model choice")

    # NOTE(review): the "o1-mini" branch previously called `callmodelo1`, a
    # name that is not defined in this file and looks like a typo of
    # `call_model_o1` (both were called with a `deployment_name` argument).
    # Both choices now go through `call_model_o1` — presumably provided by
    # ./init; confirm that no separate `callmodelo1` helper was intended.
    return call_model_o1(deployment_name=selected_model, prompt=prompt)


# Helper function to join paths correctly (handling double slashes)
def join_paths(base, *paths):
    """Join ``base`` with further path segments using forward slashes.

    Leading slashes are stripped from every segment after ``base`` so that a
    segment such as "/DF1" is appended rather than treated as an absolute
    path that would discard everything before it.

    ``posixpath`` is used instead of ``os.path`` because the inputs are
    storage URIs (abfss://...), which always use "/" regardless of the
    operating system the code runs on.

    Args:
        base: The leading path or URI; used as-is.
        *paths: Segments to append to ``base``.

    Returns:
        The joined path string.
    """
    import posixpath

    # Strip leading slash from subsequent paths to avoid double slashes
    return posixpath.join(base, *(segment.lstrip("/") for segment in paths))


def _ensure_directory(directory_path):
    """Create ``directory_path`` via dbutils and report the actual outcome.

    NOTE(review): relies on ``dbutils.fs.mkdirs`` returning a truthy value on
    success, as described in the Databricks dbutils reference — confirm.
    """
    if dbutils.fs.mkdirs(directory_path):
        print(f"Directory {directory_path} already exists or created successfully.")
    else:
        print(f"Warning: could not create directory {directory_path}.")


# Function to process selected folder
def process_folder(source_directory):
    """Clean every validated SQL file in a folder and archive the originals.

    For each file found (recursively) under
    ``<base_path>/<validated_files_directory>/<source_directory>``:

      1. read the file and send it to the selected model with a cleaning
         prompt;
      2. send the original and the model's answer back to the model with a
         re-assessment prompt;
      3. strip code fences/backticks from the second answer and write it to
         ``<base_path>/<cleaned_files_directory>/<source_directory>/<today>/``,
         preserving the sub-folder structure;
      4. move the original file to the matching location under
         ``<base_path>/<archived_files_directory>/<source_directory>/<today>/``.

    Processing is best-effort per file: an error is printed and the loop
    moves on to the next file. A file whose processing failed is not
    archived, so it is picked up again on the next run.

    Args:
        source_directory: Name of the sub-folder of the validated-files
            directory to process.
    """
    current_date = datetime.now().strftime('%Y-%m-%d')

    # Update the path to correctly point to the selected subfolder in the ValidatedFiles directory
    source_directory_path = join_paths(base_path, validated_files_directory, source_directory)
    cleaned_directory_path = join_paths(base_path, cleaned_files_directory, source_directory, current_date)
    archive_directory_path = join_paths(base_path, archived_files_directory, source_directory, current_date)

    all_files = list_all_files(source_directory_path)

    for file_path in all_files:
        print(f"Reading file: {file_path}")
        try:
            file_content = read_file_content(file_path)
            if file_content is None:
                # The read error has already been reported by read_file_content.
                continue

            file_content_as_string = "\n".join(file_content)
            # NOTE(review): both prompts ask the model to avoid CTEs and also
            # to replace subqueries with CTEs; the wording is kept unchanged
            # here — confirm the intended instruction with the prompt owner.
            prompt = f"""
            Clean the following SQL query by removing comments and unwanted characters, making it ready for execution without changing its logic. Remove all backticks (`) and only retain the final version of the SQL query. After cleaning, optimize the query for performance, ensuring to avoid using Common Table Expressions (CTEs) wherever possible. If the query contains an inner query (subquery), please replace the inner query with a Common Table Expression (CTE) for improved readability and structure.
            
            When writing SQL queries, please make sure to explicitly define each column you are selecting. Avoid using SELECT *, and instead, list out the specific columns needed. This will help in optimizing performance, making the query more readable, and ensuring clarity in the data being fetched.
            
            
            SQL Code: \n{file_content_as_string}
            """

            model_output = call_model(selected_model, prompt)

            reassess_prompt = f"""

                "Please confirm that the following SQL code has been properly cleaned and optimized. The optimization should focus on improving performance, specifically avoiding the use of CTEs (Common Table Expressions) if feasible. After cleaning and optimization, ensure that:

                All backticks (`) are removed from the text.


                The code is optimized for performance, considering any opportunities to remove CTEs and refactor the query for better efficiency. If the query contains an inner query (subquery), please replace the inner query with a Common Table Expression (CTE) for improved readability and structure.

                Only the latest version of the SQL query is returned, with all unnecessary complexity reduced.

                Please make sure that the final output is a clean, optimized SQL query ready for automatic execution by the engine.

                When writing SQL queries, please make sure to explicitly define each column you are selecting. Avoid using SELECT *, and instead, list out the specific columns needed. This will help in optimizing performance, making the query more readable, and ensuring clarity in the data being fetched.

                Original SQL code: {file_content_as_string}

                Cleaned and optimized SQL code: {model_output}"

            """
            reassessed_output = call_model(selected_model, reassess_prompt)

            reassessed_output = cleanse_sql_output(reassessed_output)
            if not reassessed_output.strip():
                # Never write an empty file and archive the source for it;
                # the except below reports this and leaves the file in place.
                raise ValueError("Model returned empty SQL output")

            # Ensure the subfolder structure is preserved when saving the cleaned file.
            # Only the leading source-directory prefix is removed (not every
            # occurrence of that text inside the path).
            if file_path.startswith(source_directory_path):
                relative_path = file_path[len(source_directory_path):].strip('/')
            else:
                # Listing returned a path in an unexpected form; fall back to
                # the bare file name rather than nesting a full URI.
                relative_path = os.path.basename(file_path)

            # Construct the output file path and make sure its folder exists
            output_file_path = join_paths(cleaned_directory_path, relative_path)
            _ensure_directory(os.path.dirname(output_file_path))

            # Save the cleaned file
            dbutils.fs.put(output_file_path, reassessed_output, overwrite=True)
            print(f"Successfully processed and saved cleaned SQL code for file: {file_path}")

            # Archive the original file in the correct location (same
            # relative path, under the dated archive folder)
            final_archive_path = join_paths(archive_directory_path, relative_path)
            _ensure_directory(os.path.dirname(final_archive_path))

            dbutils.fs.mv(file_path, final_archive_path)
            print(f"Successfully archived the original validated file to: {final_archive_path}")
        except Exception as e:
            # Best-effort: report and continue with the next file.
            print(f"Error processing file {file_path}")
            print(f"Exception: {e}")
    print("All files processed.")


# Notebook entry point: clean and archive every validated file under the
# folder chosen in the "insert_data_flow_folder" widget.
process_folder(selected_folder)
