# SAP to Databricks Code Conversion

# How to use:
This manual procedure is temporary, until reading directly from HANA via SQL is in place.
1. Export XMLs from Hana
2. Open blob storage and save the files in:
Blob container -> nndemo -> AcceleratorSAPFiles -> InputFiles
3. Select the DF (data flow) from the widget at the top of the page: Select directory.
4. Run the script that reads the uploaded files into the config table: extract_filename_description_from_xml_v2
5. Update the config table if needed (use the cell below).
6. Run all cells below.
Result:
The result files are posted automatically in Blob container -> nndemo -> AcceleratorSAPFiles -> ValidatedFiles


# Config File maintenance

The table below defines where the files and the prompts are stored. Usually no change is needed unless there are new data flows.

In [0]:
%sql
select * from codeconverter_config.dataflowbaseinfo
-- If there is no entry for the data flow (DF) yet, add one with an INSERT statement.
-- NOTE(review): the INSERT statement itself is not included in this cell.

The table below stores the list of all objects per data flow. It can be maintained in order to:
- group CVs together
- rename CVs

Use the cell below to maintain it:



In [0]:
%sql
select * from codeconverter_config.dataflowtableinfo where DataFlow = 'DF34'
-- Change the DataFlow value in the filter above to inspect another data flow.

-- For maintenance, run the required statements below (none are included in this cell).


## Import required modules (libraries)

In [0]:
# Import required modules (libraries)
from datetime import datetime
import os
import time
import pandas as pd
import re
import concurrent.futures


In [0]:
 # This will allow widget creation in the source notebook
# NOTE(review): this flag is not read anywhere in this notebook; presumably it is
# consumed by the notebook pulled in with `%run ./init` — confirm there.
is_target_notebook = True 

## Import required functions

In [0]:
%run ./init

In [0]:
# Function to call the appropriate model based on selected type
def call_model(selected_model, prompt):
    """Send ``prompt`` to the deployment named by ``selected_model``.

    Args:
        selected_model: Model choice; one of "o1", "o1-preview" or "o1-mini".
        prompt: Prompt text forwarded to the model.

    Returns:
        Whatever ``call_model_o1`` returns for the chosen deployment.

    Raises:
        ValueError: If ``selected_model`` is not one of the supported choices.
    """
    # Every supported choice maps to a deployment of the same name and goes
    # through the same helper. The original called `callmodelo1` for two of the
    # three choices, which is inconsistent with the `call_model_o1` spelling.
    # NOTE(review): call_model_o1 is assumed to be provided by `%run ./init`.
    if selected_model not in ("o1", "o1-preview", "o1-mini"):
        raise ValueError("Invalid model choice")
    return call_model_o1(deployment_name=selected_model, prompt=prompt)


## Get parameter value(s)

In [0]:

# Create a dropdown widget for selecting the model type (o1, o1-mini or o1-preview)
dbutils.widgets.dropdown("model_choice", "o1", ["o1", "o1-mini", "o1-preview"], "Choose model type")
# Retrieve the selected model type; it is later passed to call_model
selected_model = dbutils.widgets.get("model_choice")
print(f"Selected model: {selected_model}")


In [0]:
# Read the data flow chosen in the "data_flow_insertion" widget, normalised to upper case
data_flow_man = dbutils.widgets.get("data_flow_insertion").upper()
print(f"Current working directory: {data_flow_man}")

In [0]:
# Storage access configuration.
#
# Disabled variant, kept for reference: SAS-token authentication with the token and the
# storage account name read from the "codeconv" secret scope.
# # Set your storage account SAS token and name
# sasToken = dbutils.secrets.get(scope="codeconv", key="sasTokenKey") # Create a new scope (directly from databricks UI) and secret (in Key-Vault)
# #sa = "erdccalearning" # Replace with your actual storage account name
# sa = dbutils.secrets.get(scope="codeconv", key="storageAccountName")
# container_name = "nndemo" # Replace with your actual container name
# spark.conf.set(f"fs.azure.account.auth.type.{sa}.dfs.core.windows.net", "SAS")
# spark.conf.set(f"fs.azure.sas.token.provider.type.{sa}.dfs.core.windows.net", "org.apache.hadoop.fs.azurebfs.sas.FixedSASTokenProvider")
# spark.conf.set(f"fs.azure.sas.nndemo.{sa}.blob.core.windows.net", f"{sasToken}")

# Active variant: storage account and container used to build the abfss:// URLs in this notebook
sa = "stlpdel01dev"
container_name = "codeconverter" # Replace with your actual container name
# Authenticate with a custom access token, reusing the token provider class published in
# the cluster setting spark.databricks.passthrough.adls.gen2.tokenProviderClassName.
# NOTE(review): this presumably requires a cluster with credential passthrough enabled — confirm.
spark.conf.set("fs.azure.account.auth.type", "CustomAccessToken")
spark.conf.set("fs.azure.account.custom.token.provider.class", spark.conf.get("spark.databricks.passthrough.adls.gen2.tokenProviderClassName"))

In [0]:
# Get the base configuration of the selected data flow from the config table (DataFlowBaseInfo)
get_dataflowbaseinfo = f"""
    SELECT * FROM codeconverter_config.dataflowbaseinfo
    WHERE DataFlow = '{data_flow_man}'
"""

# Execute the query and assign to a DataFrame
result_flowbaseinfo = spark.sql(get_dataflowbaseinfo)

# Fetch the matching row once (a single Spark job) instead of once per column.
# first() returns None when no row matches, so fail with an actionable message
# rather than an opaque "'NoneType' object is not subscriptable".
_flowbaseinfo_row = result_flowbaseinfo.select(
    'DataFlow', 'DataFlowFilesSourceFolder', 'PathToPrompt'
).first()
if _flowbaseinfo_row is None:
    raise ValueError(
        f"No entry found in codeconverter_config.dataflowbaseinfo for DataFlow '{data_flow_man}'. "
        "Add the data flow to the config table before running this notebook."
    )

# Get values from DataFlowBaseInfo table (positional, in the order selected above)
dataflow_value = _flowbaseinfo_row[0]
dataflow_files_sourcepath_value = _flowbaseinfo_row[1]
prompt_value = _flowbaseinfo_row[2]


In [0]:
# Show the configuration values resolved for the selected data flow, one per line
print("\n".join([
    "DataFlow: " + dataflow_value,
    "Source Path: " + dataflow_files_sourcepath_value,
    "Prompt Path: " + prompt_value,
]))

In [0]:
# Set the base URL for the container and directories
base_url = f"abfss://{container_name}@{sa}.dfs.core.windows.net/AcceleratorSAPFiles/"

# Set the source directory path from the folder configured for this data flow
# (DataFlowFilesSourceFolder in the DataFlowBaseInfo table)
source_directory_path = f"{base_url}{dataflow_files_sourcepath_value}/"

# Per-data-flow subdirectories under ValidatedFiles and ArchivedFiles, keyed by data_flow_man.
# These two paths are only defined here; no directory is created for them in this cell.
target_directory_path = f"{base_url}ValidatedFiles/{data_flow_man}/"
archive_directory_path = f"{base_url}ArchivedFiles/{data_flow_man}/"

# Create an archive folder with the current date (YYYY-MM-DD)
current_date = datetime.now().strftime('%Y-%m-%d')
# rstrip('/') removes the trailing slash(es) so the date folder is appended with a single '/'
archive_date_folder = f"{archive_directory_path.rstrip('/')}/{current_date}/"
dbutils.fs.mkdirs(archive_date_folder)

# Create a dated logs directory for this data flow in the Blob Storage container
logs_directory_path = f"{base_url}Logs/{data_flow_man}/{current_date}/"
dbutils.fs.mkdirs(logs_directory_path)

# Define Prompt Path (PathToPrompt is appended to base_url as-is)
prompt_path = base_url + prompt_value

In [0]:
# Show the full storage path of the prompt file
print(f"Prompt Path: {prompt_path}")

## Get prompt

In [0]:
# Load the prompt file.
# NOTE(review): read_file_content appears to return a sequence of lines (its result is
# joined with newlines here and checked against None elsewhere in this notebook) — confirm.
prompt = read_file_content(prompt_path)

# Stop early with a clear message instead of failing inside str.join (None) or silently
# sending the source files to the model without any instructions (empty prompt).
if not prompt:
    raise ValueError(f"Prompt file could not be read or is empty: {prompt_path}")

detailed_prompt = '\n'.join(prompt)

In [0]:
# Display the prompt exactly as returned by read_file_content (not the joined detailed_prompt)
print(prompt)

## Loop through CVs

In [0]:
# List files in the source directory
files = dbutils.fs.ls(source_directory_path)

# Retrieve the HANA-related files registered for this data flow
hana_files_query = f"SELECT SAPFileName FROM codeconverter_config.DataFlowTableInfo WHERE DataFlow = '{data_flow_man}' AND SourceSystem = 'HANA'"
hana_files_df = spark.sql(hana_files_query)

# Convert the result into a list of file names
hana_files = [row['SAPFileName'] for row in hana_files_df.collect()]

# Keep only files with source system HANA (set lookup instead of scanning the list per file)
_hana_file_names = set(hana_files)
files = [file for file in files if file.name in _hana_file_names]

# Initialize a list to store all log messages
log_messages = []

# Configuration query: all objects registered for this data flow
get_dataflowtableinfo = f"""
SELECT * FROM codeconverter_config.DataFlowTableInfo
WHERE DataFlow = '{data_flow_man}'
"""

# Execute the query and convert the results to Pandas DataFrame
result_flowtableinfo = spark.sql(get_dataflowtableinfo)
metadata_pd = result_flowtableinfo.toPandas()

# Cast 'group_id' to object dtype and replace nulls with the 'withoutgroupid' marker.
# The result is assigned back to the column: calling fillna(..., inplace=True) on the
# selected column is chained assignment, which pandas deprecates and which does not
# update the DataFrame when Copy-on-Write is enabled.
metadata_pd['group_id'] = metadata_pd['group_id'].astype(object).fillna('withoutgroupid')

# Group files by group_id (files without a group end up under 'withoutgroupid'
# and are processed individually)
grouped_files = metadata_pd.groupby('group_id')['SAPFileName'].apply(list).to_dict()

# Define default schema to cover cases where it's not defined
default_schema = 'nntst'

# Helpers used by process_file_or_group. They rely on notebook-level names: files,
# metadata_pd, default_schema, detailed_prompt, selected_model, current_date, the
# target/archive/logs directory paths, and log_to_blob / read_file_content / call_model.

def _find_blob_path(file_name):
    """Return the storage path of the listed blob called ``file_name``, or None if absent."""
    return next((blob.path for blob in files if blob.name == file_name), None)


def _resolve_schema_and_table(metadata_row):
    """Return (schema, table) for a metadata row, using default_schema when SchemaName is null."""
    schema = metadata_row['SchemaName'] if not pd.isnull(metadata_row['SchemaName']) else default_schema
    table = metadata_row['DataBricksTableName']
    return schema, table


def _convert_and_validate(source_code, schema, table, subject, iteration_log):
    """Run the initial conversion and the review pass; return the reviewed model output.

    ``subject`` is the label used in log messages, e.g. "file X" or "Group Y".
    """
    start_time_initial = time.time()
    prompt = f"{detailed_prompt}. Take into account to use the schema '{schema}' and table '{table}':\n{source_code}"
    model_output = call_model(selected_model, prompt=prompt)
    initial_conversion_time = time.time() - start_time_initial
    log_to_blob(f"Initial model conversion for {subject} took {initial_conversion_time:.2f} seconds", "Initial Conversion")
    iteration_log.append(f"**Initial Conversion** took {initial_conversion_time:.2f} seconds for {subject}\n")

    # Second pass: ask the model to review its own translation against the original source
    reassess_prompt = f"Please confirm that the following translation of SAP code to Databricks SQL code using the schema '{schema}' and table '{table}' is correct. "
    reassess_prompt += f"If necessary, improve the translation, but make sure to **provide the corrected Databricks SQL code** at the end:\n{source_code}\n\n"
    reassess_prompt += f"Initial Databricks SQL code:\n{model_output}\n\n"
    reassess_prompt += "Please review and provide the **final SQL code** as the output, keeping the language cast to SQL."
    reassess_prompt += "Keep the documentation/comments generated previously, correct any errors in the descriptions, and update them as needed to reflect any changes or new information. Ensure the documentation remains accurate, clear, and up-to-date based on the latest changes."
    return call_model(selected_model, prompt=reassess_prompt)


def _archive_source_file(file_name, file_path, iteration_log):
    """Move the original file into today's archive folder and log the move."""
    archive_file_path = f"{archive_date_folder}{file_name}"
    dbutils.fs.mv(file_path, archive_file_path)
    log_to_blob(f"Successfully archived the original file {file_name} to: {archive_file_path}", "Archive")
    iteration_log.append(f"**Archive**: Successfully archived the original file {file_name}\n")


def _write_iteration_log(log_file_name, iteration_log, subject):
    """Write the iteration log to the logs directory; a failure is printed, not raised."""
    iteration_log_file_path = f"{logs_directory_path}{log_file_name}"
    try:
        dbutils.fs.put(iteration_log_file_path, "\n".join(iteration_log), overwrite=True)
        print(f"Iteration log written for {subject} to: {iteration_log_file_path}")
    except Exception as e:
        print(f"Error writing iteration log for {subject}: {e}")


def _process_single_file(file_name, file_path, iteration_log):
    """Convert, validate, save and archive one ungrouped file, then write its log file.

    Errors during processing are logged and do not propagate, so the remaining
    files are still handled. The log file contains the shared iteration log as
    accumulated so far (including entries of previously processed files).
    """
    iteration_log.append(f"**Start Processing** for file {file_name}\n")
    log_to_blob(f"Reading file: {file_path}", "Read")
    iteration_log.append(f"Reading file: {file_path}\n")
    try:
        file_content = read_file_content(file_path)
        if file_content is None:
            # Record the skip instead of dropping the file silently
            iteration_log.append(f"**Skipped**: no content could be read for file {file_name}\n")
            return
        file_content_as_string = "\n".join(file_content)
        file_metadata = metadata_pd[metadata_pd['SAPFileName'] == file_name].iloc[0]
        schema, table = _resolve_schema_and_table(file_metadata)
        reassessed_output = _convert_and_validate(
            file_content_as_string, schema, table, f"file {file_name}", iteration_log
        )
        output_file_path = f"{target_directory_path}{os.path.basename(file_path).replace('.txt', '_validated_code.txt')}"
        dbutils.fs.put(output_file_path, reassessed_output, overwrite=True)
        log_to_blob(f"Successfully processed and saved validated output for file {file_name}", "Validated Output")
        iteration_log.append(f"**Validated Output**: Successfully processed and saved for file {file_name}\n")
        _archive_source_file(file_name, file_path, iteration_log)
    except Exception as e:
        log_message = f"Error processing file {file_name}: {str(e)}"
        log_to_blob(log_message, "Error")
        iteration_log.append(f"**Error**: {log_message}\n")
    current_time = datetime.now().strftime('%H-%M-%S')
    iteration_log_filename = f"log_{os.path.basename(file_path).replace('.txt', '')}_{current_date}_{current_time}.log"
    _write_iteration_log(iteration_log_filename, iteration_log, f"file {file_name}")


def _process_group(group_id, available_file_list, iteration_log):
    """Combine all readable files of a group, convert them in one model call, save and archive."""
    combined_content = []
    combined_file_names = []
    for file_name in sorted(available_file_list):
        file_path = _find_blob_path(file_name)
        if not file_path:
            continue
        log_to_blob(f"Reading file: {file_path}", "Read")
        iteration_log.append(f"Reading file: {file_path}\n")
        file_content = read_file_content(file_path)
        if file_content is None:
            iteration_log.append(f"**Skipped**: no content could be read for file {file_name}\n")
            continue
        combined_file_names.append(os.path.basename(file_path).replace(".txt", ""))
        combined_content.append(f"CV_{combined_file_names[-1]} starts here\n")
        combined_content.extend(file_content)

    if not combined_file_names:
        # Nothing readable: previously this either raised IndexError or sent an empty
        # source to the model and wrote a "CV_.validated_code.txt" file.
        log_message = f"No readable files for Group {group_id}; conversion skipped"
        log_to_blob(log_message, "Error")
        iteration_log.append(f"**Error**: {log_message}\n")
        return

    # Output name: "CV_" plus the last two underscore-separated parts of every member's name
    combined_filename_parts = [
        part
        for name in combined_file_names
        for part in name.split('.')[0].split('_')[-2:]
    ]
    combined_filename = "CV_" + "_".join(combined_filename_parts)
    combined_content_as_string = "\n".join(combined_content)

    # Schema and table are taken from the first metadata row of the group
    group_metadata = metadata_pd[metadata_pd['group_id'] == group_id].iloc[0]
    schema, table = _resolve_schema_and_table(group_metadata)
    reassessed_output = _convert_and_validate(
        combined_content_as_string, schema, table, f"Group {group_id}", iteration_log
    )
    output_file_path = f"{target_directory_path}{combined_filename}.validated_code.txt"
    dbutils.fs.put(output_file_path, reassessed_output, overwrite=True)
    log_to_blob(f"Successfully processed and saved validated output for Group {group_id}", "Validated Output")
    iteration_log.append(f"**Validated Output**: Successfully processed and saved for Group {group_id}\n")

    for file_name in available_file_list:
        file_path = _find_blob_path(file_name)
        if file_path:
            _archive_source_file(file_name, file_path, iteration_log)

    _write_iteration_log(f"log_group{group_id}_{current_date}.log", iteration_log, f"Group {group_id}")


# Function to process each file or group of files
def process_file_or_group(group_id, available_file_list):
    """Process one group of SAP files and return the log lines produced.

    Files of the 'withoutgroupid' group are converted one by one; files of any
    other group are combined and converted together in a single model call.

    Args:
        group_id: Key of the group in ``grouped_files``.
        available_file_list: Names of the group's files that exist in the source directory.

    Returns:
        List of log lines (strings) describing what happened for this group.
        Processing errors are logged and included in the list, not raised.
    """
    iteration_log = [f"**Start Processing** for Group {group_id}\n"]

    try:
        if not available_file_list:
            iteration_log.append(f"No files found for Group {group_id}\n")
            return iteration_log

        # Log message if the number of files being processed is less than the expected
        expected_count = len(grouped_files[group_id])
        if len(available_file_list) < expected_count:
            log_message = f"Only {len(available_file_list)} out of {expected_count} files found and processed from the blob storage for Group {group_id}: {', '.join(available_file_list)}"
            log_to_blob(log_message, "Processing")
            iteration_log.append(f"**Processing Info**: {log_message}\n")

        if group_id == 'withoutgroupid':
            for file_name in available_file_list:
                file_path = _find_blob_path(file_name)
                if file_path:
                    _process_single_file(file_name, file_path, iteration_log)
        else:
            _process_group(group_id, available_file_list, iteration_log)
    except Exception as e:
        log_message = f"Error processing Group {group_id}: {str(e)}"
        log_to_blob(log_message, "Error")
        iteration_log.append(f"**Error**: {log_message}\n")
    return iteration_log


# Submit one task per group, passing only the files that are actually present in storage
listed_blob_names = {blob_file.name for blob_file in files}
with concurrent.futures.ThreadPoolExecutor() as executor:
    future_to_group = {}
    for group_id, file_list in grouped_files.items():
        files_present = [name for name in file_list if name in listed_blob_names]
        pending = executor.submit(process_file_or_group, group_id, files_present)
        future_to_group[pending] = group_id

    # Collect each group's log lines as soon as its task finishes
    for future in concurrent.futures.as_completed(future_to_group):
        group_id = future_to_group[future]
        try:
            result = future.result()
        except Exception as e:
            log_messages.append(f"Error processing Group {group_id}: {e}\n")
        else:
            log_messages.extend(result)

# Leaving the `with` block shuts the executor down once all tasks have completed.
# Write the combined log of all groups to a time-stamped file in the logs directory.
current_time = datetime.now().strftime('%H-%M-%S')
final_log_filename = f"final_log_{current_time}.log"
final_log_file_path = f"{logs_directory_path}{final_log_filename}"

try:
    dbutils.fs.put(final_log_file_path, "\n".join(log_messages), overwrite=True)
except Exception as e:
    print(f"Error writing final log: {e}")
else:
    print(f"Final log written to: {final_log_file_path}")
