In [0]:
%run ./init

In [0]:
import re
import os
import pandas as pd
from bs4 import BeautifulSoup
from openai import AzureOpenAI
from datetime import datetime
import time


In [0]:
# Create a dropdown widget for selecting the model deployment: "o1" (default), "o1-mini" or "o1-preview"
dbutils.widgets.dropdown("model_choice", "o1", ["o1", "o1-mini", "o1-preview"], "Choose model type")
# Retrieve the selected model type; it is later passed to call_model() as its first argument
selected_model = dbutils.widgets.get("model_choice")
print(f"Selected model: {selected_model}")

In [0]:
# Function to call the appropriate model deployment based on the selected type
def call_model(selected_model, prompt):
    """
    Send a prompt to the deployment that matches the selected model type.
    Parameters:
    selected_model (str): One of "o1", "o1-preview" or "o1-mini" (the "model_choice" widget options).
    prompt (str): The prompt text forwarded to the model.
    Returns:
    Whatever call_model_o1 returns for the given deployment and prompt.
    Raises:
    ValueError: If selected_model is not one of the supported choices.
    """
    # NOTE(review): call_model_o1 is not defined in this notebook; presumably it comes from
    # `%run ./init` - confirm. Two branches previously called `callmodelo1`, a name used
    # nowhere else, so all supported deployments now go through the same helper.
    supported_deployments = ("o1", "o1-preview", "o1-mini")
    if selected_model not in supported_deployments:
        raise ValueError("Invalid model choice")
    # The deployment name is identical to the widget value for every supported choice
    return call_model_o1(deployment_name=selected_model, prompt=prompt)


In [0]:
# Legacy alternative (disabled): SAS-token authentication against a different storage account.
# Kept for reference only; the active configuration follows this commented-out section.
# # Set your storage account SAS token and name
# sasToken = dbutils.secrets.get(scope="codeconv", key="sasTokenKey")  # Create your own Scope in DataBricks and assign a secret for it
# sa = "erdccalearning"  # Replace with your actual storage account name
# container_name = "nndemo"  # Replace with your actual container name
# spark.conf.set(f"fs.azure.account.auth.type.{sa}.dfs.core.windows.net", "SAS")
# spark.conf.set(f"fs.azure.sas.token.provider.type.{sa}.dfs.core.windows.net", "org.apache.hadoop.fs.azurebfs.sas.FixedSASTokenProvider")
# spark.conf.set(f"fs.azure.sas.nndemo.{sa}.blob.core.windows.net", f"{sasToken}")

# # Set the base URL for the container and directories
# base_url = f"wasbs://{container_name}@{sa}.blob.core.windows.net/AcceleratorSAPFiles/"

# Active configuration: access the storage account through a custom access-token provider.
# The provider class name is read (without a default) from the Spark setting
# "spark.databricks.passthrough.adls.gen2.tokenProviderClassName".
# NOTE(review): presumably this requires a cluster with credential passthrough enabled - confirm.
sa = "stlpdel01dev"
container_name = "codeconverter" # Container that holds the AcceleratorSAPFiles folder
spark.conf.set("fs.azure.account.auth.type", "CustomAccessToken")
spark.conf.set("fs.azure.account.custom.token.provider.class", spark.conf.get("spark.databricks.passthrough.adls.gen2.tokenProviderClassName"))
# Root folder for everything this notebook reads and writes
base_url = f"abfss://{container_name}@{sa}.dfs.core.windows.net/AcceleratorSAPFiles/"

# Define the base paths; a per-DataFlow sub-folder is appended to each of them further down
validated_directory_base_path = f"{base_url}ValidatedFiles/"
archived_directory_base_path = f"{base_url}ArchivedFiles/"
logs_directory_base_path = f"{base_url}Logs/"
input_directory_base_path = f"{base_url}InputFiles/"

# Ensure the output base directories exist.
# The InputFiles base directory is not created here; it is only read from later.
dbutils.fs.mkdirs(validated_directory_base_path)
dbutils.fs.mkdirs(archived_directory_base_path)
dbutils.fs.mkdirs(logs_directory_base_path)

# Create a widget to capture DataFlow input from the user
dbutils.widgets.text("DataFlow", "")
dataflow_input = dbutils.widgets.get("DataFlow")

# Normalise the DataFlow name: drop surrounding whitespace and convert to uppercase,
# because the value is used both in storage paths and to filter the config tables
dataflow = dataflow_input.strip().upper()

# Fail fast with a clear message: an empty name would otherwise create folders such as
# "ValidatedFiles//" and only fail later when no config row is found
if not dataflow:
    raise ValueError("The 'DataFlow' widget is empty; enter a DataFlow name before running this notebook.")

# Define functions
def process_html_spark(file_path):
    """
    Loads an HTML file through Spark and returns its markup without the head section and images.
    Parameters:
    file_path (str): Location of the HTML file, readable by spark.read.text.
    Returns:
    str: The file content joined with newlines, with <HEAD>...</HEAD> and all <img ...> tags removed.
    """
    # spark.read.text yields one row per line; collect() brings every line to the driver
    text_rows = spark.read.text(file_path).collect()
    markup = "\n".join(line['value'] for line in text_rows)
    # Both patterns ignore case; the head pattern also spans line breaks
    head_section = re.compile(r'<HEAD>.*?</HEAD>', flags=re.DOTALL | re.IGNORECASE)
    image_tag = re.compile(r'<img([\w\W]+?)>', flags=re.IGNORECASE)
    without_head = head_section.sub('', markup)
    return image_tag.sub('', without_head)

def extract_first_table(html):
    """
    Returns the first borderless flex-box table (border="2") found in the HTML.
    Parameters:
    html (str): The HTML content to search.
    Returns:
    str: The matched markup from the opening <table ...> tag up to the first following
    </table>, or the literal text "Table not found" when nothing matches.
    """
    # Non-greedy match: it ends at the first closing </table>, even if that tag belongs to a nested table
    overview_table = re.compile(
        r'(<table width="600" class="SAPBEXBorderlessFlexBox" cellspacing="0" cellpadding="0" border="2">.*?</table>)',
        re.DOTALL | re.IGNORECASE,
    )
    found = overview_table.search(html)
    if found is None:
        return "Table not found"
    return found.group(1)

def table_to_dataframe(html):
    """
    Converts the 'SAPBEXTableGrid' table found in the HTML into a pandas DataFrame.
    Only rows with exactly three <td> cells are used; the first row is treated as the header
    and skipped. A row with an empty first cell inherits the object type of the last row
    that had one.
    For rows whose object type contains both "data" and "source" and whose technical name
    contains "hana" (all case-insensitive), the technical name is split at the first space
    into 'Technical Name' and 'Source System', and 'HANA CV Name' is the last token of the
    name after replacing the delimiters '/', ':', '-' and '>' with spaces.
    Parameters:
    html (str): HTML content expected to contain the table.
    Returns:
    pandas.DataFrame: Columns "Object Type", "Name", "Technical Name", "Source System",
    "HANA CV Name"; empty (same columns) when the table is not found.
    """
    columns = ["Object Type", "Name", "Technical Name", "Source System", "HANA CV Name"]
    soup = BeautifulSoup(html, "html.parser")
    # Find the table that contains the data
    data_table = soup.find("table", class_="SAPBEXTableGrid")
    # If no table is found, return an empty DataFrame with expected columns
    if not data_table:
        print("Table with class 'SAPBEXTableGrid' not found.")
        return pd.DataFrame(columns=columns)

    data = []
    current_object_type = ""

    # Iterate over table rows, skipping the header row
    for row in data_table.find_all("tr")[1:]:
        cells = row.find_all("td")
        if len(cells) != 3:
            continue
        own_object_type = cells[0].text.strip()
        object_type = own_object_type or current_object_type
        name = cells[1].text.strip()  # Keep the original name unchanged
        technical_name = cells[2].text.strip()
        source_system = ""
        hana_cv_name = ""

        object_type_lower = object_type.lower()
        is_hana_data_source = (
            "data" in object_type_lower
            and "source" in object_type_lower
            and "hana" in technical_name.lower()
        )
        if is_hana_data_source:
            # Collapse runs of whitespace, then split once: "<technical name> <source system>"
            parts = re.sub(r'\s+', ' ', technical_name).strip().split(" ", 1)
            technical_name = parts[0]
            if len(parts) > 1:
                source_system = parts[1].strip()
            # The hyphen is listed first so it is a literal; written as '[/:->]' the ':->' part
            # is a character range that leaves '-' untouched
            name_tokens = re.sub(r'[-/:>]', ' ', name).split()
            # An empty or delimiter-only name has no tokens; keep the column blank instead of failing
            hana_cv_name = name_tokens[-1] if name_tokens else ""

        data.append([object_type, name, technical_name, source_system, hana_cv_name])
        # Remember the most recent explicit object type for following rows with a blank first cell
        if own_object_type:
            current_object_type = own_object_type

    # Create a pandas DataFrame with the additional columns
    return pd.DataFrame(data, columns=columns)

def extract_transformation_names(df):
    """
    Collects the 'Technical Name' of every row whose 'Object Type' mentions 'Transformation'.
    The match is case-insensitive; rows with a missing 'Object Type' are treated as non-matching.
    Parameters:
    df (pandas.DataFrame): Frame with 'Object Type' and 'Technical Name' columns.
    Returns:
    list: The matching 'Technical Name' values, in row order.
    """
    is_transformation = df['Object Type'].str.contains('Transformation', case=False, na=False)
    matching_names = df.loc[is_transformation, 'Technical Name']
    return matching_names.tolist()

def extract_tables_from_html(html, transformation_names):
    """
    Finds, for each transformation name, the enclosing <table> of its <a name="TRFN{name}"> anchor.
    Names without such an anchor, or whose anchor has no matching parent table, are skipped,
    so the result can be shorter than transformation_names.
    Parameters:
    html (str): The HTML content.
    transformation_names (list): List of transformation names.
    Returns:
    list: The matched tables as HTML strings, in the order of transformation_names.
    """
    document = BeautifulSoup(html, "html.parser")
    found_tables = []
    for transformation in transformation_names:
        # Only the first anchor with this name is considered
        anchor = document.find("a", {"name": f"TRFN{transformation}"})
        if not anchor:
            continue
        # The parent must be a borderless flex-box table with border="1"
        parent_table = anchor.find_parent("table", {
            "width": "600",
            "class": "SAPBEXBorderlessFlexBox",
            "cellspacing": "0",
            "cellpadding": "0",
            "border": "1"
        })
        if not parent_table:
            continue
        found_tables.append(str(parent_table))  # Store as HTML string
    return found_tables

def extract_transf_details(tables_list, file_names, output_dir):
    """
    Extracts SAP transformation details and ABAP routine code from HTML tables using the selected
    model and saves both to one text file per table.
    For every table two prompts are sent through call_model: one for the field mappings (CSV)
    and one for the ABAP routines (JSON). Failures are caught per table: an error log file is
    written to output_dir, the message is added to the global log_messages list, and the table
    is left out of both returned lists.
    Parameters:
    tables_list (list): List of HTML table strings.
    file_names (list): List of filenames for storing transformation details; entry i is used for table i.
    output_dir (str): Directory where transformation details will be saved.
    Returns:
    tuple: (summaries, routines) - two lists of model responses. Both always have the same
    length and entry i of each belongs to the same table.
    """
    summaries = []
    routines = []
    for index, table_html in enumerate(tables_list):
        extraction_prompt = f"""You are given an HTML structure below describing a SAP transformation.
        Focus on the Key Rules and/or the Data Rules. Extract the required fields in csv format with
        the following header: Source Name, Target Name, Source Field Name, Source Field Description,
        Source Field Data Type, Source Field Length, Rule Type, Target InfoObject Name, Target InfoObject Description,
        Target InfoObject Data Type, Target InfoObject Length. \n\nHTML structure:\n{table_html}"""
        abap_prompt = f"""You are given an HTML structure below describing a SAP transformation.
        Focus on the ABAP code present in the structure. Extract the required code and store in JSON format
        with the following keys: Start Routine, Global Code, Global Code 2, End Routine, Invers Endroutine.
        Ignore any statements that say '... "insert your code here', do not include them in the extract.
        If you cannot find code for one key simply leave it blank. \n\nHTML structure:\n{table_html}"""
        try:
            # Call the selected model once per prompt
            details_response = call_model(selected_model, prompt=extraction_prompt)
            abap_response = call_model(selected_model, prompt=abap_prompt)
            # Record the pair only after BOTH calls succeeded; appending the details before the
            # ABAP call would leave the two lists with different lengths when that call fails
            summaries.append(details_response)
            routines.append(abap_response)
            # Write the details and the routine code to a text file
            file_name = f"{file_names[index]}.txt"
            file_path = f"{output_dir}/{file_name}"
            dbutils.fs.put(file_path, f"Transformation details:\n{details_response}\n\nTransformation ABAP code:\n{abap_response}\n", overwrite=True)
        except Exception as e:
            log_message = f"Error processing table {index}: {e}"
            dbutils.fs.put(f"{output_dir}/error_{file_names[index]}.log", log_message, overwrite=True)
            log_messages.append(f"**Error**: {log_message}\n")
            print(log_message)
    return summaries, routines

def convert_transf(tables_details, routine_codes, file_names, output_dir):
    """
    Asks the selected model to turn each SAP transformation into Databricks SQL and stores the result.
    Entry i of tables_details, routine_codes and file_names must describe the same transformation.
    Failures of the model call or the file write are caught per entry: an error log file is written
    to output_dir and the message is added to the global log_messages list.
    Parameters:
    tables_details (list): List of table transformation details.
    routine_codes (list): List of transformation ABAP code.
    file_names (list): List of filenames for storing the generated code.
    output_dir (str): Directory where the generated code will be saved.
    Returns:
    list: Model-generated code for the entries that were converted successfully.
    """
    generated_code = []
    for position, details in enumerate(tables_details):
        # The prompt is built outside the try block, so a missing routine_codes entry raises to the caller
        conversion_prompt = f"""Below you are given two major pieces of a SAP transformation, the transformation
        details with source and target fields and the routine code used in the transformation. Using these details
        do the following:
        1. Write SQL code that runs on Databricks that builds the source table structure.
        2. Write SQL code that runs on Databricks that builds the target table structure.
        3. Write SQL code that runs on Databricks that writes data from source table to target table in the following order:
            i. 1:1 mappings (i.e. "[DIRECT]")
            ii. "[CONSTANT]" fields
            iii. Transformation rules/logic in the routine code
            iv. Any other logic deduced elsewhere (mention it explicitly in the output)
        Optimize the code for Databricks. Assume that the environment in Databricks has already been created,
        thus adhere strictly to generating the code for the two tables without any setup steps. Write comments
        explaining code logic. \n\nTRANSFORMATION DETAILS:\n{details} \n\nROUTINE CODE:\n{routine_codes[position]}"""
        try:
            model_answer = call_model(selected_model, prompt=conversion_prompt)
            generated_code.append(model_answer)
            # Persist the generated code next to the other validated files
            target_path = f"{output_dir}/{file_names[position]}.txt"
            dbutils.fs.put(target_path, f"Transformation code:\n{model_answer}", overwrite=True)
        except Exception as e:
            log_message = f"Error processing table {position}: {e}"
            error_log_path = f"{output_dir}/error_{file_names[position]}.log"
            dbutils.fs.put(error_log_path, log_message, overwrite=True)
            log_messages.append(f"**Error**: {log_message}\n")
            print(log_message)
    return generated_code

# Process the DataFlow
print(f"Processing DataFlow: {dataflow}")
current_date = datetime.now().strftime('%Y-%m-%d')
validated_directory_path = f"{validated_directory_base_path}{dataflow}/"
archive_date_folder = f"{archived_directory_base_path}{dataflow}/{current_date}/"
logs_directory_path = f"{logs_directory_base_path}{dataflow}/{current_date}/"
input_directory_path = f"{input_directory_base_path}{dataflow}/"

# Ensure output directories for the current dataflow exist
dbutils.fs.mkdirs(validated_directory_path)
dbutils.fs.mkdirs(archive_date_folder)
dbutils.fs.mkdirs(logs_directory_path)

# Get file info from config table (DataFlowBaseInfo).
# The widget value is compared through a Column expression instead of being pasted into a
# SQL string, so quotes or other special characters in the input cannot alter the query.
dataflowbaseinfo_table = spark.table("codeconverter_config.dataflowbaseinfo")
result_flowbaseinfo = dataflowbaseinfo_table.filter(dataflowbaseinfo_table["DataFlow"] == dataflow)
# first() returns None when no row matches; report that explicitly instead of failing on None[0]
flowbaseinfo_row = result_flowbaseinfo.select('DataFlowFilesSourceFolder').first()
if flowbaseinfo_row is None:
    raise ValueError(f"No entry found in codeconverter_config.dataflowbaseinfo for DataFlow '{dataflow}'")
dataflow_files_sourcepath_value = flowbaseinfo_row[0]

# Set the source directory path based on the configured folder
source_directory_path = f"{base_url}{dataflow_files_sourcepath_value}/"

# List files in the input directory for the current dataflow
# (dbutils.fs.ls is given input_directory_path, not source_directory_path)
files = dbutils.fs.ls(input_directory_path)

# Retrieve the BW-related files for the current DataFlow
dataflowtableinfo_table = spark.table("codeconverter_config.dataflowtableinfo")
bw_files_df = dataflowtableinfo_table.filter(
    (dataflowtableinfo_table["SourceSystem"] == "BW")
    & (dataflowtableinfo_table["DataFlow"] == dataflow)
).select("SAPFileName")

# Convert the result into a list of file names
bw_files = [row['SAPFileName'] for row in bw_files_df.collect()]

# Keep only BW source files for processing
bw_files_to_process = [file for file in files if file.name in bw_files]

# Initialize a list to store all log messages for the current DataFlow;
# the helper functions above append their error messages to it
log_messages = []

# Process each file in the source directory
for file_info in bw_files_to_process:
    file_path = file_info.path
    processed_html = process_html_spark(file_path)
    extracted_table = extract_first_table(processed_html)
    df = table_to_dataframe(extracted_table)
    # display() renders the frame itself and returns nothing, so it must not be embedded in a print
    print('extracted_table =')
    display(df)
    transformations_technical_names = extract_transformation_names(df)
    print(f'transformations_technical_names = {transformations_technical_names}')

    transformation_details = []
    transformation_routines = []
    transformation_code = []
    # Handle one transformation at a time. The helpers drop entries that have no table or whose
    # model call failed, so passing whole lists and pairing results by index can write output
    # under the wrong transformation's file name. The price is one HTML parse per name.
    for technical_name in transformations_technical_names:
        matched_tables = extract_tables_from_html(processed_html, [technical_name])
        if not matched_tables:
            log_message = f"No transformation table found for {technical_name} in file {file_info.name}"
            log_messages.append(f"**Error**: {log_message}\n")
            print(log_message)
            continue
        details, routines = extract_transf_details(matched_tables, [technical_name], logs_directory_path)
        # Anything other than exactly one details/routine pair means the extraction failed;
        # extract_transf_details has already logged the error
        if len(details) != 1 or len(routines) != 1:
            continue
        transformation_details.extend(details)
        transformation_routines.extend(routines)
        transformation_code.extend(convert_transf(details, routines, [technical_name], validated_directory_path))

    print(f'transformation_details = {transformation_details}')
    print(f'transformation_routines = {transformation_routines}')
    print(f'transformation_code = {transformation_code}')

# Move every processed BW source file into the dated archive folder and log the outcome per file
for file_info in bw_files_to_process:
    file_name, file_path = file_info.name, file_info.path
    archive_file_path = f"{archive_date_folder}{file_name}"
    try:
        dbutils.fs.mv(file_path, archive_file_path)
        log_message = f"Successfully archived the original file {file_name} to: {archive_file_path}"
        success_log_path = f"{logs_directory_path}archive_{file_name}.log"
        dbutils.fs.put(success_log_path, log_message, overwrite=True)
        print(log_message)
    except Exception as e:
        # A failed move or a failed success-log write ends up here; only errors go to log_messages
        log_message = f"Error archiving file {file_name}: {e}"
        error_log_path = f"{logs_directory_path}error_archive_{file_name}.log"
        dbutils.fs.put(error_log_path, log_message, overwrite=True)
        log_messages.append(f"**Error**: {log_message}\n")
        print(log_message)

# Persist every collected error message in one log file named after the current time of day
current_time = datetime.now().strftime('%H-%M-%S')
final_log_filename = "final_log_" + current_time + ".log"
final_log_file_path = logs_directory_path + final_log_filename
try:
    # One message per line; the file is written even when no errors were collected
    dbutils.fs.put(final_log_file_path, "\n".join(log_messages), overwrite=True)
except Exception as e:
    print(f"Error writing final log: {e}")
else:
    print(f"Final log written to: {final_log_file_path}")
