In [0]:
%run ./init

In [0]:
import re
import os
import pandas as pd
from bs4 import BeautifulSoup
from openai import AzureOpenAI
from datetime import datetime
import time
import lxml
import math

In [0]:
# Create a dropdown widget for selecting the model type.
# Choices are "o1", "o1-mini" and "o1-preview"; the second argument ("o1") is the default value.
dbutils.widgets.dropdown("model_choice", "o1", ["o1", "o1-mini", "o1-preview"], "Choose model type")
# Retrieve the selected model type
selected_model = dbutils.widgets.get("model_choice")
print(f"Selected model: {selected_model}")

In [0]:
# Function to call the appropriate model based on selected type
def call_model(selected_model, prompt):
    """
    Send ``prompt`` to the model deployment named by ``selected_model``.

    Parameters:
    selected_model (str): One of "o1", "o1-preview" or "o1-mini". The value is
        passed through unchanged as the deployment name.
    prompt (str): Prompt text forwarded to the model helper.
    Returns:
    Whatever ``call_model_o1`` returns for the given deployment and prompt.
    Raises:
    ValueError: If ``selected_model`` is not one of the supported choices.
    """
    # NOTE(review): call_model_o1 is not defined in this notebook; presumably it
    # comes from the `%run ./init` cell -- confirm. The original code called
    # `callmodelo1` for two of the three choices, a name used nowhere else here.
    supported_deployments = ("o1", "o1-preview", "o1-mini")
    if selected_model not in supported_deployments:
        raise ValueError("Invalid model choice")
    return call_model_o1(deployment_name=selected_model, prompt=prompt)

In [0]:
# Technical names of transformations to restrict processing to.
# NOTE(review): in the visible notebook this list is only referenced by a commented-out
# filter in the driver loop, so it currently has no effect. The trailing comment keeps
# alternative names that were used previously.
list_of_filtered_trans =['01DQ1YT1TSXD859H6OJ6Y69PN4ZVNZQX'] #['087DSWSR0WU60CWHZ9OEEZB5N1J9RAK8', '048S2CO3DJUJ0RSSGU6N8HM2V22QF1RE', '0KVSPP1Y4QU15UUF4BS9UOOKJXJT32RN','0STEEWOV3FBPK2SKQUCOSZW6O8M4QLKP']

In [0]:
pip install html_form_to_dict

In [0]:
import html_form_to_dict

In [0]:
# Define paths
# Walk the "html/" tree and remember the last file found together with the
# directory that contains it. os.walk yields sub-directory paths without a
# trailing separator, so the file path is built with os.path.join; plain string
# concatenation only produced a valid path for the top-level "html/" directory.
for subdir, _, files in os.walk("html/"):
    for file in files:
        HTML_FILE = os.path.join(subdir, file)
        base_url = subdir
        input_directory_base_path = subdir

# Define functions
def process_html_spark(file_path):
    """
    Load an HTML file through Spark and strip its <HEAD> section and <img> tags.

    Parameters:
    file_path (str): Path of the HTML file, relative to the current working directory.
    Returns:
    str: The file content joined with newlines, without the <HEAD>...</HEAD>
         section and without any <img ...> tags (both matched case-insensitively).
    """
    # Spark reads the file as text, one row per line, from the local file system
    source_uri = f"file:{os.getcwd()}/{file_path}"
    line_rows = spark.read.text(source_uri).collect()
    markup = "\n".join(row['value'] for row in line_rows)
    # DOTALL lets the head section span several lines
    without_head = re.sub(r'<HEAD>.*?</HEAD>', '', markup, flags=re.DOTALL | re.IGNORECASE)
    return re.sub(r'<img([\w\W]+?)>', '', without_head, flags=re.IGNORECASE)

def extract_first_table(html):
    """
    Return the first borderless flex-box table (border="2") found in ``html``.

    The opening tag must match the exact attribute string below; matching is
    case-insensitive and stops at the first closing </table> tag.

    Parameters:
    html (str): HTML content to search.
    Returns:
    str: The matched table markup, or the literal string "Table not found".
    """
    pattern = re.compile(
        r'(<table width="600" class="SAPBEXBorderlessFlexBox" cellspacing="0" cellpadding="0" border="2">.*?</table>)',
        re.DOTALL | re.IGNORECASE,
    )
    found = pattern.search(html)
    if found is None:
        return "Table not found"
    return found.group(1)

def table_to_dataframe(html):
    """
    Convert the 'SAPBEXTableGrid' table inside ``html`` into a pandas DataFrame.

    Only rows with exactly three <td> cells are used (object type, name,
    technical name); the first row is treated as a header and skipped. When the
    object-type cell is empty, the last non-empty object type seen is reused.

    For rows whose object type contains both "data" and "source" and whose
    technical name contains "hana" (all case-insensitive):
    * the technical name is cut at the first whitespace and the remainder
      becomes 'Source System';
    * 'HANA CV Name' is the last token of the name after replacing the
      delimiters '/', ':', '-' and '>' with spaces.

    Parameters:
    html (str): HTML markup containing the table.
    Returns:
    pandas.DataFrame: Columns "Object Type", "Name", "Technical Name",
        "Source System", "HANA CV Name". Empty if the table is not found.
    """
    columns = ["Object Type", "Name", "Technical Name", "Source System", "HANA CV Name"]
    soup = BeautifulSoup(html, "html.parser")
    # Find the table that contains the data
    data_table = soup.find("table", class_="SAPBEXTableGrid")
    # If no table is found, return an empty DataFrame with expected columns
    if not data_table:
        print("Table with class 'SAPBEXTableGrid' not found.")
        return pd.DataFrame(columns=columns)

    data = []
    current_object_type = ""

    # Iterate over table rows, skipping the header row
    for row in data_table.find_all("tr")[1:]:
        cells = row.find_all("td")
        if len(cells) != 3:
            continue

        own_object_type = cells[0].text.strip()
        object_type = own_object_type or current_object_type
        name = cells[1].text.strip()  # Keep the original name unchanged
        technical_name = cells[2].text.strip()
        source_system = ""
        hana_cv_name = ""

        object_type_lower = object_type.lower()
        if "data" in object_type_lower and "source" in object_type_lower and "hana" in technical_name.lower():
            # Collapse whitespace runs, then split "<technical name> <source system>"
            technical_name = re.sub(r'\s+', ' ', technical_name).strip()
            parts = technical_name.split(" ", 1)
            technical_name = parts[0]
            if len(parts) > 1:
                source_system = parts[1].strip()
            # The hyphen is escaped so it is a literal delimiter. Unescaped,
            # ':->' was read as the character range ':'..'>' which skipped '-'
            # and matched ';', '<' and '=' instead.
            name_tokens = re.sub(r'[/:\->]', ' ', name).split()
            # An empty or delimiter-only name yields no tokens; leave the CV
            # name blank instead of raising IndexError.
            hana_cv_name = name_tokens[-1] if name_tokens else ""

        data.append([object_type, name, technical_name, source_system, hana_cv_name])
        if own_object_type:
            current_object_type = own_object_type

    # Create a pandas DataFrame with additional columns
    return pd.DataFrame(data, columns=columns)

def extract_transformation_names(df):
    """
    Collect the technical names of all transformation objects.

    Parameters:
    df (pandas.DataFrame): Frame with 'Object Type' and 'Technical Name' columns.
    Returns:
    list: 'Technical Name' values of the rows whose 'Object Type' contains
          'Transformation' (case-insensitive; missing object types do not match).
    """
    is_transformation = df['Object Type'].str.contains('Transformation', case=False, na=False)
    technical_names = df.loc[is_transformation, 'Technical Name']
    return technical_names.tolist()

def extract_tables_from_html(html, transformation_names):
    """
    Find, for each transformation, the table that encloses its <a name="TRFN{name}"> anchor.

    Parameters:
    html (str): The HTML content to search.
    transformation_names (list): Transformation technical names.
    Returns:
    list: HTML strings of the matching parent tables, in the order of
          ``transformation_names``. Names without an anchor, or whose anchor
          has no parent table with the expected attributes, are skipped.
    """
    # Attributes the enclosing <table> must carry to be accepted
    parent_table_attrs = {
        "width": "600",
        "class": "SAPBEXBorderlessFlexBox",
        "cellspacing": "0",
        "cellpadding": "0",
        "border": "1",
    }
    document = BeautifulSoup(html, "html.parser")
    collected = []
    for transformation in transformation_names:
        # Only the first anchor with this name is considered
        anchor = document.find("a", {"name": f"TRFN{transformation}"})
        if not anchor:
            continue
        enclosing_table = anchor.find_parent("table", parent_table_attrs)
        if enclosing_table:
            collected.append(str(enclosing_table))  # Store as HTML string
    return collected

def extract_transf_details(tables_list, file_names, output_dir):
    """
    Parses SAP transformation HTML tables with pandas and displays the cleaned field mappings.

    For every HTML string the first table returned by ``pd.read_html`` is taken and
    all-empty rows are dropped. Further handling depends on the column count:
    - 2 columns: header fields (technical name, descriptions, source, target, routine)
      are sliced out of the last row's text into a one-row ``transformation_info``
      DataFrame, which is not used afterwards.
    - 11 columns: the field-mapping rows are cleaned, the header fields and routine
      text are attached as extra columns, and the result is shown with ``display``.
    Tables with any other column count are ignored.

    The Azure OpenAI extraction step (prompts, model calls, writing text files) is
    commented out at the end of the loop, so ``file_names`` and ``output_dir`` are
    currently unused and both returned lists are always empty.

    Parameters:
    tables_list (list): List of HTML table strings.
    file_names (list): List of filenames for storing transformation details (unused while the model step is commented out).
    output_dir (str): Directory where transformation details would be saved (unused while the model step is commented out).
    Returns:
    tuple: (summaries, routines) - two lists that are only filled by the commented-out model step.
    """
    summaries = []
    routines = []
    for index, table_html in enumerate(tables_list):
        # pd.read_html returns one DataFrame per <table> found in the string; only the first is used
        dfs = pd.read_html(table_html)
        table_html_df = dfs[0].dropna(how='all')
        if len(table_html_df.columns) == 2:
            table_html_df.columns = ['Logic', 'Field_name']   
            table_processed_df = table_html_df

            # The text of the last row's 'Logic' cell is searched for the marker strings below.
            # NOTE(review): str.find returns -1 for a missing marker, so an absent marker
            # produces a wrong slice instead of an error.
            table_description_html = table_processed_df.iloc[-1]['Logic']

            start_index = table_description_html.find("Routine ") + len("Routine ")
            end_index = table_description_html.find(" Required Objects Object Type Name Technical Name Routine")
            routine_value = table_description_html[start_index:end_index] 

            start_index = table_description_html.find("Technical Name: ") + len("Technical Name: ")
            end_index = table_description_html.find(" Description (Short): ")
            technical_name_value = table_description_html[start_index:end_index]

            start_index = table_description_html.find("Description (Short): ") + len("Description (Short): ")
            end_index = table_description_html.find(" Description (Long): ")
            short_desciption_value = table_description_html[start_index:end_index]

            start_index = table_description_html.find("Description (Long): ") + len("Description (Long): ")
            end_index = table_description_html.find(" Object Version: ")
            long_desciption_value = table_description_html[start_index:end_index]

            start_index = table_description_html.find("Source: ") + len("Source: ")
            end_index = table_description_html.find(" Target: ")
            source_value = table_description_html[start_index:end_index]

            # The target has no end marker: a fixed 20 characters after "Target: " are taken
            start_index = table_description_html.find("Target: ") + len("Target: ")
            target_value = table_description_html[start_index:start_index+20]

            # One-row summary of the header fields; it is not returned or displayed
            transformation_info = pd.DataFrame({
                'Technical_name': [technical_name_value],
                'Short_description': [short_desciption_value],
                'Long_description': [long_desciption_value],
                'Source': [source_value],
                'Target': [target_value],
                'Routine': [routine_value]
            })

            #display(transformation_info)
            
        if len(table_html_df.columns) == 11:
            table_html_df.columns = ['Logic', 'Field_name', 'Description', 'Data_Type', 'Length', 'Rule_Type', 'Target', 'InfoObject', 'Description1', 'Data_Type1', 'Length1']
            
            # Drop rows whose Field_name starts with "Unit Info", then keep only rows that have a
            # Field_name or an InfoObject; the index is reset so labels equal positions (0..n-1)
            table_html_df = table_html_df[table_html_df['Field_name'].str[:9] != "Unit Info"]
            table_html_df = table_html_df[(table_html_df['Field_name'].notnull()) | (table_html_df['InfoObject'].notnull())].reset_index(drop=True)

            # Rows in which every cell holds the same value (excluding the last two rows),
            # keyed by row index with the repeated text as value
            same_value_rows = {index: row['Logic'] for index, row in table_html_df.iterrows() if len(set(row)) == 1 and index not in [table_html_df.index.max(), table_html_df.index.max() - 1]}
            # NOTE(review): rule_rows is not referenced again in this function
            rule_rows = {index: row['Rule_Type'] for index, row in table_html_df.iterrows()}

            # Merge an entry with the directly following row (index + 1) when that row's text
            # contains 'Constant': the texts are joined with a newline and the follower is removed.
            # NOTE(review): the dict is mutated while positions are recomputed via list(...) and the
            # range stops three short of the original length - confirm this is intended.
            for i in range(len(same_value_rows) -3):
                if list(same_value_rows.keys())[i+1] - list(same_value_rows.keys())[i] == 1 and 'Constant' in str(list(same_value_rows.values())[i+1]):
                    same_value_rows[list(same_value_rows.keys())[i]] = same_value_rows[list(same_value_rows.keys())[i]] + '\n' + same_value_rows[list(same_value_rows.keys())[i+1]]
                    del same_value_rows[list(same_value_rows.keys())[i+1]]
                
            # Alias, not a copy: writes through table_processed_df also change table_html_df
            table_processed_df = table_html_df

            # Write each same-value row's text into the 'Logic' cell of the row one above it, or
            # two above if the row directly above has no Rule_Type.
            # NOTE(review): `index` reuses the name of the enumerate() loop variable above; iloc is
            # positional while .at is label-based, which coincide here because of reset_index.
            for key, value in same_value_rows.items():
                index = key - 1
                if not(pd.isna(table_processed_df.iloc[index]['Rule_Type'])):
                    table_processed_df.at[index, 'Logic'] = str(value)
                else:
                    index -= 1
                    if not(pd.isna(table_processed_df.iloc[index]['Rule_Type'])):
                        table_processed_df.at[index, 'Logic'] = str(value)

            # Remove the same-value rows (again excluding the last two rows)
            for index, row in table_processed_df.iterrows():
                if len(set(row)) == 1 and index not in [table_html_df.index.max(), table_html_df.index.max() - 1]:
                    table_processed_df = table_processed_df.drop(index)

            # The text of the last row's 'Logic' cell is searched for the marker strings below.
            # NOTE(review): str.find returns -1 for a missing marker, so an absent marker
            # produces a wrong slice instead of an error.
            table_description_html = table_processed_df.iloc[-1]['Logic']

            start_index = table_description_html.find("Technical Name: ") + len("Technical Name: ")
            end_index = table_description_html.find(" Description (Short): ")
            technical_name_value = table_description_html[start_index:end_index]

            start_index = table_description_html.find("Description (Short): ") + len("Description (Short): ")
            end_index = table_description_html.find(" Description (Long): ")
            short_desciption_value = table_description_html[start_index:end_index]

            start_index = table_description_html.find("Description (Long): ") + len("Description (Long): ")
            end_index = table_description_html.find(" Object Version: ")
            long_desciption_value = table_description_html[start_index:end_index]

            start_index = table_description_html.find("Source: ") + len("Source: ")
            end_index = table_description_html.find(" Target: ")
            source_value = table_description_html[start_index:end_index]

            # The target has no end marker: a fixed 20 characters after "Target: " are taken
            start_index = table_description_html.find("Target: ") + len("Target: ")
            target_value = table_description_html[start_index:start_index+20]
            # Routine text: the start and end positions are each overridden by the later checks,
            # so the last matching marker wins for both
            if 'Routine' in table_description_html:
                start_index = table_description_html.find("Code for Start Routine ") + len("Code for Start Routine ")
                if "Inverse Startroutine Routine Code for Inverse Start Routine" in table_description_html:
                    start_index = table_description_html.find("Inverse Startroutine Routine Code for Inverse Start Routine ") + len("Inverse Startroutine Routine Code for Inverse Start Routine ")
                if "Routine Code for End Routine" in table_description_html:
                    start_index = table_description_html.find("Routine Code for End Routine ") + len("Routine Code for End Routine ")
                end_index = table_description_html.find(" Required Objects Object Type Name Technical Name")
                if "Source Field name Description Data Type Length Rule Type Target InfoObject Description Data Type Length" in table_description_html:
                    end_index = table_description_html.find(" Source Field name Description Data Type Length Rule Type Target InfoObject Description Data Type Length")
                if "Source InfoObject Description Data Type Length Rule Type Target InfoObject Description Data Type Length" in table_description_html:
                    end_index = table_description_html.find(" Source InfoObject Description Data Type Length Rule Type Target InfoObject Description Data Type Length")
                routine_value = table_description_html[start_index:end_index]
            else:
                routine_value = None

            # Drop the last two rows, replace the per-row 'Target' column with the header's
            # target value and prepend the other header fields as constant columns
            table_processed_df = table_processed_df.drop(table_processed_df.tail(2).index)
            table_processed_df = table_processed_df.drop(columns=['Target'])
            table_processed_df.insert(0, 'Target', target_value)
            table_processed_df.insert(0, 'Source', source_value)
            table_processed_df.insert(0, 'Long_description', long_desciption_value)
            table_processed_df.insert(0, 'Short_description', short_desciption_value)
            table_processed_df.insert(0, 'Technical_name', technical_name_value)
            # The routine text is attached only to rows that have a 'Logic' value
            table_processed_df.loc[table_processed_df['Logic'].notnull(), 'Routine'] = routine_value
            table_processed_df = table_processed_df[['Technical_name','Short_description','Long_description','Source','Target','Field_name','Logic','Description','Data_Type','Length','Rule_Type','InfoObject','Description1','Data_Type1', 'Length1', 'Routine']]
            #display(table_processed_df.assign(result=table_processed_df.to_dict(orient='records')).loc[table_processed_df['Routine'].notnull().idxmax(), ['Technical_name', 'Routine', 'result']].to_frame().T)
            display(table_processed_df)

        # Disabled model-based extraction step; while it stays commented out,
        # summaries and routines are never filled.
        # extraction_prompt = f"""You are given an HTML structure below describing a SAP transformation.
        # Focus on the Key Rules and/or the Data Rules. Extract the required fields in csv format with
        # the following header: Source Name, Target Name, Source Field Name, Source Field Description,
        # Source Field Data Type, Source Field Length, Rule Type, Target InfoObject Name, Target InfoObject Description,
        # Target InfoObject Data Type, Target InfoObject Length. \n\nHTML structure:\n{table_html}"""
        # abap_prompt = f"""You are given an HTML structure below describing a SAP transformation.
        # Focus on the ABAP code present in the structure. Extract the required code and store in JSON format
        # with the following keys: Start Routine, Global Code, Global Code 2, End Routine, Invers Endroutine.
        # Ignore any statements that say '... "insert your code here', do not include them in the extract.
        # If you cannot find code for one key simply leave it blank. \n\nHTML structure:\n{table_html}"""
        # try:
        #     # Call Azure OpenAI model for chat completion
        #     details_response = call_model(selected_model, prompt=extraction_prompt)
        #     #details_response = call_model_o1(deployment_name="o1", prompt=extraction_prompt)
        #     summaries.append(details_response)
        #     abap_response = call_model(selected_model, prompt=abap_prompt)
        #     #abap_response = call_model_o1(deployment_name="o1", prompt=abap_prompt)
        #     routines.append(abap_response)
        #     # Write the summary to a text file
        #     file_name = f"{file_names[index]}.txt"
        #     file_path = f"{output_dir}/{file_name}"
        #     dbutils.fs.put(file_path, f"Transformation details:\n{details_response}\n\nTransformation ABAP code:\n{abap_response}\n", overwrite=True)
        # except Exception as e:
        #     log_message = f"Error processing table {index}: {e}"
        #     log_messages.append(f"**Error**: {log_message}\n")
        #     #print(log_message)
    return summaries, routines

def convert_transf(tables_details, routine_codes, file_names, output_dir):
    """
    Ask the selected model to turn each SAP transformation into Databricks SQL and store the answer.

    For every entry of ``tables_details`` a prompt is built from the details and the
    routine code at the same position, sent through ``call_model`` with the globally
    selected model, and the answer is written to ``{output_dir}/{file_name}.txt``
    via ``dbutils.fs.put``. Failures of the model call or the write are appended to
    the global ``log_messages`` list and processing continues with the next entry.

    Parameters:
    tables_details (list): Transformation details, one entry per transformation.
    routine_codes (list): Transformation ABAP code, indexed like ``tables_details``.
    file_names (list): Base filenames (without extension) for the output files.
    output_dir (str): Directory where the generated code is saved.
    Returns:
    list: Model answers for all entries whose model call succeeded.
    """
    generated_code = []
    for position, details in enumerate(tables_details):
        conversion_prompt = f"""Below you are given two major pieces of a SAP transformation, the transformation
        details with source and target fields and the routine code used in the transformation. Using these details
        do the following:
        1. Write SQL code that runs on Databricks that builds the source table structure.
        2. Write SQL code that runs on Databricks that builds the target table structure.
        3. Write SQL code that runs on Databricks that writes data from source table to target table in the following order:
            i. 1:1 mappings (i.e. "[DIRECT]")
            ii. "[CONSTANT]" fields
            iii. Transformation rules/logic in the routine code
            iv. Any other logic deduced elsewhere (mention it explicitly in the output)
        Optimize the code for Databricks. Assume that the environment in Databricks has already been created,
        thus adhere strictly to generating the code for the two tables without any setup steps. Write comments
        explaining code logic. \n\nTRANSFORMATION DETAILS:\n{details} \n\nROUTINE CODE:\n{routine_codes[position]}"""
        try:
            answer = call_model(selected_model, prompt=conversion_prompt)
            # The answer is kept even if writing the file fails afterwards
            generated_code.append(answer)
            target_path = f"{output_dir}/{file_names[position]}.txt"
            dbutils.fs.put(target_path, f"Transformation code:\n{answer}", overwrite=True)
        except Exception as e:
            failure = f"Error processing table {position}: {e}"
            log_messages.append(f"**Error**: {failure}\n")
    return generated_code


current_date = datetime.now().strftime('%Y-%m-%d')
input_directory_path = f"{input_directory_base_path}/"

# List entries in the source directory for the current dataflow
files = [f for f in os.listdir(input_directory_path)]

# Keep only regular files for processing; os.listdir also returns sub-directories,
# which cannot be read as HTML.
bw_files_to_process = [
    file for file in files
    if os.path.isfile(os.path.join(input_directory_base_path, file))
]

# Initialize a list to store all log messages for the current DataFlow
# (convert_transf appends to it when a model call or file write fails)
log_messages = []

# Process each file in the source directory
for file_info in bw_files_to_process:
    # Build the path from the file being iterated. The loop previously read the
    # single HTML_FILE path on every iteration, so one file was processed repeatedly.
    file_path = os.path.join(input_directory_base_path, file_info)
    processed_html = process_html_spark(file_path)
    extracted_table = extract_first_table(processed_html)
    df = table_to_dataframe(extracted_table)
    transformations_technical_names = extract_transformation_names(df)
    # Optional filter on list_of_filtered_trans is currently disabled
    transformations_technical_names = [filtered_transf for filtered_transf in transformations_technical_names]# if filtered_transf in list_of_filtered_trans]
    transformation_tables = extract_tables_from_html(processed_html, transformations_technical_names)
    transformation_details, transformation_routines = extract_transf_details(transformation_tables, transformations_technical_names, '/transformation_detail')
    # NOTE(review): extract_transf_details currently returns two empty lists (its model
    # step is commented out), so convert_transf has nothing to convert.
    transformation_code = convert_transf(transformation_details, transformation_routines, transformations_technical_names, '/transformation_detail')

# Surface errors that were collected instead of raised
if log_messages:
    print("".join(log_messages))
