In [0]:
%run ./init

In [0]:
%python

import os
import copy
import xml.etree.ElementTree as ET
import tiktoken
import numpy as np
import pandas as pd
from datetime import datetime

In [0]:
%python

# Notebook-level flag marking this notebook as the "target" notebook.
# NOTE(review): it is not read anywhere in the visible cells; presumably it is
# consumed by a notebook pulled in via %run or by a downstream step - confirm.
is_target_notebook = True 

In [0]:
%python

# Storage location of the code-converter files (ADLS Gen2 container + account)
container_name = "codeconverter"  # Replace with your actual container name
sa = "stlpdel01dev"

# Authenticate with a custom access token; the token provider class is taken
# from the cluster's ADLS Gen2 passthrough setting.
spark.conf.set("fs.azure.account.auth.type", "CustomAccessToken")
spark.conf.set(
    "fs.azure.account.custom.token.provider.class",
    spark.conf.get("spark.databricks.passthrough.adls.gen2.tokenProviderClassName"),
)


In [0]:
%python

# Read the data flow to process from a notebook text widget (default: DF34).
# The label describes the data flow; it previously carried the label of the
# max-tokens dropdown.
dbutils.widgets.text("data_flow_insertion", "DF34", "Data flow to process")

# Normalise the value: surrounding whitespace is dropped and the ID is
# upper-cased before it is used in lookups and folder names.
data_flow_man = dbutils.widgets.get("data_flow_insertion").strip().upper()

# An empty ID would silently match nothing and create folders with an empty
# name segment, so fail early with a clear message instead.
if not data_flow_man:
    raise ValueError("Widget 'data_flow_insertion' is empty; enter a data flow ID (e.g. DF34).")

print(f"Selected data flow: {data_flow_man}")

In [0]:
# Look up the base configuration of the selected data flow (DataFlowBaseInfo).
# NOTE(review): the data flow ID comes from a widget and is interpolated into
# the SQL text; consider a parameterized query if the runtime supports it.
get_dataflowbaseinfo = f"""
    SELECT * FROM codeconverter_config.dataflowbaseinfo
    WHERE DataFlow = '{data_flow_man}'
"""

# Execute the query and assign to a DataFrame
result_flowbaseinfo = spark.sql(get_dataflowbaseinfo)

# Fetch the three needed columns with a single .first() call (one Spark job
# instead of three). first() returns None when no row matches.
base_info_row = result_flowbaseinfo.select(
    'DataFlow', 'DataFlowFilesSourceFolder', 'PathToPrompt'
).first()

# Without this guard an unknown data flow fails with an opaque
# "'NoneType' object is not subscriptable" error.
if base_info_row is None:
    raise ValueError(
        f"No row found in codeconverter_config.dataflowbaseinfo for DataFlow '{data_flow_man}'."
    )

# Get values from DataFlowBaseInfo table (positional, in select order)
dataflow_value, dataflow_files_sourcepath_value, prompt_value = base_info_row

In [0]:
# Echo the values resolved from the DataFlowBaseInfo lookup, one per line
print("\n".join([
    "DataFlow: " + dataflow_value,
    "Source Path: " + dataflow_files_sourcepath_value,
    "Prompt Path: " + prompt_value,
]))

In [0]:
%python

# Create a dropdown widget for selecting the maximum number of tokens
# (used as the token budget when splitting calculation-view files)
dbutils.widgets.dropdown("MAX_TOKENS", "120000", ["150000", "120000", "100000"], "Choose max number of tokens")
# Retrieve the selected budget as an integer (widget values are strings)
MAX_TOKENS = int(dbutils.widgets.get("MAX_TOKENS"))
# The message previously said "Selected model", which mislabelled the value
print(f"Selected max tokens: {MAX_TOKENS}")

In [0]:
# Root of the accelerator file tree inside the storage container
base_url = f"abfss://{container_name}@{sa}.dfs.core.windows.net/AcceleratorSAPFiles/"

# Date stamp (YYYY-MM-DD) used to partition the archive and log folders
current_date = datetime.now().strftime('%Y-%m-%d')

# Source folder comes from the DataFlowBaseInfo config row
source_directory_path = base_url + f"{dataflow_files_sourcepath_value}/"

# Per-data-flow folders for validated output and archived input.
# Only the paths are defined here; these two folders are not created.
target_directory_path = base_url + f"ValidatedFiles/{data_flow_man}/"
archive_directory_path = base_url + f"ArchivedFiles/{data_flow_man}/"

# Dated sub-folder of the archive; created right away
archive_date_folder = "/".join([archive_directory_path.rstrip('/'), current_date, ""])
dbutils.fs.mkdirs(archive_date_folder)

# Dated logs folder for this data flow; created right away
logs_directory_path = base_url + f"Logs/{data_flow_man}/{current_date}/"
dbutils.fs.mkdirs(logs_directory_path)

# Full path of the prompt file (config value is relative to base_url)
prompt_path = base_url + prompt_value

print(base_url, source_directory_path, sep="\n")

In [0]:
# Show the resolved prompt location
print(f"Prompt Path: {prompt_path}")

In [0]:
# List files in the source directory
files = dbutils.fs.ls(source_directory_path)

# Retrieve the HANA-related files
hana_files_query = f"SELECT SAPFileName FROM codeconverter_config.DataFlowTableInfo WHERE DataFlow = '{data_flow_man}' AND SourceSystem = 'HANA'"
hana_files_df = spark.sql(hana_files_query)

# Convert the result into a list of file names
hana_files = [row['SAPFileName'] for row in hana_files_df.collect()]

print(hana_files)

# Keep only files with source system HANA.
# A set gives constant-time membership checks instead of scanning the list
# once per listed file.
hana_file_names = set(hana_files)
files = [file for file in files if file.name in hana_file_names]

# Initialize a list to store all log messages
log_messages = []

# Configuration query: every DataFlowTableInfo row of the selected data flow
get_dataflowtableinfo = f"""
SELECT * FROM codeconverter_config.DataFlowTableInfo
WHERE DataFlow = '{data_flow_man}'
"""

# Execute the query and convert the results to Pandas DataFrame
result_flowtableinfo = spark.sql(get_dataflowtableinfo)
metadata_pd = result_flowtableinfo.toPandas()

# Cast 'group_id' to object dtype and replace nulls with 'withoutgroupid'.
# The result is assigned back to the column: the former
# `metadata_pd['group_id'].fillna(..., inplace=True)` was a chained in-place
# update, which pandas warns about and which does not modify the frame when
# Copy-on-Write is enabled.
metadata_pd['group_id'] = metadata_pd['group_id'].astype(object).fillna('withoutgroupid')

# Group file names by group_id; files without a group end up together under
# the 'withoutgroupid' key
grouped_files = metadata_pd.groupby('group_id')['SAPFileName'].apply(list).to_dict()

# Define default schema to cover cases where it's not defined
default_schema = 'nntst'

In [0]:
%python

def split_grouped_calc_views(input_path: str, output_dir: str, max_tokens: "int | None" = None):
    """Split a calculation-view XML file into several smaller XML files.

    The root element's children are divided into three parts: everything
    before the first child whose tag ends with ``calculationViews`` (header),
    that container itself, and everything after it (footer). The container's
    children are packed, in document order, into consecutive groups so that
    header + footer + group stays within the token budget. Each group is
    written to ``<output_dir>/<input file name>_grouped_<n>.xml`` together
    with a copy of the header and footer.

    Tokens are counted with tiktoken's ``cl100k_base`` encoding on the
    un-indented serialization of each element, so the counts approximate the
    size of the (indented) files that are written.

    A single view that does not fit the budget on its own is still written,
    alone in its own file, and a warning is printed.

    Args:
        input_path: Path of the XML file to split.
        output_dir: Directory receiving the generated files; created if it
            does not exist (only once the input has been parsed and checked).
        max_tokens: Token budget per output file. When omitted, the
            notebook-level ``MAX_TOKENS`` value is used.

    Returns:
        None. Progress is reported via ``print``. If the input file does not
        exist a message is printed and nothing is written.

    Raises:
        ValueError: If the root has no ``calculationViews`` child.
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    try:
        # Attempt to parse the input file
        tree = ET.parse(input_path)
    except FileNotFoundError:
        print(f"File {input_path} not found. Please check the file path and try again.")
        return

    # Resolve the budget at call time so the widget-driven global stays the default
    token_budget = MAX_TOKENS if max_tokens is None else max_tokens

    # 1) Split the root's children into header / <calculationViews> / footer
    root = tree.getroot()
    children = list(root)
    cv_idx = next(
        (i for i, el in enumerate(children) if el.tag.endswith("calculationViews")),
        None,
    )
    if cv_idx is None:
        # Previously surfaced as a bare StopIteration with no context
        raise ValueError(
            f"No 'calculationViews' element found under the root of {input_path}."
        )
    header_elems = children[:cv_idx]
    cv_container = children[cv_idx]
    footer_elems = children[cv_idx + 1:]

    # 2) Prepare tiktoken encoder
    enc = tiktoken.get_encoding("cl100k_base")

    def count_tokens(elements) -> int:
        # Token count of the concatenated XML serialization of the elements
        xml_text = "".join(
            ET.tostring(el, encoding="utf-8").decode("utf-8") for el in elements
        )
        return len(enc.encode(xml_text))

    # 3) Header and footer are repeated in every output file: count them once
    base_tokens = count_tokens(header_elems) + count_tokens(footer_elems)

    os.makedirs(output_dir, exist_ok=True)
    # NOTE: the base name keeps the input's extension, so outputs look like
    # "<name>.<ext>_grouped_<n>.xml"; kept as-is to not change file names.
    base_filename = os.path.basename(input_path)

    def write_group(views, views_tokens, file_idx):
        # Build root -> header, <calculationViews> with the views, footer.
        # Element/SubElement copy the attribute dicts they are given.
        # Deep copies are needed because ET.indent rewrites text/tail in place.
        new_root = ET.Element(root.tag, root.attrib)
        new_root.extend([copy.deepcopy(el) for el in header_elems])
        new_cv = ET.SubElement(new_root, cv_container.tag, cv_container.attrib)
        new_cv.extend([copy.deepcopy(cv) for cv in views])
        new_root.extend([copy.deepcopy(el) for el in footer_elems])

        # Write file
        ET.indent(new_root, space="  ")
        out_file = os.path.join(output_dir, f"{base_filename}_grouped_{file_idx}.xml")
        ET.ElementTree(new_root).write(out_file, encoding="utf-8", xml_declaration=True)

        total = base_tokens + views_tokens
        print(
            f"Wrote {out_file} — header+footer: {base_tokens}, "
            f"{len(views)} views: {views_tokens}, total: {total}"
        )

    # 4) Accumulate views into groups, flushing whenever the budget would be exceeded
    group = []
    group_tokens = 0
    file_idx = 1
    for single_cv in cv_container:
        cv_tok = count_tokens([single_cv])

        # If adding this view would exceed the budget, write the current group first
        if group and base_tokens + group_tokens + cv_tok > token_budget:
            write_group(group, group_tokens, file_idx)
            file_idx += 1
            group = []
            group_tokens = 0

        if base_tokens + cv_tok > token_budget:
            # Cannot be split further here; emit it anyway but make it visible
            print(
                f"Warning: a single view needs {base_tokens + cv_tok} tokens "
                f"(incl. header+footer), above the budget of {token_budget}."
            )

        # Then start (or continue) a group
        group.append(single_cv)
        group_tokens += cv_tok

    # 5) Write any remaining views
    if group:
        write_group(group, group_tokens, file_idx)

# Example invocation of split_grouped_calc_views.
# NOTE(review): input_path looks like a catalog/table identifier rather than
# a path to an XML file - confirm the intended input.
output_dir = "split_grouped_by_calcview"
input_path = "lpdbwlpbdt01devdev.gold_vta_lego_upo_mpo.pubs_base"
split_grouped_calc_views(input_path=input_path, output_dir=output_dir)

In [0]:
%sql
-- Ad-hoc check: list the dataflowtableinfo config rows whose
-- DataBricksTableName contains 'PUBS'
SELECT * FROM hive_metastore.codeconverter_config.dataflowtableinfo
WHERE DataBricksTableName LIKE '%PUBS%'


In [0]:
%run ./init

In [0]:
import certifi

scope_name = "OpenAI-scope"
secret_name = "OpenAI-certificate"

# NOTE(review): scope_name / secret_name are not used below and the
# certificate text is a hard-coded placeholder; presumably the PEM content is
# meant to be read from that secret scope - confirm.
secret_value = """
placeholder
"""

print("---- Retrieve CA ----")

# Path of the CA bundle file shipped with certifi
ca_cert = certifi.where()

# Re-running this cell used to append the certificate again on every run and
# grow the bundle. Check first whether the certificate text is already there.
with open(ca_cert, 'r', encoding='utf-8') as ca_bundle:
    already_appended = secret_value.strip() in ca_bundle.read()

if already_appended:
    print(f"---- Certificate already present in: {ca_cert} ----")
else:
    print("---- Appending new SSL to CA ----")

    with open(ca_cert, 'a', encoding='utf-8') as custom_certificate:
        custom_certificate.write("\n# Custom appended certificate \n")
        custom_certificate.write(secret_value)
        custom_certificate.write("\n")

    print(f"---- Successfully appended to: {ca_cert} ----")
 

In [0]:
# Smoke test: send a trivial prompt through call_model_o1.
# NOTE(review): call_model_o1 is not defined in the visible cells; presumably
# it is provided by the ./init notebook run above - confirm.
call_model_o1("o1", "Hello?")