In [1]:
from utils import read_config, OracleAgent, MySQLAgent
import re
import pandas as pd
import os
from py2neo import Graph, Node, Relationship
from langchain.prompts import PromptTemplate
from langchain_openai.chat_models import ChatOpenAI
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate
)

from langchain.chat_models  import AzureChatOpenAI

import google.generativeai as genai
from langchain_google_genai import (
    ChatGoogleGenerativeAI,
    HarmBlockThreshold,
    HarmCategory,
)

import prompts
import ast


from models.neo4jmodels import config, db, BaseTable, JoinTable, AggregatTable, UnionTable, View


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
config.DATABASE_URL = "neo4j://neo4j:yfy12345@138.3.214.21:7687"

delete all nodes

In [3]:
def clear_all_nodes(db):
    delete_query = "MATCH (n) DETACH DELETE n"

    db.cypher_query(delete_query)

    print("All data has been deleted from the Neo4j database.")
clear_all_nodes(db)

All data has been deleted from the Neo4j database.


check current nodes

In [4]:
def check_all_nodes(db):
    query = """
    MATCH (a)-[r:FROM]->(b)
    RETURN a, r, b
    """

    cypher_results, meta = db.cypher_query(query)
    if cypher_results:
        results_as_dict = [dict(zip(meta, row)) for row in cypher_results]
        return results_as_dict
    else:
        return None
    

results_as_dict = check_all_nodes(db)
results_as_dict

### Read data

In [13]:
bidb_view = pd.read_csv('./result/nice_result_1.csv')
bidb_view

Unnamed: 0,view_name,text,input,lineage
0,C$_0W_YFY_AV_TW_R,"select ""C1_ROW_ID"",""C2_ORG_ID"",""C3_REF_AV_HEAD...","select ""C1_ROW_ID"",""C2_ORG_ID"",""C3_REF_AV_HEAD...","Result = {\n""Union1"": ""\nDatasource = ['W_YFY_..."
1,C$_0W_YFY_IND_FIN_INFO_FS,"select FIN_INFO.SEQ C1_SEQ, FIN_INFO.ACC...","select FIN_INFO.SEQ C1_SEQ, FIN_INFO.ACC...","Result = {\n""Query"" : ""\nDatasource = ['ODS.TC..."
2,OP_FACT_CHP_INVENTORY_ETH_PULP,"SELECT PERIOD_NAME,STOCK_DATE TDATE,ORG_CODE,'...","SELECT PERIOD_NAME,STOCK_DATE TDATE,ORG_CODE,'...","Result = {\n""Union1"":""\n Datasource=['W_FAC..."
3,OP_FACT_CHP_INVENTORY_REDEFINE,"SELECT PERIOD_NAME,TDATE,ORG_CODE ,CASE ORG_...","SELECT PERIOD_NAME,TDATE,ORG_CODE ,CASE ORG_...","Result = {\n""Union1"" : ""\nDatasource = ['W_YFY..."
4,W_OSH_TARGET_NEW_V,"select ""PERIOD_YEAR"",""DATA_LEVEL"",""COUNTRY_ID""...","select ""PERIOD_YEAR"",""DATA_LEVEL"",""COUNTRY_ID""...","Result = {\n""Union1"" : ""\nDatasource = ['w_osh..."


### Help functions

In [14]:
# old extraction function
def extract_lists(input_string):
    # Initialize dictionary to hold the lists
    extracted_data = {}
    
    # Use regex to find the Datasource part
    datasource_match = re.search(r"Datasource\s*=\s*(\[.*?\])", input_string, re.DOTALL)
    if datasource_match:
        datasource_section = datasource_match.group(1).strip()
        extracted_data["Datasource"] = ast.literal_eval(datasource_section)
    
    # Use regex to find the Relationship part
    relationship_match = re.search(r"Relationship\s*=\s*(\[.*?\])", input_string, re.DOTALL)
    if relationship_match:
        relationship_section = relationship_match.group(1).strip()
        # Split the relationship section into individual items
        relationship_list = re.findall(r'\((.*?)\)', relationship_section, re.DOTALL)
        
        # Process each item in the relationship list
        processed_list = []
        for item in relationship_list:
            # Replace any variation of "FILTER" with "Filter"
            item = re.sub(r'^FILTER\s*:?', 'Filter:', item, flags=re.IGNORECASE)
            # Remove any newline characters and extra spaces
            item = re.sub(r'\s+', ' ', item.strip())
            processed_list.append(item)
        
        extracted_data["Relationship"] = processed_list

    return extracted_data["Datasource"], extracted_data["Relationship"]

In [None]:
def result_destructure(input_string):

    """
    1. Separate each Union as one dictionary
    2. Extract Filter and Join as list, Groupby as dictionary 
    """


    return 

In [15]:
def parse_result_string(result_string):
    """
    Parses the given result string and structures it into a dictionary.
    """
    # Initialize the main dictionary
    result_dict = {}

    # Clean up the input string
    cleaned_string = result_string.strip().replace('\n', '').replace('\t', '').replace('\r', '')

    # Split the string into different Unions and Final
    pattern = r'"(Union\d+|Final)"\s*:\s*"([^"]*)"'
    matches = re.findall(pattern, cleaned_string)

    if not matches:
        # Handle Union3 which seems to have a different format
        pattern = r'"(Union\d+|Final)":\s*"([^"]*)"'
        matches = re.findall(pattern, cleaned_string)

    for match in matches:
        union_name = match[0]
        union_content = match[1]

        # Initialize the union dictionary
        union_dict = {}

        # Extract Datasource
        datasource_pattern = r'Datasource\s*=\s*(\[[^\]]*\])'
        datasource_match = re.search(datasource_pattern, union_content)
        if datasource_match:
            datasource_str = datasource_match.group(1)
            try:
                datasource_list = ast.literal_eval(datasource_str)
            except Exception:
                datasource_list = []
        else:
            datasource_list = []
        union_dict['Datasource'] = datasource_list

        # Extract Filter
        filter_pattern = r'Filter\s*=\s*(\[[^\]]*\])'
        filter_match = re.search(filter_pattern, union_content)
        if filter_match:
            filter_str = filter_match.group(1)
            try:
                filter_list = ast.literal_eval(filter_str)
            except Exception:
                filter_list = []
        else:
            filter_list = []
        union_dict['Filter'] = filter_list

        # Extract Join
        join_pattern = r'Join\s*=\s*(\[[^\]]*\])'
        join_match = re.search(join_pattern, union_content)
        if join_match:
            join_str = join_match.group(1)
            try:
                join_list = ast.literal_eval(join_str)
            except Exception:
                join_list = []
        else:
            join_list = []
        union_dict['Join'] = join_list

        # Extract Groupby
        groupby_pattern = r'Groupby\s*=\s*(\{[^\}]*\}|\[[^\]]*\]|\"[^\"]*\")'
        groupby_match = re.search(groupby_pattern, union_content)
        if groupby_match:
            groupby_str = groupby_match.group(1)
            # Determine if it's a dictionary or list
            if groupby_str.startswith('{') and groupby_str.endswith('}'):
                try:
                    groupby_dict = ast.literal_eval(groupby_str)
                except Exception:
                    groupby_dict = {}
            elif groupby_str.startswith('[') and groupby_str.endswith(']'):
                try:
                    groupby_list = ast.literal_eval(groupby_str)
                    groupby_dict = {i: None for i in groupby_list}
                except Exception:
                    groupby_dict = {}
            else:
                groupby_dict = {}
        else:
            groupby_dict = {}
        union_dict['Groupby'] = groupby_dict

        # Add the union dictionary to the main result dictionary
        result_dict[union_name] = union_dict

    return result_dict


In [18]:
parse_result_string(bidb_view.iloc[3].lineage)

{'Union1': {'Datasource': ['W_YFY_INV_BALANCE_F',
   'w_chp_item_d',
   'W_YFY_ORG_D',
   'W_CHP_TS_BELONG_ORG_R'],
  'Filter': [],
  'Join': [],
  'Groupby': {}},
 'Union2': {'Datasource': ['W_YFY_INV_BALANCE_F',
   'w_chp_item_d',
   'W_YFY_ORG_D',
   'W_CHP_TS_BELONG_ORG_R'],
  'Filter': [],
  'Join': [],
  'Groupby': {}},
 'Final': {'Datasource': ['(All Union Tables)'],
  'Filter': [],
  'Join': [],
  'Groupby': {}}}

In [7]:
extracted_lineage = {}
for idx, row in bidb_view.iterrows():

    datasource_list, relationship_list = extract_lists(row.lineage)
    
    extracted_lineage[row.view_name] = {
        'datasource': datasource_list,
        'relationship': relationship_list
    }

In [11]:
test_key = list(extracted_lineage.keys())[2]
test_value = extracted_lineage[test_key]


test_dict = {test_key: test_value}
test_dict

{'OP_FACT_CHP_INVENTORY_ETH_PULP': {'datasource': ['W_FACTORY_INV_BALANCE_F',
   'W_FACTORY_INV_F'],
  'relationship': ["Filter: W_FACTORY_INV_F.ORG_CODE = 'ETH'",
   "Filter: W_FACTORY_INV_F.MEMO = '花蓮自製漿'",
   'JOIN: W_FACTORY_INV_F.STOCK_DATE = (SELECT max(stock_date',
   "CURRENT_DATE, 'yyyy/mm'"]}}

In [9]:
for key, item in test_dict.items():
    table_list = item['datasource']
    relationship = item['relationship']

In [10]:
relationship

["Filter: W_CHP_ORDER_TYPE_D.USED = '損益'",
 "Filter: W_CHP_SALES_NOMANUAL_F.ORDER_LINE_STATUS = 'AWAITING_SHIPPING'",
 'Filter: GREATEST(W_CHP_SALES_NOMANUAL_F.REQUEST_DATE, W_CHP_SALES_NOMANUAL_F.SCHEDULE_SHIP_DATE',
 'ADD_MONTHS(sysdate, -2',
 'JOIN: W_CHP_SALES_NOMANUAL_F.order_type = W_CHP_ORDER_TYPE_D.order_type',
 'JOIN: W_CHP_SALES_NOMANUAL_F.line_order_type = W_CHP_ORDER_TYPE_D.order_line_type',
 'JOIN: W_CHP_SALES_NOMANUAL_F.PAPER_STAT_GROUP_SALES = W_CHP_PAPER_SALES_ORNT_R.PAPER_STAT_GROUP_SALES(+',
 'JOIN: W_CHP_SALES_NOMANUAL_F.paper_stat_element = W_CHP_PAPER_STAT_GROUP_R.paper_stat_element(+',
 "Filter: W_CHP_SALES_NOMANUAL_F.om_customer_no <> 'S0001'",
 "Filter: W_CHP_SALES_NOMANUAL_F.om_customer_no NOT LIKE 'Z%' OR W_CHP_SALES_NOMANUAL_F.om_customer_no = 'Z5170'",
 'JOIN: W_CHP_SALES_NOMANUAL_F.ORG_CODE = W_YFY_ORG_D.ORG_CODE',
 'JOIN: W_CHP_SALES_NOMANUAL_F.PAPER_STAT_GROUP_SALES = (SELECT DISTINCT PAPER_STAT_GROUP_SALES FROM W_CHP_PAPER_STAT_GROUP_R',
 'Filter: W_CHP_

In [11]:
for key, item in test_dict.items():
    view_name = key
    table_list = item['datasource']
    relationship = item['relationship']
    relationship = [item.replace("(", "").replace(")", "").replace(",", "") for item in relationship]

    for table in table_list:
        for rel in relationship:
            if table in rel:
                print(f"table:{table}, rel:{rel}")

                view_node = View.nodes.get_or_create()(name=view_name)

                table_node = BaseTable.nodes.get_or_create()(name=table)



                if "Filter" in rel:
                    # connect view to the table with the conditions
                    view_node.parent.connect(table_node)

                    table_node.filter = rel
                    table_node.save()

table:W_FACTORY_INV_F, rel:Filter: W_FACTORY_INV_F.ORG_CODE = 'ETH'
we here
table:W_FACTORY_INV_F, rel:Filter: W_FACTORY_INV_F.MEMO = '花蓮自製漿'
we here
table:W_FACTORY_INV_F, rel:JOIN: W_FACTORY_INV_F.STOCK_DATE = SELECT maxstock_date


In [12]:
relationship

["Filter: W_FACTORY_INV_F.ORG_CODE = 'ETH'",
 "Filter: W_FACTORY_INV_F.MEMO = '花蓮自製漿'",
 'JOIN: W_FACTORY_INV_F.STOCK_DATE = SELECT maxstock_date',
 "CURRENT_DATE 'yyyy/mm'"]

In [13]:
View.nodes.all()

[<View: {'name': 'OP_FACT_CHP_INVENTORY_ETH_PULP', 'element_id_property': '4:1106eaef-578b-4e12-bf10-90067c3c032e:13'}>]

In [14]:
Table.nodes.all()

[<Table: {'name': 'W_FACTORY_INV_F', 'filter': "Filter: W_FACTORY_INV_F.MEMO = '花蓮自製漿'", 'groupby': None, 'element_id_property': '4:1106eaef-578b-4e12-bf10-90067c3c032e:14'}>]

In [18]:
class Person(StructuredNode):
    names = ArrayProperty(StringProperty(), required=True)

bob = Person(names=['bob', 'rob', 'robert']).save()

NodeClassAlreadyDefined: Class __main__.Person with labels Person already defined:
Table --> <class '__main__.Table'>
View --> <class '__main__.View'>
Person --> <class '__main__.Person'>
