In [1]:
from utils import read_config, OracleAgent, MySQLAgent
import re
import pandas as pd
import os
from py2neo import Graph, Node, Relationship
from neomodel import db, config, StructuredNode, RelationshipTo, RelationshipFrom, StringProperty
from langchain.prompts import PromptTemplate
from langchain_openai.chat_models import ChatOpenAI
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate
)

from langchain.chat_models  import AzureChatOpenAI

import google.generativeai as genai
from langchain_google_genai import (
    ChatGoogleGenerativeAI,
    HarmBlockThreshold,
    HarmCategory,
)

import prompts
import ast


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
config.DATABASE_URL = "neo4j://neo4j:yfy12345@138.3.214.21:7687"

In [3]:
# neomodel
class Table(StructuredNode):
    name = StringProperty(uniqued_index=False)
    filter = RelationshipTo('Table', 'Filter')
    join = RelationshipTo('Table', 'Join')
    groupby = RelationshipTo('Table', 'Groupby')

# class View(StructuredNode):
#     name = StringProperty(unique_index=True)
#     source = RelationshipTo(Table, 'IS_FROM')
#     filter = StringProperty(uniqued_index=False)
#     join = StringProperty(uniqued_index=False)
#     groupby = StringProperty(uniqued_index=False)


delete all nodes

In [4]:
def clear_all_nodes(db):
    delete_query = "MATCH (n) DETACH DELETE n"

    db.cypher_query(delete_query)

    print("All data has been deleted from the Neo4j database.")
clear_all_nodes(db)

All data has been deleted from the Neo4j database.


check current nodes

In [5]:
def check_all_nodes(db):
    query = """
    MATCH (a)-[r:FROM]->(b)
    RETURN a, r, b
    """

    cypher_results, meta = db.cypher_query(query)
    if cypher_results:
        results_as_dict = [dict(zip(meta, row)) for row in cypher_results]
        return results_as_dict
    else:
        return None
    

results_as_dict = check_all_nodes(db)
results_as_dict

### Read data

In [6]:
bidb_view = pd.read_csv('./result/temp1.csv')
bidb_view

Unnamed: 0,view_name,text,source,input,lineage
0,C$_0W_YFY_AV_TW_R,"select ""C1_ROW_ID"",""C2_ORG_ID"",""C3_REF_AV_HEAD...",from ( select W_YFY_AV_TW_R.ROW_ID C1_ROW...,"select ""C1_ROW_ID"",""C2_ORG_ID"",""C3_REF_AV_HEAD...",Datasource = ['ODS.W_YFY_AV_TW_R']\n\nRelation...
1,C$_0W_YFY_IND_FIN_INFO_FS,"select FIN_INFO.SEQ C1_SEQ, FIN_INFO.ACC...",from ODS.TC$_0W_YFY_IND_FIN_INFO_FS FIN_INFO,"select FIN_INFO.SEQ C1_SEQ, FIN_INFO.ACC...",Datasource = ['ODS.TC$_0W_YFY_IND_FIN_INFO_FS'...
2,OP_FACT_CHP_INVENTORY_ETH_PULP,"SELECT PERIOD_NAME,STOCK_DATE TDATE,ORG_CODE,'...",FROM W_FACTORY_INV_BALANCE_F UNION ALL SELECT ...,"SELECT PERIOD_NAME,STOCK_DATE TDATE,ORG_CODE,'...","Datasource = ['W_FACTORY_INV_BALANCE_F', 'W_FA..."
3,OP_FACT_CHP_INVENTORY_REDEFINE,"SELECT PERIOD_NAME,TDATE,ORG_CODE ,CASE ORG_...","FROM ( SELECT F.PERIOD_NAME,F.BALANCE_DATE T...","SELECT PERIOD_NAME,TDATE,ORG_CODE ,CASE ORG_...",### Summary\n\nDatasource = ['W_YFY_INV_BALANC...
4,OP_FACT_CHP_SALES_DETAILS,"SELECT '當月受訂' TYPE, --GREATEST(F.REQU...","from W_CHP_SALES_NOMANUAL_F F ,W_CHP_ORDER...","SELECT '當月受訂' TYPE, --GREATEST(F.REQU...",### Datasource\n```\nDatasource = ['W_CHP_SALE...


### Help functions

In [7]:
def extract_lists(input_string):
    # Initialize dictionary to hold the lists
    extracted_data = {}
    
    # Use regex to find the Datasource part
    datasource_match = re.search(r"Datasource\s*=\s*(\[.*?\])", input_string, re.DOTALL)
    if datasource_match:
        datasource_section = datasource_match.group(1).strip()
        extracted_data["Datasource"] = ast.literal_eval(datasource_section)
    
    # Use regex to find the Relationship part
    relationship_match = re.search(r"Relationship\s*=\s*(\[.*?\])", input_string, re.DOTALL)
    if relationship_match:
        relationship_section = relationship_match.group(1).strip()
        # Split the relationship section into individual items
        relationship_list = re.findall(r'\((.*?)\)', relationship_section, re.DOTALL)
        
        # Process each item in the relationship list
        processed_list = []
        for item in relationship_list:
            # Replace any variation of "FILTER" with "Filter"
            item = re.sub(r'^FILTER\s*:?', 'Filter:', item, flags=re.IGNORECASE)
            # Remove any newline characters and extra spaces
            item = re.sub(r'\s+', ' ', item.strip())
            processed_list.append(item)
        
        extracted_data["Relationship"] = processed_list

    return extracted_data["Datasource"], extracted_data["Relationship"]

In [8]:
extracted_lineage = {}
for idx, row in bidb_view.iterrows():

    datasource_list, relationship_list = extract_lists(row.lineage)
    
    extracted_lineage[row.view_name] = {
        'datasource': datasource_list,
        'relationship': relationship_list
    }

In [9]:
test_key = list(extracted_lineage.keys())[2]
test_value = extracted_lineage[test_key]


test_dict = {test_key: test_value}
test_dict

{'OP_FACT_CHP_INVENTORY_ETH_PULP': {'datasource': ['W_FACTORY_INV_BALANCE_F',
   'W_FACTORY_INV_F'],
  'relationship': ["Filter: W_FACTORY_INV_F.ORG_CODE = 'ETH'",
   "Filter: W_FACTORY_INV_F.MEMO = '花蓮自製漿'",
   'JOIN: W_FACTORY_INV_F.STOCK_DATE = (SELECT max(stock_date',
   "CURRENT_DATE, 'yyyy/mm'"]}}

In [11]:
for key, item in test_dict.items():
    table_list = item['datasource']
    relationship = item['relationship']

    for table in table_list:
        for rel in relationship:
            # clean
            rel_cleaned = rel.replace("(", "").replace(")", "").replace(",", "")
            if table in rel_cleaned:
                print(f"table:{table}, rel:{rel_cleaned}")
                
                # check if the table exists
                source_table = Table.nodes.get_or_none(name=table)

                # create the node if it doesn't exist
                if source_table is None:
                    table_node = Table(name=table).save()

                if "Filter" in rel_cleaned:
                    # Add filter which needs in creating the view

SyntaxError: incomplete input (3492085272.py, line 20)

In [None]:
relationship

["Filter: W_FACTORY_INV_F.ORG_CODE = 'ETH'",
 "Filter: W_FACTORY_INV_F.MEMO = '花蓮自製漿'",
 'JOIN: W_FACTORY_INV_F.STOCK_DATE = (SELECT max(stock_date',
 "CURRENT_DATE, 'yyyy/mm'"]

In [None]:
Table.nodes.all()

[<Table: {'name': 'W_FACTORY_INV_F', 'filter': "Filter: W_FACTORY_INV_F.MEMO = '花蓮自製漿'", 'join': None, 'groupby': None, 'element_id_property': '4:1106eaef-578b-4e12-bf10-90067c3c032e:6'}>]

In [None]:
for key, item in extracted_lineage.items():
    table_list = item['datasource']
    relationship = item['relationship']

    for table in table_list:
        for rel in relationship:
            # clean
            rel_cleaned = rel.replace("(", "").replace(")", "").replace(",", "")
            if table in rel_cleaned:
                print(f"table:{table}, rel:{rel_cleaned}")
                
                # Find the source node
                source_table = Table.nodes.get(name=table)

                if source_table == False:
                    # create Node
                    source_table = Node("table", name=table)

                property_dictionary = {}
                # Filter: add relationship in the node
                if "Filter" in rel_cleaned:
                    property_dictionary['Filter'] = rel_cleaned
                # JOIN: create relationship between nodes
                elif "JOIN" in rel_cleaned:
                    property_dictionary['JOIN'] = rel_cleaned

table:W_FACTORY_INV_F, rel:Filter: W_FACTORY_INV_F.ORG_CODE = 'ETH'


TableDoesNotExist: (TableDoesNotExist(...), "{'name': 'W_FACTORY_INV_F'}")