In [1]:
import sys
import os
# project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))  # Go up one level to the root
# if project_root not in sys.path:
#     sys.path.append(project_root)

from src.utils import read_config, OracleAgent
import re

from py2neo import Graph, Node, Relationship
from langchain.prompts import PromptTemplate
from langchain_openai.chat_models import ChatOpenAI
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate
)


import google.generativeai as genai
from langchain_google_genai import (
    ChatGoogleGenerativeAI,
    HarmBlockThreshold,
    HarmCategory,
)


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Read the provider API keys from the JSON config file and export them as
# environment variables; O_KEY is also kept as a plain variable for later cells.
configs = read_config("../.env/info.json")
os.environ.update(GOOGLE_API_KEY=configs["gkey"])

O_KEY = configs["okey"]
os.environ.update(OPENAI_API_KEY=O_KEY)

In [5]:
# Re-read the config file and build the OracleAgent from the 'DW_conn_info'
# settings; dw_agent is what the following cell uses to read the view definitions.
configs = read_config("../.env/info.json")
DWDB = configs["DW_conn_info"]
dw_agent = OracleAgent(DWDB)

FileNotFoundError: [WinError 3] 系統找不到指定的路徑。: './opt/oracle/'

In [4]:
# Name and defining SQL text of every view owned by the YFYDW schema.
query = (
    "\n"
    "    SELECT view_name, text FROM ALL_Views\n"
    "    where owner = 'YFYDW'\n"
)

view_info = dw_agent.read_table(query=query)

In [5]:
# Preview only the first rows instead of rendering the whole frame.
view_info.head()

Unnamed: 0,view_name,text
0,WACES_CARBON_EMS01_DF_V,"SELECT\n ORG.ORG_NAME,\n CHECKTYPE_CODE,..."
1,WACES_CARBON_EMS02_MF_V,"select \n org.org_name,\n c.check_date p..."


In [6]:
# re.DOTALL lets '.' match newline characters, so the capture runs to the end of the SQL text.
# re.IGNORECASE makes the search case-insensitive, so it matches "FROM", "from", "From", etc.
# \b keeps the keyword from matching inside a longer identifier such as "date_from".
FROM_CLAUSE_RE = re.compile(r'\bFROM\s+(.*)', re.DOTALL | re.IGNORECASE)


def extract_from_clause(view_sql):
    """Return `view_sql` from its first FROM keyword to the end of the text.

    Returns None when `view_sql` is not a string (for example a missing value)
    or contains no FROM keyword, instead of raising AttributeError on the
    failed match.

    The first FROM keyword wins, so a FROM that appears earlier inside the
    select list is picked up instead of the main clause.
    """
    if not isinstance(view_sql, str):
        return None
    match = FROM_CLAUSE_RE.search(view_sql)
    return match.group(0) if match else None


view_info['data_source'] = view_info['text'].apply(extract_from_clause)

In [7]:
# Preview the first rows, now including the extracted data_source column.
view_info.head()

Unnamed: 0,view_name,text,data_source
0,WACES_CARBON_EMS01_DF_V,"SELECT\n ORG.ORG_NAME,\n CHECKTYPE_CODE,...","FROM WACES_CARBON_EMS_DF F,\n DIM_ORG O..."
1,WACES_CARBON_EMS02_MF_V,"select \n org.org_name,\n c.check_date p...","from WBIMF_PRODUCTION_INDS_MF p,\n (sele..."


In [13]:
# Chat model that receives the prompt built in the next cell.
llm = ChatOpenAI(
    model="gpt-4o-mini",
    openai_api_key=O_KEY,
)

In [14]:
# System prompt with one worked example. The model is asked to return two sections,
# "Datasource = [...]" and "Relationship = [...]", which extract_lists parses later.
# In the example, the second JOIN maps `c.check_date = p.period_date(+)`, so the
# left-hand column must be WACES_CARBON_EMS_DF.check_date (alias c), not period_date.
system_template = """
    I will provide the table_name and the corresponing Oracle SQL. 
    Recongize all the unique identifer such as DIM_ORG org, the unique identifier is org.
    Understand the relationship, such as org.org_code which means the column org_code in DIM_ORG.
    Return the answer with the ORIGINAL table name instead of the unique identifier.

    Following is the example, give me the answer based on real case:
    ### Oracle SQL:
            from WBIPD_PRODUCTION_INDS_MF p,
            (select 
                    org_code,
                    check_date,
                    EMISSION_CATEGORY,
                    sum(co2_qty) co2_qty
                from WACES_CARBON_EMS_DF
                where 1=1
                and checktype_name = '月盤查'
                group by org_code, check_date,EMISSION_CATEGORY) c,
            DIM_ORG org
        where 1=1
        and c.org_code = p.org_code(+)
        and c.check_date = p.period_date(+)
        and org.org_code = c.org_code
        and org.rpt_used = 'ESG碳排分析'
    ### Result:

        Datasource = ['WACES_CARBON_EMS_DF', 'WBIPD_PRODUCTION_INDS_MF', 'DIM_ORG']

        Relationship = 
        [
        (GROUPBY: WACES_CARBON_EMS_DF filter with checktype_name = '月盤查', group by org_code, check_date,EMISSION_CATEGORY),
        (JOIN: WACES_CARBON_EMS_DF.org_code = WBIPD_PRODUCTION_INDS_MF.org_code(+)),
        (JOIN: WACES_CARBON_EMS_DF.check_date = WBIPD_PRODUCTION_INDS_MF.period_date(+)),
        (JOIN: DIM_ORG.org_code = WACES_CARBON_EMS_DF.org_code),
        (Filter: DIM_ORG.rpt_used = 'ESG碳排分析'),
        ]

    Just export the summary without any other description.
    
    table_name: {table_name}

    datasource: {datasource}

    """

# The same two variables fill both the system and the human message.
messages = [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template("{table_name}, {datasource}")
]

CHAT_PROMPT = ChatPromptTemplate.from_messages(messages)

In [15]:
# Pipe the prompt into the model and run it for the first view only.
first_row = view_info.iloc[0]
chain = CHAT_PROMPT | llm
input_data = dict(
    table_name=first_row.view_name,
    datasource=first_row.data_source,
)
llm_response = chain.invoke(input_data)

In [16]:
# FROM clause that was sent to the model for the first view.
view_info["data_source"].iloc[0]

"FROM WACES_CARBON_EMS_DF F,\n       DIM_ORG  ORG\n WHERE 1=1\n   AND ORG.RPT_USED = 'ESG碳排分析'\n   AND F.ORG_CODE = ORG.ORG_CODE"

In [17]:
# Print the model's reply so its line breaks are rendered.
print(llm_response.content)

Datasource = ['WACES_CARBON_EMS_DF', 'DIM_ORG']

Relationship = 
[
(JOIN: WACES_CARBON_EMS_DF.org_code = DIM_ORG.org_code),
(Filter: DIM_ORG.rpt_used = 'ESG碳排分析'),
]


In [18]:
# Same reply as an escaped string, exposing the exact newlines the parser has to deal with.
llm_response.content

"Datasource = ['WACES_CARBON_EMS_DF', 'DIM_ORG']\n\nRelationship = \n[\n(JOIN: WACES_CARBON_EMS_DF.org_code = DIM_ORG.org_code),\n(Filter: DIM_ORG.rpt_used = 'ESG碳排分析'),\n]"

In [19]:
import ast

# A "Datasource =" / "Relationship =" label at the start of a line, indentation allowed.
_SECTION_LABEL_RE = re.compile(r"^[ \t]*(Datasource|Relationship)[ \t]*=", re.MULTILINE)
# Relationship entries are separated by a comma that ends a line.
_ITEM_SEPARATOR_RE = re.compile(r",[ \t]*\r?\n")


def _clean_relationship_item(raw_item):
    """Drop surrounding whitespace, the trailing separator comma and one wrapping (...) pair."""
    item = raw_item.strip().rstrip(",").strip()
    if item.startswith("(") and item.endswith(")"):
        item = item[1:-1].strip()
    return item


def extract_lists(input_string):
    """Parse the model reply into its "Datasource" and "Relationship" lists.

    The reply is expected to contain sections of the form
    ``Datasource = ['A', 'B']`` and ``Relationship = [ (JOIN: ...), (Filter: ...), ]``.
    Sections are located by their label, so leading indentation, the order of
    the sections and the number of blank lines between them do not matter.

    Args:
        input_string: Text returned by the model. Empty or None yields ``{}``.

    Returns:
        dict that may contain:
          * "Datasource": the list obtained by evaluating the bracketed literal.
          * "Relationship": list of strings, one per entry, without the wrapping
            parentheses and trailing comma. Text inside an entry is kept as is,
            including ``(+)`` markers and commas.
        A key is absent when its section is not present in the text.

    Raises:
        ValueError, SyntaxError: from ``ast.literal_eval`` when the Datasource
            section is not a valid Python literal.
    """
    extracted_data = {}
    if not input_string:
        return extracted_data

    labels = list(_SECTION_LABEL_RE.finditer(input_string))
    for index, label in enumerate(labels):
        # A section runs from just after its "=" up to the next label (or the end of the text).
        is_last = index + 1 == len(labels)
        section_end = len(input_string) if is_last else labels[index + 1].start()
        section = input_string[label.end():section_end]

        open_at, close_at = section.find("["), section.rfind("]")
        has_brackets = open_at != -1 and close_at > open_at

        if label.group(1) == "Datasource":
            # Evaluate only the bracketed literal so stray text around it is ignored.
            literal = section[open_at:close_at + 1] if has_brackets else section.strip()
            extracted_data["Datasource"] = ast.literal_eval(literal)
        else:
            inner = section[open_at + 1:close_at] if has_brackets else section
            cleaned = (_clean_relationship_item(piece) for piece in _ITEM_SEPARATOR_RE.split(inner))
            extracted_data["Relationship"] = [item for item in cleaned if item]

    return extracted_data

# Parse the model reply from the cells above.
input_string = llm_response.content
result = extract_lists(input_string)

# Unpack both sections, defaulting to an empty list when one is missing.
datasource_list, relationship_list = (
    result.get(section, []) for section in ("Datasource", "Relationship")
)

print(f"Datasource List: {datasource_list}")
print(f"Relationship List: {relationship_list}")


Datasource List: ['WACES_CARBON_EMS_DF', 'DIM_ORG']
Relationship List: ['JOIN: WACES_CARBON_EMS_DF.org_code = DIM_ORG.org_code', "Filter: DIM_ORG.rpt_used = 'ESG碳排分析'"]


In [20]:
# Spot-check the first parsed table name.
datasource_list[0]

'WACES_CARBON_EMS_DF'

In [68]:
# Create graph
# Connection settings are read from the environment so the password is not stored
# in the notebook. NEO4J_HOST and NEO4J_USER fall back to the previously used
# values; NEO4J_PASSWORD has no default and must be set before running this cell.
graph = Graph(
    host=os.environ.get("NEO4J_HOST", "138.3.214.21"),
    auth=(os.environ.get("NEO4J_USER", "neo4j"), os.environ["NEO4J_PASSWORD"]),
)

In [69]:
# Build one py2neo "FROM" relationship object for every (table, relationship) pair in
# which the table name occurs in the relationship text. This cell only builds in-memory
# objects; it does not write to the graph. The objects from the final iteration
# (source_table, related_source_table, property_dictionary, relationship, rel_item)
# stay in scope and are read by the cells below.
result_dict = {}
for source_item in datasource_list:
    # Every table except the current one; with none there is no second node to link to.
    other_sources = [name for name in datasource_list if name != source_item]

    for rel_item in relationship_list:
        # Plain substring test, so a table whose name is a prefix of another also matches.
        if source_item not in rel_item:
            continue

        result_dict[source_item] = rel_item  # only the last matching relationship is kept
        if not other_sources:
            continue

        # Prefer the table(s) this relationship actually names. Fall back to the first
        # other table when it names none (e.g. a Filter on a single table).
        related_source = [name for name in other_sources if name in rel_item] or other_sources
        source_table = Node("table", name=source_item)
        related_source_table = Node("table", name=related_source[0])

        # rel_item is stored verbatim: stripping every parenthesis and comma would also
        # remove Oracle's (+) outer-join marker and the commas inside GROUP BY lists.
        property_dictionary = {}
        if "Filter" in rel_item:
            property_dictionary['condition'] = rel_item
        elif "JOIN" in rel_item:
            property_dictionary['merge'] = rel_item

        relationship = Relationship(source_table, "FROM", related_source_table, **property_dictionary)

In [70]:
# Look up FROM relationships in the graph between the two nodes left over from the loop.
node_pair = (source_table, related_source_table)
existing_relationships = graph.match(node_pair, r_type="FROM")

In [71]:
# Store the relationship unless an equivalent FROM edge already links the two nodes.
# Only the properties that are set on the new relationship are compared, so a property
# that is absent on both sides (None == None) no longer counts as a match.
try:
    relationship_exists = any(
        all(rel[key] == value for key, value in property_dictionary.items())
        for rel in existing_relationships
    )
except Exception as exc:
    # Best effort, as before: when the lookup itself fails the relationship is created.
    # NOTE(review): presumably this happens while the nodes are not stored yet — confirm.
    print(f"Could not check existing relationships ({exc!r}); creating the relationship.")
    relationship_exists = False

if not relationship_exists:
    graph.create(relationship)


In [72]:
# True when a stored FROM edge carries the same value for every property set on the new
# relationship. The name is always assigned (an empty lookup gives False), and properties
# missing on both sides are not treated as equal.
relationship_exists = any(
    all(rel[key] == value for key, value in property_dictionary.items())
    for rel in existing_relationships
)

In [73]:
# Display the result of the duplicate check.
relationship_exists

True

In [74]:
# Rebuild the property dict for the relationship text left over from the loop:
# drop all parentheses and commas, then file it under 'condition' or 'merge'.
rel_item = rel_item.translate(str.maketrans("", "", "(),"))
if "Filter" in rel_item:
    property_dictionary = {'condition': rel_item}
elif "JOIN" in rel_item:
    property_dictionary = {'merge': rel_item}
else:
    property_dictionary = {}
property_dictionary

{'condition': "Filter: DIM_ORG.rpt_used = 'ESG碳排分析'"}

In [75]:
# Fetch every FROM relationship together with the nodes at both ends.
query = (
    "\n"
    "MATCH (a)-[r:FROM]->(b)\n"
    "RETURN a, r, b\n"
)
result = graph.run(query)

# Print start node, relationship and end node of each record, one per line.
for record in result:
    start_node = record['a']
    print(f"Node A: {start_node}")
    edge = record['r']
    print(f"Relationship: {edge}")
    end_node = record['b']
    print(f"Node B: {end_node}")


Node A: (_2:table {name: 'DIM_ORG'})
Relationship: (DIM_ORG)-[:FROM {condition: "Filter: DIM_ORG.rpt_used = 'ESG\u78b3\u6392\u5206\u6790'"}]->(WACES_CARBON_EMS_DF)
Node B: (_3:table {name: 'WACES_CARBON_EMS_DF'})


In [76]:
# Run the FROM-edge lookup again and let the notebook render the result as a table.
query = "\n".join(["", "MATCH (a)-[r:FROM]->(b)", "RETURN a, r, b", ""])

result = graph.run(query)
result

a,r,b
(_2:table {name: 'DIM_ORG'}),"(DIM_ORG)-[:FROM {condition: ""Filter: DIM_ORG.rpt_used = 'ESG\u78b3\u6392\u5206\u6790'""}]->(WACES_CARBON_EMS_DF)",(_3:table {name: 'WACES_CARBON_EMS_DF'})


In [33]:
# Delete all nodes together with their relationships in one statement, so a failure
# cannot leave the graph with relationships removed but nodes still present.
# WARNING: the MATCH has no filter, so this empties the whole database.
graph.run("MATCH (n) DETACH DELETE n")