In [11]:
from utils import read_config, OracleAgent
import re
import os
from langchain.prompts import PromptTemplate
from langchain_openai.chat_models import ChatOpenAI
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate
)


import google.generativeai as genai
from langchain_google_genai import (
    ChatGoogleGenerativeAI,
    HarmBlockThreshold,
    HarmCategory,
)


In [12]:
# Read the local config file and publish both API keys as environment
# variables; O_KEY additionally keeps the OpenAI key available as a Python name.
configs = read_config(".env/info.json")

os.environ["GOOGLE_API_KEY"] = configs['gkey']
O_KEY = os.environ["OPENAI_API_KEY"] = configs['okey']

In [13]:
# Read the config file and build the agent used to query the DW database.
configs = read_config(".env/info.json")
# BIDB = configs['BIDB_conn_info']
# bi_agent = OracleAgent(BIDB)
# Connection settings stored under the 'DW_conn_info' key of the config.
DWDB = configs['DW_conn_info']
# NOTE(review): OracleAgent comes from the project's utils module; presumably
# it opens or wraps an Oracle connection from these settings — confirm.
dw_agent = OracleAgent(DWDB)



In [14]:
# Fetch the name and defining SQL text of every view owned by schema YFYDW
# from the ALL_VIEWS data-dictionary view.
query = """
    SELECT view_name, text FROM ALL_Views
    where owner = 'YFYDW'
"""

# NOTE(review): read_table presumably returns a pandas DataFrame (later cells
# call .iloc and .apply on the result) — confirm against utils.OracleAgent.
view_info = dw_agent.read_table(query=query)

In [15]:
# Inspect the view names and SQL definitions returned by read_table.
view_info

Unnamed: 0,view_name,text
0,WACES_CARBON_EMS01_DF_V,"SELECT\n ORG.ORG_NAME,\n CHECKTYPE_CODE,..."
1,WACES_CARBON_EMS02_MF_V,"select \n org.org_name,\n c.check_date p..."


In [16]:
# Matches from the first standalone FROM keyword to the end of the statement.
#   \b            - FROM must start a word, so identifiers such as VALID_FROM are skipped
#   re.DOTALL     - lets '.' match newlines, because view definitions span many lines
#   re.IGNORECASE - case-insensitive, so "FROM", "from", "From" all match
# Limitation: a FROM that appears inside the select list (e.g. EXTRACT(YEAR FROM ...)
# or a scalar subquery) is still taken as the start of the data source.
_FROM_CLAUSE = re.compile(r'\bFROM\s+(.*)', re.DOTALL | re.IGNORECASE)


def _extract_data_source(sql_text):
    """Return the tail of a view definition, starting at its first FROM keyword.

    Args:
        sql_text: SQL text of one view definition.

    Returns:
        The matched text (FROM keyword included), or None when ``sql_text``
        is not a string or contains no FROM clause.
    """
    # NULL / NaN text values cannot be searched; report them as "no data source".
    if not isinstance(sql_text, str):
        return None
    found = _FROM_CLAUSE.search(sql_text)
    # search() returns None when there is no FROM clause; calling .group() on
    # that would raise AttributeError and abort the whole .apply().
    return found.group(0) if found else None


view_info['data_source'] = view_info['text'].apply(_extract_data_source)

In [17]:
# Display the frame again to check the newly added data_source column.
view_info

Unnamed: 0,view_name,text,data_source
0,WACES_CARBON_EMS01_DF_V,"SELECT\n ORG.ORG_NAME,\n CHECKTYPE_CODE,...","FROM WACES_CARBON_EMS_DF F,\n DIM_ORG O..."
1,WACES_CARBON_EMS02_MF_V,"select \n org.org_name,\n c.check_date p...","from WBIPD_PRODUCTION_INDS_MF p,\n (sele..."


In [18]:
# Print the full SQL definition of the second view (positional row 1).
print(view_info["text"].iloc[1])

select 
    org.org_name,
    c.check_date period_date,
    EMISSION_CATEGORY,
    c.co2_qty co2_qty,
    p.prod_qty
  from WBIPD_PRODUCTION_INDS_MF p,
       (select 
            org_code,
            check_date,
            EMISSION_CATEGORY,
            sum(co2_qty) co2_qty
          from WACES_CARBON_EMS_DF
         where 1=1
           and checktype_name = '月盤查'
         group by org_code, check_date,EMISSION_CATEGORY) c,
      DIM_ORG org
 where 1=1
   and c.org_code = p.org_code(+)
   and c.check_date = p.period_date(+)
   and org.org_code = c.org_code
   and org.rpt_used = 'ESG碳排分析'


In [24]:
# System prompt: asks the model to summarise a view's data source as
#   1. the list of child tables,
#   2. any filter condition,
#   3. any merge (join) condition.
# {table_name} and {datasource} are template placeholders filled in when the
# prompt is invoked.
system_template = """
    I will provide the table_name and the datasource by SQL. 
    Following is the example, give me the answer based on real case:
    1. Give me the child table name as a list, like following:
        child_list = [child_table_1, child_table_2]
    2. Give me the condition if there is, such as: carbon_volume > 20
    3. Give me the merge condition if ther is, such as F.ORG_CODE = ORG.ORG_CODE, then the result is ORG_CODE

    Just export the summary without any other description.
    
    table_name: {table_name}

    datasource: {datasource}

    """

# The human message uses the same two placeholders as the system message, so
# both values are sent in each of the two messages.
messages = [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template("{table_name}, {datasource}")
]

# Combined chat prompt; expects the input keys "table_name" and "datasource".
CHAT_PROMPT = ChatPromptTemplate.from_messages(messages)

In [25]:
# Chat model used to extract table lineage from the view SQL.
# temperature=0 requests the least-random decoding so that re-running the
# notebook gives answers that are as repeatable as the API allows.
llm = ChatOpenAI(model='gpt-4o-mini', openai_api_key=O_KEY, temperature=0)

In [26]:
# llm = ChatGoogleGenerativeAI(
#     model="gemini-pro",
#     convert_system_message_to_human=True,
#     safety_settings={
#         HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
#         HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
#         HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
#     },
# )

In [27]:
# Pipe the prompt into the model, then ask about the second view (positional row 1).
chain = CHAT_PROMPT | llm
input_data = dict(
    table_name=view_info["view_name"].iloc[1],
    datasource=view_info["data_source"].iloc[1],
)
llm_response = chain.invoke(input_data)

In [28]:
# Print the extracted FROM-onwards text of the second view (positional row 1).
print(view_info["data_source"].iloc[1])

from WBIPD_PRODUCTION_INDS_MF p,
       (select 
            org_code,
            check_date,
            EMISSION_CATEGORY,
            sum(co2_qty) co2_qty
          from WACES_CARBON_EMS_DF
         where 1=1
           and checktype_name = '月盤查'
         group by org_code, check_date,EMISSION_CATEGORY) c,
      DIM_ORG org
 where 1=1
   and c.org_code = p.org_code(+)
   and c.check_date = p.period_date(+)
   and org.org_code = c.org_code
   and org.rpt_used = 'ESG碳排分析'


In [29]:
# Show the text content of the model's reply stored in llm_response.
print(llm_response.content)

child_list = [WACES_CARBON_EMS_DF, WBIPD_PRODUCTION_INDS_MF, DIM_ORG]
condition = org.rpt_used = 'ESG碳排分析'
merge_condition = c.org_code = p.org_code, c.check_date = p.period_date, org.org_code = c.org_code


In [33]:
# System prompt with one worked example: the model is told to resolve table
# aliases (e.g. "DIM_ORG org" -> alias "org") back to the original table names
# and to answer with GROUPBY / JOIN / Filter entries.
# {table_name} and {datasource} are template placeholders filled in when the
# prompt is invoked.
# In the worked example the inline view "c" is built from WACES_CARBON_EMS_DF
# and exposes check_date (not period_date), so the second JOIN entry is written
# on WACES_CARBON_EMS_DF.check_date to agree with "c.check_date = p.period_date(+)".
# Every entry of the example list ends with a comma so the format is uniform.
system_template = """
    I will provide the table_name and the corresponing Oracle SQL. 
    Recongize all the unique identifer such as DIM_ORG org, the unique identifier is org.
    Understand the relationship, such as org.org_code which means the column org_code in DIM_ORG.
    Return the answer with the ORIGINAL table name instead of the unique identifier.

    Following is the example, give me the answer based on real case:
    ### Oracle SQL:
            from WBIPD_PRODUCTION_INDS_MF p,
            (select 
                    org_code,
                    check_date,
                    EMISSION_CATEGORY,
                    sum(co2_qty) co2_qty
                from WACES_CARBON_EMS_DF
                where 1=1
                and checktype_name = '月盤查'
                group by org_code, check_date,EMISSION_CATEGORY) c,
            DIM_ORG org
        where 1=1
        and c.org_code = p.org_code(+)
        and c.check_date = p.period_date(+)
        and org.org_code = c.org_code
        and org.rpt_used = 'ESG碳排分析'
    ### Result:
        [
        (GROUPBY: WACES_CARBON_EMS_DF filter with checktype_name = '月盤查', group by org_code, check_date,EMISSION_CATEGORY),
        (JOIN: WACES_CARBON_EMS_DF.org_code = WBIPD_PRODUCTION_INDS_MF.org_code(+)),
        (JOIN: WACES_CARBON_EMS_DF.check_date = WBIPD_PRODUCTION_INDS_MF.period_date(+)),
        (JOIN: DIM_ORG.org_code = WACES_CARBON_EMS_DF.org_code),
        (Filter: DIM_ORG.rpt_used = 'ESG碳排分析')
        ]

    Just export the summary without any other description.
    
    table_name: {table_name}

    datasource: {datasource}

    """

# The human message uses the same two placeholders as the system message.
messages = [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template("{table_name}, {datasource}")
]

# Combined chat prompt; expects the input keys "table_name" and "datasource".
CHAT_PROMPT = ChatPromptTemplate.from_messages(messages)

In [36]:
# Rebuild the chain with the current prompt and ask about the first view (positional row 0).
chain = CHAT_PROMPT | llm
input_data = dict(
    table_name=view_info["view_name"].iloc[0],
    datasource=view_info["data_source"].iloc[0],
)
llm_response = chain.invoke(input_data)

In [37]:
# Show the text content of the model's reply stored in llm_response.
print(llm_response.content)

[
    (Filter: DIM_ORG.rpt_used = 'ESG碳排分析'),
    (JOIN: WACES_CARBON_EMS_DF.org_code = DIM_ORG.org_code)
]
