# RAG-LLMs

## Setup

### Imports

In [9]:
# imports
import os
from dotenv import load_dotenv
import pandas as pd
from langchain_openai import ChatOpenAI, AzureChatOpenAI 
from langchain_experimental.sql import SQLDatabaseChain
from langchain.chains import GraphCypherQAChain
from langchain_openai import AzureOpenAIEmbeddings
from langchain.prompts import PromptTemplate

from src.utils import *
from src.data_utils import SQLDBManager, Neo4jGraphManager
from src.prompt_templates import sql_rag_prompt, kg_rag_prompt, kg_rag_agent_prompt, kg_rag_agent_sys_prompt
from src.tools import GraphQueryAgent, SQLQueryAgent
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Config

In [59]:
# Authenticate against Azure OpenAI (sets up the AD token) before any model is created.
setup_azure_openai(
    api_base='https://azsdc-openai-33.openai.azure.com/',
    api_version='2023-07-01-preview',
)

# Model to use: 'gpt-4' or 'gpt-35-turbo'. The same value is embedded in the
# file names of the evaluation outputs further down.
llm_version = 'gpt-4'

# Chat model shared by the SQL and graph agents.
llm = AzureChatOpenAI(
    model=llm_version,
    max_tokens=2000,
    temperature=0,
)

Successfully setup Azure OpenAI authentication


## Prepare & init DB

In [35]:
# llm_version = 'gpt-4'   # gpt-4, gpt-35-turbo
# llm = AzureChatOpenAI(model=llm_version, max_tokens=1000, temperature=0)    # instantiate llm
# set db variables
# src_table = 'pegadata.ppm_work'
# req_cols_path = 'data/pega-as-clone/req_fields.txt'
# data_table = f'{src_table}_filtered'
# primary_key = 'pyid'

# dbm = SQLDBManager.from_env()  # instantiate SQLDBManager
# filter data table and create new
# dbm.filter_table(src_table, req_cols_path, primary_key, overwrite=True)

# drop cols
# cols_to_drop = ...
# dbm.drop_cols(data_table, cols_to_drop)

# clean html
# text_col = 'pydescription'
# dbm.clean_html(data_table, [text_col], primary_key) # clean html

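# create embeddings
# NOTE(review): `schema` / `table` are not defined in this cell (they are set in the SQL-RAG cell),
# and the text column above is 'pydescription', not 'description' — confirm before re-enabling.
# dbm.embed_objs(f'{schema}.{table}', cols_to_embed=['pylabel', 'description'], pk='pyid')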

connected to database


## SQL-RAG

In [4]:
# Target schema and table for the SQL-RAG experiments.
schema, table = 'pegadata', 'ppm_work_filtered'

# Build the SQL manager, passing the schema and the single table to include.
dbm = SQLDBManager.from_env(
    schema=schema,
    include_tables=[table],
)

connected to database


### Agent with tools

In [None]:
# Ask the SQL query agent a single question and print its answer.
dqa = SQLQueryAgent(dbm.db, llm)

# Alternative sample questions (swap in as needed):
# query = "how many user stories are there in total?"
# query = "which epic is about implementing GenAI capabilities as a cloud service?"
query = (
    "which epic is about implementing GenAI capabilities as a cloud service? "
    "how many user stories are there under it?"
)
print(f"user query: {query}")

result = dqa.run(query)
print(result)

### Eval

In [None]:
# Evaluate the SQL query agent on the sample query set and save its answers.
#
# Reads the ';'-separated query file (indexed by 'id'), runs every query through
# the agent, and writes the successful responses to
# outputs/sql-rag/responses_<llm_version>.csv (one row per query id).
test_queries_path = 'data/sample_queries.csv'
df_queries = pd.read_csv(test_queries_path, sep=';', index_col='id')
responses = {}
failed_qids = []    # ids of queries whose agent run raised; they get no row in the csv
dqa = SQLQueryAgent(dbm.db, llm)    # instantiate db query agent

# iter thru queries and generate responses
for qid, row in df_queries.iterrows():
    query = row['query']
    print(f"running query {qid}: {query}")
    try:
        result = dqa.run(query=query)
    except Exception as e:
        # best-effort: a single failing query must not abort the whole evaluation run
        failed_qids.append(qid)
        print(f"error: {e}")
    else:
        responses[qid] = result
    print('\n')

# make dropped queries visible instead of leaving them as silent gaps in the output
if failed_qids:
    print(f"{len(failed_qids)} of {len(df_queries)} queries failed and are missing from the output: {failed_qids}")

# save responses to csv
df_responses = pd.DataFrame.from_dict(responses, orient='index', columns=['response'])
responses_path = f'outputs/sql-rag/responses_{llm_version}.csv'
os.makedirs(os.path.dirname(responses_path), exist_ok=True)
# Let pandas open the file itself: a text handle opened without newline='' produces
# blank rows on Windows, and the default locale encoding can fail on non-ASCII answers.
df_responses.to_csv(responses_path, sep=';', encoding='utf-8')

In [72]:
# Score the SQL-RAG evaluation file and print every metric with two decimals.
# NOTE(review): the `_eval.csv` file is not written anywhere in this notebook;
# presumably it is the responses file with correctness labels added — confirm.
responses_eval_path = f'outputs/sql-rag/responses_{llm_version}_eval.csv'
eval_res = eval_rag_responses(responses_eval_path)

for metric, val in eval_res.items():
    print("{}: {:.2f}".format(metric, val))

num_queries: 30.00
num_correct: 27.00
num_queries_easy: 15.00
num_correct_easy: 15.00
num_queries_hard: 15.00
num_correct_hard: 12.00
accuracy: 0.90
accuracy_easy: 1.00
accuracy_hard: 0.80


## KG-RAG

In [47]:
# Graph manager used by the KG-RAG cells below.
ngm = Neo4jGraphManager.from_env()

# Source table the graph is built from (only needed by the optional steps below).
schema, table = 'pegadata', 'ppm_work_filtered'

# Optional: (re)create the graph from the table and inspect its schema.
# ngm.from_table(f'{schema}.{table}', reset=True)    # create graph from table
# ngm.graph.refresh_schema()
# print(f"graph schema:\n{ngm.graph.schema}")

# Optional: embed graph objects.
# ngm.embed_objs() 

### Agent with tools

In [None]:
# Ask the graph query agent a single question.
gqa = GraphQueryAgent(ngm.graph, llm)

# Alternative sample question:
# query = "how many user stories are there in total?"
query = (
    "what is the status of the user story about implementing an endpoint "
    "for GenAI service API and which epic does it belong to?"
)
print(f"user query: {query}")

# Kept as the final bare expression so the notebook displays the returned value.
gqa.run(query)

### Eval

In [None]:
# Evaluate the graph query agent on the sample query set and save its answers.
#
# Reads the ';'-separated query file (indexed by 'id'), runs every query through
# the agent, and writes the successful responses to
# outputs/kg-rag/responses_<llm_version>.csv (one row per query id).
test_queries_path = 'data/sample_queries.csv'
df_queries = pd.read_csv(test_queries_path, sep=';', index_col='id')
responses = {}
failed_qids = []    # ids of queries whose agent run raised; they get no row in the csv

gqa = GraphQueryAgent(ngm.graph, llm)    # instantiate graph query agent

# iter thru queries and generate responses
for qid, row in df_queries.iterrows():
    query = row['query']
    print(f"running query {qid}: {query}")
    try:
        result = gqa.run(query)
    except Exception as e:
        # best-effort: a single failing query must not abort the whole evaluation run
        failed_qids.append(qid)
        print(f"error: {e}")
    else:
        responses[qid] = result
        print(result)
    print('\n')

# make dropped queries visible instead of leaving them as silent gaps in the output
if failed_qids:
    print(f"{len(failed_qids)} of {len(df_queries)} queries failed and are missing from the output: {failed_qids}")

# save responses to csv
df_responses = pd.DataFrame.from_dict(responses, orient='index', columns=['response'])
responses_path = f'outputs/kg-rag/responses_{llm_version}.csv'
os.makedirs(os.path.dirname(responses_path), exist_ok=True)
# Let pandas open the file itself: a text handle opened without newline='' produces
# blank rows on Windows, and the default locale encoding can fail on non-ASCII answers.
df_responses.to_csv(responses_path, sep=';', encoding='utf-8')

In [71]:
# Score the KG-RAG evaluation file and print every metric with two decimals.
# NOTE(review): the `_eval.csv` file is not written anywhere in this notebook;
# presumably it is the responses file with correctness labels added — confirm.
responses_eval_path = f'outputs/kg-rag/responses_{llm_version}_eval.csv'
eval_res = eval_rag_responses(responses_eval_path)

for metric, val in eval_res.items():
    print("{}: {:.2f}".format(metric, val))

num_queries: 30.00
num_correct: 23.00
num_queries_easy: 15.00
num_correct_easy: 13.00
num_queries_hard: 15.00
num_correct_hard: 10.00
accuracy: 0.77
accuracy_easy: 0.87
accuracy_hard: 0.67
