# RAG-LLMs

## Setup

### Imports

In [1]:
# imports
import os
from dotenv import load_dotenv
import pandas as pd
from langchain_openai import ChatOpenAI, AzureChatOpenAI 
from langchain_experimental.sql import SQLDatabaseChain
from langchain.chains import GraphCypherQAChain
from langchain_openai import AzureOpenAIEmbeddings
from langchain.prompts import PromptTemplate

from src.utils import *
from src.data_utils import SQLDBManager, Neo4jGraphManager
from src.prompt_templates import sql_rag_prompt, kg_rag_prompt, kg_rag_agent_prompt, kg_rag_agent_sys_prompt
from src.tools import GraphQueryAgent

%load_ext autoreload
%autoreload 2

### Config

In [3]:
# --- Configuration: DB credentials, Azure OpenAI auth, shared LLM ---
# load env vars (Postgres connection settings returned by the project helper)
host, port, db, user, password = load_postgres_env_variables()
# dbm = SQLDBManager.from_env()   # instantiate SQLDBManager
setup_azure_openai(api_base='https://azsdc-openai-33.openai.azure.com/', api_version='2023-07-01-preview')    # setup azure openai AD token
# print(f"AD token set: {os.environ['AZURE_OPENAI_AD_TOKEN']}")

# Define the LLM here, right after auth is set up, so that every later section
# (SQL-RAG, KG-RAG, agent) works on a fresh kernel with Restart & Run All.
# `llm` and `llm_version` were otherwise only created in the KG-RAG section,
# although the SQL-RAG cells that come before it already use them.
llm_version = 'gpt-4'   # gpt-4, gpt-35-turbo
llm = AzureChatOpenAI(model=llm_version, max_tokens=2000, temperature=0)    # instantiate llm

Successfully setup Azure OpenAI authentication


## Prepare & init DB

In [35]:
# NOTE: every statement in this cell is commented out, so running it as-is does nothing.
# Judging by the call names, the disabled steps would (in order): filter the source table
# to the required columns, drop columns, strip HTML from the description column, and add
# an embeddings column. Uncomment them top to bottom to redo that preparation — TODO confirm
# against SQLDBManager before re-running, since `overwrite=True` is passed to filter_table.
# llm_version = 'gpt-4'   # gpt-4, gpt-35-turbo
# llm = AzureChatOpenAI(model=llm_version, max_tokens=1000, temperature=0)    # instantiate llm
# set db variables
# src_table = 'pegadata.ppm_work'
# req_cols_path = 'data/pega-as-clone/req_fields.txt'
# data_table = f'{src_table}_filtered'
# primary_key = 'pyid'

# dbm = SQLDBManager.from_env()  # instantiate SQLDBManager
# filter data table and create new
# dbm.filter_table(src_table, req_cols_path, primary_key, overwrite=True)

# drop cols
# cols_to_drop = ...
# dbm.drop_cols(data_table, cols_to_drop)

# clean html
# text_col = 'pydescription'
# dbm.clean_html(data_table, [text_col], primary_key) # clean html

# # create embeddings
# embs_model = AzureOpenAIEmbeddings(azure_deployment="text-embedding-ada-002") # instantiate embeddings model
# dbm.create_embs_col(data_table, text_col, embs_model) # create embeddings col

connected to database


## SQL-RAG

In [16]:
# Instantiate the SQL DB manager for the schema, passing the one table of interest
# through `include_tables`.
schema, table = 'pegadata', 'ppm_work_filtered'
dbm = SQLDBManager.from_env(
    schema=schema,
    include_tables=[table],
)

# Disabled alternative: query the database through a LangChain SQL agent instead of a chain.
# from langchain.agents import create_sql_agent
# from langchain.agents import AgentExecutor
# from langchain.agents.agent_types import AgentType
# from langchain_community.agent_toolkits import SQLDatabaseToolkit
# agent_type = AgentType.ZERO_SHOT_REACT_DESCRIPTION
# toolkit = SQLDatabaseToolkit(dbm.db, llm)
# db_agent = create_sql_agent(llm=llm, toolkit=toolkit, agent_type=agent_type, verbose=True)    # instantiate db agent
# db_agent.run(query)

connected to database


### Eval

In [17]:
# Build the SQL-RAG chain around the project's custom prompt template.
prompt = PromptTemplate(
    template=sql_rag_prompt,
    input_variables=["input", "table_info"],
)
db_chain = SQLDatabaseChain.from_llm(
    llm,
    dbm.db,
    prompt=prompt,
    verbose=True,
)
table_info = dbm.db.get_table_info([table])

# Run one ad-hoc question through the chain.
query = "list all pending user stories w/ deadlines before Apr 1, 2024, along with their assignees and priority"
db_chain.run(query=query, table_info=table_info)

# Disabled: batch evaluation over the sample query set (mirrors the KG-RAG eval loop).
# eval on test set
# test_queries_path = 'data/sample_queries.csv'
# df_queries = pd.read_csv(test_queries_path, sep=';', index_col='id')
# responses = {}

# # iter thru queries and generate responses
# for qid, row in df_queries.iterrows():
#     query = row['query']
#     print(f"running query {qid}: {query}")
#     try:
#         response = db_chain.run(query=query, table_info=table_info)
#         responses[qid] = response
#     except Exception as e:
#         print(f"error: {e}")
#     print('\n')

# # save responses to csv
# df_responses = pd.DataFrame.from_dict(responses, orient='index', columns=['response'])
# responses_path = f'outputs/sql-rag/responses_{llm_version}.csv'
# os.makedirs(os.path.dirname(responses_path), exist_ok=True)
# with open(responses_path, 'w') as f:
#     df_responses.to_csv(f, sep=';')



[1m> Entering new SQLDatabaseChain chain...[0m
list all pending user stories w/ deadlines before Apr 1, 2024, along with their assignees and priority
SQLQuery:[32;1m[1;3mSELECT pyid, assignee, pxurgencywork FROM pegadata.ppm_work_filtered WHERE pystatuswork = 'Pending' AND pysladeadline < '2024-04-01'[0m
SQLResult: [33;1m[1;3m[('US-12', 'Neil', Decimal('20')), ('US-13', 'Stijn', Decimal('35')), ('US-16', 'Stijn', Decimal('20'))][0m
Answer:[32;1m[1;3mThe pending user stories with deadlines before Apr 1, 2024, along with their assignees and priority are: 
1. User Story ID: US-12, Assignee: Neil, Priority: 20
2. User Story ID: US-13, Assignee: Stijn, Priority: 35
3. User Story ID: US-16, Assignee: Stijn, Priority: 20[0m
[1m> Finished chain.[0m


'The pending user stories with deadlines before Apr 1, 2024, along with their assignees and priority are: \n1. User Story ID: US-12, Assignee: Neil, Priority: 20\n2. User Story ID: US-13, Assignee: Stijn, Priority: 35\n3. User Story ID: US-16, Assignee: Stijn, Priority: 20'

In [25]:
# Compute accuracy of the SQL-RAG responses and print the resulting metrics dict.
# Requires `llm_version` to have been set by a previously executed cell.
# NOTE(review): this reads responses_<llm_version>_eval.csv, whereas the (commented-out) save
# step of the SQL-RAG eval loop writes responses_<llm_version>.csv — presumably the _eval
# file is a manually annotated copy of it; confirm where it comes from.
responses_eval_path = f'outputs/sql-rag/responses_{llm_version}_eval.csv'
eval_res = eval_rag_responses(responses_eval_path)
print(eval_res)

{'num_queries': 20, 'num_correct': 10, 'num_queries_easy': 10, 'num_correct_easy': 8, 'num_queries_hard': 10, 'num_correct_hard': 2, 'accuracy': 0.5, 'accuracy_easy': 0.8, 'accuracy_hard': 0.2}


## KG-RAG

In [4]:
# Create the Neo4j graph manager and choose the LLM used by the KG-RAG chain and the agent.
ngm = Neo4jGraphManager.from_env()
schema, table = 'pegadata', 'ppm_work_filtered'

# Disabled: (re)build the graph from the SQL table and inspect its schema.
# ngm.from_table(f'{schema}.{table}', reset=True)    # create graph from table
# ngm.graph.refresh_schema()
# print(f"graph schema:\n{ngm.graph.schema}")

llm_version = 'gpt-4'   # gpt-4, gpt-35-turbo
# llm = ChatOpenAI(model=llm_version, max_tokens=1000, temperature=0)    # instantiate llm
llm = AzureChatOpenAI(
    model=llm_version,
    max_tokens=2000,
    temperature=0,
)

### Eval

In [None]:
# Assemble the KG-RAG chain: the LLM writes Cypher from the prompt, the chain runs it on the graph.
prompt_template = kg_rag_prompt
prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["schema", "question"],
)
kg_rag_chain = GraphCypherQAChain.from_llm(
    llm,
    graph=ngm.graph,
    cypher_prompt=prompt,
    validate_cypher=True,
    verbose=True,
)

# Try the chain on one question and show the answer.
query = "fetch all user stories assigned to Abhijith which have a priority higher than the highest priority of user stories assigned to me (Neil)."
print(f"user query: {query}")
response = kg_rag_chain.run(query)
print(response)

In [12]:
# Evaluate the KG-RAG chain on the sample query set and save the raw responses to CSV.
test_queries_path = 'data/sample_queries.csv'
df_queries = pd.read_csv(test_queries_path, sep=';', index_col='id')
responses = {}

# Run every query through the chain. A failing query is reported and skipped so that one
# error does not abort the whole run; such a query has no row in the saved CSV.
for qid, query in df_queries['query'].items():
    print(f"running query {qid}: {query}")
    try:
        response = kg_rag_chain.run(query=query)
        responses[qid] = response
        print(response)
    except Exception as e:
        print(f"error: {e}")
    print('\n')

# Save responses to csv (one row per successfully answered query id).
df_responses = pd.DataFrame.from_dict(responses, orient='index', columns=['response'])
responses_path = f'outputs/kg-rag/responses_{llm_version}.csv'
os.makedirs(os.path.dirname(responses_path), exist_ok=True)
# Hand pandas the path instead of a handle from open(path, 'w'): a text handle opened
# without newline='' yields blank rows on Windows and uses the platform default encoding,
# which can fail on non-ASCII characters in LLM output. With a path, pandas manages both.
df_responses.to_csv(responses_path, sep=';')

running query 1: how many user stories are under epic 1?


[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (us:UserStory)-[:IS_STORY_OF_EPIC]->(e:Epic) WHERE e.pyid = 'EPIC-1' RETURN COUNT(us)[0m
Full Context:
[32;1m[1;3m[{'COUNT(us)': 9}][0m

[1m> Finished chain.[0m


running query 2: find the most recent user story under epic 1 and summarize it


[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (us:UserStory)-[:IS_STORY_OF_EPIC]->(e:Epic)
WHERE e.pyid = 'EPIC-1'
RETURN us.pyid AS UserStoryID, us.pylabel AS UserStoryLabel, us.description AS UserStoryDescription, us.pystatuswork AS UserStoryStatus, us.category AS UserStoryCategory, us.startdate AS UserStoryStartDate, us.enddate AS UserStoryEndDate
ORDER BY us.enddate DESC
LIMIT 1[0m
Full Context:
[32;1m[1;3m[{'UserStoryID': 'US-6', 'UserStoryLabel': 'Release first version', 'UserStoryDescription': ' As PO, I would like to release the first version o

In [26]:
# Compute accuracy of the KG-RAG responses and print the resulting metrics dict.
# Requires `llm_version` to have been set by a previously executed cell.
# NOTE(review): this reads responses_<llm_version>_eval.csv, whereas the KG-RAG eval loop
# writes responses_<llm_version>.csv — presumably the _eval file is a manually annotated
# copy of it; confirm where it comes from.
responses_eval_path = f'outputs/kg-rag/responses_{llm_version}_eval.csv'
eval_res = eval_rag_responses(responses_eval_path)
print(eval_res)

{'num_queries': 20, 'num_correct': 13, 'num_queries_easy': 10, 'num_correct_easy': 8, 'num_queries_hard': 10, 'num_correct_hard': 5, 'accuracy': 0.65, 'accuracy_easy': 0.8, 'accuracy_hard': 0.5}


### Agent with tools

In [72]:
# ngm.embed_objs()    # embed graph objects

# Ask the graph query agent a single question.
# query = "which goal is about security issues in Pega Infinity?"
query = "how many user stories are there in total?"

gqa = GraphQueryAgent(ngm.graph, llm)    # graph query agent built on the graph and the LLM
print(f"user query: {query}")
# agent_exec.run(query)
gqa.run(query)