# RAG-LLMs

## Setup

### Imports

In [None]:
# imports
import os
from dotenv import load_dotenv
import pandas as pd
from langchain_openai import ChatOpenAI, AzureChatOpenAI 

from src.utils import *
from src.data_utils import SQLDBManager, Neo4jGraphManager
from src.tools import GraphQueryAgent, SQLQueryAgent
%load_ext autoreload
%autoreload 2

### Config

In [None]:
# Point the Azure OpenAI client at the endpoint/API version used by this notebook
# (per the original note, this call sets up the Azure OpenAI AD token).
setup_azure_openai(
    api_base='https://azsdc-openai-33.openai.azure.com/',
    api_version='2023-07-01-preview',
)

# Model name passed to the chat client; options noted by the author: gpt-4, gpt-35-turbo.
llm_version = 'gpt-4'

# Chat model instance: temperature 0, at most 2000 tokens per completion.
llm = AzureChatOpenAI(
    model=llm_version,
    max_tokens=2000,
    temperature=0,
)

## Prepare & init DB

In [None]:
# llm_version = 'gpt-4'   # gpt-4, gpt-35-turbo
# llm = AzureChatOpenAI(model=llm_version, max_tokens=1000, temperature=0)    # instantiate llm
# set db variables
# src_table = 'pegadata.ppm_work'
# req_cols_path = 'data/pega-as-clone/req_fields.txt'
# data_table = f'{src_table}_filtered'
# primary_key = 'pyid'

# dbm = SQLDBManager.from_env()  # instantiate SQLDBManager
# filter data table and create new
# dbm.filter_table(src_table, req_cols_path, primary_key, overwrite=True)

# drop cols
# cols_to_drop = ...
# dbm.drop_cols(data_table, cols_to_drop)

# clean html
# text_col = 'pydescription'
# dbm.clean_html(data_table, [text_col], primary_key) # clean html

# create embeddings
# dbm.embed_objs(f'{schema}.{table}', cols_to_embed=['pylabel', 'description'], pk='pyid')

## SQL-RAG

In [None]:
# Schema and table that the SQL-RAG cells work against.
schema, table = 'pegadata', 'ppm_work_filtered'

# Instantiate the SQL DB manager for that schema, passing only `table` via include_tables.
dbm = SQLDBManager.from_env(
    schema=schema,
    include_tables=[table],
)

### Agent with tools

In [None]:
# Build the SQL query agent on top of the DB handle and the shared LLM.
# (This section queries the SQL database, not the graph.)
dqa = SQLQueryAgent(
    dbm.db,
    llm,
)

# Ask one question and print the agent's answer.
query = "who in charge of the user story about end-to-end testing of GenAI services?"
print(f"user query: {query}")

result = dqa.run(query)
print(result)

### Eval

In [None]:
# Evaluate the SQL agent on the sample query set and persist its answers.
# Input:  ';'-separated CSV with an 'id' column (used as index) and a 'query' column.
# Output: ';'-separated CSV with one 'response' row per successfully answered query.
test_queries_path = 'data/sample_queries.csv'
df_queries = pd.read_csv(test_queries_path, sep=';', index_col='id')
responses = {}
dqa = SQLQueryAgent(dbm.db, llm)    # instantiate db query agent

# Only the 'query' column is needed, so iterate it directly (index value -> query text)
# rather than building a full row object per query with iterrows().
for qid, query in df_queries['query'].items():
    print(f"running query {qid}: {query}")
    try:
        result = dqa.run(query=query)
        responses[qid] = result
    except Exception as e:
        # Deliberately best-effort: one failing query must not abort the whole run.
        # A failed query gets no entry in `responses`, so it is absent from the CSV.
        print(f"error: {e}")
    print('\n')

# Save responses to CSV, keyed by query id.
df_responses = pd.DataFrame.from_dict(responses, orient='index', columns=['response'])
responses_path = f'outputs/sql-rag/responses_{llm_version}.csv'
os.makedirs(os.path.dirname(responses_path), exist_ok=True)
# Pass the path instead of a handle from open(path, 'w'): pandas then opens the file
# itself with newline='' and UTF-8. A plain text handle uses universal newlines
# (blank rows between records on Windows) and the locale encoding (may fail on
# non-ASCII characters in the responses).
df_responses.to_csv(responses_path, sep=';')

In [None]:
# Score the SQL-RAG responses.
# NOTE(review): the '_eval' file is presumably a reviewed/annotated copy of the saved
# responses, and results_path presumably receives the computed metrics - confirm
# against eval_rag_responses in src.utils.
results_path = f'outputs/sql-rag/results_{llm_version}.json'
responses_eval_path = f'outputs/sql-rag/responses_{llm_version}_eval.csv'
eval_res = eval_rag_responses(
    responses_eval_path,
    results_path,
)

# Print each metric with two decimals.
for metric, val in eval_res.items():
    print(f"{metric}: {val:.2f}")

## KG-RAG

In [None]:
# Instantiate the Neo4j graph manager used by the KG-RAG cells.
ngm = Neo4jGraphManager.from_env()

# Source schema/table; only referenced by the (disabled) graph-building step below.
schema, table = 'pegadata', 'ppm_work_filtered'

# One-off graph construction, left commented out:
# ngm.from_table(f'{schema}.{table}', reset=True)    # create graph from table
# ngm.graph.refresh_schema()
# print(f"graph schema:\n{ngm.graph.schema}")

# embed graph objects
# ngm.embed_objs() 

### Agent with tools

In [None]:
# Use the agent with tools to answer a single question against the Neo4j graph.
gqa = GraphQueryAgent(ngm.graph, llm)    # instantiate graph query agent

# Single query; the commented-out line is an alternative question to try.
# query = "how many user stories are there in total?"
query = "who in charge of the user story about end-to-end testing of GenAI services?"
print(f"user query: {query}")
# Left as the final bare expression so the notebook displays the returned answer.
gqa.run(query)

### Eval

In [None]:
# Run every sample query through the graph agent and collect the answers.
test_queries_path = 'data/sample_queries.csv'
df_queries = pd.read_csv(
    test_queries_path,
    sep=';',
    index_col='id',
)
responses = {}

gqa = GraphQueryAgent(ngm.graph, llm)    # instantiate graph query agent

# One agent call per query; a failure is printed and that query is left out of `responses`.
for qid, row in df_queries.iterrows():
    query = row['query']
    print(f"user query: {query}")
    try:
        result = responses[qid] = gqa.run(query)
        print(result)
    except Exception as e:
        print(f"error: {e}")
    print('\n')

# Assemble the responses table; writing it to disk is currently disabled (see below).
df_responses = pd.DataFrame.from_dict(
    responses,
    orient='index',
    columns=['response'],
)
responses_path = f'outputs/kg-rag/responses_{llm_version}.csv'
os.makedirs(os.path.dirname(responses_path), exist_ok=True)
# Preview the first rows as the cell output.
df_responses.head()
# with open(responses_path, 'w') as f:
#     df_responses.to_csv(f, sep=';')

In [None]:
# Score the KG-RAG responses.
# NOTE(review): the '_eval' file is presumably a reviewed/annotated copy of the saved
# responses, and results_path presumably receives the computed metrics - confirm
# against eval_rag_responses in src.utils.
results_path = f'outputs/kg-rag/results_{llm_version}.json'
responses_eval_path = f'outputs/kg-rag/responses_{llm_version}_eval.csv'
eval_res = eval_rag_responses(
    responses_eval_path,
    results_path,
)

# Print each metric with two decimals.
for metric, val in eval_res.items():
    print(f"{metric}: {val:.2f}")