# RAG-LLMs

## Setup

### Imports

In [43]:
# imports
import os
from dotenv import load_dotenv
from langchain.chat_models import AzureChatOpenAI
from langchain_experimental.sql import SQLDatabaseChain
from langchain.chains import GraphCypherQAChain
from langchain_openai import AzureOpenAIEmbeddings
from langchain.prompts import PromptTemplate

from src.utils import *
from src.data_utils import SQLDBManager, Neo4jGraphManager
from src.prompt_templates import *

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Config

In [39]:
# Load connection settings and authenticate before any DB/LLM object is built.
# Both helpers are presumably provided by the wildcard `src.utils` import -- confirm.
# NOTE(review): `password` now lives in the kernel namespace; never echo it in a cell output.
host, port, db, user, password = load_postgres_env_variables()
# dbm = SQLDBManager.from_env()   # instantiate SQLDBManager
setup_azure_openai()    # set up the Azure OpenAI AD token (prints a confirmation on success)
# Debug only -- keep disabled: printing the token would store it in the saved notebook output.
# print(f"AD token set: {os.environ['AZURE_OPENAI_AD_TOKEN']}")
# table = 'pegadata.ppm_work_filtered'

Successfully setup Azure OpenAI authentication


## Prepare & init DB

In [35]:
# One-off data preparation, kept for reference. Every statement below is disabled;
# running this cell as-is does nothing.
# NOTE(review): the saved "connected to database" output presumably dates from a run in
# which the SQLDBManager line was still active -- clear it or re-enable the line.

# set db variables
# src_table = 'pegadata.ppm_work'
# req_cols_path = 'data/pega-as-clone/req_fields.txt'
# data_table = f'{src_table}_filtered'
# primary_key = 'pyid'

# dbm = SQLDBManager.from_env()  # instantiate SQLDBManager
# filter data table and create new
# dbm.filter_table(src_table, req_cols_path, primary_key, overwrite=True)

# drop cols (`cols_to_drop` is a placeholder -- fill it in before uncommenting)
# cols_to_drop = ...
# dbm.drop_cols(data_table, cols_to_drop)

# clean html
# text_col = 'pydescription'
# dbm.clean_html(data_table, [text_col], primary_key) # clean html

# create embeddings
# embs_model = AzureOpenAIEmbeddings(azure_deployment="text-embedding-ada-002") # instantiate embeddings model
# dbm.create_embs_col(data_table, text_col, embs_model) # create embeddings col

connected to database


## RAG

In [38]:
# Schema and table used by the RAG cells; `schema` and `table` are reused further down.
schema, table = 'pegadata', 'ppm_work_filtered'

# Manager built from environment settings, with only `table` listed in include_tables.
dbm = SQLDBManager.from_env(
    schema=schema,
    include_tables=[table],
)

connected to database


In [46]:
# --- RAG over SQL -----------------------------------------------------------
# The chain asks the LLM for a SQL query, executes it against `dbm.db`, and
# returns the LLM's natural-language answer (see the verbose trace below).

# chat model used by the chain
llm = AzureChatOpenAI(
    model='gpt-4',
    max_tokens=1000,
    temperature=0,
)

# project-specific prompt (`rag_sql_prompt`) wrapped as a template
prompt = PromptTemplate(
    input_variables=["input", "table_info"],
    template=rag_sql_prompt,
)

# verbose=True prints the generated SQL and the raw result rows
db_chain = SQLDatabaseChain.from_llm(
    llm,
    dbm.db,
    verbose=True,
    prompt=prompt,
)

# Alternative (disabled): SQL agent instead of the chain
# from langchain.agents import create_sql_agent
# from langchain.agents import AgentExecutor
# from langchain.agents.agent_types import AgentType
# from langchain_community.agent_toolkits import SQLDatabaseToolkit
#
# agent_type = AgentType.ZERO_SHOT_REACT_DESCRIPTION
# toolkit = SQLDatabaseToolkit(dbm.db, llm)
# db_agent = create_sql_agent(llm=llm, toolkit=toolkit, agent_type=agent_type, verbose=True)

# Example questions -- keep exactly one `query = ...` line active
# query = "how many user stories are under epic 1?"
# query = "find the most recent user story under epic 1 and summarize it"
# query = "list all the user stories under epic 1, along w/ their priority (if known) and status"
# query = "list all user stories with deadlines before Feb 1, 2024, along w their assignees and status"
# query =  "how many user stories are assigned to each assignee?"
# query = "which pending user story has the nearest deadline?"
# query = "how many user stories in the 'Testing' category have been completed?"
# query = "what is the distribution of user stories across different categories?"
# query = "how many pending user stories have a priority greater than 30?"
# query = "which pending user story has the nearest deadline? what is its priority and category? which epic does it belong to? are there any other pending user stories under that epic? if yes, which ones?"
# query = "given the data in this table, generate 10 questions/queries that could be asked about it. your questions should not be too simple, i.e., they cannot be answered w say, a single lookup operation."
query = "give me all user stories assigned to me (Neil) with priority > 10 which have a deadline after all stories assigned to Stijn."

# Run the chain; the bare last expression is what the notebook displays.
# NOTE(review): SQLDatabaseChain appears to derive table_info from its own database --
# confirm whether the `table_info` kwarg passed to run() has any effect.
table_info = dbm.db.get_table_info([table])
db_chain.run(query=query, table_info=table_info)
# db_agent.run(query)



[1m> Entering new SQLDatabaseChain chain...[0m
give me all user stories assigned to me (Neil) with priority > 10 which have a deadline after all stories assigned to Stijn.
SQLQuery:[32;1m[1;3mSELECT * FROM pegadata.ppm_work_filtered WHERE initialassignee = 'Neil' AND pxurgencywork > 10 AND pysladeadline > (SELECT MAX(pysladeadline) FROM pegadata.ppm_work_filtered WHERE initialassignee = 'Stijn')[0m
SQLResult: [33;1m[1;3m[('BL-1', 'Backlog', 'GenAI Gateway Service Backlog', 'Neil', Decimal('20'), datetime.datetime(2024, 5, 1, 0, 0), 'Active', 'Feature Development', 'PROJ-1', 'PRD-1', 'GOAL-1', None, None, 'GenAI Gateway Service Backlog', datetime.date(2023, 11, 9), None), ('US-11', 'UserStory', 'HFIX on 8.7.5 and 23.1.x', 'Neil', Decimal('30'), datetime.datetime(2024, 4, 11, 0, 0), 'Active', 'Feature Development', 'PROJ-2', 'PRD-2', 'GOAL-2', 'EPIC-2', 'BL-2', 'backport or forward port the solution we delovp for multi part signing of outbound messeages for SOAP service.', datet

'The user stories assigned to Neil with priority greater than 10 and have a deadline after all stories assigned to Stijn are:\n\n1. User Story ID: BL-1, Title: GenAI Gateway Service Backlog, Priority: 20, Deadline: 2024-05-01\n2. User Story ID: US-11, Title: HFIX on 8.7.5 and 23.1.x, Priority: 30, Deadline: 2024-04-11\n3. User Story ID: GOAL-1, Title: Run GenAI Gateway service on Azure OpenAI ChatGPT, Priority: 30, Deadline: 2024-05-01\n4. User Story ID: PROJ-1, Title: Autopilot & GenAI Services, Priority: 80, Deadline: 2024-06-01\n5. User Story ID: EPIC-3, Title: Restrict Redirections to same domain, Priority: 30, Deadline: 2024-05-01\n6. User Story ID: EPIC-1, Title: GenAI as CloudK Service, Priority: 50, Deadline: 2024-04-01'

## KG-RAG

### Create neo4j graph from postgres db

In [28]:
# Graph manager configured from environment settings.
ngm = Neo4jGraphManager.from_env()

# Build the graph from the Postgres table selected above.
# NOTE(review): reset=True presumably wipes the existing graph first -- confirm before
# pointing this at a shared Neo4j instance.
ngm.from_table(
    f'{schema}.{table}',
    reset=True,
)

# Refresh the cached schema before printing it.
ngm.graph.refresh_schema()
print(f"graph schema:\n{ngm.graph.schema}")

  df = pd.read_sql(query, conn)


graph schema:
Node properties are the following:
UserStory {backlogid: STRING, pxurgencywork: FLOAT, goalid: STRING, epicid: STRING, projectid: STRING, productid: STRING, pydescription_clean: STRING, pysladeadline: LOCAL_DATE_TIME, pystatuswork: STRING, pylabel: STRING, category: STRING, initialassignee: STRING, pyid: STRING, startdate: DATE, pxobjclass: STRING, enddate: DATE},Backlog {goalid: STRING, projectid: STRING, pxurgencywork: FLOAT, pydescription_clean: STRING, pystatuswork: STRING, productid: STRING, category: STRING, pylabel: STRING, pyid: STRING, pysladeadline: LOCAL_DATE_TIME, startdate: DATE, pxobjclass: STRING, initialassignee: STRING},Epic {backlogid: STRING, pxurgencywork: FLOAT, goalid: STRING, productid: STRING, projectid: STRING, pysladeadline: LOCAL_DATE_TIME, pystatuswork: STRING, category: STRING, pyid: STRING, pylabel: STRING, pydescription_clean: STRING, startdate: DATE, pxobjclass: STRING, initialassignee: STRING},Project {pydescription_clean: STRING, pystatus

### Implement KG-RAG using graph

In [49]:
# --- KG-RAG ------------------------------------------------------------------
# The LLM writes a Cypher query against `ngm.graph`; the chain runs it and answers
# from the returned context (see the verbose trace below).

# NOTE(review): this rebinds the notebook-level `llm` and uses temperature=0.5, whereas the
# SQL cell uses temperature=0 -- Cypher generation will vary between runs.
llm = AzureChatOpenAI(
    model='gpt-4',
    max_tokens=1000,
    temperature=0.5,
)

# Example questions -- keep exactly one `query = ...` line active
# query = "how many user stories are under epic 1?"
# query = "find the most recent user story under epic 1 and summarize it"
# query = "list all the user stories under epic 1, along w/ their priority (if known) and status"
# query = "list all user stories with deadlines before Feb 1, 2024, along w their assignees and status"
# query = "which pending user story has the nearest deadline? what is its priority and category? which epic does it belong to? are there any other pending user stories under that epic? if yes, which ones?"
query = "give me all user stories assigned to me (Neil) with priority > 10 which have a deadline after all stories assigned to Stijn."
print(f"user query: {query}")

# Cypher-generation prompt built from the project template
prompt_template = kg_rag_prompt
prompt = PromptTemplate(
    input_variables=["schema", "question"],
    template=prompt_template,
)

# verbose=True prints the generated Cypher and the retrieved context
kg_rag_chain = GraphCypherQAChain.from_llm(
    llm,
    graph=ngm.graph,
    cypher_prompt=prompt,
    verbose=True,
)

# Ask the question and show the final answer
response = kg_rag_chain.run(query)
print(response)

user query: give me all user stories assigned to me (Neil) with priority > 10 which have a deadline after all stories assigned to Stijn.


[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (us:UserStory)-[:IS_STORY_OF_EPIC]->(e:Epic)-[:IS_EPIC_OF_GOAL]->(g:Goal)-[:IS_GOAL_OF_PROJECT]->(p:Project)
WHERE us.initialassignee = "Neil" AND us.pxurgencywork > 10 
WITH MAX(us.pysladeadline) AS maxNeilDeadline
MATCH (us2:UserStory)
WHERE us2.initialassignee = "Stijn"
WITH MAX(us2.pysladeadline) AS maxStijnDeadline
MATCH (us3:UserStory)-[:IS_STORY_OF_EPIC]->(e3:Epic)-[:IS_EPIC_OF_GOAL]->(g3:Goal)-[:IS_GOAL_OF_PROJECT]->(p3:Project)
WHERE us3.initialassignee = "Neil" AND us3.pxurgencywork > 10 AND us3.pysladeadline > maxStijnDeadline
RETURN us3[0m
Full Context:
[32;1m[1;3m[{'us3': {'goalid': 'GOAL-2', 'pysladeadline': neo4j.time.DateTime(2024, 4, 11, 0, 0, 0, 0), 'productid': 'PRD-2', 'pystatuswork': 'Active', 'initialassignee': 'Neil', 'pxobjclass': 'User