# Constructing a RAG application with a knowledge graph

In [1]:
import os
from langchain.graphs import Neo4jGraph
from langchain_openai import ChatOpenAI
from langchain.chains import GraphCypherQAChain
from langchain_community.graphs import Neo4jGraph
from langchain_openai import ChatOpenAI
from langchain_core.prompts.prompt import PromptTemplate

In [2]:
# Connection settings and API keys are read from the environment so that no
# secret is hardcoded in the notebook.
# Fail early with a readable message when a variable is missing: writing the
# None returned by os.getenv() back into os.environ would otherwise raise an
# opaque "TypeError: str expected, not NoneType".
_REQUIRED_ENV_VARS = ("NEO4J_URI", "NEO4J_USERNAME", "NEO4J_PASSWORD", "OPENAI_API_KEY")
_missing_env_vars = [name for name in _REQUIRED_ENV_VARS if name not in os.environ]
if _missing_env_vars:
    raise EnvironmentError(
        "Missing required environment variable(s): " + ", ".join(_missing_env_vars)
    )

# The lowercase "j" in these names is kept as-is so any cell that refers to
# them keeps working.
NEO4j_URI = os.environ["NEO4J_URI"]
NEO4j_USERNAME = os.environ["NEO4J_USERNAME"]
NEO4j_PASSWORD = os.environ["NEO4J_PASSWORD"]
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

# refresh_schema=False: the schema is not loaded at construction time; it is
# loaded explicitly with graph.refresh_schema() before the chain is built.
graph = Neo4jGraph(
    url=NEO4j_URI,
    username=NEO4j_USERNAME,
    password=NEO4j_PASSWORD,
    refresh_schema=False,
)



In [3]:
# Prompt that instructs the LLM to translate a natural-language question into a
# Cypher query. It has two placeholders, {schema} and {question}, which are the
# input variables declared on the PromptTemplate below.
# The instructions restrict the query to the schema's relationship types and
# properties, forbid using or returning the `embedding` property, direct term
# lookups to KeyElements nodes, and ask that RETURN include the property values
# used for filtering.
# NOTE: the text of this string is sent to the model verbatim, so any edit to
# it (including whitespace) changes runtime behavior.
CYPHER_GENERATION_TEMPLATE = """Task:Generate Cypher statement to query a graph database.
Instructions:
Use only the provided relationship types and properties in the schema.
Do not use the embedding property for KeyElements
Only look for terms in KeyElements nodes
Do not use any other relationship types or properties that are not provided.
Schema:
{schema}
Note: Respond only with the Cypher query.

The question is:
{question} 

Important: In the generated Cypher query, look for terms in keyelements when using WHERE. 
In the generated Cypher query, the RETURN statement must explicitly include the property values used in the query's filtering condition but never the embedding property, alongside the main information requested from the original question. 
Do not return anything related to chunks node types. When using atomic facts the text property is important.

"""
# Wrap the template so it can be passed as `cypher_prompt` when the
# GraphCypherQAChain is built.
CYPHER_GENERATION_PROMPT = PromptTemplate(
    input_variables=["schema", "question"], template=CYPHER_GENERATION_TEMPLATE
)


In [4]:
# The graph was created with refresh_schema=False, so load the schema now;
# it fills the {schema} placeholder of the Cypher-generation prompt.
graph.refresh_schema()

# Two separate deterministic (temperature=0) models: one writes the Cypher
# query, the other phrases the final answer from the query results.
cypher_llm = ChatOpenAI(temperature=0, model_name="gpt-4o-mini")
qa_llm = ChatOpenAI(temperature=0, model_name="gpt-4o-mini")

cypher_chain = GraphCypherQAChain.from_llm(
    graph=graph,
    cypher_llm=cypher_llm,
    qa_llm=qa_llm,
    cypher_prompt=CYPHER_GENERATION_PROMPT,
    # verbose=True prints the generated Cypher and the full query context.
    verbose=True,
    # NOTE(review): this flag opts in to running LLM-generated Cypher against
    # the database; confirm the Neo4j credentials are suitably restricted.
    allow_dangerous_requests=True,
)

In [5]:
# Ask which document is the most connected; the returned dict is displayed as
# the cell output.
# NOTE(review): in the recorded run the generated Cypher counted distinct
# KeyElements per document rather than links between documents; confirm that
# matches the intended meaning of "connections".
cypher_chain.invoke({"query": "Which document has most connections to other documents?"})



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (d:Document)-[:HAS_CHUNK]->(:Chunk)-[:HAS_ATOMIC_FACT]->(:AtomicFact)-[:HAS_KEY_ELEMENT]->(k:KeyElement)
WITH d, COUNT(DISTINCT k.id) AS connections
RETURN d.id, d, connections
ORDER BY connections DESC
LIMIT 1
[0m
Full Context:
[32;1m[1;3m[{'d.id': '1611.04701v2.pdf.json', 'd': {'id': '1611.04701v2.pdf.json'}, 'connections': 2183}][0m

[1m> Finished chain.[0m


{'query': 'Which document has most connections to other documents?',
 'result': 'The document with the most connections to other documents is 1611.04701v2.pdf.json, which has 2183 connections.'}

In [6]:
# Look up key elements mentioning "Theorem" and print the chain's response.
response = cypher_chain.invoke(
    {"query": "List `Theorem`s in the papers"}
)
print(response)



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (d:Document)-[:HAS_CHUNK]->(c:Chunk)-[:HAS_ATOMIC_FACT]->(af:AtomicFact)-[:HAS_KEY_ELEMENT]->(ke:KeyElement)
WHERE ke.id CONTAINS 'Theorem'
RETURN d.id AS documentId, d AS document, af.text AS atomicFactText, ke.id AS keyElementId
[0m
Full Context:
[32;1m[1;3m[{'documentId': '2201.01879v3.pdf.json', 'document': {'id': '2201.01879v3.pdf.json'}, 'atomicFactText': 'Equation (A.32) states that [ζ(1 − π) + ωπ] = 0.', 'keyElementId': 'Central Limit Theorem (CLT)'}, {'documentId': '2201.01879v3.pdf.json', 'document': {'id': '2201.01879v3.pdf.json'}, 'atomicFactText': 'By the Central Limit Theorem (CLT), Equation (A.34) shows that (1/√n)Σ(π̂i(mi − lπ̂i)) converges to N(0, βmωπ(βm − 2l) + E[π̂i](1 + l²)).', 'keyElementId': 'Central Limit Theorem (CLT)'}, {'documentId': '2209.01679v3.pdf.json', 'document': {'id': '2209.01679v3.pdf.json'}, 'atomicFactText': 'The study focuses on hyperplanar moments

In [7]:
# Ask for the mathematical background of OLS and print the chain's response.
# The question text is sent to the LLM verbatim, so the spelling of
# "mathematical" is corrected here.
response = cypher_chain.invoke(
    {"query": "Explain the mathematical detail of OLS"}
)
print(response)



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (d:Document)-[:HAS_CHUNK]->(c:Chunk)-[:HAS_ATOMIC_FACT]->(af:AtomicFact)-[:HAS_KEY_ELEMENT]->(ke:KeyElement)
WHERE ke.id CONTAINS 'OLS' OR ke.id CONTAINS 'Ordinary Least Squares'
RETURN af.text, ke.id
[0m
Full Context:
[32;1m[1;3m[{'af.text': 'The norm upper bound of the operator learned via OLS decreases with increasing noise in the independent variable.', 'ke.id': 'operator learned via OLS'}, {'af.text': 'EiV operator learning robustly recovers the Burgers operator in high-noise regimes that defeat OLS operator learning.', 'ke.id': 'OLS operator learning'}, {'af.text': 'The effect of bias in OLS operator learning depends on the regularity of the target operator.', 'ke.id': 'OLS operator learning'}, {'af.text': 'An EiV model for time-evolving PDE discovery shows that OLS and EiV perform similarly in learning the Kuramoto-Sivashinsky evolution operator from corrupted data.', 'ke.id': 'OL

In [9]:
# Find the paper whose key elements mention the WIDEHIST algorithm and print
# the chain's response.
response = cypher_chain.invoke(
    {"query": "Which paper uses WIDEHIST algorithm?"}
)
print(response)



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (d:Document)-[:HAS_CHUNK]->(c:Chunk)-[:HAS_ATOMIC_FACT]->(af:AtomicFact)-[:HAS_KEY_ELEMENT]->(ke:KeyElement)
WHERE ke.id CONTAINS 'WIDEHIST'
RETURN d.id AS documentId, d AS document, af.text AS atomicFactText, ke.id AS keyElementId
[0m
Full Context:
[32;1m[1;3m[{'documentId': '1505.04215v1.pdf.json', 'document': {'id': '1505.04215v1.pdf.json'}, 'atomicFactText': 'The algorithm WIDEHIST divides the interval [-1, 1] into m bins of width h.', 'keyElementId': 'Algorithm WIDEHIST'}][0m

[1m> Finished chain.[0m
{'query': 'Which paper uses WIDEHIST algorithm?', 'result': 'The paper with the document ID 1505.04215v1.pdf.json uses the WIDEHIST algorithm.'}
