## RAG using Langchain with Neo4j Vector DB


In [1]:
## Check Dependencies
#!pip3 install --upgrade --quiet langchain langchain-community langchain-openai langchain_mistralai neo4j==5.19.0 tiktoken tokenizers

In [2]:
!pip3 show langchain neo4j langchain_mistralai

Name: langchain
Version: 0.1.17
Summary: Building applications with LLMs through composability
Home-page: https://github.com/langchain-ai/langchain
Author: 
Author-email: 
License: MIT
Location: /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages
Requires: aiohttp, dataclasses-json, jsonpatch, langchain-community, langchain-core, langchain-text-splitters, langsmith, numpy, pydantic, PyYAML, requests, SQLAlchemy, tenacity
Required-by: 
---
Name: neo4j
Version: 5.19.0
Summary: Neo4j Bolt driver for Python
Home-page: 
Author: 
Author-email: "Neo4j, Inc." <drivers@neo4j.com>
License: Apache License, Version 2.0
Location: /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages
Requires: pytz
Required-by: 
---
Name: langchain-mistralai
Version: 0.1.5
Summary: An integration package connecting Mistral and LangChain
Home-page: https://github.com/langchain-ai/langchain
Author: 
Author-email: 
License: MIT
Location: /Library/Frameworks/Python.f

### Load environment variables

In [3]:
from dotenv import load_dotenv
import os
load_dotenv()

print("NEO4J_URI = " + os.getenv("NEO4J_URI"))
print("NEO4J_USERNAME = " + os.getenv("NEO4J_USERNAME"))
print("NEO4J_PASSWORD = " + os.getenv("NEO4J_PASSWORD"))

NEO4J_URI = bolt://localhost:7687
NEO4J_USERNAME = neo4j
NEO4J_PASSWORD = neo4j123


### Create Neo4jGraph

Make sure the Neo4j instance is running and the environment variables are set correctly.

In [4]:
from langchain_community.graphs import Neo4jGraph
graph = Neo4jGraph()
# graph = Neo4jGraph(url="bolt://localhost:7687", username="neo4j", password="neo4j123")


### Check graph schema

In [5]:
print(graph.schema)

Node properties:
Element {tags: STRING, name: STRING, type: STRING, embedding: LIST, source: STRING, parent: STRING, description: STRING}
Relationship properties:
Uses {consumer: STRING, provider: STRING, embedding: LIST, source: STRING, technology: STRING, description: STRING}
The relationships:
(:Element)-[:Uses]->(:Element)
(:Element)-[:Contains]->(:Element)


In [6]:
# graph.structured_schema
# graph.structured_schema['node_props']
# graph.structured_schema['rel_props']
# graph.structured_schema['relationships']
# graph.structured_schema['metadata']['index']

### Cypher Query Examples

We prepared some questions that will be passed by LLM.

The first 4 questions are also provided with example Cypher query statements that should be used to retrieve relevant data from the graph database.

They will be used to test the LLM's ability to generate Cypher query statements.

In [7]:
query_examples = [
    {#1
        "question": "what software systems are used by customers?",
        "query": """
                MATCH (p:Element)-[r:Uses]->(ss:Element) 
                WHERE p.type="Person" AND p.tags CONTAINS "Customer" AND ss.type="SoftwareSystem" 
                RETURN p.tags + " " + r.description + " " + ss.name as context, ss.name as name, ss.description as description
                """,
    },
    {#2
        "question": "what software systems are used by customers to withdraw cash?",
        "query": """
                MATCH (p:Element)-[r:Uses]->(ss:Element) 
                WHERE p.type="Person" AND p.tags CONTAINS "Customer" 
                    AND (r.description CONTAINS 'withdraw cash' OR (ss.type="SoftwareSystem" AND ss.description CONTAINS 'withdraw cash'))
                RETURN p.tags + " " + r.description + " " + ss.name as context, ss.name as name, ss.description as description
                """,
    },
    {#3
        "question": "what software systems are available?",
        "query": """
                MATCH (ss:Element) 
                WHERE ss.type='SoftwareSystem' 
                RETURN ss.tags + ":" + ss.name as system, ss.description as description
                """,
    },
    {#4
        "question": "what users are available?",
        "query": """
                MATCH (p:Element) 
                WHERE p.type='Person' 
                RETURN p.tags + ":" + p.name as person, p.description as description
                """,
    },
    {#5
        "question": "what software systems are used by staff?",
    },
    {#6
        "question": "What software systems are used by Back Office Staff?",
    },
    {#7
        "question": "what can provide a summary of a customer's bank accounts?"
    },
    {#8
        "question": "what can be used to store customer information?"
    },
    {#9
        "question": "what can be used by customers to view their banking information?",
    },
    {#10
        "question": "what software system can be used to store customer information?"
    }
]

### Simple query from graph database 

In [8]:
print(f"""
      question: {query_examples[0]["question"]}
      query: {query_examples[0]["query"]}
      results: {graph.query(query_examples[0]["query"])}
      """)


      question: what software systems are used by customers?
      query: 
                MATCH (p:Element)-[r:Uses]->(ss:Element) 
                WHERE p.type="Person" AND p.tags CONTAINS "Customer" AND ss.type="SoftwareSystem" 
                RETURN p.tags + " " + r.description + " " + ss.name as context, ss.name as name, ss.description as description
                
      results: [{'context': 'Element,Person,Customer Withdraws cash using ATM', 'name': 'ATM', 'description': 'Allows customers to withdraw cash.'}, {'context': 'Element,Person,Customer Views account balances, and makes payments using Internet Banking System', 'name': 'Internet Banking System', 'description': 'Allows customers to view information about their bank accounts, and make payments.'}]
      


### Create LLM

In [9]:
# from langchain_openai import ChatOpenAI
# os.environ["OPENAI_API_KEY"] = .......
# llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

from langchain_mistralai.chat_models import ChatMistralAI
llm = ChatMistralAI(mistral_api_key=os.getenv("MISTRAL_API_KEY"), model=os.getenv("MISTRAL_MODEL"), temperature=0)

print("MISTRAL_MODEL = " + os.getenv("MISTRAL_MODEL"))

MISTRAL_MODEL = open-mistral-7b


### Create GraphCypherQAChain

Please refer to API documentation for more details.

https://api.python.langchain.com/en/latest/chains/langchain.chains.graph_qa.cypher.GraphCypherQAChain.html

In [10]:
from langchain.chains import GraphCypherQAChain
chain = GraphCypherQAChain.from_llm(graph=graph, llm=llm, verbose=True)

# class GraphCypherQAChain(Chain):
#     graph: GraphStore = Field(exclude=True)
#     cypher_generation_chain: LLMChain
#     qa_chain: LLMChain
#     graph_schema: str
#     input_key: str = "query"  #: :meta private:
#     output_key: str = "result"  #: :meta private:
#     top_k: int = 10
#     """Number of results to return from the query"""
#     return_intermediate_steps: bool = False
#     """Whether or not to return the intermediate steps along with the final answer."""
#     return_direct: bool = False
#     """Whether or not to return the result of querying the graph directly."""
#     cypher_query_corrector: Optional[CypherQueryCorrector] = None
#     """Optional cypher validation tool"""

# from_llm(
#         cls,
#         llm: Optional[BaseLanguageModel] = None,
#         *,
#         qa_prompt: Optional[BasePromptTemplate] = None,
#         cypher_prompt: Optional[BasePromptTemplate] = None,
#         cypher_llm: Optional[BaseLanguageModel] = None,
#         qa_llm: Optional[BaseLanguageModel] = None,
#         exclude_types: List[str] = [],
#         include_types: List[str] = [],
#         validate_cypher: bool = False,
#         qa_llm_kwargs: Optional[Dict[str, Any]] = None,
#         cypher_llm_kwargs: Optional[Dict[str, Any]] = None,
#         **kwargs: Any,
#     ) -> GraphCypherQAChain:

### Check the default prompts

CYPHER_GENERATION_PROMPT is used to generate the cypher statement.

CYPHER_QA_PROMPT is used to generate the answer to questions.

In [11]:
from langchain.chains.graph_qa.prompts import CYPHER_GENERATION_PROMPT, CYPHER_QA_PROMPT

print(f"CYPHER_GENERATION_PROMPT : \n{CYPHER_GENERATION_PROMPT}", flush=True)

print(f"CYPHER_QA_PROMPT : \n{CYPHER_QA_PROMPT}", flush=True)

CYPHER_GENERATION_PROMPT : 
input_variables=['question', 'schema'] template='Task:Generate Cypher statement to query a graph database.\nInstructions:\nUse only the provided relationship types and properties in the schema.\nDo not use any other relationship types or properties that are not provided.\nSchema:\n{schema}\nNote: Do not include any explanations or apologies in your responses.\nDo not respond to any questions that might ask anything else than for you to construct a Cypher statement.\nDo not include any text except the generated Cypher statement.\n\nThe question is:\n{question}'
CYPHER_QA_PROMPT : 
input_variables=['context', 'question'] template="You are an assistant that helps to form nice and human understandable answers.\nThe information part contains the provided information that you must use to construct an answer.\nThe provided information is authoritative, you must never doubt it or try to use your internal knowledge to correct it.\nMake the answer sound as a response 

### Ask GraphCypherQAChain

The **question** must be passed as a dictionary with a key "query" and the value is the question.

The QA chain will generate a cypher query statement using `CYPHER_GENERATION_PROMPT` and return the answer to the question.


In [12]:
print(f"""question: {query_examples[0]["question"]}""")

chain.invoke({"query": query_examples[0]["question"]})

question: what software systems are used by customers?


[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mcypher
MATCH (e1:Element)-[:Uses]->(e2:Element)
WHERE e1.tags IN ["software"] AND e2.tags IN ["customer"]
RETURN e1.name AS Software_Name, e2.name AS Customer_Name
[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m


{'query': 'what software systems are used by customers?',
 'result': "I don't have information on the specific software systems used by customers."}

### Promt Refinement for GraphCypherQAChain

The cypher query statement generated by the QA chain is not always correct.

We can use the `MY_CYPHER_GENERATION_PROMPT` and `MY_CYPHER_QA_PROMPT` to refine the prompts and pass them to the QA chain.

In [13]:
from langchain_core.prompts import PromptTemplate

# ## Refine the CYPHER_GENERATION_TEMPLATE by adding additional instructions below
# 1. Do not include parameters in the MATCH statement.
# 2. Only include parameters in the WHERE statement.
# 3. Always use "MATCH (p:Element)-[r:Uses]->(ss:Element)" to lookup "Uses" relationships.
# 4. Always use "MATCH (p:Element)-[r:Contains]->(ss:Element)" to lookup "Contains" relationships.

MY_CYPHER_GENERATION_TEMPLATE_PREFIX = """
Task: Generate a syntactically correct cypher statement to query a graph database.
Instructions:
Use only the provided relationship types and properties in the schema.
Do not use any other relationship types or properties that are not provided.
Do not include parameters in the MATCH statement.
Only include parameters in the WHERE statement.
Always use "MATCH (p:Element)-[r:Uses]->(ss:Element)" to lookup "Uses" relationships.
Always use "MATCH (p:Element)-[r:Contains]->(ss:Element)" to lookup "Contains" relationships.

Schema:
{schema}

Note: Do not include any explanations or apologies in your responses.
Do not respond to any questions that might ask anything else than for you to construct a Cypher statement.
Do not include any text except the generated Cypher statement.

Here are a few examples of generated Cypher statements for particular questions:
"""

############################################
##### Prompt refinement with examples ######
############################################
MY_CYPHER_GENERATION_TEMPLATE_EXAMPLES = f"""
# {query_examples[0]["question"]} 
{query_examples[0]["query"]}
# {query_examples[1]["question"]} 
{query_examples[1]["query"]}
# {query_examples[2]["question"]} 
{query_examples[2]["query"]}
"""

MY_CYPHER_GENERATION_TEMPLATE_SUFFIX = """   
User input: {question}
Cypher query:
"""

MY_CYPHER_GENERATION_TEMPLATE = MY_CYPHER_GENERATION_TEMPLATE_PREFIX + MY_CYPHER_GENERATION_TEMPLATE_EXAMPLES + MY_CYPHER_GENERATION_TEMPLATE_SUFFIX

MY_CYPHER_GENERATION_PROMPT = PromptTemplate(
    input_variables=["schema", "question"], template=MY_CYPHER_GENERATION_TEMPLATE
)


MY_CYPHER_QA_TEMPLATE = """
You are an assistant that helps to form nice and human understandable answers.
The context part contains the provided information that you must use to construct an answer.
The provided information is authoritative, you must never doubt it or try to use your internal knowledge to correct it.
Make the answer sound as a response to the question. 
Do not mention that you based the result on the given context.
Do not make up anything which does not exist in the provided context.

Here is an example:
Question: What software systems are used by customers?
Context: Internet Banking System
Helpful Answer: Internet Banking System.

Follow this example when generating answers. If the provided context is empty, say that you don't know the answer.

Context: {context}

Question: {question}
Helpful Answer:
"""

MY_CYPHER_QA_PROMPT = PromptTemplate(
    input_variables=['context', 'question'], template=MY_CYPHER_QA_TEMPLATE
)
# Pass the refined prompts to the QA chain
chain = GraphCypherQAChain.from_llm(llm=llm, graph=graph, cypher_prompt=MY_CYPHER_GENERATION_PROMPT,qa_prompt=MY_CYPHER_QA_PROMPT,verbose=True)

In [14]:
print(f"""question: {query_examples[0]["question"]}""")

chain.invoke({"query": query_examples[0]["question"]})

question: what software systems are used by customers?


[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (p:Element)-[r:Uses]->(ss:Element)
WHERE p.type="Person" AND p.tags CONTAINS "Customer" AND ss.type="SoftwareSystem"
RETURN p.tags + " " + r.description + " " + ss.name as context, ss.name as name, ss.description as description[0m
Full Context:
[32;1m[1;3m[{'context': 'Element,Person,Customer Withdraws cash using ATM', 'name': 'ATM', 'description': 'Allows customers to withdraw cash.'}, {'context': 'Element,Person,Customer Views account balances, and makes payments using Internet Banking System', 'name': 'Internet Banking System', 'description': 'Allows customers to view information about their bank accounts, and make payments.'}][0m

[1m> Finished chain.[0m


{'query': 'what software systems are used by customers?',
 'result': 'Customers use the Internet Banking System and ATMs. The Internet Banking System allows them to view account balances and make payments, while ATMs enable them to withdraw cash.'}

In [15]:
print(f"""question: {query_examples[2]["question"]}""")

chain.invoke({"query": query_examples[2]["question"]})

question: what software systems are available?


[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (ss:Element)
WHERE ss.type='SoftwareSystem'
RETURN ss.tags + ":" + ss.name as system, ss.description as description[0m
Full Context:
[32;1m[1;3m[{'system': 'Element,Software System,Existing System:Mainframe Banking System', 'description': 'Stores all of the core banking information about customers, accounts, transactions, etc.'}, {'system': 'Element,Software System,Existing System:E-mail System', 'description': 'The internal Microsoft Exchange e-mail system.'}, {'system': 'Element,Software System,Existing System:ATM', 'description': 'Allows customers to withdraw cash.'}, {'system': 'Element,Software System:Internet Banking System', 'description': 'Allows customers to view information about their bank accounts, and make payments.'}][0m

[1m> Finished chain.[0m


{'query': 'what software systems are available?',
 'result': 'The software systems available include the Mainframe Banking System, the Microsoft Exchange e-mail system, ATMs, and the Internet Banking System.'}

In [16]:
print(f"""question: {query_examples[4]["question"]}""")

chain.invoke({"query": query_examples[4]["question"]})

question: what software systems are used by staff?


[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (p:Element)-[r:Uses]->(ss:Element)
WHERE p.type="Person" AND p.tags CONTAINS "Staff" AND ss.type="SoftwareSystem"
RETURN p.tags + " " + r.description + " " + ss.name as context, ss.name as name, ss.description as description[0m
Full Context:
[32;1m[1;3m[{'context': 'Element,Person,Bank Staff Uses Mainframe Banking System', 'name': 'Mainframe Banking System', 'description': 'Stores all of the core banking information about customers, accounts, transactions, etc.'}, {'context': 'Element,Person,Bank Staff Uses Mainframe Banking System', 'name': 'Mainframe Banking System', 'description': 'Stores all of the core banking information about customers, accounts, transactions, etc.'}][0m

[1m> Finished chain.[0m


{'query': 'what software systems are used by staff?',
 'result': 'Mainframe Banking System is used by bank staff.'}

### Few-shot examples
    

In [17]:
from langchain_core.prompts import FewShotPromptTemplate, PromptTemplate

example_prompt = PromptTemplate.from_template(
    "User input: {question}\nCypher query: {query}"
)

fewShotPrompt = FewShotPromptTemplate(
    examples=query_examples[:4],
    example_prompt=example_prompt,
    prefix=MY_CYPHER_GENERATION_TEMPLATE_PREFIX,
    suffix=MY_CYPHER_GENERATION_TEMPLATE_SUFFIX,
    input_variables=["question", "schema"],
)

chain = GraphCypherQAChain.from_llm(graph=graph, llm=llm, cypher_prompt=fewShotPrompt, qa_prompt=MY_CYPHER_QA_PROMPT, verbose=True)


In [18]:
# print(fewShotPrompt.format(question=query_examples[6]["question"], schema=graph.structured_schema))

### Ask a predefined question

Ask a predefined question and the predefined query should be used to generate the answer.

In [19]:
print(f"""question: {query_examples[0]["question"]}\nquery: {query_examples[0]["query"]}""")

chain.invoke(query_examples[0]["question"])

question: what software systems are used by customers?
query: 
                MATCH (p:Element)-[r:Uses]->(ss:Element) 
                WHERE p.type="Person" AND p.tags CONTAINS "Customer" AND ss.type="SoftwareSystem" 
                RETURN p.tags + " " + r.description + " " + ss.name as context, ss.name as name, ss.description as description
                


[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (p:Element)-[r:Uses]->(ss:Element)
 WHERE p.type="Person" AND p.tags CONTAINS "Customer" AND ss.type="SoftwareSystem"
 RETURN p.tags + " " + r.description + " " + ss.name as context, ss.name as name, ss.description as description[0m
Full Context:
[32;1m[1;3m[{'context': 'Element,Person,Customer Withdraws cash using ATM', 'name': 'ATM', 'description': 'Allows customers to withdraw cash.'}, {'context': 'Element,Person,Customer Views account balances, and makes payments using Internet Banking System', 'name': 'Internet Banking System', 'des

{'query': 'what software systems are used by customers?',
 'result': 'Customers use the Internet Banking System and ATMs. The Internet Banking System allows them to view account balances and make payments, while ATMs enable them to withdraw cash.'}

### Ask a question that is not in the predefined list

The QA Chain should be able to find the similar question in the predefined list and use the corresponding query to generate the answer.

In [20]:
print(f"""question: {query_examples[4]["question"]}""")

chain.invoke(query_examples[4]["question"])

question: what software systems are used by staff?


[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (p:Element)-[r:Uses]->(ss:Element)
 WHERE p.type="Person" AND p.tags CONTAINS "Staff" AND ss.type="SoftwareSystem"
 RETURN p.tags + " " + r.description + " " + ss.name as context, ss.name as name, ss.description as description[0m
Full Context:
[32;1m[1;3m[{'context': 'Element,Person,Bank Staff Uses Mainframe Banking System', 'name': 'Mainframe Banking System', 'description': 'Stores all of the core banking information about customers, accounts, transactions, etc.'}, {'context': 'Element,Person,Bank Staff Uses Mainframe Banking System', 'name': 'Mainframe Banking System', 'description': 'Stores all of the core banking information about customers, accounts, transactions, etc.'}][0m

[1m> Finished chain.[0m


{'query': 'what software systems are used by staff?',
 'result': 'Mainframe Banking System is used by bank staff.'}

In [21]:
print(f"""question: {query_examples[5]["question"]}""")

chain.invoke(query_examples[5]["question"])

question: What software systems are used by Back Office Staff?


[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (p:Element)-[r:Uses]->(ss:Element)
 WHERE p.type="Person" AND p.tags CONTAINS "Back Office Staff" AND ss.type="SoftwareSystem"
 RETURN p.tags + " " + r.description + " " + ss.name as context, ss.name as name, ss.description as description[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m


{'query': 'What software systems are used by Back Office Staff?',
 'result': 'I cannot provide a definitive answer without specific information about the software systems used by Back Office Staff.'}

### Semantic Search

The cypher statements generated for the above examples are using simple matching against keywords, e.g. `p.tags contains 'Bank Staff'` 

if we ask the question in another way, QA Chain will not be able to answer correctly, it should use semnatic search instead to resolve the problem. 

https://python.langchain.com/docs/use_cases/graph/semantic/


In [22]:
# print(f"""question: {query_examples[7]["question"]}""")

# chain.invoke(query_examples[7]["question"])

In [23]:
# chain.invoke("what software systems can be used by customer to view their banking information?")

##### Create Neo4j Vector Store from an existing graph database

Create embedding vector for the SoftwareSystem node from existing neo4j graph database. A new **embedding** property will be added to the SoftwareSystem node.

- [MistralAIEmbeddings](https://python.langchain.com/docs/integrations/text_embedding/mistralai/)

- [Neo4j Vector](https://python.langchain.com/docs/integrations/vectorstores/neo4jvector/)

- [Create Neo4jVector from an existing graph](https://api.python.langchain.com/en/latest/_modules/langchain_community/vectorstores/neo4j_vector.html#Neo4jVector.from_existing_graph)

In [24]:
from langchain_community.vectorstores import Neo4jVector
from langchain_mistralai import MistralAIEmbeddings

retrieval_query = """
RETURN "name: "+node.name+"\ntype: "+node.type+"\ndescription: "+node.description AS text, score, {tags: node.tags, name: node.name, type: node.type, parent: node.parent, source: node.source} AS metadata
"""

# Initialize and return a Neo4jVector instance from an existing graph.
# This method initializes a Neo4jVector instance using the provided parameters and the existing graph. 
# It validates the existence of the indices and creates new ones if they don't exist.
vectorestore = Neo4jVector.from_existing_graph(
    embedding=MistralAIEmbeddings(mistral_api_key=os.getenv("MISTRAL_API_KEY")),
    node_label="Element",
    embedding_node_property="embedding",
    text_node_properties=["description"], #combime properties to create a single text property
    index_name="Element_index",
    retrieval_query=retrieval_query, #default retrival query will contain all the text node properties
    url=os.getenv("NEO4J_URI"), username=os.getenv("NEO4J_USERNAME"), password=os.getenv("NEO4J_PASSWORD")
)



In [25]:
print(f"""question: {query_examples[8]["question"]}""")

vectorestore.similarity_search(query_examples[8]["question"], k=1)

question: what can be used by customers to view their banking information?


[Document(page_content='name: Internet Banking System\ntype: SoftwareSystem\ndescription: Allows customers to view information about their bank accounts, and make payments.', metadata={'tags': 'Element,Software System', 'source': 'Big Bank plc', 'name': 'Internet Banking System', 'parent': '', 'type': 'SoftwareSystem'})]

In [26]:
results = vectorestore.similarity_search("Mainframe Banking System", k=1)
results

[Document(page_content='name: Mainframe Banking System\ntype: SoftwareSystem\ndescription: Stores all of the core banking information about customers, accounts, transactions, etc.', metadata={'tags': 'Element,Software System,Existing System', 'source': 'Big Bank plc', 'name': 'Mainframe Banking System', 'parent': '', 'type': 'SoftwareSystem'})]

In [27]:
from langchain_core.prompts import ChatPromptTemplate, format_document

# DEFAULT_DOCUMENT_PROMPT = ChatPromptTemplate.from_template(template="\nname: {name}\ntype: {type}{page_content}")
DEFAULT_DOCUMENT_PROMPT = ChatPromptTemplate.from_template(template="{page_content}")
def _extract_context_documents(
    docs, document_prompt=DEFAULT_DOCUMENT_PROMPT, document_separator="\n\n"
):
    doc_strings = [format_document(doc, document_prompt) for doc in docs]
    context_content = document_separator.join(doc_strings)
    
    return context_content

print(_extract_context_documents(results))

Human: name: Mainframe Banking System
type: SoftwareSystem
description: Stores all of the core banking information about customers, accounts, transactions, etc.


In [28]:
print(f"""question: {query_examples[8]["question"]}""")

results = vectorestore.similarity_search_with_score(query_examples[8]["question"], k=1)
results

question: what can be used by customers to view their banking information?


[(Document(page_content='name: Internet Banking System\ntype: SoftwareSystem\ndescription: Allows customers to view information about their bank accounts, and make payments.', metadata={'tags': 'Element,Software System', 'source': 'Big Bank plc', 'name': 'Internet Banking System', 'parent': '', 'type': 'SoftwareSystem'}),
  0.8344582915306091)]

#### Create Neo4j Vector Store from an existing embedding index

- [MistralAIEmbeddings](https://python.langchain.com/docs/integrations/text_embedding/mistralai/)

- [Neo4j Vector](https://python.langchain.com/docs/integrations/vectorstores/neo4jvector/)

- [Create Neo4jVector from an existing index](https://api.python.langchain.com/en/latest/_modules/langchain_community/vectorstores/neo4j_vector.html#Neo4jVector.from_existing_index)

In [29]:
# Create Neo4jVector instance from an existing index.
from langchain_community.vectorstores import Neo4jVector
from langchain_mistralai import MistralAIEmbeddings

embedding=MistralAIEmbeddings(mistral_api_key=os.getenv("MISTRAL_API_KEY"))

retrieval_query = """
RETURN "\n\nname: "+node.name+"\ntype: "+node.type+"\ndescription: "+node.description AS text, score, 
{tags: node.tags, name: node.name, type: node.type, parent: node.parent, source: node.source} AS metadata
"""

# Get instance of an existing Neo4j vector index. This method will return the instance of the store without inserting any new embeddings.
vectorestore = Neo4jVector.from_existing_index(
    embedding=embedding,
    url=os.getenv("NEO4J_URI"), username=os.getenv("NEO4J_USERNAME"), password=os.getenv("NEO4J_PASSWORD"),
    index_name="Element_index",
    retrieval_query=retrieval_query,
    embedding_node_property="embedding",
)


In [30]:

embedding_dimension, index_type = vectorestore.retrieve_existing_index()

print(f"embedding_dimension: {embedding_dimension}\nindex_type: {index_type}")


embedding_dimension: 1024
index_type: NODE


In [31]:
question = "what software systems can be used to store customer information?"
print(f"""question: {question}""")
results = vectorestore.similarity_search(query=question, k=1)
results

question: what software systems can be used to store customer information?


[Document(page_content='\n\nname: Mainframe Banking System\ntype: SoftwareSystem\ndescription: Stores all of the core banking information about customers, accounts, transactions, etc.', metadata={'tags': 'Element,Software System,Existing System', 'source': 'Big Bank plc', 'name': 'Mainframe Banking System', 'parent': '', 'type': 'SoftwareSystem'})]

In [32]:
print(f"""question: {query_examples[7]["question"]}""")
results = vectorestore.similarity_search(query_examples[7]["question"], k=1)
results

question: what can be used to store customer information?


[Document(page_content='\n\nname: Database\ntype: Container\ndescription: Stores user registration information, hashed authentication credentials, access logs, etc.', metadata={'tags': 'Element,Container,Database', 'source': 'Big Bank plc', 'name': 'Database', 'parent': 'Internet Banking System', 'type': 'Container'})]

In [33]:
results = vectorestore.similarity_search("what software systems can be used to withdraw cash?", k=1)
results

[Document(page_content='\n\nname: Internet Banking System\ntype: SoftwareSystem\ndescription: Allows customers to view information about their bank accounts, and make payments.', metadata={'tags': 'Element,Software System', 'source': 'Big Bank plc', 'name': 'Internet Banking System', 'parent': '', 'type': 'SoftwareSystem'})]

### RAG - RetrievalQAWithSourcesChain using Element_index

In [34]:
retriever = vectorestore.as_retriever()

# from langchain_core.runnables import RunnableParallel, RunnablePassthrough
# retrieval = RunnableParallel(
#     {"question": RunnablePassthrough() | retriever | _extract_context_documents}
# )

In [35]:
from langchain.chains import RetrievalQAWithSourcesChain
chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm, chain_type="stuff", retriever=retriever
)

In [36]:
chain.invoke(
    {"question": "what software systems can be used to store customer information?"},
    return_only_outputs=True,
)

{'answer': 'The following software systems can be used to store customer information: a) Mainframe Banking System, b) Internet Banking System, and c) E-mail System.\n\n',
 'sources': 'Big Bank plc (28-pl, 30-pl)'}

In [37]:
chain.invoke(
    {"question": "what software systems can be used to store customer banking information?"},
    return_only_outputs=True,
)

{'answer': 'The Mainframe Banking System, Internet Banking System, and ATM software systems can be used to store and manage customer banking information.\n\n',
 'sources': 'Big Bank plc (28-pl, 30-pl, and multiple other instances)'}

In [38]:
chain.invoke(
    {"question": "what can be used to store customer information?"},
    return_only_outputs=True,
)

{'answer': 'Customer information can be stored in a database, which is a container that stores user registration information, hashed authentication credentials, and access logs, among other things (Big Bank plc, ',
 'sources': '1). The core banking information about customers, accounts, and transactions can also be stored in a mainframe banking system, which is a software system (Big Bank plc,'}

In [39]:
chain.invoke(
    {"question": "what software systems can be used to withdraw cash?"},
    return_only_outputs=True,
)

{'answer': 'The software systems that can be used to withdraw cash are ATM systems.\n\n',
 'sources': 'Big Bank plc.'}

In [40]:

chain.invoke(
    {"question": "what can provide a summary of a customer's bank accounts"},
    return_intermediate_steps=True
)

{'question': "what can provide a summary of a customer's bank accounts",
 'answer': 'The "Accounts Summary Controller" component in the "Big Bank plc" system provides a summary of a customer\'s bank accounts.\n\n',
 'sources': 'Big Bank plc (multiple sources)'}

In [41]:
results = vectorestore.similarity_search_with_score("what can provide a summary of a customer's bank accounts?")
results

[(Document(page_content='\n\nname: Accounts Summary Controller\ntype: Component\ndescription: Provides customers with a summary of their bank accounts.', metadata={'tags': 'Element,Component', 'source': 'Big Bank plc', 'name': 'Accounts Summary Controller', 'parent': 'API Application', 'type': 'Component'}),
  0.8787081241607666),
 (Document(page_content='\n\nname: Personal Banking Customer\ntype: Person\ndescription: A customer of the bank, with personal bank accounts.', metadata={'tags': 'Element,Person,Customer', 'source': 'Big Bank plc', 'name': 'Personal Banking Customer', 'parent': '', 'type': 'Person'}),
  0.8426058292388916),
 (Document(page_content='\n\nname: Internet Banking System\ntype: SoftwareSystem\ndescription: Allows customers to view information about their bank accounts, and make payments.', metadata={'tags': 'Element,Software System', 'source': 'Big Bank plc', 'name': 'Internet Banking System', 'parent': '', 'type': 'SoftwareSystem'}),
  0.8267369866371155),
 (Docum

### Relatioship Embedding

In [44]:
# First create relationship index and relationship data

# Create a vector from an existing relationship index

retrieval_query = """
RETURN "'"+relationship.consumer + "' " + relationship.description + " '" +relationship.provider + "'" AS text, score, 
{technology: relationship.technology, source: relationship.source, consumer: relationship.consumer, provider: relationship.provider} AS metadata
"""

relationship_vector = Neo4jVector.from_existing_relationship_index(
    MistralAIEmbeddings(mistral_api_key=os.getenv("MISTRAL_API_KEY")),
    url=os.getenv("NEO4J_URI"), username=os.getenv("NEO4J_USERNAME"), password=os.getenv("NEO4J_PASSWORD"),
    index_name="Relationship_index",
    # text_node_property="text", # the text property used as the page_content of Document
    retrieval_query=retrieval_query, # the retrieval_query used as the page_content of Document
)

In [45]:
relationship_vector.similarity_search("What system is used by Service Staff", k=1)


[Document(page_content="'Customer Service Staff' Uses 'Mainframe Banking System'", metadata={'technology': '', 'source': 'Big Bank plc', 'provider': 'Mainframe Banking System', 'consumer': 'Customer Service Staff'})]

In [46]:
relationship_vector.similarity_search_with_score("What system is used by Service Staff")


[(Document(page_content="'Customer Service Staff' Uses 'Mainframe Banking System'", metadata={'technology': '', 'source': 'Big Bank plc', 'provider': 'Mainframe Banking System', 'consumer': 'Customer Service Staff'}),
  0.8858469724655151),
 (Document(page_content="'Back Office Staff' Uses 'Mainframe Banking System'", metadata={'technology': '', 'source': 'Big Bank plc', 'provider': 'Mainframe Banking System', 'consumer': 'Back Office Staff'}),
  0.8771942853927612),
 (Document(page_content="'Internet Banking System' Gets account information from, and makes payments using 'Mainframe Banking System'", metadata={'technology': '', 'source': 'Big Bank plc', 'provider': 'Mainframe Banking System', 'consumer': 'Internet Banking System'}),
  0.8604210019111633),
 (Document(page_content="'Internet Banking System' Sends e-mail using 'E-mail System'", metadata={'technology': '', 'source': 'Big Bank plc', 'provider': 'E-mail System', 'consumer': 'Internet Banking System'}),
  0.8592248558998108)]

In [47]:
vectorestore.similarity_search_with_score("What system is used by Service Staff")

[(Document(page_content='\n\nname: Customer Service Staff\ntype: Person\ndescription: Customer service staff within the bank.', metadata={'tags': 'Element,Person,Bank Staff', 'source': 'Big Bank plc', 'name': 'Customer Service Staff', 'parent': '', 'type': 'Person'}),
  0.8767651319503784),
 (Document(page_content='\n\nname: E-mail System\ntype: SoftwareSystem\ndescription: The internal Microsoft Exchange e-mail system.', metadata={'tags': 'Element,Software System,Existing System', 'source': 'Big Bank plc', 'name': 'E-mail System', 'parent': '', 'type': 'SoftwareSystem'}),
  0.858390748500824),
 (Document(page_content='\n\nname: Back Office Staff\ntype: Person\ndescription: Administration and support staff within the bank.', metadata={'tags': 'Element,Person,Bank Staff', 'source': 'Big Bank plc', 'name': 'Back Office Staff', 'parent': '', 'type': 'Person'}),
  0.8562641143798828),
 (Document(page_content='\n\nname: Internet Banking System\ntype: SoftwareSystem\ndescription: Allows cust

#### RAG - RetrievalQAWithSourcesChain using Relationship_index

In [48]:

relationship_retriever = relationship_vector.as_retriever()

from langchain.chains import RetrievalQAWithSourcesChain
relationship_chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm, chain_type="stuff", retriever=relationship_retriever
)

In [49]:
## information found from relationship index
relationship_chain.invoke(
    {"question": "What system is used by Service Staff"},
    return_only_outputs=True,
)

{'answer': 'The Customer Service Staff and Back Office Staff of the Big Bank plc use the Mainframe Banking System. The Internet Banking System gets account information from and makes payments using the Mainframe Banking System.\n\n',
 'sources': 'Big Bank plc (multiple sources)'}

In [50]:
## information not found from element index
chain.invoke(
    {"question": "What system is used by Service Staff"},
    return_only_outputs=True,
)

{'answer': 'The document does not provide information on the system used by the Customer Service Staff.\n\n',
 'sources': ''}

In [51]:
## information found from relationship index
relationship_chain.invoke(
    {"question": "what can provide a summary of a customer's bank accounts?"},
    return_only_outputs=True,
)

{'answer': 'A personal banking customer can view account balances and make payments using a mobile app, internet banking system, or single-page application.\n\n',
 'sources': 'Big Bank plc (multiple sources)'}

In [52]:
## information found from element index
chain.invoke(
    {"question": "what can provide a summary of a customer's bank accounts?"},
    return_only_outputs=True,
)

{'answer': 'The "Accounts Summary Controller" component of the "Big Bank plc" software system provides a summary of a customer\'s bank accounts.\n\n',
 'sources': '1, 2, 3.'}

### RAG using Relationship_index and Element_index
