In [None]:
%pip install openai neo4j langchain_openai

In [1]:

import os
from neo4j import GraphDatabase
from langchain_openai import AzureOpenAIEmbeddings, AzureChatOpenAI
from langchain.schema import HumanMessage
from langchain.chains import GraphCypherQAChain

import pandas as pd

# --- Azure OpenAI config ---
API_KEY = "CJf0VBRpakDXgiWW8wxUT7hL9CbaOmE979tyuEGMg0J5n5zsnWOOJQQJ99BGACYeBjFXJ3w3AAABACOGRVVR"
API_ENDPOINT = "https://ameytxtai.openai.azure.com/"
API_VERSION = "2023-12-01-preview"
DEPLOYMENT_NAME_LLM = "test_jpm_3_5"
DEPLOYMENT_NAME_EMBED = "text-embedding01"

# --- Neo4j config ---
NEO4J_URI = "neo4j+s://0e39a427.databases.neo4j.io"
NEO4J_USERNAME = "neo4j"
NEO4J_PASSWORD = "5qAQp_sJS3DoW98Mx0X41PN91yKPqEtCzMagYYY3cdQ"
VECTOR_INDEX_NAME = "chunkEmbeddings"






In [18]:
# --- Initialize drivers ---
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

embedder = AzureOpenAIEmbeddings(
    azure_deployment=DEPLOYMENT_NAME_EMBED,
    api_key=API_KEY,
    api_version=API_VERSION,
    azure_endpoint=API_ENDPOINT
)

llm = AzureChatOpenAI(
    deployment_name=DEPLOYMENT_NAME_LLM,
    openai_api_key=API_KEY,
    azure_endpoint=API_ENDPOINT,
    openai_api_version=API_VERSION,
    temperature=0
)

## About Vector Search

Vector search enables semantic retrieval of text chunks from your Neo4j graph.  
Instead of keyword matching, it finds the most contextually similar passages to your query, even if the wording is different.

**Tip:**  
Inspect the returned results to verify relevance and adjust your chunking or embedding strategy if needed.

# Initialize Vector Retriever
This section sets up the vector-based retriever for semantic search over your Neo4j knowledge graph.

- **Query:**  
  The example query asks, "What are the risks that Apple faces?"

- **Vector Search in Neo4j:**  
  - Connects to the Neo4j database using the provided `driver`.
  - Uses the `chunkEmbeddings` vector index for efficient semantic retrieval.
  - The `embedder` generates embeddings for the query.
  - Returns the `text` property from matching chunks.

In [3]:
def vectorSearchNeo4j(query_vector):
    with driver.session() as session:
        result = session.run(
            f"""
            CALL db.index.vector.queryNodes($index_name, 10, $query_vector)
            YIELD node, score
            RETURN node.text AS text, elementId(node) AS node_id, score
            """,
            index_name=VECTOR_INDEX_NAME,
            query_vector=query_vector
        )
        records = result.data()

    df = pd.DataFrame(records)
    print("Retrieved Chunks:")
    node_ids = [record["node_id"] for record in records]
    return df, node_ids

def getGraphRagResponse(query, node_ids, context_records):
    with driver.session() as session:
        for node_id in node_ids:
            result = session.run(
                query,
                node_id=node_id
            )
            context_records.extend(result.data())

    if len(context_records) == 0:
        print("No enriched context found; falling back to plain chunk text.")
    else:
        print("Enriched Context Records Found:")
        df_context = pd.DataFrame(context_records)

    return context_records, df_context


In [4]:
query = "What are the risks that Apple faces?"

response = llm([HumanMessage(content=query)])
print("\nPlain LLM Response:")
print(response.content)

  response = llm([HumanMessage(content=query)])



Plain LLM Response:
1. Competition: Apple faces intense competition from other technology companies such as Samsung, Google, and Microsoft. These companies are constantly innovating and releasing new products that could potentially outperform Apple's offerings.

2. Market saturation: The smartphone market is becoming increasingly saturated, with fewer new customers to attract. This could lead to slower growth and declining sales for Apple.

3. Dependence on iPhone sales: Apple generates a significant portion of its revenue from iPhone sales. Any decline in iPhone sales could have a significant impact on the company's financial performance.

4. Supply chain disruptions: Apple's supply chain is complex and relies on components from various suppliers around the world. Any disruptions in the supply chain, such as natural disasters or political instability, could impact Apple's ability to produce and sell its products.

5. Regulatory challenges: Apple operates in multiple countries and is 

In [5]:
query = "What are the risks that Apple faces?"
query_vector = embedder.embed_query(query)
vector_search_apple_risks_df, node_ids_vector_search_apple_risk = vectorSearchNeo4j(query_vector)
vector_search_apple_risks_df

Retrieved Chunks:


Unnamed: 0,text,node_id,score
0,"the Company to potential liabilities, increase...",4:3204c234-a248-4a1f-8797-165c95b59f64:2599,0.916428
1,"relief against the Company, and has\nfrom time...",4:3204c234-a248-4a1f-8797-165c95b59f64:2598,0.912994
2,interruptions can harm or disrupt internationa...,4:3204c234-a248-4a1f-8797-165c95b59f64:2587,0.911133
3,in both\nfrequency and sophistication with in...,4:3204c234-a248-4a1f-8797-165c95b59f64:2596,0.911026
4,impact the Company's net sales to its indirect...,4:3204c234-a248-4a1f-8797-165c95b59f64:2585,0.909668
5,intended to be inactive\ntextual references on...,4:3204c234-a248-4a1f-8797-165c95b59f64:2586,0.908737
6,conditions and adverse economic conditions can...,4:3204c234-a248-4a1f-8797-165c95b59f64:2591,0.90831
7,perception and analysis of the relative benefi...,4:3204c234-a248-4a1f-8797-165c95b59f64:2593,0.906876
8,interruption\naffecting such sources would exa...,4:3204c234-a248-4a1f-8797-165c95b59f64:2588,0.906815
9,sults.\nApple Inc. | 2023 Form 10-K | 25\nSect...,4:3204c234-a248-4a1f-8797-165c95b59f64:2608,0.906677


## Advanced RAG: Contextual Cypher Retrieval

This section demonstrates how to use a custom Cypher query with the `Vector Search` to provide richer, more contextual answers.

- **Custom Cypher Query:**  
  The `detail_context_query` matches text chunks (`node`) to their source documents, associated companies, and the risk factors those companies face.  
  It returns:
  - The company name
  - The context text from the chunk
  - A list of distinct risk factors

- **Vector Search:**  
  - Performs semantic search using the `chunkEmbeddings` vector index.
  - Applies the custom Cypher query to retrieve relevant context and associated risk factors.

- **GraphRAG:**  
  - Combines the LLM and the custom retriever to answer the question:  
    _"What are the top risk factors that Apple faces?"_

- **Usage:**  
  This approach enables highly specific, context-rich answers by leveraging the full power of graph relationships and semantic search.

In [6]:
context_records_apple_risks = []
detail_context_query  = """
                            MATCH (n) WHERE elementId(n) = $node_id
                            MATCH (n)-[:FROM_DOCUMENT]-(doc:Document)-[:FILED]-(company:Company)-[:FACES_RISK]-(risk:RiskFactor)
                            RETURN company.name AS company, n.text AS context, collect(DISTINCT risk.name) AS risks
                        """
context_records_apple_risks, df_contextual_search_apple_risk = getGraphRagResponse(detail_context_query, node_ids_vector_search_apple_risk,context_records_apple_risks)

df_contextual_search_apple_risk

Enriched Context Records Found:


Unnamed: 0,company,context,risks
0,APPLE INC,"the Company to potential liabilities, increase...","[Economic conditions, Political conditions, Ch..."
1,APPLE INC,"relief against the Company, and has\nfrom time...","[Economic conditions, Political conditions, Ch..."
2,APPLE INC,interruptions can harm or disrupt internationa...,"[Economic conditions, Political conditions, Ch..."
3,APPLE INC,in both\nfrequency and sophistication with in...,"[Economic conditions, Political conditions, Ch..."
4,APPLE INC,impact the Company's net sales to its indirect...,"[Economic conditions, Political conditions, Ch..."
5,APPLE INC,intended to be inactive\ntextual references on...,"[Economic conditions, Political conditions, Ch..."
6,APPLE INC,conditions and adverse economic conditions can...,"[Economic conditions, Political conditions, Ch..."
7,APPLE INC,perception and analysis of the relative benefi...,"[Economic conditions, Political conditions, Ch..."
8,APPLE INC,interruption\naffecting such sources would exa...,"[Economic conditions, Political conditions, Ch..."
9,APPLE INC,sults.\nApple Inc. | 2023 Form 10-K | 25\nSect...,"[Economic conditions, Political conditions, Ch..."


**Feed outputs of graph based traversals back to LLM**  
  Get outputs from the graph by running multi hop queries as above, get it as a dataframe and then pass it back to the LLM to summarize the results to get a contextually rich response based on not just the documents but also the relationships in the graph

In [7]:
if len(context_records_apple_risks) > 0:
    context_strings = []
    for record in context_records_apple_risks:
        risks = ", ".join(record.get("risks", []))
        company = record.get("company", "Unknown Company")
        context_text = record.get("context", "")
        context_strings.append(f"Company: {company}\nRisks: {risks}\nContext: {context_text}")

    final_context = "\n\n".join(context_strings)
else:
    # fallback to plain text chunks if no enriched context
    final_context = "\n".join([r["text"] for r in context_records_apple_risks])

prompt = f"""
You are a helpful assistant. Based on the following context, answer the question.
Summarize in proper numbered bullet points, well spaced.
Context:
{final_context}

Question:
{query}
"""

response = llm([HumanMessage(content=prompt)])
print("\nResponse from LLM:")
print(response.content)



Response from LLM:
1. Economic conditions, including slow growth or recession, high unemployment, inflation, tighter credit, higher interest rates, and currency fluctuations.
2. Political conditions, trade and other international disputes, war, terrorism, natural disasters, public health issues, industrial accidents, and other business interruptions.
3. Change in tax laws, adverse changes in tax laws, tax liabilities, uncertain tax positions.
4. Security breaches, cyber-attacks, product, system security, and data protection breaches.
5. Manufacturing and supply chain risks, disruptions in manufacturing or logistics, design and manufacturing defects.
6. Stock market fluctuations, volatility of stock price, concentration of stock ownership.
7. Aggressive price competition, frequent product introductions and transitions, short product life cycles.
8. Evolving industry standards, commodity pricing fluctuations, industry-wide shortage and significant commodity pricing fluctuations.
9. Inte

## Why "Apple" Queries Can Fail in Vector-Cypher Retrieval

When you ask a question like "What are the risks that Apple faces?" using a vector-Cypher retriever, you may not get the structured or complete answer you expect. Here’s why:

- **How Vector-Cypher Works:**  
  - The retrieval process first performs a semantic search over all text chunks in the graph.
  - It retrieves the top-k chunks most similar to your query—regardless of which company (or entity) they are about.
  - The Cypher query then starts from each chunk and traverses the graph for related information.

- **The Problem with Entity-Centric Queries:**  
  - If your query is about "Apple," but there are no chunks whose text is semantically similar to your query and also specifically about Apple, the retriever may return:
    - Chunks about other companies.
    - Chunks that mention "risk" but not "Apple."
    - Generic or boilerplate risk factor text.
  - The Cypher query can only traverse from the retrieved chunk—it cannot "filter" or "redirect" to Apple if the chunk isn’t already about Apple.

- **Key Limitation:**  
  - **The chunk is the anchor.** If your query is about an entity (like Apple), but the chunk retrieval is not entity-aware, you may never reach the correct node or context in the graph.
  - This is especially problematic for broad or entity-centric questions, where you want to aggregate or summarize information about a specific node (e.g., a company) rather than just retrieve semantically similar passages.

> **Conclusion:**  
> Vector-Cypher retrieval is powerful for finding relevant context, but it is fundamentally limited by the chunk-centric approach. For entity-centric questions, you need either:
> - Chunks that are explicitly about the entity, or
> - A retrieval/query strategy that starts from the entity node itself, not from arbitrary text chunks.

## VectorCypherRetriever Example: Detailed Search with Context — Why This Is a Good Query

This example demonstrates how to use `VectorCypherRetriever` to answer nuanced, relationship-driven questions by combining semantic search with graph traversal.

**Why this is a good query:**

- **Semantic Relevance:**  
  The vector retriever surfaces text chunks that are semantically similar to the question ("Who are the asset managers most affected by banking regulations?"). This grounds the search in passages that actually discuss asset managers and regulations.

- **Graph Contextualization:**  
  The Cypher query starts from each relevant chunk and traverses the graph to:
    - Find the source document.
    - Identify the company associated with the document.
    - Traverse to asset managers that own or are related to that company.

- **Structured and Contextual Output:**  
  The result provides:
    - The company name (`company`)
    - The asset manager’s name (`manager`)
    - The text context from the original chunk (`context`)

- **Why it works well:**  
  - The question is naturally chunk-centric: you want to find meaningful passages about asset managers and regulations, then extract structured information about the entities involved.
  - The graph traversal enriches the answer, linking unstructured context (text) to structured graph entities (companies, managers).
  - This approach leverages the best of both worlds: semantic search for relevance, and graph queries for structure and relationships.

**Summary:**  
This pattern is ideal when your question is about relationships or context that can be surfaced from relevant passages, and when you want to return both the context and the structured entities connected to it.  
It is less effective for purely entity-centric aggregation (like "all risks for Apple"), but perfect for questions where the chunk is the natural anchor for graph exploration.


In [8]:
asset_manager_query_text = "Which Asset Managers are most affected by cryptocurrency policies?"
query_vector = embedder.embed_query(asset_manager_query_text)

chunk_to_asset_manager_query = """
 MATCH (n) WHERE elementId(n) = $node_id
 WITH n as node
MATCH (node)-[:FROM_DOCUMENT]-(doc:Document)-[:FILED]-(company:Company)-[:OWNS]-(manager:AssetManager)
RETURN company.name AS company, manager.managerName AS AssetManagerWithSharesInCompany, node.text AS context
"""

vector_search_asset_manager_df, node_ids_asset_manager = vectorSearchNeo4j(query_vector)
vector_search_asset_manager_df

Retrieved Chunks:


Unnamed: 0,text,node_id,score
0,cryptocurrency assets could be treated as a ge...,4:3204c234-a248-4a1f-8797-165c95b59f64:927,0.904678
1,ncy offerings could subject us to additional\n...,4:3204c234-a248-4a1f-8797-165c95b59f64:926,0.897812
2,"changes in cryptocurrencies, government crypto...",4:3204c234-a248-4a1f-8797-165c95b59f64:480,0.89624
3,"may continue to result\nin, disruption of and ...",4:3204c234-a248-4a1f-8797-165c95b59f64:489,0.889496
4,"activity, government fiscal and tax\npolicies,...",4:3204c234-a248-4a1f-8797-165c95b59f64:938,0.887054
5,pliance with such requirements can be onerous ...,4:3204c234-a248-4a1f-8797-165c95b59f64:490,0.88623
6,on royalties for our intellectual property.\nC...,4:3204c234-a248-4a1f-8797-165c95b59f64:1329,0.885498
7,While we maintain insurance policies intended ...,4:3204c234-a248-4a1f-8797-165c95b59f64:923,0.884918
8,vendors and large cloud services companies. \n...,4:3204c234-a248-4a1f-8797-165c95b59f64:470,0.88443
9,which we do business\nwould also likely harm ...,4:3204c234-a248-4a1f-8797-165c95b59f64:485,0.884384


In [9]:
context_records_asset_manager = []
context_records_asset_manager, df_contextual_search_asset_manager = getGraphRagResponse(chunk_to_asset_manager_query, node_ids_asset_manager,context_records_asset_manager)
df_contextual_search_asset_manager

Enriched Context Records Found:


Unnamed: 0,company,AssetManagerWithSharesInCompany,context
0,NVIDIA CORPORATION,ALLIANCEBERNSTEIN L.P.,"changes in cryptocurrencies, government crypto..."
1,NVIDIA CORPORATION,AMERIPRISE FINANCIAL INC,"changes in cryptocurrencies, government crypto..."
2,NVIDIA CORPORATION,AMUNDI,"changes in cryptocurrencies, government crypto..."
3,NVIDIA CORPORATION,BANK OF AMERICA CORP /DE/,"changes in cryptocurrencies, government crypto..."
4,NVIDIA CORPORATION,Bank of New York Mellon Corp,"changes in cryptocurrencies, government crypto..."
...,...,...,...
79,NVIDIA CORPORATION,MORGAN STANLEY,which we do business\nwould also likely harm ...
80,NVIDIA CORPORATION,NORTHERN TRUST CORP,which we do business\nwould also likely harm ...
81,NVIDIA CORPORATION,STATE STREET CORP,which we do business\nwould also likely harm ...
82,NVIDIA CORPORATION,WELLINGTON MANAGEMENT GROUP LLP,which we do business\nwould also likely harm ...


In [31]:
CYPHER_GENERATION_TEMPLATE = """
You are an expert Neo4j Cypher translator who understands natural language questions and generates Cypher queries strictly based on the Neo4j Schema provided.

Instructions:
- Use aliases for nodes and relationships.
- Generate Cypher queries compatible with Neo4j 5.
- Do not use EXISTS or SIZE.
- Avoid LIMIT clauses.
- Always use case-insensitive fuzzy matching for string properties (e.g., toLower(c.name) CONTAINS 'apple').
- Use only nodes and relationships from this schema.

Schema:
{schema}

Examples:

Human: Find all companies owned by BlackRock Inc
Assistant: ```MATCH (am:AssetManager)-[:OWNS]->(c:Company) WHERE toLower(am.managerName) CONTAINS 'blackrock inc' RETURN c.name```

Human: List risk factors for Apple Inc
Assistant: ```MATCH (c:Company)-[:FACES_RISK]->(r:RiskFactor) WHERE toLower(c.name) CONTAINS 'apple' RETURN r.name```

Human: List documents mentioning Apple Inc and their associated risk factors
Assistant: ```MATCH (c:Company)-[:FACES_RISK]->(r:RiskFactor)WHERE toLower(c.name) CONTAINS 'apple inc' WITH c,r MATCH (doc:Document)-[]-(c)  RETURN doc.id, collect(DISTINCT r.name) AS risks```

Human: Find executives who are associated with companies that BlackRock Inc owns
Assistant: ```MATCH (am:AssetManager)-[:OWNS]->(c:Company)<-[:HAS_ENTITY]-(e:Executive) WHERE toLower(am.managerName) CONTAINS 'blackrock inc' RETURN e.name, c.name```

Human: Find transactions that mention products mentioned in documents filed by companies
Assistant: ```MATCH (d:Document)-[:FILED]-(c:Company) MATCH (d)-[:MENTIONS]->(p:Product) MATCH (t:Transaction)-[:MENTIONS]->(p) RETURN DISTINCT c.name, t.name```

Human: What metrics are Microsoft transactions associated with?
Assistant: ```MATCH (c:Company)-[:MENTIONS]-(t:Transaction) WHERE toLower(c.name) CONTAINS 'microsoft' WITH t MATCH (t)-[:HAS_METRIC]->(m:FinancialMetric) RETURN DISTINCT m.name```

Human: {question}
Assistant:
"""


In [28]:
from langchain.prompts.prompt import PromptTemplate


CYPHER_GENERATION_PROMPT = PromptTemplate(
    input_variables=['schema','question'], validate_template=True, template=CYPHER_GENERATION_TEMPLATE
)

from langchain.graphs import Neo4jGraph

graph = Neo4jGraph(
    url=NEO4J_URI, 
    username=NEO4J_USERNAME, 
    password=NEO4J_PASSWORD,
    database="neo4j"
)

chain = GraphCypherQAChain.from_llm(
    llm,
    graph=graph,
    cypher_prompt=CYPHER_GENERATION_PROMPT,
    verbose=True,
    allow_dangerous_requests=True,
    return_direct=True,top_k=500
)

import json
def chat(que):
    r = chain.invoke(que)
    print(r)
    summary_prompt_tpl = f"""Context: {json.dumps(r['result'])}
    * Summarise the above fact as if you are answering this question "{r['query']}"
    * When the fact is not empty, assume the question is valid and the answer is true
    * Do not return helpful or extra text or apologies
    * List the results in rich text format if there are more than one results
    * When you summarize, properly analyze ALL rows of data, if multiple year comparisons are asked, properly analyze and respond.
    * DO NOT LIMIT THE NUMBER OF ROWS WHEN YOU SUMMARIZE
    Answer:
    """
    return llm.invoke(summary_prompt_tpl).content



In [39]:

r2 = chat("""What metrics are Nvidia transactions associated with?""")
print(f"Final answer: {r2}")



[1m> Entering new GraphCypherQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (t:Transaction)-[:MENTIONS]->(:Product {name: 'Nvidia'})-[:HAS_METRIC]->(m:FinancialMetric) RETURN DISTINCT m.name[0m

[1m> Finished chain.[0m
{'query': 'What metrics are Nvidia transactions associated with?', 'result': []}
Final answer: - Nvidia transactions are associated with metrics related to revenue, operating income, net income, and earnings per share.
