# Implement a Knowledge Graph–enhanced RAG (Retrieval-Augmented Generation) system using Wikipedia data.

#### Import Libraries

In [1]:
# !pip install --quiet --upgrade langchain-core langchain-community neo4j langchain-neo4j yfiles-jupyter-graphs 
# !pip install --quiet --upgrade langchain-google-genai langchain-experimental google-generativeai pydantic sentence-transformers yfiles-jupyter-graphs
# !pip install --quiet numpy==1.26.4 tf-keras
# !pip install --quiet -U class langchain-huggingface

In [2]:
from langchain_core.runnables import (
    RunnableBranch,
    RunnableLambda,
    RunnableParallel,
    RunnablePassthrough,
)
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts.prompt import PromptTemplate
from pydantic import BaseModel, Field
from typing import Tuple, List
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.output_parsers import StrOutputParser
import os
from langchain_neo4j import Neo4jGraph
from langchain_community.document_loaders import WikipediaLoader
from langchain.text_splitter import TokenTextSplitter
from neo4j import GraphDatabase
from yfiles_jupyter_graphs import GraphWidget
from langchain_community.vectorstores import Neo4jVector
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.vectorstores.neo4j_vector import remove_lucene_chars
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.prompts import ChatPromptTemplate
from google.api_core.exceptions import ResourceExhausted
import time
# Optional: for pretty output formatting
from langchain_core.output_parsers import StrOutputParser

##### Neo4j Environment and Credentials Setup



In [None]:
from getpass import getpass  # stdlib; only used here to prompt for missing secrets

# Non-secret connection settings: keep a value that is already exported,
# otherwise fall back to the defaults used by this notebook.
os.environ.setdefault("NEO4J_URI", "neo4j+s://df11d58e.databases.neo4j.io")
os.environ.setdefault("NEO4J_USERNAME", "neo4j")

# Secrets are never written into the notebook. They are taken from the
# environment when present; otherwise the user is prompted with hidden input,
# so the value does not end up in the saved cell source or output.
for _secret_name in ("GOOGLE_API_KEY", "NEO4J_PASSWORD"):
    if not os.environ.get(_secret_name):
        os.environ[_secret_name] = getpass(f"Enter {_secret_name}: ")

In [4]:
# Connect to the Neo4j database named "neo4j"; the graph built from the Wikipedia data is stored there.
# No URI or credentials are passed here — presumably Neo4jGraph picks up the NEO4J_* environment
# variables set in the previous cell (TODO confirm).
graph = Neo4jGraph(database="neo4j")

#### Data ingestion

For this demonstration, we will use Nykaa's (cosmetic e-commerce) Wikipedia page. We can utilize LangChain loaders to fetch and split the documents from Wikipedia seamlessly.

In [5]:
# Load the Wikipedia documents returned for the search query "Nykaa"
raw_documents = WikipediaLoader(query="Nykaa").load()
len(raw_documents) # number of documents loaded (not the length of a single document)

25

In [6]:
# Display every loaded document (metadata and page content); the output is long.
raw_documents

[Document(metadata={'title': 'Nykaa', 'summary': 'FSN E-Commerce Ventures Ltd, doing business as Nykaa, is an Indian retail company, headquartered in Mumbai. It sells beauty, wellness and fashion products through its website, mobile app and over 100 physical stores. In 2020, it became the first Indian unicorn startup headed by a woman.\nNykaa sells products which are manufactured in India as well as internationally. In 2015, the company expanded from online-only to an omnichannel model and began selling products apart from beauty. As of 2020, it retails over 2,000 brands and 200,000 products across its platforms.\n\n', 'source': 'https://en.wikipedia.org/wiki/Nykaa'}, page_content='FSN E-Commerce Ventures Ltd, doing business as Nykaa, is an Indian retail company, headquartered in Mumbai. It sells beauty, wellness and fashion products through its website, mobile app and over 100 physical stores. In 2020, it became the first Indian unicorn startup headed by a woman.\nNykaa sells products

In [7]:
# Token budget for each chunk, and how many tokens neighbouring chunks share.
chunksize = 1 << 8
chunk_overlap = chunksize // 8 + 1  # one eighth of a chunk, plus one token
print('chunksize :', chunksize)
print('chunk_overlap :', chunk_overlap)

chunksize : 256
chunk_overlap : 33


In [8]:
# Cut every loaded document into overlapping token windows that are small
# enough for Gemini to process.
text_splitter = TokenTextSplitter(
    chunk_overlap=chunk_overlap,
    chunk_size=chunksize,
)
documents = text_splitter.split_documents(raw_documents)
len(documents)

89

In [9]:
# Keep the first chunk seen for each distinct metadata['summary'] and set the
# rest aside in duplicate_docs.
# NOTE(review): chunks split from the same article appear to share one summary
# (25 loaded documents -> 25 "unique" chunks out of 89), so this keeps only the
# first chunk of every article and drops the article's remaining text from the
# graph-building step. Confirm that this is intended.
seen_summaries = set()  # summaries already represented in unique_docs (O(1) lookup)
unique_docs = []
duplicate_docs = []
for doc in documents:
    summary = doc.metadata['summary']
    if summary in seen_summaries:
        duplicate_docs.append(doc)
    else:
        seen_summaries.add(summary)
        unique_docs.append(doc)

print(f"Number of unique documents: {len(unique_docs)}")
print(f"Number of duplicate documents: {len(duplicate_docs)}")


Number of unique documents: 25
Number of duplicate documents: 64


- Looking at the unique documents

In [10]:
# Display the chunks kept after de-duplication; the output is long.
unique_docs

[Document(metadata={'title': 'Nykaa', 'summary': 'FSN E-Commerce Ventures Ltd, doing business as Nykaa, is an Indian retail company, headquartered in Mumbai. It sells beauty, wellness and fashion products through its website, mobile app and over 100 physical stores. In 2020, it became the first Indian unicorn startup headed by a woman.\nNykaa sells products which are manufactured in India as well as internationally. In 2015, the company expanded from online-only to an omnichannel model and began selling products apart from beauty. As of 2020, it retails over 2,000 brands and 200,000 products across its platforms.\n\n', 'source': 'https://en.wikipedia.org/wiki/Nykaa'}, page_content='FSN E-Commerce Ventures Ltd, doing business as Nykaa, is an Indian retail company, headquartered in Mumbai. It sells beauty, wellness and fashion products through its website, mobile app and over 100 physical stores. In 2020, it became the first Indian unicorn startup headed by a woman.\nNykaa sells products

##### Use LLM Transformer to create graph
- Now it's time to construct a graph based on the retrieved documents.
- For this purpose, we use the `LLMGraphTransformer` module (from `langchain_experimental`), which significantly simplifies constructing and storing a knowledge graph in a Neo4j graph database.

In [11]:
# Candidate Gemini model names in priority order. When the selected model hits its
# utilization limit, point `model` at the next candidate and re-run the cells below.
model_pro="gemini-2.5-pro" # Priority 1
model_2_5_lite = "gemini-2.5-flash-lite" # Priority 2
model_2_5="gemini-2.5-flash" # Priority 3
model_1_5="gemini-1.5-flash" # Priority 4
model_2_0="gemini-2.0-flash" # Priority 5
model_2_0_exp="gemini-2.0-flash-exp" # Priority 6
# Model used by every ChatGoogleGenerativeAI instance created in this notebook
model = model_2_5_lite

In [12]:
# Initialize the graph transformer, backed by the Gemini model selected above
llm_transformer = LLMGraphTransformer(llm=ChatGoogleGenerativeAI(model=model))

# Convert the de-duplicated Wikipedia chunks into graph documents
# (Subject-Predicate-Object triplets extracted by the LLM)
graph_documents = llm_transformer.convert_to_graph_documents(unique_docs)
print("Graph Transformer done successfully")

# Push these SPO triplets into Neo4j as nodes + relationships.
# NOTE(review): baseEntityLabel=True presumably adds the `__Entity__` label that the
# full-text index relies on later, and include_source=True presumably stores the source
# chunks as `Document` nodes linked by MENTIONS — confirm against the langchain-neo4j docs.
graph.add_graph_documents(
    graph_documents,
    baseEntityLabel=True,
    include_source=True
)


E0000 00:00:1758439072.134989 1064716 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


Graph Transformer done successfully


In [13]:
# Default Cypher query for the visualisation: any relationship with both of its
# end nodes, capped at 100 rows so the widget stays readable.
default_cypher = "MATCH (s)-[r]->(t) RETURN s,r,t limit 100"

def showGraph(cypher: str = default_cypher):
    """Run a Cypher query and display its result as an interactive graph widget.

    Args:
        cypher: Query whose result is rendered. Defaults to ``default_cypher``.

    Returns:
        None. The widget is shown through ``display`` as a side effect.

    Raises:
        KeyError: If NEO4J_URI, NEO4J_USERNAME or NEO4J_PASSWORD is not set.
    """
    # Both the driver and the session are context-managed so the connection is
    # released even when the query fails (they were previously left open).
    with GraphDatabase.driver(
        uri=os.environ["NEO4J_URI"],
        auth=(
            os.environ["NEO4J_USERNAME"],
            os.environ["NEO4J_PASSWORD"],
        ),
    ) as driver, driver.session() as session:
        # .graph() materialises the result, and the widget is built from it
        # while the session is still open.
        widget = GraphWidget(graph=session.run(cypher).graph())
    widget.node_label_mapping = 'id'  # label each node with its `id` property
    display(widget)  # render in the notebook; nothing is returned

# Show the Knowledge Graph!
showGraph()


GraphWidget(layout=Layout(height='800px', width='100%'))

##### Creating a vector and keyword index

In [14]:
# Build a hybrid (vector + keyword) index on top of the graph already stored in Neo4j.
# The `text` property of each `Document` node is embedded with the all-MiniLM-L6-v2
# sentence-transformer, and the vector is kept in the node's `embedding` property.
# Presumably only nodes that do not have an embedding yet are computed — TODO confirm.
vector_index = Neo4jVector.from_existing_graph(
    embedding=HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"),
    graph=graph,
    search_type="hybrid",
    node_label="Document",
    text_node_properties=["text"],
    embedding_node_property="embedding"
)


2025-09-21 12:50:19.132730: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


##### Graph retriever

- LLM response based on the prompt only, without any retrieved context

In [15]:
# Step 1: Create a full-text index named `entity_index` over the `id` property of
# `__Entity__` nodes. IF NOT EXISTS makes the cell safe to re-run.
graph.query("""
    CREATE FULLTEXT INDEX entity_index IF NOT EXISTS
    FOR (e:__Entity__) ON EACH [e.id]
""")

# Step 2: Define Output Schema
# Pydantic schema handed to `with_structured_output` when the entity chain is built.
# NOTE(review): the class docstring and the field description are presumably sent to the
# model as part of the schema, so treat them as prompt text — confirm before rewording.
class Entities(BaseModel):
    """Entity extraction from Nykaa Wikipedia article."""
    # Names of the entities found in the input text; required (`...` means no default).
    names: List[str] = Field(
        ...,
        description="All unique person names, product brands, organization or business entities names and locations mentioned in the input text."
    )

# Step 3: Prompt for entity extraction, assembled from its two messages.
entity_system_message = (
    "system",
    "Your job is to extract all relevant named entities that are people, organizations, product brands or the locations mentioned in the input text.",
)
entity_human_message = (
    "human",
    "Use the given format to extract information from the following input: {question}",
)
prompt = ChatPromptTemplate.from_messages([entity_system_message, entity_human_message])

# Step 4: Pipe the prompt into the selected Gemini model, constrained to the
# `Entities` output schema.
entity_llm = ChatGoogleGenerativeAI(model=model).with_structured_output(Entities)
entity_chain = prompt | entity_llm


E0000 00:00:1758439239.052288 1064716 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


In [16]:
# Smoke test: list the entity names extracted from a sample question.
entity_chain.invoke({"question": "Where did Falguni Nayar study?"}).names

['Falguni Nayar']

- The response based only on the LLM prompt does not give the desired result.
- This is because the LLM has no context about the question and therefore cannot provide the correct answer.
- This is the Generation part of the Retrieval-Augmented Generation (RAG) system, without the augmented context.

- We have a rich Graph Knowledge base created using the data from the wikipedia pages earlier.
- We can use this Graph Knowledge base to Augment the context LLM gets for the generation of the response.

In [17]:
def generate_full_text_query(input: str) -> str:
    """
    Generate a full-text search query for a given input string.

    The input is stripped of Lucene special characters and split on whitespace.
    Each word gets a fuzzy-match suffix (``~2``, i.e. up to two changed
    characters) and the words are combined with the AND operator. Useful for
    mapping entities from user questions to database values while allowing
    for some misspellings.

    Args:
        input: Free text, typically one entity name.

    Returns:
        The query string, e.g. ``"Falguni~2 AND Nayar~2"``. An empty string is
        returned when the input contains no searchable words (previously this
        case raised ``IndexError``).
    """
    # str.split() with no argument already drops empty tokens.
    words = remove_lucene_chars(input).split()
    return " AND ".join(f"{word}~2" for word in words)

# Fulltext index query
def structured_retriever(question: str) -> str:
    """
    Collect graph relationships for the entities mentioned in a question.

    Entities are detected in the question with ``entity_chain``. For each one,
    the ``entity_index`` full-text index is searched (top 2 nodes) and every
    non-MENTIONS relationship of the matched nodes is rendered, in both
    directions, as ``source - TYPE -> target`` (at most 50 rows per entity).

    Args:
        question: The user question.

    Returns:
        One relationship per line, newline-separated across all entities, or
        an empty string when nothing is found.
    """
    entities = entity_chain.invoke({"question": question})
    # Assumption: the structured-output call may yield None when nothing
    # parseable comes back; treat that as "no entities" instead of crashing.
    names = entities.names if entities is not None else []

    lines = []
    for entity in names:
        # An entity with no searchable words would produce an empty query.
        if not remove_lucene_chars(entity).split():
            continue
        response = graph.query(
            """CALL db.index.fulltext.queryNodes('entity_index', $query, {limit:2})
            YIELD node,score
            CALL {
              WITH node
              MATCH (node)-[r:!MENTIONS]->(neighbor)
              RETURN node.id + ' - ' + type(r) + ' -> ' + neighbor.id AS output
              UNION ALL
              WITH node
              MATCH (node)<-[r:!MENTIONS]-(neighbor)
              RETURN neighbor.id + ' - ' + type(r) + ' -> ' +  node.id AS output
            }
            RETURN output LIMIT 50
            """,
            {"query": generate_full_text_query(entity)},
        )
        lines.extend(el['output'] for el in response)
    # Join once at the end so rows from different entities stay on separate
    # lines (previously the last row of one entity ran into the first row of
    # the next).
    return "\n".join(lines)

In [18]:
# Fetch and print the graph relationships found for a sample question.
relations = structured_retriever("Where did Falguni Nayar study?")
print(relations)

  words = [el for el in remove_lucene_chars(input).split() if el]


Falguni Nayar - FOUNDER -> Nykaa
Falguni Nayar - FORMER_MANAGING_DIRECTOR -> Kotak Mahindra Capital Company
Falguni Nayar - CEO -> Nykaa
Falguni Nayar - FOUNDED -> Nykaa
Falguni Nayar - WORKED_AT -> Kotak Mahindra Group
Falguni Nayar - WORKED_AT -> A. F. Ferguson & Co
Falguni Nayar - BORN_IN -> Mumbai
Falguni Nayar - NATIONALITY -> Indian
Falguni Nayar - GRADUATED_IN -> 1985
Falguni Nayar - RAISED_IN -> Maharashtra
Falguni Nayar - GRADUATE_OF -> Sydenham College Of Commerce And Economics
Falguni Nayar - FORMER_MANAGING_DIRECTOR_AT -> Kotak Mahindra Capital Company
Falguni Nayar - BORN -> 19 February 1963
Falguni Nayar - BORN -> 1963
Falguni Nayar - FOUNDER_OF -> Nykaa
Falguni Nayar - FOUNDER_OF -> Fsn E-Commerce Ventures
Falguni Nayar - CEO_OF -> Nykaa
Falguni Nayar - RANKED_BY -> Forbes
Falguni Nayar - RANKED_ON -> October 09, 2024
Falguni Nayar - POSTGRADUATE_OF -> Indian Institute Of Management Ahmedabad
Falguni Nayar - JOINED -> Kotak Mahindra Group
Falguni Nayar - JOINED -> 1993
F

- The final retriever, where we combine the unstructured (vector/keyword) retriever and the graph retriever to create the context that will be passed to the LLM.

In [19]:
def retriever(question: str):
    """Build the combined context for a question.

    Prints the search query, gathers the graph relationships from
    ``structured_retriever`` and the page contents of the chunks returned by
    ``vector_index.similarity_search``, and returns both in one text block.
    """
    print(f"Search query: {question}")
    graph_context = structured_retriever(question)
    similar_chunks = vector_index.similarity_search(question)
    text_context = "#Document ".join(chunk.page_content for chunk in similar_chunks)
    return (
        "Structured data:\n"
        f"{graph_context}\n"
        "Unstructured data:\n"
        f"{text_context}\n"
        "    "
    )

##### RAG Pipeline definition

In [20]:
# Prompt that condenses a chat history plus a follow-up question into a single
# standalone question. It expects the variables `chat_history` and `question`.
_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question,
in its original language.
Chat History:
{chat_history}
Follow Up Input: {question}
Standalone question:"""  # noqa: E501
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

def _format_chat_history(chat_history: List[Tuple[str, str]]) -> List:
    """Turn (human, ai) text pairs into an alternating list of chat messages.

    Each pair contributes a ``HumanMessage`` followed by an ``AIMessage``,
    in the order the pairs were given.
    """
    messages = []
    for user_turn, assistant_turn in chat_history:
        messages.extend(
            (HumanMessage(content=user_turn), AIMessage(content=assistant_turn))
        )
    return messages

# Condition: true when the input carries a non-empty chat history.
_has_chat_history = RunnableLambda(lambda x: bool(x.get("chat_history"))).with_config(
    run_name="HasChatHistoryCheck"
)

# Branch taken with chat history: rewrite the follow-up question as a
# standalone question with the selected Gemini model.
_condense_question = (
    RunnablePassthrough.assign(
        chat_history=lambda x: _format_chat_history(x["chat_history"])
    )
    | CONDENSE_QUESTION_PROMPT
    | ChatGoogleGenerativeAI(model=model)
    | StrOutputParser()
)

# Without chat history the question is passed through unchanged.
_search_query = RunnableBranch(
    (_has_chat_history, _condense_question),
    RunnableLambda(lambda x: x["question"]),
)

E0000 00:00:1758439242.344363 1064716 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


- We introduce a prompt that leverages the context provided by the integrated hybrid retriever to produce the response, completing the implementation of the RAG chain.

In [21]:
# Answer prompt: the model may use only the retrieved context.
template = """Answer the question based only on the following context:
{context}

Question: {question}
Use natural language and be concise.
Answer:"""
prompt = ChatPromptTemplate.from_template(template)

# RAG chain:
#   1. `_search_query` turns the input dict into one standalone question string
#      (condensing the chat history when there is one).
#   2. That string is used both to retrieve the context and as the `question`
#      in the answer prompt. Previously the whole input dict, chat history
#      included, was rendered into the `Question:` slot.
#   3. The selected Gemini model answers and the reply is returned as plain text.
chain = (
    _search_query
    | RunnableParallel(
        {
            "context": RunnableLambda(retriever),
            "question": RunnablePassthrough(),
        }
    )
    | prompt
    | ChatGoogleGenerativeAI(model=model)
    | StrOutputParser()
)

E0000 00:00:1758439242.416210 1064716 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


##### Let's test response of the chain to user questions:

In [22]:
# Question without chat history: it is used as the search query as-is.
chain.invoke({"question": "Where did Falguni Nayar study?"})

Search query: Where did Falguni Nayar study?




'Falguni Nayar studied at Sydenham College Of Commerce And Economics and the Indian Institute Of Management Ahmedabad.'

In [23]:
time.sleep(5) # pause between calls to avoid the API utilization-limit error
chain.invoke({"question": "Who founded Nykaa?"})

Search query: Who founded Nykaa?




'Falguni Nayar founded Nykaa.'

In [24]:
time.sleep(5) # pause between calls to avoid the API utilization-limit error
# Follow-up question with chat history, so the condense-question branch is used
chain.invoke({"question": "When was Nykaa founded?",
              "chat_history": [("Who founded Nykaa?", "Falguni Nayar founded Nykaa.")],
              })

Search query: When was Nykaa founded?




'Nykaa was founded in April 2012.'

In [25]:
time.sleep(5) # pause between calls to avoid the API utilization-limit error
chain.invoke({"question": "Where is Nykaa located?"})

Search query: Where is Nykaa located?




'Nykaa is headquartered in Mumbai.'

In [26]:
time.sleep(5) # pause between calls to avoid the API utilization-limit error
chain.invoke({"question": "What brands sells in Nykaa?"})

Search query: What brands sells in Nykaa?




'Nykaa sells beauty products, wellness products, and fashion products.'

In [27]:
time.sleep(5) # pause between calls to avoid the API utilization-limit error
chain.invoke({"question": "What Organization are assosiated with Nykaa?"})

Search query: What Organization are assosiated with Nykaa?




'Nykaa is associated with Fsn E-Commerce Ventures Ltd, Falguni Nayar, and Guwahati City Centre Mall.'

In [28]:
time.sleep(5) # pause between calls to avoid the API utilization-limit error
chain.invoke({"question": "Who are the people assosiated with Nykaa?"})

Search query: Who are the people assosiated with Nykaa?




'Falguni Nayar is the founder and CEO of Nykaa. Sakshi Malik is a model for Nykaa. Mansi Aggarwal is a choreographer for Nykaa.'

In [29]:
time.sleep(5) # pause between calls to avoid the API utilization-limit error
chain.invoke({"question": "What is the revenue of Nykaa?"}) # checks the response when the retrieved context does not contain the answer

Search query: What is the revenue of Nykaa?




"The provided context does not contain information about Nykaa's revenue."

In [36]:
time.sleep(10) # pause between calls to avoid the API utilization-limit error
chain.invoke({"question": "What are products category sold in Nykaa?"})

Search query: What are products category sold in Nykaa?




'Nykaa sells beauty products, wellness products, and fashion products.'

In [31]:
time.sleep(61) # longer pause (just over a minute) to avoid the API utilization-limit error
# Follow-up question with chat history: "Nykaa's founder" has to be resolved from the history
chain.invoke({"question": "Where did Nykaa's founder study i.e got his/her education?",
              "chat_history": [("Who founded Nykaa?", "Falguni Nayar founded Nykaa.")],
              })

Search query: Where did Falguni Nayar study i.e got her education?




'Falguni Nayar studied at Sydenham College Of Commerce And Economics and Indian Institute Of Management Ahmedabad.'

In [32]:
time.sleep(5) # pause between calls to avoid the API utilization-limit error
chain.invoke({"question": "Who are compitition or similar to Nykaa?"})

Search query: Who are compitition or similar to Nykaa?




"Based on the context provided, there is no information about Nykaa's competitors."

In [33]:
time.sleep(5) # pause between calls to avoid the API utilization-limit error
chain.invoke({"question": "Any other company similar to Nykaa?"})

Search query: Any other company similar to Nykaa?




'Meesho Private Limited is an Indian e-commerce company that operates in categories such as fashion, home and kitchen, beauty and personal care.'

In [34]:
time.sleep(5) # pause between calls to avoid the API utilization-limit error
chain.invoke({"question": "Who are the persons related to Nykaa's founder?"})

Search query: Who are the persons related to Nykaa's founder?




"The provided context does not contain information about other persons related to Nykaa's founder."

In [35]:
time.sleep(5) # pause between calls to avoid the API utilization-limit error
chain.invoke({"question": "When and where was Nykaa's founder born?"})

Search query: When and where was Nykaa's founder born?




'Falguni Nayar was born in Mumbai on February 19, 1963.'