### Implementation of HippoRAG using only APIs to search over a PDF file

#### PDF extraction

In [1]:
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI

import os
load_dotenv()

from uuid import uuid4
unique_id = uuid4().hex[0:8]
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = f"HippoRag-API - {unique_id}"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
# os.environ["LANGCHAIN_API_KEY"] = "LANGCHAIN_API_KEY"  # Update to your API key

In [2]:
# Extract text from the PDF

pdf_file_path = "/Users/viewitpro/Downloads/HIDROPONIA.pdf"
# pdf_file_path = "/Users/viewitpro/Downloads/MANUAL_DUSTER.pdf"

pdf_loader = PyPDFLoader(pdf_file_path)
pdf_pages = pdf_loader.load_and_split()
pdf_full_text = "\n".join([d.page_content for d in pdf_pages])

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=5000,
    chunk_overlap=0,
    length_function=len,
    is_separator_regex=False,
)

passages_doc_arr = text_splitter.create_documents([pdf_full_text])

#### RippoRAG INDEXING 
- Extract name entities from each passage

- Create Triplets
- Encode Entities (OpenAI text-embedding-3-small with 256 vector size)
- Find similar entities to create E' relations triplets (THIS IS NOT IMPLEMENTED)
- Add to GraphDB

In [3]:
from langchain_core.prompts import ChatPromptTemplate
from langchain.output_parsers.json import SimpleJsonOutputParser


llm_model = "gpt-3.5-turbo-1106" # 1106 works better extracting more entities

# changed the prompt to include "in the same language as the paragraph"
extract_entities_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", """Your task is to extract named entities from the given paragraph, in the same language as the paragraph.
Respond with a JSON list of entities."""),
        ("human", """Paragraph:
```
Radio City
Radio City is India's first private FM radio station and was started on 3 July 2001.
It plays Hindi, English and regional songs.
Radio City recently forayed into New Media in May 2008 with the launch of a music portal - PlanetRadiocity.com that offers music related news, videos, songs, and other music-related features.
```"""),
        ("ai", """{{"named_entities":
    ["Radio City", "India", "3 July 2001", "Hindi", "English", "May 2008", "PlanetRadiocity.com"]
}}"""),
        ("human", """Paragraph:```
{passage_text}
```"""),
    ]
)

extract_triplets_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", """Your task is to construct an RDF (Resource Description Framework) graph from the given passages and named entity lists. 
Respond with a JSON list of triples, with each triple representing a relationship in the RDF graph. 

Pay attention to the following requirements:
- Each triple should contain at least one, but preferably two, of the named entities in the list for each passage.
- Clearly resolve pronouns to their specific names to maintain clarity.
"""),
        ("human", """Convert the paragraph into a JSON dict, it has a named entity list and a triple list.
Paragraph:
```
Radio City
Radio City is India's first private FM radio station and was started on 3 July 2001.
It plays Hindi, English and regional songs.
Radio City recently forayed into New Media in May 2008 with the launch of a music portal - PlanetRadiocity.com that offers music related news, videos, songs, and other music-related features.
```

{{"named_entities":
    ["Radio City", "India", "3 July 2001", "Hindi", "English", "May 2008", "PlanetRadiocity.com"]
}}"""),
        ("ai", """{{"triples": [
            ["Radio City", "located in", "India"],
            ["Radio City", "is", "private FM radio station"],
            ["Radio City", "started on", "3 July 2001"],
            ["Radio City", "plays songs in", "Hindi"],
            ["Radio City", "plays songs in", "English"]
            ["Radio City", "forayed into", "New Media"],
            ["Radio City", "launched", "PlanetRadiocity.com"],
            ["PlanetRadiocity.com", "launched in", "May 2008"],
            ["PlanetRadiocity.com", "is", "music portal"],
            ["PlanetRadiocity.com", "offers", "news"],
            ["PlanetRadiocity.com", "offers", "videos"],
            ["PlanetRadiocity.com", "offers", "songs"]
    ]
}}"""),
        ("human", """Convert the paragraph into a JSON dict, it has a named entity list and a triple list.
Paragraph:
```
{passage_text}
```

{named_entities}"""),
    ]
)

# passages_arr = [ { "id":idx, "text":passage.page_content } for idx, passage in enumerate(passages_doc_arr[:4])]
passages_arr = [ { "id":idx, "text":passage.page_content } for idx, passage in enumerate(passages_doc_arr)]

for passage in passages_arr:
    json_output_parser = SimpleJsonOutputParser()
    chain_entities = extract_entities_prompt | ChatOpenAI(model=llm_model, temperature=0.0) | json_output_parser
    named_entities = chain_entities.invoke({"passage_text": passage["text"]})
    passage["named_entities"] = named_entities["named_entities"]

    chain_triples = extract_triplets_prompt | ChatOpenAI(model=llm_model, temperature=0.0) | json_output_parser
    triples = chain_triples.invoke({"passage_text": passage["text"], "named_entities": named_entities})
    passage["triples"] = triples["triples"]



In [13]:
# add to the passage named_entities name entities created from the triples
for passage in passages_arr:
    named_entities = passage["named_entities"]
    named_entities = [entity.lower() for entity in named_entities]
    for triple in passage["triples"]:
        if len(triple) != 3:
            continue
        named_entities.extend([triple[0].lower(), triple[2].lower()])
    
    passage["named_entities"] = list(set(named_entities)) # remove duplicates

In [14]:
# create a unique id for each named entity
named_entities_dict = {}
named_entities_next_id = 0

for passage in passages_arr:
    named_entities = passage["named_entities"]
    for idx, named_entity in enumerate(named_entities):
        named_entities_dict[named_entity] = named_entities_next_id
        named_entities_next_id += 1


In [15]:
# Encode Entities
entities = []
for passage in passages_arr:
    for entity in passage["named_entities"]:
        entities.append(entity)

# Embedding Entities using OpenAI
openai_embeddings = OpenAIEmbeddings(model="text-embedding-3-small", dimensions=256)
entities_embeddings = openai_embeddings.embed_documents(entities)

In [16]:
# Update passages_arr with entity embeddings
pos = 0
for passage in passages_arr:
    passage["entity_embeddings"] = []
    for _ in passage["named_entities"]:
        passage["entity_embeddings"].append(entities_embeddings[pos])
        pos += 1

In [8]:
# for passage in passages_arr:
#     print(passage["named_entities"])
#     print(passage["triples"])
#     print("\n\n\n")


In [17]:
# Add to GraphDB 
from neo4j import GraphDatabase

URI = os.environ.get("NEO4J_URI")
AUTH = (os.environ.get("NEO4J_USER"), os.environ.get("NEO4J_PASSWORD"))

def create_triplets(tx, triplets):
    query = """
    UNWIND $triplets AS triplet
    MERGE (a:Entity {name: triplet.subject, passageId: triplet.passageId, embedding: triplet.embedding_subject, node_id: triplet.node_id_subject})
    MERGE (b:Entity {name: triplet.object, passageId: triplet.passageId, embedding: triplet.embedding_object, node_id: triplet.node_id_object})
    MERGE (a)-[:RELATES_TO {type: triplet.predicate}]->(b)
    """
    tx.run(query, triplets=triplets)

with GraphDatabase.driver(URI, auth=AUTH) as driver:
    with driver.session() as session:
        triplets = []
        for passage in passages_arr:
            for triple in passage["triples"]:
                if len(triple) != 3:
                    continue
                # print(triple)
                predicate=triple[1].replace(" ", "_").upper()
                subject=triple[0].lower()
                object=triple[2].lower()
                triplets.append({
                    "subject": subject, 
                    "predicate": predicate, 
                    "object": object,
                    "passageId": passage["id"],
                    "embedding_subject": passage["entity_embeddings"][passage["named_entities"].index(subject)],
                    "embedding_object": passage["entity_embeddings"][passage["named_entities"].index(object)],
                    "node_id_subject": named_entities_dict[subject],
                    "node_id_object": named_entities_dict[object]
                })

        session.execute_write(lambda tx: create_triplets(tx, triplets))

        # create the vector index on  Neo4j
        def create_vector_index(tx):
            query = """
            CREATE VECTOR INDEX entitieDB IF NOT EXISTS
            FOR (m:Entity)
            ON m.embedding
            OPTIONS {indexConfig: {
                `vector.dimensions`: 256,
                `vector.similarity_function`: 'cosine'
            }}
            """
            tx.run(query)
        
        session.execute_write(lambda tx: create_vector_index(tx))


#### RippoRAG RETRIEVAL
- Extract name entities from query

- Vector search on GraphDB to find similar Entities
- PPR on GraphDB to find "good" nodes
- Rank passages from nodes found (THIS IS NOT IMPLEMENTED)
- Send passages + query to LLM for answer

In [18]:
# Extract name entities from query

def extract_entities_from_query(user_query):
    # This prompt is a simpler version o the original, it works better for small paragraphs and less entities and
    # in other languages like portuguese
    extract_entities_custom_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", """Your task is to extract all entities from the given paragraph, in the same language as the paragraph.
    Respond with a JSON list of entities like {{"entities":["entity1", "entity2", ...]}}"""),
            ("human", """Paragraph:```
    {passage_text}
    ```"""),
        ]
    )

    json_output_parser = SimpleJsonOutputParser()
    chain_query_entities = extract_entities_custom_prompt | ChatOpenAI(model=llm_model, temperature=0.0) | json_output_parser
    #chain_query_entities = extract_entities_prompt | ChatOpenAI(model=llm_model, temperature=0.0) | json_output_parser
    query_entities = chain_query_entities.invoke({"passage_text": user_query})
    query_entities["named_entities"] = query_entities["entities"] # change the name to named_entities

    return query_entities



In [19]:
#Vector search on GraphDB to find similar Entities

#https://neo4j.com/docs/cypher-manual/current/indexes/semantic-indexes/vector-indexes/

def vector_search_graphdb(query_entitie, min_score=0.8):
    openai_embeddings = OpenAIEmbeddings(model="text-embedding-3-small", dimensions=256)
    query_entitie_embedding = openai_embeddings.embed_query(query_entitie)
    results = []
    with GraphDatabase.driver(URI, auth=AUTH) as driver:
        with driver.session() as session:
            def search_vector(tx, query_embedding):
                cypher_query = f"""
                CALL db.index.vector.queryNodes('entitieDB', 3, {query_embedding}) YIELD node, score
                RETURN node.node_id as id, node.name as name, score
                """
                return tx.run(cypher_query).data()

            db_result = session.execute_read(lambda tx: search_vector(tx, query_entitie_embedding))
            
            for result in db_result:
                if result["score"] >= min_score:
                    results.append(result)

    return results



In [20]:
#PPR on GraphDB to find "good" nodes

# gds.pageRank only works on Neo4j 4.0 auraDS and its a payed feature on Neo4j 4.0 
# for this reason I will export the graph from auraDB to igraph and use the igraph implementation of PageRank  
import igraph as ig

def ppr_graphdb(nodes):

    passage_ids = []
    with GraphDatabase.driver(URI, auth=AUTH) as driver:
        with driver.session() as session:
            #get nodes and relations from graphDB
            graph_nodes = session.execute_read(lambda tx: tx.run("MATCH (n) RETURN n.node_id AS node_id, n.passageId as passageId").data())
            relations = session.execute_read(lambda tx: tx.run("MATCH (a)-[r]->(b) RETURN a.node_id AS source, b.node_id AS target").data())

            # Create an igraph graph
            g = ig.Graph(directed=True) 
            # add nodes
            for gnode in graph_nodes:
                g.add_vertex(name=str(gnode["node_id"]), labels=str(gnode["node_id"]), passageId=gnode["passageId"])

            # add edges
            g.add_edges([(str(rel["source"]), str(rel["target"])) for rel in relations])

            # Personalized PageRank
            personalization = [0] * len(g.vs)
            # Set personalization vector 
            personalization_value  = 1.0 / len(nodes)
            for node in nodes:
                idx = g.vs.find(name=str(node["id"])).index
                personalization[idx] = personalization_value
            #https://igraph.org/python/api/0.9.11/igraph._igraph.GraphBase.html#personalized_pagerank
            pagerank_scores = g.personalized_pagerank(damping=0.85, reset=personalization)
            # print(pagerank_scores)
            # print(g.vs["passageId"])
            # get sorted nodes by pagerank descending
            pagerank_nodes = sorted(zip(g.vs["passageId"], pagerank_scores), key=lambda x: x[1], reverse=True)
            # print(pagerank_nodes)

            for node in pagerank_nodes:
                if node[1] > 0:
                    print(node)
                    passage_ids.append(node[0])

    return passage_ids



In [24]:
# create the final answer with the passages
from langchain_core.output_parsers.string import StrOutputParser

# User question
user_query = "Quais as diferenças entre os modelos de casas de vegetação capela e dente-de-serra?"
# query = "Qual a capacidade do tanque de combustível da duster?"

# Extract name entities from query
query_entities = extract_entities_from_query(user_query)
print("Query entities:",query_entities)

# find the nodes on the graphDB that are similar to the named entities in the query
base_nodes = []
if len(query_entities["named_entities"]) == 0:
    print("No named entities found in the query")
else:
    for query_entitie in query_entities["named_entities"]:
        results = vector_search_graphdb(query_entitie)
        base_nodes.extend(results)

# get the passages ids that are "good" nodes
result_passages_ids = ppr_graphdb(base_nodes)

# create the context with the passages 
context = ""
unique_passages_id = list(set(result_passages_ids))
for passage_id in unique_passages_id:
    for passage in passages_arr:
        if passage["id"] == passage_id:
            context += passage["text"] + "\n\n"
            break

template = """Answer the question based only on the provided context, answer in the same languague as the question.
{context}
Question: {question}
"""
rag_prompt_text = ChatPromptTemplate.from_messages(
    [
        ("system", "Answer the question based only on the provided context, answer in the same languague as the question."),
        ("human", """Context:```
{context}
```

Question: {question}"""),
    ]
)
# Create the chain to answer the question, will use a stronger model like GPT-4o
final_chain = rag_prompt_text | ChatOpenAI(model="gpt-4o", temperature=0.0) | StrOutputParser()

# Invoke the chain with a query
answer = final_chain.invoke({"context": context, "question": user_query})

answer

Query entities: {'entities': ['modelos de casas de vegetação capela', 'dente-de-serra'], 'named_entities': ['modelos de casas de vegetação capela', 'dente-de-serra']}
(0, 0.2622475048242614)
(1, 0.2622475048242614)
(1, 0.1114551895503111)
(1, 0.1114551895503111)
(5, 0.04458207582012444)
(3, 0.02229103791006222)
(0, 0.02229103791006222)
(0, 0.02229103791006222)
(5, 0.02229103791006222)
(5, 0.02229103791006222)
(5, 0.02229103791006222)
(5, 0.02229103791006222)
(5, 0.02229103791006222)
(2, 0.0063157940745176295)
(3, 0.0063157940745176295)
(3, 0.0063157940745176295)
(2, 0.005368424963339984)
(3, 0.002684212481669992)
(3, 0.002684212481669992)


'Os modelos de casas de vegetação capela e dente-de-serra possuem diferenças significativas em termos de estrutura e adequação climática:\n\n1. **Modelo Capela**:\n   - **Estrutura**: Este modelo é assim chamado pela sua semelhança com uma capela, com telhado de duas águas iguais.\n   - **Adequação Climática**: É um modelo de uso generalizado nas cinco regiões geográficas do país, mas sua melhor utilização é em condições de clima quente e úmido (Centro-Oeste e Sudeste), quente e seco (Nordeste) e equatorial, como o da Amazônia.\n\n2. **Modelo Dente-de-Serra**:\n   - **Estrutura**: Tem esse nome porque o perfil transversal deste modelo lembra a figura de dentes de serra.\n   - **Adequação Climática**: É indicado para condições de elevada temperatura e umidade, como as condições climáticas predominantes nas Regiões Sudeste, Centro-Oeste, Nordeste e Amazônica.\n\nEssas diferenças refletem a adaptação de cada modelo às condições climáticas específicas, visando otimizar o cultivo hidropônic