### Implementation of HippoRAG using only APIs to search over a PDF file

#### V2  
- Change the prompts to use "entities" in place of "named entities"; this seems to work better for Brazilian Portuguese
- Add node specificity calculation on ppr_graphdb

#### PDF extraction

In [17]:
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI
import numpy as np
import faiss
import pickle
from langchain_core.prompts import ChatPromptTemplate
from langchain.output_parsers.json import SimpleJsonOutputParser
from neo4j import GraphDatabase
import os
# Load environment variables (API keys, Neo4j settings) via python-dotenv.
load_dotenv()

from uuid import uuid4

# Short random suffix so each notebook run is traced under its own project name.
unique_id = uuid4().hex[:8]

# LangChain tracing configuration.
os.environ.update(
    {
        "LANGCHAIN_TRACING_V2": "true",
        "LANGCHAIN_PROJECT": f"HippoRag-API-v2.1 - {unique_id}",
        "LANGCHAIN_ENDPOINT": "https://api.smith.langchain.com",
    }
)
# os.environ["LANGCHAIN_API_KEY"] = "LANGCHAIN_API_KEY"  # Update to your API key

# Chat model used by the entity / triple extraction chains.
llm_model = "gpt-3.5-turbo-1106"  # 1106 works better extracting more entities

In [2]:
# Read the PDF and cut its full text into fixed-size passages.

pdf_file_path = "/Users/viewitpro/Downloads/HIDROPONIA.pdf"
# pdf_file_path = "/Users/viewitpro/Downloads/MANUAL_DUSTER.pdf"

pdf_loader = PyPDFLoader(pdf_file_path)
pdf_pages = pdf_loader.load_and_split()

# Glue every loaded chunk back into one string so that the passage boundaries
# are decided only by the splitter configured below.
pdf_full_text = "\n".join(page.page_content for page in pdf_pages)

# Passages of up to 5000 characters (measured with len), without overlap.
text_splitter = RecursiveCharacterTextSplitter(
    is_separator_regex=False,
    length_function=len,
    chunk_overlap=0,
    chunk_size=5000,
)

passages_doc_arr = text_splitter.create_documents([pdf_full_text])

#### HippoRAG INDEXING 
- Extract named entities from each passage

- Create Triplets
- Create P matrix N x P with number of times n appears in p 
- Encode Entities (OpenAI text-embedding-3-small with 256 vector size)
- Find similar entities to create E' relations triplets 
- Add to GraphDB

In [3]:
# Extract name entities from each passage


# changed the prompt to include "in the same language as the paragraph"
# One-shot prompt for entity extraction: a system instruction, one worked
# example (human paragraph -> ai JSON answer) and finally the passage to process.
# Doubled braces ({{ }}) are literal braces in the rendered prompt;
# {passage_text} is the only template variable.
extract_entities_prompt = ChatPromptTemplate.from_messages(
    [
        # Task description; asks for the entities in the paragraph's own language.
        ("system", """Your task is to extract entities from the given paragraph, in the same language as the paragraph.
Respond with a JSON list of entities."""),
        # Worked example: input paragraph.
        ("human", """Paragraph:
```
Radio City
Radio City is India's first private FM radio station and was started on 3 July 2001.
It plays Hindi, English and regional songs.
Radio City recently forayed into New Media in May 2008 with the launch of a music portal - PlanetRadiocity.com that offers music related news, videos, songs, and other music-related features.
```"""),
        # Worked example: expected answer, a JSON object with an "entities" list.
        ("ai", """{{"entities":
    ["Radio City", "India", "3 July 2001", "Hindi", "English", "May 2008", "PlanetRadiocity.com"]
}}"""),
        # The actual passage to extract entities from.
        ("human", """Paragraph:```
{passage_text}
```"""),
    ]
)

# One-shot prompt for triple extraction. Template variables: {passage_text}
# (the passage) and {entities} (the entity list extracted for that passage).
# Doubled braces ({{ }}) are literal braces in the rendered prompt.
# The worked "ai" answer must itself be valid JSON, because the model imitates
# it and the reply is parsed as JSON downstream.
extract_triplets_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", """Your task is to construct an RDF (Resource Description Framework) graph from the given passages and entity lists. 
Respond with a JSON list of triples, with each triple representing a relationship in the RDF graph. 

Pay attention to the following requirements:
- Each triple should contain at least one, but preferably two, of the named entities in the list for each passage.
- Clearly resolve pronouns to their specific names to maintain clarity.
"""),
        ("human", """Convert the paragraph into a JSON dict, it has a named entity list and a triple list.
Paragraph:
```
Radio City
Radio City is India's first private FM radio station and was started on 3 July 2001.
It plays Hindi, English and regional songs.
Radio City recently forayed into New Media in May 2008 with the launch of a music portal - PlanetRadiocity.com that offers music related news, videos, songs, and other music-related features.
```

{{"entities":
    ["Radio City", "India", "3 July 2001", "Hindi", "English", "May 2008", "PlanetRadiocity.com"]
}}"""),
        ("ai", """{{"triples": [
            ["Radio City", "located in", "India"],
            ["Radio City", "is", "private FM radio station"],
            ["Radio City", "started on", "3 July 2001"],
            ["Radio City", "plays songs in", "Hindi"],
            ["Radio City", "plays songs in", "English"],
            ["Radio City", "forayed into", "New Media"],
            ["Radio City", "launched", "PlanetRadiocity.com"],
            ["PlanetRadiocity.com", "launched in", "May 2008"],
            ["PlanetRadiocity.com", "is", "music portal"],
            ["PlanetRadiocity.com", "offers", "news"],
            ["PlanetRadiocity.com", "offers", "videos"],
            ["PlanetRadiocity.com", "offers", "songs"]
    ]
}}"""),
        ("human", """Convert the paragraph into a JSON dict, it has a entity list and a triple list.
Paragraph:
```
{passage_text}
```

{entities}"""),
    ]
)

import json

# One plain dict per passage; "id" is the passage's position in passages_doc_arr.
# To try the pipeline on a few passages only, slice the input: passages_doc_arr[:4]
passages_arr = [
    {"id": idx, "text": passage.page_content}
    for idx, passage in enumerate(passages_doc_arr)
]

# Build both chains once: nothing in them depends on the passage being processed.
json_output_parser = SimpleJsonOutputParser()
extraction_llm = ChatOpenAI(model=llm_model, temperature=0.0)
chain_entities = extract_entities_prompt | extraction_llm | json_output_parser
chain_triples = extract_triplets_prompt | extraction_llm | json_output_parser

for passage in passages_arr:
    # Defaults keep the passage usable downstream even if extraction fails.
    passage["named_entities"] = []
    passage["triples"] = []
    try:
        named_entities = chain_entities.invoke({"passage_text": passage["text"]})
        passage["named_entities"] = named_entities["entities"]

        triples = chain_triples.invoke(
            {
                "passage_text": passage["text"],
                # Serialize as JSON (double quotes, unescaped accents) so the
                # entity list has the same format as the prompt's worked example
                # instead of a Python dict repr.
                "entities": json.dumps(named_entities, ensure_ascii=False),
            }
        )
        passage["triples"] = triples["triples"]
    except Exception as e:
        # Best effort: report the failure and move on to the next passage.
        print(f"Error processing passage: {e}")
        continue



In [19]:
# save the passages to a file
# The context manager closes (and flushes) the file even if pickling fails.
with open("passages_arr.pkl", "wb") as passages_file:
    pickle.dump(passages_arr, passages_file)

In [2]:
# load the passages from a file
# NOTE: unpickling can execute arbitrary code; only load a file written by this notebook.
with open("passages_arr.pkl", "rb") as passages_file:
    passages_arr = pickle.load(passages_file)

In [3]:
# Extend each passage's entity list with the subjects and objects of its
# triples, lower-casing every name so equal entities compare equal.
for passage in passages_arr:
    named_entities = [entity.lower() for entity in passage["named_entities"]]
    for triple in passage["triples"]:
        # Ignore malformed triples (anything that is not subject/predicate/object).
        if len(triple) != 3:
            continue
        named_entities.extend([triple[0].lower(), triple[2].lower()])

    # Remove duplicates while keeping first-seen order. A set would iterate in a
    # different order on every interpreter run (string hash randomization), and
    # entity ids are assigned from the order of this list.
    passage["named_entities"] = list(dict.fromkeys(named_entities))

In [4]:
# Give every distinct entity name a sequential integer id (0, 1, 2, ...),
# in order of first appearance across the passages.
named_entities_dict = {}

for passage in passages_arr:
    for named_entity in passage["named_entities"]:
        # setdefault only inserts unseen names; the current dict size is the
        # next free id because ids are handed out contiguously from 0.
        named_entities_dict.setdefault(named_entity, len(named_entities_dict))

# Next id that would be assigned.
named_entities_next_id = len(named_entities_dict)


In [None]:
# Create the P matrix: one row per entity id, one column per passage, holding
# the number of times the entity's name occurs in the passage text
# (case-insensitive substring count).

n_p_matrix = np.zeros((len(named_entities_dict), len(passages_arr)))
for idx, passage in enumerate(passages_arr):
    # Lower-case the passage once instead of once per entity.
    passage_text = passage["text"].lower()
    for named_entity in passage["named_entities"]:
        if not named_entity:
            # str.count("") returns len(text) + 1, which would swamp real counts.
            continue
        # Look the id up with the exact key it was registered under.
        named_entity_id = named_entities_dict[named_entity]
        n_p_matrix[named_entity_id, idx] = passage_text.count(named_entity.lower())


# np.set_printoptions(edgeitems=10,linewidth=180)
# print(n_p_matrix)

In [6]:
# Encode Entities

# Entity names in dict insertion order.
entities = list(named_entities_dict)

# Embed every entity name with OpenAI (256-dimensional vectors).
openai_embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
    dimensions=256,
)
entities_embeddings = openai_embeddings.embed_documents(entities)

In [7]:
# Find similar entities to create E' relations triplets

# Build the index (256-dimensional vectors, 32 HNSW links per node)
faiss_index = faiss.IndexHNSWFlat(256, 32)

# FAISS operates on C-contiguous float32 arrays; a plain np.array() of Python
# floats would be float64.
vectors = np.ascontiguousarray(entities_embeddings, dtype=np.float32)
# Add the vectors to the index
faiss_index.add(vectors)

# Perform the search: each entity against the whole index. k = 2 because the
# nearest hit is normally the entity itself.
k = 2
distances, indices = faiss_index.search(vectors, k)

# NOTE(review): IndexHNSWFlat defaults to the L2 metric, so this threshold is
# compared against L2-type distances, not cosine similarity - confirm the value.
filter_distance = 0.7
# Create the similar entities list
similar_entities = []
for idx, entity in enumerate(entities):
    for neighbor_idx, distance in zip(indices[idx], distances[idx]):
        neighbor_idx = int(neighbor_idx)
        # FAISS pads missing neighbours with -1, which would otherwise index
        # the last entity; also skip the entity matching itself.
        if neighbor_idx < 0 or neighbor_idx == idx:
            continue
        if distance <= filter_distance:
            similar_entities.append(
                {
                    "entity": entity,
                    "similar_entity": entities[neighbor_idx],
                }
            )


In [None]:
# for passage in passages_arr:
#     print(passage["named_entities"])
#     # print(passage["triples"])
#     print("\n\n\n")


In [9]:
# Neo4j connection settings read from the environment; unset variables yield None.
URI = os.getenv("NEO4J_URI")
AUTH = tuple(os.getenv(name) for name in ("NEO4J_USER", "NEO4J_PASSWORD"))

In [40]:
# Add to GraphDB


def create_all_entities(tx, all_entities):
    """Upsert one :Entity node per entity name.

    Nodes are merged on ``name`` only and the other properties are SET, so
    running this again updates existing nodes instead of creating a second
    node whenever ``node_id`` or ``embedding`` differ.

    Args:
        tx: Neo4j managed transaction.
        all_entities: list of dicts with keys ``name``, ``node_id``, ``embedding``.
    """
    query = """
    UNWIND $all_entities AS ae
    MERGE (a:Entity {name: ae.name})
    SET a.node_id = ae.node_id, a.embedding = ae.embedding
    """
    tx.run(query, all_entities=all_entities)


def create_relateto_relationships(tx, triplets):
    """Create a RELATES_TO edge (predicate stored in ``type``) for every triple.

    Args:
        tx: Neo4j managed transaction.
        triplets: list of dicts with keys ``subject``, ``predicate``, ``object``.
    """
    query = """
    UNWIND $triplets AS triplet
    MATCH (a:Entity {name: triplet.subject}), (b:Entity {name: triplet.object})
    MERGE (a)-[:RELATES_TO {type: triplet.predicate}]->(b)
    """
    tx.run(query, triplets=triplets)


def create_vector_index(tx):
    """Create the 256-dimension cosine vector index on Entity.embedding if missing."""
    query = """
    CREATE VECTOR INDEX entitieDB IF NOT EXISTS
    FOR (m:Entity)
    ON m.embedding
    OPTIONS {indexConfig: {
        `vector.dimensions`: 256,
        `vector.similarity_function`: 'cosine'
    }}
    """
    tx.run(query)


def create_similar_entities(tx, similar_entities):
    """Create a SIMILAR_TO edge for every {entity, similar_entity} pair."""
    query = """
    UNWIND $similar_entities AS se
    MATCH (a:Entity {name: se.entity}), (b:Entity {name: se.similar_entity})
    MERGE (a)-[:SIMILAR_TO]->(b)
    """
    tx.run(query, similar_entities=similar_entities)


# Entity payload: embeddings are looked up by position, using the entity id as index.
all_entities = [
    {"name": entity, "node_id": node_id, "embedding": entities_embeddings[node_id]}
    for entity, node_id in named_entities_dict.items()
]

# Relationship payload: lower-cased subject/object (entity names are stored
# lower-cased) and the predicate normalised to UPPER_SNAKE_CASE.
triplets = []
for passage in passages_arr:
    for triple in passage["triples"]:
        # Skip malformed triples.
        if len(triple) != 3:
            continue
        subject_name = triple[0].lower()
        predicate = triple[1].replace(" ", "_").upper()
        object_name = triple[2].lower()  # not named `object`: that shadows the builtin
        triplets.append({
            "subject": subject_name,
            "predicate": predicate,
            "object": object_name,
            "passageId_subject": passage["id"],
            "passageId_object": passage["id"],
        })

with GraphDatabase.driver(URI, auth=AUTH) as driver:
    with driver.session() as session:
        # create all entities with embeddings and node id
        session.execute_write(create_all_entities, all_entities)
        # create RELATES_TO relationships
        session.execute_write(create_relateto_relationships, triplets)
        # create the vector index on Neo4j
        session.execute_write(create_vector_index)
        # create similar entities graph
        session.execute_write(create_similar_entities, similar_entities)


#### HippoRAG RETRIEVAL
- Extract named entities from the query

- Vector search on GraphDB to find similar Entities
- PPR on GraphDB to find "good" nodes
- Rank passages from nodes found 
- Send passages + query to LLM for answer

In [41]:
# Extract name entities from query

def extract_entities_from_query(user_query):
    """Extract the entities mentioned in a user query with the LLM.

    Args:
        user_query: the question text.

    Returns:
        The parsed JSON answer of the model, with the list stored under
        "entities" additionally exposed under the key "named_entities".
    """
    # Simpler variant of the indexing prompt: it works better for short
    # paragraphs with few entities and for other languages such as Portuguese.
    system_message = """Your task is to extract all entities from the given paragraph, in the same language as the paragraph.
    Respond with a JSON list of entities like {{"entities":["entity1", "entity2", ...]}}"""
    human_message = """Paragraph:```
    {passage_text}
    ```"""
    query_prompt = ChatPromptTemplate.from_messages(
        [("system", system_message), ("human", human_message)]
    )

    query_llm = ChatOpenAI(model=llm_model, temperature=0.0)
    chain_query_entities = query_prompt | query_llm | SimpleJsonOutputParser()

    query_entities = chain_query_entities.invoke({"passage_text": user_query})
    # Expose the list under "named_entities" as well; "entities" is kept.
    query_entities["named_entities"] = query_entities["entities"]
    return query_entities


In [42]:
#Vector search on GraphDB to find similar Entities

#https://neo4j.com/docs/cypher-manual/current/indexes/semantic-indexes/vector-indexes/

def vector_search_graphdb(query_entitie, min_score=0.8, top_k=3):
    """Find graph entities similar to a query entity.

    The query entity is embedded, the ``entitieDB`` vector index is asked for
    its ``top_k`` nearest nodes, hits scoring below ``min_score`` are dropped,
    and the nodes reachable from the remaining hits through SIMILAR_TO edges
    are appended.

    Args:
        query_entitie: entity text taken from the user query.
        min_score: minimum vector-index score for a hit to be kept.
        top_k: number of nearest neighbours requested from the index.

    Returns:
        A list of dicts. Vector hits have keys ``id``, ``name`` and ``score``;
        SIMILAR_TO neighbours only have ``id`` and ``name``. The list may
        contain the same node more than once.
    """
    openai_embeddings = OpenAIEmbeddings(model="text-embedding-3-small", dimensions=256)
    query_entitie_embedding = openai_embeddings.embed_query(query_entitie)

    def search_vector(tx):
        # Values are passed as query parameters instead of being spliced into
        # the Cypher text.
        cypher_query = """
        CALL db.index.vector.queryNodes('entitieDB', $top_k, $query_embedding) YIELD node, score
        RETURN node.node_id AS id, node.name AS name, score
        """
        return tx.run(
            cypher_query, top_k=top_k, query_embedding=query_entitie_embedding
        ).data()

    # return the SIMILAR_TO entities too  E'
    def similar_search(tx, entity_names):
        cypher_query = """
        MATCH (a:Entity)-[:SIMILAR_TO]->(b:Entity)
        WHERE a.name IN $similar_entities
        RETURN b.node_id AS id, b.name AS name
        """
        return tx.run(cypher_query, similar_entities=entity_names).data()

    with GraphDatabase.driver(URI, auth=AUTH) as driver:
        with driver.session() as session:
            hits = session.execute_read(search_vector)
            results = [hit for hit in hits if hit["score"] >= min_score]

            matched_names = [hit["name"] for hit in results]
            # An empty name list cannot match anything, so skip the round trip.
            if matched_names:
                results.extend(session.execute_read(similar_search, matched_names))

    return results


In [43]:
#PPR on GraphDB to find "good" nodes

# gds.pageRank only works on Neo4j 4.0 AuraDS, and it is a paid feature on Neo4j 4.0.
# For this reason the graph is exported from AuraDB to igraph, and the igraph implementation of PageRank is used.
import igraph as ig

def ppr_graphdb(nodes):
    """Run Personalized PageRank over the entity graph stored in Neo4j.

    The Entity nodes and RELATES_TO edges are copied into an igraph graph and
    PageRank is personalized on the given seed nodes. Each seed's reset weight
    is additionally divided by the sum of its row in the module-level
    ``n_p_matrix`` when that sum is non-zero.

    Args:
        nodes: list of dicts with at least an ``id`` key (the entity node_id).
            Duplicated ids are allowed; only one entry per id is used.

    Returns:
        A list with one score per graph vertex. Vertices are added in
        ascending node_id order, so position i holds the score of node_id i
        as long as the node ids in the database are exactly 0..N-1.
        When ``nodes`` is empty, a list of zeros is returned.
    """
    # remove duplicated nodes (the last entry for an id wins)
    unique_nodes = list({item["id"]: item for item in nodes}.values())

    with GraphDatabase.driver(URI, auth=AUTH) as driver:
        with driver.session() as session:
            # get nodes and relations from graphDB; ORDER BY fixes the vertex
            # order, which a bare MATCH does not guarantee
            graph_nodes = session.execute_read(
                lambda tx: tx.run(
                    "MATCH (n:Entity) RETURN n.node_id AS node_id ORDER BY node_id"
                ).data()
            )
            relations = session.execute_read(
                lambda tx: tx.run(
                    "MATCH (a)-[r:RELATES_TO]->(b) RETURN a.node_id AS source, b.node_id AS target"
                ).data()
            )

    # Without seeds there is nothing to personalize on (and 1 / 0 below).
    if not unique_nodes:
        return [0.0] * len(graph_nodes)

    # Create an igraph graph; vertices are addressed by the node_id as a string
    g = ig.Graph(directed=True)
    for gnode in graph_nodes:
        g.add_vertex(name=str(gnode["node_id"]), labels=str(gnode["node_id"]))
    g.add_edges([(str(rel["source"]), str(rel["target"])) for rel in relations])

    # Personalization (reset) vector: every distinct seed starts with an equal share
    personalization = [0.0] * g.vcount()
    personalization_value = 1.0 / len(unique_nodes)
    for node in unique_nodes:
        idx = g.vs.find(name=str(node["id"])).index
        personalization[idx] += personalization_value

        # calculate node specificity len(node_passages) ** -1
        # NOTE(review): this divides by the SUM of occurrence counts in the
        # entity's row, not by the number of passages it appears in - confirm
        # which one is intended.
        node_sum = np.sum(n_p_matrix[node["id"]])
        if node_sum == 0:
            print(f"Node sum{node['id']} is zero", node)
        else:
            personalization[idx] *= node_sum ** -1

    # https://igraph.org/python/api/0.9.11/igraph._igraph.GraphBase.html#personalized_pagerank
    pagerank_scores = g.personalized_pagerank(damping=0.85, reset=personalization)

    return pagerank_scores



In [44]:
# create the final answer with the passages
from langchain_core.output_parsers.string import StrOutputParser

# User question
user_query = "Qual o modelo mais versátil?"
# user_query = "Quais as diferenças entre os modelos de casas de vegetação, capela e dente-de-serra?"

# Ask the LLM for the entities mentioned in the question
query_entities = extract_entities_from_query(user_query)
print("Query entities:",query_entities)

# Collect, for every query entity, the graph nodes similar to it
base_nodes = []
query_entity_names = query_entities["named_entities"]
if len(query_entity_names) > 0:
    for query_entitie in query_entity_names:
        base_nodes.extend(vector_search_graphdb(query_entitie))
else:
    print("No named entities found in the query")

# Notebook display of the collected seed nodes
base_nodes


Query entities: {'entities': ['modelo'], 'named_entities': ['modelo']}


[{'id': 100, 'name': 'modelo', 'score': 0.9999996423721313},
 {'id': 165, 'name': 'tipo', 'score': 0.8274419903755188},
 {'id': 165, 'name': 'tipo'},
 {'id': 100, 'name': 'modelo'}]

In [45]:
# Run Personalized PageRank seeded with the graph nodes matched for the query.
nodes_score = ppr_graphdb(base_nodes)

In [None]:
# import numpy as np
# indices = np.where(np.array(nodes_score) > 0)[0]
# print(indices)

# # nodes_score
# for idx in indices:
#     print(idx, entities[idx])

In [48]:
# get the passages score and order

def rank_passages(p_matrix, ppr_nodes_scores):
    """Score passages from node scores and order them best first.

    Args:
        p_matrix: 2-D array with one row per node and one column per passage.
        ppr_nodes_scores: one score per node, aligned with the rows of p_matrix.

    Returns:
        (passages_scores, order): the score of every passage, and the passage
        indices sorted by descending score. Both are also printed.
    """
    node_scores = np.array(ppr_nodes_scores)
    # Passage score = sum over nodes of (matrix entry * node score).
    passages_scores = p_matrix.T.dot(node_scores)
    # argsort is ascending; reverse it to get the highest score first.
    descending_order = np.argsort(passages_scores)[::-1]
    print(passages_scores)
    print(descending_order)
    return passages_scores, descending_order


# Score every passage with the PPR node scores and get the best-first passage order.
passages_score, passages_order = rank_passages(n_p_matrix, nodes_score)


[0.         3.28205335 0.         0.01096668 0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.        ]
[ 1  3 22 10  2  4  5  6  7  8  9 11 21 12 13 14 15 16 17 18 19 20  0]


In [49]:
# Build the context from the best-ranked passages and ask the LLM for the answer
max_passages = 3
selected_passages = []

for idx in passages_order:
    # Only passages with a positive score are eligible
    if not passages_score[idx] > 0:
        continue
    # Stop once the passage budget is used up
    if len(selected_passages) >= max_passages:
        break
    selected_passages.append(f"{passages_arr[idx]['text']}\n\n")

passage_count = len(selected_passages)
context = "".join(selected_passages)


# Plain-text variant of the prompt; the chain below uses rag_prompt_text instead
template = """Answer the question based only on the provided context, answer in the same languague as the question.
{context}
Question: {question}
"""

system_instruction = "Answer the question based only on the provided context, answer in the same languague as the question."
human_instruction = """Context:```
{context}
```

Question: {question}"""
rag_prompt_text = ChatPromptTemplate.from_messages(
    [
        ("system", system_instruction),
        ("human", human_instruction),
    ]
)

# The final answer is produced by a stronger model (GPT-4o) than the extraction steps
answer_llm = ChatOpenAI(model="gpt-4o", temperature=0.0)
final_chain = rag_prompt_text | answer_llm | StrOutputParser()

# Invoke the chain with the context and the user question
answer = final_chain.invoke({"context": context, "question": user_query})

answer

'O modelo mais versátil é o "teto em arco", pois ele pode ser utilizado nas Regiões Sul, Sudeste, Centro-Oeste e Norte, fazendo-se as devidas adaptações e adicionando acessórios como janelas advectivas, teto zenital e sistemas de refrigeração ou aquecimento.'