In [10]:
import dotenv
dotenv.load_dotenv(override=True)


True

In [2]:
import os

import pandas as pd
import tiktoken

from graphrag.query.context_builder.entity_extraction import EntityVectorStoreKey
from graphrag.query.indexer_adapters import (
    read_indexer_covariates,
    read_indexer_entities,
    read_indexer_relationships,
    read_indexer_reports,
    read_indexer_text_units,
)
from graphrag.query.input.loaders.dfs import (
    store_entity_semantic_embeddings,
)
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.embedding import OpenAIEmbedding
from graphrag.query.llm.oai.typing import OpenaiApiType
from graphrag.query.question_gen.local_gen import LocalQuestionGen
from graphrag.query.structured_search.local_search.mixed_context import (
    LocalSearchMixedContext,
)
from graphrag.query.structured_search.local_search.search import LocalSearch
from graphrag.vector_stores.lancedb import LanceDBVectorStore

In [3]:
INPUT_DIR = "./output/20240721-181603/artifacts"
LANCEDB_URI = f"./lancedb"

COMMUNITY_REPORT_TABLE = "create_final_community_reports"
ENTITY_TABLE = "create_final_nodes"
ENTITY_EMBEDDING_TABLE = "create_final_entities"
RELATIONSHIP_TABLE = "create_final_relationships"
COVARIATE_TABLE = "create_final_covariates"
TEXT_UNIT_TABLE = "create_final_text_units"
COMMUNITY_LEVEL = 2

In [4]:
# read nodes table to get community and degree data
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet")

entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)

# load description embeddings to an in-memory lancedb vectorstore
# to connect to a remote db, specify url and port values.
description_embedding_store = LanceDBVectorStore(
    collection_name="entity_description_embeddings",
)
description_embedding_store.connect(db_uri=LANCEDB_URI)
entity_description_embeddings = store_entity_semantic_embeddings(
    entities=entities, vectorstore=description_embedding_store
)

print(f"Entity count: {len(entity_df)}")
entity_df.head()

Entity count: 1034


[2024-07-21T15:26:39Z WARN  lance::dataset] No existing dataset at /Users/arch7tect/PycharmProjects/ms-rag/lancedb/entity_description_embeddings.lance, it will be created


Unnamed: 0,level,title,type,description,source_id,degree,human_readable_id,id,size,graph_embedding,community,entity_type,top_level_node_id,x,y
0,0,"""PROJECT GUTENBERG""","""ORGANIZATION""","""Project Gutenberg is a non-profit organizatio...","01e84646075b255eab0a34d872336a89,10bab8e9773ee...",3,0,b45241d70f0e43fca764df95b2b81f77,3,,,,b45241d70f0e43fca764df95b2b81f77,0,0
1,0,"""A CHRISTMAS CAROL""","""EVENT""","""A Christmas Carol"" refers to both an eBook pr...","680dd6d2a970a49082fa4f34bf63a34e,7b678bbc20b82...",1,1,4119fd06010c494caa07f439b333f4c5,1,,,,4119fd06010c494caa07f439b333f4c5,0,0
2,0,"""CHARLES DICKENS""","""PERSON""","""Charles Dickens, the renowned author, is cele...","680dd6d2a970a49082fa4f34bf63a34e,7b678bbc20b82...",0,2,d3835bf3dda84ead99deadbeac5d0d7d,0,,,,d3835bf3dda84ead99deadbeac5d0d7d,0,0
3,0,"""ARTHUR RACKHAM""","""PERSON""","""Arthur Rackham is the illustrator of A Christ...",680dd6d2a970a49082fa4f34bf63a34e,0,3,077d2820ae1845bcbb1803379a3d1eae,0,,,,077d2820ae1845bcbb1803379a3d1eae,0,0
4,0,"""J. B. LIPPINCOTT COMPANY""","""ORGANIZATION""","""J. B. Lippincott Company is the original publ...",680dd6d2a970a49082fa4f34bf63a34e,0,4,3671ea0dd4e84c1a9b02c5ab2c8f4bac,0,,,,3671ea0dd4e84c1a9b02c5ab2c8f4bac,0,0


In [5]:
relationship_df = pd.read_parquet(f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet")
relationships = read_indexer_relationships(relationship_df)

print(f"Relationship count: {len(relationship_df)}")
relationship_df.head()

Relationship count: 169


Unnamed: 0,source,target,weight,description,text_unit_ids,id,human_readable_id,source_degree,target_degree,rank
0,"""PROJECT GUTENBERG""","""GEO""",1.0,"""Project Gutenberg operates globally, providin...",[879b3fc36c9a2427cdb8d5d41b60e11b],24b4a5f4db67418cbfa08c5316f0ab51,0,3,2,5
1,"""PROJECT GUTENBERG""","""DONATIONS""",1.0,"""Project Gutenberg accepts donations through t...",[9e59af410db84b25757e3bf90e036f39],e4b707e3e6964197855b82fc66ef59e7,1,3,1,4
2,"""PROJECT GUTENBERG""","""EMAIL NEWSLETTER""",1.0,"""Project Gutenberg uses an email newsletter to...",[9e59af410db84b25757e3bf90e036f39],109b8be5a8ee4180a1465cd23f019d7b,2,3,1,4
3,"""A CHRISTMAS CAROL""","""AUTHOR""",1.0,"""Charles Dickens wrote 'A Christmas Carol'.""",[7b678bbc20b82b0518ec0a86e295a115],49f771e31a0c4b35bc39e389f3623509,3,1,1,2
4,"""FRED""","""SCROOGE""",2.0,"Fred, despite recognizing the flaws in his unc...","[4033108a1f27d8d4a3caaa923d459730, dd304886e6a...",aa946d4379694a74ba0da37e69d2810a,4,3,70,73


In [6]:
covariate_df = pd.read_parquet(f"{INPUT_DIR}/{COVARIATE_TABLE}.parquet")

claims = read_indexer_covariates(covariate_df)

print(f"Claim records: {len(claims)}")
covariates = {"claims": claims}

FileNotFoundError: [Errno 2] No such file or directory: './output/20240721-181603/artifacts/create_final_covariates.parquet'

In [7]:
report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")
reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)

print(f"Report records: {len(report_df)}")
report_df.head()

Report records: 9


Unnamed: 0,community,full_content,level,rank,title,rank_explanation,summary,findings,full_content_json,id
0,12,# Community Dynamics in Victorian Era\n\nThis ...,1,7.5,Community Dynamics in Victorian Era,The community's impact severity rating reflect...,This report explores the intricate relationshi...,"[{'explanation': 'Scrooge, initially portrayed...","{\n ""title"": ""Community Dynamics in Victori...",34439672-064b-4293-9295-bc302d2fff39
1,0,# Scrooge's Community\n\nThis report analyzes ...,0,7.5,Scrooge's Community,The severity of impact is rated as moderate du...,"This report analyzes Scrooge's interactions, e...",[{'explanation': 'Scrooge undergoes a profound...,"{\n ""title"": ""Scrooge's Community"",\n ""s...",59491cb5-3a1b-4b6b-94d9-15fd7d34ae0c
2,1,# Christmas Carol Community\n\nThe Christmas C...,0,8.5,Christmas Carol Community,This rating reflects the profound impact on in...,The Christmas Carol community revolves around ...,[{'explanation': 'Ebenezer Scrooge undergoes a...,"{\n ""title"": ""Christmas Carol Community"",\n...",53ff0483-8a44-40b3-b7e3-eca12273507c
3,3,# Scrooge's Family Dynamics\n\nThe community r...,0,6.5,Scrooge's Family Dynamics,The community is moderately impacted by the re...,"The community revolves around Scrooge, a centr...",[{'explanation': 'Scrooge's niece shares the s...,"{\n ""title"": ""Scrooge's Family Dynamics"",\n...",6d3aa1cc-6550-4897-a8b8-d3886d596cd2
4,5,# Verdant Oasis Plaza Community\n\nThe Verdant...,0,6.5,Verdant Oasis Plaza Community,The community faces a moderate level of impact...,The Verdant Oasis Plaza community is character...,[{'explanation': 'The Unity March at Verdant O...,"{\n ""title"": ""Verdant Oasis Plaza Community...",a601928d-859d-4fb8-8a7e-deea99287ab4


In [8]:
text_unit_df = pd.read_parquet(f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet")
text_units = read_indexer_text_units(text_unit_df)

print(f"Text unit records: {len(text_unit_df)}")
text_unit_df.head()

Text unit records: 231


Unnamed: 0,id,text,n_tokens,document_ids,entity_ids,relationship_ids
0,4033108a1f27d8d4a3caaa923d459730,"""How are you?"" returned the other.\n ""We...",300,[c305886e4aa2f6efcf64b57762777055],"[1fd3fa8bb5a2408790042ab9573779ee, c9632a35146...","[aa946d4379694a74ba0da37e69d2810a, 0a784e00c94..."
1,dbf014d7f9bcf97aa06ace38b6e41ccb,ustration]\n\n_IN BLACK AND WHITE_\n\n\n Tail...,300,[c305886e4aa2f6efcf64b57762777055],"[bf4e255cdac94ccc83a56435a5e4b075, de9e343f2e3...","[bac51e00d486420c8e91e824d8e17411, a0f326b9597..."
2,d222d20d61efac93225744c957c2b52a,"-fisted hand at the grindstone, Scrooge! a\nsq...",300,[c305886e4aa2f6efcf64b57762777055],"[bf4e255cdac94ccc83a56435a5e4b075, f1c6eed066f...","[4adee3aad6524a4aa4c4711c1ee05e64, d034e4fd8ac..."
3,e67524adaf0fbbef1a94a6f042c9810c,chill him. No wind that blew was bitterer tha...,300,[c305886e4aa2f6efcf64b57762777055],"[bf4e255cdac94ccc83a56435a5e4b075, b7702b90c7f...","[091e998370dd42d1b05ab0fcf6595a7e, 1e6cabc18fa..."
4,d487efa13462a71434c752212586764d,"You don't mean\nthat, I am sure?'\n\n'I do,' s...",300,[c305886e4aa2f6efcf64b57762777055],"[bf4e255cdac94ccc83a56435a5e4b075, 958beecdb5b...",[b16eda56dcec40f2b3e109fb9246bee3]


In [18]:
api_key = os.environ["GRAPHRAG_API_KEY"]
llm_model = os.environ["GRAPHRAG_LLM_MODEL"]
embedding_model = os.environ["GRAPHRAG_EMBEDDING_MODEL"]

llm = ChatOpenAI(
    api_key=api_key,
    model=llm_model,
    api_type=OpenaiApiType.OpenAI,  # OpenaiApiType.OpenAI or OpenaiApiType.AzureOpenAI
    max_retries=20,
    api_base=os.environ.get("API_BASE"),
)

token_encoder = tiktoken.get_encoding("cl100k_base")

text_embedder = OpenAIEmbedding(
    api_key=api_key,
    api_base=os.environ.get("API_BASE"),
    api_type=OpenaiApiType.OpenAI,
    model=embedding_model,
    deployment_name=embedding_model,
    max_retries=20,
)

In [19]:
context_builder = LocalSearchMixedContext(
    community_reports=reports,
    text_units=text_units,
    entities=entities,
    relationships=relationships,
    # covariates=covariates,
    entity_text_embeddings=description_embedding_store,
    embedding_vectorstore_key=EntityVectorStoreKey.ID,  # if the vectorstore uses entity title as ids, set this to EntityVectorStoreKey.TITLE
    text_embedder=text_embedder,
    token_encoder=token_encoder,
)

In [20]:
# text_unit_prop: proportion of context window dedicated to related text units
# community_prop: proportion of context window dedicated to community reports.
# The remaining proportion is dedicated to entities and relationships. Sum of text_unit_prop and community_prop should be <= 1
# conversation_history_max_turns: maximum number of turns to include in the conversation history.
# conversation_history_user_turns_only: if True, only include user queries in the conversation history.
# top_k_mapped_entities: number of related entities to retrieve from the entity description embedding store.
# top_k_relationships: control the number of out-of-network relationships to pull into the context window.
# include_entity_rank: if True, include the entity rank in the entity table in the context window. Default entity rank = node degree.
# include_relationship_weight: if True, include the relationship weight in the context window.
# include_community_rank: if True, include the community rank in the context window.
# return_candidate_context: if True, return a set of dataframes containing all candidate entity/relationship/covariate records that
# could be relevant. Note that not all of these records will be included in the context window. The "in_context" column in these
# dataframes indicates whether the record is included in the context window.
# max_tokens: maximum number of tokens to use for the context window.


local_context_params = {
    "text_unit_prop": 0.5,
    "community_prop": 0.1,
    "conversation_history_max_turns": 5,
    "conversation_history_user_turns_only": True,
    "top_k_mapped_entities": 10,
    "top_k_relationships": 10,
    "include_entity_rank": True,
    "include_relationship_weight": True,
    "include_community_rank": False,
    "return_candidate_context": False,
    "embedding_vectorstore_key": EntityVectorStoreKey.ID,  # set this to EntityVectorStoreKey.TITLE if the vectorstore uses entity title as ids
    "max_tokens": 12_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000)
}

llm_params = {
    "max_tokens": 2_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 1000=1500)
    "temperature": 0.0,
}

In [21]:
search_engine = LocalSearch(
    llm=llm,
    context_builder=context_builder,
    token_encoder=token_encoder,
    llm_params=llm_params,
    context_builder_params=local_context_params,
    response_type="multiple paragraphs",  # free form text describing the response type and format, can be anything, e.g. prioritized list, single paragraph, multiple paragraphs, multiple-page report
)

In [22]:
result = await search_engine.asearch("Who is the girl?")
print(result.response)

Error embedding chunk {'OpenAIEmbedding': "Error code: 400 - {'error': {'message': 'invalid input type', 'type': 'api_error', 'param': None, 'code': None}}"}


ZeroDivisionError: Weights sum to zero, can't be normalized