In [1]:
import os

import pandas as pd
import tiktoken

from graphrag.query.context_builder.entity_extraction import EntityVectorStoreKey
from graphrag.query.indexer_adapters import (
    read_indexer_covariates,
    read_indexer_entities,
    read_indexer_relationships,
    read_indexer_reports,
    read_indexer_text_units,
)
from graphrag.query.input.loaders.dfs import (
    store_entity_semantic_embeddings,
)
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.embedding import OpenAIEmbedding
from graphrag.query.llm.oai.typing import OpenaiApiType
from graphrag.query.question_gen.local_gen import LocalQuestionGen
from graphrag.query.structured_search.local_search.mixed_context import (
    LocalSearchMixedContext,
)
from graphrag.query.structured_search.local_search.search import LocalSearch
from graphrag.vector_stores.lancedb import LanceDBVectorStore

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
INPUT_DIR = "output/20240711-140302/artifacts"
LANCEDB_URI = f"{INPUT_DIR}/lancedb"

COMMUNITY_REPORT_TABLE = "create_final_community_reports"
ENTITY_TABLE = "create_final_nodes"
ENTITY_EMBEDDING_TABLE = "create_final_entities"
RELATIONSHIP_TABLE = "create_final_relationships"
COVARIATE_TABLE = "create_final_covariates"
TEXT_UNIT_TABLE = "create_final_text_units"
COMMUNITY_LEVEL = 2

In [4]:
# read nodes table to get community and degree data
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet")

entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)

# load description embeddings to an in-memory lancedb vectorstore
# to connect to a remote db, specify url and port values.
description_embedding_store = LanceDBVectorStore(
    collection_name="entity_description_embeddings",
)
description_embedding_store.connect(db_uri=LANCEDB_URI)
entity_description_embeddings = store_entity_semantic_embeddings(
    entities=entities, vectorstore=description_embedding_store
)

print(f"Entity count: {len(entity_df)}")
entity_df.head()

Entity count: 30316


[2024-07-11T16:13:18Z WARN  lance::dataset] No existing dataset at /home/adhi/arxiv-graph-rag/output/20240711-140302/artifacts/lancedb/entity_description_embeddings.lance, it will be created


Unnamed: 0,level,title,type,description,source_id,community,degree,human_readable_id,id,size,graph_embedding,entity_type,top_level_node_id,x,y
0,0,"""HUMAIR RAJ KHAN""","""PERSON""","""Humair Raj Khan is one of the authors of the ...",95c6804eef9c9dd3174bf17b531dd58a,9.0,1,0,b45241d70f0e43fca764df95b2b81f77,1,,,b45241d70f0e43fca764df95b2b81f77,0,0
1,0,"""DEEPAK GUPTA""","""PERSON""","""DEEPAK GUPTA"" is a researcher who has contrib...","5363c64045e1cad12ae5f61560e3407b,924f5fd796896...",9.0,7,1,4119fd06010c494caa07f439b333f4c5,7,,,4119fd06010c494caa07f439b333f4c5,0,0
2,0,"""ASIF EKBAL""","""PERSON""","""Asif Ekbal is a researcher who has made signi...","5363c64045e1cad12ae5f61560e3407b,924f5fd796896...",9.0,8,2,d3835bf3dda84ead99deadbeac5d0d7d,8,,,d3835bf3dda84ead99deadbeac5d0d7d,0,0
3,0,"""DEPARTMENT OF COMPUTER SCIENCE AND ENGINEERING""","""ORGANIZATION""","""The Department of Computer Science and Engine...",95c6804eef9c9dd3174bf17b531dd58a,,1,3,077d2820ae1845bcbb1803379a3d1eae,1,,,077d2820ae1845bcbb1803379a3d1eae,0,0
4,0,"""INDIAN INSTITUTE OF TECHNOLOGY PATNA""","""ORGANIZATION""","""The Indian Institute of Technology Patna is t...",95c6804eef9c9dd3174bf17b531dd58a,,2,4,3671ea0dd4e84c1a9b02c5ab2c8f4bac,2,,,3671ea0dd4e84c1a9b02c5ab2c8f4bac,0,0


In [5]:
relationship_df = pd.read_parquet(f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet")
relationships = read_indexer_relationships(relationship_df)

print(f"Relationship count: {len(relationship_df)}")
relationship_df.head()

Relationship count: 8616


Unnamed: 0,source,target,weight,description,text_unit_ids,id,human_readable_id,source_degree,target_degree,rank
0,"""HUMAIR RAJ KHAN""","""VISUAL QUESTION ANSWERING SYSTEM""",1.0,"""Humair Raj Khan is one of the authors contrib...",[95c6804eef9c9dd3174bf17b531dd58a],88495b0666864d169607f9563fc9371a,0,1,3,4
1,"""DEEPAK GUPTA""","""VISUAL QUESTION ANSWERING SYSTEM""",1.0,"""Deepak Gupta is one of the authors contributi...",[95c6804eef9c9dd3174bf17b531dd58a],4349eae4722d48fd888860258fa66c3e,1,7,3,10
2,"""DEEPAK GUPTA""","""ASIF EKBAL""",2.0,"""DEEPAK GUPTA"" and ""ASIF EKBAL"" are mentioned ...","[924f5fd796896dbb146100ee1b0efe5c, 97803108ccd...",f4ef464d2f934d79acb1049b84d20cce,2,7,8,15
3,"""DEEPAK GUPTA""","""PUSHPAK BHATTACHARYYA""",2.0,"""Deepak Gupta and Pushpak Bhattacharyya have c...","[924f5fd796896dbb146100ee1b0efe5c, 97803108ccd...",1ae9bcd05cc9411d8e7d815c9023707d,3,7,5,12
4,"""DEEPAK GUPTA""","""ASSOCIATION FOR COMPUTATIONAL LINGUISTICS""",1.0,"""Deepak Gupta is mentioned as a researcher par...",[97803108ccdb45fe5fd69c1ed3eadebb],5586b37a9d584f66a950a2b7afe519da,4,7,76,83


In [6]:
covariate_df = pd.read_parquet(f"{INPUT_DIR}/{COVARIATE_TABLE}.parquet")

claims = read_indexer_covariates(covariate_df)

print(f"Claim records: {len(claims)}")
covariates = {"claims": claims}

FileNotFoundError: [Errno 2] No such file or directory: 'output/20240711-140302/artifacts/create_final_covariates.parquet'

In [7]:
report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")
reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)

print(f"Report records: {len(report_df)}")
report_df.head()

Report records: 1166


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  entity_df["community"] = entity_df["community"].fillna(-1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  entity_df["community"] = entity_df["community"].astype(int)


Unnamed: 0,community,full_content,level,rank,title,rank_explanation,summary,findings,full_content_json,id
0,1100,# CLIP and Related Entities\n\nThe community r...,3,8.5,CLIP and Related Entities,The impact severity rating is high due to the ...,"The community revolves around the CLIP model, ...",[{'explanation': 'CLIP serves as a foundation ...,"{\n ""title"": ""CLIP and Related Entities"",\n...",7ed1bc71-4032-4cbf-8513-c93d633fb7d3
1,1101,# LAION400M and Vision-Language Models\n\nThe ...,3,8.5,LAION400M and Vision-Language Models,The impact severity rating is high due to the ...,The community revolves around the LAION400M da...,[{'explanation': 'LAION400M plays a crucial ro...,"{\n ""title"": ""LAION400M and Vision-Language...",be58c54e-1f00-4330-8bbb-8526d004e673
2,1102,# VL-T5 and Visual Grounding Tasks\n\nThe comm...,3,7.5,VL-T5 and Visual Grounding Tasks,The impact severity rating is high due to the ...,The community is centered around the VL-T5 mod...,[{'explanation': 'VL-T5 is a versatile model t...,"{\n ""title"": ""VL-T5 and Visual Grounding Ta...",34bfa943-22b1-4594-99a8-c06c3873299b
3,1103,# Language and Vision Encoder Community\n\nThe...,3,7.5,Language and Vision Encoder Community,The impact severity rating is high due to the ...,The community revolves around the Language Enc...,[{'explanation': 'The Language Encoder plays a...,"{\n ""title"": ""Language and Vision Encoder C...",2d80382b-3631-4472-8efa-39703c977c98
4,1104,"# OFA and Wang, P.\n\nThe community revolves a...",3,8.5,"OFA and Wang, P.",The impact severity rating is high due to the ...,The community revolves around the OFA organiza...,[{'explanation': 'OFA is a versatile organizat...,"{\n ""title"": ""OFA and Wang, P."",\n ""summ...",1507a244-a1f1-4b83-a349-d52ffb4cd05d


In [8]:
text_unit_df = pd.read_parquet(f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet")
text_units = read_indexer_text_units(text_unit_df)

print(f"Text unit records: {len(text_unit_df)}")
text_unit_df.head()

Text unit records: 1543


Unnamed: 0,id,text,n_tokens,document_ids,entity_ids,relationship_ids
0,95c6804eef9c9dd3174bf17b531dd58a,Towards Developing a Multilingual and Code-Mix...,300,[0068519174fae8baf0595037d2d50e86],"[b45241d70f0e43fca764df95b2b81f77, 4119fd06010...","[88495b0666864d169607f9563fc9371a, 4349eae4722..."
1,fdfe95f68a99c38c85a099fb32b3c240,",\nwhich only use the output from the last lay...",300,[0068519174fae8baf0595037d2d50e86],"[f7e11b0e297a44a896dc67928368f600, 1fd3fa8bb5a...","[86a7f35ec92e4bcc80934183b5dc1d9f, 27d298dd805..."
2,85b6f5e34e5006de5cbfb1902ad59fc5,".et al., 2015 on VQA are mainly limited to the...",300,[0068519174fae8baf0595037d2d50e86],"[27f9fbe6ad8c4a8b9acee0d3596ed57c, e1fd0e904a5...","[2056cb0af71645939eb693f462b243d1, 04f29ef9dc5..."
3,39c8613b69eb43bf23a52ab4f6e33d52,majority of these\nmodels are predominantly b...,300,[0068519174fae8baf0595037d2d50e86],"[4a67211867e5464ba45126315a122a8a, 04dbbb2283b...","[6282cc3053c7490ab9c3825d5adf43d6, 4bb9f801e89..."
4,0048fc000b52684e515f8f5472ef9b8d,distillation objectives\nwhich ensure the inc...,300,[0068519174fae8baf0595037d2d50e86],"[e2f5735c7d714423a2c4f61ca2644626, deece7e64b2...","[17720d37c8aa4e8eba95193090638fdc, 35221a75248..."


In [9]:
api_key = os.environ["GRAPHRAG_API_KEY"]
llm_model = "gpt-3.5-turbo"
embedding_model = "text-embedding-3-small"

llm = ChatOpenAI(
    api_key=api_key,
    model=llm_model,
    api_type=OpenaiApiType.OpenAI,  # OpenaiApiType.OpenAI or OpenaiApiType.AzureOpenAI
    max_retries=20,
)

token_encoder = tiktoken.get_encoding("cl100k_base")

text_embedder = OpenAIEmbedding(
    api_key=api_key,
    api_base=None,
    api_type=OpenaiApiType.OpenAI,
    model=embedding_model,
    deployment_name=embedding_model,
    max_retries=20,
)

In [11]:
context_builder = LocalSearchMixedContext(
    community_reports=reports,
    text_units=text_units,
    entities=entities,
    relationships=relationships,
    # covariates=covariates,
    entity_text_embeddings=description_embedding_store,
    embedding_vectorstore_key=EntityVectorStoreKey.ID,  # if the vectorstore uses entity title as ids, set this to EntityVectorStoreKey.TITLE
    text_embedder=text_embedder,
    token_encoder=token_encoder,
)

In [12]:
# text_unit_prop: proportion of context window dedicated to related text units
# community_prop: proportion of context window dedicated to community reports.
# The remaining proportion is dedicated to entities and relationships. Sum of text_unit_prop and community_prop should be <= 1
# conversation_history_max_turns: maximum number of turns to include in the conversation history.
# conversation_history_user_turns_only: if True, only include user queries in the conversation history.
# top_k_mapped_entities: number of related entities to retrieve from the entity description embedding store.
# top_k_relationships: control the number of out-of-network relationships to pull into the context window.
# include_entity_rank: if True, include the entity rank in the entity table in the context window. Default entity rank = node degree.
# include_relationship_weight: if True, include the relationship weight in the context window.
# include_community_rank: if True, include the community rank in the context window.
# return_candidate_context: if True, return a set of dataframes containing all candidate entity/relationship/covariate records that
# could be relevant. Note that not all of these records will be included in the context window. The "in_context" column in these
# dataframes indicates whether the record is included in the context window.
# max_tokens: maximum number of tokens to use for the context window.


local_context_params = {
    "text_unit_prop": 0.5,
    "community_prop": 0.1,
    "conversation_history_max_turns": 5,
    "conversation_history_user_turns_only": True,
    "top_k_mapped_entities": 10,
    "top_k_relationships": 10,
    "include_entity_rank": True,
    "include_relationship_weight": True,
    "include_community_rank": False,
    "return_candidate_context": False,
    "embedding_vectorstore_key": EntityVectorStoreKey.ID,  # set this to EntityVectorStoreKey.TITLE if the vectorstore uses entity title as ids
    "max_tokens": 12_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000)
}

llm_params = {
    "max_tokens": 2_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 1000=1500)
    "temperature": 0.0,
}

In [13]:
search_engine = LocalSearch(
    llm=llm,
    context_builder=context_builder,
    token_encoder=token_encoder,
    llm_params=llm_params,
    context_builder_params=local_context_params,
    response_type="multiple paragraphs",  # free form text describing the response type and format, can be anything, e.g. prioritized list, single paragraph, multiple paragraphs, multiple-page report
)

In [15]:
result = await search_engine.asearch("When LXMERT model released?")
print(result.response)

The LXMERT model was developed by Hao Tan and Mohit Bansal in 2019 [Data: Entities (84)]. This model is a state-of-the-art model for question understanding in natural language processing, focusing on multi-modal feature extraction through transformer blocks [Data: Entities (84)]. The LXMERT model plays a crucial role in various tasks such as visual question answering and behavior analysis [Data: Entities (84)]. Additionally, organizations like CF2, CF3, CMI, and SPF are involved in generating code-mixed questions used in pre-training the LXMERT model [Data: Entities (471, 472, 469, 470), Relationships (370, 371, 368, 369)].


In [16]:
question = "When LXMERT model released?"
result = await search_engine.asearch(question)
print(result.response)

The LXMERT model was developed by Hao Tan and Mohit Bansal in 2019 [Data: Entities (84)]. This model is a state-of-the-art model for question understanding in natural language processing, focusing on multi-modal feature extraction through transformer blocks. It plays a crucial role in tasks such as visual question answering and behavior analysis [Data: Relationships (37, 376, 375, +more)]. The LXMERT model has relationships with various other models like UNITER, VILBERT, and VL-BERT, indicating similarities in their approaches to multi-modal feature extraction [Data: Relationships (384, 37, 381, 380, 363, +more)].


In [17]:
result.context_data["entities"].head()

Unnamed: 0,id,entity,description,number of relationships,in_context
0,270,"""LXMERT MODEL""","""The LXMERT model is referenced as a pre-train...",1,True
1,126,"""LXMERT TEACHER MODEL""","""LXMERT teacher model is a pre-trained model w...",2,True
2,84,"""LXMERT""","""LXMERT is a model developed by Tan and Bansal...",36,True
3,122,"""JOINT LXMERT""","""JOINT LXMERT"" is a multilingual model for que...",10,True
4,152,"""JOINT LXMERT MODEL""","The ""JOINT LXMERT MODEL"" is an organization th...",3,True


In [18]:
result.context_data["relationships"].head()

Unnamed: 0,id,source,target,description,weight,rank,links,in_context
0,378,"""LXMERT""","""JOINT LXMERT""","""Joint LXMERT is an extension of the LXMERT mo...",1.0,46,1,True
1,44,"""TAN""","""LXMERT""","""Tan is involved in the research on LXMERT, in...",1.0,39,1,True
2,363,"""LXMERT""","""JOINT-LXMERT""","""The LXMERT model is compared to the Joint-LXM...",1.0,38,1,True
3,374,"""LXMERT""","""VG-QA""","""VG-QA provided a large amount of image-and-se...",1.0,37,1,True
4,416,"""M-BERT""","""JOINT LXMERT""","""Joint LXMERT is compared to M-BERT, suggestin...",1.0,14,1,True


In [19]:
result.context_data["reports"].head()

Unnamed: 0,id,title,content
0,709,LXMERT and VQA Community,# LXMERT and VQA Community\n\nThe community re...
1,731,LXMERT Teacher Model and English VQA Dataset,# LXMERT Teacher Model and English VQA Dataset...


In [20]:
result.context_data["sources"].head()

Unnamed: 0,id,text
0,53,"show the example\nFig. 8 Q5, where to infer ..."
1,29,We also reported the answer-type wise\nresult...
2,78,"ixing CMI, SPF,\nCF2 and CF3 and quality of th..."
3,36,Table 3: Performance of our proposed model on ...
4,26,each token. In\nour proposed knowledge distil...


In [21]:
if "claims" in result.context_data:
    print(result.context_data["claims"].head())

This function takes a list of user queries and generates the next candidate questions.



In [22]:
question_generator = LocalQuestionGen(
    llm=llm,
    context_builder=context_builder,
    token_encoder=token_encoder,
    llm_params=llm_params,
    context_builder_params=local_context_params,
)

In [23]:
question_history = [
    "Tell me about most powerful vision language model?",
    "When the model released?",
]
candidate_questions = await question_generator.agenerate(
    question_history=question_history, context_data=None, question_count=5
)
print(candidate_questions.response)

['- What are the key entities involved in the Vision and Language Models Paper?', '- How does the Language Model contribute to the Vision-Language Integration?', '- What is the significance of the CLIP-base model in the ICLR 2023 and Research Papers?', "- How does the Vision Model play a crucial role in the community's processes?", '- What advancements are showcased in Vision-Language Integration according to the Conversation History?']
