In [1]:
import os

import pandas as pd
import tiktoken

from graphrag.query.indexer_adapters import read_indexer_entities, read_indexer_reports
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.typing import OpenaiApiType
from graphrag.query.structured_search.global_search.community_context import (
    GlobalCommunityContext,
)
from graphrag.query.structured_search.global_search.search import GlobalSearch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# The API key is read from the environment so it never lands in the notebook.
# A missing variable raises KeyError here, before any client is created.
api_key = os.environ["GRAPHRAG_API_KEY"]
llm_model = "gpt-3.5-turbo"

llm = ChatOpenAI(
    api_key=api_key,
    model=llm_model,
    api_type=OpenaiApiType.OpenAI,  # OpenaiApiType.OpenAI or OpenaiApiType.AzureOpenAI
    max_retries=20,
)

# Use the tokenizer that belongs to `llm_model` so that token budgets are
# counted with the model's own encoding instead of a hard-coded one.
# tiktoken raises KeyError for model names it cannot map (for example custom
# deployment names); in that case keep the previous default, cl100k_base.
try:
    token_encoder = tiktoken.encoding_for_model(llm_model)
except KeyError:
    token_encoder = tiktoken.get_encoding("cl100k_base")

In [13]:
# Directory holding the parquet files generated by the indexing pipeline.
INPUT_DIR: str = "output/20240711-140302/artifacts"

# Names of the parquet tables (without the ".parquet" suffix) read from INPUT_DIR.
COMMUNITY_REPORT_TABLE: str = "create_final_community_reports"
ENTITY_TABLE: str = "create_final_nodes"
ENTITY_EMBEDDING_TABLE: str = "create_final_entities"

# Level in the Leiden community hierarchy from which community reports are loaded.
# A higher value uses reports from more fine-grained communities, which
# increases the computation cost.
COMMUNITY_LEVEL: int = 2

In [14]:
def _load_table(table_name):
    """Read ``<INPUT_DIR>/<table_name>.parquet`` and return it as a DataFrame."""
    return pd.read_parquet(f"{INPUT_DIR}/{table_name}.parquet")


# Raw indexer tables, loaded in the same order as before.
entity_df = _load_table(ENTITY_TABLE)
report_df = _load_table(COMMUNITY_REPORT_TABLE)
entity_embedding_df = _load_table(ENTITY_EMBEDDING_TABLE)

# Convert the raw tables with the indexer adapters, passing COMMUNITY_LEVEL.
reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)
entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)

# This is the row count of the raw report table, not the length of `reports`.
print(f"Report records: {len(report_df)}")
report_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  entity_df["community"] = entity_df["community"].fillna(-1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  entity_df["community"] = entity_df["community"].astype(int)


Report records: 1166


Unnamed: 0,community,full_content,level,rank,title,rank_explanation,summary,findings,full_content_json,id
0,1100,# CLIP and Related Entities\n\nThe community r...,3,8.5,CLIP and Related Entities,The impact severity rating is high due to the ...,"The community revolves around the CLIP model, ...",[{'explanation': 'CLIP serves as a foundation ...,"{\n ""title"": ""CLIP and Related Entities"",\n...",7ed1bc71-4032-4cbf-8513-c93d633fb7d3
1,1101,# LAION400M and Vision-Language Models\n\nThe ...,3,8.5,LAION400M and Vision-Language Models,The impact severity rating is high due to the ...,The community revolves around the LAION400M da...,[{'explanation': 'LAION400M plays a crucial ro...,"{\n ""title"": ""LAION400M and Vision-Language...",be58c54e-1f00-4330-8bbb-8526d004e673
2,1102,# VL-T5 and Visual Grounding Tasks\n\nThe comm...,3,7.5,VL-T5 and Visual Grounding Tasks,The impact severity rating is high due to the ...,The community is centered around the VL-T5 mod...,[{'explanation': 'VL-T5 is a versatile model t...,"{\n ""title"": ""VL-T5 and Visual Grounding Ta...",34bfa943-22b1-4594-99a8-c06c3873299b
3,1103,# Language and Vision Encoder Community\n\nThe...,3,7.5,Language and Vision Encoder Community,The impact severity rating is high due to the ...,The community revolves around the Language Enc...,[{'explanation': 'The Language Encoder plays a...,"{\n ""title"": ""Language and Vision Encoder C...",2d80382b-3631-4472-8efa-39703c977c98
4,1104,"# OFA and Wang, P.\n\nThe community revolves a...",3,8.5,"OFA and Wang, P.",The impact severity rating is high due to the ...,The community revolves around the OFA organiza...,[{'explanation': 'OFA is a versatile organizat...,"{\n ""title"": ""OFA and Wang, P."",\n ""summ...",1507a244-a1f1-4b83-a349-d52ffb4cd05d


In [15]:
# Context builder that is later handed to the search engine.
context_builder = GlobalCommunityContext(
    token_encoder=token_encoder,
    community_reports=reports,
    # Set to None if you don't want community weights to be used for ranking.
    entities=entities,
)

In [16]:
# Options passed to the search engine as `context_builder_params`.
context_builder_params = dict(
    # False uses the full community reports; True uses the short community summaries.
    use_community_summary=False,
    shuffle_data=True,
    include_community_rank=True,
    min_community_rank=0,
    community_rank_name="rank",
    include_community_weight=True,
    community_weight_name="occurrence weight",
    normalize_community_weight=True,
    # Change this based on your model's token limit
    # (with an 8k-limit model, a good setting could be 5000).
    max_tokens=12_000,
    context_name="Reports",
)

# Options passed to the search engine as `map_llm_params`.
map_llm_params = dict(
    max_tokens=1000,
    temperature=0.0,
    response_format=dict(type="json_object"),
)

# Options passed to the search engine as `reduce_llm_params`.
reduce_llm_params = dict(
    # Change this based on your model's token limit
    # (with an 8k-limit model, a good setting could be 1000-1500).
    max_tokens=2000,
    temperature=0.0,
)

In [17]:
search_engine = GlobalSearch(
    # Components built in the previous cells.
    llm=llm,
    token_encoder=token_encoder,
    context_builder=context_builder,
    context_builder_params=context_builder_params,
    map_llm_params=map_llm_params,
    reduce_llm_params=reduce_llm_params,
    # Change this based on your model's token limit
    # (with an 8k-limit model, a good setting could be 5000).
    max_data_tokens=12_000,
    # Set this to False if your LLM model does not support JSON mode.
    json_mode=True,
    # True adds an instruction encouraging the LLM to incorporate general
    # knowledge in the response. That may increase hallucinations, but could
    # be useful in some use cases.
    allow_general_knowledge=False,
    concurrent_coroutines=32,
    # Free-form text describing the response type and format. It can be
    # anything, e.g. prioritized list, single paragraph, multiple paragraphs,
    # multiple-page report.
    response_type="multiple paragraphs",
)

In [18]:
result = await search_engine.asearch(
    "When LXMERT model released?"
)

print(result.response)

## LXMERT Model Release Information

The LXMERT model, a significant advancement in the field of vision-and-language tasks, was introduced in 2019. This model, developed by researchers such as Tan, Bansal, and others, marked a pivotal moment in the community's efforts to enhance question answering systems [Data: Reports (1003, 782, 379, 780, 505, +more)]. The release of LXMERT in 2019 at NeurIPS and other conferences signified a collaborative endeavor by the community to leverage large-scale pretraining and innovative approaches to improve performance on various vision-language tasks [Data: Reports (927, 676, 491, 822, 839, +more)].

## Evolution and Significance

The LXMERT model's introduction in 2019 at conferences like EMNLP-IJCNLP and ECCV marked a significant milestone in the community's pursuit of cross-modal understanding capabilities [Data: Reports (789, 738, 478, 731, 709, +more)]. Researchers like Tan, Bansal, and Parikh played crucial roles in the development of LXMERT, emp

In [9]:
# Inspect the data used to build the context for the LLM responses.
# The bare expression on the last line is what the cell displays as its output.
result.context_data["reports"]

Unnamed: 0,id,title,occurrence weight,content,rank
0,6,Vision-Language Multi-Task Learning Community,0.542373,# Vision-Language Multi-Task Learning Communit...,7.5
1,23,BERT and Related Entities,0.474576,# BERT and Related Entities\n\nThe community r...,8.5
2,19,Visual Question Answering (VQA) Community,0.471186,# Visual Question Answering (VQA) Community\n\...,8.0
3,17,Vision-Language Models and Researchers,0.461017,# Vision-Language Models and Researchers\n\nTh...,8.5
4,37,IEEECVF Conference on Computer Vision and Patt...,0.457627,# IEEECVF Conference on Computer Vision and Pa...,8.5
5,9,Association for Computational Linguistics Comm...,0.457627,# Association for Computational Linguistics Co...,8.5
6,11,LXMERT and Multimodal Question Understanding,0.450847,# LXMERT and Multimodal Question Understanding...,8.5
7,28,European Conference on Computer Vision 2022 in...,0.39661,# European Conference on Computer Vision 2022 ...,8.5
8,1,3DVLP and Related Entities,0.366102,# 3DVLP and Related Entities\n\nThe community ...,8.5
9,8,Large Models and Image Datasets Community,0.352542,# Large Models and Image Datasets Community\n\...,8.5


In [10]:
# Inspect the number of LLM calls and tokens recorded on the search result.
# NOTE(review): the "LLM tokens" label is filled from result.prompt_tokens;
# presumably that counts prompt tokens only — confirm before treating it as
# total token usage.
print(f"LLM calls: {result.llm_calls}. LLM tokens: {result.prompt_tokens}")

LLM calls: 3. LLM tokens: 25442
