In [1]:
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License.

In [1]:
import os

import pandas as pd
import tiktoken

from graphrag.query.context_builder.entity_extraction import EntityVectorStoreKey
from graphrag.query.indexer_adapters import (
    read_indexer_covariates,
    read_indexer_entities,
    read_indexer_relationships,
    read_indexer_reports,
    read_indexer_text_units,
)
from graphrag.query.input.loaders.dfs import (
    store_entity_semantic_embeddings,
)
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.embedding import OpenAIEmbedding
from graphrag.query.llm.oai.typing import OpenaiApiType
from graphrag.query.question_gen.local_gen import LocalQuestionGen
from graphrag.query.structured_search.local_search.mixed_context import (
    LocalSearchMixedContext,
)
from graphrag.query.structured_search.local_search.search import LocalSearch
from graphrag.vector_stores.lancedb import LanceDBVectorStore

  from .autonotebook import tqdm as notebook_tqdm


## Local Search Example

Local search method generates answers by combining relevant data from the AI-extracted knowledge-graph with text chunks of the raw documents. This method is suitable for questions that require an understanding of specific entities mentioned in the documents (e.g. What are the healing properties of chamomile?).

### Load text units and graph data tables as context for local search

- In this test we first load indexing outputs from parquet files to dataframes, then convert these dataframes into collections of data objects aligning with the knowledge model.

### Load tables to dataframes

In [2]:
INPUT_DIR = "./output/20240827-100424/artifacts"
LANCEDB_URI = f"{INPUT_DIR}/lancedb"

COMMUNITY_REPORT_TABLE = "create_final_community_reports"
ENTITY_TABLE = "create_final_nodes"
ENTITY_EMBEDDING_TABLE = "create_final_entities"
RELATIONSHIP_TABLE = "create_final_relationships"
COVARIATE_TABLE = "create_final_covariates"
TEXT_UNIT_TABLE = "create_final_text_units"
COMMUNITY_LEVEL = 2

#### Read entities

In [3]:
# read nodes table to get community and degree data
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet")

entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)

# load description embeddings to an in-memory lancedb vectorstore
# to connect to a remote db, specify url and port values.
description_embedding_store = LanceDBVectorStore(
    collection_name="entity_description_embeddings",
)
description_embedding_store.connect(db_uri=LANCEDB_URI)
entity_description_embeddings = store_entity_semantic_embeddings(
    entities=entities, vectorstore=description_embedding_store
)

print(f"Entity count: {len(entity_df)}")
entity_df.head()

Entity count: 16308


Unnamed: 0,level,title,type,description,source_id,community,degree,human_readable_id,id,size,graph_embedding,entity_type,top_level_node_id,x,y
0,0,HERITAGE FOUNDATION,ORGANIZATION,The Heritage Foundation is a prominent conserv...,"0a145289ed32887ebc6d6fffb4fa4620,0a5dda433e8b3...",14,101,0,b45241d70f0e43fca764df95b2b81f77,101,,,b45241d70f0e43fca764df95b2b81f77,0,0
1,0,KEVIN D. ROBERTS,PERSON,"Kevin D. Roberts, PhD, is a prominent figure i...","24f63c581e3f81b1050fcf734111f175,b4a3aa8cd992c...",14,3,1,4119fd06010c494caa07f439b333f4c5,3,,,4119fd06010c494caa07f439b333f4c5,0,0
2,0,RICK DEARBORN,PERSON,Rick Dearborn is a notable figure associated w...,"24f63c581e3f81b1050fcf734111f175,b3bcb1fbcf472...",14,3,2,d3835bf3dda84ead99deadbeac5d0d7d,3,,,d3835bf3dda84ead99deadbeac5d0d7d,0,0
3,0,RUSS VUGHT,PERSON,Russ Vought is a prominent figure in American ...,"1c6198b6d80cf0b6eaa570d4e6f81c67,24f63c581e3f8...",14,6,3,077d2820ae1845bcbb1803379a3d1eae,6,,,077d2820ae1845bcbb1803379a3d1eae,0,0
4,0,DONALD DEVINE,PERSON,Donald Devine is a Senior Scholar at The Fund ...,"24f63c581e3f81b1050fcf734111f175,6ac8dca760d14...",14,7,4,3671ea0dd4e84c1a9b02c5ab2c8f4bac,7,,,3671ea0dd4e84c1a9b02c5ab2c8f4bac,0,0


#### Read relationships

In [4]:
relationship_df = pd.read_parquet(f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet")
relationships = read_indexer_relationships(relationship_df)

print(f"Relationship count: {len(relationship_df)}")
relationship_df.head()

Relationship count: 5213


Unnamed: 0,source,target,weight,description,text_unit_ids,id,human_readable_id,source_degree,target_degree,rank
0,HERITAGE FOUNDATION,PROJECT 2025,1.0,The Heritage Foundation is the organization be...,[24f63c581e3f81b1050fcf734111f175],8afd3e0d1758418c8562598cd88e58f2,0,101,47,148
1,HERITAGE FOUNDATION,KEVIN D. ROBERTS,1.0,Kevin D. Roberts is associated with the Herita...,[24f63c581e3f81b1050fcf734111f175],b1df23adf3e7493e9ddcd75237ae6de6,1,101,3,104
2,HERITAGE FOUNDATION,DERRICK MORGAN,1.0,Derrick Morgan is the Executive Vice President...,[81f8593f318736e914e69963e650643e],6768725a42f342fbb475c8612d3cab9f,2,101,1,102
3,HERITAGE FOUNDATION,WESLEY COOPERSMITH,1.0,Wesley Coopersmith serves as the Chief of Staf...,[81f8593f318736e914e69963e650643e],920c29ad995446f19c72ea40c332736c,3,101,1,102
4,HERITAGE FOUNDATION,SPENCER CHRETIEN,1.0,Spencer Chretien is the Associate Director of ...,[81f8593f318736e914e69963e650643e],87348de9087640709a503a6e658362a9,4,101,1,102


In [5]:
# NOTE: covariates are turned off by default, because they generally need prompt tuning to be valuable
# Please see the GRAPHRAG_CLAIM_* settings
covariate_df = pd.read_parquet(f"{INPUT_DIR}/{COVARIATE_TABLE}.parquet")

claims = read_indexer_covariates(covariate_df)

print(f"Claim records: {len(claims)}")
covariates = {"claims": claims}

FileNotFoundError: [Errno 2] No such file or directory: './output/20240827-100424/artifacts/create_final_covariates.parquet'

#### Read community reports

In [5]:
report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")
reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)

print(f"Report records: {len(report_df)}")
report_df.head()

Report records: 669


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  entity_df["community"] = entity_df["community"].fillna(-1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  entity_df["community"] = entity_df["community"].astype(int)


Unnamed: 0,community,full_content,level,rank,title,rank_explanation,summary,findings,full_content_json,id
0,649,# CFTC and FINRA Regulatory Community\n\nThe c...,3,8.0,CFTC and FINRA Regulatory Community,The impact severity rating is high due to the ...,The community is centered around the Commodity...,[{'explanation': 'The Commodity Futures Tradin...,"{\n ""title"": ""CFTC and FINRA Regulatory Com...",e93ce0f7-21bb-40fc-b2e6-f4bb4a60676f
1,650,# Securities Regulatory Community: SEC and CFT...,3,8.0,Securities Regulatory Community: SEC and CFTC,The impact severity rating is high due to the ...,The community centers around key regulatory en...,[{'explanation': 'The SEC (Securities and Exch...,"{\n ""title"": ""Securities Regulatory Communi...",a2c4aa3b-ecf4-469d-9f8f-748c1bbf6994
2,651,# FERC and Energy Regulation Community\n\nThe ...,3,8.5,FERC and Energy Regulation Community,The impact severity rating is high due to FERC...,The community is centered around the Federal E...,[{'explanation': 'The Federal Energy Regulator...,"{\n ""title"": ""FERC and Energy Regulation Co...",99e76ed5-1d7b-460f-a14e-94a382a5aa8c
3,652,"# Energy Regulatory Community: DOE, FERC, and ...",3,8.0,"Energy Regulatory Community: DOE, FERC, and NERC",The impact severity rating is high due to the ...,The community consists of key entities involve...,[{'explanation': 'The Department of Energy (DO...,"{\n ""title"": ""Energy Regulatory Community: ...",d238c245-30f8-4934-a085-a9b0f302302f
4,653,# Export-Import Bank and Economic Impact\n\nTh...,3,7.5,Export-Import Bank and Economic Impact,The impact severity rating is high due to the ...,The community centers around the Export-Import...,[{'explanation': 'The Export-Import Bank is an...,"{\n ""title"": ""Export-Import Bank and Econom...",ba5c57d3-f0c5-4618-8f31-274acded5921


#### Read text units

In [6]:
text_unit_df = pd.read_parquet(f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet")
text_units = read_indexer_text_units(text_unit_df)

print(f"Text unit records: {len(text_unit_df)}")
text_unit_df.head()

Text unit records: 480


Unnamed: 0,id,text,n_tokens,document_ids,entity_ids,relationship_ids
0,24f63c581e3f81b1050fcf734111f175,\n﻿\nProject 2025\nPRESIDENTIAL TRANSITION PRO...,1200,[875e7e8309ddeb8db0ca504a07dacd4c],"[b45241d70f0e43fca764df95b2b81f77, 4119fd06010...","[8afd3e0d1758418c8562598cd88e58f2, b1df23adf3e..."
1,81f8593f318736e914e69963e650643e,"\nhis work, Mandate for Leadership 2025: The C...",1200,[875e7e8309ddeb8db0ca504a07dacd4c],"[b45241d70f0e43fca764df95b2b81f77, eae4259b19a...","[6768725a42f342fbb475c8612d3cab9f, 920c29ad995..."
2,eba601f04e859b380892ae8eb191e38a,"prescriptions, agency by agency for the incom...",1200,[875e7e8309ddeb8db0ca504a07dacd4c],"[eae4259b19a741ab9f9f6af18c4a0470, dde131ab575...","[665d723e81074aec865cddca987f59d5, b9475f78db9..."
3,e261f8d327bad1b7f36176ab820c607e,\nInstitute for Economic Policy Studies at Th...,1200,[875e7e8309ddeb8db0ca504a07dacd4c],"[b45241d70f0e43fca764df95b2b81f77, d3835bf3dda...","[56231f8e9d3a46ca9005576a089e5a52, e8515e3cc3b..."
4,cb16d88d05e7dbebdc3c58e6cd8d5dfa,the Civil Service” for cutting bureaucracy and...,1200,[875e7e8309ddeb8db0ca504a07dacd4c],"[b45241d70f0e43fca764df95b2b81f77, 17ed1d92075...","[43e2a410dde54d77bc65cdd7353f7bae, 2176e814cc4..."


In [7]:
from dotenv import load_dotenv

load_dotenv()

api_key = os.getenv("GRAPHRAG_API_KEY")
api_base = os.getenv("GRAPHRAG_API_BASE")
llm_model = os.getenv("GRAPHRAG_LLM_MODEL")
embedding_model = os.getenv("GRAPHRAG_EMBEDDING_MODEL")

llm = ChatOpenAI(
    api_key=api_key,
    api_version="2024-02-15-preview",
    api_base=api_base,
    model=llm_model,
    api_type=OpenaiApiType.AzureOpenAI,  # OpenaiApiType.OpenAI or OpenaiApiType.AzureOpenAI
    max_retries=20,
)

token_encoder = tiktoken.get_encoding("cl100k_base")

text_embedder = OpenAIEmbedding(
    api_key=api_key,
    api_version="2024-02-15-preview",
    api_base=api_base,
    api_type=OpenaiApiType.AzureOpenAI,
    model=embedding_model,
    deployment_name=embedding_model,
    max_retries=20,
)

### Create local search context builder

In [8]:
context_builder = LocalSearchMixedContext(
    community_reports=reports,
    text_units=text_units,
    entities=entities,
    relationships=relationships,
    # if you did not run covariates during indexing, set this to None
    #covariates=covariates,
    entity_text_embeddings=description_embedding_store,
    embedding_vectorstore_key=EntityVectorStoreKey.ID,  # if the vectorstore uses entity title as ids, set this to EntityVectorStoreKey.TITLE
    text_embedder=text_embedder,
    token_encoder=token_encoder,
)

### Create local search engine

In [9]:
# text_unit_prop: proportion of context window dedicated to related text units
# community_prop: proportion of context window dedicated to community reports.
# The remaining proportion is dedicated to entities and relationships. Sum of text_unit_prop and community_prop should be <= 1
# conversation_history_max_turns: maximum number of turns to include in the conversation history.
# conversation_history_user_turns_only: if True, only include user queries in the conversation history.
# top_k_mapped_entities: number of related entities to retrieve from the entity description embedding store.
# top_k_relationships: control the number of out-of-network relationships to pull into the context window.
# include_entity_rank: if True, include the entity rank in the entity table in the context window. Default entity rank = node degree.
# include_relationship_weight: if True, include the relationship weight in the context window.
# include_community_rank: if True, include the community rank in the context window.
# return_candidate_context: if True, return a set of dataframes containing all candidate entity/relationship/covariate records that
# could be relevant. Note that not all of these records will be included in the context window. The "in_context" column in these
# dataframes indicates whether the record is included in the context window.
# max_tokens: maximum number of tokens to use for the context window.


local_context_params = {
    "text_unit_prop": 0.5,
    "community_prop": 0.1,
    "conversation_history_max_turns": 5,
    "conversation_history_user_turns_only": True,
    "top_k_mapped_entities": 10,
    "top_k_relationships": 10,
    "include_entity_rank": True,
    "include_relationship_weight": True,
    "include_community_rank": False,
    "return_candidate_context": False,
    "embedding_vectorstore_key": EntityVectorStoreKey.ID,  # set this to EntityVectorStoreKey.TITLE if the vectorstore uses entity title as ids
    "max_tokens": 12_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000)
}

llm_params = {
    "max_tokens": 2_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 1000=1500)
    "temperature": 0.0,
}

In [10]:
search_engine = LocalSearch(
    llm=llm,
    context_builder=context_builder,
    token_encoder=token_encoder,
    llm_params=llm_params,
    context_builder_params=local_context_params,
    response_type="multiple paragraphs",  # free form text describing the response type and format, can be anything, e.g. prioritized list, single paragraph, multiple paragraphs, multiple-page report
)

### Run local search on sample queries

In [21]:
result = await search_engine.asearch("Executive Order 14020, issued by President Biden how does project 2025 would affect it?")
print(result.response)

# Impact of Project 2025 on Executive Order 14020

## Overview of Executive Order 14020

Executive Order 14020, issued by President Joseph R. Biden Jr., establishes the White House Gender Policy Council. This executive order aims to advance gender equity and equality, addressing issues such as gender-based violence, economic security, and healthcare access for women and girls. The order reflects the Biden administration's commitment to promoting gender equality and ensuring that federal policies and programs consider the needs and rights of women and girls [Data: Entities (783); Relationships (2859, 2408)].

## Introduction to Project 2025

Project 2025 is a coalition of over 50 leading conservative organizations dedicated to preparing for the incoming President's administration in the United States. This initiative focuses on developing effective governance strategies and personnel, ensuring a coordinated plan for the next conservative administration. Project 2025 aims to unite the co

In [20]:
question = "How does the Project 2025 plan to empower women with gender equality what are the steps in the plan? "
result = await search_engine.asearch(question)
print(result.response)

# Project 2025 and Gender Equality

Project 2025, a comprehensive initiative by a coalition of conservative organizations, aims to prepare for the next conservative administration in the United States. While the primary focus of Project 2025 is on governance strategies and policy reforms, it also addresses various social issues, including gender equality and women's empowerment.

## Gender Equality and Women's Empowerment

Project 2025 includes contributions from various organizations and individuals who emphasize the importance of gender equality. One of the key entities involved in promoting gender equality within the framework of Project 2025 is USAID, which focuses on protecting the rights and opportunities of women and children [Data: Entities (1525); Relationships (3094)].

### Key Steps in the Plan

1. **Policy Reforms and Strategic Planning**:
   - The initiative includes strategic documents like the DEIA Strategic Plan FY22–FY26 by the U.S. Department of Transportation, which 

#### Inspecting the context data used to generate the response

In [13]:
result.context_data["entities"].head()

Unnamed: 0,id,entity,description,number of relationships,in_context
0,1538,FEMALES,,1,True
1,2973,WOMEN'S BUREAU,The Women's Bureau is a part of the Department...,1,True
2,1650,GENDER EQUITY AND WOMEN EMPOWERMENT,The initiative focuses on promoting gender equ...,1,True
3,106,INDEPENDENT WOMEN’S FORUM,The Independent Women’s Forum is a conservativ...,1,True
4,2457,SPECIAL REPRESENTATIVE FOR DOMESTIC WOMEN’S HE...,A proposed position to lead federal domestic p...,1,True


In [14]:
result.context_data["relationships"].head()

Unnamed: 0,id,source,target,description,weight,rank,links,in_context
0,3095,USAID,GENEVA CONSENSUS DECLARATION,USAID's mission aligns with the goals of the G...,1.0,98,4,True
1,3094,USAID,GENDER EQUALITY,USAID focuses on gender equality as part of it...,1.0,96,4,True
2,3096,USAID,FEMALES,USAID's focus on gender equality includes the ...,1.0,96,4,True
3,3105,USAID,WOMANCARE GLOBAL INTERNATIONAL,WomanCare Global International has received fu...,1.0,96,4,True
4,2471,BIDEN ADMINISTRATION,GENEVA CONSENSUS DECLARATION,The Biden Administration is expected to align ...,1.0,267,3,True


In [15]:
result.context_data["reports"].head()

Unnamed: 0,id,title,content
0,300,USAID and Global Development Initiatives,# USAID and Global Development Initiatives\n\n...
1,300,USAID and Global Development Initiatives,# USAID and Global Development Initiatives\n\n...


In [16]:
result.context_data["sources"].head()

Unnamed: 0,id,text
0,142,"Civil Rights, \nwhich is the appropriate locu..."
1,321,", and sexuality.\nUnless the Supreme Court ove..."
2,153,"Release, “Fact Sheet: Prioritizing Climate in..."
3,4,the Civil Service” for cutting bureaucracy and...


In [19]:
if "claims" in result.context_data:
    print(result.context_data["claims"].head())

   id            entity object_id     status start_date end_date  \
0  12  DR. JORDAN HAYES      NONE       TRUE       NONE     NONE   
1  13  DR. JORDAN HAYES      NONE  SUSPECTED       NONE     NONE   
2  18  DR. JORDAN HAYES      ALEX       TRUE       NONE     NONE   
3  49  DR. JORDAN HAYES      NONE  SUSPECTED       NONE     NONE   
4  74  DR. JORDAN HAYES      NONE  SUSPECTED       NONE     NONE   

                                         description  in_context  
0  Dr. Jordan Hayes contemplates their skepticism...        True  
1  Dr. Jordan Hayes mused over the layers of data...        True  
2  Dr. Jordan Hayes and Alex discovered a panel h...        True  
3  Dr. Jordan Hayes was focused on deciphering al...        True  
4  Dr. Jordan Hayes is analyzing the evolving ali...        True  


### Question Generation

This function takes a list of user queries and generates the next candidate questions.

In [17]:
question_generator = LocalQuestionGen(
    llm=llm,
    context_builder=context_builder,
    token_encoder=token_encoder,
    llm_params=llm_params,
    context_builder_params=local_context_params,
)

In [18]:
question_history = [
    "what is project 2025",
    "What is there about women?",
]
candidate_questions = await question_generator.agenerate(
    question_history=question_history, context_data=None, question_count=5
)
print(candidate_questions.response)

['- What role does the Independent Women’s Forum play in Project 2025?', '- How does Project 2025 address issues related to gender equality?', '- What contributions has Alma Golden made to the 2025 Presidential Transition Project?', '- How does the Project 2025 coalition plan to empower women?', '- What is the focus of the Institute for Women’s Health within Project 2025?']
