In [1]:
import os

import pandas as pd
import tiktoken

from graphrag.query.context_builder.entity_extraction import EntityVectorStoreKey
from graphrag.query.indexer_adapters import (
    read_indexer_covariates,
    read_indexer_entities,
    read_indexer_relationships,
    read_indexer_reports,
    read_indexer_text_units,
)
from graphrag.query.question_gen.local_gen import LocalQuestionGen
from graphrag.query.structured_search.local_search.mixed_context import (
    LocalSearchMixedContext,
)
from graphrag.query.structured_search.local_search.search import LocalSearch
from graphrag.vector_stores.lancedb import LanceDBVectorStore

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Folder the parquet tables are read from; the LanceDB store sits beneath it.
INPUT_DIR = "./output"
LANCEDB_URI = INPUT_DIR + "/lancedb"

# Base names (without the ".parquet" extension) of the tables loaded below.
ENTITY_TABLE = "entities"
RELATIONSHIP_TABLE = "relationships"
COMMUNITY_TABLE = "communities"
COMMUNITY_REPORT_TABLE = "community_reports"
TEXT_UNIT_TABLE = "text_units"
# COVARIATE_TABLE = "covariates"  # only needed once the covariate cell is enabled

# Community level handed to the entity and report readers.
COMMUNITY_LEVEL = 2

In [3]:
# Entities and communities live in separate parquet tables; both frames are
# handed to read_indexer_entities together with the configured community level.
community_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_TABLE}.parquet")
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
entities = read_indexer_entities(entity_df, community_df, COMMUNITY_LEVEL)

# Connect to the LanceDB database at LANCEDB_URI and use its
# "default-entity-description" collection for the entity description embeddings.
description_embedding_store = LanceDBVectorStore(collection_name="default-entity-description")
description_embedding_store.connect(db_uri=LANCEDB_URI)

# Report the raw table size, then preview the first rows.
print(f"Entity count: {len(entity_df)}")
entity_df.head(5)

Entity count: 1824


Unnamed: 0,id,human_readable_id,title,type,description,text_unit_ids,frequency,degree,x,y
0,3e730b25-df4d-4390-a891-3cb559ab387a,0,MICROSOFT FABRIC,ORGANIZATION,Microsoft Fabric is a comprehensive cloud-base...,[422c343a682e7f78dae36ac94f2bf135ec09bd2784744...,103,483,0.0,0.0
1,09903b47-6e3a-478b-848f-590437b2048b,1,ONELAKE,ORGANIZATION,ONELAKE is a unified data lake storage service...,[422c343a682e7f78dae36ac94f2bf135ec09bd2784744...,15,23,0.0,0.0
2,20d4f924-8631-4262-aacf-05af47519b57,2,COPILOT,ORGANIZATION,Copilot is a comprehensive AI-powered feature ...,[422c343a682e7f78dae36ac94f2bf135ec09bd2784744...,52,120,0.0,0.0
3,5bffd675-01c6-4f6d-a247-99333ee64f05,3,MICROSOFT 365,ORGANIZATION,Microsoft 365 is a comprehensive cloud-based s...,[422c343a682e7f78dae36ac94f2bf135ec09bd2784744...,16,10,0.0,0.0
4,669265e3-7b18-43a8-b28e-e06005787d8e,4,AZURE AI FOUNDRY,ORGANIZATION,Azure AI Foundry is a comprehensive platform d...,[422c343a682e7f78dae36ac94f2bf135ec09bd2784744...,6,5,0.0,0.0


In [4]:
# Load the relationship table and convert it with read_indexer_relationships.
relationship_df = pd.read_parquet(
    f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet"
)
relationships = read_indexer_relationships(relationship_df)

# Report the raw table size, then preview the first rows.
print(f"Relationship count: {len(relationship_df)}")
relationship_df.head(5)

Relationship count: 2641


Unnamed: 0,id,human_readable_id,source,target,description,weight,combined_degree,text_unit_ids
0,3877ccd6-2ae9-46bc-82de-3acf47ff371e,0,MICROSOFT FABRIC,ONELAKE,Microsoft Fabric is an integrated platform tha...,69.0,506,[422c343a682e7f78dae36ac94f2bf135ec09bd2784744...
1,abb9d1b1-2a68-4f09-a76c-8f94f71190ee,1,MICROSOFT FABRIC,COPILOT,Microsoft Fabric is a comprehensive platform d...,134.0,603,[422c343a682e7f78dae36ac94f2bf135ec09bd2784744...
2,ba7b4a6c-1a26-45b7-a0fa-e3dfd981aa88,2,MICROSOFT FABRIC,MICROSOFT 365,Microsoft Fabric is a comprehensive data manag...,43.0,493,[422c343a682e7f78dae36ac94f2bf135ec09bd2784744...
3,dca4c0cf-eb69-40eb-94d0-c94a5b2fb935,3,MICROSOFT FABRIC,AZURE AI FOUNDRY,Microsoft Fabric and Azure AI Foundry are two ...,27.0,488,[422c343a682e7f78dae36ac94f2bf135ec09bd2784744...
4,6b4b7190-1e56-44dc-a687-24710623bf6c,4,MICROSOFT FABRIC,POWER BI,Microsoft Fabric is a comprehensive platform d...,294.0,651,[422c343a682e7f78dae36ac94f2bf135ec09bd2784744...


In [5]:
# NOTE: covariates are turned off by default, because they generally need prompt tuning to be valuable
# Please see the GRAPHRAG_CLAIM_* settings
# covariate_df = pd.read_parquet(f"{INPUT_DIR}/{COVARIATE_TABLE}.parquet")

# claims = read_indexer_covariates(covariate_df)

# print(f"Claim records: {len(claims)}")
# covariates = {"claims": claims}

In [6]:
# Load the community report table; the reader also takes community_df (loaded
# in the entity cell) and the configured community level.
report_df = pd.read_parquet(
    f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet"
)
reports = read_indexer_reports(report_df, community_df, COMMUNITY_LEVEL)

# Report the raw table size, then preview the first rows.
print(f"Report records: {len(report_df)}")
report_df.head(5)

Report records: 270


Unnamed: 0,id,human_readable_id,community,level,parent,children,title,summary,full_content,rank,rating_explanation,findings,full_content_json,period,size
0,4a1b360a941242928932582e582b2d8a,266,266,3,167,[],Microsoft Fabric Ecosystem,The Microsoft Fabric community encompasses a r...,# Microsoft Fabric Ecosystem\n\nThe Microsoft ...,8.5,The impact severity rating is high due to the ...,[{'explanation': 'Microsoft Fabric is deeply i...,"{\n ""title"": ""Microsoft Fabric Ecosystem"",\...",2025-05-17,252
1,9f6f2f0256e947afb79201c599dcd51b,267,267,3,167,[],Fabric Community Conference in Las Vegas,The Fabric Community Conference is an annual e...,# Fabric Community Conference in Las Vegas\n\n...,6.0,The impact severity rating is moderate due to ...,[{'explanation': 'The Fabric Community Confere...,"{\n ""title"": ""Fabric Community Conference i...",2025-05-17,2
2,f7d208f8d7d5448d913ae0f36140fe54,268,268,3,233,[],Microsoft Fabric Data Management Community,The community centers around Microsoft Fabric'...,# Microsoft Fabric Data Management Community\n...,7.5,The impact severity rating is high due to the ...,[{'explanation': 'One Lake serves as a pivotal...,"{\n ""title"": ""Microsoft Fabric Data Managem...",2025-05-17,8
3,b4b3237867a14c91b79c92cd324f7657,269,269,3,233,[],Delta Lake and Associated Technologies,"The community centers around Delta Lake, an op...",# Delta Lake and Associated Technologies\n\nTh...,8.5,The impact severity rating is high due to the ...,[{'explanation': 'Delta Lake serves as a core ...,"{\n ""title"": ""Delta Lake and Associated Tec...",2025-05-17,3
4,41cc1742a4bf40c3ad88930d6cb5a8c3,153,153,2,19,[],Azure OpenAI and Its Ecosystem,"The community centers around Azure OpenAI, a s...",# Azure OpenAI and Its Ecosystem\n\nThe commun...,8.5,The impact severity rating is high due to the ...,[{'explanation': 'Azure OpenAI is a comprehens...,"{\n ""title"": ""Azure OpenAI and Its Ecosyste...",2025-05-17,3


In [7]:
# Load the text unit table and convert it with read_indexer_text_units.
text_unit_df = pd.read_parquet(
    f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet"
)
text_units = read_indexer_text_units(text_unit_df)

# Report the raw table size, then preview the first rows.
print(f"Text unit records: {len(text_unit_df)}")
text_unit_df.head(5)

Text unit records: 243


Unnamed: 0,id,human_readable_id,text,n_tokens,document_ids,entity_ids,relationship_ids,covariate_ids
0,422c343a682e7f78dae36ac94f2bf135ec09bd2784744d...,1,Tell us about your PDF experience.\nMicrosoft ...,1200,[d5835eb381e6b16dd4dd9c9b74c2bb54d06463d791e93...,"[3e730b25-df4d-4390-a891-3cb559ab387a, 09903b4...","[3877ccd6-2ae9-46bc-82de-3acf47ff371e, abb9d1b...",[]
1,7f05149428714a222f2c4fa84b8dc05f8991a7a9266cab...,2,"ises and in the cloud. For more information, s...",1200,[d5835eb381e6b16dd4dd9c9b74c2bb54d06463d791e93...,"[3e730b25-df4d-4390-a891-3cb559ab387a, 09903b4...","[3877ccd6-2ae9-46bc-82de-3acf47ff371e, 6b4b719...",[]
2,406c2d7deeffa71c9bcd8922a425f6548d58eabd180b0d...,3,"IoT Hub, Azure SQL DB Change Data Capture (CD...",1200,[d5835eb381e6b16dd4dd9c9b74c2bb54d06463d791e93...,"[3e730b25-df4d-4390-a891-3cb559ab387a, 09903b4...","[3877ccd6-2ae9-46bc-82de-3acf47ff371e, ba7b4a6...",[]
3,03b9935515464b3ba9fa660b98d2b7bd12a24ebe4f6494...,4,"For detailed instructions, see\nMoving your d...",1200,[d5835eb381e6b16dd4dd9c9b74c2bb54d06463d791e93...,"[3e730b25-df4d-4390-a891-3cb559ab387a, 5bffd67...","[ba7b4a6c-1a26-45b7-a0fa-e3dfd981aa88, 6b4b719...",[]
4,0c2447dd1987bf104efc25070271e6f085d4b9c2945d60...,5,", see Canceling, expiring, and closing.\nCance...",1200,[d5835eb381e6b16dd4dd9c9b74c2bb54d06463d791e93...,"[5bffd675-01c6-4f6d-a247-99333ee64f05, c44a355...","[0ad8a4e0-cedf-41fa-bc40-0a41babad623, f08cbdf...",[]


In [8]:
# feedback why should we pass the type to both config and the chat_model etc?

from dotenv import load_dotenv

from graphrag.config.enums import AuthType, ModelType
from graphrag.config.models.language_model_config import LanguageModelConfig
from graphrag.language_model.manager import ModelManager

# Azure OpenAI API version used by both the chat and the embedding config.
AZURE_OPENAI_API_VERSION = "2024-02-15-preview"


def _require_env(name: str) -> str:
    """Return the value of environment variable ``name``.

    Raises:
        RuntimeError: if the variable is unset or empty, so a missing setting is
            reported by name here instead of surfacing later as an opaque error.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name} is not set; define it in the shell or in the .env file."
        )
    return value


# Load environment variables from .env file (`os` is imported in the first cell).
load_dotenv()

api_key = _require_env("GRAPHRAG_API_KEY")
llm_model = _require_env("GRAPHRAG_LLM_MODEL")
embedding_model = _require_env("GRAPHRAG_EMBEDDING_MODEL")
api_base = _require_env("GRAPHRAG_API_BASE")

# Chat model: the same value is used as the model name and the deployment name.
chat_config = LanguageModelConfig(
    api_key=api_key,
    auth_type=AuthType.APIKey,
    type=ModelType.AzureOpenAIChat,
    model=llm_model,
    deployment_name=llm_model,
    max_retries=20,
    api_base=api_base,
    api_version=AZURE_OPENAI_API_VERSION,
)
chat_model = ModelManager().get_or_create_chat_model(
    name="local_search",
    model_type=ModelType.AzureOpenAIChat,
    config=chat_config,
)

# tiktoken raises KeyError when it cannot map a name to an encoding, which can
# happen when the deployment name is not an OpenAI model name. Fall back to a
# generic encoding in that case; token counts are then approximate.
try:
    token_encoder = tiktoken.encoding_for_model(llm_model)
except KeyError:
    token_encoder = tiktoken.get_encoding("cl100k_base")

# Embedding model: the same value is used as the model name and the deployment name.
embedding_config = LanguageModelConfig(
    api_key=api_key,
    auth_type=AuthType.APIKey,
    type=ModelType.AzureOpenAIEmbedding,
    model=embedding_model,
    deployment_name=embedding_model,
    api_base=api_base,
    api_version=AZURE_OPENAI_API_VERSION,
)

text_embedder = ModelManager().get_or_create_embedding_model(
    name="local_search_embedding",
    model_type=ModelType.AzureOpenAIEmbedding,
    config=embedding_config,
)

In [9]:
# Display the chat model name that was read from the GRAPHRAG_LLM_MODEL environment variable.
llm_model

'gpt-4o'

In [10]:
context_builder = LocalSearchMixedContext(
    # data loaded in the earlier cells
    entities=entities,
    relationships=relationships,
    text_units=text_units,
    community_reports=reports,
    # if you did not run covariates during indexing, set this to None
    # covariates=covariates,
    # entity description embeddings; switch the key to EntityVectorStoreKey.TITLE
    # if the vectorstore uses entity title as ids
    entity_text_embeddings=description_embedding_store,
    embedding_vectorstore_key=EntityVectorStoreKey.ID,
    # embedding model and tokenizer created in the model setup cell
    text_embedder=text_embedder,
    token_encoder=token_encoder,
)

In [11]:
# Settings for the local-search context builder. Each key is described next to
# its value; the names are passed through unchanged as dictionary keys.
local_context_params = dict(
    # Proportion of the context window dedicated to related text units.
    text_unit_prop=0.5,
    # Proportion of the context window dedicated to community reports. Whatever
    # remains goes to entities and relationships, so
    # text_unit_prop + community_prop should be <= 1.
    community_prop=0.1,
    # Maximum number of turns to include in the conversation history.
    conversation_history_max_turns=5,
    # If True, only include user queries in the conversation history.
    conversation_history_user_turns_only=True,
    # Number of related entities to retrieve from the entity description embedding store.
    top_k_mapped_entities=10,
    # Controls the number of out-of-network relationships pulled into the context window.
    top_k_relationships=10,
    # If True, include the entity rank in the entity table in the context window.
    # Default entity rank = node degree.
    include_entity_rank=True,
    # If True, include the relationship weight in the context window.
    include_relationship_weight=True,
    # If True, include the community rank in the context window.
    include_community_rank=False,
    # If True, return a set of dataframes containing all candidate
    # entity/relationship/covariate records that could be relevant. Not all of
    # them end up in the context window; the "in_context" column in these
    # dataframes indicates whether a record was included.
    return_candidate_context=False,
    # Set this to EntityVectorStoreKey.TITLE if the vectorstore uses entity title as ids.
    embedding_vectorstore_key=EntityVectorStoreKey.ID,
    # Maximum number of tokens to use for the context window. Change this based
    # on the token limit of your model (with an 8k limit, a good setting could be 5000).
    max_tokens=12_000,
)

# Generation settings passed to the search engine as model_params.
model_params = dict(
    # Change this based on the token limit of your model
    # (with an 8k limit, a good setting could be 1000-1500).
    max_tokens=2_000,
    temperature=0.0,
)

In [12]:
# Assemble the local search engine from the chat model, the context builder and
# the two parameter dictionaries defined above.
search_engine = LocalSearch(
    model=chat_model,
    model_params=model_params,
    context_builder=context_builder,
    context_builder_params=local_context_params,
    token_encoder=token_encoder,
    # Free form text describing the response type and format; can be anything,
    # e.g. prioritized list, single paragraph, multiple paragraphs, multiple-page report.
    response_type="multiple paragraphs",
)

In [14]:
result = await search_engine.search("how do you do shortcuts?")
print(result.response)



### Understanding Shortcuts in OneLake

Shortcuts in OneLake are a feature designed to enhance data access control and sharing within Microsoft Fabric. They allow users to define security settings on subfolders within the shortcut root, providing a granular level of data access management. This is particularly useful in environments where data needs to be shared across different users, applications, or organizations while maintaining strict access controls [Data: Entities (290, 322, 323); Relationships (396, 423, 424)].

### Security and Data Sharing

The primary function of shortcuts is to facilitate data sharing while ensuring that security protocols are adhered to. By setting permissions at the subfolder level, organizations can ensure that only authorized users have access to specific data sets. This is crucial in maintaining data integrity and confidentiality, especially in large organizations where data is accessed by multiple departments or external partners [Data: Entities (39)