In [25]:
import os

import pandas as pd
import tiktoken

from graphrag.config.enums import ModelType
from graphrag.config.models.language_model_config import LanguageModelConfig
from graphrag.language_model.manager import ModelManager
from graphrag.query.indexer_adapters import (
    read_indexer_communities,
    read_indexer_entities,
    read_indexer_reports,
)
from graphrag.query.structured_search.global_search.community_context import (
    GlobalCommunityContext,
)
from graphrag.query.structured_search.global_search.search import GlobalSearch

In [26]:
# feedback why should we pass the type to both config and the chat_model etc?

from graphrag.config.enums import ModelType, AuthType
from graphrag.config.models.language_model_config import LanguageModelConfig
from graphrag.language_model.manager import ModelManager
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

api_key = os.getenv("GRAPHRAG_API_KEY")
llm_model = os.getenv("GRAPHRAG_LLM_MODEL")
embedding_model = os.getenv("GRAPHRAG_EMBEDDING_MODEL")

config = LanguageModelConfig(
    api_key=api_key,
    auth_type=AuthType.APIKey, 
    type=ModelType.AzureOpenAIChat,
    model=llm_model,
    deployment_name=llm_model,
    max_retries=20,
    api_base= os.getenv("GRAPHRAG_API_BASE"),
    api_version="2024-02-15-preview"
)
model = ModelManager().get_or_create_chat_model(
    name="local_search",
    model_type=ModelType.AzureOpenAIChat,
    config=config,
)

token_encoder = tiktoken.encoding_for_model(llm_model)



In [27]:
INPUT_DIR = "./output"

COMMUNITY_REPORT_TABLE = "community_reports"
ENTITY_TABLE = "entities"
COMMUNITY_TABLE = "communities"
COMMUNITY_LEVEL = 2

In [28]:
community_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_TABLE}.parquet")
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")

communities = read_indexer_communities(community_df, report_df)
reports = read_indexer_reports(report_df, community_df, COMMUNITY_LEVEL)
entities = read_indexer_entities(entity_df, community_df, COMMUNITY_LEVEL)

print(f"Total report count: {len(report_df)}")
print(
    f"Report count after filtering by community level {COMMUNITY_LEVEL}: {len(reports)}"
)

report_df.head()

Total report count: 458
Report count after filtering by community level 2: 436


Unnamed: 0,id,human_readable_id,community,level,parent,children,title,summary,full_content,rank,rating_explanation,findings,full_content_json,period,size
0,ba053084dcaa4945babebf54486fefb6,456,456,4,444,[],Data Factory and Microsoft Fabric Community,"The community centers around Data Factory, a c...",# Data Factory and Microsoft Fabric Community\...,9.0,The impact severity rating is high due to the ...,[{'explanation': 'Data Factory serves as the c...,"{\n ""title"": ""Data Factory and Microsoft Fa...",2025-05-22,49
1,01f3b549e43a407daa72a221f7b05e9a,457,457,4,444,[],On-Premises Data Gateway and Local Data Source,The community centers around the On-Premises D...,# On-Premises Data Gateway and Local Data Sour...,8.5,The impact severity rating is high due to the ...,[{'explanation': 'The On-Premises Data Gateway...,"{\n ""title"": ""On-Premises Data Gateway and ...",2025-05-22,2
2,a5ced721949e42b8b480394caee27ce1,436,436,3,255,[],Microsoft Fabric Community Dynamics,The Microsoft Fabric community is structured a...,# Microsoft Fabric Community Dynamics\n\nThe M...,9.0,The high impact severity rating reflects the c...,[{'explanation': 'The Activity Log is a crucia...,"{\n ""title"": ""Microsoft Fabric Community Dy...",2025-05-22,32
3,cf67628e7da24816b4e7a1552516d270,437,437,3,255,[],Fabric Activity Log and API Data,The community centers around the Fabric Activi...,# Fabric Activity Log and API Data\n\nThe comm...,8.5,The impact severity rating is high due to the ...,[{'explanation': 'The Fabric Activity Log serv...,"{\n ""title"": ""Fabric Activity Log and API D...",2025-05-22,2
4,519cc9ef5ae846aea0cd3e18141b7b8d,438,438,3,266,[],Microsoft Fabric Administration Community,The community centers around the administratio...,# Microsoft Fabric Administration Community\n\...,8.5,The impact severity rating is high due to the ...,[{'explanation': 'The Admin Portal serves as t...,"{\n ""title"": ""Microsoft Fabric Administrati...",2025-05-22,4


In [29]:
context_builder = GlobalCommunityContext(
    community_reports=reports,
    communities=communities,
    entities=entities,  # default to None if you don't want to use community weights for ranking
    token_encoder=token_encoder,
)

### Parameter descriptions for Global Search
- **model**: The LLM/chat model used for generating intermediate and final responses.
- **context_builder**: Prepares and batches context data from community reports for the map-reduce process.
- **token_encoder**: Optional token encoder (e.g., from tiktoken) for managing token limits.
- **map_system_prompt**: Prompt template for the "map" phase (intermediate response generation). If not provided, uses default.
- **reduce_system_prompt**: Prompt template for the "reduce" phase (final aggregation). If not provided, uses default.
- **response_type**: Specifies the format of the final answer (e.g., "multiple paragraphs").
- **allow_general_knowledge**: If True, allows general knowledge (outside the dataset) to be included in the response.
- **general_knowledge_inclusion_prompt**: Custom prompt for including general knowledge.
- **json_mode**: If True, expects and parses JSON-formatted LLM responses in the map phase.
- **callbacks**: List of callback objects for logging or monitoring query execution.
- **max_data_tokens**: Maximum tokens allowed for context data (controls batching).
- **map_llm_params**: Dict of extra parameters to pass to the map-phase LLM calls (e.g., temperature, max_tokens).
- **reduce_llm_params**: Dict of extra parameters for the reduce-phase LLM call.
- **map_max_length**: Maximum number of tokens/words in a map-phase LLM response.
- **reduce_max_length**: Maximum number of tokens/words in a reduce-phase LLM response.
- **context_builder_params**: Dict of parameters to pass to the context builder.
- **concurrent_coroutines**: Number of parallel coroutines to use during the map phase.

In [30]:
### Testing the callback

from graphrag.callbacks.query_callbacks import QueryCallbacks

class PrintMapResultCallback(QueryCallbacks):
    def on_map_result(self, map_result, **kwargs):
        print("Intermediate map result:", map_result)

    # Optionally override other methods if needed
    # def on_reduce_result(self, reduce_result, **kwargs):
    #     print("Reduce result:", reduce_result)

# Instantiate your callback(s)
callbacks = [PrintMapResultCallback()]

In [31]:
context_builder_params = {
    "use_community_summary": False,  # False means using full community reports. True means using community short summaries.
    "shuffle_data": True,
    "include_community_rank": True,
    "min_community_rank": 0,
    "community_rank_name": "rank",
    "include_community_weight": True,
    "community_weight_name": "occurrence weight",
    "normalize_community_weight": True,
    "max_tokens": 20_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000)
    "context_name": "Reports",
}

map_llm_params = {
    "max_tokens": 1000,
    "temperature": 0.0,
    "response_format": {"type": "json_object"},
}

reduce_llm_params = {
    "max_tokens": 5000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 1000-1500)
    "temperature": 0.0,
}

In [32]:
search_engine = GlobalSearch(
    model=model,
    callbacks=callbacks,
    context_builder=context_builder,
    token_encoder=token_encoder,
    max_data_tokens=12_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000)
    map_llm_params=map_llm_params,
    reduce_llm_params=reduce_llm_params,
    allow_general_knowledge=False,  # set this to True will add instruction to encourage the LLM to incorporate general knowledge in the response, which may increase hallucinations, but could be useful in some use cases.
    json_mode=True,  # set this to False if your LLM model does not support JSON mode.
    context_builder_params=context_builder_params,
    concurrent_coroutines=32,
    response_type="multiple paragraphs",  # free form text describing the response type and format, can be anything, e.g. prioritized list, single paragraph, multiple paragraphs, multiple-page report
)

In [33]:
context = await context_builder.build_context(
    query="how do you do shortcuts?",
    **context_builder_params
)
print(context)

ContextBuilderResult(context_chunks=['id|title|occurrence weight|content|rank\n235|Microsoft Fabric Notebooks and Associated Features|0.10679611650485436|"# Microsoft Fabric Notebooks and Associated Features\n\nThe community centers around the Microsoft Fabric Notebooks, which serve as an advanced interactive programming tool for data processing and analysis. Key entities include the Fabric AI Skill, datasets, and various features that enhance user experience, such as session expiry control and debugging capabilities. The relationships among these entities highlight their interdependencies and collaborative functionalities.\n\n## Central Role of Notebooks in Microsoft Fabric\n\nNotebooks are the core component of Microsoft Fabric, facilitating a collaborative environment for users engaged in data processing and machine learning. They allow users to author and execute code in a preconfigured setting, supporting multiple programming languages, including Python. This centrality is undersc

In [34]:
result = await search_engine.search("how do you do shortcuts?")
print(result.response)

### Understanding Shortcuts in Data Management

Shortcuts are a powerful feature designed to enhance data accessibility and management across various platforms. They serve as links or references to existing data locations, allowing users to access and manage data without the need for duplication. This functionality is particularly beneficial for organizations that rely on multiple data sources and need a streamlined approach to data management.

### Key Features of Shortcuts

1. **Unified Namespace**: Shortcuts provide a unified namespace for accessing existing data sources, such as Google Cloud Storage and S3. This eliminates the need for data duplication and simplifies data access and management [Data: Reports (241)].

2. **Semantic Model Tables**: Within the Microsoft Fabric Lakehouse community, shortcuts enable users to create semantic model tables. This allows for the linking of disparate data silos without moving or copying data, thereby streamlining data management and enhancing

In [35]:
# inspect the data used to build the context for the LLM responses
result.context_data["reports"]

Unnamed: 0,id,title,occurrence weight,content,rank
0,235,Microsoft Fabric Notebooks and Associated Feat...,0.106796,# Microsoft Fabric Notebooks and Associated Fe...,9.0
1,135,Microsoft Fabric Capacity Management Community,0.067961,# Microsoft Fabric Capacity Management Communi...,9.0
2,348,Champions Network and Community Resources for ...,0.067961,# Champions Network and Community Resources fo...,8.5
3,388,Microsoft Fabric Data Management Community,0.058252,# Microsoft Fabric Data Management Community\n...,9.0
4,101,Microsoft Fabric Community Insights,0.058252,# Microsoft Fabric Community Insights\n\nThe M...,9.0
...,...,...,...,...,...
431,230,Mirrored Azure Databricks Catalog and VNet Dat...,0.029126,# Mirrored Azure Databricks Catalog and VNet D...,8.5
432,386,Strategic and Tactical Planning in Data Manage...,0.019417,# Strategic and Tactical Planning in Data Mana...,8.5
433,19,Data Residency and Encryption Keys in Microsof...,0.019417,# Data Residency and Encryption Keys in Micros...,8.5
434,195,Fabric CI/CD Python Library and Microsoft Fabric,0.009709,# Fabric CI/CD Python Library and Microsoft Fa...,9.0


In [36]:
# inspect number of LLM calls and tokens
print(
    f"LLM calls: {result.llm_calls}. Prompt tokens: {result.prompt_tokens}. Output tokens: {result.output_tokens}."
)

LLM calls: 35. Prompt tokens: 290736. Output tokens: 2433.


In [37]:
import asyncio
from IPython.display import display, clear_output, Markdown

async def stream_answer():
    output = ""  # Define here!
    async for chunk in search_engine.stream_search("what is the difference between pipeline and dataflow?"):
        for char in chunk:
            output += char
            clear_output(wait=True)
            display(Markdown(output))
            await asyncio.sleep(0.003)  # Simulate typing speed

await stream_answer()


### Overview

Pipelines and dataflows are both integral components of data management and processing within Microsoft Fabric, but they serve distinct purposes and functionalities. Understanding their differences is crucial for effectively leveraging them in data workflows.

### Pipelines

Pipelines are primarily used for orchestrating and automating complex data workflows. They are designed to handle data movement and transformation across various systems and services, often involving multiple steps and dependencies. Pipelines are essential for managing data integration and transformation processes, enabling users to automate workflows and ensure efficient data handling. They are particularly suited for enterprise-scale data solutions, addressing unique industry needs by integrating with tools like Azure Event Hubs and Azure Analysis Services [Data: Reports (429, 219, 310, +more)].

Pipelines are more technical and require advanced configuration and scripting. They are capable of managing the flow of data across different systems, making them suitable for complex data integration scenarios where data needs to be moved and transformed across various environments [Data: Reports (123, 153, 17, +more)].

### Dataflows

Dataflows, on the other hand, focus on the transformation and preparation of data. They are often used to clean, aggregate, and organize data for analysis, making them more user-friendly and accessible to users who may not have extensive programming skills. Dataflows are typically associated with specific tools like Power BI, where they are used to create reusable data transformation logic that can be shared across different reports and dashboards [Data: Reports (123, 153, 17, +more)].

Dataflows are designed to centralize data preparation logic, enhancing data consistency and minimizing the frequency of refreshes on source systems. This centralization is crucial for optimizing data management and supporting robust data analysis and reporting capabilities [Data: Reports (59, 5)].

### Key Differences

- **Purpose**: Pipelines are used for orchestrating data workflows, while dataflows focus on data transformation and preparation.
- **Complexity**: Pipelines handle complex data integration and require advanced configuration, whereas dataflows offer a more user-friendly interface for data transformation.
- **Integration**: Pipelines integrate with various systems and services for data movement, while dataflows are often tied to specific platforms like Power BI for data preparation.
- **User Base**: Pipelines are suited for technical users managing complex workflows, while dataflows cater to users needing accessible data transformation tools.

In summary, while both pipelines and dataflows are essential for data management within Microsoft Fabric, they cater to different aspects of data processing. Pipelines are the backbone for orchestrating complex workflows, whereas dataflows provide a streamlined approach to data transformation and preparation.