In [1]:
import asyncio
import os
from typing import Optional

import pandas as pd
import tiktoken
from crewai import Agent, Task, Crew, LLM
from crewai.tools import BaseTool
from dotenv import load_dotenv
from graphrag.config.enums import ModelType, AuthType
from graphrag.config.models.language_model_config import LanguageModelConfig
from graphrag.language_model.manager import ModelManager
from graphrag.query.context_builder.entity_extraction import EntityVectorStoreKey
from graphrag.query.indexer_adapters import (
    read_indexer_entities,
    read_indexer_relationships,
    read_indexer_reports,
    read_indexer_text_units,
    read_indexer_communities,
)
from graphrag.query.structured_search.global_search.community_context import GlobalCommunityContext
from graphrag.query.structured_search.global_search.search import GlobalSearch
from graphrag.query.structured_search.local_search.mixed_context import LocalSearchMixedContext
from graphrag.query.structured_search.local_search.search import LocalSearch
from graphrag.vector_stores.lancedb import LanceDBVectorStore

# Load variables from a .env file into the process environment before reading them.
load_dotenv()

# Connection settings for the GraphRAG models; each one is None when its
# environment variable is not set.
api_key, llm_model, embedding_model, api_base = (
    os.getenv(variable_name)
    for variable_name in (
        "GRAPHRAG_API_KEY",
        "GRAPHRAG_LLM_MODEL",
        "GRAPHRAG_EMBEDDING_MODEL",
        "GRAPHRAG_API_BASE",
    )
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class GraphRagBaseConfig:
    """Connection settings plus factories for the GraphRAG chat and embedding models.

    Args:
        llm_model: Chat model name; also used as the deployment name.
        embedding_model: Embedding model name; also used as the deployment name.
        api_key: API key passed to both model configurations.
        api_base: API base URL passed to both model configurations.
        api_version: API version string passed to both model configurations.
        max_retries: Retry count applied to the chat model configuration only.
    """

    def __init__(
        self,
        llm_model,
        embedding_model,
        api_key,
        api_base,
        api_version="2024-02-15-preview",
        max_retries=20,
    ):
        self.llm_model = llm_model
        self.embedding_model = embedding_model
        self.api_key = api_key
        self.api_base = api_base
        self.api_version = api_version
        self.max_retries = max_retries

    def _build_config(self, model_type, model_name, **extra):
        """Build the LanguageModelConfig shared by the chat and embedding factories."""
        return LanguageModelConfig(
            api_key=self.api_key,
            auth_type=AuthType.APIKey,
            type=model_type,
            model=model_name,
            deployment_name=model_name,
            api_base=self.api_base,
            api_version=self.api_version,
            **extra,
        )

    def get_chat_model(self, name="default"):
        """Return the chat model that ModelManager holds (or creates) under ``name``."""
        chat_config = self._build_config(
            ModelType.AzureOpenAIChat,
            self.llm_model,
            max_retries=self.max_retries,
        )
        return ModelManager().get_or_create_chat_model(
            name=name,
            model_type=ModelType.AzureOpenAIChat,
            config=chat_config
        )

    def get_embedding_model(self, name="embedding"):
        """Return the embedding model that ModelManager holds (or creates) under ``name``."""
        # No max_retries here: the embedding configuration relies on the library default.
        embedding_config = self._build_config(
            ModelType.AzureOpenAIEmbedding,
            self.embedding_model,
        )
        return ModelManager().get_or_create_embedding_model(
            name=name,
            model_type=ModelType.AzureOpenAIEmbedding,
            config=embedding_config
        )


In [3]:
# Bundle the connection settings read from the environment.
config = GraphRagBaseConfig(
    llm_model=llm_model,
    embedding_model=embedding_model,
    api_key=api_key,
    api_base=api_base,
)
# Model creation is left disabled in this cell. Note that the second line,
# if re-enabled, would rebind `embedding_model` from a name to a model object.
#chat_model = config.get_chat_model()
#embedding_model = config.get_embedding_model()

# Tokenizer matching the configured chat model name.
token_encoder = tiktoken.encoding_for_model(llm_model)

In [4]:


class LocalSearchTool(BaseTool):
    """CrewAI tool that answers a query with GraphRAG local search.

    The constructor reads the parquet index tables and the LanceDB entity
    description store under ``input_dir`` and builds a ``LocalSearch`` engine;
    ``_run`` executes a single query against that engine.

    Args:
        input_dir: Directory holding the ``*.parquet`` tables and ``lancedb``.
        llm_model: Chat model / deployment name (also selects the tokenizer).
        embedding_model: Embedding model / deployment name.
        api_key: API key handed to the model configurations.
        api_base: API base URL handed to the model configurations.
        **kwargs: Forwarded unchanged to ``BaseTool.__init__``.
    """

    name: str = "local_search"
    description: str = "Tool to perform local search operations using GraphRAG's index."

    # Declare all extra fields as Pydantic fields here
    input_dir: str = "./output/"
    llm_model: Optional[str] = None
    embedding_model: Optional[str] = None
    api_key: Optional[str] = None
    api_base: Optional[str] = None

    def __init__(
        self,
        input_dir="./output/",
        llm_model=None,
        embedding_model=None,
        api_key=None,
        api_base=None,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.input_dir = input_dir
        # Record the non-secret settings on the declared fields so they can be
        # inspected later. api_key is deliberately NOT stored: model fields
        # appear in the tool's repr, which would expose the secret in notebook
        # output. It is only handed to the model configuration below.
        self.llm_model = llm_model
        self.embedding_model = embedding_model
        self.api_base = api_base

        # Setup paths and configs
        COMMUNITY_REPORT_TABLE = "community_reports"
        ENTITY_TABLE = "entities"
        COMMUNITY_TABLE = "communities"
        RELATIONSHIP_TABLE = "relationships"
        TEXT_UNIT_TABLE = "text_units"
        COMMUNITY_LEVEL = 2

        # Drop trailing slashes so joined paths never contain "//"
        # (the default input_dir ends with "/").
        index_dir = self.input_dir.rstrip("/")
        lancedb_uri = f"{index_dir}/lancedb"

        def read_table(table_name):
            """Read one parquet table of the index into a DataFrame."""
            return pd.read_parquet(f"{index_dir}/{table_name}.parquet")

        # Load DataFrames
        entity_df = read_table(ENTITY_TABLE)
        community_df = read_table(COMMUNITY_TABLE)
        relationship_df = read_table(RELATIONSHIP_TABLE)
        report_df = read_table(COMMUNITY_REPORT_TABLE)
        text_unit_df = read_table(TEXT_UNIT_TABLE)

        entities = read_indexer_entities(entity_df, community_df, COMMUNITY_LEVEL)
        relationships = read_indexer_relationships(relationship_df)
        reports = read_indexer_reports(report_df, community_df, COMMUNITY_LEVEL)
        text_units = read_indexer_text_units(text_unit_df)

        # Load LanceDB Vector Store
        description_embedding_store = LanceDBVectorStore(
            collection_name="default-entity-description",
        )
        description_embedding_store.connect(db_uri=lancedb_uri)

        # Model and embedder: reuse the shared factory instead of rebuilding
        # the same LanguageModelConfig objects here.
        token_encoder = tiktoken.encoding_for_model(llm_model)
        model_factory = GraphRagBaseConfig(
            llm_model=llm_model,
            embedding_model=embedding_model,
            api_key=api_key,
            api_base=api_base,
        )
        chat_model = model_factory.get_chat_model(name="local_search")
        text_embedder = model_factory.get_embedding_model(name="local_search_embedding")

        # Context builder
        _context_builder = LocalSearchMixedContext(
            community_reports=reports,
            text_units=text_units,
            entities=entities,
            relationships=relationships,
            entity_text_embeddings=description_embedding_store,
            embedding_vectorstore_key=EntityVectorStoreKey.ID,
            text_embedder=text_embedder,
            token_encoder=token_encoder,
        )

        local_context_params = {
            "text_unit_prop": 0.5,
            "community_prop": 0.1,
            "conversation_history_max_turns": 5,
            "conversation_history_user_turns_only": True,
            "top_k_mapped_entities": 10,
            "top_k_relationships": 10,
            "include_entity_rank": True,
            "include_relationship_weight": True,
            "include_community_rank": False,
            "return_candidate_context": False,
            "embedding_vectorstore_key": EntityVectorStoreKey.ID,
            "max_tokens": 12_000,
        }
        model_params = {
            "max_tokens": 2_000,
            "temperature": 0.0,
        }

        self._search_engine = LocalSearch(
            model=chat_model,
            context_builder=_context_builder,
            token_encoder=token_encoder,
            model_params=model_params,
            context_builder_params=local_context_params,
            response_type="multiple paragraphs"
        )

    def _run(self, query: str) -> str:
        """Run ``query`` through the local search engine and return its response.

        The engine's ``search`` is a coroutine. When an event loop is already
        running in this thread (as in Jupyter) it is driven on that loop after
        patching it with ``nest_asyncio``; otherwise ``asyncio.run`` is used.
        """
        try:
            loop = asyncio.get_running_loop()
        except RuntimeError:
            # No loop is running in this thread.
            loop = None

        if loop is not None:
            # Imported lazily: nest_asyncio is only needed when a loop is
            # already running.
            import nest_asyncio
            nest_asyncio.apply()
            future = asyncio.ensure_future(self._search_engine.search(query))
            result = loop.run_until_complete(future)
        else:
            result = asyncio.run(self._search_engine.search(query))
        return result.response


In [5]:



class GlobalSearchTool(BaseTool):
    """CrewAI tool that answers a query with GraphRAG global search.

    The constructor reads the community, entity and community-report parquet
    tables under ``input_dir`` and builds a ``GlobalSearch`` engine; ``_run``
    executes a single query against that engine.

    Args:
        input_dir: Directory holding the ``*.parquet`` index tables.
        llm_model: Chat model / deployment name (also selects the tokenizer).
        api_key: API key handed to the model configuration.
        api_base: API base URL handed to the model configuration.
        callbacks: Optional callbacks passed to ``GlobalSearch``; an empty
            list is used when omitted.
        **kwargs: Forwarded unchanged to ``BaseTool.__init__``.
    """

    name: str = "global_search"
    description: str = "Tool to perform global search operations using GraphRAG's index."
    input_dir: str = "./output/"
    llm_model: Optional[str] = None
    api_key: Optional[str] = None
    api_base: Optional[str] = None

    def __init__(
        self,
        input_dir="./output",
        llm_model=None,
        api_key=None,
        api_base=None,
        callbacks=None,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.input_dir = input_dir
        # Record the non-secret settings on the declared fields so they can be
        # inspected later. api_key is deliberately NOT stored: model fields
        # appear in the tool's repr, which would expose the secret in notebook
        # output. It is only handed to the model configuration below.
        self.llm_model = llm_model
        self.api_base = api_base

        COMMUNITY_REPORT_TABLE = "community_reports"
        ENTITY_TABLE = "entities"
        COMMUNITY_TABLE = "communities"
        COMMUNITY_LEVEL = 2

        # Drop trailing slashes so joined paths never contain "//"; this makes
        # "./output" and "./output/" behave the same.
        index_dir = self.input_dir.rstrip("/")

        def read_table(table_name):
            """Read one parquet table of the index into a DataFrame."""
            return pd.read_parquet(f"{index_dir}/{table_name}.parquet")

        # Load DataFrames
        community_df = read_table(COMMUNITY_TABLE)
        entity_df = read_table(ENTITY_TABLE)
        report_df = read_table(COMMUNITY_REPORT_TABLE)

        communities = read_indexer_communities(community_df, report_df)
        reports = read_indexer_reports(report_df, community_df, COMMUNITY_LEVEL)
        entities = read_indexer_entities(entity_df, community_df, COMMUNITY_LEVEL)

        token_encoder = tiktoken.encoding_for_model(llm_model)

        # LLM model: reuse the shared factory instead of rebuilding the same
        # LanguageModelConfig here. Global search needs no embedding model.
        chat_model = GraphRagBaseConfig(
            llm_model=llm_model,
            embedding_model=None,
            api_key=api_key,
            api_base=api_base,
        ).get_chat_model(name="global_search")

        # Context builder
        _context_builder = GlobalCommunityContext(
            community_reports=reports,
            communities=communities,
            entities=entities,
            token_encoder=token_encoder,
        )

        context_builder_params = {
            "use_community_summary": False,
            "shuffle_data": True,
            "include_community_rank": True,
            "min_community_rank": 0,
            "community_rank_name": "rank",
            "include_community_weight": True,
            "community_weight_name": "occurrence weight",
            "normalize_community_weight": True,
            "max_tokens": 20_000,
            "context_name": "Reports",
        }
        map_llm_params = {
            "max_tokens": 1000,
            "temperature": 0.0,
            "response_format": {"type": "json_object"},
        }
        reduce_llm_params = {
            "max_tokens": 5000,
            "temperature": 0.0,
        }

        self._search_engine = GlobalSearch(
            model=chat_model,
            callbacks=callbacks if callbacks is not None else [],
            context_builder=_context_builder,
            token_encoder=token_encoder,
            max_data_tokens=12_000,
            map_llm_params=map_llm_params,
            reduce_llm_params=reduce_llm_params,
            allow_general_knowledge=False,
            json_mode=True,
            context_builder_params=context_builder_params,
            concurrent_coroutines=32,
            response_type="multiple paragraphs",
        )

    def _run(self, query: str) -> str:
        """Run ``query`` through the global search engine and return its response.

        The engine's ``search`` is a coroutine. When an event loop is already
        running in this thread (as in Jupyter) it is driven on that loop after
        patching it with ``nest_asyncio``; otherwise ``asyncio.run`` is used.
        """
        try:
            loop = asyncio.get_running_loop()
        except RuntimeError:
            # No loop is running in this thread.
            loop = None

        if loop is not None:
            # Imported lazily: nest_asyncio is only needed when a loop is
            # already running.
            import nest_asyncio
            nest_asyncio.apply()
            future = asyncio.ensure_future(self._search_engine.search(query))
            result = loop.run_until_complete(future)
        else:
            result = asyncio.run(self._search_engine.search(query))
        return result.response


In [6]:
# Both search tools read the same index directory and share the Azure settings.
index_dir = "./output/before-tuning"
search_tools = [
    LocalSearchTool(
        input_dir=index_dir,
        llm_model=llm_model,
        embedding_model=embedding_model,
        api_key=api_key,
        api_base=api_base,
    ),
    GlobalSearchTool(
        input_dir=index_dir,
        llm_model=llm_model,
        api_key=api_key,
        api_base=api_base,
    ),
]

# Single agent that answers Microsoft Fabric questions using the two tools.
graphrag_agent = Agent(
    role="You are an expert in Microsoft Fabric product",
    goal="Your goal is to answer the user's questions about Microsoft Fabric.",
    backstory=(
        "As an expert in Microsoft Fabric, you have extensive knowledge about its features, "
        "functionalities, and best practices. "
        "You are here to assist users in understanding and utilizing Microsoft Fabric effectively."
    ),
    tools=search_tools,
    llm=LLM(model=f'azure/{llm_model}'),
    verbose=True,
)


In [7]:
# Prompt template for the agent's task. It contains a single `{question}`
# placeholder; this cell does not substitute it, so callers must fill it in
# (e.g. with str.format) before using the text as a task description.
# NOTE(review): the text is sent to the model verbatim, including its typos
# and mixed "1." / "2-" numbering - tidy up if the wording matters.
task_instructions = """
You are an expert in Microsoft Fabric product. Your goal is to answer the user's question below about Microsoft Fabric.

Users' question:
{question}
YOu must follow the below rules:
1. You must use both tools provided to you to answer the user's questions.
2- once you get the results from the tools, you must combine them and provide a final answer to the user.
3- you must not repeat the texts or the contexts that are identical or very similar in both results.
4- you must not provide the results of the tools to the user, you must only provide the final answer.
5- if the user speficies that you need to use a specific tool, you must use that sepcific tool for exampl use local search you use the local search tool only.
6- if the user asks what were the difference between the tool results, you must provide the differences between the two results. Otherwise, you must not provide the differences between the two results."""


async def execute_task(task_instructions):
    """Run a one-task crew with ``graphrag_agent`` and return what kickoff produces.

    Args:
        task_instructions: Text used verbatim as the task description. The
            parameter shadows the module-level template of the same name.

    Note:
        Although declared ``async``, the body contains no ``await``;
        ``Crew.kickoff()`` is called synchronously.
    """
    single_task = Task(
        agent=graphrag_agent,
        description=task_instructions,
        expected_output="A concise and accurate answer to the user's question about Microsoft Fabric.",
    )
    return Crew(agents=[graphrag_agent], tasks=[single_task]).kickoff()

In [12]:
result = None
async def main():
    query_instruction = "What are the features of a pipeline in Fabric?"
    result = await execute_task(query_instruction)
    print(type(result))
    print(result)

# Execute example
await main()


[1m[95m# Agent:[00m [1m[92mYou are an expert in Microsoft Fabric product[00m
[95m## Task:[00m [92mWhat are the features of a pipeline in Fabric?[00m






[1m[95m# Agent:[00m [1m[92mYou are an expert in Microsoft Fabric product[00m
[95m## Thought:[00m [92mThought: To provide a concise and accurate answer about the features of a pipeline in Microsoft Fabric, I need to gather relevant information regarding pipelines. I will start by performing a local search to find specific details about pipeline features within Microsoft Fabric.[00m
[95m## Using tool:[00m [92mlocal_search[00m
[95m## Tool Input:[00m [92m
"{\"query\": \"features of a pipeline in Microsoft Fabric\"}"[00m
[95m## Tool Output:[00m [92m
### Overview of Pipelines in Microsoft Fabric

Pipelines in Microsoft Fabric are a crucial component for automating data workflows and processes. They enable users to orchestrate the movement and transformation of data across various systems and environments, ensuring that data is efficiently processed and made available for analysis and decision-making [Data: Entities (517, 97)].

### Key Features

1. **Automation and Orc

In [11]:
type(result)

NoneType