In [38]:
# #!pip install --upgrade pymilvus
# !pip install git+https://github.com/zc277584121/graphrag.git

In [None]:
api_key = '9101287fb67e484b9970dd8d7e31aa05' # Your OpenAI API key
llm_model = "gpt-4o"  # Or gpt-4-turbo-preview
embedding_model = "text-embedding-ada-002"

In [17]:
import os

import pandas as pd
import tiktoken

from graphrag.query.context_builder.entity_extraction import EntityVectorStoreKey
from graphrag.query.indexer_adapters import (
    read_indexer_covariates,
    read_indexer_entities,
    read_indexer_relationships,
    read_indexer_reports,
    read_indexer_text_units,
)
from graphrag.query.input.loaders.dfs import (
    store_entity_semantic_embeddings,
)
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.embedding import OpenAIEmbedding
from graphrag.query.llm.oai.typing import OpenaiApiType
from graphrag.query.question_gen.local_gen import LocalQuestionGen
from graphrag.query.structured_search.local_search.mixed_context import (
    LocalSearchMixedContext,
)
from graphrag.query.structured_search.local_search.search import LocalSearch
from graphrag.vector_stores.lancedb import LanceDBVectorStore

  from .autonotebook import tqdm as notebook_tqdm


In [106]:
INPUT_DIR = r"output/20241014-013407/artifacts"
# diesl-graph\classical_graphrag\output\20241014-011736
LANCEDB_URI = f"{INPUT_DIR}/lancedb"
# diesl-graph\classical_graphrag\output\20241013-131846
COMMUNITY_REPORT_TABLE = "create_final_community_reports"
ENTITY_TABLE = "create_final_nodes"
ENTITY_EMBEDDING_TABLE = "create_final_entities"
RELATIONSHIP_TABLE = "create_final_relationships"
COVARIATE_TABLE = "create_final_covariates"
TEXT_UNIT_TABLE = "create_final_text_units"
COMMUNITY_LEVEL = 2

In [107]:
# import os  
  
# # Get the current working directory  
# current_path = os.getcwd()  
# print("Current Path:", current_path)  


In [108]:
# read nodes table to get community and degree data
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet")

entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)

# load description embeddings to an in-memory lancedb vectorstore
# to connect to a remote db, specify url and port values.
description_embedding_store = LanceDBVectorStore(
    collection_name="entity_description_embeddings",
)
description_embedding_store.connect(db_uri=LANCEDB_URI)
entity_description_embeddings = store_entity_semantic_embeddings(
    entities=entities, vectorstore=description_embedding_store
)

print(f"Entity count: {len(entity_df)}")
entity_df.head()

Entity count: 230


Unnamed: 0,level,title,type,description,source_id,degree,human_readable_id,id,size,graph_embedding,community,entity_type,top_level_node_id,x,y
0,0,JOB #1036,IDENTIFIER,A unique identifier for the construction project,0ddf19c304abd325f4a40aa247b3f917,0,0,b45241d70f0e43fca764df95b2b81f77,0,,,,b45241d70f0e43fca764df95b2b81f77,0,0
1,0,COST 8,COST,The cost associated with the construction project,0ddf19c304abd325f4a40aa247b3f917,0,1,4119fd06010c494caa07f439b333f4c5,0,,,,4119fd06010c494caa07f439b333f4c5,0,0
2,0,DEW CONSTRUCTION,COMPANY,The construction company responsible for the p...,0ddf19c304abd325f4a40aa247b3f917,1,2,d3835bf3dda84ead99deadbeac5d0d7d,1,,0.0,,d3835bf3dda84ead99deadbeac5d0d7d,0,0
3,0,CHARLOTTE CENTRAL SCHOOL - PHASE 2 IMPROVEMENTS,PROJECT,"""CHARLOTTE CENTRAL SCHOOL - PHASE 2 IMPROVEMEN...","0ddf19c304abd325f4a40aa247b3f917,f6dea1cdbc4c0...",3,3,077d2820ae1845bcbb1803379a3d1eae,3,,0.0,,077d2820ae1845bcbb1803379a3d1eae,0,0
4,0,CONFORMED CONSTRUCTION SET,DOCUMENT,The finalized set of construction documents re...,0ddf19c304abd325f4a40aa247b3f917,1,4,3671ea0dd4e84c1a9b02c5ab2c8f4bac,1,,0.0,,3671ea0dd4e84c1a9b02c5ab2c8f4bac,0,0


In [109]:
relationship_df = pd.read_parquet(f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet")
relationships = read_indexer_relationships(relationship_df)

print(f"Relationship count: {len(relationship_df)}")
relationship_df.head()

Relationship count: 95


Unnamed: 0,source,target,weight,description,text_unit_ids,id,human_readable_id,source_degree,target_degree,rank
0,DEW CONSTRUCTION,CHARLOTTE CENTRAL SCHOOL - PHASE 2 IMPROVEMENTS,1.0,DEW Construction is responsible for the Charlo...,[0ddf19c304abd325f4a40aa247b3f917],553b285bba60460ab1ed8341ae61282b,0,1,3,4
1,CHARLOTTE CENTRAL SCHOOL - PHASE 2 IMPROVEMENTS,CONFORMED CONSTRUCTION SET,1.0,The Conformed Construction Set is a finalized ...,[0ddf19c304abd325f4a40aa247b3f917],cec95bf17e7e4c939b56c9c6f402a29f,1,3,1,4
2,CHARLOTTE CENTRAL SCHOOL - PHASE 2 IMPROVEMENTS,WATERPROOFING,1.0,Waterproofing is part of the improvements bein...,[f6dea1cdbc4c0c8a12ef0d5965afef47],599164aead034bc19446efacc77554d2,2,3,9,12
3,SHEET WATERPROOFING,FOUNDATION WALLS,1.0,Sheet waterproofing is to be installed vertica...,[0ddf19c304abd325f4a40aa247b3f917],bbf148ae4d48422f8fdef754cfa2b9e4,3,21,1,22
4,SHEET WATERPROOFING,PEEL-AND-STICK SELF-ADHERING MODIFIED BITUMINO...,1.0,Peel-and-stick self-adhering modified bitumino...,[7eefdfb69a0de379a4353c7d43a55125],de61b2670999433f807a6a1dc2b81e43,4,21,2,23


In [110]:
# # NOTE: covariates are turned off by default, because they generally need prompt tuning to be valuable
# # Please see the GRAPHRAG_CLAIM_* settings
# covariate_df = pd.read_parquet(f"{INPUT_DIR}/{COVARIATE_TABLE}.parquet")

# claims = read_indexer_covariates(covariate_df)

# print(f"Claim records: {len(claims)}")
# covariates = {"claims": claims}

In [111]:
api_key = '9101287fb67e484b9970dd8d7e31aa05' # Your OpenAI API key
llm_model = "gpt-4o"  # Or gpt-4-turbo-preview
embedding_model = "text-embedding-ada-002"

In [112]:
report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")
reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)

print(f"Report records: {len(report_df)}")
report_df.head()

Report records: 15


Unnamed: 0,community,full_content,level,rank,title,rank_explanation,summary,findings,full_content_json,id
0,10,# Drainage System Components and Relationships...,1,7.5,Drainage System Components and Relationships,The impact severity rating is high due to the ...,The community revolves around key components o...,[{'explanation': 'The Drainage Panel is a spec...,"{\n ""title"": ""Drainage System Components an...",01796ec1-2135-4d8c-893b-e284fd292f1c
1,11,# Sheet Waterproofing and Associated Materials...,1,7.5,Sheet Waterproofing and Associated Materials,The impact severity rating is high due to the ...,The community revolves around Sheet Waterproof...,[{'explanation': 'Sheet Waterproofing is the c...,"{\n ""title"": ""Sheet Waterproofing and Assoc...",fccdc70e-c457-4a4b-a041-af976a36ee71
2,12,# Sheet Waterproofing Materials and Adhesives\...,1,6.5,Sheet Waterproofing Materials and Adhesives,The impact severity rating is moderate to high...,The community revolves around the materials us...,[{'explanation': 'Adhesives are a crucial comp...,"{\n ""title"": ""Sheet Waterproofing Materials...",1271fea3-d940-40dd-8e51-3bfd6921c1d6
3,13,# NRCA Waterproofing Manual and Installation G...,1,7.5,NRCA Waterproofing Manual and Installation Gui...,The impact severity rating is high due to the ...,The community revolves around the NRCA Waterpr...,[{'explanation': 'The NRCA Waterproofing Manua...,"{\n ""title"": ""NRCA Waterproofing Manual and...",e17f7a11-1f6e-4512-90d5-0c23c3b8a4bc
4,14,# Peel-and-Stick Self-Adhering Modified Bitumi...,1,6.5,Peel-and-Stick Self-Adhering Modified Bitumino...,The impact severity rating is moderately high ...,The community revolves around the Peel-and-Sti...,[{'explanation': 'The Peel-and-Stick Self-Adhe...,"{\n ""title"": ""Peel-and-Stick Self-Adhering ...",07e1f14d-6c85-4b68-bcda-ddeb9ed25dd1


In [113]:
text_unit_df = pd.read_parquet(f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet")
text_units = read_indexer_text_units(text_unit_df)

print(f"Text unit records: {len(text_unit_df)}")
text_unit_df.head()

Text unit records: 6


Unnamed: 0,id,text,n_tokens,document_ids,entity_ids,relationship_ids
0,0ddf19c304abd325f4a40aa247b3f917,---- Page 1 ----\nJob #\n1036\nCost 8\nConform...,700,[cab74f1df7dac21621fdfaa68ba7b5ed],"[b45241d70f0e43fca764df95b2b81f77, 4119fd06010...","[553b285bba60460ab1ed8341ae61282b, cec95bf17e7..."
1,7eefdfb69a0de379a4353c7d43a55125,Data: Provide data for each product specified...,700,[cab74f1df7dac21621fdfaa68ba7b5ed],"[19a7f254a5d64566ab5cc15472df02de, 4a67211867e...","[de61b2670999433f807a6a1dc2b81e43, 3e95dacfe57..."
2,13283ec70c74c53260e8ec02348c370c,Closeout Submittals for additional warranty r...,700,[cab74f1df7dac21621fdfaa68ba7b5ed],"[85c79fd84f5e4f918471c386852204c5, 3138f39f2bc...","[38f51478f41f48db9bee570859b6f43e, 14555b518e9..."
3,ec7d3b0ae56f9ecd310dd1b921eebf4e,ile filter fabric on earth side.\n1. Thickness...,700,[cab74f1df7dac21621fdfaa68ba7b5ed],"[19a7f254a5d64566ab5cc15472df02de, de988724cfd...","[1f1545308e9347af91fd03b94aadc21f, 6ea81acaf23..."
4,9caca4f8e3c3fe9e1d0d08ece49b608c,substrate surfaces and conditions prior to co...,688,[cab74f1df7dac21621fdfaa68ba7b5ed],"[19a7f254a5d64566ab5cc15472df02de, f7e11b0e297...","[b07a7f088364459098cd8511ff27a4c8, 8870cf2b5df..."


In [120]:
api_key = "9101287fb67e484b9970dd8d7e31aa05"
# llm_model = os.environ["GRAPHRAG_LLM_MODEL"]
# embedding_model = os.environ["GRAPHRAG_EMBEDDING_MODEL"]
api_base="https://diesl-eus-openai-dev.openai.azure.com/"
api_version="2024-02-15-preview"
api_key = '9101287fb67e484b9970dd8d7e31aa05' # Your OpenAI API key
llm_model = "gpt-4o"  # Or gpt-4-turbo-preview
embedding_model = "text-embedding-ada-002"
llm = ChatOpenAI(
    api_key=api_key,
    api_base=api_base,
    api_version=api_version,
    model=llm_model,
    api_type=OpenaiApiType.AzureOpenAI,  # OpenaiApiType.OpenAI or OpenaiApiType.AzureOpenAI
    max_retries=4,
)

token_encoder = tiktoken.get_encoding("cl100k_base")

text_embedder = OpenAIEmbedding(
    api_key=api_key,
    api_base=api_base,
    api_version=api_version,
    api_type=OpenaiApiType.AzureOpenAI,
    model=embedding_model,
    deployment_name=embedding_model,
    max_retries=3,
)

In [121]:
context_builder = LocalSearchMixedContext(
    community_reports=reports,
    text_units=text_units,
    entities=entities,
    relationships=relationships,
    # if you did not run covariates during indexing, set this to None
    # covariates=covariates,
    entity_text_embeddings=description_embedding_store,
    embedding_vectorstore_key=EntityVectorStoreKey.ID,  # if the vectorstore uses entity title as ids, set this to EntityVectorStoreKey.TITLE
    text_embedder=text_embedder,
    token_encoder=token_encoder,
)

In [122]:
# text_unit_prop: proportion of context window dedicated to related text units
# community_prop: proportion of context window dedicated to community reports.
# The remaining proportion is dedicated to entities and relationships. Sum of text_unit_prop and community_prop should be <= 1
# conversation_history_max_turns: maximum number of turns to include in the conversation history.
# conversation_history_user_turns_only: if True, only include user queries in the conversation history.
# top_k_mapped_entities: number of related entities to retrieve from the entity description embedding store.
# top_k_relationships: control the number of out-of-network relationships to pull into the context window.
# include_entity_rank: if True, include the entity rank in the entity table in the context window. Default entity rank = node degree.
# include_relationship_weight: if True, include the relationship weight in the context window.
# include_community_rank: if True, include the community rank in the context window.
# return_candidate_context: if True, return a set of dataframes containing all candidate entity/relationship/covariate records that
# could be relevant. Note that not all of these records will be included in the context window. The "in_context" column in these
# dataframes indicates whether the record is included in the context window.
# max_tokens: maximum number of tokens to use for the context window.


local_context_params = {
    "text_unit_prop": 0.5,
    "community_prop": 0.1,
    "conversation_history_max_turns": 5,
    "conversation_history_user_turns_only": True,
    "top_k_mapped_entities": 10,
    "top_k_relationships": 10,
    "include_entity_rank": True,
    "include_relationship_weight": True,
    "include_community_rank": False,
    "return_candidate_context": False,
    "embedding_vectorstore_key": EntityVectorStoreKey.ID,  # set this to EntityVectorStoreKey.TITLE if the vectorstore uses entity title as ids
    "max_tokens": 12_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000)
}

llm_params = {
    "max_tokens": 2_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 1000=1500)
    "temperature": 0.0,
}

In [123]:
search_engine = LocalSearch(
    llm=llm,
    context_builder=context_builder,
    token_encoder=token_encoder,
    llm_params=llm_params,
    context_builder_params=local_context_params,
    response_type="prioritized list",  # free form text describing the response type and format, can be anything, e.g. prioritized list, single paragraph, multiple paragraphs, multiple-page report
)

In [125]:
result = await search_engine.asearch("""

Step 1: Leverage the 'Section Includes' portion of the document to identify the key product(s) within the document.  
Step 2: Identify the aliases used to refer to the key products outlined within the 'Section Includes' portion of the PDF.  
  a) Example aliases include 'GYPBD-1, RF-1'  
  Return only a numbered list of products that have an alias.  
Step 3: Using the list of products and aliases you created in step 2, verify no product or alias is excluded from our list. If you identify a new product and alias, add this new product to the existing list.  
""")
print(result.response)

Exception in _asearch
Traceback (most recent call last):
  File "c:\Users\aziz1\AppData\Local\Programs\Python\Python312\Lib\site-packages\graphrag\query\structured_search\local_search\search.py", line 82, in asearch
    response = await self.llm.agenerate(
               ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\aziz1\AppData\Local\Programs\Python\Python312\Lib\site-packages\graphrag\query\llm\oai\chat_openai.py", line 110, in agenerate
    async for attempt in retryer:
  File "c:\Users\aziz1\AppData\Local\Programs\Python\Python312\Lib\site-packages\tenacity\asyncio\__init__.py", line 166, in __anext__
    do = await self.iter(retry_state=self._retry_state)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\aziz1\AppData\Local\Programs\Python\Python312\Lib\site-packages\tenacity\asyncio\__init__.py", line 153, in iter
    result = await action(retry_state)
             ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\aziz1\AppData\Local\Programs\Python\Python312\Lib\si




In [119]:
question = """
Step 1: Leverage the 'Section Includes' portion of the document to identify the key product(s) within the PDF.  
Step 2: Identify the aliases used to refer to the key products outlined within the 'Section Includes' portion of the PDF.  
  a) Example aliases include 'GYPBD-1, RF-1'  
  Return only a numbered list of products that have an alias.  
Step 3: Using the list of products and aliases you created in step 2, verify no product or alias is excluded from our list. If you identify a new product and alias, add this new product to the existing list.  
"""
result = await search_engine.asearch(question)
print(result.response)

### Step 1: Key Products in 'Section Includes'

The 'Section Includes' portion of the document identifies the following key products:

1. **Sheet Waterproofing** for vertical installation at foundation walls.
2. **Waterstops** for casting into concrete.
3. **Below-grade waterproofing accessories**.

### Step 2: Aliases for Key Products

The document does not explicitly mention aliases like 'GYPBD-1' or 'RF-1' for the key products listed in the 'Section Includes' portion. However, it does reference specific products and standards:

1. **Sheet Waterproofing**:
   - **Carlisle Coatings & Waterproofing Inc; MiraDRI 860/861**
   - **GCP Applied Technologies; Bituthene System 4000**
   - **W. R. Meadows, Inc; MEL-ROL**

2. **Waterstops**:
   - No specific aliases mentioned, but it is noted that waterstops need to be provided and placed according to the manufacturer's installation requirements [Data: Entities (113); Relationships (26)].

3. **Below-grade waterproofing accessories**:
   - No s

In [78]:
!python -m graphrag.query --root . --method local """

Step 1: Leverage the 'Section Includes' portion of the document to identify the key product(s) within the PDF.  
Step 2: Identify the aliases used to refer to the key products outlined within the 'Section Includes' portion of the PDF.  
  a) Example aliases include 'GYPBD-1, RF-1'  
  Return only a numbered list of products that have an alias.  
Step 3: Using the list of products and aliases you created in step 2, verify no product or alias is excluded from our list. If you identify a new product and alias, add this new product to the existing list.  
"""


SyntaxError: unmatched ')' (2098973318.py, line 5)

In [None]:
import os

import pandas as pd
import tiktoken

from graphrag.query.context_builder.entity_extraction import EntityVectorStoreKey
from graphrag.query.indexer_adapters import (
    read_indexer_covariates,
    read_indexer_entities,
    read_indexer_relationships,
    read_indexer_reports,
    read_indexer_text_units,
)
from graphrag.query.input.loaders.dfs import (
    store_entity_semantic_embeddings,
)
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.embedding import OpenAIEmbedding
from graphrag.query.llm.oai.typing import OpenaiApiType
from graphrag.query.question_gen.local_gen import LocalQuestionGen
from graphrag.query.structured_search.local_search.mixed_context import (
    LocalSearchMixedContext,
)
from graphrag.query.structured_search.local_search.search import LocalSearch
from graphrag.vector_stores.lancedb import LanceDBVectorStore

  from .autonotebook import tqdm as notebook_tqdm
