In [1]:
import dotenv
dotenv.load_dotenv(override=True)


True

In [2]:
import os

import pandas as pd
import tiktoken

from graphrag.query.context_builder.entity_extraction import EntityVectorStoreKey
from graphrag.query.indexer_adapters import (
    read_indexer_covariates,
    read_indexer_entities,
    read_indexer_relationships,
    read_indexer_reports,
    read_indexer_text_units,
)
from graphrag.query.input.loaders.dfs import (
    store_entity_semantic_embeddings,
)
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.embedding import OpenAIEmbedding
from graphrag.query.llm.oai.typing import OpenaiApiType
from graphrag.query.question_gen.local_gen import LocalQuestionGen
from graphrag.query.structured_search.local_search.mixed_context import (
    LocalSearchMixedContext,
)
from graphrag.query.structured_search.local_search.search import LocalSearch
from graphrag.vector_stores.lancedb import LanceDBVectorStore

In [3]:
INPUT_DIR = "./output/20240704-221917/artifacts"
LANCEDB_URI = f"./lancedb"

COMMUNITY_REPORT_TABLE = "create_final_community_reports"
ENTITY_TABLE = "create_final_nodes"
ENTITY_EMBEDDING_TABLE = "create_final_entities"
RELATIONSHIP_TABLE = "create_final_relationships"
COVARIATE_TABLE = "create_final_covariates"
TEXT_UNIT_TABLE = "create_final_text_units"
COMMUNITY_LEVEL = 2

In [4]:
# read nodes table to get community and degree data
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet")

entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)

# load description embeddings to an in-memory lancedb vectorstore
# to connect to a remote db, specify url and port values.
description_embedding_store = LanceDBVectorStore(
    collection_name="entity_description_embeddings",
)
description_embedding_store.connect(db_uri=LANCEDB_URI)
entity_description_embeddings = store_entity_semantic_embeddings(
    entities=entities, vectorstore=description_embedding_store
)

print(f"Entity count: {len(entity_df)}")
entity_df.head()

Entity count: 828


Unnamed: 0,level,title,type,description,source_id,degree,human_readable_id,id,size,graph_embedding,community,top_level_node_id,x,y
0,0,"""PROJECT GUTENBERG""","""ORGANIZATION""",Project Gutenberg is a pioneering organization...,"01e84646075b255eab0a34d872336a89,10bab8e9773ee...",13,0,b45241d70f0e43fca764df95b2b81f77,13,,,b45241d70f0e43fca764df95b2b81f77,0,0
1,0,"""UNITED STATES""","""GEO""",The United States is frequently mentioned in v...,"01e84646075b255eab0a34d872336a89,28f242c451594...",1,1,4119fd06010c494caa07f439b333f4c5,1,,,4119fd06010c494caa07f439b333f4c5,0,0
2,0,"""CHARLES DICKENS""","""PERSON""",Charles Dickens is the renowned British noveli...,"680dd6d2a970a49082fa4f34bf63a34e,95f1f8f5bdbf0...",1,2,d3835bf3dda84ead99deadbeac5d0d7d,1,,0.0,d3835bf3dda84ead99deadbeac5d0d7d,0,0
3,0,"""ARTHUR RACKHAM""","""PERSON""","Arthur Rackham, an illustrator renowned for hi...","680dd6d2a970a49082fa4f34bf63a34e,95f1f8f5bdbf0...",1,3,077d2820ae1845bcbb1803379a3d1eae,1,,0.0,077d2820ae1845bcbb1803379a3d1eae,0,0
4,0,"""A CHRISTMAS CAROL""","""EVENT""","""A Christmas Carol"" is a novel authored by Cha...","680dd6d2a970a49082fa4f34bf63a34e,7b678bbc20b82...",5,4,3671ea0dd4e84c1a9b02c5ab2c8f4bac,5,,0.0,3671ea0dd4e84c1a9b02c5ab2c8f4bac,0,0


In [5]:
relationship_df = pd.read_parquet(f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet")
relationships = read_indexer_relationships(relationship_df)

print(f"Relationship count: {len(relationship_df)}")
relationship_df.head()

Relationship count: 323


Unnamed: 0,source,target,weight,description,text_unit_ids,id,human_readable_id,source_degree,target_degree,rank
0,"""PROJECT GUTENBERG""","""UNITED STATES""",7.0,"Project Gutenberg, operating within the United...","[01e84646075b255eab0a34d872336a89, 28f242c4515...",5b9fa6a959294dc29c8420b2d7d3096f,0,13,1,14
1,"""PROJECT GUTENBERG""","""SUZANNE SHELL""",1.0,"""Suzanne Shell's production work for A Christm...",[680dd6d2a970a49082fa4f34bf63a34e],b84d71ed9c3b45819eb3205fd28e13a0,1,13,2,15
2,"""PROJECT GUTENBERG""","""JANET BLENKINSHIP""",1.0,"""Janet Blenkinship's production work for A Chr...",[680dd6d2a970a49082fa4f34bf63a34e],b0b464bc92a541e48547fe9738378dab,2,13,2,15
3,"""PROJECT GUTENBERG""","""ONLINE DISTRIBUTED PROOFREADING TEAM""",1.0,"""The Online Distributed Proofreading Team's wo...",[680dd6d2a970a49082fa4f34bf63a34e],44c65dda6fb7472dae36f6eea720ab47,3,13,3,16
4,"""PROJECT GUTENBERG""","""GENERAL TERMS OF USE AND REDISTRIBUTING PROJE...",1.0,"""Project Gutenberg establishes and enforces th...",[da3ca9f93aac15c67f6acf3cca2fc229],5d97ff82691c4482973d73d1860e4757,4,13,1,14


In [6]:
covariate_df = pd.read_parquet(f"{INPUT_DIR}/{COVARIATE_TABLE}.parquet")

claims = read_indexer_covariates(covariate_df)

print(f"Claim records: {len(claims)}")
covariates = {"claims": claims}

FileNotFoundError: [Errno 2] No such file or directory: './output/20240704-221917/artifacts/create_final_covariates.parquet'

In [7]:
report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")
reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)

print(f"Report records: {len(report_df)}")
report_df.head()

Report records: 39


Unnamed: 0,community,full_content,level,rank,title,rank_explanation,summary,findings,full_content_json,id
0,38,# Scrooge's Transformation and the Spirit of C...,2,8.5,Scrooge's Transformation and the Spirit of Chr...,The impact severity rating is high due to the ...,This report delves into the pivotal characters...,"[{'explanation': 'The Spirit, identified as th...","{\n ""title"": ""Scrooge's Transformation and ...",37dd346a-4c36-43ea-aef6-7a248cc63a55
1,37,# The Transformation of Ebenezer Scrooge: A St...,2,8.5,The Transformation of Ebenezer Scrooge: A Stud...,The impact severity rating is high due to the ...,This report delves into the intricate network ...,[{'explanation': 'Ebenezer Scrooge's journey b...,"{\n ""title"": ""The Transformation of Ebeneze...",86662713-973c-4144-b045-ee2183c87978
2,15,"# Scrooge, The Clerk, and Christmas-time: A St...",1,7.5,"Scrooge, The Clerk, and Christmas-time: A Stud...",The impact severity rating is relatively high ...,This report delves into the intricate relation...,[{'explanation': 'Scrooge and The Clerk share ...,"{\n ""title"": ""Scrooge, The Clerk, and Chris...",5020dc21-78db-4c9e-baa3-709d909769c7
3,16,# Spectral Guidance and Redemption in A Christ...,1,8.5,Spectral Guidance and Redemption in A Christma...,The high impact severity rating reflects the p...,This report explores the interconnected roles ...,[{'explanation': 'The Ghost is central to 'A C...,"{\n ""title"": ""Spectral Guidance and Redempt...",0edb24dc-71f3-4d72-b2b4-2772ae1771d6
4,17,# Scrooge's Counting-House and the Spirit of C...,1,4.0,Scrooge's Counting-House and the Spirit of Chr...,"The impact severity rating is moderate, reflec...",This report delves into the dynamics within Sc...,[{'explanation': 'Scrooge exhibits a clear aut...,"{\n ""title"": ""Scrooge's Counting-House and ...",cdbb296f-b83b-4845-8a5f-a618a8bd9336


In [8]:
text_unit_df = pd.read_parquet(f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet")
text_units = read_indexer_text_units(text_unit_df)

print(f"Text unit records: {len(text_unit_df)}")
text_unit_df.head()

Text unit records: 231


Unnamed: 0,id,text,n_tokens,document_ids,entity_ids,relationship_ids
0,680dd6d2a970a49082fa4f34bf63a34e,﻿The Project Gutenberg eBook of A Christmas Ca...,300,[c305886e4aa2f6efcf64b57762777055],"[b45241d70f0e43fca764df95b2b81f77, 4119fd06010...","[5b9fa6a959294dc29c8420b2d7d3096f, b84d71ed9c3..."
1,95f1f8f5bdbf0bee3a2c6f2f4a4907f6,THE PROJECT GUTENBERG EBOOK A CHRISTMAS CAROL...,300,[c305886e4aa2f6efcf64b57762777055],"[d3835bf3dda84ead99deadbeac5d0d7d, 077d2820ae1...","[2171091ada0942d8ae7944df11659f6e, bcfdc48e5f0..."
2,3a450ed2b7fb1e5fce66f92698c13824,"1958,\n 1962, 1964, 1966, 1967, 1969, 1971, 1...",300,[c305886e4aa2f6efcf64b57762777055],"[de988724cfdf45cebfba3b13c43ceede, 96aad7cb4b7...","[8d8da35190bf43c5878fa38f3eb4f3d2, 2fb7e14a3f1..."
3,95b143eba145d91eacae7be3e4ebaf0c,".\n Mr. Fezziwig, a kind-hearted, jovial old ...",300,[c305886e4aa2f6efcf64b57762777055],"[9646481f66ce4fd2b08c2eddda42fc82, d91a266f766...","[d99eabad5dfd47278692569d2a9395b1, 0ec262c2cfe..."
4,c390f1b92e2888f78b58f6af5b12afa0,"debtors.\n Mrs. Cratchit, wife of Bob Cratch...",300,[c305886e4aa2f6efcf64b57762777055],"[de988724cfdf45cebfba3b13c43ceede, bf4e255cdac...","[a621663edba64d99b7e50f1e53f32ee7, 5ecf534a9ff..."


In [9]:
api_key = os.environ["GRAPHRAG_API_KEY"]
llm_model = os.environ["GRAPHRAG_LLM_MODEL"]
embedding_model = os.environ["GRAPHRAG_EMBEDDING_MODEL"]

llm = ChatOpenAI(
    api_key=api_key,
    model=llm_model,
    api_type=OpenaiApiType.OpenAI,  # OpenaiApiType.OpenAI or OpenaiApiType.AzureOpenAI
    max_retries=20,
)

token_encoder = tiktoken.get_encoding("cl100k_base")

text_embedder = OpenAIEmbedding(
    api_key=api_key,
    api_base=None,
    api_type=OpenaiApiType.OpenAI,
    model=embedding_model,
    deployment_name=embedding_model,
    max_retries=20,
)

In [10]:
context_builder = LocalSearchMixedContext(
    community_reports=reports,
    text_units=text_units,
    entities=entities,
    relationships=relationships,
    # covariates=covariates,
    entity_text_embeddings=description_embedding_store,
    embedding_vectorstore_key=EntityVectorStoreKey.ID,  # if the vectorstore uses entity title as ids, set this to EntityVectorStoreKey.TITLE
    text_embedder=text_embedder,
    token_encoder=token_encoder,
)

In [11]:
# text_unit_prop: proportion of context window dedicated to related text units
# community_prop: proportion of context window dedicated to community reports.
# The remaining proportion is dedicated to entities and relationships. Sum of text_unit_prop and community_prop should be <= 1
# conversation_history_max_turns: maximum number of turns to include in the conversation history.
# conversation_history_user_turns_only: if True, only include user queries in the conversation history.
# top_k_mapped_entities: number of related entities to retrieve from the entity description embedding store.
# top_k_relationships: control the number of out-of-network relationships to pull into the context window.
# include_entity_rank: if True, include the entity rank in the entity table in the context window. Default entity rank = node degree.
# include_relationship_weight: if True, include the relationship weight in the context window.
# include_community_rank: if True, include the community rank in the context window.
# return_candidate_context: if True, return a set of dataframes containing all candidate entity/relationship/covariate records that
# could be relevant. Note that not all of these records will be included in the context window. The "in_context" column in these
# dataframes indicates whether the record is included in the context window.
# max_tokens: maximum number of tokens to use for the context window.


local_context_params = {
    "text_unit_prop": 0.5,
    "community_prop": 0.1,
    "conversation_history_max_turns": 5,
    "conversation_history_user_turns_only": True,
    "top_k_mapped_entities": 10,
    "top_k_relationships": 10,
    "include_entity_rank": True,
    "include_relationship_weight": True,
    "include_community_rank": False,
    "return_candidate_context": False,
    "embedding_vectorstore_key": EntityVectorStoreKey.ID,  # set this to EntityVectorStoreKey.TITLE if the vectorstore uses entity title as ids
    "max_tokens": 12_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000)
}

llm_params = {
    "max_tokens": 2_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 1000=1500)
    "temperature": 0.0,
}

In [12]:
search_engine = LocalSearch(
    llm=llm,
    context_builder=context_builder,
    token_encoder=token_encoder,
    llm_params=llm_params,
    context_builder_params=local_context_params,
    response_type="multiple paragraphs",  # free form text describing the response type and format, can be anything, e.g. prioritized list, single paragraph, multiple paragraphs, multiple-page report
)

In [16]:
result = await search_engine.asearch("Who is the girl?")
print(result.response)

The character referred to as "the girl" in the context of Ebenezer Scrooge's transformation story appears in various forms and plays different roles throughout the narrative. The data tables provide insights into several characters that could be identified as "the girl," each contributing to the story's themes of regret, missed opportunities, and the impact of Scrooge's choices on his life and the lives of others.

### The Various Representations of "The Girl"

1. **"THE GIRL" and "GIRL"**:
   - "THE GIRL" is depicted as a character reflecting on the changes in her relationship and the impact of material circumstances on love. She is one of the two children revealed by the Spirit, described as yellow, meagre, ragged, and wolfish, yet prostrate in humility. This portrayal embodies themes of love, materialism, and the human condition [Data: Entities (134)].
   - "GIRL" is a servant or family member at the nephew's house who interacts with Scrooge, guiding him to the dining room. This int