In [1]:
## analyze Wikipedia articles of differnet cities

In [2]:
from pathlib import Path
import requests
from llama_index import SimpleDirectoryReader

In [3]:
wiki_titles = ['Toronto','Seattle','Chicago','Boston','Houston',]

for title in wiki_titles:
    response = requests.get(
        "https://en.wikipedia.org/w/api.php",
        params={
            "action": "query",
            "format": "json",
            "titles": title,
            "prop": "extracts",
            # 'exintro': True,
            "explaintext": True,
        },
    ).json()
    page = next(iter(response['query']['pages'].values()))
    wiki_text = page['extract']
    
    data_path = Path('data')
    if not data_path.exists():
        Path.mkdir(data_path)
        
    with open(data_path / f"{title}.txt",'w') as fp:
        fp.write(wiki_text)

In [4]:
# load all wiki documents
city_docs = {}
for wiki_title in wiki_titles:
    city_docs[wiki_title] = SimpleDirectoryReader(
        input_files=[f"data/{wiki_title}.txt"]).load_data()

## Defining set of indexes

In [5]:
# define a vector index over the documetns of each city

from llama_index import VectorStoreIndex, ServiceContext, StorageContext, SimpleKeywordTableIndex
from llama_index.llms import OpenAI

# set service context
llm_gpt4 = OpenAI(temperature=0, model='gpt-4')
service_context = ServiceContext.from_defaults(llm=llm_gpt4, chunk_size=1024)

# Build city document index
vector_indices = {}
for wiki_title in wiki_titles:
    storage_context = StorageContext.from_defaults()
    # build vector index
    vector_indices[wiki_title] = VectorStoreIndex.from_documents(
        city_docs[wiki_title],
        service_context=service_context,
        storage_context=storage_context)
    # set id for vector index
    vector_indices[wiki_title].index_struct.index_id = wiki_title
    # persist to disk
    storage_context.persist(persist_dir=f"./storage/{wiki_title}")

In [6]:
## Querying a vector index lets perform sematnic search
response = (
    vector_indices['Boston']
    .as_query_engine()
    .query("What are the sports teams in Boston?"))

print(str(response))

Boston is home to several professional sports teams. These include the New England Patriots and the Boston Red Sox in football and baseball respectively. The city also has the Boston Bruins in the National Hockey League and the Boston Celtics in the National Basketball Association. In addition to these, Boston has esports teams such as the Boston Uprising in the Overwatch League and the Boston Breach in the Call of Duty League. The New England Revolution of Major League Soccer also share Gillette Stadium with the Patriots.


## Defining a Graph for Compare/Contrast

In [7]:
# define a composed graph
# contains a keyword table composed on top of existing indexes

# set the summary text for each vector index
index_summaries = {}
for wiki_title in wiki_titles:
    # set summary text for city
    index_summaries[wiki_title] = (
        f"This content contains Wikipedia articles about {wiki_title}. "
        f"Use this index if you need to look up specific facts about {wiki_title}.\n"
    )

In [8]:
# next compose a keyord table on top of vecotr indexes
from llama_index.indices.composability import ComposableGraph

graph = ComposableGraph.from_indices(
    SimpleKeywordTableIndex,
    [index for _, index in vector_indices.items()],
    [summary for _, summary in index_summaries.items()],
    max_keywords_per_chunk=50,)

# gt root index
root_index = graph.get_index(graph.index_struct.index_id)

# set id of root index
root_index.set_index_id('compare_contrast')
root_summary = (
    'This index contains Wikipedia articles about mutliple cities. '
    'Use this index if you want to compare multiple cities. '
)

In [9]:
# define decompose_transform
from llama_index.indices.query.query_transform.base import (
    DecomposeQueryTransform,
)

decompose_transform = DecomposeQueryTransform(llm=llm_gpt4, verbose=True)

# define custom query engines
from llama_index.query_engine.transform_query_engine import (
    TransformQueryEngine,
)

custom_query_engines = {}
for index in vector_indices.values():
    query_engine = index.as_query_engine(service_context=service_context)
    query_engine = TransformQueryEngine(
        query_engine,
        query_transform=decompose_transform,
        #transform_extra_info={"index_summary": index.index_struct.summary},
    )
    custom_query_engines[index.index_id] = query_engine
custom_query_engines[graph.root_id] = graph.root_index.as_query_engine(
    retriever_mode="simple",
    response_mode="tree_summarize",
    service_context=service_context,
)

# define query engine
query_engine = graph.as_query_engine(custom_query_engines=custom_query_engines)

# query the graph
query_str = "Compare and contrast the arts and culture of Houston and Boston. "
response_chatgpt = query_engine.query(query_str)

[1;3;33m> Current query: Compare and contrast the arts and culture of Houston and Boston. 
[0m[1;3;38;5;200m> New query: What is the arts and culture scene like in Houston?
[0m[1;3;33m> Current query: Compare and contrast the arts and culture of Houston and Boston. 
[0m[1;3;38;5;200m> New query: What is the arts and culture scene like in Houston?
[0m[1;3;33m> Current query: Compare and contrast the arts and culture of Houston and Boston. 
[0m[1;3;38;5;200m> New query: What is the arts and culture scene like in Houston?
[0m[1;3;33m> Current query: Compare and contrast the arts and culture of Houston and Boston. 
[0m[1;3;38;5;200m> New query: What is the arts and culture scene like in Houston?
[0m

## Defining unified query interface

In [10]:
from llama_index.tools.query_engine import QueryEngineTool

query_engine_tools = []

# add vector index tools
for wiki_title in wiki_titles:
    index = vector_indices[wiki_title]
    summary = index_summaries[wiki_title]

    query_engine = index.as_query_engine(service_context=service_context)
    vector_tool = QueryEngineTool.from_defaults(
        query_engine, description=summary
    )
    query_engine_tools.append(vector_tool)


# add graph tool
graph_description = (
    "This tool contains Wikipedia articles about multiple cities. "
    "Use this tool if you want to compare multiple cities. "
)
graph_tool = QueryEngineTool.from_defaults(
    query_engine, description=graph_description
)
query_engine_tools.append(graph_tool)

In [11]:
## define routing
from llama_index.query_engine.router_query_engine import RouterQueryEngine
from llama_index.selectors.llm_selectors import LLMSingleSelector


router_query_engine = RouterQueryEngine(
    selector=LLMSingleSelector.from_defaults(service_context=service_context),
    query_engine_tools=query_engine_tools,
)

## Querifying unified interface

In [12]:
# ask a compare/contrast question
response = router_query_engine.query(
    "Compare and contrast the sports of Houston and Boston.",
)
print(str(response))

The sports of Houston and Boston cannot be compared or contrasted based on the given context information.


In [13]:
response = router_query_engine.query("What are the sports teams in Boston?")
print(str(response))

Boston is home to several professional sports teams including the New England Patriots in the National Football League, the Boston Red Sox in Major League Baseball, the Boston Bruins in the National Hockey League, and the Boston Celtics in the National Basketball Association. In addition to these, Boston also has esports teams such as the Boston Uprising in the Overwatch League and the Boston Breach in the Call of Duty League. The city also hosts the New England Revolution of Major League Soccer.
