In [1]:
from llama_index.core import (
    VectorStoreIndex,
    SimpleKeywordTableIndex,
    SimpleDirectoryReader
)

from llama_index.core import SummaryIndex
from llama_index.core.schema import IndexNode
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.callbacks import CallbackManager
from llama_index.llms.openai import OpenAI

In [2]:
# Wikipedia page titles of the cities used throughout this notebook.
# Each title is used verbatim as the API query title, the local file name
# (data/<title>.txt) and the suffix of the corresponding tool name.
wiki_titles = [
    "Toronto",
    "Seattle",
    "Chicago",
    "Boston",
    "Houston",
]

In [3]:
from pathlib import Path

import requests

# Download the plain-text extract of each city's Wikipedia page into data/.
# The output directory is created once, up front, instead of being checked on
# every loop iteration; exist_ok makes the cell safe to re-run.
data_path = Path("data")
data_path.mkdir(parents=True, exist_ok=True)

for title in wiki_titles:
    response = requests.get(
        "https://en.wikipedia.org/w/api.php",
        params={
            "action": "query",
            "format": "json",
            "titles": title,
            "prop": "extracts",
            # 'exintro': True,
            "explaintext": True,
        },
        # Without a timeout a stalled connection would hang the cell forever.
        timeout=30,
    )
    # Surface HTTP failures here rather than as an opaque JSON/KeyError below.
    response.raise_for_status()

    # Only one title is requested per call, so the first (and only) entry
    # under "pages" is the page we asked for.
    page = next(iter(response.json()["query"]["pages"].values()))
    if "extract" not in page:
        raise ValueError(f"No Wikipedia extract returned for {title!r}")
    wiki_text = page["extract"]

    # Write as UTF-8 explicitly: the article text contains non-ASCII
    # characters and the platform default encoding is not always UTF-8.
    with open(data_path / f"{title}.txt", "w", encoding="utf-8") as fp:
        fp.write(wiki_text)

In [4]:
# Read each downloaded article back from disk, keyed by its city title.
city_docs = {
    city: SimpleDirectoryReader(input_files=[f"data/{city}.txt"]).load_data()
    for city in wiki_titles
}

In [12]:
# LLM shared by the per-city query engines and by the agent defined below.
llm = OpenAI(temperature=0, model="gpt-4-turbo-preview")
# Callback manager created with an empty handler list; it is passed to every
# vector index that is built or loaded later in the notebook.
callback_manager = CallbackManager([])

### Setup Agent

#### Define Toolset
Each tool corresponds to a simple top-k RAG pipeline over a single document (one city's Wikipedia article).

In [13]:
from llama_index.agent.openai import OpenAIAgent
from llama_index.core import load_index_from_storage, StorageContext
from llama_index.core.node_parser import SentenceSplitter
import os

node_parser = SentenceSplitter()

# One QueryEngineTool per city, collected in a list for the agent.
query_engine_tools = []

for wiki_title in wiki_titles:
    # Directory where this city's vector index is persisted between runs.
    persist_dir = f"./data/{wiki_title}"

    if not os.path.exists(persist_dir):
        # No persisted index yet: split the documents into nodes, build the
        # vector index and save it. Node parsing happens only in this branch
        # because a loaded index does not need the nodes.
        nodes = node_parser.get_nodes_from_documents(city_docs[wiki_title])
        vector_index = VectorStoreIndex(
            nodes, callback_manager=callback_manager
        )
        vector_index.storage_context.persist(persist_dir=persist_dir)
    else:
        # Reuse the index persisted by a previous run.
        vector_index = load_index_from_storage(
            StorageContext.from_defaults(persist_dir=persist_dir),
            callback_manager=callback_manager,
        )

    # define query engines
    vector_query_engine = vector_index.as_query_engine(llm=llm)

    # define tools: the name and description are what the agent sees when
    # deciding which city's tool to call.
    query_engine_tools.append(
        QueryEngineTool(
            query_engine=vector_query_engine,
            metadata=ToolMetadata(
                name=f"vector_tool_{wiki_title}",
                description=(
                    "Useful for questions related to specific aspects of"
                    f" {wiki_title} (e.g. the history, arts and culture,"
                    " sports, demographics, or more)."
                ),
            ),
        )
    )

### Setup OpenAI Agent

In [14]:
from llama_index.core.agent import AgentRunner
from llama_index.agent.openai import OpenAIAgentWorker, OpenAIAgent
from llama_index.agent.openai import OpenAIAgentWorker

# Build the agent in two steps: an OpenAIAgentWorker created from the per-city
# tools, wrapped in an AgentRunner that exposes the chat interface used below.
# verbose=True produces the function-call trace shown in the cell outputs.
openai_step_engine = OpenAIAgentWorker.from_tools(
    query_engine_tools, llm=llm, verbose=True
)
agent = AgentRunner(openai_step_engine)
# Alternative: construct the agent directly from the tools in a single call.
# agent = OpenAIAgent.from_tools(query_engine_tools, llm=llm, verbose=True)

### Run Some Queries

#### Out of the box

In [15]:
# Ask a comparison question that involves two of the indexed cities
# (Boston and Chicago), each of which has its own query-engine tool.
response = agent.chat(
    "Tell me about the demographics of Boston, and compare that with the demographics of Chicago"
)

Added user message to memory: Tell me about the demographics of Boston, and compare that with the demographics of Chicago
=== Calling Function ===
Calling function: vector_tool_Boston with args: {"input": "demographics"}
Got output: In 2020, Boston's population was estimated at 691,531, marking a 12% increase from 2010. The city is notably the third-most densely populated large U.S. city with over half a million residents and stands as the most densely populated state capital. The demographic composition reveals a diverse age distribution: 21.9% of residents were 19 or under, 14.3% aged 20 to 24, 33.2% between 25 and 44, 20.4% from 45 to 64, and 10.1% were 65 or older. The median age was 30.8 years. The gender ratio showed 92.0 males for every 100 females, and this ratio slightly decreased to 89.9 males for every 100 females aged 18 and over.

Household dynamics indicated that 20.4% had children under 18, 25.5% were married couples living together, 16.3% had a female householder with n

In [16]:
# print() converts its argument with str() itself, so no explicit cast is needed.
print(response)

### Demographics of Boston

- **Population (2020):** 691,531, a 12% increase from 2010.
- **Density:** Third-most densely populated large U.S. city.
- **Age Distribution:** Diverse, with a median age of 30.8 years.
- **Household Dynamics:** 20.4% with children under 18, 25.5% married couples, 54.0% non-families.
- **Economic Indicators:** Median household income of $51,739; significant racial wealth gap.
- **Racial and Ethnic Composition:** Non-Hispanic Whites 47%, African-Americans 22%, significant Irish and Italian communities, growing Latin American populations.
- **Religious Affiliation:** 57% Christian, 33% no religious affiliation.

### Demographics of Chicago

- **Historical Migration:** Early 20th-century influx of European immigrants; significant African American migration during the Great Migration.
- **Racial and Ethnic Groups (as of July 2019):** Non-Hispanic Whites 32.8%, Blacks 30.1%, Hispanics 29.0%.
- **Population Trends:** Decline from over 3.6 million in 1950 to under