# Test Complex Queries over Multiple Documents (text-davinci-003 vs. ChatGPT)

Compare how text-davinci-003 and ChatGPT (gpt-3.5-turbo) answer complex queries that span multiple documents.

In [None]:
!pip install llama-index

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting llama-index
  Downloading llama_index-0.4.17.tar.gz (122 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.8/122.8 KB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting langchain
  Downloading langchain-0.0.98-py3-none-any.whl (337 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m337.8/337.8 KB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting openai>=0.26.4
  Downloading openai-0.27.0-py3-none-any.whl (70 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.1/70.1 KB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dataclasses_json
  Downloading dataclasses_json-0.5.7-py3-none-any.whl (25 kB)
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6

In [None]:
# My OpenAI Key
import os

# Paste a key here only for a throwaway local run, and never commit it.
# When left empty, the OPENAI_API_KEY already present in the environment (if any)
# is kept instead of being overwritten with an empty string.
OPENAI_API_KEY = ""
if OPENAI_API_KEY:
    os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY

In [None]:
import logging
import sys

# Route INFO-and-above records from every logger to stdout so they show up in
# the cell output. A single basicConfig call installs exactly one root handler;
# attaching a second stdout StreamHandler on top of it prints each record twice.
# force=True (Python 3.8+) replaces handlers left behind by the host environment
# or by an earlier run of this cell, so re-running the cell does not stack handlers.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

In [7]:
from llama_index import (
    GPTVectorStoreIndex, 
    GPTSimpleKeywordTableIndex, 
    GPTListIndex, 
    SimpleDirectoryReader,
    LLMPredictor,
    ServiceContext
)
from langchain.llms.openai import OpenAIChat, OpenAI
import requests

#### Load Datasets

Download the Wikipedia page for each city listed below and save it as a local text file.

In [8]:
# Wikipedia article titles, one per city; each title is also used as the
# file name of the downloaded text and as the key of the per-city dicts.
wiki_titles = [
    "Toronto",
    "Seattle",
    "San Francisco",
    "Chicago",
    "Boston",
    "Washington, D.C.",
    "Cambridge, Massachusetts",
    "Houston",
]

In [9]:
from pathlib import Path

import requests

# Directory that receives one "<title>.txt" file per article. Created once,
# up front; exist_ok makes re-running the cell safe.
data_path = Path('data')
data_path.mkdir(parents=True, exist_ok=True)

for title in wiki_titles:
    # Ask the MediaWiki API for the plain-text extract of the whole article.
    http_response = requests.get(
        'https://en.wikipedia.org/w/api.php',
        params={
            'action': 'query',
            'format': 'json',
            'titles': title,
            'prop': 'extracts',
            # 'exintro': True,
            'explaintext': True,
        },
        timeout=30,  # do not hang the notebook on a stalled connection
    )
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    http_response.raise_for_status()
    response = http_response.json()

    # One title was requested, so the first entry under "pages" is the one we asked for.
    page = next(iter(response['query']['pages'].values()))
    if 'extract' not in page:
        raise ValueError(f"Wikipedia returned no text extract for {title!r}")
    wiki_text = page['extract']

    # Article text contains non-ASCII characters; write UTF-8 explicitly rather
    # than relying on the platform default encoding.
    with open(data_path / f"{title}.txt", 'w', encoding='utf-8') as fp:
        fp.write(wiki_text)


In [10]:
# Read each downloaded article back from disk, keyed by its Wikipedia title.
city_docs = {
    title: SimpleDirectoryReader(input_files=[f"data/{title}.txt"]).load_data()
    for title in wiki_titles
}


### Building the document indices
Build a separate vector index for each city's Wikipedia page.

In [11]:
# text-davinci-003: completion model wrapped in a predictor and its service context.
# temperature=0 is used for both models.
davinci_llm = OpenAI(temperature=0, model_name="text-davinci-003")
llm_predictor_davinci = LLMPredictor(llm=davinci_llm)
service_context_davinci = ServiceContext.from_defaults(
    llm_predictor=llm_predictor_davinci,
)

# gpt-3.5-turbo (ChatGPT): same setup, built with the chat wrapper.
chatgpt_llm = OpenAIChat(temperature=0, model_name="gpt-3.5-turbo")
llm_predictor_chatgpt = LLMPredictor(llm=chatgpt_llm)
service_context_chatgpt = ServiceContext.from_defaults(
    llm_predictor=llm_predictor_chatgpt,
)

In [12]:
# One vector index per city, built from that city's loaded documents
# and keyed by the same Wikipedia title.
city_indices = {
    title: GPTVectorStoreIndex.from_documents(city_docs[title])
    for title in wiki_titles
}

INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 17592 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 14402 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 19954 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 22057 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_c

### Build Graph: Keyword Table Index on top of vector indices! 

We compose a keyword table index on top of all the vector indices.

In [14]:
# Short description of each city index, keyed by Wikipedia title; these are
# handed to the graph alongside the indices when it is composed.
index_summaries = {
    title: f"Wikipedia articles about {title}"
    for title in wiki_titles
}

In [15]:
from llama_index.indices.composability import ComposableGraph

# Both dicts were filled in wiki_titles order, so the i-th summary
# describes the i-th index.
child_indices = list(city_indices.values())
child_summaries = list(index_summaries.values())

# Compose a keyword-table root index over the per-city vector indices.
graph = ComposableGraph.from_indices(
    GPTSimpleKeywordTableIndex,
    child_indices,
    child_summaries,
    max_keywords_per_chunk=50,
)

INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 0 tokens


### Compare Queries (text-davinci-003 vs. ChatGPT)

**Simple Query**

In [25]:
def _root_query_engine(service_context):
    """Return a query engine for the graph's root index that uses ``service_context``.

    Both models get the same retriever mode and response mode, so the
    service context is the only thing that differs between the two engines.
    """
    return graph.root_index.as_query_engine(
        retriever_mode='simple',
        service_context=service_context,
        response_mode='tree_summarize',
    )


# NOTE(review): only the root index's engine is overridden here; the per-city
# indices use whatever engine graph.as_query_engine gives them by default.
# Confirm that is the intended comparison.
query_engine_davinci = graph.as_query_engine(
    custom_query_engines={
        graph.root_index.index_id: _root_query_engine(service_context_davinci),
    }
)
query_engine_chatgpt = graph.as_query_engine(
    custom_query_engines={
        graph.root_index.index_id: _root_query_engine(service_context_chatgpt),
    }
)

# Run the same single-city question through both engines.
query_str = "Tell me more about Boston"
response_davinci = query_engine_davinci.query(query_str)
response_chatgpt = query_engine_chatgpt.query(query_str)

INFO:llama_index.indices.keyword_table.retrievers:> Starting query: Tell me more about Boston
INFO:llama_index.indices.keyword_table.retrievers:query keywords: ['tell', 'boston']
INFO:llama_index.indices.keyword_table.retrievers:> Extracted keywords: ['boston']
INFO:llama_index.token_counter.token_counter:> [retrieve] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [retrieve] Total embedding token usage: 5 tokens
INFO:llama_index.indices.common_tree.base:> Building index from nodes: 1 chunks
INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 802 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token usage: 4801 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total embedding token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [get_response] Total LLM token u

In [21]:
# Show the response returned by the text-davinci-003 engine for the current query_str.
print(response_davinci)


Boston is the capital and largest city of the Commonwealth of Massachusetts and the cultural and financial center of the New England region of the Northeastern United States. It is one of the oldest municipalities in America, founded on the Shawmut Peninsula in 1630 by Puritan settlers from the English town of the same name. It is a center of scientific research and innovation, with nearly 5,000 startups, and is home to a number of colleges and universities, notably Harvard and MIT. It has a long seafaring tradition, and was a major port for both domestic and international trade in the 19th century. It has seen waves of immigration, with Irish, Germans, Lebanese, Syrians, French Canadians, and Russian and Polish Jews settling in the city. It was an early port of the Atlantic triangular slave trade in the New England colonies, but was soon overtaken. Boston is also known for its philanthropy, with households in the city claiming the highest average rate of philanthropy in the United St

In [22]:
# Show the response returned by the gpt-3.5-turbo engine for the current query_str.
print(response_chatgpt)

Boston is a city in the New England region of the United States with a population of 675,647 as of 2020. It is known for its rich history and is considered the economic and cultural center of the region. The city has many firsts, including the first public park, first public or state school, first subway system, and first large public library in the United States. Boston is also a global pioneer in innovation and entrepreneurship, with nearly 5,000 startups. The city's economy includes finance, professional and business services, biotechnology, information technology, and government activities. Boston is a popular tourist destination, with Faneuil Hall alone drawing more than 20 million visitors per year. The city is home to many prestigious hospitals and universities, including Massachusetts General Hospital, Harvard Medical School, and Boston University.


**Complex Query 1**

In [None]:
# Airport question spanning three cities. The trailing instructions tell the
# model what to do depending on how many cities' context it actually receives.
query_str = "".join([
    "Tell me the airports in Seattle, Houston, and Toronto. ",
    "If only one city is provided, return the airport information for that city. ",
    "If airports for multiple cities are provided, compare and contrast the airports. ",
])

# Same prompt, both engines, text-davinci-003 first.
response_davinci = query_engine_davinci.query(query_str)
response_chatgpt = query_engine_chatgpt.query(query_str)

In [None]:
# Show the text-davinci-003 engine's response to the airport query.
print(response_davinci)


The airports in Seattle, Houston, and Toronto are Seattle–Tacoma International Airport (IATA: SEA), George Bush Intercontinental Airport (IATA: IAH), Toronto Pearson International Airport (IATA: YYZ), and Billy Bishop Toronto City Airport (IATA: YTZ). Seattle–Tacoma International Airport is the largest airport in the Pacific Northwest region of the United States, serving over 44 million passengers annually. George Bush Intercontinental Airport is the largest airport in Houston, serving over 40 million passengers annually. Toronto Pearson International Airport is the busiest airport in Canada, serving over 50 million passengers annually. Billy Bishop Toronto City Airport is a smaller airport located on the Toronto Islands, serving over 2 million passengers annually.


In [None]:
# Show the gpt-3.5-turbo engine's response to the airport query.
print(response_chatgpt)

Airports in Seattle: Seattle-Tacoma International Airport.
Airports in Houston: George Bush Intercontinental Airport, William P. Hobby Airport, and Ellington Airport.
Airports in Toronto: Toronto Pearson International Airport, Billy Bishop Toronto City Airport, Buttonville Municipal Airport, and Downsview Airport.

Seattle has one major airport, Seattle-Tacoma International Airport. Houston has three airports: George Bush Intercontinental Airport, William P. Hobby Airport, and Ellington Airport. Toronto has four airports: Toronto Pearson International Airport, Billy Bishop Toronto City Airport, Buttonville Municipal Airport, and Downsview Airport. Toronto has a mix of commercial and smaller airports, while Houston has a mix of commercial, military, government, and general aviation airports.


**Complex Query 2**

In [None]:
# Sports question over two cities. The prompt asks for a single-city answer
# or a comparison, depending on how many cities' context is provided.
query_str = "".join([
    "Look at Houston and Boston. ",
    "If only one city is provided, provide information about the sports teams for that city. ",
    "If context for multiple cities are provided, compare and contrast the sports environment of the cities. ",
])

# Same prompt, both engines, text-davinci-003 first.
response_davinci = query_engine_davinci.query(query_str)
response_chatgpt = query_engine_chatgpt.query(query_str)

In [None]:
# Show the text-davinci-003 engine's response to the sports query.
print(response_davinci)


Houston has teams for every major professional league. The Houston Astros are a Major League Baseball team that have won the World Series in 2017, 2022, and appeared in it in 2005, 2019, and 2021. The Houston Rockets are a National Basketball Association franchise based in the city since 1971, and have won two NBA Championships. The Houston Texans are a National Football League expansion team formed in 2002, and the Houston Dynamo is a Major League Soccer franchise that has been based in Houston since 2006, winning two MLS Cup titles. The Houston Dash team plays in the National Women's Soccer League, and the Houston SaberCats are a rugby team that plays in Major League Rugby. 

Boston also has teams for every major professional league. The Boston Red Sox are a Major League Baseball team that have won the World Series in 2004, 2007, 2013, and 2018. The Boston Celtics are a National Basketball Association team that have won 17 championships, most recently in 2008. The Boston Bruins are 

In [None]:
# Show the gpt-3.5-turbo engine's response to the sports query.
print(response_chatgpt)

If only one city is provided, Houston has sports teams for every major professional league except the National Hockey League, including the Houston Astros (MLB), Houston Rockets (NBA), Houston Texans (NFL), Houston Dynamo (MLS), Houston Dash (National Women's Soccer League), and Houston SaberCats (rugby).

If context for multiple cities are provided, Boston has teams in the four major North American men's professional sports leagues plus Major League Soccer, and has won 39 championships in these leagues. Boston is one of eight cities to have won championships in all four major American sports leagues. During a particularly impressive 17-year stretch from 2001 to 2018, the city's professional sports teams won twelve championships. The Celtics and Bruins remain competitive for titles in the century’s third decade, though the Patriots and Red Sox have fallen off from these recent glory days. In contrast, Houston has not won as many championships as Boston, but has hosted several major spo

**Complex Query 3**

In [None]:
# Arts-and-culture question over two cities. The prompt asks for a single-city
# answer or a comparison, depending on how many cities' context is provided.
query_str = "".join([
    "Look at Houston and Boston. ",
    "If only one city is provided, provide information about the arts and culture for that city. ",
    "If context for multiple cities are provided, compare and contrast the arts and culture of the two cities. ",
])

# Same prompt, both engines, text-davinci-003 first.
response_davinci = query_engine_davinci.query(query_str)
response_chatgpt = query_engine_chatgpt.query(query_str)

In [None]:
# Show the text-davinci-003 engine's response to the arts-and-culture query.
print(response_davinci)


Houston and Boston both have a wide range of cultural attractions. In Houston, the Theater District is a 17-block area in the center of Downtown Houston that is home to the Bayou Place entertainment complex, restaurants, movies, plazas, and parks. The Museum District's cultural institutions and exhibits attract more than 7 million visitors a year. Notable facilities include The Museum of Fine Arts, the Houston Museum of Natural Science, the Contemporary Arts Museum Houston, the Station Museum of Contemporary Art, the Holocaust Museum Houston, the Children's Museum of Houston, and the Houston Zoo. Houston also has many annual events celebrating the diverse cultures of the city, such as the Houston Livestock Show and Rodeo, the Houston Gay Pride Parade, the Houston Greek Festival, Art Car Parade, the Houston Auto Show, the Houston International Festival, and the Bayou City Art Festival.

In Boston, the Freedom Trail is a 2.5-mile walking tour of 16 historically significant sites in down

In [None]:
# Show the gpt-3.5-turbo engine's response to the arts-and-culture query.
print(response_chatgpt)

There is no information about the arts and culture of Houston provided, but for Boston, there is a rich cultural history with a strong literary culture and a center for classical music. The city is also home to several art museums and galleries, including the Museum of Fine Arts and the Isabella Stewart Gardner Museum. The Institute of Contemporary Art is housed in a contemporary building designed by Diller Scofidio + Renfro in the Seaport District. Boston's South End Art and Design District (SoWa) and Newbury St. are both art gallery destinations.


**Complex Query 4**

In [None]:
# Demographics question over two cities. The prompt asks for a single-city
# answer or a comparison, depending on how many cities' context is provided.
query_str = "".join([
    "Look at Toronto and San Francisco. ",
    "If only one city is provided, provide information about the demographics for that city. ",
    "If context for multiple cities are provided, compare and contrast the demographics of the two cities. ",
])

# Same prompt, both engines, text-davinci-003 first.
response_davinci = query_engine_davinci.query(query_str)
response_chatgpt = query_engine_chatgpt.query(query_str)

In [None]:
# Show the text-davinci-003 engine's response to the demographics query.
print(response_davinci)


In Toronto, the population is 2,731,571 people, with a median age of 39.2 years. The racial makeup of the city is 51.5% White, 20.3% Asian, 8.6% African American, 0.8% Native American, 0.2% Pacific Islander, and 18.6% from other races. The city is also home to a large Hispanic population, making up 6.2% of the population. The three most commonly reported ethnic origins are White (46.9%), Asian (20.3%), and Black (8.6%). Christianity is the most commonly reported religion (48.4%), followed by no religion and secular perspectives (31.2%). English is the predominant language spoken by Torontonians with approximately 79% of residents having proficiency in the language, although only 43.2% of Torontonians reported English as their mother tongue.

When comparing Toronto and San Francisco, we can see that Toronto has a larger population than San Francisco, with a median age that is slightly higher. The racial makeup of Toronto is slightly more White than San Francisco, while San Francisco ha

In [None]:
# Show the gpt-3.5-turbo engine's response to the demographics query.
print(response_chatgpt)

Only information about Toronto is provided in the context, so demographics for Toronto can be provided. However, there is no context information about San Francisco to compare and contrast with Toronto.
