In [None]:
# download files
!mkdir data
!wget "https://www.dropbox.com/s/948jr9cfs7fgj99/UBER.zip?dl=1" -O data/UBER.zip
!unzip data/UBER.zip -d data

In [None]:
# A later cell calls asyncio.run() from inside the notebook. Patching asyncio
# with nest_asyncio is presumably what lets that work while the notebook's own
# event loop is already running -- confirm against the nest_asyncio docs.
import nest_asyncio
nest_asyncio.apply()

In [None]:
from llama_index import download_loader, GPTVectorStoreIndex
from pathlib import Path

### Ingest Unstructured Data Through the Unstructured.io Reader

We use Unstructured.io's HTML parsing to load the filings.
The reader is downloaded through LlamaHub.

In [None]:
# Fiscal years to load, newest first: 2022 down to 2019 inclusive.
years = list(range(2022, 2018, -1))

In [None]:
# Fetch the UnstructuredReader loader class by name via download_loader.
UnstructuredReader = download_loader(
    "UnstructuredReader",
    refresh_cache=True,
    use_gpt_index_import=True,
)

In [None]:
loader = UnstructuredReader()
doc_set = {}   # year -> documents loaded from that year's filing
all_docs = []  # every loaded document, across all years
for year in years:
    # One HTML file per fiscal year.
    year_docs = loader.load_data(
        file=Path(f'./data/UBER/UBER_{year}.html'),
        split_documents=False,
    )
    # Tag each document with the year of the filing it came from.
    for d in year_docs:
        d.extra_info = {"year": year}
    all_docs += year_docs
    doc_set[year] = year_docs

### Set up a Vector Index for each SEC filing

We set up a separate vector index for each SEC filing from 2019-2022.

We also optionally initialize a "global" index by dumping all files into the vector store.

In [None]:
from llama_index.indices.service_context import ServiceContext

# Service context used for every per-year index (chunk_size=512).
service_context = ServiceContext.from_defaults(chunk_size=512)

# Build one simple vector index per filing year, keyed by year.
# NOTE: don't run this cell if the indices are already loaded!
index_set = {}
for year in years:
    cur_index = GPTVectorStoreIndex.from_documents(
        doc_set[year],
        service_context=service_context,
    )
    index_set[year] = cur_index
    

### Composing a Graph to synthesize answers across 10-K filings (2019-2022)

We want our queries to aggregate/synthesize information across *all* 10-K filings. To do this, we define a List index
on top of the 4 vector indices.

In [None]:
from llama_index import GPTListIndex, LLMPredictor
from langchain import OpenAI
from llama_index.composability import ComposableGraph

In [None]:
# set summary text for each doc
index_summaries = {}
for year in years:
    index_summaries[year] = f"UBER 10-k Filing for {year} fiscal year"

In [None]:
# set number of output tokens
llm_predictor = LLMPredictor(llm=OpenAI(temperature=0, max_tokens=512))
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)

In [None]:
# define a list index over the vector indices
# allows us to synthesize information across each index
graph = ComposableGraph.from_indices(
    GPTListIndex,
    children_indices=[index_set[y] for y in years],
    index_summaries=index_summaries,
    service_context=service_context
)

In [None]:
# Query engine over the composed graph, using the tree_summarize response mode.
query_engine = graph.as_query_engine(response_mode="tree_summarize")

In [None]:
import asyncio
import time

# Question that spans every filing year.
cross_query_str = "Compare/contrast the risk factors described in the Uber 10-K across years. Give answer in bullet points."

# Time the asynchronous query, from creating the coroutine to its completion.
start_time = time.perf_counter()
task = query_engine.aquery(cross_query_str)
response = asyncio.run(task)
elapsed_time = time.perf_counter() - start_time

In [None]:
# Show the synthesized answer, then the elapsed time measured with perf_counter.
print(response)
print(elapsed_time)