In [1]:
from pydantic import BaseModel
from unstructured.partition.html import partition_html
import pandas as pd

# Remove pandas' display truncation (rows, columns, line width, cell width)
# so frames are rendered in full.
for displayOption in (
    "display.max_rows",
    "display.max_columns",
    "display.width",
    "display.max_colwidth",
):
    pd.set_option(displayOption, None)

In [32]:
from llama_index.llms import AzureOpenAI
from llama_index.embeddings import AzureOpenAIEmbedding
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index import set_global_service_context
from llama_index import GPTVectorStoreIndex
from llama_index import SimpleDirectoryReader
from llama_index.tools import QueryEngineTool, ToolMetadata
from llama_index.query_engine import SubQuestionQueryEngine
from llama_index import StorageContext, load_index_from_storage

In [3]:
import logging
import os

import openai

from Utilities.envVars import *

# Azure OpenAI connection settings. OpenAiEndPoint, OpenAiKey, OpenAiVersion,
# OpenAiChat and OpenAiEmbedding are not defined in this notebook; presumably
# they come from the wildcard import of Utilities.envVars.
azure_endpoint = f"{OpenAiEndPoint}"
api_key = OpenAiKey
api_version = OpenAiVersion

embeddingModelType = "azureopenai"
# NOTE(review): temperature and tokenLength are assigned here but are not passed
# to the LLM or referenced again in the visible cells - confirm whether they
# were meant to be forwarded to AzureOpenAI.
temperature = 0.3
tokenLength = 1000

if embeddingModelType == "azureopenai":
    # Module-level openai settings, mirroring the values handed to the
    # llama_index clients below.
    openai.api_type = "azure"
    openai.api_key = api_key
    openai.api_version = api_version
    openai.api_base = azure_endpoint

    # Chat model used for answer synthesis and sub-question generation.
    llm = AzureOpenAI(
        model="gpt-35-turbo-16k",
        deployment_name=OpenAiChat,
        api_key=api_key,
        azure_endpoint=azure_endpoint,
        api_version=api_version,
    )
    # Embedding model used when building and querying the vector indexes.
    embeddings = AzureOpenAIEmbedding(
        model="text-embedding-ada-002",
        deployment_name=OpenAiEmbedding,
        api_key=api_key,
        azure_endpoint=azure_endpoint,
        api_version=api_version,
    )
    # `logging` is imported explicitly at the top of this cell so this call
    # does not depend on what the wildcard import happens to export.
    logging.info("LLM Setup done")

    # Register the LLM/embedding pair as the global default for later cells.
    service_context = ServiceContext.from_defaults(
        llm=llm,
        embed_model=embeddings,
    )
    set_global_service_context(service_context)
elif embeddingModelType == "openai":
    # Placeholder branch: nothing is configured here, so `llm` stays undefined.
    print("OpenAI")

#### Data Loading

In [5]:
# Read each Bank of America 10-K PDF (fiscal years 2018-2022) with
# SimpleDirectoryReader; one variable per year holds that file's load_data() result.
bofA2018, bofA2019, bofA2020, bofA2021, bofA2022 = (
    SimpleDirectoryReader(input_files=[f"data/BOFA/BofA {fiscalYear}.pdf"]).load_data()
    for fiscalYear in range(2018, 2023)
)

In [7]:
# Report how many entries were loaded for each filing year.
for fiscalYear, loadedPages in zip(
    range(2018, 2023),
    (bofA2018, bofA2019, bofA2020, bofA2021, bofA2022),
):
    print(f'Loaded {fiscalYear} 10-K with {len(loadedPages)} pages')

Loaded 2018 10-K with 211 pages
Loaded 2019 10-K with 501 pages
Loaded 2020 10-K with 512 pages
Loaded 2021 10-K with 611 pages
Loaded 2022 10-K with 352 pages


#### Now, we can build an (in-memory) VectorStoreIndex over the documents that we've loaded.

In [8]:
### Build the Index - One time

# index_2018 = VectorStoreIndex.from_documents(bofA2018)
# index_2019 = VectorStoreIndex.from_documents(bofA2019)
# index_2020 = VectorStoreIndex.from_documents(bofA2020)
# index_2021 = VectorStoreIndex.from_documents(bofA2021)
# index_2022 = VectorStoreIndex.from_documents(bofA2022)

# Persist the Index - One time
# index_2018.storage_context.persist(persist_dir="Data/BOFA/2018")
# index_2019.storage_context.persist(persist_dir="Data/BOFA/2019")
# index_2020.storage_context.persist(persist_dir="Data/BOFA/2020")
# index_2021.storage_context.persist(persist_dir="Data/BOFA/2021")
# index_2022.storage_context.persist(persist_dir="Data/BOFA/2022")

In [40]:
# ### Build the Index - One time - JPMC

# Jpmc2018 = SimpleDirectoryReader(input_files=["data/JPMC/JPMC2018.pdf"]).load_data()
# Jpmc2019 = SimpleDirectoryReader(input_files=["data/JPMC/JPMC2019.pdf"]).load_data()
# Jpmc2020 = SimpleDirectoryReader(input_files=["data/JPMC/JPMC2020.pdf"]).load_data()
# Jpmc2021 = SimpleDirectoryReader(input_files=["data/JPMC/JPMC2021.pdf"]).load_data()
# Jpmc2022 = SimpleDirectoryReader(input_files=["data/JPMC/JPMC2022.pdf"]).load_data()

# jpmc_2018 = VectorStoreIndex.from_documents(Jpmc2018)
# jpmc_2019 = VectorStoreIndex.from_documents(Jpmc2019)
# jpmc_2020 = VectorStoreIndex.from_documents(Jpmc2020)
# jpmc_2021 = VectorStoreIndex.from_documents(Jpmc2021)
# jpmc_2022 = VectorStoreIndex.from_documents(Jpmc2022)

# jpmc_2018.storage_context.persist(persist_dir="Data/JPMC/2018")
# jpmc_2019.storage_context.persist(persist_dir="Data/JPMC/2019")
# jpmc_2020.storage_context.persist(persist_dir="Data/JPMC/2020")
# jpmc_2021.storage_context.persist(persist_dir="Data/JPMC/2021")
# jpmc_2022.storage_context.persist(persist_dir="Data/JPMC/2022")

  return int.__new__(cls, int(value))
  return int.__new__(cls, int(value))


In [47]:
# ### Build the Index - One time - MS

# MS2018 = SimpleDirectoryReader(input_files=["data/MS/MS2018.pdf"]).load_data()
# MS2019 = SimpleDirectoryReader(input_files=["data/MS/MS2019.pdf"]).load_data()
# MS2020 = SimpleDirectoryReader(input_files=["data/MS/MS2020.pdf"]).load_data()
# MS2021 = SimpleDirectoryReader(input_files=["data/MS/MS2021.pdf"]).load_data()
# MS2022 = SimpleDirectoryReader(input_files=["data/MS/MS2022.pdf"]).load_data()

# MS_2018 = VectorStoreIndex.from_documents(MS2018)
# MS_2019 = VectorStoreIndex.from_documents(MS2019)
# MS_2020 = VectorStoreIndex.from_documents(MS2020)
# MS_2021 = VectorStoreIndex.from_documents(MS2021)
# MS_2022 = VectorStoreIndex.from_documents(MS2022)

# MS_2018.storage_context.persist(persist_dir="Data/MS/2018")
# MS_2019.storage_context.persist(persist_dir="Data/MS/2019")
# MS_2020.storage_context.persist(persist_dir="Data/MS/2020")
# MS_2021.storage_context.persist(persist_dir="Data/MS/2021")
# MS_2022.storage_context.persist(persist_dir="Data/MS/2022")

In [52]:
# ### Build the Index - One time - GS

# GS2018 = SimpleDirectoryReader(input_files=["data/GS/GS2018.pdf"]).load_data()
# GS2019 = SimpleDirectoryReader(input_files=["data/GS/GS2019.pdf"]).load_data()
# GS2020 = SimpleDirectoryReader(input_files=["data/GS/GS2020.pdf"]).load_data()
# GS2021 = SimpleDirectoryReader(input_files=["data/GS/GS2021.pdf"]).load_data()
# GS2022 = SimpleDirectoryReader(input_files=["data/GS/GS2022.pdf"]).load_data()

# GS_2018 = VectorStoreIndex.from_documents(GS2018)
# GS_2019 = VectorStoreIndex.from_documents(GS2019)
# GS_2020 = VectorStoreIndex.from_documents(GS2020)
# GS_2021 = VectorStoreIndex.from_documents(GS2021)
# GS_2022 = VectorStoreIndex.from_documents(GS2022)

# GS_2018.storage_context.persist(persist_dir="Data/GS/2018")
# GS_2019.storage_context.persist(persist_dir="Data/GS/2019")
# GS_2020.storage_context.persist(persist_dir="Data/GS/2020")
# GS_2021.storage_context.persist(persist_dir="Data/GS/2021")
# GS_2022.storage_context.persist(persist_dir="Data/GS/2022")

In [69]:
# Reload the persisted Bank of America indexes (run on every session),
# one persist directory per fiscal year.
bofaLoadedIndexes = []
for persistDir in [f"Data/BOFA/{fiscalYear}" for fiscalYear in range(2018, 2023)]:
    # Rebuild the storage context for this year, then load its index.
    storageContext = StorageContext.from_defaults(persist_dir=persistDir)
    bofaLoadedIndexes.append(load_index_from_storage(storageContext))

bofa2018Index, bofa2019Index, bofa2020Index, bofa2021Index, bofa2022Index = bofaLoadedIndexes

In [41]:
# Reload the persisted JPMC indexes (run on every session),
# one persist directory per fiscal year.
jpmcLoadedIndexes = []
for persistDir in [f"Data/JPMC/{fiscalYear}" for fiscalYear in range(2018, 2023)]:
    # Rebuild the storage context for this year, then load its index.
    storageContext = StorageContext.from_defaults(persist_dir=persistDir)
    jpmcLoadedIndexes.append(load_index_from_storage(storageContext))

jpmc2018Index, jpmc2019Index, jpmc2020Index, jpmc2021Index, jpmc2022Index = jpmcLoadedIndexes


In [53]:
# Reload the persisted MS indexes (run on every session),
# one persist directory per fiscal year.
msLoadedIndexes = []
for persistDir in [f"Data/MS/{fiscalYear}" for fiscalYear in range(2018, 2023)]:
    # Rebuild the storage context for this year, then load its index.
    storageContext = StorageContext.from_defaults(persist_dir=persistDir)
    msLoadedIndexes.append(load_index_from_storage(storageContext))

MS2018Index, MS2019Index, MS2020Index, MS2021Index, MS2022Index = msLoadedIndexes


In [54]:
# Reload the persisted GS indexes (run on every session),
# one persist directory per fiscal year.
gsLoadedIndexes = []
for persistDir in [f"Data/GS/{fiscalYear}" for fiscalYear in range(2018, 2023)]:
    # Rebuild the storage context for this year, then load its index.
    storageContext = StorageContext.from_defaults(persist_dir=persistDir)
    gsLoadedIndexes.append(load_index_from_storage(storageContext))

GS2018Index, GS2019Index, GS2020Index, GS2021Index, GS2022Index = gsLoadedIndexes

#### Simple QA
Now we are ready to run some queries against our indices!
To do so, we first configure a QueryEngine, which just captures a set of configurations for how we want to query the underlying index.

For a VectorStoreIndex, the most common configuration to adjust is similarity_top_k which controls how many document chunks (which we call Node objects) are retrieved to use as context for answering our question.

In [36]:
# One query engine per Bank of America filing year; each retrieves the
# top 3 most similar chunks as context.
bofa2018Engine, bofa2019Engine, bofa2020Engine, bofa2021Engine, bofa2022Engine = (
    yearIndex.as_query_engine(similarity_top_k=3)
    for yearIndex in (bofa2018Index, bofa2019Index, bofa2020Index, bofa2021Index, bofa2022Index)
)

In [43]:
# One query engine per JPMC filing year; each retrieves the
# top 3 most similar chunks as context.
jpmc2018Engine, jpmc2019Engine, jpmc2020Engine, jpmc2021Engine, jpmc2022Engine = (
    yearIndex.as_query_engine(similarity_top_k=3)
    for yearIndex in (jpmc2018Index, jpmc2019Index, jpmc2020Index, jpmc2021Index, jpmc2022Index)
)

In [55]:
# One query engine per MS filing year; each retrieves the
# top 3 most similar chunks as context.
MS2018Engine, MS2019Engine, MS2020Engine, MS2021Engine, MS2022Engine = (
    yearIndex.as_query_engine(similarity_top_k=3)
    for yearIndex in (MS2018Index, MS2019Index, MS2020Index, MS2021Index, MS2022Index)
)

In [56]:
# One query engine per GS filing year; each retrieves the
# top 3 most similar chunks as context.
GS2018Engine, GS2019Engine, GS2020Engine, GS2021Engine, GS2022Engine = (
    yearIndex.as_query_engine(similarity_top_k=3)
    for yearIndex in (GS2018Index, GS2019Index, GS2020Index, GS2021Index, GS2022Index)
)

In [37]:
response = await bofa2021Engine.aquery('What is the revenue of BofA in 2021? Answer in millions with page reference')
print(response)

The revenue of BofA in 2021 is $89,113 million. This information can be found on page 92.


#### For more complex financial analysis, one often needs to reference multiple documents.

As an example, let's look at how to run compare-and-contrast queries across several years of Bank of America 10-K filings.
For this, we build a SubQuestionQueryEngine, which breaks a complex compare-and-contrast query down into simpler sub-questions and runs each one on the sub query engine backed by the corresponding index.

In [38]:
# (query engine, tool name, tool description) for each Bank of America filing year.
bofaToolSpecs = [
    (bofa2018Engine, '2018Bofa_10k', 'Provides information about Bank of America financials for year 2018'),
    (bofa2019Engine, '2019Bofa_10k', 'Provides information about BofA financials for year 2019'),
    (bofa2020Engine, '2020Bofa_10k', 'Provides information about BAC financials for year 2020'),
    (bofa2021Engine, '2021Bofa_10k', 'Provides information about Bank of America financials for year 2021'),
    (bofa2022Engine, '2022Bofa_10k', 'Provides information about BofA financials for year 2022'),
]

# Wrap every engine as a tool; the name/description metadata is what the
# sub-question engine is given to describe each tool.
query_engine_tools = [
    QueryEngineTool(
        query_engine=toolEngine,
        metadata=ToolMetadata(name=toolName, description=toolDescription),
    )
    for toolEngine, toolName, toolDescription in bofaToolSpecs
]

s_engine = SubQuestionQueryEngine.from_defaults(query_engine_tools=query_engine_tools)

In [39]:
response = await s_engine.aquery('Compare and contrast the customer segments and geographies that grew the fastest')
print(response)

Generated 10 sub questions.
[1;3;38;2;237;90;200m[2018Bofa_10k] Q: What were the customer segments that grew the fastest in 2018?
[0m[1;3;38;2;90;149;237m[2019Bofa_10k] Q: What were the customer segments that grew the fastest in 2019?
[0m[1;3;38;2;11;159;203m[2020Bofa_10k] Q: What were the customer segments that grew the fastest in 2020?
[0m[1;3;38;2;155;135;227m[2021Bofa_10k] Q: What were the customer segments that grew the fastest in 2021?
[0m[1;3;38;2;237;90;200m[2022Bofa_10k] Q: What were the customer segments that grew the fastest in 2022?
[0m[1;3;38;2;90;149;237m[2018Bofa_10k] Q: Which geographies experienced the fastest growth in 2018?
[0m[1;3;38;2;11;159;203m[2019Bofa_10k] Q: Which geographies experienced the fastest growth in 2019?
[0m[1;3;38;2;155;135;227m[2020Bofa_10k] Q: Which geographies experienced the fastest growth in 2020?
[0m[1;3;38;2;237;90;200m[2021Bofa_10k] Q: Which geographies experienced the fastest growth in 2021?
[0m[1;3;38;2;90;149;237m[2022

In [22]:
response = await s_engine.aquery('Compare revenue growth of BofA from 2020 to 2021.  Show the growth comparision in millions')
print(response)

Generated 2 sub questions.
[1;3;38;2;237;90;200m[2020Bofa_10k] Q: What was the revenue of BofA in 2020?
[0m[1;3;38;2;90;149;237m[2021Bofa_10k] Q: What was the revenue of BofA in 2021?
[0m[1;3;38;2;90;149;237m[2021Bofa_10k] A: The revenue of BofA in 2021 was $94,883 million.
[0m[1;3;38;2;237;90;200m[2020Bofa_10k] A: The revenue of Bank of America in 2020 was $42.2 billion.
[0mThe revenue growth of BofA from 2020 to 2021 was $52,683 million.


In [25]:
response = await s_engine.aquery("Can you compare and contrast the risk factors in 2021 vs. 2020?")
print(response)

Generated 2 sub questions.
[1;3;38;2;237;90;200m[2021Bofa_10k] Q: What are the risk factors in 2021?
[0m[1;3;38;2;90;149;237m[2020Bofa_10k] Q: What are the risk factors in 2020?
[0m[1;3;38;2;90;149;237m[2020Bofa_10k] A: The risk factors in 2020 include the adverse effects of the pandemic on businesses and financial conditions, negative economic conditions leading to decreased demand and lower fees, volatility and disruptions in the capital and credit markets, potential downgrades to credit ratings, operational losses, potential disruptions to business continuity, potential harm from participation in government relief programs, and uncertainty regarding the magnitude and duration of the pandemic and its future impacts on the global economy. Additionally, there are risks related to competition, the development and introduction of new products and services, the failure of models and strategies to properly manage risk, the management and aggregation of data, and the impacts of climate

  response = await s_engine.aquery("Can you compare and contrast the risk factors in 2021 vs. 2020?")


In [27]:
response = await s_engine.aquery("Can you compare and contrast the cash flow in 2022 vs. 2020?")
print(response)

Generated 2 sub questions.
[1;3;38;2;237;90;200m[2020Bofa_10k] Q: What is the cash flow in 2020?
[0m[1;3;38;2;90;149;237m[2022Bofa_10k] Q: What is the cash flow in 2022?
[0m[1;3;38;2;237;90;200m[2020Bofa_10k] A: The cash flow in 2020 is $218,903 million.
[0m[1;3;38;2;90;149;237m[2022Bofa_10k] A: The cash flow in 2022 is a net cash used in operating activities of $6,327 million.
[0mThe cash flow in 2022 is a net cash used in operating activities of $6,327 million, while the cash flow in 2020 is $218,903 million.


In [28]:
response = await s_engine.aquery("compare the results of the equity derivatives business over all years?")
print(response)

Generated 5 sub questions.
[1;3;38;2;237;90;200m[2018Bofa_10k] Q: What is the revenue of the equity derivatives business in 2018?
[0m[1;3;38;2;90;149;237m[2019Bofa_10k] Q: What is the revenue of the equity derivatives business in 2019?
[0m[1;3;38;2;11;159;203m[2020Bofa_10k] Q: What is the revenue of the equity derivatives business in 2020?
[0m[1;3;38;2;155;135;227m[2021Bofa_10k] Q: What is the revenue of the equity derivatives business in 2021?
[0m[1;3;38;2;237;90;200m[2022Bofa_10k] Q: What is the revenue of the equity derivatives business in 2022?
[0m[1;3;38;2;11;159;203m[2020Bofa_10k] A: The revenue of the equity derivatives business in 2020 was $5,425 million.
[0m[1;3;38;2;90;149;237m[2019Bofa_10k] A: The revenue of the equity derivatives business in 2019 was $4,501 million.
[0m[1;3;38;2;237;90;200m[2022Bofa_10k] A: The revenue of the equity derivatives business in 2022 is $6,572 million.
[0m[1;3;38;2;155;135;227m[2021Bofa_10k] A: The revenue of the equity derivativ

In [44]:
# (query engine, tool name, tool description) for the Bank of America and
# JP Morgan Chase filings, 2018-2022.
bofaJpmcToolSpecs = [
    (bofa2018Engine, '2018Bofa_10k', 'Provides information about Bank of America financials for year 2018'),
    (bofa2019Engine, '2019Bofa_10k', 'Provides information about BofA financials for year 2019'),
    (bofa2020Engine, '2020Bofa_10k', 'Provides information about BAC financials for year 2020'),
    (bofa2021Engine, '2021Bofa_10k', 'Provides information about Bank of America financials for year 2021'),
    (bofa2022Engine, '2022Bofa_10k', 'Provides information about BofA financials for year 2022'),
    (jpmc2018Engine, '2018jpmc_10k', 'Provides information about JP Morgan Chase financials for year 2018'),
    (jpmc2019Engine, '2019jpmc_10k', 'Provides information about JPMC financials for year 2019'),
    (jpmc2020Engine, '2020jpmc_10k', 'Provides information about JP Morgan financials for year 2020'),
    (jpmc2021Engine, '2021jpmc_10k', 'Provides information about JPMC financials for year 2021'),
    (jpmc2022Engine, '2022jpmc_10k', 'Provides information about J P Morgan Chase financials for year 2022'),
]

# Wrap every engine as a tool, in the same order as the table above.
query_engine_tools1 = [
    QueryEngineTool(
        query_engine=toolEngine,
        metadata=ToolMetadata(name=toolName, description=toolDescription),
    )
    for toolEngine, toolName, toolDescription in bofaJpmcToolSpecs
]

s1_engine = SubQuestionQueryEngine.from_defaults(query_engine_tools=query_engine_tools1)

In [45]:
response = await s1_engine.aquery("compare the results of the equity derivatives business between BoFA and JPMC for 2022?")
print(response)

Generated 2 sub questions.
[1;3;38;2;237;90;200m[2022Bofa_10k] Q: What is the revenue of the equity derivatives business for BoFA in 2022?
[0m[1;3;38;2;90;149;237m[2022jpmc_10k] Q: What is the revenue of the equity derivatives business for JPMC in 2022?
[0m[1;3;38;2;237;90;200m[2022Bofa_10k] A: The revenue of the equity derivatives business for BoFA in 2022 is $6,572 million.
[0m[1;3;38;2;90;149;237m[2022jpmc_10k] A: The revenue of the equity derivatives business for JPMC in 2022 is not provided in the given context information.
[0mThe results of the equity derivatives business for BoFA in 2022 are provided as $6,572 million. However, the revenue of the equity derivatives business for JPMC in 2022 is not provided in the given context information. Therefore, a direct comparison of the results between BoFA and JPMC for 2022 cannot be made based on the given information.


In [46]:
response = await s1_engine.aquery("compare the results of the cash flow business between BoFA and JPMC for 2022?")
print(response)

Generated 2 sub questions.
[1;3;38;2;237;90;200m[2022Bofa_10k] Q: What is the cash flow of BoFA for 2022?
[0m[1;3;38;2;90;149;237m[2022jpmc_10k] Q: What is the cash flow of JPMC for 2022?
[0m[1;3;38;2;90;149;237m[2022jpmc_10k] A: The cash flow of JPMC for 2022 is a net decrease of $173.6 billion.
[0m[1;3;38;2;237;90;200m[2022Bofa_10k] A: The cash flow of Bank of America (BoFA) for 2022 is a net cash used in operating activities of $6,327 million, a net cash used in investing activities of $2,529 million, and a net cash provided by financing activities of $106,039 million.
[0mThe cash flow results for Bank of America (BoFA) and JPMC for 2022 show that BoFA had a net cash used in operating activities of $6,327 million, a net cash used in investing activities of $2,529 million, and a net cash provided by financing activities of $106,039 million. On the other hand, JPMC had a net decrease of $173.6 billion. Therefore, based on the provided information, BoFA had a more positive cash

In [57]:
# (query engine, tool name, tool description) for all four banks, 2018-2022.
# The description tells the sub-question engine what each tool covers, so it
# must name the company whose index backs the engine. The Morgan Stanley and
# Goldman Sachs rows previously carried copy-pasted "JP Morgan" descriptions.
allBankToolSpecs = [
    (bofa2018Engine, '2018Bofa_10k', 'Provides information about Bank of America financials for year 2018'),
    (bofa2019Engine, '2019Bofa_10k', 'Provides information about BofA financials for year 2019'),
    (bofa2020Engine, '2020Bofa_10k', 'Provides information about BAC financials for year 2020'),
    (bofa2021Engine, '2021Bofa_10k', 'Provides information about Bank of America financials for year 2021'),
    (bofa2022Engine, '2022Bofa_10k', 'Provides information about BofA financials for year 2022'),
    (jpmc2018Engine, '2018jpmc_10k', 'Provides information about JP Morgan Chase financials for year 2018'),
    (jpmc2019Engine, '2019jpmc_10k', 'Provides information about JPMC financials for year 2019'),
    (jpmc2020Engine, '2020jpmc_10k', 'Provides information about JP Morgan financials for year 2020'),
    (jpmc2021Engine, '2021jpmc_10k', 'Provides information about JPMC financials for year 2021'),
    (jpmc2022Engine, '2022jpmc_10k', 'Provides information about J P Morgan Chase financials for year 2022'),
    (MS2018Engine, '2018MS_10k', 'Provides information about Morgan Stanley financials for year 2018'),
    (MS2019Engine, '2019MS_10k', 'Provides information about MS financials for year 2019'),
    (MS2020Engine, '2020MS_10k', 'Provides information about Morgan Stanley financials for year 2020'),
    (MS2021Engine, '2021MS_10k', 'Provides information about MS financials for year 2021'),
    (MS2022Engine, '2022MS_10k', 'Provides information about Morgan Stanley financials for year 2022'),
    (GS2018Engine, '2018GS_10k', 'Provides information about Goldman Sachs financials for year 2018'),
    (GS2019Engine, '2019GS_10k', 'Provides information about GS financials for year 2019'),
    (GS2020Engine, '2020GS_10k', 'Provides information about Goldman Sachs financials for year 2020'),
    (GS2021Engine, '2021GS_10k', 'Provides information about GS financials for year 2021'),
    (GS2022Engine, '2022GS_10k', 'Provides information about Goldman Sachs financials for year 2022'),
]

# Wrap every engine as a tool, in the same order as the table above.
query_engine_tools2 = [
    QueryEngineTool(
        query_engine=toolEngine,
        metadata=ToolMetadata(name=toolName, description=toolDescription),
    )
    for toolEngine, toolName, toolDescription in allBankToolSpecs
]

s2_engine = SubQuestionQueryEngine.from_defaults(query_engine_tools=query_engine_tools2)

In [58]:
response = await s2_engine.aquery("compare the results of the equity derivatives business between all the companies.")
print(response)

Generated 20 sub questions.
[1;3;38;2;237;90;200m[2018Bofa_10k] Q: What is the revenue of the equity derivatives business for Bank of America in 2018?
[0m[1;3;38;2;90;149;237m[2019Bofa_10k] Q: What is the revenue of the equity derivatives business for Bank of America in 2019?
[0m[1;3;38;2;11;159;203m[2020Bofa_10k] Q: What is the revenue of the equity derivatives business for Bank of America in 2020?
[0m[1;3;38;2;155;135;227m[2021Bofa_10k] Q: What is the revenue of the equity derivatives business for Bank of America in 2021?
[0m[1;3;38;2;237;90;200m[2022Bofa_10k] Q: What is the revenue of the equity derivatives business for Bank of America in 2022?
[0m[1;3;38;2;90;149;237m[2018jpmc_10k] Q: What is the revenue of the equity derivatives business for JP Morgan Chase in 2018?
[0m[1;3;38;2;11;159;203m[2019jpmc_10k] Q: What is the revenue of the equity derivatives business for JP Morgan Chase in 2019?
[0m[1;3;38;2;155;135;227m[2020jpmc_10k] Q: What is the revenue of the equity 

In [62]:
response = await s2_engine.aquery("compare the results of the net revenue for all the companies.")
print(response)

Generated 20 sub questions.
[1;3;38;2;237;90;200m[2018Bofa_10k] Q: What is the net revenue for Bank of America in 2018?
[0m[1;3;38;2;90;149;237m[2019Bofa_10k] Q: What is the net revenue for Bank of America in 2019?
[0m[1;3;38;2;11;159;203m[2020Bofa_10k] Q: What is the net revenue for Bank of America in 2020?
[0m[1;3;38;2;155;135;227m[2021Bofa_10k] Q: What is the net revenue for Bank of America in 2021?
[0m[1;3;38;2;237;90;200m[2022Bofa_10k] Q: What is the net revenue for Bank of America in 2022?
[0m[1;3;38;2;90;149;237m[2018jpmc_10k] Q: What is the net revenue for JP Morgan Chase in 2018?
[0m[1;3;38;2;11;159;203m[2019jpmc_10k] Q: What is the net revenue for JP Morgan Chase in 2019?
[0m[1;3;38;2;155;135;227m[2020jpmc_10k] Q: What is the net revenue for JP Morgan Chase in 2020?
[0m[1;3;38;2;237;90;200m[2021jpmc_10k] Q: What is the net revenue for JP Morgan Chase in 2021?
[0m[1;3;38;2;90;149;237m[2022jpmc_10k] Q: What is the net revenue for JP Morgan Chase in 2022?
[0m

In [80]:
from llama_index import GPTListIndex, LLMPredictor
from llama_index.composability import ComposableGraph
from llama_index.indices.query.query_transform import DecomposeQueryTransform
# Wrap the configured LLM for use by the query transform below.
llm_predictor = LLMPredictor(llm)

# Query-decomposition transform. No active code in the visible cells uses it;
# its only mention is a commented-out "query_transform" entry.
decompose_transform = DecomposeQueryTransform(
    llm_predictor,
    verbose=True,
)

years = [2018, 2019, 2020, 2021, 2022]
companies = ['Bank of America', 'J P Morgan Chase', 'Morgan Stanley', 'Goldman Sachs']

# Per-company indexes, listed in the same order as `years`.
indexesByCompany = {
    'Bank of America': [bofa2018Index, bofa2019Index, bofa2020Index, bofa2021Index, bofa2022Index],
    'J P Morgan Chase': [jpmc2018Index, jpmc2019Index, jpmc2020Index, jpmc2021Index, jpmc2022Index],
    'Morgan Stanley': [MS2018Index, MS2019Index, MS2020Index, MS2021Index, MS2022Index],
    'Goldman Sachs': [GS2018Index, GS2019Index, GS2020Index, GS2021Index, GS2022Index],
}

# Build the index map and the summary map in one pass with identical keys, so
# each index is paired with its own summary by key rather than by the
# coincidence of two separately maintained insertion orders.
vectorIndicies = {}
summaries = {}
for company in companies:
    for year, companyYearIndex in zip(years, indexesByCompany[company]):
        # The year is stored as a string because later cells look indexes up
        # with keys such as ('Bank of America', '2018').
        graphKey = (company, str(year))
        vectorIndicies[graphKey] = companyYearIndex
        summaries[graphKey] = f"{company} 10-k Filing for {year} fiscal year"

# Compose the per-filing vector indexes under a list index, passing one
# summary string per child index.
graph = ComposableGraph.from_indices(
    GPTListIndex,
    list(vectorIndicies.values()),
    [summaries[graphKey] for graphKey in vectorIndicies],
)

In [93]:
# Settings for the "dict" index struct type: default query mode, top 3 similar chunks.
# (An "include_summary" query kwarg and a "query_transform" entry were left disabled.)
dictIndexQueryConfig = {
    "index_struct_type": "dict",
    "query_mode": "default",
    "query_kwargs": {"similarity_top_k": 3},
}

# Settings for the "list" index struct type: default query mode, tree_summarize
# response mode, verbose output.
listIndexQueryConfig = {
    "index_struct_type": "list",
    "query_mode": "default",
    "query_kwargs": {"response_mode": "tree_summarize", "verbose": True},
}

query_configs = [dictIndexQueryConfig, listIndexQueryConfig]

In [101]:
# Query one filing's index directly, bypassing the graph.
bofa2018DirectEngine = vectorIndicies[('Bank of America', '2018')].as_query_engine(similarity_top_k=3)
response = bofa2018DirectEngine.query("What is the operating cash flow?")
print(response)

The operating cash flow for the given context is $24,381 million in 2018.


In [102]:
# Query one filing's index directly, bypassing the graph.
ms2020DirectEngine = vectorIndicies[('Morgan Stanley', '2020')].as_query_engine(similarity_top_k=3)
response = ms2020DirectEngine.query("What is the operating cash flow?")
print(response)

The operating cash flow for the given context is $14,202 million.


In [82]:
# Prompt: an instruction paragraph followed by two bullet lines, joined with newlines.
risk_query_str = "\n".join([
    "Describe the current risk factors. If the year is provided in the information, "
    "provide that as well. If the context contains risk factors for multiple years, "
    "explicitly provide the following:",
    "- A description of the risk factors for each year",
    "- A summary of how these risk factors are changing across years",
])

# Run the prompt through the composed graph using the per-index-type configs.
riskGraphEngine = graph.as_query_engine(query_configs=query_configs)
response_summary = riskGraphEngine.query(risk_query_str)
print(response_summary)

The current risk factors described in the context information are related to various aspects such as financial markets, economic conditions, fiscal and monetary policies, regulatory policies, global uncertainties, and the impact of the COVID-19 pandemic. These risk factors can adversely affect businesses, financial institutions, and the global economy. The specific year for these risk factors is not provided in the information. Therefore, it is not possible to provide a specific description of the risk factors for each year or summarize how these risk factors are changing across years.


In [94]:
# The variable keeps its original name although this question is about
# equity derivatives rather than risk factors.
risk_query_str = "compare the results of the equity derivatives business between BoFA and JPMC for 2022?"
equityGraphEngine = graph.as_query_engine(query_configs=query_configs)
response_summary = equityGraphEngine.query(risk_query_str)
print(response_summary)

I'm sorry, but I cannot provide information about the results of the equity derivatives business between Bank of America (BoFA) and JPMorgan Chase (JPMC) for 2022. The provided context information does not mention any specific details or comparisons regarding the equity derivatives business of these two banks.


#### Storing the data in Cognitive Search

In [111]:
# set up Azure Cognitive Search
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents import SearchClient
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from llama_index.vector_stores import CognitiveSearchVectorStore
from llama_index.vector_stores.cogsearch import (
    IndexManagement,
    MetadataIndexFieldType,
    CognitiveSearchVectorStore,
)

indexName = "secllama"
SearchServiceCredential = AzureKeyCredential(SearchKey)

# Both clients talk to the same service endpoint.
searchEndpoint = f"https://{SearchService}.search.windows.net"

# Index client: handed to the vector store below together with CREATE_IF_NOT_EXISTS.
indexClient = SearchIndexClient(endpoint=searchEndpoint, credential=SearchServiceCredential)

# Search client bound to the index name above (demonstrates using an existing index).
searchClient = SearchClient(
    endpoint=searchEndpoint,
    index_name=indexName,
    credential=SearchServiceCredential,
)

# Names of the search-index fields the vector store reads and writes.
searchFieldKeys = dict(
    id_field_key="id",
    chunk_field_key="content",
    embedding_field_key="embedding",
    metadata_string_field_key="li_jsonMetadata",
    doc_id_field_key="li_doc_id",
)

# filterable_metadata_field_keys is deliberately left unset (it was commented out).
vectorStore = CognitiveSearchVectorStore(
    search_or_index_client=indexClient,
    index_name=indexName,
    index_management=IndexManagement.CREATE_IF_NOT_EXISTS,
    **searchFieldKeys,
)

In [112]:
# One storage context over the Cognitive Search vector store, shared by every index below.
vectorStoreContext = StorageContext.from_defaults(vector_store=vectorStore)

# NOTE(review): all five indexes are built on the same storage context, so they
# presumably write to and read from the same search index. The rendered output of
# the query cell that follows (asked through boaIndex2022) lists 2018/2017/2016
# figures - confirm whether per-year isolation was intended.
boaIndex2018, boaIndex2019, boaIndex2020, boaIndex2021, boaIndex2022 = (
    VectorStoreIndex.from_documents(yearDocuments, storage_context=vectorStoreContext)
    for yearDocuments in (bofA2018, bofA2019, bofA2020, bofA2021, bofA2022)
)

In [116]:
# Ask a question through the index object created for the 2022 filing,
# retrieving the top 3 most similar chunks.
cashFlowQuestion = "What is the cash flow?"
queryEngine = boaIndex2022.as_query_engine(similarity_top_k=3)
response = queryEngine.query(cashFlowQuestion)
print(response)

The cash flow for the given context information is as follows:

2018: Net cash provided by operating activities - $24,381 million
Net cash used in investing activities - $2,163 million
Net cash used in financing activities - $21,824 million

2017: Net cash provided by operating activities - $8,402 million
Net cash used in investing activities - $6,776 million
Net cash used in financing activities - $17,127 million

2016: Net cash used in operating activities - $2,007 million
Net cash used in investing activities - $65,789 million
Net cash used in financing activities - $9,980 million
