In [1]:
from llama_index.llms import AzureOpenAI
from llama_index.embeddings import AzureOpenAIEmbedding
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index import set_global_service_context
from llama_index import download_loader
from llama_index import GPTVectorStoreIndex
from llama_index import SimpleDirectoryReader

In [2]:
# `logging` is imported explicitly (and before the wildcard import, so a name
# exported by Utilities.envVars would still take precedence exactly as before).
# Previously `logging.info(...)` below only worked if the wildcard import
# happened to bring `logging` into the namespace.
import logging
import openai
from Utilities.envVars import *
import os

# Azure OpenAI connection settings, taken from the names exported by Utilities.envVars.
azure_endpoint = f"{OpenAiEndPoint}"
api_key = OpenAiKey
api_version = OpenAiVersion

embeddingModelType = "azureopenai"
# NOTE(review): temperature and tokenLength are defined here but are not passed
# to the LLM constructed below -- confirm whether they were meant to be wired in.
temperature = 0.3
tokenLength = 1000

if embeddingModelType == 'azureopenai':
    # Module-level openai settings, kept as in the original cell.
    openai.api_type = "azure"
    openai.api_key = OpenAiKey
    openai.api_version = OpenAiVersion
    openai.api_base = f"{OpenAiEndPoint}"

    # Chat/completion model served from the Azure deployment named by OpenAiChat.
    llm = AzureOpenAI(
        model="gpt-35-turbo-16k",
        deployment_name=OpenAiChat,
        api_key=api_key,
        azure_endpoint=azure_endpoint,
        api_version=api_version,
    )
    # Embedding model served from the Azure deployment named by OpenAiEmbedding.
    embeddings = AzureOpenAIEmbedding(
        model="text-embedding-ada-002",
        deployment_name=OpenAiEmbedding,
        api_key=api_key,
        azure_endpoint=azure_endpoint,
        api_version=api_version,
    )
    logging.info("LLM Setup done")

    # Register the LLM + embedding model as the global default so that later
    # cells can build indices without passing a service context explicitly.
    service_context = ServiceContext.from_defaults(
        llm=llm,
        embed_model=embeddings,
    )
    set_global_service_context(service_context)
elif embeddingModelType == "openai":
    # NOTE(review): this branch only prints; `llm`, `embeddings` and
    # `service_context` are not defined when it is taken.
    print("OpenAI")

In [3]:
# from llama_index import download_loader
# SECFilingsLoader = download_loader('SECFilingsLoader')

# loader = SECFilingsLoader(tickers=['TSLA'],amount=3,filing_type="10-K")
# loader.load_data()

In [4]:
# Load the 2022 filing, index it, and run a single sanity-check query.
# The directory is assembled with os.path.join instead of hard-coded "\\"
# separators so the cell also works on non-Windows systems (on Windows the
# resulting string is unchanged).
# NOTE(review): this cell uses "data" while the per-year loop further down uses
# "Data"; on a case-sensitive filesystem only one can match -- confirm the
# actual directory name.
tsla_2022_dir = os.path.join("data", "TSLA", "2022")
documents = SimpleDirectoryReader(tsla_2022_dir).load_data()
print(f"Loaded {len(documents)} documents for year 2022")

index = VectorStoreIndex.from_documents(documents)
retriever = index.as_retriever()
queryEngine = index.as_query_engine()

query = 'What are the risk factors of Tesla for the year 2022?'
answer = queryEngine.query(query)

# Show the retrieved source snippets first, then the question and the answer.
print(answer.get_formatted_sources())
print("query was:", query)
print("answer was:", answer)


Loaded 1 documents for year 2022
> Source (Doc id: ba212719-142d-40d8-9478-36e8b0e90201): We also offer maintenance, installation, operation, financial and other services related to our p...

> Source (Doc id: 7f9658f4-3342-4dba-b030-67d973a02d6c): In 2022, we had over 200 graduates from Tesla START programs, with an additional 100+ graduating ...
query was: What are the risk factors of Tesla for the year 2022?
answer was: The risk factors for Tesla in 2022 include the impact of macroeconomic conditions resulting from the global COVID-19 pandemic, such as government regulations and shifting social behaviors that may limit or close non-essential transportation and business activities. The pandemic-related issues have also exacerbated port congestion, intermittent supplier shutdowns and delays, and labor shortages, leading to increased expenses, supply chain challenges, and difficulty in hiring and retaining workers. Additionally, the company may face challenges in launching and ramping 

In [5]:
from llama_index import ServiceContext

# Build one vector index per filing year (index_set) and also collect every
# loaded document into all_docs for the combined index built in a later cell.
#doc_set = {}
index_set = {}
all_docs = []
years = [2022, 2021]
for year in years:
    # os.path.join replaces the hard-coded "\\" separators so the path is valid
    # on non-Windows systems too; on Windows the string is identical.
    year_dir = os.path.join("Data", "TSLA", str(year))
    year_docs = SimpleDirectoryReader(year_dir).load_data()
    # Tag every document of this year with the year it belongs to.
    for d in year_docs:
        d.extra_info = {"year": year}
    print(f"Loaded {len(year_docs)} documents for year {year}")
    cur_index = VectorStoreIndex.from_documents(year_docs)
    print(f"Created index for year {year}")
    index_set[year] = cur_index
    #doc_set[year] = year_docs
    all_docs.extend(year_docs)

Loaded 1 documents for year 2022
Created index for year 2022
Loaded 1 documents for year 2021
Created index for year 2021


In [6]:
# Ask one question against each per-year index (similarity_top_k=3) and print
# each answer as it arrives; `response` ends up holding the last answer.
for filing_year, question in (
    (2021, "What were some of the biggest risk factors in 2020?"),
    (2022, "What were some of the significant acquisitions?"),
):
    year_engine = index_set[filing_year].as_query_engine(similarity_top_k=3)
    response = year_engine.query(question)
    print(response)

Some of the biggest risk factors in 2020 were the worldwide impact of the COVID-19 pandemic, which led to temporary suspensions of operations at manufacturing facilities, shutdowns of suppliers, and temporary employee furloughs and compensation reductions. Additionally, there were challenges in product deliveries and deployments due to impediments to administrative activities. Global trade conditions and consumer trends originating from the pandemic also had adverse impacts, such as port congestion, intermittent supplier shutdowns and delays, and a shortfall of semiconductors. Labor shortages resulting from the pandemic, including worker absenteeism, also posed challenges in hiring and retaining manufacturing and service workers.
Based on the given context information, there is no mention of any significant acquisitions.


In [12]:
# A single vector store built from every document of every year (all_docs).
# It exists only for the experiment
# "Can a single vector index answer questions across years?"
global_index = VectorStoreIndex.from_documents(all_docs)

risk_query_str = "What are some of the biggest risk factors in each year?"
global_engine = global_index.as_query_engine(similarity_top_k=3)
response = global_engine.query(risk_query_str)
print(str(response))

Some of the biggest risk factors in each year include:
- In 2022, the risks include being impacted by macroeconomic conditions resulting from the global COVID-19 pandemic, delays in launching and ramping up production, and events outside of their control such as natural disasters, wars, and health epidemics.
- In 2022, the risks include being impacted by events outside of their control such as natural disasters, wars, and health epidemics, as well as the potential impact of the global COVID-19 pandemic on economic markets, manufacturing operations, supply chains, employment, and consumer behavior.
- In both years, the risks include being impacted by events outside of their control such as natural disasters, wars, and health epidemics, as well as the potential impact of the global COVID-19 pandemic. Additionally, there are risks related to government and economic incentives supporting the development and adoption of their products, and compliance with evolving laws and regulations.


In [13]:
# Cross-year revenue question against the combined index.
# NOTE: the variable keeps the name risk_query_str even though the question is
# about revenue, not risk factors.
risk_query_str = "Compare the revenue from 2021 to 2022?"
revenue_engine = global_index.as_query_engine(similarity_top_k=3)
response = revenue_engine.query(risk_query_str)
print(str(response))

In 2022, the revenue increased compared to 2021. Automotive sales revenue increased by $23.09 billion, or 52%. Automotive leasing revenue increased by $834 million, or 51%. Services and other revenue increased by $2.29 billion, or 60%. Energy generation and storage revenue increased by $1.12 billion, or 40%.


In [7]:
from llama_index import GPTListIndex, LLMPredictor
from llama_index.composability import ComposableGraph

# One short summary string per yearly index; these are handed to
# ComposableGraph.from_indices as the per-index summaries.
summaries = {y: f"TSLA 10-k Filing for {y} fiscal year" for y in years}

# Wrap the LLM configured earlier for use by the graph's root index.
# NOTE(review): no output-token limit is configured here.
llm_predictor = LLMPredictor(llm)
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)

# A list index on top of the per-year vector indices, so one query can draw on
# every year.
graph = ComposableGraph.from_indices(
    GPTListIndex,
    [index_set[y] for y in years],
    [summaries[y] for y in years],
    service_context=service_context,
)

risk_query_str = (
    "Describe the current risk factors. If the year is provided in the information, "
    "provide that as well. If the context contains risk factors for multiple years, "
    "explicitly provide the following:\n"
    "- A description of the risk factors for each year\n"
    "- A summary of how these risk factors are changing across years"
)

# Per-index query engines for the graph.
# The previous version passed a legacy `query_configs` list to
# graph.as_query_engine(); that keyword is not a recognised parameter in the
# llama_index API generation used by this notebook, so similarity_top_k=3 and
# response_mode="tree_summarize" were not applied. The same settings are now
# supplied through `custom_query_engines`, keyed by index id.
# TODO confirm against the installed llama_index version.
custom_query_engines = {
    index_set[y].index_id: index_set[y].as_query_engine(similarity_top_k=3)
    for y in years
}
custom_query_engines[graph.root_id] = graph.root_index.as_query_engine(
    response_mode="tree_summarize",
)

graph_query_engine = graph.as_query_engine(custom_query_engines=custom_query_engines)
response_summary = graph_query_engine.query(risk_query_str)
print(response_summary)

The current risk factors described in the context information are related to the impact of the global COVID-19 pandemic on the company's business. These risks include government regulations and shifting social behaviors that have limited or closed non-essential transportation, government functions, business activities, and person-to-person interactions. The company temporarily suspended operations at its manufacturing facilities worldwide and implemented temporary employee furloughs and compensation reductions during the scaled-back operations in the U.S. The specific year for these risk factors is not mentioned in the context information. Additionally, the context information does not provide risk factors for multiple years or any information about how these risk factors may be changing across years.


In [8]:
# from llama_index import download_loader
# from pathlib import Path

# UnstructuredReader = download_loader("UnstructuredReader", refresh_cache=True)

# loader = UnstructuredReader()
# doc_set = {}
# all_docs = []
# years = [2022, 2021, 2020, 2019]
# for year in years:
#     year_docs = loader.load_data(file=Path(f'Data\\UBER\\UBER_{year}.html'), split_documents=False)
#     # insert year metadata into each year
#     for d in year_docs:
#         d.extra_info = {"year": year}
#     doc_set[year] = year_docs
#     all_docs.extend(year_docs)  

# from llama_index import ServiceContext
# service_context = ServiceContext.from_defaults(chunk_size=512)

# # initialize simple vector indices + global vector index
# # NOTE: don't run this cell if the indices are already loaded! 
# index_set = {}
# for year in years:
#     cur_index = GPTVectorStoreIndex.from_documents(doc_set[year], service_context=service_context)
#     index_set[year] = cur_index
#     #cur_index.save_to_disk(f'index_{year}.json')

In [9]:
# response = index_set[2020].as_query_engine(similarity_top_k=3).query("What were some of the biggest risk factors in 2020?")
# print(response)

# response = index_set[2020].as_query_engine(similarity_top_k=3).query("What were some of the significant acquisitions?")
# print(response)

In [10]:
# # NOTE: this global index is a single vector store containing all documents
# # Only relevant for the section below: "Can a single vector index answer questions across years?"
# global_index = GPTVectorStoreIndex.from_documents(all_docs, service_context=service_context)

# risk_query_str = "What are some of the biggest risk factors in each year?"
# response = global_index.as_query_engine(similarity_top_k=3).query(risk_query_str)
# print(str(response))

In [11]:
# from llama_index import GPTListIndex, LLMPredictor
# from langchain import OpenAI
# from llama_index.composability import ComposableGraph

# # set summary text for each doc
# summaries = {}
# for year in years:
#     summaries[year] = f"UBER 10-k Filing for {year} fiscal year"

# # set number of output tokens
# llm_predictor = LLMPredictor(llm=OpenAI(temperature=0, max_tokens=512))
# service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)

# graph = ComposableGraph.from_indices(
#     GPTListIndex,
#     [index_set[y] for y in years],
#     [summaries[y] for y in years],
#     service_context=service_context
# )

# risk_query_str = (
#     "Describe the current risk factors. If the year is provided in the information, "
#     "provide that as well. If the context contains risk factors for multiple years, "
#     "explicitly provide the following:\n"
#     "- A description of the risk factors for each year\n"
#     "- A summary of how these risk factors are changing across years"
# )

# query_configs = [
#     {
#         "index_struct_type": "dict",
#         "query_mode": "default",
#         "query_kwargs": {
#             "similarity_top_k": 1,
#             # "include_summary": True
#         }
#     },
#     {
#         "index_struct_type": "list",
#         "query_mode": "default",
#         "query_kwargs": {
#             "response_mode": "tree_summarize",
#         }
#     },
# ]

# response_summary = graph.as_query_engine(query_configs=query_configs).query(risk_query_str)
# print(response_summary)