In [10]:
from llama_index.core import (
    VectorStoreIndex,
    SimpleKeywordTableIndex,
    SimpleDirectoryReader,
)
from llama_index.core import SummaryIndex
from llama_index.core.schema import IndexNode
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.llms.openai import OpenAI
from llama_index.core.callbacks import CallbackManager

from pathlib import Path
import PyPDF2

In [11]:
# 10-K filings to ingest, resolved relative to the notebook's working directory.
pdf_file_paths = [
    Path(pdf_name)
    for pdf_name in (
        "goog-10-k-2023.pdf",
        "goog-10-k-2022.pdf",
        # Add more file names...
    )
]

In [12]:
# Convert every 10-K PDF into a plain-text file at data/<pdf stem>.txt so the
# next cell can load it with SimpleDirectoryReader.
data_path = Path("data")
# Create the output directory once, up front; exist_ok makes re-running the
# cell safe and avoids a check-then-create race.
data_path.mkdir(exist_ok=True)

for file_path in pdf_file_paths:
    with open(file_path, "rb") as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # Join once instead of growing a string page by page. `or ""` guards
        # against a page for which no text comes back (e.g. image-only pages
        # on some PyPDF2 versions -- TODO confirm for the pinned version).
        pdf_text = "".join(
            page.extract_text() or "" for page in pdf_reader.pages
        )

    # Write explicitly as UTF-8: the platform default encoding (e.g. cp1252 on
    # Windows) can fail on characters found in extracted PDF text.
    with open(
        data_path / f"{file_path.stem}.txt", "w", encoding="utf-8"
    ) as fp:
        fp.write(pdf_text)

In [13]:
# Load all 10-K PDF documents
# Maps each PDF stem to the documents SimpleDirectoryReader loads from the
# text file written for it in the extraction cell (data/<stem>.txt).
pdf_docs = {}
for file_path in pdf_file_paths:
    pdf_docs[file_path.stem] = SimpleDirectoryReader(
        input_files=[f"data/{file_path.stem}.txt"]
    ).load_data()

# Define Global LLM and Embeddings
import os
from getpass import getpass

# Never overwrite a key that is already configured, and never hard-code one in
# the notebook: prompt for it only when the environment does not provide it.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings

# Assigned on the shared Settings object; later cells read Settings.llm when
# building the per-document query engines.
Settings.llm = OpenAI(temperature=0, model="gpt-3.5-turbo")
Settings.embed_model = OpenAIEmbedding(model="text-embedding-ada-002")

In [14]:
# Build Document Agent for each Document
#
# For every PDF this cell:
#   1. splits its documents into nodes (also collected in `all_nodes` for the
#      baseline index),
#   2. builds -- or reloads from ./data/<stem> -- a vector index, plus a fresh
#      in-memory summary index,
#   3. wraps both query engines as tools and hands them to an OpenAIAgent.
# Results are stored in `agents` and `query_engines`, keyed by PDF stem.
from llama_index.agent.openai import OpenAIAgent
from llama_index.core import load_index_from_storage, StorageContext
from llama_index.core.node_parser import SentenceSplitter
import os

node_parser = SentenceSplitter()

# LLM that drives tool calling in every document agent. It does not depend on
# the document, so it is created once instead of once per loop iteration.
function_llm = OpenAI(model="gpt-4")

# Build agents dictionary
agents = {}
query_engines = {}

# this is for the baseline
all_nodes = []

for file_path in pdf_file_paths:
    doc_name = file_path.stem
    persist_dir = f"./data/{doc_name}"

    nodes = node_parser.get_nodes_from_documents(pdf_docs[doc_name])
    all_nodes.extend(nodes)

    # The vector index is cached on disk. NOTE: if persist_dir already exists
    # it is loaded as-is, even when the source text has changed since it was
    # written -- delete the directory to force a rebuild.
    if not os.path.exists(persist_dir):
        # build vector index
        vector_index = VectorStoreIndex(nodes)
        vector_index.storage_context.persist(persist_dir=persist_dir)
    else:
        vector_index = load_index_from_storage(
            StorageContext.from_defaults(persist_dir=persist_dir),
        )

    # build summary index (rebuilt on every run; it is not persisted)
    summary_index = SummaryIndex(nodes)
    # define query engines
    vector_query_engine = vector_index.as_query_engine(llm=Settings.llm)
    summary_query_engine = summary_index.as_query_engine(llm=Settings.llm)

    # define tools
    query_engine_tools = [
        QueryEngineTool(
            query_engine=vector_query_engine,
            metadata=ToolMetadata(
                name="vector_tool",
                description=(
                    "Useful for questions related to specific aspects of"
                    f" {doc_name} (e.g. financial statements, risk factors, management discussion and analysis, or more)."
                ),
            ),
        ),
        QueryEngineTool(
            query_engine=summary_query_engine,
            metadata=ToolMetadata(
                name="summary_tool",
                description=(
                    "Useful for any requests that require a holistic summary"
                    f" of EVERYTHING about {doc_name}. For questions about"
                    " more specific sections, please use the vector_tool."
                ),
            ),
        ),
    ]

    # build agent
    agent = OpenAIAgent.from_tools(
        query_engine_tools,
        llm=function_llm,
        verbose=True,
        system_prompt=f"""\
You are a specialized agent designed to answer queries about {doc_name}.
You must ALWAYS use at least one of the tools provided when answering a question; do NOT rely on prior knowledge.\
""",
    )

    agents[doc_name] = agent
    # Plain retrieval-only engine over the same vector index, kept alongside
    # the agent for direct (non-agent) querying.
    query_engines[doc_name] = vector_index.as_query_engine(
        similarity_top_k=2
    )


In [15]:
# Build Retriever-Enabled OpenAI Agent
# Expose each per-document agent as a tool named tool_<pdf stem>; the
# description is the text the tool is advertised with.
all_tools = [
    QueryEngineTool(
        query_engine=agents[pdf_path.stem],
        metadata=ToolMetadata(
            name=f"tool_{pdf_path.stem}",
            description=(
                f"This content contains information from the 10-K PDF: {pdf_path.stem}. Use"
                f" this tool if you want to answer any questions about {pdf_path.stem}.\n"
            ),
        ),
    )
    for pdf_path in pdf_file_paths
]

In [19]:
# Top-level agent: the per-document tools are placed in an object index, and
# the agent receives a retriever over that index instead of a fixed tool list.
from llama_index.core import VectorStoreIndex
from llama_index.core.objects import ObjectIndex
from llama_index.agent.openai import OpenAIAgent

obj_index = ObjectIndex.from_objects(all_tools, index_cls=VectorStoreIndex)

# Retriever handed to the agent; similarity_top_k=3 caps how many tools it
# returns per query.
tool_retriever = obj_index.as_retriever(similarity_top_k=3)

top_agent = OpenAIAgent.from_tools(
    tool_retriever=tool_retriever,
    verbose=True,
    system_prompt=""" \
You are an agent designed to answer queries about a set of given 10-K PDF documents.
Please always use the tools provided to answer a question. Do not rely on prior knowledge.\
""",
)

# Baseline for comparison: one flat vector index over the nodes of all
# documents, queried directly without any agent.
base_index = VectorStoreIndex(all_nodes)
base_query_engine = base_index.as_query_engine(similarity_top_k=4)

# Example query -- replace with your own.
example_query = "What were the total revenues for the company in the latest fiscal year? What were the total revenues for the company in  2022?"
response = top_agent.query(example_query)
print(response)

Added user message to memory: What were the total revenues for the company in the latest fiscal year? What were the total revenues for the company in  2022?
=== Calling Function ===
Calling function: tool_goog-10-k-2023 with args: {"input": "total revenues"}
Added user message to memory: total revenues
=== Calling Function ===
Calling function: vector_tool with args: {
  "input": "total revenues"
}
Got output: Total revenues for the year ended December 31, 2023 were $307.4 billion.

Got output: The total revenues for the year ended December 31, 2023 were $307.4 billion.

=== Calling Function ===
Calling function: tool_goog-10-k-2022 with args: {"input": "total revenues"}
Added user message to memory: total revenues
=== Calling Function ===
Calling function: vector_tool with args: {
  "input": "total revenues"
}
Got output: Total revenues for the company are generated through various sources such as brand advertising, Google Cloud services, Google other revenues (including Google Play, 