### Multi document agent 
In this notebook, we will build a multi-document agent. In this architecture, each document has its own individual agent, and an overarching agent decides which of those agents a question should be routed to.

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to local .py files are picked up.
%load_ext autoreload
%autoreload 2

In [2]:
# Needed only once: uncomment the lines below to download the LlamaIndex documentation website.
# domain = "docs.llamaindex.ai"
# docs_url = "https://docs.llamaindex.ai/en/latest/"
# !wget -e robots=off --recursive --no-clobber --page-requisites --html-extension --convert-links --restrict-file-names=windows --domains {domain} --no-parent {docs_url}


Both --no-clobber and --convert-links were specified, only --convert-links will be used.
--2024-02-11 01:28:00--  https://docs.llamaindex.ai/en/latest/
Resolving docs.llamaindex.ai (docs.llamaindex.ai)... 104.18.1.163, 104.18.0.163
Connecting to docs.llamaindex.ai (docs.llamaindex.ai)|104.18.1.163|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘docs.llamaindex.ai/en/latest/index.html’

docs.llamaindex.ai/     [ <=>                ] 226.37K  1.31MB/s    in 0.2s    

2024-02-11 01:28:02 (1.31 MB/s) - ‘docs.llamaindex.ai/en/latest/index.html’ saved [231805]

--2024-02-11 01:28:02--  https://docs.llamaindex.ai/en/latest/genindex.html
Reusing existing connection to docs.llamaindex.ai:443.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘docs.llamaindex.ai/en/latest/genindex.html’

docs.llamaindex.ai/     [     <=>            ]   1002K  1.16MB/s    in 0.8s    

2024-02-11 01:28:03 (1.16 MB

In [2]:
from llama_hub.file.unstructured.base import UnstructuredReader
from llama_index.llms.openai import OpenAI
from llama_index.service_context import ServiceContext
from pathlib import Path

In [3]:
# Reader used further down (via `reader.load_data`) to load each HTML page.
reader = UnstructuredReader()

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kosisochukwuasuzu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/kosisochukwuasuzu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [4]:
# Walk the mirrored documentation directory recursively and store every
# entry as a resolved (absolute) path.
all_files_gen = Path("./docs.llamaindex.ai/").rglob("*")
all_files = []
for entry in all_files_gen:
    all_files.append(entry.resolve())

In [5]:
len(all_files)

322

In [6]:
# Distinct file extensions found in the download (an empty string means no extension).
suffixes = {path.suffix for path in all_files}

In [7]:
suffixes

{'', '.css', '.html', '.js'}

In [8]:
# Keep only the HTML pages; the suffix is lower-cased first, so the match is case-insensitive.
all_html_files = [
    path
    for path in all_files
    if path.suffix.lower() == ".html"
]

In [9]:
len(all_html_files)

227

In [10]:
from llama_index.schema import Document

# Upper bound on how many HTML pages are loaded into `docs` by the loading loop.
doc_limit = 100

In [11]:
# Number of leading elements dropped from every loaded page before the
# remaining elements are merged into one Document.
# NOTE(review): presumably these first elements are navigation/ToC boilerplate
# shared by all pages — confirm that 72 is still right for the current site.
start_idx = 72

docs = []
# Slicing (instead of indexing over range(doc_limit)) avoids an IndexError
# when fewer than `doc_limit` HTML files were downloaded.
for idx, f in enumerate(all_html_files[:doc_limit]):
    print(f"Idx {idx}/{len(all_html_files)}")
    # The reader returns the page as a list of split elements.
    doc_parts = reader.load_data(file=f, split_documents=True)
    # Merge the remaining elements into a single Document and remember its source path.
    loaded_doc = Document(
        text="\n\n".join(part.get_content() for part in doc_parts[start_idx:]),
        metadata={"path": str(f)},
    )
    print(loaded_doc.metadata["path"])
    docs.append(loaded_doc)

Idx 0/227
/Users/kosisochukwuasuzu/Developer/ai-startups/test-demos/pdfchat/src/llamaindex/qa/multidocagentqa/docs.llamaindex.ai/en/latest/index.html
Idx 1/227
/Users/kosisochukwuasuzu/Developer/ai-startups/test-demos/pdfchat/src/llamaindex/qa/multidocagentqa/docs.llamaindex.ai/en/latest/genindex.html
Idx 2/227
/Users/kosisochukwuasuzu/Developer/ai-startups/test-demos/pdfchat/src/llamaindex/qa/multidocagentqa/docs.llamaindex.ai/en/latest/search.html
Idx 3/227
/Users/kosisochukwuasuzu/Developer/ai-startups/test-demos/pdfchat/src/llamaindex/qa/multidocagentqa/docs.llamaindex.ai/en/latest/understanding/understanding.html
Idx 4/227
/Users/kosisochukwuasuzu/Developer/ai-startups/test-demos/pdfchat/src/llamaindex/qa/multidocagentqa/docs.llamaindex.ai/en/latest/understanding/using_llms/using_llms.html
Idx 5/227
/Users/kosisochukwuasuzu/Developer/ai-startups/test-demos/pdfchat/src/llamaindex/qa/multidocagentqa/docs.llamaindex.ai/en/latest/understanding/using_llms/privacy.html
Idx 6/227
/Users/

In [12]:
# LLM (temperature 0.0) wrapped in the ServiceContext that the indices in
# `build_agent_per_doc` are constructed with.
# NOTE(review): a Mixtral model name is passed to the `OpenAI` client class;
# presumably this relies on an OpenAI-compatible endpoint configured outside
# this notebook (e.g. environment variables) — confirm.
llm = OpenAI(model="mistralai/Mixtral-8x7B-Instruct-v0.1", temperature=0.0)
service_context = ServiceContext.from_defaults(llm=llm)

In [13]:
from llama_index.indices.vector_store import VectorStoreIndex
from llama_index.indices.list.base import SummaryIndex


In [14]:
# Patch asyncio so event loops can be nested; Jupyter already runs its own
# loop, and the async index/query calls below need to run inside it.
import nest_asyncio

nest_asyncio.apply()

#### Building document agent for each html document
This means we will have up to ~227 agents (one per HTML page). For each document we define two query engines: a semantic (vector) search engine and a summarization engine. These are then passed to the document's agent as tools. We use OpenAI's function-calling capability to achieve this.

In [15]:
from llama_index.agent.openai.base import OpenAIAgent
from llama_index.indices.loading import load_index_from_storage
from llama_index.storage import StorageContext
from llama_index.tools.query_engine import QueryEngineTool
from llama_index.tools.types import ToolMetadata
from llama_index.node_parser import SentenceSplitter
import os
from tqdm import tqdm
import pickle


In [16]:
from textwrap import dedent

In [17]:
async def build_agent_per_doc(nodes, file_base):
    """Build the agent and a short summary for a single document.

    A vector index and a summary index are created over ``nodes`` and exposed
    to an ``OpenAIAgent`` as two query-engine tools. The vector index and the
    generated summary are cached under ``./data/llamaindex_docs/`` (keyed by
    ``file_base``) and reused when they already exist; the summary index
    itself is rebuilt on every call.

    Args:
        nodes: Parsed nodes of one document.
        file_base: Identifier of the document; used in the cache paths and
            in the tool names.

    Returns:
        A ``(agent, summary)`` tuple, where ``summary`` is the string form of
        the summary query engine's response (or the cached copy of it).
    """
    print(file_base)
    vi_out_path = f"./data/llamaindex_docs/{file_base}"  # output path for vector index
    summary_out_path = f"./data/llamaindex_docs/{file_base}_summary.pkl"  # output file for the summary text

    # Build and persist the vector index on first use, otherwise load it from disk.
    if not os.path.exists(vi_out_path):
        Path("./data/llamaindex_docs/").mkdir(parents=True, exist_ok=True)
        vector_index = VectorStoreIndex(nodes, service_context=service_context)
        vector_index.storage_context.persist(persist_dir=vi_out_path)
    else:
        vector_index = load_index_from_storage(
            StorageContext.from_defaults(persist_dir=vi_out_path),
            service_context=service_context,
        )

    # build the summary index
    summary_index = SummaryIndex(nodes, service_context=service_context)

    vector_query_engine = vector_index.as_query_engine()
    summary_query_engine = summary_index.as_query_engine(response_mode="tree_summarize")

    # Generate the summary once and cache it; later runs read the cached file.
    if not os.path.exists(summary_out_path):
        Path(summary_out_path).parent.mkdir(parents=True, exist_ok=True)
        summary = str(
            await summary_query_engine.aquery(
                "Extract a concise 1-2 line summary of this document"
            )
        )
        # Context manager guarantees the file is flushed and closed.
        with open(summary_out_path, "wb") as summary_file:
            pickle.dump(summary, summary_file)
    else:
        # NOTE: unpickling can execute arbitrary code; only load cache files
        # that this notebook wrote itself.
        with open(summary_out_path, "rb") as summary_file:
            summary = pickle.load(summary_file)

    query_tools = [
        QueryEngineTool(
            query_engine=vector_query_engine,
            metadata=ToolMetadata(
                name=f"vector_tool_{file_base}",
                description="Useful for questions related to specific facts",
            ),
        ),
        QueryEngineTool(
            query_engine=summary_query_engine,
            metadata=ToolMetadata(
                name=f"summary_tool_{file_base}",
                description="Useful for summarization questions",
            ),
        ),
    ]

    function_llm = OpenAI(model="mistralai/Mixtral-8x7B-Instruct-v0.1")

    agent = OpenAIAgent.from_llm(
        llm=function_llm,
        tools=query_tools,
        verbose=True,
        # system_prompt=dedent(f"""\
        #     You are a specialized agent designed to answer queries about the `{file_base}.html` part of the LlamaIndex docs.
        #     You must ALWAYS use at least one of the tools provided when answering a question; do NOT rely on prior knowledge.\
        #     """),
    )

    return agent, summary

In [18]:
async def build_agents(docs):
    """Create one agent per document.

    Each document is split into nodes and handed to ``build_agent_per_doc``
    under an ID made of the parent directory name and the file name (both
    without suffix), joined by an underscore.

    Args:
        docs: Documents whose ``metadata["path"]`` holds the source file path.

    Returns:
        A tuple ``(agents, info)``: ``agents`` maps document ID to its agent,
        and ``info`` maps document ID to ``{"summary": ..., "nodes": ...}``.
    """
    splitter = SentenceSplitter()

    agents_by_doc = {}
    info_by_doc = {}

    for document in tqdm(docs):
        doc_nodes = splitter.get_nodes_from_documents([document])
        # ID will be parent + base
        source_path = Path(document.metadata["path"])
        doc_id = f"{source_path.parent.stem}_{source_path.stem}"
        doc_agent, doc_summary = await build_agent_per_doc(doc_nodes, doc_id)

        agents_by_doc[doc_id] = doc_agent
        info_by_doc[doc_id] = {"summary": doc_summary, "nodes": doc_nodes}

    return agents_by_doc, info_by_doc
        

In [19]:
# Build agents for the first 7 documents only (top-level `await` works in a notebook cell).
agents_dict, extra_info_dict = await build_agents(docs[:7])

  0%|          | 0/7 [00:00<?, ?it/s]

latest_index


 14%|█▍        | 1/7 [00:09<00:58,  9.74s/it]

latest_genindex


#### Next we will build the retriever-enabled OpenAI agent
We are going to build the top level agent that orchestrates the use of the other agents to answer user queries

In [None]:
# Wrap every per-document agent as a tool for the top-level agent; the
# document's cached summary is used as the tool description.
all_tools = []

for file_base, agent in agents_dict.items():
    summary = extra_info_dict[file_base]["summary"]
    tool_metadata = ToolMetadata(name=f"tool_{file_base}", description=summary)
    doc_tool = QueryEngineTool(query_engine=agent, metadata=tool_metadata)
    all_tools.append(doc_tool)