In [27]:
from llama_index.core import (
    VectorStoreIndex,
    SimpleKeywordTableIndex,
    SimpleDirectoryReader,
)
from llama_index.core import SummaryIndex
from llama_index.core.schema import IndexNode
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.callbacks import CallbackManager

import llama_index.core

llama_index.core.set_global_handler("simple")

In [28]:
import os

# Load all PDF documents
customs_docs = {}
pdf_folder = "./content"
for pdf_file in os.listdir(pdf_folder):
    if pdf_file.endswith(".pdf"):
        print(pdf_file)
        customchapter_name = os.path.splitext(pdf_file)[0]
        print(customchapter_name)
        customs_docs[customchapter_name] = SimpleDirectoryReader(
            input_files=[os.path.join(pdf_folder, pdf_file)]
        ).load_data()
        print(customs_docs[customchapter_name])

chapter 27.pdf
chapter 27
[Document(id_='69c8a10a-53b8-4780-a26c-d9a8f133a730', embedding=None, metadata={'page_label': '1', 'file_name': 'chapter 27.pdf', 'file_path': 'content\\chapter 27.pdf', 'file_type': 'application/pdf', 'file_size': 642721, 'creation_date': '2024-03-14', 'last_modified_date': '2024-03-14', 'last_accessed_date': '2024-03-26'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='1 / Chap  27  \nCHAPITRE  27  \n \n \nCOMBUSTIBLES  MINERAUX,  HUILES  MINERALES  ET \nPRODUITS DE LEUR DISTILLATION; MATIERES  \nBITUMINEUSES;  CIRES  MINERALES  \n \nNotes.  \n \n1.- Le présent  Chapitre  ne comprend  pas : \na) les produits  organiques  de constitution  chimique  définie  présentés  isolément;  cette  exclusion  ne vise pas \nle méthane  et 

In [29]:
from llama_index.core import Settings
from langchain.embeddings.huggingface import HuggingFaceBgeEmbeddings
from llama_index.llms.ollama import Ollama

Settings.embed_model = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-base-en-v1.5")
llm = Ollama(model="llama2", request_timeout=300.0)

In [30]:
os.environ["OPENAI_API_KEY"] = "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"

In [31]:
from llama_index.core import load_index_from_storage, StorageContext
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import SummaryIndex, VectorStoreIndex
from llama_index.core.agent import ReActAgent
import os

# Initialize SentenceSplitter
node_parser = SentenceSplitter()

# Initialize agents and query engines dictionaries
agents = {}
query_engines = {}

# List to hold all nodes for baseline
all_nodes = []

# Iterate through each custom chapter
for pdf_file in os.listdir(pdf_folder):
    if pdf_file.endswith(".pdf"):
        customchapter_name = os.path.splitext(pdf_file)[0]
        
        # Parse nodes from the PDF document
        nodes = node_parser.get_nodes_from_documents(customs_docs[customchapter_name])
        all_nodes.extend(nodes)

        # Check if vector index directory exists
        vector_index_dir = f"./data/{customchapter_name}"
        if not os.path.exists(vector_index_dir):
            # If directory doesn't exist, build and persist vector index
            vector_index = VectorStoreIndex(nodes)
            vector_index.storage_context.persist(persist_dir=vector_index_dir)
        else:
            # If directory exists, load vector index from storage
            vector_index = load_index_from_storage(StorageContext.from_defaults(persist_dir=vector_index_dir))

        # Build summary index
        summary_index = SummaryIndex(nodes)

        # Define query engines
        vector_query_engine = vector_index.as_query_engine(llm=llm)
        summary_query_engine = summary_index.as_query_engine(llm=llm)

        # Define query engine tools
        query_engine_tools = [
            QueryEngineTool(
                query_engine=vector_query_engine,
                metadata=ToolMetadata(
                    name="vector_tool",
                    description=(
                        f"Useful for questions related to specific aspects of {customchapter_name} of customs regarding Morocco."
                    ),
                ),
            ),
            QueryEngineTool(
                query_engine=summary_query_engine,
                metadata=ToolMetadata(
                    name="summary_tool",
                    description=(
                        f"Useful for any requests that require a holistic summary of EVERYTHING about {customchapter_name} of customs regarding Morocco."
                    ),
                ),
            ),
        ]

        # Build OpenAIAgent
        agent = ReActAgent.from_tools(
            query_engine_tools,
            llm=llm,
            verbose=True,
            system_prompt=f"""\
            You are a specialized agent designed to answer queries about {customchapter_name} of customs regarding Morocco.
            You must ALWAYS use at least one of the tools provided when answering a question; do NOT rely on prior knowledge.\
            """,
        )

        # Add agent and query engine to dictionaries
        agents[customchapter_name] = agent
        query_engines[customchapter_name] = vector_index.as_query_engine(similarity_top_k=2,llm=llm)


In [32]:
# Define tool for each customs document agent
all_tools = []
for customchapter_name in customs_docs:
    # print(customchapter_name)
    customchapter_summary = (
        f"This content contains information about {customchapter_name} of customs regarding Morocco."
        f"Use this tool if you want to answer any questions about {customchapter_name} of customs regarding Morocco.\n"
    )
    doc_tool = QueryEngineTool(
        query_engine=agents[customchapter_name],
        metadata=ToolMetadata(
            name=f"tool_{customchapter_name}",
            description=customchapter_summary,
        ),
    )
    all_tools.append(doc_tool)


In [33]:
# define an "object" index and retriever over these tools
from llama_index.core import VectorStoreIndex
from llama_index.core.objects import ObjectIndex, SimpleToolNodeMapping

tool_mapping = SimpleToolNodeMapping.from_objects(all_tools)
obj_index = ObjectIndex.from_objects(
    all_tools,
    tool_mapping,
    VectorStoreIndex,
)

In [34]:
# define an "object" index and retriever over these tools
from llama_index.core import VectorStoreIndex
from llama_index.core.objects import (
    ObjectIndex,
    SimpleToolNodeMapping,
    ObjectRetriever,
)
from llama_index.core.retrievers import BaseRetriever
from llama_index.postprocessor.cohere_rerank import CohereRerank
from llama_index.core.query_engine import SubQuestionQueryEngine
from llama_index.llms.openai import OpenAI

tool_mapping = SimpleToolNodeMapping.from_objects(all_tools)
obj_index = ObjectIndex.from_objects(
    all_tools,
    tool_mapping,
    VectorStoreIndex,
)
vector_node_retriever = obj_index.as_node_retriever(similarity_top_k=10)


# define a custom retriever with reranking
class CustomRetriever(BaseRetriever):
    def __init__(self, vector_retriever, postprocessor=None):
        self._vector_retriever = vector_retriever
        self._postprocessor = postprocessor or CohereRerank(top_n=5)
        super().__init__()

    def _retrieve(self, query_bundle):
        retrieved_nodes = self._vector_retriever.retrieve(query_bundle)
        filtered_nodes = self._postprocessor.postprocess_nodes(
            retrieved_nodes, query_bundle=query_bundle
        )

        return filtered_nodes


# define a custom object retriever that adds in a query planning tool
class CustomObjectRetriever(ObjectRetriever):
    def __init__(self, retriever, object_node_mapping, all_tools, llm=None):
        self._retriever = retriever
        self._object_node_mapping = object_node_mapping
        self._llm = llm or OpenAI("gpt-4-0613")

    def retrieve(self, query_bundle):
        nodes = self._retriever.retrieve(query_bundle)
        tools = [self._object_node_mapping.from_node(n.node) for n in nodes]

        sub_question_engine = SubQuestionQueryEngine.from_defaults(
            query_engine_tools=tools, llm=self._llm
        )
        sub_question_description = f"""\
        Useful for any queries that involve comparing multiple documents. ALWAYS use this tool for comparison queries - make sure to call this \
        tool with the original query. Do NOT use the other tools for any queries involving multiple documents.
        """
        sub_question_tool = QueryEngineTool(
            query_engine=sub_question_engine,
            metadata=ToolMetadata(
                name="compare_tool", description=sub_question_description
            ),
        )

        return tools + [sub_question_tool]

In [35]:
os.environ['COHERE_API_KEY'] = 'tz56cKLq6j4PdFKXEW9K9XGOF4spOnCHHENAaY8W'
custom_node_retriever = CustomRetriever(vector_node_retriever)

# wrap it with ObjectRetriever to return objects
custom_obj_retriever = CustomObjectRetriever(
    custom_node_retriever, tool_mapping, all_tools, llm=llm
)

In [36]:
top_agent = ReActAgent.from_tools(
    tool_retriever=custom_obj_retriever,
    system_prompt=""" \
    You are an agent designed to answer queries about the moroccan customs chapters.
    Please always use the tools provided to answer a question. Do not rely on prior knowledge.\
    """,
    llm=llm,
    verbose=True,
)

In [37]:
response = top_agent.query("qu'est ce qui est exclus du chapitre 29 ?")
# print(str(response))

[1;3;38;5;200mThought: I need to use a tool to help me answer the question.
Action: tool_chapter
Action Input: {'input': 'What is excluded from chapter 29?', 'num_beams': 5}
[0m

KeyError: 'tool_chapter'