In [44]:
# Patch asyncio via nest_asyncio before any async-capable engine is built.
# NOTE(review): presumably required because the notebook kernel already runs an
# event loop and the summary query engine is created with use_async=True — confirm.
import nest_asyncio
nest_asyncio.apply()

In [45]:
from llama_index.core import SimpleDirectoryReader,VectorStoreIndex,SummaryIndex,Settings
from llama_index.core.node_parser import SentenceSplitter
# from llama_index.embeddings.fastembed import FastEmbedEmbedding
from llama_index.core.tools import QueryEngineTool
from llama_index.core.query_engine.router_query_engine import RouterQueryEngine
from llama_index.core.selectors import LLMSingleSelector
from llama_index.llms.openai import OpenAI
from sentence_transformers import SentenceTransformer
from pathlib import Path
from llama_parse import LlamaParse
from llama_index.embeddings.openai import OpenAIEmbedding
from dotenv import load_dotenv
import os
from pathlib import Path

# Load environment variables (python-dotenv) before any client is constructed.
# NOTE(review): the LlamaParse key is looked up as LLAMA_CLOUD_API_KEY_2 via
# os.getenv; the OpenAI clients presumably read their key from the environment
# as well — confirm the .env file provides both.
load_dotenv()

# Source PDF lives in ../data relative to the directory the kernel was started in.
DATA_PATH = Path.cwd().parent.joinpath("data", "V11_Argumentaire_Peugeot_2024.pdf")

In [39]:
# Parse the source PDF with LlamaParse, using the vendor multimodal model
# "openai-gpt4o".
#
# This cell defines `documents`, which the node-splitting cell consumes. It must
# therefore execute on every fresh kernel: with the code commented out the
# notebook only works while an earlier session still holds `documents`, and
# Restart & Run All fails with a NameError.
# NOTE(review): each run submits a new parsing job to the LlamaParse service;
# consider caching the parsed documents if that cost matters.
parser = LlamaParse(
    api_key=os.getenv("LLAMA_CLOUD_API_KEY_2"),
    use_vendor_multimodal_model=True,
    vendor_multimodal_model_name="openai-gpt4o",
)

# Route .pdf files through the LlamaParse parser when reading the input file.
file_extractor = {".pdf": parser}
documents = SimpleDirectoryReader(
    input_files=[DATA_PATH],
    file_extractor=file_extractor,
).load_data()

# Save the parsed text to a markdown file for inspection. The encoding is set
# explicitly so non-ASCII characters in the parsed text do not depend on the
# platform's default encoding.
with open("parsed_result_gpt.md", "w", encoding="utf-8") as result_file:
    for doc in documents:
        result_file.write(doc.text)

Started parsing the file under job_id a4c9a2f2-7778-49f4-b2d4-75ffbaad0978


In [46]:
# Configure the global LlamaIndex Settings: embedding model, chunk size and LLM.
embedding_model_name = "text-embedding-3-small"
llm_model_name = "gpt-4-turbo-2024-04-09"

Settings.embed_model = OpenAIEmbedding(model=embedding_model_name)
Settings.chunk_size = 1024

print("creating instance of LLM")
Settings.llm = OpenAI(model=llm_model_name)

creating instance of LLM


In [47]:
# Split the parsed documents into nodes: chunk size 1024 with an overlap of 100.
splitter = SentenceSplitter(
    chunk_size=1024,
    chunk_overlap=100,
)
nodes = splitter.get_nodes_from_documents(documents)

# Report how many nodes the splitter produced.
node_count = len(nodes)
print(f"Length of nodes : {node_count}")

Length of nodes : 55


In [48]:
# Build two indexes over the same node list: one for summarization, one for
# vector retrieval.
summary_index = SummaryIndex(nodes)
vector_index = VectorStoreIndex(nodes)

# The summary engine uses the "tree_summarize" response mode with async enabled;
# the vector engine keeps the library defaults.
summary_engine_options = {"response_mode": "tree_summarize", "use_async": True}
summary_query_engine = summary_index.as_query_engine(**summary_engine_options)
vector_query_engine = vector_index.as_query_engine()

# Wrap each query engine in a tool; the descriptions are the text the router's
# selector is given to choose between them.
summary_description = "Useful for summarization questions related to the document"
vector_description = "Useful for retrieving specific context from the document."

summary_tool = QueryEngineTool.from_defaults(
    query_engine=summary_query_engine,
    description=summary_description,
)
vector_tool = QueryEngineTool.from_defaults(
    query_engine=vector_query_engine,
    description=vector_description,
)

# Router that lets an LLM-based single selector pick one tool per query.
# Tool order is kept (summary first, vector second) because the verbose log
# refers to the chosen engine by its position in this list.
router_tools = [summary_tool, vector_tool]
query_engine = RouterQueryEngine(
    selector=LLMSingleSelector.from_defaults(),
    query_engine_tools=router_tools,
    verbose=True,
)

# Ask a sample question through the router, then show the answer text and the
# number of source nodes attached to the response.
question = "Explain the advantages of e-208 electric ?"
response = query_engine.query(question)
print(response)
print(len(response.source_nodes))

[1;3;38;5;200mSelecting query engine 1: This choice is relevant because the question asks for specific details about the advantages of the e-208 electric, which requires retrieving specific context from the document..
[0mThe E-208, being a fully electric vehicle (BEV), offers the advantage of zero CO₂ emissions during driving. This makes it an environmentally friendly option, contributing to reduced air pollution. Additionally, as a BEV, it qualifies for ecological bonuses in many regions, enhancing its affordability and appeal to eco-conscious consumers.
2
