In [1]:
import os
import nest_asyncio
# NOTE(review): presumably required so the async clients used further down can
# run inside the notebook's already-running event loop — confirm.
nest_asyncio.apply()

from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import VectorStoreIndex
from llama_index.core import Settings
from llama_index.core import StorageContext
from llama_index.core.node_parser import MarkdownElementNodeParser

from llama_parse import LlamaParse
from llama_index.vector_stores.astra import AstraDBVectorStore
from llama_index.postprocessor.flag_embedding_reranker import FlagEmbeddingReranker

from dotenv import load_dotenv
# Called before the configuration cell reads its settings from os.environ.
# NOTE(review): presumably loads them from a local .env file — confirm the file
# is present and kept out of version control.
load_dotenv()

True

In [2]:
# Credentials and connection settings, looked up in the process environment.
# A variable that is not set yields None rather than raising.

# Service API keys.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")

# Astra DB connection settings, passed to the vector stores created below.
ASTRA_TOKEN = os.getenv("ASTRA_DB_APPLICATION_TOKEN")
ASTRA_API_ENDPOINT = os.getenv("ASTRA_API_ENDPOINT")
ASTRA_NAMESPACE = os.getenv("ASTRA_DB_KEYSPACE")

In [3]:
# Embedding model: created, then registered as the global default.
embed_model = OpenAIEmbedding(model="text-embedding-3-small")
Settings.embed_model = embed_model

# Chat model: created, then registered as the global default. The same `llm`
# object is also handed to the markdown element parser later on.
llm = OpenAI(model="gpt-4o-mini")
Settings.llm = llm

In [6]:
# Parse the transcript PDF with LlamaParse, asking for markdown output.
transcript_pdf = "../transcripts/legal-basis/MERGED TSN AUG. 5, 2024.pdf"
parser = LlamaParse(result_type="markdown")
documents = parser.load_data(transcript_pdf)

Started parsing the file under job_id aa4ad480-7c34-4548-933d-18621dde965e


In [7]:
# Sanity check: show the first 1000 characters of the first parsed document.
first_document_preview = documents[0].text[:1000]
print(first_document_preview)

# MONDAY, AUGUST 5, 2024

# OPENING OF THE SESSION

At 3:07 p.m., the President Pro Tempore, Hon. Jinggoy Ejercito Estrada, called the session to order.

The President Pro Tempore. The 6th session of the Senate in the Third Regular Session of the Nineteenth Congress is hereby called to order.

Let us all stand for the opening prayer to be led by Sen. Raffy T. Tulfo.

Everybody rose for the prayer.

# PRAYER

Senator Tulfo. Let us remember that we are in the Holy Presence of God.

Almighty Father, with grateful hearts, we rejoice in the victory of our athletes in the Paris Olympics, especially the gold medals won by Carlos Yulo. May we take these wins, not just as a reason to celebrate, but as an inspiration to represent our nation in the best way we can. While we cannot all be athletes, we are all called to strive to push our country to its full potential.

Father, guide our hearts, our minds, and our spirits so that whatever we do, we can proudly call ourselves Filipinos.

We lift up 

In [8]:
# Settings common to both collections; only the collection name differs.
astra_connection = {
    "token": ASTRA_TOKEN,
    "api_endpoint": ASTRA_API_ENDPOINT,
    "namespace": ASTRA_NAMESPACE,
    "embedding_dimension": 1536,
}

# Collection used for the recursive (base nodes + objects) index.
astra_db_store_advanced = AstraDBVectorStore(
    collection_name="astra_v_table_llamaparse_advanced_new",
    **astra_connection,
)

# Collection used for the baseline index built directly from the documents.
astra_db_store_base = AstraDBVectorStore(
    collection_name="astra_v_table_llamaparse_base",
    **astra_connection,
)

In [9]:
# Markdown element parser that reuses the notebook's `llm` and is configured
# with 8 workers.
node_parser = MarkdownElementNodeParser(
    llm=llm,
    num_workers=8,
)

In [10]:
# Run the element parser over the LlamaParse documents. The resulting `nodes`
# are previewed in the next cell and later split by get_nodes_and_objects.
nodes = node_parser.get_nodes_from_documents(documents)

0it [00:00, ?it/s]
1it [00:00, 29330.80it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [

In [11]:
# Preview a bounded sample of the parsed nodes. Printing the full content of
# every node floods the notebook output for a long transcript, so the preview
# is capped by node count and by characters per node. Raise the limits (or set
# them to None) to inspect everything.
MAX_PREVIEW_NODES = 5
MAX_PREVIEW_CHARS = 1000

for idx, n in enumerate(nodes[:MAX_PREVIEW_NODES]):
    print(idx)
    print(n.get_content()[:MAX_PREVIEW_CHARS])

# Report how much was shown so the truncation is never silent.
print(f"... showed {len(nodes[:MAX_PREVIEW_NODES])} of {len(nodes)} nodes")

0
MONDAY, AUGUST 5, 2024

 OPENING OF THE SESSION

At 3:07 p.m., the President Pro Tempore, Hon. Jinggoy Ejercito Estrada, called the session to order.

The President Pro Tempore. The 6th session of the Senate in the Third Regular Session of the Nineteenth Congress is hereby called to order.

Let us all stand for the opening prayer to be led by Sen. Raffy T. Tulfo.

Everybody rose for the prayer.

 PRAYER

Senator Tulfo. Let us remember that we are in the Holy Presence of God.

Almighty Father, with grateful hearts, we rejoice in the victory of our athletes in the Paris Olympics, especially the gold medals won by Carlos Yulo. May we take these wins, not just as a reason to celebrate, but as an inspiration to represent our nation in the best way we can. While we cannot all be athletes, we are all called to strive to push our country to its full potential.

Father, guide our hearts, our minds, and our spirits so that whatever we do, we can proudly call ourselves Filipinos.

We lift up to

In [12]:
# Split the parsed nodes into two lists; both are concatenated again when the
# recursive index is built.
# NOTE(review): presumably `base_nodes` are plain text nodes and `objects` are
# index nodes pointing at tables — confirm against the parser's documentation.
base_nodes, objects = node_parser.get_nodes_and_objects(nodes)

In [13]:
# Tag every base node with the source document title so that any retrieved
# chunk can be traced back to its transcript. Previously only the first node
# received the title, and its whole metadata dict was replaced in the process.
# The title is now merged into each node's existing metadata, so any keys
# already present on a node are kept.
DOCUMENT_TITLE = "MERGED TSN AUG. 5, 2024"

for node in base_nodes:
    node.metadata = {**node.metadata, "document_title": DOCUMENT_TITLE}

In [14]:
# Display the first base node's metadata to confirm the document title is set.
base_nodes[0].metadata

{'document_title': 'MERGED TSN AUG. 5, 2024'}

In [15]:
# One storage context per Astra collection.
storage_context_advanced = StorageContext.from_defaults(
    vector_store=astra_db_store_advanced,
)
storage_context_base = StorageContext.from_defaults(
    vector_store=astra_db_store_base,
)

# Recursive index: base nodes and objects together, in the "advanced" store.
# NOTE(review): re-running this cell presumably writes the same nodes into the
# existing collections again — confirm whether duplicates are a concern.
recursive_nodes = base_nodes + objects
recursive_index = VectorStoreIndex(
    nodes=recursive_nodes,
    storage_context=storage_context_advanced,
)

# Baseline index: built straight from the LlamaParse documents, in the "base" store.
raw_index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context_base,
)

In [29]:
# Reranker configured with top_n=3 and the BAAI/bge-reranker-large model; it is
# attached as a node post-processor to both query engines below.
reranker = FlagEmbeddingReranker(top_n=3, model="BAAI/bge-reranker-large")

# Engine over the recursive index (verbose output enabled).
recursive_engine_options = {
    "similarity_top_k": 5,
    "node_postprocessors": [reranker],
    "verbose": True,
}
recursive_query_engine = recursive_index.as_query_engine(**recursive_engine_options)

# Engine over the baseline index, with the same retrieval depth and reranker.
raw_query_engine = raw_index.as_query_engine(
    similarity_top_k=5,
    node_postprocessors=[reranker],
)

In [30]:
query = "What is the timeline for incentives under EDA?"

# Only the baseline engine is queried in this cell. The matching comparison
# against the recursive engine is currently disabled; to restore it, call
# `recursive_query_engine.query(query)` and print the result under the label
# "LlamaParse + Recursive Retriever Query Engine".
response_1 = raw_query_engine.query(query)
print("LlamaParse + Basic Query Engine", response_1, sep="\n")

LlamaParse + Basic Query Engine
The timeline for incentives under the Export Development Act (EDA) was from 1995 to 1999. Specific incentives had different expiration dates, such as duty-free importation of machinery and equipment until December 31, 1997, and tax credits for imported raw materials not locally available until December 31, 1999.


In [41]:
# Ask the baseline engine a second question and print its answer under a label.
query = "What is CREATE MORE?"
response_1 = raw_query_engine.query(query)

print("LlamaParse + Basic Query Engine", response_1, sep="\n")

LlamaParse + Basic Query Engine
The provided context does not contain any information about "CREATE MORE." Therefore, I cannot provide an answer regarding that topic.
