In [1]:
import os
import nest_asyncio
nest_asyncio.apply()

from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import VectorStoreIndex
from llama_index.core import Settings
from llama_index.core import StorageContext
from llama_index.core.node_parser import MarkdownElementNodeParser

from llama_parse import LlamaParse
from llama_index.vector_stores.astra import AstraDBVectorStore
from llama_index.postprocessor.flag_embedding_reranker import FlagEmbeddingReranker

from dotenv import load_dotenv
load_dotenv()

True

In [2]:
# Credentials and connection settings, read from the process environment.
# Each value is None when the corresponding variable is not set.

# Model / parsing service keys.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
LLAMA_CLOUD_API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")

# Astra DB connection settings.
ASTRA_TOKEN = os.getenv("ASTRA_DB_APPLICATION_TOKEN")
ASTRA_API_ENDPOINT = os.getenv("ASTRA_API_ENDPOINT")
ASTRA_NAMESPACE = os.getenv("ASTRA_DB_KEYSPACE")

In [3]:
# Vector store backed by the "senate_transcripts" Astra DB collection.
# NOTE(review): embedding_dimension=1536 presumably has to match the output
# size of the embedding model configured for indexing — confirm.
astra_store_options = {
    "token": ASTRA_TOKEN,
    "api_endpoint": ASTRA_API_ENDPOINT,
    "namespace": ASTRA_NAMESPACE,
    "collection_name": "senate_transcripts",
    "embedding_dimension": 1536,
}
astra_db_store_advanced = AstraDBVectorStore(**astra_store_options)

In [4]:
# Instantiate the chat model and the embedding model, then register both on
# the global LlamaIndex `Settings` object.
llm = OpenAI(model="gpt-4o-mini")
embed_model = OpenAIEmbedding(model="text-embedding-3-small")

Settings.llm, Settings.embed_model = llm, embed_model

In [5]:
# Disabled batch-ingestion loop, kept for reference only (nothing in this cell
# executes). It lists every file under `start_path`, parses it to markdown with
# LlamaParse, tags the base nodes with a title, and indexes them into Astra DB.
# NOTE(review): `load_data` is given `files[0]` rather than the loop variable
# `f`, so every iteration would parse the first file while tagging nodes as `f`.
# NOTE(review): the bare `except:` reports every failure as "unreadable",
# including parsing, LLM, and vector-store errors; catch `Exception` and print
# the error before re-enabling this cell.
# start_path = "../transcripts/legal-basis"
# # doc_title = "CREATE_RA 11534_SB 1357_TCM and TSP Plenary Deliberations /"
# files = os.listdir(start_path)

# # files = ["../transcripts/legal-basis/RA12066.pdf"]

# for f in files:
#     try:
#         documents = LlamaParse(result_type="markdown").load_data(os.path.join(start_path, files[0]))

#         node_parser = MarkdownElementNodeParser(llm=llm, num_workers=8)
#         nodes = node_parser.get_nodes_from_documents(documents)
#         base_nodes, objects = node_parser.get_nodes_and_objects(nodes)

#         for i, j in enumerate(base_nodes):
#             # j.metadata["title"] = f"{doc_title} {f}"
#             j.metadata["title"] = f
        
#         storage_context_advanced = StorageContext.from_defaults(vector_store=astra_db_store_advanced)
#         recursive_index = VectorStoreIndex(nodes=base_nodes+objects, storage_context=storage_context_advanced)

#     except:
#         print(f"{f} unreadable!")


In [8]:
# Parse a single PDF to markdown, split it into text nodes and table objects,
# tag the text nodes with the source file name, and index everything into the
# Astra DB vector store.

# NOTE(review): absolute, machine-specific path; move it to a configurable
# data directory so the notebook runs on other machines.
document_path = "/Users/katecastillo/Documents/learning/local-rag-deployment/transcripts/legal-basis/RA10963.pdf"
# Derive the citation title from the path so the two can never drift apart
# (evaluates to "RA10963.pdf" for the path above).
document_title = os.path.basename(document_path)

documents = LlamaParse(result_type="markdown").load_data(document_path)

node_parser = MarkdownElementNodeParser(llm=llm, num_workers=8)
nodes = node_parser.get_nodes_from_documents(documents)
base_nodes, objects = node_parser.get_nodes_and_objects(nodes)

# Only the base nodes receive a title; `objects` are indexed without one.
# NOTE(review): confirm that leaving `objects` untitled is intended.
for node in base_nodes:
    node.metadata["title"] = document_title

storage_context_advanced = StorageContext.from_defaults(vector_store=astra_db_store_advanced)
recursive_index = VectorStoreIndex(
    nodes=base_nodes + objects,
    storage_context=storage_context_advanced,
)

Started parsing the file under job_id 819acd4a-da07-4c33-a12e-2a9774c81508
.

0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
1it [00:00, 7244.05it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
1it [00:00, 30174.85it/s]
0it [00:00, ?it/s]
1it [00:00, 16131.94it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
1it [00:00, 16384.00it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it 

In [7]:
# Display the parsed base nodes (prints the repr of the whole list).
base_nodes

[TextNode(id_='a5e051da-30b8-4129-881b-a1addb89f076', embedding=None, metadata={'title': 'RA12066.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='f5533971-eb08-41f6-94dd-0c01edfd7818', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='b917c6b54e22440f8cd523f78d748442ff110d88ce4014edbf6570f6ddc6366d')}, text='Republic of the Philippines\n\n Congress of the Philippines\n\n Nineteenth Congress\n\n Third Regular Session\n\nBegun and held in Metro Manila, on Monday, the twenty-second day of July, two thousand twenty-four\n\nREPUBLIC ACT No. 12086\n\n AN ACT AMENDING SECTIONS 27, 28, 32, 34, 57, 106, 108, 109, 112, 135, 237, 237-4, 269, 292, 293, 294, 295, 296, 297, 300, 301, 308, 309, 310, AND 311, AND ADDING NEW SECTIONS 135-A, 295-A; 296-4, AND 297-A OF THE NATIONAL INTERNAL REVENUE CODE OF 1997, AS AMENDED, AND FOR OTHER PURPOSES\n\nBe it enacted by the Senate and House of Representative

In [14]:
# Display the repr of the index object built in the ingestion cell.
recursive_index

<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x14f891850>

In [10]:
# storage_context_advanced = StorageContext.from_defaults(vector_store=astra_db_store_advanced)
# recursive_index = VectorStoreIndex(storage_context=storage_context_advanced)

# Reranking post-processor applied to the retrieved nodes.
# NOTE(review): presumably 5 candidates are retrieved and the reranker keeps
# the best 3 — confirm against the library documentation.
rerank_model_name = "BAAI/bge-reranker-large"
reranker = FlagEmbeddingReranker(model=rerank_model_name, top_n=3)

# Query engine over the index, with the reranker as its only post-processor.
query_engine_options = {
    "similarity_top_k": 5,
    "node_postprocessors": [reranker],
    "verbose": True,
}
recursive_query_engine = recursive_index.as_query_engine(**query_engine_options)

In [11]:
# Instruction prompt prepended to every question sent to the query engine.
# It defines the assistant persona, the citation format ([^n] footnotes), and
# a worked example of an input and the expected output.
#
# Fixed relative to the previous version:
#   * the example input list had a doubled closing bracket ("},]" then "]");
#   * the "End of Example Input" marker now matches the other "##" delimiters.
#
# NOTE(review): the example output cites [^4] although the example input only
# has sources 1-3, and it uses bullet points although instruction 2 asks for
# paragraphs. Left unchanged here; revise the example if answers show
# unsupported citations or bulleted lists.
# The "\n" sequences are ordinary escapes (the string is not raw), so they
# become real newlines in addition to the physical line breaks.
SYSTEM_PROMPT = """
You are AVA, a helpful assistant for the Department of Budget and Management (DBM) that contains information about past projects and other available documents published by the DBM.

Respond by following these instructions:\n
1. Assign every relevant source a number `n` so that EVERY conclusion, fact, markdown table, and/or derivative from a source uses Github Markdown Flavored [^n] citation corresponding to its source.\n
2. Organize your response in paragraph format instead of using bullet points and be very specific with your responses.
3. Use the phrase "Based on internal/external information..." if you will refer to internal/external sources.
4. If internal/external information is not provided, do not mention its absence.
5. Create a statement before the References section with at least 1 citation [^n] that synthesizes and summarizes your response.\n
    - With each reference, they must follow the format of `[^n]: [Title]`\n
    - Do not repetitively cite the same reference
6. Answer the question directly using only the information shared with you.\n

Here is an example of an input:\n\n
## Start of Example Input ##
"[
    {
        'Citation Number': 1,
        'Title': 'DTI submission EMB Submission re CREATE MORE',
        'Content': 'Number of entities registered with Export Marketing Bureau from 2018 to 2024...',
    },
    {
        'Citation Number': 2,
        'Title': 'PHIVIDEC-IA Position Paper CREATE MORE',
        'Content': 'The PHIVIDEC Industrial Authority (PHIVIDEC-IA), through its Administrator and Chief Executive Officer (CEO)...',
    },
    {
        'Citation Number': 3,
        'Title': 'PPMC - Position Paper - SB No. 2654 and HB No. 9794',
        'Content': 'This has reference to your letter, dated May 7, 2024, requesting Poro Point Management Corporation...',
    },
]\n\n
## End of Example Input ##
\n\n
Answer the following question acting as if the above was from your knowledge: What can Vietnam do to integrate?"\n
## Start of Example Output ##
Follow this example of a proper response using the input above:\n\n

"Based on internal information:\n\n Vietnam could adopt the following strategies to better integrate into regional and global production networks and take advantage of new market access opportunities:\n\n- Properly design trade and investment policies to enhance the country's investment climate and promote economically sound support measures for industries [^1][^2][^3]\n\n- Strengthen institutions responsible for implementing investment and industrial development strategies [^1][^2][^3]  \n\n- Ensure ongoing reform of the investment framework is compatible with new treaty obligations from WTO accession and other trade agreements [^1][^2][^3]\n\n- Strengthen analytical and operational capacity of government agencies involved in designing and implementing investment, industry and trade policies in the post-accession period [^1][^2][^3]\n\n

Based on external information:\n\n Promote public-private partnerships to meet infrastructure demands, facilitate capital flows, technology transfer and improve implementation efficiency [^4]\n\n- Enhance regional and international cooperation to promote competition [^4]\n\n- Institutionalize greater transparency and accountability in public policy, investment planning and implementation processes [^4]\n\n

Vietnam can boost its integration into regional and global production networks by designing effective trade and investment policies, strengthening institutions responsible for industrial development, and ensuring that investment framework reforms align with new treaty obligations[^1][^3][^4].Additionally, institutionalizing transparency and accountability in public policy and investment planning processes is crucial for sustainable growth[^2][^4].\n\n

\n
[^1]: [DTI submission EMB Submission re CREATE MORE]\n[^2]: [PHIVIDEC-IA Position Paper CREATE MORE]\n[^3]: [PPMC - Position Paper - SB No. 2654 and HB No. 9794]"

## End of Example Output ##
"""

In [13]:
# Ask a single question, with the instruction prompt prepended to it.
# NOTE(review): the full prompt text (instructions plus the worked example) is
# passed as the query string, so it is presumably also what the retriever
# embeds — confirm, and consider supplying the instructions separately.
question = "What is the manifestation of Senator Poe based on the MERGED TSN AUG. 5, 2024 document?"
query = f"{SYSTEM_PROMPT}\n{question}"

response = recursive_query_engine.query(query)
print(response.response)

Based on internal information, Senator Poe's manifestation reflects a proactive approach to enhancing the investment climate in the Philippines, particularly in light of the competitive landscape presented by other countries, such as Vietnam. She emphasizes the need for the Philippines to adapt its investment strategies to attract foreign investments effectively. This includes advocating for a review of existing laws and frameworks to ensure they are responsive to the changing dynamics of global investment, especially post-pandemic. Senator Poe's insights suggest a focus on collaboration among various stakeholders, including government agencies and industry representatives, to create a more favorable environment for investors. 

In summary, Senator Poe's manifestation underscores the importance of agility in policy-making and the necessity for the Philippines to remain competitive in attracting investments by learning from the strategies employed by other nations[^1].

[^1]: [CREATE MO

In [None]:
# Canned questions for exercising the query engine. Every entry starts with a
# newline, presumably so it can be appended directly to SYSTEM_PROMPT — confirm.
TOPIC_LIST = [
    "\n" + topic
    for topic in (
        "Provide a detailed synthesis on the differences of Senate Bill No. 2654 and House Bill No. 9794 on enhancing tax incentives",
        "Create a synthesis on the main changes to tax incentives proposed in the Senate Bill No. 2654 compared to the original CREATE law",
        "What inconsistencies between the CREATE law and its implementing regulations were discussed in the May 9, 2024 hearing?",
        "What key issues did stakeholders raise about red tape and policy stability at the May 9, 2024 hearing for CREATE MORE?",
        "What are the key differences of Senate Bill No. 2654 and House Bill No. 9794? Return your comparisons as a table",
    )
]


Track the changes and amendments made to the National Internal Revenue Code, CREATE Law (RA11534), and CREATE MORE. Compare the changes and return your comparison as a table

Track the changes and amendments made to the National Internal Revenue Code, CREATE Law (RA 11534), and CREATE MORE (RA 12066), focusing on laws and provisions specifically for taxes. Compare the changes and return your comparison as a table

Track the changes made to, and the sections amended in, the National Internal Revenue Code, CREATE Law (RA 11534), and CREATE MORE (RA 12066), focusing on laws and provisions specifically for taxes. Compare the changes and return your comparison as a table

Generate the journal from the uploaded transcript