<a href="https://colab.research.google.com/github/aswinaus/Agents/blob/main/ReAct_Agent_llama_Index.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install llama-index -q
!pip install langchain -q
!pip install langchain_experimental -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m72.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.4/253.4 kB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m300.7/300.7 kB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m63.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.2/209.2 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m89.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os
import nest_asyncio
# Patch asyncio up front; presumably needed so that async LlamaIndex calls
# (e.g. use_async=True further down) can run inside the notebook's event loop - TODO confirm.
nest_asyncio.apply()

In [3]:
from google.colab import userdata
# Publish the OpenAI API key, read from Colab's userdata store, as an environment variable
os.environ.update({"OPENAI_API_KEY": userdata.get('OPENAI_API_KEY')})

In [4]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings
# Global LlamaIndex settings: the chat model, the embedding model used for indexing
# the documents, and the chunk size.
LLM_MODEL_NAME = 'gpt-4o-mini'
LLM_TEMPERATURE = 0.2
EMBED_MODEL_NAME = 'text-embedding-3-small'
CHUNK_SIZE = 1024

Settings.llm = OpenAI(model=LLM_MODEL_NAME, temperature=LLM_TEMPERATURE)
Settings.embed_model = OpenAIEmbedding(model=EMBED_MODEL_NAME)
Settings.chunk_size = CHUNK_SIZE

In [5]:
from google.colab import drive
# Mount Google Drive and point data_dir at the Drive root used by the rest of the notebook
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)
data_dir = f"{DRIVE_MOUNT_POINT}/MyDrive"  # Input a data dir path from your mounted Google Drive

Mounted at /content/drive


In [6]:
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.query_engine import RouterQueryEngine
from llama_index.core.selectors import LLMSingleSelector
from llama_index.core import SimpleDirectoryReader
from llama_index.core import StorageContext, load_index_from_storage
from llama_index.core import VectorStoreIndex, SummaryIndex

In [7]:
# To avoid re-embedding documents on every run, each index is persisted to disk and
# reloaded when present. data_dir is used as-is (no extra leading "/"), matching how the
# other data paths in this notebook are built.
PERSIST_INDEX_DIR = f"{data_dir}/RAG/data/"


def get_index(index_name, doc_file_path):
  """Return the index stored under `index_name`, building and persisting it on first use.

  Args:
    index_name: Name of the sub-directory of PERSIST_INDEX_DIR that holds the index.
    doc_file_path: Path of the single document file to index when no persisted
      copy exists yet.

  Returns:
    The index loaded from disk when its persist directory exists; otherwise a newly
    built VectorStoreIndex that has just been persisted to that directory.
  """
  persist_dir = f"{PERSIST_INDEX_DIR}{index_name}/"

  if os.path.exists(persist_dir):
    # Load index from disk
    storage_context = StorageContext.from_defaults(persist_dir=persist_dir)
    return load_index_from_storage(storage_context)

  # Load the document, build the index and store it to disk for the next run
  documents = SimpleDirectoryReader(input_files=[doc_file_path]).load_data()
  index = VectorStoreIndex.from_documents(documents)
  index.storage_context.persist(persist_dir)
  return index

In [8]:
# Folders holding the source documents for each document set
oecd_docs_dir = f"{data_dir}/RAG/data/OECD/"
form990_docs_dir = f"{data_dir}/RAG/data/Form990/"

# Load the OECD Transfer Pricing guideline documents
docs_OECD_guidelines = SimpleDirectoryReader(oecd_docs_dir).load_data()
# Load the Form990 guideline documents
docs_Form990_guidelines = SimpleDirectoryReader(form990_docs_dir).load_data()

In [9]:
# Split both document sets into nodes using the globally configured node parser
oecd_nodes = Settings.node_parser.get_nodes_from_documents(docs_OECD_guidelines)
form990_nodes = Settings.node_parser.get_nodes_from_documents(docs_Form990_guidelines)

# A fresh storage context shared by the OECD summary index and the OECD vector index
oecd_storage_context = StorageContext.from_defaults()

# NOTE(review): the Form990 nodes are registered in this OECD-named docstore as well, although
# only oecd_nodes are indexed below - confirm this is intended.
for node_batch in (oecd_nodes, form990_nodes):
  oecd_storage_context.docstore.add_documents(node_batch)

# Summary and vector indices over the OECD nodes, both backed by the shared storage context
oecd_summary_index = SummaryIndex(oecd_nodes, storage_context=oecd_storage_context)
oecd_vector_index = VectorStoreIndex(oecd_nodes, storage_context=oecd_storage_context)

# Per-document indices: get_index loads each one from disk when present, else builds and stores it
OECD_index = get_index(
    "OECDTPGuidelines",
    f"{data_dir}/RAG/data/OECD/OECD_Transfer_Pricing_Guidelines.pdf",
)
form990_guidelines_index = get_index(
    "Form990Guidelines",
    f"{data_dir}/RAG/data/Form990/Form990_Guidelines.pdf",
)

In [10]:
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.query_engine import RouterQueryEngine
from llama_index.core.selectors import LLMSingleSelector
# Create one query engine per index (similarity_top_k=3 for each)
OECD_engine = OECD_index.as_query_engine(similarity_top_k=3)
form990_guidelines_engine = form990_guidelines_index.as_query_engine(similarity_top_k=3)

# Wrap the query engines as tools. The description is the text the LLM reads when deciding
# which tool to call, so it must be complete and unambiguous.
OECD_query_tool = QueryEngineTool(
    query_engine=OECD_engine,
    metadata=ToolMetadata(
        name="OECD_QueryEngineTool_2022",
        description="Provides information about Transfer Pricing Guidelines for Organization from OECD for year 2022",
    ),
)

Form990_query_tool = QueryEngineTool(
    query_engine=form990_guidelines_engine,
    metadata=ToolMetadata(
        name="form990_2022",
        # The description was previously cut off mid-word ("Non-Profit Organi")
        description="Provides information about Form990 filling guidelines for Non-Profit Organizations",
    ),
)

tools = [OECD_query_tool, Form990_query_tool]

# Router that lets an LLM selector pick exactly one of the tools for each query
filing_engine = RouterQueryEngine(
    selector=LLMSingleSelector.from_defaults(),
    query_engine_tools=tools,
)

In [11]:
#Agentic Router RAG -
from llama_index.agent.openai import OpenAIAgent
# Build an OpenAI function-calling agent over the two query tools
agent = OpenAIAgent.from_tools(tools=tools, verbose=True)

# Uncomment and use the below call for interactive session
#agent.chat_repl()

form990_ez_question = "What is Form990 EZ and when should an organiaztion complete Form990 EZ form"
response = agent.chat(form990_ez_question)
print(response)

Added user message to memory: What is Form990 EZ and when should an organiaztion complete Form990 EZ form
=== Calling Function ===
Calling function: form990_2022 with args: {"input":"What is Form 990 EZ and when should an organization complete it?"}
Got output: Form 990-EZ is a shorter version of Form 990, designed for organizations that have gross receipts between $200,000 and $500,000 or total assets between $500,000 and $1,250,000 at the end of the tax year. Organizations exempt from income tax under section 501(a) that meet these criteria must complete Form 990-EZ as their annual information return. It is used to provide the IRS with information about the organization's financial activities and compliance with tax regulations.

Form 990-EZ is a shorter version of Form 990, specifically designed for organizations that have gross receipts between $200,000 and $500,000 or total assets between $500,000 and $1,250,000 at the end of the tax year. Organizations that are exempt from income

In [12]:
from llama_index.agent.openai import OpenAIAssistantAgent
# Create a new OpenAI Assistant-backed agent with the same tools, then ask it one question
ASSISTANT_NAME = "OECD and Form990 Agent"
ASSISTANT_INSTRUCTIONS = "You are an assistant that provides answers to questions on OECD and Form990"

agent = OpenAIAssistantAgent.from_new(
    name=ASSISTANT_NAME,
    instructions=ASSISTANT_INSTRUCTIONS,
    tools=tools,
    verbose=True,
    run_retrieve_sleep_time=1.0,
)

oecd_articles_question = "What does Articles 9 and 25 of the OECD Model Tax Convention state?"
response = agent.chat(oecd_articles_question)
print(response)

=== Calling Function ===
Calling function: OECD_QueryEngineTool_2022 with args: {"input": "Article 9"}
Got output: Article 9 addresses the issue of corresponding adjustments in transfer pricing. It highlights that relief under this article may not be available if the time limit for making such adjustments, as specified by treaty or domestic law, has expired. While the article does not define a specific time limit for corresponding adjustments, jurisdictions may adopt different approaches. Some may prefer an open-ended timeframe to mitigate double taxation, while others may find this approach administratively unreasonable. Consequently, the availability of relief can depend on whether the applicable treaty overrides domestic time limits, establishes its own, or links relief to domestic law time limits.
=== Calling Function ===
Calling function: OECD_QueryEngineTool_2022 with args: {"input": "Article 25"}
Got output: Article 25 of the OECD Model Tax Convention outlines the mutual agreeme

In [13]:
from llama_index.core.tools import FunctionTool
import requests
from requests.auth import HTTPDigestAuth
import json

def call_form990API(param: str) -> dict:
  """Search the ProPublica Nonprofit Explorer API for organizations matching a search term.

  Args:
    param: Free-text search term, for example an organization name or keyword.

  Returns:
    The decoded JSON response body. For a successful search it contains keys such as
    "total_results" and "organizations".
  """
  url = "https://projects.propublica.org/nonprofits/api/v2/search.json"
  # Send the term via `params` so requests URL-encodes it (spaces, "&", "#", ...) instead of
  # splicing raw text into the URL; the timeout stops a stalled request from hanging the caller.
  apiResponse = requests.get(url, params={"q": param}, verify=True, timeout=30)
  OrganizationData = json.loads(apiResponse.content)
  return OrganizationData

# Smoke-test the helper with the search term "north" and pretty-print the JSON it returns
OrganizationData=call_form990API("north")
json_formatted_str = json.dumps(OrganizationData, indent=4)
print(json_formatted_str)

# Wrap the helper as a FunctionTool so it can be handed to an agent alongside the query tools
form990_function_tool = FunctionTool.from_defaults(fn=call_form990API)
#tools = [call_form990API]
# Create the Agent with our tools
#agent = OpenAIAgent.from_tools(tools, verbose=True)
#response = agent.query("North")

{
    "total_results": 10000,
    "organizations": [
        {
            "ein": 923516996,
            "strein": "92-3516996",
            "name": "North",
            "sub_name": "North",
            "city": "Lakeland",
            "state": "FL",
            "ntee_code": "X20",
            "raw_ntee_code": "X20",
            "subseccd": 3,
            "has_subseccd": true,
            "have_filings": null,
            "have_extracts": null,
            "have_pdfs": null,
            "score": 42.26478
        },
        {
            "ein": 833438531,
            "strein": "83-3438531",
            "name": "Sail North",
            "sub_name": "Sail North",
            "city": "North Kingstown",
            "state": "RI",
            "ntee_code": "N99",
            "raw_ntee_code": "N99",
            "subseccd": 3,
            "has_subseccd": true,
            "have_filings": null,
            "have_extracts": null,
            "have_pdfs": null,
            "score": 41.993694
      

In [18]:
#ReAct Agent
from llama_index.core.agent import ReActAgent
# The ReAct agent gets both document query tools plus the Form990 API function tool
query_engine_tools = [OECD_query_tool, Form990_query_tool, form990_function_tool]

tax_assistant_context = """You are AI Tax Assistant. You will guide tax professionals for filling Form990 and answer queries related to Transfer Pricing based on the OECD guidelines."""
agent = ReActAgent.from_tools(
    tools=query_engine_tools,
    verbose=True,
    context=tax_assistant_context,
)

# Multi-step request: compare the two processes, then conditionally call the API tool
comparison_request = "Please compare and analyse Form990 Tax reporting process and Transfer Pricing methodologies used in identifying Intangibles used within Multinational Firms? If the analysis determines these process are for two different sectors then call the Form990 API with param north and include the results as part of the response?"
response = agent.query(comparison_request)
print(response)



> Running step 26fdf6d1-99cc-4929-b051-9fc35bfa1d29. Step input: Please compare and analyse Form990 Tax reporting process and Transfer Pricing methodologies used in identifying Intangibles used within Multinational Firms? If the analysis determines these process are for two different sectors then call the Form990 API with param north and include the results as part of the response?
[1;3;38;5;200mThought: The user is asking for a comparison and analysis of the Form 990 tax reporting process and transfer pricing methodologies related to intangibles in multinational firms. This requires an understanding of both processes and their applications. I will first analyze the two processes to determine if they belong to different sectors. If they do, I will call the Form 990 API with the specified parameter.
Action: None
Action Input: {'param': 'north'}
[0m[1;3;34mObservation: Error: No such tool named `None`.
[0m> Running step 683f23dd-6e3e-4087-8e5f-60ed29664a54. Step input: None
[1;3;34m

In [17]:
#One shot Query Planning
from llama_index.core.query_engine import SubQuestionQueryEngine
# A question spanning both document sets, answered through the sub-question query engine
sub_question_query = "Compare the Form990 Tax reporting process for Non Profit Organizations and Transfer Pricing methodologies used in identifying Intangibles used within a Multinational Firms?"

query_planning_engine = SubQuestionQueryEngine.from_defaults(query_engine_tools=tools, use_async=True)

response = query_planning_engine.query(sub_question_query)
print(response)

Generated 7 sub questions.
[1;3;38;2;237;90;200m[form990_2022] Q: What are the key components of the Form 990 tax reporting process for Non-Profit Organizations in 2022?
[0m[1;3;38;2;90;149;237m[form990_2022] Q: What are the specific guidelines for filling out Form 990 for Non-Profit Organizations in 2022?
[0m[1;3;38;2;11;159;203m[OECD_QueryEngineTool_2022] Q: What are the Transfer Pricing methodologies outlined by the OECD for identifying intangibles in multinational firms in 2022?
[0m[1;3;38;2;155;135;227m[OECD_QueryEngineTool_2022] Q: How do the OECD Transfer Pricing Guidelines address the valuation of intangibles in multinational firms?
[0m[1;3;38;2;237;90;200m[form990_2022] Q: What are the differences in compliance requirements between Form 990 for Non-Profits and Transfer Pricing methodologies for Multinational Firms?
[0m[1;3;38;2;90;149;237m[form990_2022] Q: How do Non-Profit Organizations report their intangible assets, if at all, in Form 990?
[0m[1;3;38;2;11;159;2