<a href="https://colab.research.google.com/github/aswinaus/Agents/blob/main/ReAct_Agent_llama_Index.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install llama-index -q
!pip install langchain -q
!pip install langchain_experimental -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m241.0/241.0 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.0/298.0 kB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m41.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.6/49.6 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.2/209.2 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m59.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os
import nest_asyncio
# NOTE(review): presumably patches asyncio so the event loop already running in the
# notebook kernel can be re-entered; a later cell builds a query engine with
# use_async=True, which would need this — confirm.
nest_asyncio.apply()

In [3]:
from google.colab import userdata
# Read the OpenAI API key from Colab's `userdata` and export it as an environment
# variable, so the key itself is never written into the notebook.
os.environ["OPENAI_API_KEY"] =  userdata.get('OPENAI_API_KEY')

In [4]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings
# Configure the global LlamaIndex Settings: the OpenAI chat model and the OpenAI
# embedding model used when indexing and querying the documents.
Settings.llm = OpenAI(model='gpt-4o-mini', temperature=0.2)
Settings.embed_model = OpenAIEmbedding(model='text-embedding-3-small')
# NOTE(review): presumably the chunk size used when documents are split into nodes
# (see Settings.node_parser further down) — confirm the unit.
Settings.chunk_size = 1024

In [5]:
from google.colab import drive
# Mount Google Drive; the source documents and the persisted indices are read
# from / written to paths below data_dir.
drive.mount('/content/drive')
# Root folder on the mounted drive. Later cells build paths as f"{data_dir}/RAG/data/...".
data_dir = '/content/drive/MyDrive' # Input a data dir path from your mounted Google Drive

Mounted at /content/drive


In [7]:
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.query_engine import RouterQueryEngine
from llama_index.core.selectors import LLMSingleSelector
from llama_index.core import SimpleDirectoryReader
from llama_index.core import StorageContext, load_index_from_storage
from llama_index.core import VectorStoreIndex, SummaryIndex

In [8]:
# To avoid rebuilding an index on every run, each index is persisted to disk the
# first time it is built and loaded from there afterwards.
# data_dir is already an absolute path, so it is joined as-is: prefixing it with
# another "/" produced "//content/..." and disagreed with the other
# f"{data_dir}/RAG/data/..." paths used in this notebook.
PERSIST_INDEX_DIR = os.path.join(data_dir, "RAG", "data", "")


def get_index(index_name, doc_file_path):
  """Return the vector index named ``index_name``, building it on first use.

  Args:
    index_name: Name of the sub-directory of PERSIST_INDEX_DIR that holds
      (or will hold) the persisted index.
    doc_file_path: Path of the single document to index when no persisted
      copy exists yet.

  Returns:
    The index loaded from disk when its directory already exists, otherwise a
    newly built VectorStoreIndex that has just been persisted to that directory.
  """
  persist_dir = f"{PERSIST_INDEX_DIR}{index_name}/"

  if os.path.exists(persist_dir):
    # A persisted copy exists: load it instead of re-indexing the document.
    storage_context = StorageContext.from_defaults(persist_dir=persist_dir)
    return load_index_from_storage(storage_context)

  # First run for this index: read the document, build the index, persist it.
  documents = SimpleDirectoryReader(input_files=[doc_file_path]).load_data()
  index = VectorStoreIndex.from_documents(documents)
  index.storage_context.persist(persist_dir)
  return index

In [9]:
# Load the OECD Transfer Pricing guideline documents (every file in the OECD folder)
docs_OECD_guidelines = SimpleDirectoryReader(f"{data_dir}/RAG/data/OECD/").load_data()
# Load the Form990 guideline documents (every file in the Form990 folder)
docs_Form990_guidelines = SimpleDirectoryReader(f"{data_dir}/RAG/data/Form990/").load_data()

In [10]:
# Split both document sets into nodes with the node parser configured on Settings.
oecd_nodes = Settings.node_parser.get_nodes_from_documents(docs_OECD_guidelines)
form990_nodes = Settings.node_parser.get_nodes_from_documents(docs_Form990_guidelines)

# Initialise one storage context that is shared by the OECD Summary Index and Vector Index below.
oecd_storage_context = StorageContext.from_defaults()

oecd_storage_context.docstore.add_documents(oecd_nodes)
# NOTE(review): the Form990 nodes are added to the OECD storage context, but no index
# in this cell is built over them — confirm this is intentional.
oecd_storage_context.docstore.add_documents(form990_nodes)
# Set up the Summary and Vector indices over the OECD nodes from the shared storage context.
# NOTE(review): neither index is referenced by the query engines in the cells visible
# here (those use OECD_index / form990_guidelines_index); presumably the vector index
# re-embeds the nodes on every run — confirm both are still needed.
oecd_summary_index = SummaryIndex(oecd_nodes, storage_context=oecd_storage_context)
oecd_vector_index = VectorStoreIndex(oecd_nodes, storage_context=oecd_storage_context)

# Per-document indices: get_index loads a persisted index from disk when present and
# builds + persists it otherwise, so repeated runs do not rebuild them.
OECD_index = get_index("OECDTPGuidelines",f"{data_dir}/RAG/data/OECD/OECD_Transfer_Pricing_Guidelines.pdf")
form990_guidelines_index = get_index("Form990Guidelines",f"{data_dir}/RAG/data/Form990/Form990_Guidelines.pdf")

In [11]:
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.query_engine import RouterQueryEngine
from llama_index.core.selectors import LLMSingleSelector
# Create one query engine per persisted index, each configured with similarity_top_k=3.
OECD_engine = OECD_index.as_query_engine(similarity_top_k=3)
form990_guidelines_engine = form990_guidelines_index.as_query_engine(similarity_top_k=3)
# Wrap the OECD query engine in a tool carrying a name and a description.
OECD_query_tool = QueryEngineTool(
    query_engine=OECD_engine,
    metadata=ToolMetadata(
        name="OECD_QueryEngineTool_2022",
        description="Provides information about Transfer Pricing Guidelines for Organization from OECD for year 2022",
    ),
)

# Wrap the Form990 query engine in a tool carrying a name and a description.
# The description must be a complete sentence: it previously ended mid-word
# ("Non-Profit Organi"), which weakens whatever relies on it to pick this tool.
Form990_query_tool = QueryEngineTool(
    query_engine=form990_guidelines_engine,
    metadata=ToolMetadata(
        name="form990_2022",
        description="Provides information about Form990 filling guidelines for Non-Profit Organizations",
    ),
)

# Both query-engine tools, collected once and reused by the router below.
tools = [
    OECD_query_tool,
    Form990_query_tool,
]

# Router query engine over both tools, using the default LLMSingleSelector as selector.
filing_engine = RouterQueryEngine(
    selector=LLMSingleSelector.from_defaults(),
    query_engine_tools=tools,
)

In [15]:
#Agentic Router RAG -
from llama_index.agent.openai import OpenAIAgent
# Hand both query-engine tools to an OpenAIAgent and ask it a single question.
agent = OpenAIAgent.from_tools(
    tools=tools,
    verbose=True,
)
# For an interactive session, uncomment the call below instead.
#agent.chat_repl()
response = agent.chat(
    "What is Form990 EZ and when should an organiaztion complete Form990 EZ form"
)
print(response)

Added user message to memory: What is Form990 EZ and when should an organiaztion complete Form990 EZ form
Form 990-EZ is a shorter version of the standard Form 990, which is used by tax-exempt organizations to report their financial information to the IRS. It is specifically designed for smaller organizations that meet certain criteria.

### When to Complete Form 990-EZ:
An organization should complete Form 990-EZ if it meets the following criteria:

1. **Gross Receipts**: The organization has gross receipts of less than $200,000 in a tax year.
2. **Total Assets**: The organization has total assets of less than $500,000 at the end of the tax year.
3. **Tax-Exempt Status**: The organization is recognized as tax-exempt under section 501(c)(3) or other applicable sections of the Internal Revenue Code.

### Key Points:
- Form 990-EZ is due on the 15th day of the 5th month after the end of the organization's accounting period.
- Organizations that do not meet the criteria for Form 990-EZ mu

In [17]:
from llama_index.agent.openai import OpenAIAssistantAgent
# Create a new OpenAI Assistant-backed agent over the same tools and ask one question.
agent = OpenAIAssistantAgent.from_new(
    name="OECD and Form990",
    instructions="You are an assistant that provides answers to questions on OECD and Form990",
    tools=tools,
    verbose=True,
    run_retrieve_sleep_time=1.0,
)
response = agent.chat(
    "What does Articles 9 and 25 of the OECD Model Tax Convention state?"
)
print(response)

=== Calling Function ===
Calling function: OECD_QueryEngineTool_2022 with args: {"input":"Article 9"}
Got output: Article 9 addresses the issue of corresponding adjustments in the context of transfer pricing. It outlines that relief under this article may not be available if the time limit for making such adjustments, as specified by treaty or domestic law, has expired. While Article 9 does not define a specific time limit for making corresponding adjustments, different jurisdictions have varying approaches. Some may prefer an open-ended timeframe to mitigate double taxation, while others may find this approach unreasonable for administrative purposes. Consequently, the availability of relief can depend on whether the applicable treaty overrides domestic time limitations or establishes different time limits.
=== Calling Function ===
Calling function: OECD_QueryEngineTool_2022 with args: {"input":"Article 25"}
Got output: Article 25 of the OECD Model Tax Convention outlines the mutual a

In [60]:
import requests
from requests.auth import HTTPDigestAuth
import json

def call_form990API(param):
  """Search the ProPublica Nonprofit Explorer API and return the decoded JSON body.

  Args:
    param: Free-text search term, sent as the ``q`` query parameter.

  Returns:
    The parsed JSON response (for the search endpoint, a dict with keys such
    as "total_results" and "organizations").

  Raises:
    ValueError: if the response body is not valid JSON (json.JSONDecodeError).
    Exception: whatever ``requests.get`` raises on connection failure or timeout.
  """
  url = "https://projects.propublica.org/nonprofits/api/v2/search.json"
  # Send the term through `params` so it is URL-encoded; concatenating it into
  # the URL corrupted the query for terms containing "&", "#", "+" and the like.
  # The timeout stops a stalled connection from hanging the notebook indefinitely.
  apiResponse = requests.get(url, params={"q": param}, timeout=30, verify=True)
  OrganizationData = json.loads(apiResponse.content)
  return OrganizationData

# Try the helper with the search term "north" and pretty-print the returned JSON
# (4-space indent) to inspect its structure.
OrganizationData=call_form990API("north")
json_formatted_str = json.dumps(OrganizationData, indent=4)
print(json_formatted_str)

#tools = [call_form990API]
# Create the Agent with our tools
#agent = OpenAIAgent.from_tools(tools, verbose=True)
#response = agent.query("North")

{
    "total_results": 10000,
    "organizations": [
        {
            "ein": 923516996,
            "strein": "92-3516996",
            "name": "North",
            "sub_name": "North",
            "city": "Lakeland",
            "state": "FL",
            "ntee_code": "X20",
            "raw_ntee_code": "X20",
            "subseccd": 3,
            "has_subseccd": true,
            "have_filings": null,
            "have_extracts": null,
            "have_pdfs": null,
            "score": 42.2946
        },
        {
            "ein": 833438531,
            "strein": "83-3438531",
            "name": "Sail North",
            "sub_name": "Sail North",
            "city": "North Kingstown",
            "state": "RI",
            "ntee_code": "N99",
            "raw_ntee_code": "N99",
            "subseccd": 3,
            "has_subseccd": true,
            "have_filings": null,
            "have_extracts": null,
            "have_pdfs": null,
            "score": 42.017246
       

In [62]:
#ReAct Agent
from llama_index.core.agent import ReActAgent
# Build a ReAct agent over the two query-engine tools, give it a tax-assistant
# context, and ask one comparison question.
filing_10k_query_engine_tools = [
    OECD_query_tool,
    Form990_query_tool,
]
agent = ReActAgent.from_tools(
    tools=filing_10k_query_engine_tools,
    verbose=True,
    context="""You are AI Tax Assistant. You will guide tax professionals for filling Form990 and answer queries related to Transfer Pricing based on the OECD guidelines.""",
)
response = agent.query(
    "Compare the Form990 Tax reporting process for Non Profit Organizations and Transfer Pricing methodologies used in identifying Intangibles used within a Multinational Firms?"
)
print(response)



> Running step 7e15f448-07f5-456d-b23e-03a396245484. Step input: Compare the Form990 Tax reporting process for Non Profit Organizations and Transfer Pricing methodologies used in identifying Intangibles used within a Multinational Firms?
[1;3;38;5;200mThought: The current language of the user is: English. I need to gather information about both the Form 990 tax reporting process for non-profit organizations and the transfer pricing methodologies for identifying intangibles in multinational firms. I will start by using the tool for Form 990 information.
Action: form990_2022
Action Input: {'input': 'Form 990 tax reporting process for Non Profit Organizations'}
[0m[1;3;34mObservation: Non-profit organizations must file Form 990 or Form 990-EZ annually with the IRS to provide required information under section 6033. This filing is necessary for most organizations exempt from income tax under section 501(a), as well as certain political organizations and nonexempt charitable trusts. The 

In [25]:
#One shot Query Planning
from llama_index.core.query_engine import SubQuestionQueryEngine
# The question handed to the sub-question engine, which is built over both tools
# with async execution enabled.
sub_question_query = (
    "Compare the Form990 Tax reporting process for Non Profit Organizations and Transfer Pricing methodologies used in identifying Intangibles used within a Multinational Firms?"
)
query_planning_engine = SubQuestionQueryEngine.from_defaults(
    query_engine_tools=tools,
    use_async=True,
)
response = query_planning_engine.query(sub_question_query)
print(response)

Generated 4 sub questions.
[1;3;38;2;237;90;200m[form990_2022] Q: What are the key differences in the Form990 tax reporting process for Non Profit Organizations in Central Jersey for the year 2022?
[0m[1;3;38;2;90;149;237m[OECD_QueryEngineTool_2022] Q: What are the Transfer Pricing Guidelines provided by the OECD for identifying intangibles in multinational firms for the year 2022?
[0m[1;3;38;2;11;159;203m[form990_2022] Q: How do Non Profit Organizations report their intangible assets in Form990 for the year 2022?
[0m[1;3;38;2;155;135;227m[OECD_QueryEngineTool_2022] Q: What methodologies does the OECD recommend for valuing intangibles in Transfer Pricing for multinational firms?
[0m[1;3;38;2;11;159;203m[form990_2022] A: Nonprofit organizations report their intangible assets on Line 14 of Form 990. They should enter the total value of all non-monetary, non-physical assets, which can include items such as copyrights, patents, trademarks, mailing lists, or goodwill. If the amount