<a href="https://colab.research.google.com/github/aswinaus/Observability/blob/main/Routing_Agent_Observability.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install llama-index -q
!pip install langchain -q
!pip install langchain_experimental -q

In [None]:
import os
import nest_asyncio
nest_asyncio.apply()

In [None]:
from google.colab import userdata

# Read each Colab secret once and reuse the value, instead of fetching the
# same secret several times and assigning OPENAI_API_KEY twice.
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
# The LangSmith key is stored under the Colab secret name 'LANGCHAIN_API_KEY'.
LANGSMITH_API_KEY = userdata.get('LANGCHAIN_API_KEY')

# Module-level copies of the LangSmith settings; LANGSMITH_API_KEY is used
# later in the notebook to build the LangSmith client.
LANGSMITH_TRACING = True
LANGSMITH_ENDPOINT = "https://api.smith.langchain.com"
LANGSMITH_PROJECT = "agent_observability"

# Export the same settings as environment variables.
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
os.environ["LANGSMITH_TRACING"] = "true"
os.environ["LANGSMITH_ENDPOINT"] = LANGSMITH_ENDPOINT
os.environ["LANGSMITH_API_KEY"] = LANGSMITH_API_KEY
os.environ["LANGSMITH_PROJECT"] = LANGSMITH_PROJECT

In [None]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings
# Configure the global LlamaIndex Settings: chat model, embedding model and chunk size.
chat_model = OpenAI(model='gpt-4o-mini', temperature=0.2)
Settings.llm = chat_model
embedding_model = OpenAIEmbedding(model='text-embedding-3-small')
Settings.embed_model = embedding_model
Settings.chunk_size = 1024

In [None]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')
data_dir = '/content/drive/MyDrive' # Input a data dir path from your mounted Google Drive

In [None]:
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.query_engine import RouterQueryEngine
from llama_index.core.selectors import LLMSingleSelector
from llama_index.core import SimpleDirectoryReader
from llama_index.core import StorageContext, load_index_from_storage
from llama_index.core import VectorStoreIndex, SummaryIndex

In [None]:
# Persisting each index avoids re-embedding the documents on every run:
# an index is built once, stored on disk and reloaded afterwards.
# data_dir is used as-is (no extra leading "/"), matching how the document
# paths elsewhere in this notebook are built from data_dir.
PERSIST_INDEX_DIR = os.path.join(data_dir, "RAG", "data", "")


def get_index(index_name, doc_file_path):
  """Return the vector index stored under ``index_name``, building it if needed.

  Args:
    index_name: Name of the sub-directory of ``PERSIST_INDEX_DIR`` that holds
      (or will hold) the persisted index.
    doc_file_path: Path of the document to index when no persisted index
      exists yet.

  Returns:
    The index loaded from disk when the persist directory exists, otherwise a
    ``VectorStoreIndex`` freshly built from ``doc_file_path`` and persisted.
  """
  # Trailing "" keeps the trailing separator the directory path always had.
  persist_dir = os.path.join(PERSIST_INDEX_DIR, index_name, "")

  if os.path.exists(persist_dir):
    # Reuse the index stored by a previous run.
    storage_context = StorageContext.from_defaults(persist_dir=persist_dir)
    return load_index_from_storage(storage_context)

  # First run for this index: read the document, build the index, store it.
  documents = SimpleDirectoryReader(input_files=[doc_file_path]).load_data()
  index = VectorStoreIndex.from_documents(documents)
  index.storage_context.persist(persist_dir)
  return index

In [None]:
# Read every file in the OECD Transfer Pricing guidelines folder.
oecd_reader = SimpleDirectoryReader(f"{data_dir}/RAG/data/OECD/")
docs_OECD_guidelines = oecd_reader.load_data()
# Read every file in the Form 990 guidelines folder.
form990_reader = SimpleDirectoryReader(f"{data_dir}/RAG/data/Form990/")
docs_Form990_guidelines = form990_reader.load_data()

In [None]:
# Split both document sets into nodes with the node parser configured on Settings.
oecd_nodes = Settings.node_parser.get_nodes_from_documents(docs_OECD_guidelines)
form990_nodes = Settings.node_parser.get_nodes_from_documents(docs_Form990_guidelines)

# One storage context is shared by the OECD summary index and the OECD vector index below.
oecd_storage_context = StorageContext.from_defaults()

oecd_storage_context.docstore.add_documents(oecd_nodes)
# NOTE(review): the Form 990 nodes are added to the same docstore, but both
# indexes below are built from oecd_nodes only - confirm this line is intended.
oecd_storage_context.docstore.add_documents(form990_nodes)
# Build the summary and vector indexes over the OECD nodes on the shared storage context.
oecd_summary_index = SummaryIndex(oecd_nodes, storage_context=oecd_storage_context)
oecd_vector_index = VectorStoreIndex(oecd_nodes, storage_context=oecd_storage_context)

# Per-document vector indexes: get_index reloads a persisted index when its
# directory exists and otherwise builds the index and persists it.
OECD_index = get_index("OECDTPGuidelines",f"{data_dir}/RAG/data/OECD/OECD_Transfer_Pricing_Guidelines.pdf")
form990_guidelines_index = get_index("Form990Guidelines",f"{data_dir}/RAG/data/Form990/Form990_Guidelines.pdf")

In [None]:
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.query_engine import RouterQueryEngine
from llama_index.core.selectors import LLMSingleSelector
from langsmith import Client, traceable
from langchain.llms import OpenAI


# One query engine per persisted index; each retrieves the 3 most similar chunks.
OECD_engine = OECD_index.as_query_engine(similarity_top_k=3)
form990_guidelines_engine = form990_guidelines_index.as_query_engine(similarity_top_k=3)

# Tool metadata: the name and description attached to each query engine tool.
oecd_tool_metadata = ToolMetadata(
    name="OECD_QueryEngineTool_2022",
    description="Provides information about Transfer Pricing Guidelines for Organization from OECD for year 2022",
)
form990_tool_metadata = ToolMetadata(
    name="form990_2022",
    description="Provides information about Central Jersey Form990 filling for the year 2022",
)

# Wrap the query engine objects directly as tools.
OECD_query_tool = QueryEngineTool(query_engine=OECD_engine, metadata=oecd_tool_metadata)
Form990_query_tool = QueryEngineTool(query_engine=form990_guidelines_engine, metadata=form990_tool_metadata)

tools = [OECD_query_tool, Form990_query_tool]

# Router that uses an LLM single-selector to pick one of the two tools per question.
filing_selector = LLMSingleSelector.from_defaults()
filing_engine = RouterQueryEngine(selector=filing_selector, query_engine_tools=tools)

In [None]:
# Sample questions: one on the OECD Transfer Pricing guidelines, one on Form 990 filing.
OECD_query="What documentation and approach is required for the valuation of assets in Transfer Pricing?"
Form990_query="An organization receives contributions worth of $15000 which Form within Form990 should the organization complete before filing returns to IRS"

In [None]:
# Send each sample question through the filing router and print the answer
# under a banner naming the question.
for query_label, query_text in (("OECD_query", OECD_query), ("Form990_query", Form990_query)):
    response = filing_engine.query(query_text)
    print(f"\n.--------{query_label} response----------.\n")
    print(response)

In [None]:
# Two query engines over the OECD indexes: a tree-summarize engine on the
# summary index and a default engine on the vector index.
summary_query_engine = oecd_summary_index.as_query_engine(response_mode="tree_summarize")
vector_query_engine = oecd_vector_index.as_query_engine()

# Metadata (name and description) for each tool.
summary_tool_metadata = ToolMetadata(
    name="OECD_Summary",
    description="Summarizes the OECD guidelines for Transfer Pricing",
)
vector_tool_metadata = ToolMetadata(
    name="OECD_Guidelines_QA",
    description="Retrieves answers for questions on OECD",
)

# Wrap the engines as tools.
summary_tool = QueryEngineTool(query_engine=summary_query_engine, metadata=summary_tool_metadata)
vector_tool = QueryEngineTool(query_engine=vector_query_engine, metadata=vector_tool_metadata)
oecd_tools = [summary_tool, vector_tool]

# Router that uses an LLM single-selector to choose between the summary and QA tools.
oecd_selector = LLMSingleSelector.from_defaults()
oecd_engine = RouterQueryEngine(selector=oecd_selector, query_engine_tools=oecd_tools)

In [None]:
# LangSmith client passed to the @traceable decorators below.
# `oecd_engine` is expected to be defined already by an earlier cell.
client = Client(api_key=LANGSMITH_API_KEY)

# Each kind of query goes through its own @traceable wrapper, so summary and QA
# calls against the router are traced as separate functions.
@traceable(run_type="chain", client=client, tracing_level="verbose")
def query_oecd_engine_summary(engine: RouterQueryEngine, query_text: str):
  """Send ``query_text`` to ``engine`` and return the engine's response.

  Meant for questions that should end up at the summary tool; the wrapper
  itself only forwards the query, the routing is left to ``engine``.
  """
  routed_response = engine.query(query_text)
  return routed_response

@traceable(run_type="chain", client=client, tracing_level="verbose")
def query_oecd_engine_vector(engine: RouterQueryEngine, query_text: str):
  """Send ``query_text`` to ``engine`` and return the engine's response.

  Meant for questions that should end up at the vector (QA) tool; the wrapper
  itself only forwards the query, the routing is left to ``engine``.
  """
  routed_response = engine.query(query_text)
  return routed_response

In [None]:
# Query oecd_engine through the traced wrappers: first a summary-style question,
# then a QA-style question. After each answer, print the selector result so the
# chosen tool can be checked.
traced_examples = (
    (query_oecd_engine_summary, "Summarize the OECD guidelines for Transfer Pricing?"),
    (query_oecd_engine_vector, "What would be the best possible way to evaluate the Intangibles in Transfer Pricing?"),
)
for traced_query, example_question in traced_examples:
    response = traced_query(oecd_engine, example_question)
    print(response)
    print(response.metadata["selector_result"])

In [None]:

# Ask the router a summary-style question directly, without the traced wrapper.
summary_question = "Summarize the OECD guidelines for Transfer Pricing?"
response = oecd_engine.query(summary_question)
print(response)

In [None]:
# Show which tool the selector picked, to confirm the summary engine was used.
selector_result = response.metadata["selector_result"]
print(selector_result)

In [None]:
# Ask a question-answering style question, which should be routed to the QA tool.
qa_question = "What would be the best possible way to evaluate the Intangibles in Transfer Pricing?"
response = oecd_engine.query(qa_question)
print(response)

In [None]:
# Show which tool the selector picked, to confirm the vector engine was used.
selector_result = response.metadata["selector_result"]
print(selector_result)

In [None]:
!pip install -U faiss-cpu sentence_transformers transformers

In [None]:
import faiss
import json
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM
import time

# LangSmith client used by the @traceable decorators on the class below.
client = Client(api_key=LANGSMITH_API_KEY)
# FAISS is a library for efficient similarity search and clustering of dense vectors.
# IndexFlatL2 finds nearest neighbours by L2 (Euclidean) distance.
# 768 is the dimension of the vectors stored in the index; it corresponds to
# the output dimension of the sentence-transformer model used for the questions.
class SemanticCaching:
    """Semantic cache placed in front of ``oecd_engine``.

    Each question is embedded with a sentence-transformer model and compared,
    through a FAISS ``IndexFlatL2`` index, with the questions asked before.
    When the nearest stored question is within ``euclidean_threshold`` the
    stored response text is returned; otherwise the question is sent to
    ``oecd_engine`` and the new question/response pair is added to the cache.

    The cache is persisted as JSON. Row ``i`` of the FAISS index corresponds
    to entry ``i`` of every list in ``self.cache``, which is why the index is
    rebuilt from the stored embeddings whenever the cache is loaded.
    """

    def __init__(self, json_file='cacheagent.json'):
        """Create the index and encoder, then load the cache from ``json_file``."""
        # The index dimension (768) has to match the embedding size produced
        # by the encoder below.
        self.index = faiss.IndexFlatL2(768)
        if self.index.is_trained:
            print('Index trained')

        # Converts questions into embeddings; semantic similarity between two
        # questions is judged by the distance between their embeddings.
        self.encoder = SentenceTransformer('all-mpnet-base-v2')

        # A cached answer is reused when the distance reported by the index is
        # at most this value. NOTE: IndexFlatL2 reports *squared* L2
        # distances, so the threshold is compared with a squared distance.
        self.euclidean_threshold = 0.3
        self.json_file = json_file
        self.load_cache()

    @staticmethod
    def _empty_cache():
        """Return a fresh, empty cache structure."""
        return {'questions': [], 'embeddings': [], 'answers': [], 'response_text': []}

    def load_cache(self):
        """Load the cache from ``self.json_file`` and rebuild the FAISS index.

        A missing file, or a file that does not contain valid JSON, results in
        an empty cache.
        """
        try:
            with open(self.json_file, 'r') as file:
                self.cache = json.load(file)
        except FileNotFoundError:
            self.cache = self._empty_cache()
        except json.JSONDecodeError:
            # E.g. a file left half-written by an earlier failed save.
            print(f"Ignoring unreadable cache file: {self.json_file}")
            self.cache = self._empty_cache()

        # Re-populate the index so that its row ids line up with the cache
        # lists; without this, entries loaded from disk could never be found
        # and rows added later would point at the wrong cached answers.
        self.index.reset()
        stored_embeddings = self.cache['embeddings']
        if stored_embeddings:
            self.index.add(np.asarray(stored_embeddings, dtype='float32'))

    def save_cache(self):
        """Write the cache to ``self.json_file`` as JSON."""
        with open(self.json_file, 'w') as file:
            json.dump(self.cache, file)

    @traceable(run_type="chain", client=client, tracing_level="verbose")
    def ask(self, question: str) -> str:
        """Return the response text for ``question``, from the cache if possible.

        Args:
            question: The question to answer.

        Returns:
            The cached response text when a sufficiently similar question was
            asked before, otherwise the text of a newly generated response.

        Raises:
            RuntimeError: If embedding, searching, answering or saving fails;
                the original exception is attached as the cause.
        """
        start_time = time.time()
        try:
            embedding = self.encoder.encode([question])

            # Nearest stored question; the id is -1 when the index is empty.
            distances, row_ids = self.index.search(embedding, 1)
            nearest_distance = distances[0][0]
            row_id = int(row_ids[0][0])

            if row_id != -1 and nearest_distance <= self.euclidean_threshold:
                # Score is inverted so that a higher value means more similar.
                print(f'Found cache in row: {row_id} with score {1 - nearest_distance}')
                print(f"Time taken: {time.time() - start_time} seconds")
                return self.cache['response_text'][row_id]

            # Cache miss: ask the engine and remember the result.
            answer, response_text = self.generate_answer(question)

            self.cache['questions'].append(question)
            self.cache['embeddings'].append(embedding[0].tolist())
            # Store plain strings only: the engine's response object cannot be
            # serialized by json.dump.
            self.cache['answers'].append(str(answer))
            self.cache['response_text'].append(response_text)

            self.index.add(embedding)
            self.save_cache()
            print(f"Time taken: {time.time() - start_time} seconds")

            return response_text
        except Exception as e:
            raise RuntimeError(f"Error during 'ask' method: {e}") from e

    @traceable(run_type="chain", client=client, tracing_level="verbose")
    def generate_answer(self, question: str) -> tuple:
        """Query ``oecd_engine`` with ``question``.

        Returns:
            A ``(result, response_text)`` pair: the object returned by
            ``oecd_engine.query`` and its string form.

        Raises:
            RuntimeError: If the query fails; the original exception is
                attached as the cause.
        """
        try:
            result = oecd_engine.query(question)
            # The string form is what gets cached and returned to callers.
            response_text = str(result)

            return result, response_text
        except Exception as e:
            raise RuntimeError(f"Error during 'generate_answer' method: {e}") from e

In [None]:
# Create the semantic cache; the constructor loads the cache from the default JSON file.
cache = SemanticCaching()

In [None]:
import markdown
from IPython.display import Markdown

def print_gpt_markdown(markdown_text):
    """Render ``markdown_text`` as Markdown in the notebook output.

    Args:
        markdown_text: Markdown source text to display.
    """
    # `display` is only available as an implicit global inside IPython
    # sessions; importing it explicitly removes that hidden dependency.
    from IPython.display import display

    display(Markdown(markdown_text))

In [None]:
# First question through the semantic cache: ask() searches for a similar
# cached question and, when none is close enough, queries the engine and
# caches the new question and response.
response = cache.ask("What would be the best possible way to evaluate the Intangibles in Transfer Pricing?")
print(response)


In [None]:
# Differently worded question with the same meaning: the cached response is
# returned if its embedding is within the cache's distance threshold.
response = cache.ask("How can Intangibles be effectively evaluated in Transfer Pricing?")
print(response)


In [None]:
# Another paraphrase of the same question, to check whether the cache is hit.
response = cache.ask("What methods are recommended for assessing Intangibles in Transfer Pricing?")
print(response)

In [None]:
# Another paraphrase of the same question, to check whether the cache is hit.
response = cache.ask("What strategies are considered most effective for evaluating Intangibles in Transfer Pricing?")
print(response)

In [None]:
# Another paraphrase of the same question, to check whether the cache is hit.
response = cache.ask("How should Intangibles be properly assessed in the context of Transfer Pricing?")
print(response)

In [None]:
# Another paraphrase of the same question, to check whether the cache is hit.
response = cache.ask("What are the most reliable approaches for evaluating Intangibles in Transfer Pricing?")
print(response)