### Agent Tool call evaluation with PyTest & LangSmith

In [1]:
import os
from dotenv import load_dotenv
# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Fail fast with a readable message when a required key is absent: assigning the
# None returned by os.getenv() to os.environ would otherwise raise an opaque
# "TypeError: str expected, not NoneType".
for _required_key in ('OPENAI_API_KEY', 'TAVILY_API_KEY'):
    _key_value = os.getenv(_required_key)
    if _key_value is None:
        raise RuntimeError(
            f"{_required_key} is not set; add it to your .env file or export it "
            "before running this notebook."
        )
    os.environ[_required_key] = _key_value

In [2]:
## LangSmith settings for observability: API key, project name and tracing flag
os.environ.update({
    'LANGSMITH_API_KEY': os.getenv('LANGSMITH_API_KEY'),
    'LANGSMITH_PROJECT': 'LLM_OBS_YT',
    'LANGSMITH_TRACING': "true",
})

### RAG Vector DB Population

In [3]:
from langchain_community.document_loaders import PyPDFLoader
# Build a PDF loader for the local file 'sample_doc.pdf' and load it into `docs`,
# which is later handed to Chroma.from_documents().
loader = PyPDFLoader('sample_doc.pdf')
docs = loader.load()

###  BGE Embeddings

from langchain.embeddings import HuggingFaceBgeEmbeddings

# Settings for the BGE embedding model: model id, target device and encode options.
model_name = "BAAI/bge-base-en-v1.5"
model_kwargs = dict(device='cpu')
encode_kwargs = dict(normalize_embeddings=True)  # set True to compute cosine similarity

# Embedding function used when the Chroma vector store is built.
embeddings = HuggingFaceBgeEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)

from langchain_community.vectorstores import Chroma

### Creating Retriever using Vector DB
# Index the loaded PDF documents in a Chroma vector store using the BGE embeddings.
db = Chroma.from_documents(docs, embeddings)
# Expose the store as a retriever; k=3 presumably caps each query at three documents — confirm.
retriever = db.as_retriever(search_kwargs={"k": 3})

  embeddings = HuggingFaceBgeEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm
Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


In [4]:
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI

# Chat model shared by both specialist agents and the supervisor created below.
llm = ChatOpenAI(model="gpt-4o-mini")

### LangGraph Agent with RAG + WebSearch (MultiAgent Supervisor)

In [5]:
## LangSmith settings for observability (re-applies the same values as the earlier LangSmith cell)
os.environ.update(
    LANGSMITH_API_KEY=os.getenv('LANGSMITH_API_KEY'),
    LANGSMITH_PROJECT='LLM_OBS_YT',
    LANGSMITH_TRACING="true",
)

In [6]:
from langgraph.prebuilt import create_react_agent

### Tools Creation

In [7]:
from langchain_community.tools.tavily_search import TavilySearchResults

# Tavily search tool used by web_search(); max_results=5 presumably limits the hits per query — confirm.
# NOTE(review): the cell output echoes this line as a warning, which looks like a deprecation notice — confirm.
tavily_tool = TavilySearchResults(max_results=5)

  tavily_tool = TavilySearchResults(max_results=5)


### Create specialized Agents

In [8]:
### Research Agent for Web Search

def web_search(query: str) -> str:
    """Search the web for information."""
    # The docstring above is kept verbatim: presumably the agent framework
    # surfaces it as the tool description — confirm before rewording.
    # Query Tavily, then flatten each hit's "content" field into one
    # newline-separated string.
    hits = tavily_tool.invoke({"query": query})
    return "\n".join(hit["content"] for hit in hits)

# Agent registered as "research_expert"; its only tool is web_search.
research_agent = create_react_agent(
    name="research_expert",
    model=llm,
    tools=[web_search],
    prompt="You are a world class researcher with access to web search.",
)

In [9]:
## RAG Agent

def rag_search(query:str):
    "Function to do RAG search"
    # The docstring above is kept verbatim: presumably the agent framework
    # surfaces it as the tool description — confirm before rewording.
    # Fetch matching documents from the module-level retriever.
    hits = retriever.invoke(query)
    # One "===== Document i =====" section per hit, numbered from 0.
    sections = []
    for idx, hit in enumerate(hits):
        sections.append(f"\n\n===== Document {idx} =====\n" + hit.page_content)
    return "\nRetrieved documents:\n" + "".join(sections)

# Agent registered as "rag_expert"; its only tool is rag_search.
rag_agent = create_react_agent(
    name="rag_expert",
    model=llm,
    tools=[rag_search],
    prompt="You are a RAG tool with access to transformer applications on Deep Learning related tasks.",
)

In [10]:
from langgraph_supervisor import create_supervisor

# Supervisor that routes each request to one of the two specialist agents.
# The prompt refers to the agents by their registered names ("research_expert",
# "rag_expert") and every fragment ends with a space, so the implicitly
# concatenated sentences do not run together.
workflow = create_supervisor(
    agents=[research_agent, rag_agent],
    model=llm,
    prompt=(
        "You are a supervisor managing a web search expert and a RAG search expert. "
        "For current events and information, use research_expert. "
        "For transformer related information, use rag_expert."
    )
)

# Compile and run
app = workflow.compile()

In [11]:
# Ask the compiled supervisor graph a transformer question; the recorded output
# below shows the supervisor handing off via transfer_to_rag_expert.
result_rag = app.invoke({
    "messages": [
        {
            "role": "user",
            "content": "Tell me about mutlihead attention in transformers"
        }
    ]
})

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Task supervisor with path ('__pregel_pull', 'supervisor') wrote to unknown channel is_last_step, ignoring it.
Task supervisor with path ('__pregel_pull', 'supervisor') wrote to unknown channel remaining_steps, ignoring it.
Failed to send telemetry event CollectionQueryEvent: capture() takes 1 positional argument but 3 were given


In [12]:
result_rag

{'messages': [HumanMessage(content='Tell me about mutlihead attention in transformers', additional_kwargs={}, response_metadata={}, id='558409c8-4e5b-4996-aaf9-ada47663f470'),
  AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_3xH5iVqO8Cy6O9iSogoZ6YQx', 'function': {'arguments': '{}', 'name': 'transfer_to_rag_expert'}, 'type': 'function'}], 'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 14, 'prompt_tokens': 110, 'total_tokens': 124, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': None, 'id': 'chatcmpl-BuLRYCoyQdG5Oqfw4UA9rjkOD5qqk', 'service_tier': 'default', 'finish_reason': 'tool_calls', 'logprobs': None}, name='supervisor', id='run--c6bf665c-ecbb-46d8-a3e7-365df6dc19e7-0', tool_calls=[{'name': 'transfer_to_rag_expert', '

In [13]:
import pickle as pkl
# Persist the RAG run so the evaluation section can reload it from 'result_rag.pkl'.
with open('result_rag.pkl', 'wb') as f:
    pkl.dump(result_rag, f)

In [14]:
result_rag['messages'][-1].content

'I’ve provided you with an overview of multi-head attention in transformers. If you need further information or clarification on any specific aspect, feel free to ask!'

In [15]:
# Ask a current-events question; the recorded output below shows the supervisor
# handing off via transfer_to_research_expert.
result_websearch = app.invoke(
    {
    "messages": [
        {
            "role": "user",
            "content": "who is the winner of Last T20 Cricket World Cup?"
        }
    ]}
)

Task supervisor with path ('__pregel_pull', 'supervisor') wrote to unknown channel is_last_step, ignoring it.
Task supervisor with path ('__pregel_pull', 'supervisor') wrote to unknown channel remaining_steps, ignoring it.


In [16]:
result_websearch['messages'][-1].content 

'India won the last T20 Cricket World Cup, which concluded on June 29, 2024.'

In [17]:
result_websearch

{'messages': [HumanMessage(content='who is the winner of Last T20 Cricket World Cup?', additional_kwargs={}, response_metadata={}, id='73c4cf74-04ae-4aed-b89e-35088e5dab0b'),
  AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_Rqp1UoDnANqHA5xBB0WjeHEF', 'function': {'arguments': '{}', 'name': 'transfer_to_research_expert'}, 'type': 'function'}], 'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 14, 'prompt_tokens': 113, 'total_tokens': 127, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': None, 'id': 'chatcmpl-BuLRpcNT6aF3siAd67lXD34A0UvZI', 'service_tier': 'default', 'finish_reason': 'tool_calls', 'logprobs': None}, name='supervisor', id='run--b54764f4-73e6-4679-af5e-c9b7c15de344-0', tool_calls=[{'name': 'transfer_to_research_e

In [18]:
# Persist the web-search run so the evaluation section can reload it from 'result_websearch.pkl'.
with open('result_websearch.pkl', 'wb') as f:
    pkl.dump(result_websearch, f)

### Evaluation

In [None]:
# Reload the saved agent runs. Context managers close the file handles, which
# bare open() calls passed straight to pkl.load() would leave open.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook wrote itself.
with open('result_rag.pkl', 'rb') as f:
    result_rag = pkl.load(f)
with open('result_websearch.pkl', 'rb') as f:
    result_websearch = pkl.load(f)

# Both runs, in the same order as the evaluation questions defined below.
agent_responses = [result_rag, result_websearch]

### Testing tool calls

In [20]:
# Prompts for the tool-call evaluation; they repeat the two queries sent to the agent above.
questions = ['Tell me about mutlihead attention in transformers',
             'who is the winner of Last T20 Cricket World Cup?']

questions

['Tell me about mutlihead attention in transformers',
 'who is the winner of Last T20 Cricket World Cup?']

In [28]:
# Expected tool-call names per question, index-aligned with `questions`:
# a handoff to the specialist agent plus the transfer back to the supervisor.
expected_tool_calls = [['transfer_to_rag_expert', 'transfer_back_to_supervisor'],
                       ['transfer_to_research_expert', 'transfer_back_to_supervisor']]

In [24]:
from typing import List, Dict, Any

In [25]:
def extract_tool_calls(messages: List[Any]) -> List[str]:
    """Collect the lower-cased names of every tool call found in ``messages``.

    A message may be a dict carrying a "tool_calls" key or an object exposing a
    ``tool_calls`` attribute. Messages with missing or empty tool calls are
    skipped. Names are returned in encounter order.
    """
    collected: List[str] = []
    for msg in messages:
        # Dict-style lookup first; anything without a usable dict entry falls
        # back to the attribute form.
        calls = msg.get("tool_calls") if isinstance(msg, dict) else None
        if not calls:
            calls = getattr(msg, "tool_calls", None)
        if calls:
            collected.extend(call["name"].lower() for call in calls)
    return collected

### Agent tool call evaluation function

In [None]:
import pytest
from langsmith import testing as t

@pytest.mark.langsmith
@pytest.mark.parametrize(
    "prompts, expected_tool_calls",
    # Pair each evaluation question with the tool calls it should trigger.
    list(zip(questions, expected_tool_calls)),
)
def test_agent_tool_calls(prompts, expected_tool_calls):
    """Test that the agent calls the expected tools.

    Args:
        prompts: A single user question (one entry of ``questions``).
        expected_tool_calls: Tool-call names that must appear in the run.

    The check is a subset test: extra tool calls and call order are ignored.
    Executed and missing calls are logged through ``langsmith.testing``.
    """
    # Run the compiled supervisor graph with the same message payload shape as
    # the manual invocations earlier in the notebook.
    result = app.invoke({"messages": [{"role": "user", "content": prompts}]})

    # extract_tool_calls() already lower-cases names, so compare in lower case.
    executed_tool_calls = extract_tool_calls(result['messages'])
    missing_calls = [
        call for call in expected_tool_calls
        if call.lower() not in executed_tool_calls
    ]

    t.log_outputs({
        "missing_calls": missing_calls,
        "executed_tool_calls": executed_tool_calls,
        "expected_tool_calls": expected_tool_calls,
    })

    # Test passes if no expected calls are missing
    assert len(missing_calls) == 0

### Notebook to Py Files

In [1]:
from research import AppAgent

  tavily_tool = TavilySearchResults(max_results=5)


In [2]:
# Instantiate AppAgent from the `research` module; the output below repeats the
# embedding/Chroma messages seen during the notebook setup above.
app = AppAgent()

  embeddings = HuggingFaceBgeEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm
Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


In [3]:
# Here invoke() is given the bare question string; the displayed result has the
# same {'messages': [...]} shape as the earlier graph runs.
result = app.invoke('When is the next Ind vs Eng 4th Test Match?')

Task supervisor with path ('__pregel_pull', 'supervisor') wrote to unknown channel is_last_step, ignoring it.
Task supervisor with path ('__pregel_pull', 'supervisor') wrote to unknown channel remaining_steps, ignoring it.


In [4]:
result

{'messages': [HumanMessage(content='When is the next Ind vs Eng 4th Test Match?', additional_kwargs={}, response_metadata={}, id='2d305718-ca07-437d-bc63-a6794b15bb48'),
  AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_meI27kE9GF9243cZTbBez6RS', 'function': {'arguments': '{}', 'name': 'transfer_to_research_expert'}, 'type': 'function'}], 'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 14, 'prompt_tokens': 114, 'total_tokens': 128, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': None, 'id': 'chatcmpl-Buttgn1tn8xHM5PkCylAA9xK9UUCN', 'service_tier': 'default', 'finish_reason': 'tool_calls', 'logprobs': None}, name='supervisor', id='run--716ed8aa-6d61-4e42-80ac-8aaa3be073d4-0', tool_calls=[{'name': 'transfer_to_research_expert