# Setup


In [14]:
%load_ext autoreload
%autoreload 2


In [12]:
from dotenv import load_dotenv
load_dotenv()

True

# Tools

In [2]:
from tool_evaluator_agent.tools import ScrapingFishTool

In [20]:
import os
SCRAPING_FISH_API_KEY = os.getenv("SCRAPING_FISH_API_KEY")


In [3]:
content = ScrapingFishTool().run('https://www.orulo.com.br')

Using Tool: Scrape website content


# Models

## Google Gemini

In [2]:
import os
from google.oauth2 import service_account
from langchain_google_genai import ChatGoogleGenerativeAI

# Set the path to the service account JSON key file
service_account_file = os.getenv("GOOGLE_SERVICE_ACCOUNT_FILE")

# Authenticate using the service account key
credentials = service_account.Credentials.from_service_account_file(service_account_file)


  from .autonotebook import tqdm as notebook_tqdm


In [6]:
from langchain_google_genai import ChatGoogleGenerativeAI
llm=ChatGoogleGenerativeAI(model='gemini-1.5-flash',
                            verbose=True,
                            temperature=0.5,
                            credentials=credentials)

I0000 00:00:1724249811.931081 19971561 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported
I0000 00:00:1724249811.931865 19971561 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported


In [7]:
print(type(llm))

<class 'langchain_google_genai.chat_models.ChatGoogleGenerativeAI'>


In [8]:
isinstance(llm, ChatGoogleGenerativeAI)

True

In [13]:
llm.model.split('/')[-1]

'gemini-1.5-flash'

In [4]:
llm.invoke("Sing a ballad of LangChain.")

AIMessage(content="(Verse 1)\nIn the realm of code, where data flows,\nA tool was born, LangChain, it goes.\nWith chains of thought, it weaves its spell,\nTo unlock knowledge, secrets to tell.\n\n(Chorus)\nOh, LangChain, LangChain, a symphony of code,\nConnecting minds, a path we've strode.\nWith prompts and chains, we tap the well,\nOf language models, stories to tell.\n\n(Verse 2)\nFrom LLM's power, we draw our might,\nTo answer questions, day and night.\nWith prompts we guide, with chains we bind,\nA tapestry of words, for all to find.\n\n(Chorus)\nOh, LangChain, LangChain, a symphony of code,\nConnecting minds, a path we've strode.\nWith prompts and chains, we tap the well,\nOf language models, stories to tell.\n\n(Verse 3)\nThrough agents and tools, it takes its stand,\nTo automate tasks, with a helping hand.\nFrom summarization to text creation,\nLangChain empowers, with innovation.\n\n(Chorus)\nOh, LangChain, LangChain, a symphony of code,\nConnecting minds, a path we've strode.

## OpenAI

In [2]:
from langchain_openai import ChatOpenAI
llm = ChatOpenAI(model='gpt-4o',
                  verbose=True,
                  temperature=0)

In [None]:
?ChatOpenAI

## LiteLLM

In [2]:
import os
from google.oauth2 import service_account
from langchain_google_genai import ChatGoogleGenerativeAI

# Set the path to the service account JSON key file
service_account_file = os.getenv("GOOGLE_SERVICE_ACCOUNT_FILE")

# Authenticate using the service account key
credentials = service_account.Credentials.from_service_account_file(service_account_file)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
from langchain_community.chat_models import ChatLiteLLM
chat = ChatLiteLLM(model="gpt-3.5-turbo")

In [7]:
type(chat)

langchain_community.chat_models.litellm.ChatLiteLLM

In [15]:
chat.model_name = chat.model

In [17]:
chat.model_name

'gpt-3.5-turbo'

In [16]:
hasattr(chat, "model_name")

True

## Ollama

In [53]:
from langchain_community.llms import Ollama

llm = Ollama(model='invalid')

## LLM Wrapper


In [2]:
from helpers.llm_wrapper import ModelParams

p = ModelParams()


In [4]:
p.use_cache

True

## Trulens

In [16]:
from helpers.llm_wrapper import ModelParams

p = ModelParams(use_trulens=True)

In [17]:
p.use_trulens

True

In [20]:
from helpers.llm_wrapper import get_llm, get_model_name
llm = get_llm('openai', 'gpt-3.5-turbo-1106', config=p)
print(get_model_name(llm))

Tracer get_proxy_config with provider: disabled
Using Trulens
gpt-3.5-turbo-1106


# Observability


## Helicone

In [33]:
params = {
    "model": 'gpt-3.5-turbo',
    "temperature": 0,
    "verbose": True,
}

params["base_url"] = "https://oai.helicone.ai/v1"

In [34]:
import os
headers = {
    "Helicone-Auth": f"Bearer {os.getenv('HELICONE_API_KEY')}",
}

In [35]:
headers

{'Helicone-Auth': 'Bearer pk-helicone-itv4c7y-xxgecla-wegp4bq-ntrcs6a'}

In [36]:
params["default_headers"] = headers

In [37]:
params

{'model': 'gpt-3.5-turbo',
 'temperature': 0,
 'verbose': True,
 'base_url': 'https://oai.helicone.ai/v1',
 'default_headers': {'Helicone-Auth': 'Bearer pk-helicone-itv4c7y-xxgecla-wegp4bq-ntrcs6a'}}

In [49]:
from langchain_openai import ChatOpenAI
llm = ChatOpenAI(**params)

In [50]:
llm.invoke("Sing a ballad of LangChain.")

AIMessage(content="In the land of LangChain, a tale unfolds\nOf heroes and villains, of legends untold\nA kingdom divided, by power and greed\nBut a hero arises, to fulfill a great need\n\nLangChain, oh LangChain, a land of strife\nWhere darkness and light, battle for life\nBut in the shadows, a hero stands tall\nTo bring peace and justice, to one and all\n\nWith sword in hand, and heart of gold\nLangChain's hero, brave and bold\nFights for the innocent, fights for the weak\nTo bring an end, to the darkness that seeks\n\nThrough battles and trials, the hero prevails\nWith courage and honor, their quest never fails\nLangChain is saved, by their selfless deed\nA legend forever, in LangChain's creed\n\nSo raise a toast, to LangChain's hero true\nTheir story will live on, forever anew\nIn the hearts of the people, their legend will reign\nIn the land of LangChain, where heroes remain.", additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 209, 'promp

In [40]:
from helpers.tracer import TracerFactory

tracer = TracerFactory.get_tracer()

In [42]:
from helpers.tracer import TracerAnnotation
tracer.provider = 'helicone'
tracer.annotation = TracerAnnotation(app='tool_evaluator_agent')

In [43]:
proxy_config = tracer.get_proxy_config()
proxy_config

LLMProxyConfig(url='https://oai.helicone.ai/v1', headers={'Helicone-Auth': 'Bearer pk-helicone-itv4c7y-xxgecla-wegp4bq-ntrcs6a', 'Helicone-Cache-Enabled': 'true', 'Helicone-Property-Session': '45b4bdb6-4803-4de2-86e1-7776ebf05344', 'Helicone-Property-App': 'tool_evaluator_agent'})

In [48]:
params

{'model': 'gpt-3.5-turbo',
 'temperature': 0,
 'verbose': True,
 'base_url': 'https://oai.helicone.ai/v1',
 'default_headers': {'Helicone-Auth': 'Bearer pk-helicone-itv4c7y-xxgecla-wegp4bq-ntrcs6a',
  'Helicone-Cache-Enabled': 'true',
  'Helicone-Property-Session': '45b4bdb6-4803-4de2-86e1-7776ebf05344',
  'Helicone-Property-App': 'tool_evaluator_agent'}}

In [47]:
params["base_url"] = proxy_config.url
params["default_headers"] = proxy_config.headers

In [2]:
__name__

'__main__'

In [3]:
## Tracer

In [12]:
from helpers.tracer import TracerFactory, TracerAnnotation
tracer = TracerFactory.get_tracer()

In [10]:
tracer

Tracer(provider='helicone', annotation=None)

In [7]:
tracer.provider = 'helicone'

In [8]:
tracer

Tracer(provider='helicone', annotation=None)

In [13]:
tracer.get_proxy_config()

LLMProxyConfig(url='https://oai.helicone.ai/v1', headers={'Helicone-Auth': 'Bearer pk-helicone-itv4c7y-xxgecla-wegp4bq-ntrcs6a', 'Helicone-Cache-Enabled': 'true'})

## Logger

In [5]:
import logging

In [7]:
logger = logging.getLogger('tool-evaluator-agent')

In [9]:
provider = "Google"
logger.debug(f"Tracer ended with provider: {provider}")

2024-08-26 10:46:44,684 - tool-evaluator-agent - DEBUG - Tracer ended with provider: Google


In [None]:
# Configure the root logger
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)


In [11]:
logger = logging.getLogger('tool-evaluator-agent')
provider = "Google"
logger.debug(f"Tracer ended with provider: {provider}")

2024-08-26 10:47:51,295 - tool-evaluator-agent - DEBUG - Tracer ended with provider: Google


In [12]:

# Create a common logger instance
logger = logging.getLogger('tool-evaluator-agent')

# Example: Adding a file handler
file_handler = logging.FileHandler('logs/app-debug.log')
file_handler.setLevel(logging.DEBUG)
file_handler.setFormatter(
    logging.Formatter('%(asctime)s - %(name)s - '
                      '%(levelname)s - %(message)s'))
logger.addHandler(file_handler)

In [16]:
logger = logging.getLogger('tool-evaluator-agent')
provider = "Google"
logger.debug(f"Tracer ended with provider: {provider}")

2024-08-26 10:54:17,065 - tool-evaluator-agent - DEBUG - Tracer ended with provider: Google


In [14]:
type(logger)

logging.Logger

In [15]:
print(logger)

<Logger tool-evaluator-agent (DEBUG)>


# Agents & Tasks

In [52]:
from tool_evaluator_agent.crew import ToolEvalCrewFactory

factory = ToolEvalCrewFactory()


In [54]:

crew = factory.crew(agents=[factory.researcher()], tasks=[factory.criteria_clarification_task()])

In [55]:
crew.agents

[Agent(role=Senior Research Analyst
 , goal=Uncover cutting-edge information about {topic}.
 , backstory=You work at a leading tech think tank. Your expertise lies in investigating and comparing tools, their strenghts, and their weeknesses. You have a knack for dissecting complex data and summing them up in their most impactful insights.
 )]

In [66]:
crew.tasks

[Task(description=Provide a summary of the following criteria used to score the tools: 
         - Comprehensive support for the development of multi-step AI agents
         - Low vendor lock-in
         - Support for higher end LLM models (e.g. GPT-4, Gemini, ...)
         - Support for zero cost LLM models (e.g. Open Hermes, ...)
         - Good documentation
         - Strong community support
         - RAG (Retrieval-Augmented Generation) support
         - Rich ecosystem of integrations and toolkits
          1) Include a brief explanation of each criteria and suggest a scoring approach. 2) When possible offer concrete examples for the criteria. 3) Define the criteria so they have minimum overlap Example: 
   - Low vendor lock-in
     - Explanation: refers to a situation where a software solution or platform is designed in such a way that it minimizes dependency on a single vendor or provider
     - Scoring approach:
       - Better if:
         - There is interoperability: The s

In [56]:
type(crew.agents[0].llm)

langchain_openai.chat_models.base.ChatOpenAI

In [65]:
from trulens.apps.langchain import TruChain
from trulens.core import TruSession

session = TruSession()
session.reset_database()

In [67]:
import numpy as np
from trulens.core import Feedback
from trulens.providers.openai import OpenAI

# Initialize provider class
provider = OpenAI()

# select context to be used in feedback. the location of context is app specific.
# context = TruChain.select_context(rag_chain)

# Define a groundedness feedback function
# f_groundedness = (
#     Feedback(
#         provider.groundedness_measure_with_cot_reasons, name="Groundedness"
#     )
#     .on(context.collect())  # collect context chunks into a list
#     .on_output()
# )

# Question/answer relevance between overall question and answer.
f_answer_relevance = Feedback(
     provider.relevance_with_cot_reasons, name="Answer Relevance"
).on_input_output()

# Context relevance between question and each context chunk.
# f_context_relevance = (
#     Feedback(
#         provider.context_relevance_with_cot_reasons, name="Context Relevance"
#     )
#     .on_input()
#     .on(context)
#     .aggregate(np.mean)
# )

✅ In Answer Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .


In [80]:
from trulens.apps.langchain import TruChain
true_app = TruChain(crew.agents[0],
                    app_name=f"researcher@criteria-clarification-task",
                    app_version="0.0.1",
                    feedbacks=[f_answer_relevance])

In [None]:
from tool_evaluator_agent.main import crew_inputs
with true_app as recordings:    
    crew.kickoff(crew_inputs)

[1m[95m [2024-09-07 09:27:05][DEBUG]: == Working Agent: Senior Research Analyst
[00m
[1m[95m [2024-09-07 09:27:05][INFO]: == Starting Task: Provide a summary of the following criteria used to score the tools: 
        - Comprehensive support for the development of multi-step AI agents
        - Low vendor lock-in
        - Support for higher end LLM models (e.g. GPT-4, Gemini, ...)
        - Support for zero cost LLM models (e.g. Open Hermes, ...)
        - Good documentation
        - Strong community support
        - RAG (Retrieval-Augmented Generation) support
        - Rich ecosystem of integrations and toolkits
         1) Include a brief explanation of each criteria and suggest a scoring approach. 2) When possible offer concrete examples for the criteria. 3) Define the criteria so they have minimum overlap Example: 
  - Low vendor lock-in
    - Explanation: refers to a situation where a software solution or platform is designed in such a way that it minimizes dependency on 

WARNI [trulens.providers.openai.endpoint] Cannot track costs from a OpenAI Stream.
WARNI [trulens.providers.openai.endpoint] Cannot track costs from a OpenAI Stream.
WARNI [trulens.providers.openai.endpoint] Cannot track costs from a OpenAI Stream.
Inserting batches in chromadb: 100%|██████████| 1/1 [00:00<00:00,  2.61it/s]


[32;1m[1;3mI need to use the available tools to gather information on the criteria for scoring the tools for development of applications that use LLM.

Action:
Search the internet
Action Input:
{"search_query": "Comprehensive support for the development of multi-step AI agents"}[0m[95m 


Search results: Title: Step by Step guide to develop AI Multi-Agent system using Microsoft ...
Link: https://devblogs.microsoft.com/semantic-kernel/step-by-step-guide-to-develop-ai-multi-agent-system-using-microsoft-semantic-kernel-and-gpt-4o/
Snippet: This blog series will equip you to not only understand AI agents but also build your own using the powerful Semantic Kernel framework.
---
Title: AI Agent Development: A Comprehensive Guide to the Future of ...
Link: https://medium.com/@honeyricky1m3/ai-agent-development-a-comprehensive-guide-to-the-future-of-software-engineering-30d619dc99e9
Snippet: This blog post will demystify the role, provide a step-by-step roadmap, and empower you to embark o

WARNI [trulens.providers.openai.endpoint] Cannot track costs from a OpenAI Stream.
WARNI [trulens.providers.openai.endpoint] Cannot track costs from a OpenAI Stream.
WARNI [trulens.providers.openai.endpoint] Cannot track costs from a OpenAI Stream.
WARNI [trulens.providers.openai.endpoint] Cannot track costs from a OpenAI Stream.
WARNI [trulens.providers.openai.endpoint] Cannot track costs from a OpenAI Stream.
WARNI [trulens.providers.openai.endpoint] Cannot track costs from a OpenAI Stream.
WARNI [trulens.providers.openai.endpoint] Cannot track costs from a OpenAI Stream.
WARNI [trulens.providers.openai.endpoint] Cannot track costs from a OpenAI Stream.
Inserting batches in chromadb: 100%|██████████| 1/1 [00:00<00:00,  3.39it/s]


[32;1m[1;3mThought: I need to use the available tools to gather information on the criteria for scoring the tools for development of applications that use LLM.

Action: Search the internet
Action Input:
{"search_query": "Low vendor lock-in"}[0m[95m 


Search results: Title: Vendor lock-in: Understanding risks and how to avoid it - OutSystems
Link: https://www.outsystems.com/tech-hub/app-dev/vendor-lock-in/
Snippet: Vendor lock-in occurs when access to a type of product or service is limited to the paying customer of a single vendor. Find out how to avoid it.
---
Title: Mythbusting: “Vendor lock-in is a big problem” - NoCode.Tech
Link: https://www.nocode.tech/article/mythbusting-06-getting-locked-in-with-a-no-code-vendor-is-a-big-problem
Snippet: Vendor lock-in is a pretty common – and understandable – concern when it comes to actually committing to a no-code platform. Time to unearth the nuance.
---
Title: What is Vendor Lock-in? Tips to avoid it - Quixy
Link: https://quixy.com/blo

WARNI [trulens.providers.openai.endpoint] Cannot track costs from a OpenAI Stream.
WARNI [trulens.providers.openai.endpoint] Cannot track costs from a OpenAI Stream.
WARNI [trulens.providers.openai.endpoint] Cannot track costs from a OpenAI Stream.
WARNI [trulens.providers.openai.endpoint] Cannot track costs from a OpenAI Stream.
WARNI [trulens.providers.openai.endpoint] Cannot track costs from a OpenAI Stream.
WARNI [trulens.providers.openai.endpoint] Cannot track costs from a OpenAI Stream.
WARNI [trulens.providers.openai.endpoint] Cannot track costs from a OpenAI Stream.
WARNI [trulens.providers.openai.endpoint] Cannot track costs from a OpenAI Stream.
WARNI [trulens.providers.openai.endpoint] Cannot track costs from a OpenAI Stream.
WARNI [trulens.providers.openai.endpoint] Cannot track costs from a OpenAI Stream.
WARNI [trulens.providers.openai.endpoint] Cannot track costs from a OpenAI Stream.
WARNI [trulens.providers.openai.endpoint] Cannot track costs from a OpenAI Stream.
WARN

[32;1m[1;3mThought: I need to use the available tools to gather information on the criteria for scoring the tools for development of applications that use LLM.

Action: Search the internet
Action Input:
{"search_query": "Support for higher end LLM models (e.g. GPT-4, Gemini, ...)"}[0m[95m 


Search results: Title: Comparison Of Gemini Advanced and GPT-4-Turbo (and kinda ...
Link: https://www.reddit.com/r/singularity/comments/1apgv6s/comparison_of_gemini_advanced_and_gpt4turbo_and/
Snippet: GPT-4-Turbo is the better model overall. Gemini Advanced is so much better at creativity, sounding human and response speed though. And it has no message caps.
---
Title: Introducing Gemini: our largest and most capable AI model
Link: https://blog.google/technology/ai/google-gemini-ai/
Snippet: Gemini is our most capable and general model, built to be multimodal and optimized for three different sizes: Ultra, Pro and Nano.
---
Title: Google's Gemini Pro vs. OpenAI's GPT-4: A Detailed Review - Med

WARNI [trulens.providers.openai.endpoint] Cannot track costs from a OpenAI Stream.
WARNI [trulens.providers.openai.endpoint] Cannot track costs from a OpenAI Stream.
WARNI [trulens.providers.openai.endpoint] Cannot track costs from a OpenAI Stream.
WARNI [trulens.providers.openai.endpoint] Cannot track costs from a OpenAI Stream.
WARNI [trulens.providers.openai.endpoint] Cannot track costs from a OpenAI Stream.
WARNI [trulens.providers.openai.endpoint] Cannot track costs from a OpenAI Stream.
WARNI [trulens.providers.openai.endpoint] Cannot track costs from a OpenAI Stream.
WARNI [trulens.providers.openai.endpoint] Cannot track costs from a OpenAI Stream.
WARNI [trulens.providers.openai.endpoint] Cannot track costs from a OpenAI Stream.
WARNI [trulens.providers.openai.endpoint] Cannot track costs from a OpenAI Stream.
WARNI [trulens.providers.openai.endpoint] Cannot track costs from a OpenAI Stream.
WARNI [trulens.providers.openai.endpoint] Cannot track costs from a OpenAI Stream.
WARN

[32;1m[1;3mFinal Answer: 

Criteria for scoring the tools for development of applications that use LLM:

1) Comprehensive support for the development of multi-step AI agents
   - Explanation: Refers to the extent to which the tools support the development of AI agents capable of performing multiple tasks or making decisions autonomously.
   - Scoring approach:
     - Better if:
       - Tools provide extensive documentation and tutorials on developing multi-step AI agents
       - Tools offer pre-built models or templates for multi-step AI agents
       - Tools have a strong community of developers and users focused on multi-step AI agents
     - Worse if:
       - Tools lack documentation or resources for developing multi-step AI agents
       - Tools do not have any pre-built models or templates for multi-step AI agents
       - Tools have limited community support for multi-step AI agents

2) Low vendor lock-in
   - Explanation: Refers to a situation where a software solution or p

Inserting batches in chromadb: 100%|██████████| 1/1 [00:00<00:00,  2.84it/s]
Inserting batches in chromadb: 100%|██████████| 1/1 [00:00<00:00,  3.48it/s]
Inserting batches in chromadb: 100%|██████████| 1/1 [00:00<00:00,  3.00it/s]
Inserting batches in chromadb: 100%|██████████| 1/1 [00:00<00:00,  3.05it/s]
Inserting batches in chromadb: 100%|██████████| 1/1 [00:00<00:00,  2.63it/s]
Inserting batches in chromadb: 100%|██████████| 1/1 [00:00<00:00,  3.55it/s]
Inserting batches in chromadb: 100%|██████████| 1/1 [00:00<00:00,  3.35it/s]
Inserting batches in chromadb: 100%|██████████| 1/1 [00:00<00:00,  4.29it/s]


In [78]:
with true_app as recordings:  
    crew.agents[0].llm.invoke("Sing a ballad of LangChain.")

In [59]:
type(crew.agents[0].llm)

langchain_openai.chat_models.base.ChatOpenAI

In [82]:
session.get_leaderboard(app_ids=[true_app.app_id])

Unnamed: 0_level_0,Unnamed: 1_level_0,Answer Relevance,latency,total_cost
app_name,app_version,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
researcher@criteria-clarification-task,0.0.1,1.0,19.5,0.00017


In [83]:
recordings.records

[Record(record_id='record_hash_5e24fc0b276d4e737189bbec58a4e077', app_id='app_hash_8f07e808b17ac4a60f6c323b6b2c3e4a', cost=Cost(n_requests=12, n_successful_requests=78, n_classes=0, n_tokens=23230, n_stream_chunks=0, n_prompt_tokens=17884, n_completion_tokens=5346, n_cortext_guardrails_tokens=0, cost=0.025362000000000006, cost_currency='USD'), perf=Perf(start_time=datetime.datetime(2024, 9, 7, 9, 27, 6, 951146), end_time=datetime.datetime(2024, 9, 7, 9, 27, 43, 762817)), ts=datetime.datetime(2024, 9, 7, 9, 27, 43, 762962), tags='-', meta=None, main_input='Provide a summary of the following criteria used to score the tools: \n        - Comprehensive support for the development of multi-step AI agents\n        - Low vendor lock-in\n        - Support for higher end LLM models (e.g. GPT-4, Gemini, ...)\n        - Support for zero cost LLM models (e.g. Open Hermes, ...)\n        - Good documentation\n        - Strong community support\n        - RAG (Retrieval-Augmented Generation) support\

In [85]:
from trulens.dashboard.display import get_feedback_result

last_record = recordings.records[-1]
get_feedback_result(last_record, "Answer Relevance")

Unnamed: 0,prompt,response,ret
0,Provide a summary of the following criteria us...,Criteria for scoring the tools for development...,1.0


In [86]:
from trulens.dashboard import run_dashboard

run_dashboard(session)

Starting dashboard ...


Accordion(children=(VBox(children=(VBox(children=(Label(value='STDOUT'), Output())), VBox(children=(Label(valu…

Dashboard started at http://192.168.0.16:58873 .


<Popen: returncode: None args: ['streamlit', 'run', '--server.headless=True'...>