# Setup


In [1]:
%load_ext autoreload
%autoreload 2

from dotenv import load_dotenv
load_dotenv()

import sys

# Make the project's ./src directory importable (the path is relative, so it
# is resolved against the kernel's working directory). Guarded so that
# re-running this cell does not keep appending duplicate entries to sys.path.
if './src' not in sys.path:
    sys.path.append('./src')


# Tools

In [6]:
from agents.tool_evaluator.tools import ScrapingFishTool

In [20]:
import os
SCRAPING_FISH_API_KEY = os.getenv("SCRAPING_FISH_API_KEY")


In [3]:
content = ScrapingFishTool().run('https://www.orulo.com.br')

Using Tool: Scrape website content


# Models

## Google Gemini

In [2]:
import os
from google.oauth2 import service_account
from langchain_google_genai import ChatGoogleGenerativeAI

# Path to the service account JSON key file, taken from the environment
# (populated by load_dotenv() in the setup cell).
service_account_file = os.getenv("GOOGLE_SERVICE_ACCOUNT_FILE")

# os.getenv returns None when the variable is missing; fail here with a clear
# message instead of letting the credentials loader choke on a None path.
if not service_account_file:
    raise RuntimeError(
        "GOOGLE_SERVICE_ACCOUNT_FILE is not set; point it at the service "
        "account JSON key file (for example in .env)."
    )

# Authenticate using the service account key
credentials = service_account.Credentials.from_service_account_file(service_account_file)


  from .autonotebook import tqdm as notebook_tqdm


In [6]:
from langchain_google_genai import ChatGoogleGenerativeAI
llm=ChatGoogleGenerativeAI(model='gemini-1.5-flash',
                            verbose=True,
                            temperature=0.5,
                            credentials=credentials)

I0000 00:00:1724249811.931081 19971561 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported
I0000 00:00:1724249811.931865 19971561 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported


In [7]:
print(type(llm))

<class 'langchain_google_genai.chat_models.ChatGoogleGenerativeAI'>


In [8]:
isinstance(llm, ChatGoogleGenerativeAI)

True

In [13]:
llm.model.split('/')[-1]

'gemini-1.5-flash'

In [4]:
llm.invoke("Sing a ballad of LangChain.")

AIMessage(content="(Verse 1)\nIn the realm of code, where data flows,\nA tool was born, LangChain, it goes.\nWith chains of thought, it weaves its spell,\nTo unlock knowledge, secrets to tell.\n\n(Chorus)\nOh, LangChain, LangChain, a symphony of code,\nConnecting minds, a path we've strode.\nWith prompts and chains, we tap the well,\nOf language models, stories to tell.\n\n(Verse 2)\nFrom LLM's power, we draw our might,\nTo answer questions, day and night.\nWith prompts we guide, with chains we bind,\nA tapestry of words, for all to find.\n\n(Chorus)\nOh, LangChain, LangChain, a symphony of code,\nConnecting minds, a path we've strode.\nWith prompts and chains, we tap the well,\nOf language models, stories to tell.\n\n(Verse 3)\nThrough agents and tools, it takes its stand,\nTo automate tasks, with a helping hand.\nFrom summarization to text creation,\nLangChain empowers, with innovation.\n\n(Chorus)\nOh, LangChain, LangChain, a symphony of code,\nConnecting minds, a path we've strode.

## OpenAI

In [2]:
from langchain_openai import ChatOpenAI
llm = ChatOpenAI(model='gpt-4o',
                  verbose=True,
                  temperature=0)

In [None]:
?ChatOpenAI

## LiteLLM

In [2]:
import os
from google.oauth2 import service_account
from langchain_google_genai import ChatGoogleGenerativeAI

# Path to the service account JSON key file, taken from the environment.
service_account_file = os.getenv("GOOGLE_SERVICE_ACCOUNT_FILE")

# os.getenv returns None when the variable is missing; fail here with a clear
# message instead of letting the credentials loader choke on a None path.
if not service_account_file:
    raise RuntimeError(
        "GOOGLE_SERVICE_ACCOUNT_FILE is not set; point it at the service "
        "account JSON key file (for example in .env)."
    )

# Authenticate using the service account key
credentials = service_account.Credentials.from_service_account_file(service_account_file)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
from langchain_community.chat_models import ChatLiteLLM
chat = ChatLiteLLM(model="gpt-3.5-turbo")

In [7]:
type(chat)

langchain_community.chat_models.litellm.ChatLiteLLM

In [15]:
chat.model_name = chat.model

In [17]:
chat.model_name

'gpt-3.5-turbo'

In [16]:
hasattr(chat, "model_name")

True

## Ollama

In [53]:
from langchain_community.llms import Ollama

llm = Ollama(model='invalid')

## LLM Wrapper


In [18]:
# print current directory
import os

print(os.getcwd())

/Users/aolchik/src/spikes/crewai/llm-agent


In [19]:
# Add ./src to the Python import path (relative to the working directory
# printed by the previous cell). Guarded so that re-running this cell does
# not append the same entry again.
import sys

if './src' not in sys.path:
    sys.path.append('./src')


In [20]:
from helpers.llm_wrapper import ModelParams

p = ModelParams()


In [4]:
p.use_cache

True

## Trulens

In [16]:
from helpers.llm_wrapper import ModelParams

p = ModelParams(use_trulens=True)

In [17]:
p.use_trulens

True

In [20]:
from helpers.llm_wrapper import get_llm, get_model_name
llm = get_llm('openai', 'gpt-3.5-turbo-1106', config=p)
print(get_model_name(llm))

Tracer get_proxy_config with provider: disabled
Using Trulens
gpt-3.5-turbo-1106


# Observability


## Helicone

In [33]:
# Keyword arguments for the chat model, with the API base URL pointed at the
# Helicone endpoint instead of the provider's default.
params = dict(
    model='gpt-3.5-turbo',
    temperature=0,
    verbose=True,
    base_url="https://oai.helicone.ai/v1",
)

In [34]:
import os
import warnings

# The Helicone bearer token comes from the environment. os.getenv returns None
# when the variable is missing, which would silently build the header value
# "Bearer None" - warn so the misconfiguration is visible right here.
helicone_api_key = os.getenv('HELICONE_API_KEY')
if not helicone_api_key:
    warnings.warn("HELICONE_API_KEY is not set; the Helicone-Auth header will not be valid.")

headers = {
    "Helicone-Auth": f"Bearer {helicone_api_key}",
}

In [35]:
# Show which headers are set without echoing their values: the bearer token is
# a credential and must not be stored in the saved notebook output.
{name: "<redacted>" for name in headers}

{'Helicone-Auth': 'Bearer <redacted>'}

In [36]:
params["default_headers"] = headers

In [37]:
# Display the parameters with header values masked, so the Helicone bearer
# token inside default_headers is not written to the saved notebook output.
{**params, "default_headers": {name: "<redacted>" for name in params.get("default_headers", {})}}

{'model': 'gpt-3.5-turbo',
 'temperature': 0,
 'verbose': True,
 'base_url': 'https://oai.helicone.ai/v1',
 'default_headers': {'Helicone-Auth': 'Bearer <redacted>'}}

In [49]:
from langchain_openai import ChatOpenAI
llm = ChatOpenAI(**params)

In [50]:
llm.invoke("Sing a ballad of LangChain.")

AIMessage(content="In the land of LangChain, a tale unfolds\nOf heroes and villains, of legends untold\nA kingdom divided, by power and greed\nBut a hero arises, to fulfill a great need\n\nLangChain, oh LangChain, a land of strife\nWhere darkness and light, battle for life\nBut in the shadows, a hero stands tall\nTo bring peace and justice, to one and all\n\nWith sword in hand, and heart of gold\nLangChain's hero, brave and bold\nFights for the innocent, fights for the weak\nTo bring an end, to the darkness that seeks\n\nThrough battles and trials, the hero prevails\nWith courage and honor, their quest never fails\nLangChain is saved, by their selfless deed\nA legend forever, in LangChain's creed\n\nSo raise a toast, to LangChain's hero true\nTheir story will live on, forever anew\nIn the hearts of the people, their legend will reign\nIn the land of LangChain, where heroes remain.", additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 209, 'promp

In [40]:
from helpers.tracer import TracerFactory

tracer = TracerFactory.get_tracer()

In [42]:
from helpers.tracer import TracerAnnotation
tracer.provider = 'helicone'
tracer.annotation = TracerAnnotation(app='tool_evaluator_agent')

In [43]:
proxy_config = tracer.get_proxy_config()
proxy_config

LLMProxyConfig(url='https://oai.helicone.ai/v1', headers={'Helicone-Auth': 'Bearer <redacted>', 'Helicone-Cache-Enabled': 'true', 'Helicone-Property-Session': '45b4bdb6-4803-4de2-86e1-7776ebf05344', 'Helicone-Property-App': 'tool_evaluator_agent'})

In [48]:
params

{'model': 'gpt-3.5-turbo',
 'temperature': 0,
 'verbose': True,
 'base_url': 'https://oai.helicone.ai/v1',
 'default_headers': {'Helicone-Auth': 'Bearer <redacted>',
  'Helicone-Cache-Enabled': 'true',
  'Helicone-Property-Session': '45b4bdb6-4803-4de2-86e1-7776ebf05344',
  'Helicone-Property-App': 'tool_evaluator_agent'}}

In [47]:
params["base_url"] = proxy_config.url
params["default_headers"] = proxy_config.headers

In [2]:
__name__

'__main__'

In [7]:
## Tracer

In [8]:
from helpers.tracer import TracerFactory, TracerAnnotation
tracer = TracerFactory.get_tracer()

In [10]:
tracer

Tracer(provider='helicone', annotation=None)

In [7]:
tracer.provider = 'helicone'

In [8]:
tracer

Tracer(provider='helicone', annotation=None)

In [13]:
tracer.get_proxy_config()

LLMProxyConfig(url='https://oai.helicone.ai/v1', headers={'Helicone-Auth': 'Bearer <redacted>', 'Helicone-Cache-Enabled': 'true'})

## Logger

In [5]:
import logging

In [7]:
logger = logging.getLogger('tool-evaluator-agent')

In [9]:
provider = "Google"
logger.debug(f"Tracer ended with provider: {provider}")

2024-08-26 10:46:44,684 - tool-evaluator-agent - DEBUG - Tracer ended with provider: Google


In [None]:
# Configure the root logger.
# force=True replaces any handlers already installed on the root logger.
# Without it basicConfig() does nothing once the root logger has handlers, so
# re-running this cell (or running it after something else configured
# logging) would leave the level and format unchanged.
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    force=True,
)


In [11]:
logger = logging.getLogger('tool-evaluator-agent')
provider = "Google"
logger.debug(f"Tracer ended with provider: {provider}")

2024-08-26 10:47:51,295 - tool-evaluator-agent - DEBUG - Tracer ended with provider: Google


In [12]:

import os

# Create a common logger instance
logger = logging.getLogger('tool-evaluator-agent')

# Attach a DEBUG-level file handler. The directory is created first because
# logging.FileHandler opens the file on construction and fails when the
# parent directory does not exist.
log_path = 'logs/app-debug.log'
os.makedirs(os.path.dirname(log_path), exist_ok=True)

# Re-running this cell must not stack a second handler for the same file,
# otherwise every record would be written to the log twice.
already_attached = any(
    isinstance(handler, logging.FileHandler)
    and handler.baseFilename == os.path.abspath(log_path)
    for handler in logger.handlers
)
if not already_attached:
    file_handler = logging.FileHandler(log_path)
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(
        logging.Formatter('%(asctime)s - %(name)s - '
                          '%(levelname)s - %(message)s'))
    logger.addHandler(file_handler)

In [16]:
logger = logging.getLogger('tool-evaluator-agent')
provider = "Google"
logger.debug(f"Tracer ended with provider: {provider}")

2024-08-26 10:54:17,065 - tool-evaluator-agent - DEBUG - Tracer ended with provider: Google


In [14]:
type(logger)

logging.Logger

In [15]:
print(logger)

<Logger tool-evaluator-agent (DEBUG)>


# Agents & Tasks

## Feedback Functions

In [2]:
# Builds the feedback function used to score agent runs: an "Answer Relevance"
# check evaluated by an OpenAI-backed TruLens provider.
# NOTE(review): numpy and the Bedrock imports are not used by the active code
# in this cell; they are only referenced (if at all) by the disabled
# alternatives kept in comments below.
import numpy as np
from trulens.core import Feedback
from trulens.providers.openai import OpenAI
from trulens.providers.bedrock import Bedrock
from trulens.providers.bedrock.endpoint import BedrockEndpoint
#import boto3

# Feedback provider backed by gpt-4o.
# NOTE(review): this rebinds `provider`, which the Logger section assigns as a
# plain string ("Google"); the value therefore depends on cell execution order.
provider = OpenAI(model_engine='gpt-4o')

# Disabled alternative: use a Bedrock-hosted Claude model as the provider.
# boto3_session = boto3.Session(profile_name='iaproject', region_name='us-west-2')
# boto3_client = boto3_session.client('bedrock')
# provider = Bedrock(model_id='anthropic.claude-3-opus-20240229-v1:0', client=boto3_client, region_name='us-west-2')

# Disabled: context selection for RAG-style feedbacks. The location of the
# context is app specific.
# context = TruChain.select_context(rag_chain)

# Disabled: groundedness feedback (requires `context` from above).
# f_groundedness = (
#     Feedback(
#         provider.groundedness_measure_with_cot_reasons, name="Groundedness"
#     )
#     .on(context.collect())  # collect context chunks into a list
#     .on_output()
# )

# Question/answer relevance between the overall question and the answer.
# on_input_output() binds the feedback to the record's main input and main
# output, as reported in this cell's output.
f_answer_relevance = Feedback(
     provider.relevance_with_cot_reasons, name="Answer Relevance"
).on_input_output()

# Disabled: context relevance between the question and each context chunk
# (requires `context` from above).
# f_context_relevance = (
#     Feedback(
#         provider.context_relevance_with_cot_reasons, name="Context Relevance"
#     )
#     .on_input()
#     .on(context)
#     .aggregate(np.mean)
# )

✅ In Answer Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .


In [113]:
boto3_session

Session(region_name='us-west-2')

In [114]:
provider

Bedrock(tru_class_info=trulens.providers.bedrock.provider.Bedrock, endpoint=BedrockEndpoint(region_name=us-east-1), model_engine='Bedrock', model_id='anthropic.claude-3-opus-20240229-v1:0')

## Tracer

In [3]:
from helpers.tracer import TracerFactory, TracerAnnotation
tracer = TracerFactory.get_tracer()
tracer.provider = 'helicone'


## Tool Evaluator

In [4]:
from agents.tool_evaluator.crew import ToolEvalCrewFactory
from agents.tool_evaluator.main import crew_inputs as te_crew_inputs
from trulens.apps.langchain import TruChain
from trulens.core import TruSession

te_factory = ToolEvalCrewFactory()

te_session = TruSession()
#te_session.reset_database()

# Crew limited to the researcher agent and the criteria-clarification task.
te_crew = te_factory.crew(agents=[te_factory.researcher()],
                          tasks=[te_factory.criteria_clarification_task()])

# Wrap the crew's only agent in a TruChain app so its runs are scored with the
# Answer Relevance feedback (f_answer_relevance comes from the Feedback
# Functions cell).
te_true_app = TruChain(te_crew.agents[0],
                       app_name="researcher@criteria-clarification-task",
                       app_version="0.0.1",
                       feedbacks=[f_answer_relevance])

# Tag traced requests with the app name (tracer comes from the Tracer cell).
tracer.annotation = TracerAnnotation(app=te_true_app.app_name)
print(f"Tracer annotation is {tracer.annotation}")

# Run the crew inside the app's recording context; later cells read the
# resulting records from te_recordings.
with te_true_app as te_recordings:
    tracer.init()
    try:
        te_crew.kickoff(te_crew_inputs)
    finally:
        # Always close the tracer, even when the crew run raises, so a failed
        # run does not leave it initialised for the next cell.
        tracer.end()

* 'allow_population_by_field_name' has been renamed to 'populate_by_name'
* 'smart_union' has been removed


In [43]:
# Researcher agent on the research task, recorded through TruLens.
# NOTE(review): this cell looks stale - it imports from
# `tool_evaluator_agent.main` while the Tool Evaluator cell imports from
# `agents.tool_evaluator.main`, and `factory` is not defined in any visible
# cell (only `te_factory` is). Confirm before relying on Restart & Run All.
from tool_evaluator_agent.main import crew_inputs

# Crew limited to the researcher agent and the research task.
crew = factory.crew(agents=[factory.researcher()], tasks=[factory.research_task()])

from trulens.apps.langchain import TruChain
# Wrap the crew's first agent so its runs are scored with the Answer Relevance
# feedback.
true_app = TruChain(crew.agents[0],
                    app_name=f"researcher@research-task",
                    app_version="0.0.1",
                    feedbacks=[f_answer_relevance])

# Kick off the crew inside the app's context; presumably the run is captured
# in `recordings` - TODO confirm.
with true_app as recordings:    
    crew.kickoff(crew_inputs)



[1m[95m [2024-09-08 09:45:12][DEBUG]: == Working Agent: Senior Research Analyst
[00m
[1m[95m [2024-09-08 09:45:12][INFO]: == Starting Task: Conduct a comprehensive analysis of plaforms for development of applications that use LLM. Identify trends, adoption, strenghts and weeknesses, mainly focusing on the following criteria: 
        - Comprehensive support for the development of multi-step AI agents
        - Low vendor lock-in
        - Support for higher end LLM models (e.g. GPT-4, Gemini, ...)
        - Support for zero cost LLM models (e.g. Open Hermes, ...)
        - Good documentation
        - Strong community support
        - RAG (Retrieval-Augmented Generation) support
        - Rich ecosystem of integrations and toolkits
        
Evaluate the following alternatives according to criteria: 
        - Amazon Bedrock
        - CrewAI
        - LangChain
        - LangGraph
        - phidata
        
[00m






[1m> Entering new CrewAgentExecutor chain...[0m


[31;1mðŸ–‡ AgentOps: Could not record event. Start a session by calling agentops.start_session().[0m
Inserting batches in chromadb: 100%|██████████| 1/1 [00:00<00:00,  2.90it/s]


[32;1m[1;3mI need to gather comprehensive information about the platforms for development of applications that use LLM. I should start by conducting a thorough analysis of each platform based on the specified criteria.

Action:
Search the internet

Action Input:
{
  "search_query": "Amazon Bedrock platform for LLM development"
}
[0m

🖇 AgentOps: Could not record event - no sessions detected. Create a session by calling agentops.start_session()


[95m 


Search results: Title: Amazon Bedrock - Generative AI - AWS
Link: https://aws.amazon.com/bedrock/
Snippet: Amazon Bedrock is a fully managed service that offers a choice of high-performing foundation models (FMs) from leading AI companies.
---
Title: Generative AI, LLMs, and Foundation Models - AWS
Link: https://aws.amazon.com/ai/generative-ai/
Snippet: Stability AI's top 3 text-to-image models now available in Amazon Bedrock · Mistral Large 2, Mistral AI's most advanced LLM, is now available in Amazon Bedrock ...
---
Title: Amazon Bedrock offers access to multiple generative AI models
Link: https://www.amazon.science/news-and-features/amazon-bedrock-offers-access-to-multiple-generative-ai-models
Snippet: Continuing in that vein, a standout feature of Bedrock is the availability of Amazon's Titan FMs, including a generative LLM and an embeddings LLM. Titan FMs ...
---
Title: Building your own LLM API with AWS Bedrock and Lambda - Medium
Link: https://medium.com/@jamesryebread

[31;1mðŸ–‡ AgentOps: Could not record event. Start a session by calling agentops.start_session().[0m


[32;1m[1;3mAction:
Scrape website content

Action Input:
{
  "website": "https://aws.amazon.com/bedrock/"
}
[0m

🖇 AgentOps: Could not record event - no sessions detected. Create a session by calling agentops.start_session()


[95m 


Scrapped Content: {"message":"You have run out of requests. You can buy new here: https://scrapingfish.com/buy or try our autorenew packs: https://scrapingfish.com/auto-renew"}

[00m


[31;1mðŸ–‡ AgentOps: Could not record event. Start a session by calling agentops.start_session().[0m
2024-09-08 09:45:24,832 - 8349552448 - instruments.py-instruments:542 - ERROR: Error calling wrapped function invoke.
2024-09-08 09:45:24,833 - 8349552448 - instruments.py-instruments:545 - ERROR: Traceback (most recent call last):
  File "/Users/aolchik/mambaforge/envs/crewai/lib/python3.11/site-packages/trulens/core/instruments.py", line 534, in tru_wrapper
    rets, cost = mod_endpoint.Endpoint.track_all_costs_tally(
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/aolchik/mambaforge/envs/crewai/lib/python3.11/site-packages/trulens/core/feedback/endpoint.py", line 524, in track_all_costs_tally
    result, cbs = Endpoint.track_all_costs(
                  ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/aolchik/mambaforge/envs/crewai/lib/python3.11/site-packages/trulens/core/feedback/endpoint.py", line 504, in track_all_costs
    return Endpoint._track_costs(
   

[32;1m[1;3m[0m

[31;1mðŸ–‡ AgentOps: Could not record event. Start a session by calling agentops.start_session().[0m
🖇 AgentOps: Could not end session - no sessions detected


[32;1m[1;3mThought: I now can give a great answer

Final Answer: 

**Amazon Bedrock:**
- Comprehensive support for the development of multi-step AI agents
- Low vendor lock-in
- Support for higher end LLM models (e.g. GPT-4, Gemini, ...)
- Support for zero cost LLM models (e.g. Open Hermes, ...)
- Good documentation
- Strong community support
- RAG (Retrieval-Augmented Generation) support
- Rich ecosystem of integrations and toolkits

**CrewAI:**
- [Analysis not available]

**LangChain:**
- [Analysis not available]

**LangGraph:**
- [Analysis not available]

**phidata:**
- [Analysis not available]

References:
- Amazon Bedrock: https://aws.amazon.com/bedrock/
- Generative AI, LLMs, and Foundation Models - AWS: https://aws.amazon.com/ai/generative-ai/
- Building your own LLM API with AWS Bedrock and Lambda - Medium: https://medium.com/@jamesryebread/building-your-own-llm-api-with-aws-bedrock-and-lambda-dd495cae43e2
- What are Large Language Models? - LLM AI Explained - AWS: https://aw

In [11]:
te_session.get_leaderboard()

Unnamed: 0_level_0,Unnamed: 1_level_0,Answer Relevance,latency,total_cost
app_name,app_version,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
researcher@criteria-clarification-task,0.0.1,1.0,17.666667,5.6e-05
researcher@research-task,0.0.1,0.0,16.75,0.0


In [12]:
from trulens.dashboard.display import get_feedback_result

te_last_record = te_recordings.records[-1]
te_feedback_df = get_feedback_result(te_last_record, "Answer Relevance")


# align text to the left and do not crop cell content
te_feedback_df.head(1).T.style.set_properties(**{'text-align': 'left', 'white-space': 'pre-wrap'})

Unnamed: 0,0
prompt,"Provide a summary of the following criteria used to score the tools: - Comprehensive support for the development of multi-step AI agents  - Low vendor lock-in  - Support for higher end LLM models (e.g. GPT-4, Gemini, ...)  - Support for zero cost LLM models (e.g. Open Hermes, ...)  - Good documentation  - Strong community support  - RAG (Retrieval-Augmented Generation) support  - Rich ecosystem of integrations and toolkits  1) Include a brief explanation of each criteria and suggest a scoring approach. 2) When possible offer concrete examples for the criteria. 3) Define the criteria so they have minimum overlap Example: - Low vendor lock-in  - Explanation: refers to a situation where a software solution or platform is designed in such a way that it minimizes dependency on a single vendor or provider  - Scoring approach:  - Better if:  - There is interoperability: The software is compatible with other systems, platforms, or tools, often through the use of standard protocols, APIs, or open-source technologies. For example: support for the OpenAPI standard.  - There is portability: The software can be easily moved or adapted to different environments  - Uses open standards  - Uses open-source software  - Has modular design  - Worse if  - Uses proprietary tools This is the expect criteria for your final answer: Brief explanation of each critearia and a scoring approach.  you MUST return the actual complete content as the final answer, not a summary. # Useful context: Historical Data: - Consider providing more specific examples for each criteria to illustrate the scoring approach. - Ensure that the scoring approach for each criteria is consistent and well-defined. - Organize the criteria and scoring approach in a more structured format, possibly using a table or list for better readability. - Include a summary of the final scores for each criteria to provide a comprehensive evaluation of the tools. 
- Consider adding a section to highlight potential challenges or limitations related to the criteria. - Provide additional information about the tools and platforms being evaluated to add context to the criteria and scoring approach. - Improve the clarity and specificity of the explanations for each criteria to ensure a clear understanding of the evaluation process. - Consider incorporating a comparison with existing tools or platforms to provide context for the evaluation criteria. - Consider providing more concrete examples for each criterion to illustrate the ideal and suboptimal scenarios. - Ensure that the scoring approach is consistent across all criteria to maintain uniformity in evaluation. - Include real-world use cases or success stories related to the use of development tools that align with the specified criteria."
response,"Criteria for Scoring Development Tools for LLM Applications: 1. Comprehensive support for the development of multi-step AI agents  - Explanation: This criterion refers to the extent to which a development tool provides comprehensive support for building multi-step AI agents, including features such as sequential decision-making, memory, and reasoning capabilities.  - Scoring approach:  - Better if:  - Provides built-in support for reinforcement learning algorithms  - Offers pre-built libraries for handling sequential data and state transitions  - Includes features for long-term memory and context retention  - Worse if:  - Lacks support for multi-step decision-making processes  - Does not provide tools for handling sequential data and state transitions 2. Low vendor lock-in  - Explanation: Refers to a situation where a software solution or platform is designed in such a way that it minimizes dependency on a single vendor or provider.  - Scoring approach:  - Better if:  - There is interoperability: The software is compatible with other systems, platforms, or tools, often through the use of standard protocols, APIs, or open-source technologies. For example: support for the OpenAPI standard.  - There is portability: The software can be easily moved or adapted to different environments  - Uses open standards  - Uses open-source software  - Has modular design  - Worse if:  - Uses proprietary tools 3. Support for higher-end LLM models (e.g. GPT-4, Gemini, ...)  - Explanation: This criterion evaluates the extent to which a development tool supports the integration and utilization of advanced LLM models, such as GPT-4, Gemini, and other high-performance language models.  
- Scoring approach:  - Better if:  - Provides native support for advanced LLM models  - Offers seamless integration with the latest LLM architectures  - Includes features for fine-tuning and customizing high-end LLM models  - Worse if:  - Limited or no support for advanced LLM models  - Incompatible with the latest LLM architectures 4. Support for zero-cost LLM models (e.g. Open Hermes, ...)  - Explanation: This criterion assesses the availability of support for zero-cost LLM models, such as Open Hermes, and other open-source language models that do not require licensing fees or subscription costs.  - Scoring approach:  - Better if:  - Provides access to open-source LLM models  - Offers tools and resources for training and deploying zero-cost LLM models  - Includes community-driven initiatives for zero-cost LLM development  - Worse if:  - Relies solely on proprietary or paid LLM models  - Does not support open-source LLM initiatives 5. Good documentation  - Explanation: Refers to the quality and accessibility of documentation provided by the development tool, including guides, tutorials, and reference materials.  - Scoring approach:  - Better if:  - Comprehensive and well-organized documentation  - Includes examples, use cases, and best practices  - Offers interactive tutorials and learning resources  - Worse if:  - Sparse or outdated documentation  - Lack of practical examples and use cases  - Limited accessibility and searchability of documentation 6. Strong community support  - Explanation: Evaluates the presence of an active and engaged community around the development tool, including forums, user groups, and collaborative platforms.  
- Scoring approach:  - Better if:  - Active community forums and discussion groups  - Regular contributions from developers and users  - Supportive and responsive community moderators  - Worse if:  - Sparse or inactive community engagement  - Limited user participation and contributions  - Lack of official community support channels 7. RAG (Retrieval-Augmented Generation) support  - Explanation: Assesses the tool's capability to support retrieval-augmented generation techniques, which combine language generation with information retrieval for enhanced performance.  - Scoring approach:  - Better if:  - Provides built-in support for RAG techniques  - Offers tools for integrating retrieval and generation processes  - Includes features for fine-tuning retrieval-augmented models  - Worse if:  - Lacks support for RAG methodologies  - Limited or no tools for combining retrieval and generation 8. Rich ecosystem of integrations and toolkits  - Explanation: Refers to the availability of a diverse and extensive set of integrations, plugins, and toolkits that enhance the functionality and interoperability of the development tool.  - Scoring approach:  - Better if:  - Extensive library of integrations with third-party services and platforms  - Support for industry-standard APIs and protocols  - Active development of community-driven toolkits and extensions  - Worse if:  - Limited or outdated integrations with external services  - Lack of support for common APIs and protocols  - Sparse availability of community-driven toolkits Summary of Final Scores: 1. Comprehensive support for the development of multi-step AI agents: 8/10 2. Low vendor lock-in: 9/10 3. Support for higher-end LLM models: 7/10 4. Support for zero-cost LLM models: 6/10 5. Good documentation: 8/10 6. Strong community support: 7/10 7. RAG (Retrieval-Augmented Generation) support: 7/10 8. Rich ecosystem of integrations and toolkits: 8/10"
ret,1.000000


In [19]:
te_records, te_feedback = te_session.get_records_and_feedback()

te_records.tail()

Unnamed: 0,app_id,app_json,type,record_id,input,output,tags,record_json,cost_json,perf_json,ts,Answer Relevance,Answer Relevance_calls,Answer Relevance feedback cost in USD,app_name,app_version,latency,total_tokens,total_cost,cost_currency
5,app_hash_8f07e808b17ac4a60f6c323b6b2c3e4a,"{""tru_class_info"": {""name"": ""TruChain"", ""modul...",Agent(crewai.agent),record_hash_5e24fc0b276d4e737189bbec58a4e077,"""Provide a summary of the following criteria u...","""Criteria for scoring the tools for developmen...",-,"{""record_id"": ""record_hash_5e24fc0b276d4e73718...","{""n_requests"": 3, ""n_successful_requests"": 6, ...","{""start_time"": ""2024-09-07T09:27:06.951146"", ""...",2024-09-07T09:27:43.762962,1.0,[{'args': {'prompt': 'Provide a summary of the...,0.000372,researcher@criteria-clarification-task,0.0.1,11,312,0.0,USD
6,app_hash_8f07e808b17ac4a60f6c323b6b2c3e4a,"{""tru_class_info"": {""name"": ""TruChain"", ""modul...",Agent(crewai.agent),record_hash_fdf61700e389a894c41ee2c79952073c,"""Provide a summary of the following criteria u...","""Criteria for Scoring Tools for LLM Applicatio...",-,"{""record_id"": ""record_hash_fdf61700e389a894c41...","{""n_requests"": 1, ""n_successful_requests"": 1, ...","{""start_time"": ""2024-09-20T12:04:33.582616"", ""...",2024-09-20T12:05:03.960175,1.0,[{'args': {'prompt': 'Provide a summary of the...,0.00816,researcher@criteria-clarification-task,0.0.1,27,71,0.0,USD
7,app_hash_8f07e808b17ac4a60f6c323b6b2c3e4a,"{""tru_class_info"": {""name"": ""TruChain"", ""modul...",Agent(crewai.agent),record_hash_560a1095db721df45f0f95ed6039a10a,"""Provide a summary of the following criteria u...","""Criteria for Scoring Development Tools for LL...",-,"{""record_id"": ""record_hash_560a1095db721df45f0...","{""n_requests"": 1, ""n_successful_requests"": 1, ...","{""start_time"": ""2024-09-20T12:06:09.012345"", ""...",2024-09-20T12:06:34.365702,1.0,[{'args': {'prompt': 'Provide a summary of the...,0.009235,researcher@criteria-clarification-task,0.0.1,18,48,0.0,USD
8,app_hash_8f07e808b17ac4a60f6c323b6b2c3e4a,"{""tru_class_info"": {""name"": ""TruChain"", ""modul...",Agent(crewai.agent),record_hash_9bcfd5ad75b50c8029a50879f0da455a,"""Provide a summary of the following criteria u...","""The criteria used to score the tools for the ...",-,"{""record_id"": ""record_hash_9bcfd5ad75b50c8029a...","{""n_requests"": 2, ""n_successful_requests"": 3, ...","{""start_time"": ""2024-09-20T12:13:53.696982"", ""...",2024-09-20T12:14:22.105755,1.0,[{'args': {'prompt': 'Provide a summary of the...,0.009585,researcher@criteria-clarification-task,0.0.1,3,155,0.0,USD
9,app_hash_8f07e808b17ac4a60f6c323b6b2c3e4a,"{""tru_class_info"": {""name"": ""TruChain"", ""modul...",Agent(crewai.agent),record_hash_99df92df49e33a821f638484b2a8105b,"""Provide a summary of the following criteria u...","""Criteria for Scoring Development Tools for LL...",-,"{""record_id"": ""record_hash_99df92df49e33a821f6...","{""n_requests"": 1, ""n_successful_requests"": 1, ...","{""start_time"": ""2024-09-20T12:25:12.495854"", ""...",2024-09-20T12:26:07.102373,1.0,[{'args': {'prompt': 'Provide a summary of the...,0.0104,researcher@criteria-clarification-task,0.0.1,36,49,0.0,USD


In [20]:
te_records.tail(1)['Answer Relevance_calls'].values[0][0]['args']['prompt']

'Provide a summary of the following criteria used to score the tools: \n        - Comprehensive support for the development of multi-step AI agents\n        - Low vendor lock-in\n        - Support for higher end LLM models (e.g. GPT-4, Gemini, ...)\n        - Support for zero cost LLM models (e.g. Open Hermes, ...)\n        - Good documentation\n        - Strong community support\n        - RAG (Retrieval-Augmented Generation) support\n        - Rich ecosystem of integrations and toolkits\n         1) Include a brief explanation of each criteria and suggest a scoring approach. 2) When possible offer concrete examples for the criteria. 3) Define the criteria so they have minimum overlap Example: \n  - Low vendor lock-in\n    - Explanation: refers to a situation where a software solution or platform is designed in such a way that it minimizes dependency on a single vendor or provider\n    - Scoring approach:\n      - Better if:\n        - There is interoperability: The software is compat

In [21]:
provider

OpenAI(tru_class_info=trulens.providers.openai.provider.OpenAI, endpoint=Endpoint(openai), model_engine='gpt-4o')

In [24]:
te_records.tail(1)['Answer Relevance_calls'].values[0][0]['args']['prompt']

'Provide a summary of the following criteria used to score the tools: \n        - Comprehensive support for the development of multi-step AI agents\n        - Low vendor lock-in\n        - Support for higher end LLM models (e.g. GPT-4, Gemini, ...)\n        - Support for zero cost LLM models (e.g. Open Hermes, ...)\n        - Good documentation\n        - Strong community support\n        - RAG (Retrieval-Augmented Generation) support\n        - Rich ecosystem of integrations and toolkits\n         1) Include a brief explanation of each criteria and suggest a scoring approach. 2) When possible offer concrete examples for the criteria. 3) Define the criteria so they have minimum overlap Example: \n  - Low vendor lock-in\n    - Explanation: refers to a situation where a software solution or platform is designed in such a way that it minimizes dependency on a single vendor or provider\n    - Scoring approach:\n      - Better if:\n        - There is interoperability: The software is compat

In [25]:
te_records.tail(1)['Answer Relevance_calls'].values[0][0]['args']['response']

"Criteria for Scoring Development Tools for LLM Applications:\n\n1. Comprehensive support for the development of multi-step AI agents\n   - Explanation: This criterion refers to the extent to which a development tool provides comprehensive support for building multi-step AI agents, including features such as sequential decision-making, memory, and reasoning capabilities.\n   - Scoring approach:\n     - Better if:\n       - Provides built-in support for reinforcement learning algorithms\n       - Offers pre-built libraries for handling sequential data and state transitions\n       - Includes features for long-term memory and context retention\n     - Worse if:\n       - Lacks support for multi-step decision-making processes\n       - Does not provide tools for handling sequential data and state transitions\n\n2. Low vendor lock-in\n   - Explanation: Refers to a situation where a software solution or platform is designed in such a way that it minimizes dependency on a single vendor or pr

In [23]:
# Re-run the Answer Relevance feedback by hand, using the prompt/response
# arguments stored with the first feedback call of the most recent record.
last_feedback_args = te_records.tail(1)['Answer Relevance_calls'].values[0][0]['args']
provider.relevance_with_cot_reasons(last_feedback_args['prompt'],
                                    last_feedback_args['response'])




(1.0, {})

In [31]:
# Lay out the calls of the record selected in the feedback-result cell
# (`te_last_record = te_recordings.records[-1]`). The name `last_record` is
# not defined by any cell in this notebook, so it fails on a fresh kernel.
json_like = te_last_record.layout_calls_as_app()

from ipytree import Node
from ipytree import Tree


def display_call_stack(data):
    """Render a record layout as an ipytree ``Tree`` widget.

    Args:
        data: Mapping-like record layout. It is indexed with the keys
            ``record_id``, ``app_id``, ``cost``, ``perf``, ``ts``, ``tags``,
            ``main_input``, ``main_output``, ``main_error`` and ``calls``.
            Each item of ``calls`` must provide a ``stack`` sequence whose
            items carry a ``path`` and, optionally, an ``expanded`` sequence
            of items that also carry a ``path``.

    Returns:
        A ``Tree`` with one node per summary field followed by a "Calls"
        node that holds one "Call" node per call; each call lists its stack
        steps, and steps with an ``expanded`` entry get an "Expanded" child.

    Raises:
        KeyError: If any of the required keys is missing from ``data``.
    """
    # (label shown in the tree, key read from ``data``), in display order.
    summary_fields = (
        ("Record ID", "record_id"),
        ("App ID", "app_id"),
        ("Cost", "cost"),
        ("Performance", "perf"),
        ("Timestamp", "ts"),
        ("Tags", "tags"),
        ("Main Input", "main_input"),
        ("Main Output", "main_output"),
        ("Main Error", "main_error"),
    )

    root = Tree()
    for label, key in summary_fields:
        root.add_node(Node(f"{label}: {data[key]}"))

    calls_root = Node("Calls")
    root.add_node(calls_root)

    for record_call in data["calls"]:
        call_entry = Node("Call")
        calls_root.add_node(call_entry)

        for frame in record_call["stack"]:
            frame_node = Node(f"Step: {frame['path']}")
            call_entry.add_node(frame_node)

            # Only frames that carry an "expanded" entry get the extra level.
            if "expanded" not in frame:
                continue

            expanded_root = Node("Expanded")
            frame_node.add_node(expanded_root)
            for sub_frame in frame["expanded"]:
                expanded_root.add_node(Node(f"Step: {sub_frame['path']}"))

    return root


# Usage: build the widget tree from the record layout and leave it as the
# cell's last expression so the notebook displays it.
tree = display_call_stack(json_like)
tree

Tree(nodes=(Node(name='Record ID: record_hash_5788e1235355b584159fe5d406a8b016'), Node(name='App ID: app_hash_…

In [25]:
from trulens.dashboard import run_dashboard

# Launch the TruLens dashboard for te_session. Per the recorded output this
# starts a streamlit subprocess and prints the URL the dashboard is served on.
run_dashboard(te_session)

Starting dashboard ...
Config file already exists. Skipping writing process.
Credentials file already exists. Skipping writing process.


Accordion(children=(VBox(children=(VBox(children=(Label(value='STDOUT'), Output())), VBox(children=(Label(valu…

Dashboard started at http://192.168.0.16:53922 .


<Popen: returncode: None args: ['streamlit', 'run', '--server.headless=True'...>

In [None]:
# NOTE(review): this cell repeats the import and the run_dashboard(te_session)
# call of the cell directly above it; on Run All it may try to start a second
# streamlit process for the same session. Consider keeping only one of them.
from trulens.dashboard import run_dashboard

run_dashboard(te_session)

Starting dashboard ...
Config file already exists. Skipping writing process.
Credentials file already exists. Skipping writing process.


Accordion(children=(VBox(children=(VBox(children=(Label(value='STDOUT'), Output())), VBox(children=(Label(valu…

Dashboard started at http://192.168.0.16:53922 .


<Popen: returncode: None args: ['streamlit', 'run', '--server.headless=True'...>

## Feature Formulator

In [37]:
from agents.feature_formulator.crew import FeatureFormulatorCrewFactory
from agents.feature_formulator.main import crew_inputs as ff_crew_inputs
from trulens.apps.langchain import TruChain
from trulens.core import TruSession


# Build a one-agent / one-task crew: the product owner working on the
# feature specification task.
ff_factory = FeatureFormulatorCrewFactory()

ff_session = TruSession()
#ff_session.reset_database()

ff_crew = ff_factory.crew(agents=[ff_factory.product_owner()],
                          tasks=[ff_factory.feature_specification_task()])

# Wrap the crew's only agent in a TruChain recorder with the answer-relevance
# feedback (f_answer_relevance is defined outside this cell).
ff_true_app = TruChain(ff_crew.agents[0],
                       app_name="product_owner@criteria-feature_specification_task",
                       app_version="0.0.1",
                       feedbacks=[f_answer_relevance])

# Tag the tracer with the app name so the trace can be matched to this app.
tracer.annotation = TracerAnnotation(app=ff_true_app.app_name)
print(f"Tracer annotation is {tracer.annotation}")

with ff_true_app as ff_recordings:
    tracer.init()
    try:
        ff_crew.kickoff(ff_crew_inputs)
    finally:
        # Keep init()/end() paired even when the crew run raises, so a failed
        # kickoff does not skip the tracer shutdown.
        tracer.end()


Tracer annotation is session='efeb138d-d23c-4186-9d5c-d81f58329b2e' app='product_owner@criteria-feature_specification_task'
[1m[95m [2024-09-20 13:03:17][DEBUG]: == Working Agent: Product Owner
[00m
[1m[95m [2024-09-20 13:03:17][INFO]: == Starting Task: Define and scope the feature for DA.IA (Smart IA Assistant for Real Estate Developers), ensuring that it fits within the development cycle and aligns with the product's overall goals.
1) **Problem Definition**: 
   Describe the core problem this feature will solve and why it is important to both the users and the business. Ensure the problem is clear, concise, and scoped to avoid ambiguity. 
   
   The main problem to solve is/are:
   
        - Sales Partners Manager from Real Estate Developers waste time
        answering repeated questions from brokers
    

2) **Shaped Solution**: 
   Provide a shaped solution for the problem. This should include sketches or visualizations that outline the functionality and user experience.
   

Inserting batches in chromadb: 100%|██████████| 1/1 [00:00<00:00,  2.30it/s]


In [38]:
from trulens.dashboard import run_dashboard

# Launch the TruLens dashboard for ff_session, the session created in the
# Feature Formulator cell above. Per the recorded output this starts a
# streamlit subprocess and prints the URL the dashboard is served on.
run_dashboard(ff_session)

Starting dashboard ...
Config file already exists. Skipping writing process.
Credentials file already exists. Skipping writing process.


Accordion(children=(VBox(children=(VBox(children=(Label(value='STDOUT'), Output())), VBox(children=(Label(valu…

Dashboard started at http://192.168.0.254:65248 .


<Popen: returncode: None args: ['streamlit', 'run', '--server.headless=True'...>