In [14]:
import os
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override = True)

True

In [13]:
os.environ.get("OPENAI_API_KEY")
os.environ.get("LANGCHAIN_API_KEY")
print()




In [24]:
import os
from uuid import uuid4

unique_id = uuid4().hex[0:8]
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = f"Tracing Walkthrough - {unique_id}"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"

In [15]:
from langsmith import Client
client = Client()

In [18]:
from langchain import hub
from langchain.agents import AgentExecutor
from langchain.agents.format_scratchpad.openai_tools import (
    format_to_openai_tool_messages,
)
from langchain.agents.output_parsers.openai_tools import OpenAIToolsAgentOutputParser
from langchain_community.tools import DuckDuckGoSearchResults
from langchain_openai import ChatOpenAI

prompt = hub.pull("wfh/langsmith-agent-prompt:5d466cbc")

llm = ChatOpenAI(
    model = "gpt-3.5-turbo-16k",
    temperature=0,
)

tools = [
    DuckDuckGoSearchResults(
        name = "duck_duck_go"
    ),
]

llm_with_tools = llm.bind_tools(tools)

runnable_agent = (
    {
        "input" : lambda x : x["input"],
        "agent_scratchpad" : lambda x : format_to_openai_tool_messages(
            x["intermediate_steps"]
        ),
    }
    | prompt
    | llm_with_tools
    | OpenAIToolsAgentOutputParser()
)

agent_executor = AgentExecutor(
    agent=runnable_agent, tools=tools, handle_parsing_errors=True
)

In [19]:
inputs = [
    "What is LangChain?",
    "What's LangSmith?",
    "When was Llama-v3 released?",
    "What is the langsmith cookbook?",
    "When did langchain first announce the hub?",
]

results = agent_executor.batch([{"input" : x} for x in inputs], return_exceptions=True)

In [22]:
results

[{'input': 'What is LangChain?',
  'output': 'I\'m sorry, but I couldn\'t find any information about "LangChain". Could you please provide more context or clarify your question?'},
 {'input': "What's LangSmith?",
  'output': 'I\'m sorry, but I couldn\'t find any information about "LangSmith". It could be a company, a product, or a person. Can you provide more context or details about what you are referring to?'},
 RuntimeError("There is no current event loop in thread 'ThreadPoolExecutor-0_2'."),
 {'input': 'What is the langsmith cookbook?',
  'output': 'I\'m sorry, but I couldn\'t find any information about the "Langsmith Cookbook". It\'s possible that it may not be a well-known cookbook or it may not exist. Could you provide more context or clarify your question?'},
 RuntimeError("There is no current event loop in thread 'ThreadPoolExecutor-0_4'.")]

In [None]:
"""
1. Create a dataset
2. Initialize a new agent to benchmark
3. Configure evaluators to grade an agent’s output
4. Run the agent over the dataset and evaluate the results
"""

In [23]:
outputs = [
    "LangChain is an open-source framework for building applications using large language models. It also the name of the company building LangSmith",
    "LangSmith is a unified platform for debugging, testing, and monitoring language model applications and agents powered by LangChain",
    "April 19, 2024",
    "The langsmith cookbook is a github repository containing detailed examples of how to use LangSmith to debug, evaluate, and monitor Large Language Model-powered applications.",
    "September 5, 2023"
]

In [25]:
dataset_name = f"agent-qa-{unique_id}"

dataset = client.create_dataset(
    dataset_name,
    description="An example dataset of questions over the LangSmith documentation.",
)

client.create_examples(
    inputs = [{"input" : query} for query in inputs],
    outputs = [{"output" : answer} for answer in outputs],
    dataset_id=dataset.id,
)

In [26]:
from langchain import hub
from langchain.agents import AgentExecutor, AgentType, initialize_agent, load_tools
from langchain_openai import ChatOpenAI

def create_agent(prompt, llm_with_tools):
    runnable_agent = (
        {
            "input" : lambda x : x["input"],
            "agent_scratchpad" : lambda x : format_to_openai_tool_messages(
                x["intermediate_steps"]
            ),
        }
        | prompt
        | llm_with_tools
        | OpenAIToolsAgentOutputParser()
    )
    return AgentExecutor(agent=runnable_agent, tools = tools, handle_parsing_errors=True)

In [27]:
# configure evaluation

from langsmith.evaluation import EvaluationResult
from langsmith.schemas import Example, Run

def check_not_idk(run : Run, example : Example):
    """Illustration of a custom evaluaor."""
    agent_response = run.outputs["output"]
    if "don't know" in agent_response or "not sure" in agent_response :
        score = 0
    else : 
        score = 1

    return EvaluationResult(
        key = "not_uncertain",
        score = score,
    )

In [28]:
# batch Evaluators

from typing import List

def max_pred_length(runs : List[Run], examples : List[Example]):
    predictions = [len(run.outputs["output"]) for run in runs]
    return EvaluationResult(key = "max_pred_length", score = max(predictions))

In [29]:
from langchain.evaluation import EvaluatorType
from langchain.smith import RunEvalConfig

evaluation_config = RunEvalConfig(
    evaluators=[
        check_not_idk,
        EvaluatorType.QA,
        EvaluatorType.EMBEDDING_DISTANCE,
        RunEvalConfig.LabeledCriteria("helpfulness"),
        RunEvalConfig.LabeledScoreString(
            {
                "accuracy" :"""
Score 1: The answer is completely unrelated to the reference.
Score 3: The answer has minor relevance but does not align with the reference.
Score 5: The answer has moderate relevance but contains inaccuracies.
Score 7: The answer aligns with the reference but has minor errors or omissions.
Score 10: The answer is completely accurate and aligns perfectly with the reference.
"""
            },
            normalize_by=10,
        ),
    ],
    batch_evaluators=[max_pred_length],
)

In [30]:
# Run and Evaluator

from langchain import hub
prompt = hub.pull("wfh/langsmith-agent-prompt:798e7324")

In [31]:
import functools

from langchain.smith import arun_on_dataset, run_on_dataset

chain_results = run_on_dataset(
    dataset_name=dataset_name,
    llm_or_chain_factory=functools.partial(
        create_agent, prompt=prompt, llm_with_tools=llm_with_tools
    ),
    evaluation=evaluation_config,
    verbose=True,
    client = client,
    project_name=f"tools-agent-test-5d466cbc-{unique_id}",

    project_metadata={
        "env" : "testing-notebook",
        "model" :"gpt-3.5-turbo",
        "prompt" : "5d466cbc",
    },
)

View the evaluation results for project 'tools-agent-test-5d466cbc-40ac11a6' at:
https://smith.langchain.com/o/ea23ed94-4bfe-54f0-9f87-c20e44630cca/datasets/d029f8ae-2630-4ea5-9a8f-81bc20d4bf0b/compare?selectedSessions=3b0ad103-154b-44de-95dc-2a7269c40c7c

View all tests for Dataset agent-qa-40ac11a6 at:
https://smith.langchain.com/o/ea23ed94-4bfe-54f0-9f87-c20e44630cca/datasets/d029f8ae-2630-4ea5-9a8f-81bc20d4bf0b


  warn_deprecated(


[>                                                 ] 0/5

Chain failed for example 4d59839c-19cb-4885-840b-ed209f8693ae with inputs {'input': 'What is the langsmith cookbook?'}
Error Type: RuntimeError, Message: There is no current event loop in thread 'ThreadPoolExecutor-7_3'.
Chain failed for example 56213781-f21d-4e3a-85c5-2d7b1c51fb01 with inputs {'input': 'When was Llama-v3 released?'}
Error Type: RuntimeError, Message: There is no current event loop in thread 'ThreadPoolExecutor-7_2'.


[------------------->                              ] 2/5

Chain failed for example 06946fe8-2fe0-43aa-aed3-3c16f1737130 with inputs {'input': "What's LangSmith?"}
Error Type: RuntimeError, Message: There is no current event loop in thread 'ThreadPoolExecutor-7_1'.
Chain failed for example c9541042-cdbb-4fb1-be74-0ede6cb5b222 with inputs {'input': 'When did langchain first announce the hub?'}
Error Type: RuntimeError, Message: There is no current event loop in thread 'ThreadPoolExecutor-7_4'.


[--------------------------------------->          ] 4/5

Error running batch evaluator <function max_pred_length at 0x11fd57b80>: 'NoneType' object is not subscriptable


[------------------------------------------------->] 5/5

Unnamed: 0,feedback.not_uncertain,feedback.correctness,feedback.embedding_cosine_distance,feedback.helpfulness,feedback.score_string:accuracy,error,execution_time,run_id
count,1.0,1.0,1.0,1.0,1.0,4,5.0,5
unique,,,,,,4,,5
top,,,,,,There is no current event loop in thread 'Thre...,,3a2c9bed-ba6f-433a-9ac9-0923a24e34c8
freq,,,,,,1,,1
mean,1.0,0.0,0.105595,0.0,0.1,,1.438028,
std,,,,,,,0.626396,
min,1.0,0.0,0.105595,0.0,0.1,,1.022329,
25%,1.0,0.0,0.105595,0.0,0.1,,1.024985,
50%,1.0,0.0,0.105595,0.0,0.1,,1.26342,
75%,1.0,0.0,0.105595,0.0,0.1,,1.351384,


In [32]:
chain_results.to_dataframe()

Unnamed: 0,inputs.input,outputs.input,outputs.output,reference.output,feedback.not_uncertain,feedback.correctness,feedback.embedding_cosine_distance,feedback.helpfulness,feedback.score_string:accuracy,error,execution_time,run_id
af416890-acb1-4efd-95b1-2256d8668ead,What is LangChain?,What is LangChain?,LangChain is a decentralized blockchain platfo...,LangChain is an open-source framework for buil...,1.0,0.0,0.105595,0.0,0.1,,2.528021,3a2c9bed-ba6f-433a-9ac9-0923a24e34c8
06946fe8-2fe0-43aa-aed3-3c16f1737130,What's LangSmith?,,,"LangSmith is a unified platform for debugging,...",,,,,,There is no current event loop in thread 'Thre...,1.26342,a1d7d5f5-aa66-4216-bec0-30ec324515fd
56213781-f21d-4e3a-85c5-2d7b1c51fb01,When was Llama-v3 released?,,,"April 19, 2024",,,,,,There is no current event loop in thread 'Thre...,1.022329,67f6c3a4-4369-475d-a19a-f073a80afe98
4d59839c-19cb-4885-840b-ed209f8693ae,What is the langsmith cookbook?,,,The langsmith cookbook is a github repository ...,,,,,,There is no current event loop in thread 'Thre...,1.024985,a326c587-3c0a-4616-bfa2-bd76787118e2
c9541042-cdbb-4fb1-be74-0ede6cb5b222,When did langchain first announce the hub?,,,"September 5, 2023",,,,,,There is no current event loop in thread 'Thre...,1.351384,1f00cc16-2e32-4ef3-9607-9ea5c466bad9


In [33]:
# compare to another prompt

candidate_prompt = hub.pull("wfh/langsmith-agent-prompt:39f3bbd0")

chain_results = run_on_dataset(
    dataset_name=dataset_name,
    llm_or_chain_factory=functools.partial(
        create_agent, prompt=candidate_prompt, llm_with_tools = llm_with_tools
    ),
    evaluation=evaluation_config,
    verbose=True,
    client=client,
    project_name=f"tools-agent-test-39f3bbd0-{unique_id}",
    project_metadata={
        "env" : "testing-notebook",
        "model" : "gpt-3.5-turbo",
        "prompt" : "39f3bbd0"
    }
)

View the evaluation results for project 'tools-agent-test-39f3bbd0-40ac11a6' at:
https://smith.langchain.com/o/ea23ed94-4bfe-54f0-9f87-c20e44630cca/datasets/d029f8ae-2630-4ea5-9a8f-81bc20d4bf0b/compare?selectedSessions=a279e55a-77ab-4fed-824e-25da79b8c3d8

View all tests for Dataset agent-qa-40ac11a6 at:
https://smith.langchain.com/o/ea23ed94-4bfe-54f0-9f87-c20e44630cca/datasets/d029f8ae-2630-4ea5-9a8f-81bc20d4bf0b
[>                                                 ] 0/5

Chain failed for example af416890-acb1-4efd-95b1-2256d8668ead with inputs {'input': 'What is LangChain?'}
Error Type: RuntimeError, Message: There is no current event loop in thread 'ThreadPoolExecutor-14_0'.
Chain failed for example 4d59839c-19cb-4885-840b-ed209f8693ae with inputs {'input': 'What is the langsmith cookbook?'}
Error Type: RuntimeError, Message: There is no current event loop in thread 'ThreadPoolExecutor-14_3'.
Chain failed for example 06946fe8-2fe0-43aa-aed3-3c16f1737130 with inputs {'input': "What's LangSmith?"}
Error Type: RuntimeError, Message: There is no current event loop in thread 'ThreadPoolExecutor-14_1'.


[----------------------------->                    ] 3/5

Chain failed for example 56213781-f21d-4e3a-85c5-2d7b1c51fb01 with inputs {'input': 'When was Llama-v3 released?'}
Error Type: RuntimeError, Message: There is no current event loop in thread 'ThreadPoolExecutor-14_2'.
Chain failed for example c9541042-cdbb-4fb1-be74-0ede6cb5b222 with inputs {'input': 'When did langchain first announce the hub?'}
Error Type: RuntimeError, Message: There is no current event loop in thread 'ThreadPoolExecutor-14_4'.
Error running batch evaluator <function max_pred_length at 0x11fd57b80>: 'NoneType' object is not subscriptable


[------------------------------------------------->] 5/5

Unnamed: 0,error,execution_time,run_id
count,5,5.0,5
unique,5,,5
top,There is no current event loop in thread 'Thre...,,c490fd23-a730-4efd-a5df-c69f5a8c1fdb
freq,1,,1
mean,,1.027785,
std,,0.184764,
min,,0.842113,
25%,,0.850531,
50%,,1.019425,
75%,,1.184613,


In [34]:
# exporting datasets and run

runs = client.list_runs(project_name=chain_results["project_name"], execution_order = 1)

In [35]:
client.read_project(project_name=chain_results["project_name"]).metadata

{'env': 'testing-notebook',
 'git': {'tags': None,
  'dirty': True,
  'branch': 'main',
  'commit': 'ff00b430956e3ee6733b1938f3dd4fea9982ee8a',
  'repo_name': 'LangChain',
  'remote_url': 'https://github.com/architectyou/LangChain.git',
  'author_name': 'SunYoung Park',
  'commit_time': '1707685024',
  'author_email': 'youngkairos98@gmail.com'},
 'model': 'gpt-3.5-turbo',
 'prompt': '39f3bbd0',
 'dataset_version': '2024-04-21T16:43:54.799589+00:00'}

In [36]:
client.read_project(project_name=chain_results["project_name"]).feedback_stats

{}