In [1]:
from uuid import uuid4

import pandas as pd
from langchain.chat_models import init_chat_model

from plum_chatbot.configs.settings import Settings
from plum_chatbot.datasources.qdrant_datasource import QdrantDatasource
from plum_chatbot.di_containers.datasources_containers import Container, container
from plum_chatbot.schemas.schema import UserInput

# Instantiate the project's Settings object; `settings.LANGSMITH_API_KEY` is
# read further down when the LangSmith client is created.
settings = Settings()

In [2]:
# Name of the LangSmith dataset: used when the dataset is created from the
# golden set and again as the `data` argument of the evaluation run.
dataset_name = "FAQ dataset"

In [3]:
# Chat model used as the judge by the `correct` evaluator, served via Ollama.
# Temperature is 0 so that the judge's verdicts are as repeatable as possible:
# with sampling enabled the same answer can be graded differently from one
# run to the next, which makes experiments hard to compare.
judge_llm = init_chat_model(
    "llama3.2",  # presumably an alternative judge model: qwen3:8b
    model_provider="ollama",
    base_url="host.docker.internal",
    temperature=0,
)

In [4]:
# Load the golden set (its `id`, `question` and `answer` columns are used
# below); the path is relative to the notebook's working directory.
golden_set = pd.read_csv("data/golden_set.csv")

In [5]:
# Show the loaded frame as the cell output for a quick visual check.
golden_set

Unnamed: 0,id,question,answer
0,1,Posso conoscere l'esito delle email inviate?,Si certamente!. Per inviare le email massime a...
1,2,Come posso reinviare un'email ad un soggetto?,Premendo sul REPORT INVII e richiamando la spe...
2,3,Come posso vedere eventuali allegati all'email...,Premendo sul REPORT INVI INVII e richiamando ...
3,4,Posso inserire la firma e il logo del mio studio?,Dal menù principale in altro l'utente deve pre...
4,5,Posso impostare più email e pec dello studio i...,Certamente si!. La piattaforma è progetta per ...
5,6,Dove configuro le email e le pec?,Nel menù principale è presente la sezione CON...
6,7,Cosa sono i bollettini digitali e come acquist...,La procedura non ha costi di attivazione e qua...
7,8,Come posso impostare i parametri dell'avviso d...,Accedendo al menù Configurazionià Parametri av...
8,9,Come posso inserire il testo dell'email per l...,Accedendo al menù Configurazionià Testi person...
9,10,Cos'è l'email evoluta?,L'email evoluta rappresenta un nuovo strumento...


In [6]:
# Initialise the DI container's resources, then resolve every declared
# provider and call `setup()` on those that yield a Qdrant datasource.
container.init_resources()
for provider_factory in Container.providers.values():
    resolved = provider_factory()
    if not isinstance(resolved, QdrantDatasource):
        continue
    resolved.setup()

  self.client = QdrantClient(


In [12]:
from langsmith import Client

client = Client(api_key=settings.LANGSMITH_API_KEY)

# Programmatically create a dataset in LangSmith
# For other dataset creation methods, see:
# https://docs.smith.langchain.com/evaluation/how_to_guides/manage_datasets_programmatically
# https://docs.smith.langchain.com/evaluation/how_to_guides/manage_datasets_in_application
#
# This cell must survive being re-run: creating a dataset whose name is
# already taken fails, so reuse the existing dataset when there is one.
if client.has_dataset(dataset_name=dataset_name):
    dataset = client.read_dataset(dataset_name=dataset_name)
else:
    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description="A sample dataset generated from LLM (llama 3.2).",
    )

# Build one example per golden-set row: the question is the input, the
# reference answer is the expected output, and the row id is kept as metadata.
examples = [
    {
        "inputs": {"question": row["question"]},
        "outputs": {"answer": row["answer"]},
        "metadata": {"id": row["id"]},
    }
    for _, row in golden_set.iterrows()
]

# Upload the examples only when the dataset holds none yet, so that a re-run
# does not duplicate every question (which would skew the evaluation scores).
first_existing_example = next(
    iter(client.list_examples(dataset_id=dataset.id, limit=1)), None
)
if first_existing_example is None:
    client.create_examples(dataset_id=dataset.id, examples=examples)

In [13]:
async def correct(outputs: dict, reference_outputs: dict) -> bool:
    """Ask the judge LLM whether the agent's answer covers the reference answer.

    Args:
        outputs: The target's output; the `content` of the last entry of
            `outputs["messages"]` is taken as the actual answer.
        reference_outputs: The dataset example's outputs; `"answer"` holds
            the expected answer.

    Returns:
        True when the judge's verdict is "CORRECT" (ignoring case,
        surrounding whitespace, quotes and trailing punctuation); False for
        any other reply.
    """
    instructions = (
        "Given an actual answer and an expected answer, determine whether"
        " the actual answer contains all of the information in the"
        " expected answer. Respond with 'CORRECT' if the actual answer"
        " does contain all of the expected information and 'INCORRECT'"
        " otherwise. Do not include anything else in your response."
    )
    # Our graph outputs a State dictionary, which in this case means
    # we'll have a 'messages' key and the final message should
    # be our actual answer.
    actual_answer = outputs["messages"][-1].content
    expected_answer = reference_outputs["answer"]
    user_msg = f"ACTUAL ANSWER: {actual_answer}\n\nEXPECTED ANSWER: {expected_answer}"
    response = await judge_llm.ainvoke(
        [
            {"role": "system", "content": instructions},
            {"role": "user", "content": user_msg},
        ]
    )
    # Judges often wrap the verdict in whitespace, quotes, markdown emphasis
    # or a trailing period. Normalise those away before comparing, otherwise
    # a correct answer is silently scored as wrong. An exact match (rather
    # than a substring test) is required because "INCORRECT" contains
    # "CORRECT".
    verdict = response.content.strip().strip("'\"`*.!").strip().upper()
    return verdict == "CORRECT"

In [14]:
from plum_chatbot.agents.agents import _handle_input
from plum_chatbot.agents.my_agent import RagAgent


def example_to_state(inputs: dict) -> dict:
    """Wrap an example's question in a state dict holding one user message."""
    user_message = {"role": "user", "content": inputs["question"]}
    return {"messages": [user_message]}


async def to_agent_input(inputs: dict) -> dict:
    """Build the keyword arguments used to invoke the agent graph.

    The example's question is wrapped in a `UserInput` with a freshly
    generated thread id and handed to `_handle_input` together with
    `agent.graph`. Of the three values that call returns, only the first
    (the kwargs) is passed on to the caller.
    """
    user_input = UserInput(message=inputs["question"], thread_id=str(uuid4()))
    graph_kwargs, _run_id, _thread_id = await _handle_input(user_input, agent.graph)
    return graph_kwargs


# Agent under evaluation; its `graph` attribute is what `to_agent_input` and
# `target` use.
agent = RagAgent()


# Evaluation target: a plain async function (no LCEL pipeline is composed
# here) that first prepares the graph's keyword arguments and then invokes
# the graph with them.
async def target(inputs):
    """Run the agent graph on one dataset example and return its result.

    `inputs` is forwarded to `to_agent_input`; the dict it returns is
    unpacked as keyword arguments to `agent.graph.ainvoke`.
    """
    invoke_kwargs = await to_agent_input(inputs)
    graph_result = await agent.graph.ainvoke(**invoke_kwargs)
    return graph_result

In [15]:
from langsmith import aevaluate

# Run `target` against the LangSmith dataset named by `dataset_name` and
# score its outputs with the `correct` evaluator; the returned results object
# is kept in `experiment_results`.
experiment_results = await aevaluate(
    target,
    data=dataset_name,
    evaluators=[correct],
    max_concurrency=4,  # optional
    experiment_prefix="llama3.2",  # optional
)

View the evaluation results for experiment: 'llama3.2-2f41ef77' at:
https://smith.langchain.com/o/7c3428ad-07e0-5237-b0f4-4dd618c80caa/datasets/396b421f-9197-4672-9f34-487dee0d15a3/compare?selectedSessions=f5cbc60d-e1a0-4e69-8495-e15da635cf4c




0it [00:00, ?it/s]