In [1]:
from uuid import uuid4

import pandas as pd
from langchain.chat_models import init_chat_model

from plum_chatbot.configs.settings import Settings
from plum_chatbot.datasources.qdrant_datasource import QdrantDatasource
from plum_chatbot.di_containers.datasources_containers import Container, container
from plum_chatbot.schemas.schema import UserInput

# Project settings object; `settings.LANGSMITH_API_KEY` is read further down
# when the LangSmith client is created.
settings = Settings()

In [2]:
# LLM used as the grader in the `correct` evaluator.
# The judge only has to emit a CORRECT/INCORRECT verdict, so it runs at
# temperature 0 to minimise run-to-run variance in the evaluation results.
judge_llm = init_chat_model(
    "llama3.2",  # qwen3:8b
    model_provider="ollama",
    base_url="host.docker.internal",
    temperature=0,
)

In [3]:
# The CSV was written together with its index, which would otherwise load as a
# spurious "Unnamed: 0" column; use that first column as the index instead.
golden_set = pd.read_csv("data/generated_golden_set_with_answers.csv", index_col=0)

In [4]:
# Preview only the first rows instead of dumping the whole frame.
golden_set.head(10)

Unnamed: 0,id,question,real_answer,content_used,llm_answer
0,0,Come posso configurare una nuova email ordinar...,Per configurare una nuova email ordinaria all'...,COME CONFIGUARE LE EMAIL E LA PEC,Per configurare una nuova email ordinaria nell...
1,1,"Non trovo la configurazione dell'email, mi aiuti?",Per configurare l'email accedi al portale e va...,COME CONFIGUARE LE EMAIL E LA PEC,"Per configurare l'email su Plum, segui questi ..."
2,2,Cosa fa esattamente il servizio TurboSMTP?,Il servizio TurboSMTP è un'alternativa all'inv...,COME CONFIGUARE LE EMAIL E LA PEC,Il servizio TurboSMTP è una funzionalità del p...
3,3,Dove si inseriscono le coordinate delle buste ...,Per inserire le coordinate delle buste da lett...,COME INSERIRE LE COORDINATE DELLA BUSTA DA LET...,Per inserire le coordinate delle buste da lett...
4,4,C'e' un modo per usare la Dymo per stampare le...,"Sì, la piattaforma permette di stampare le eti...",COME INSERIRE LE COORDINATE DELLE ETICHETTE,"Gentile utente, per stampare le etichette util..."
5,5,"Che differenza c'e' tra le 1, 2 e 3 piste dell...",Le piste rappresentano il numero di etichette ...,COME INSERIRE LE COORDINATE DELLE ETICHETTE,"Le etichette ""1"", ""2"" e ""3"" si riferiscono all..."
6,6,Come posso configurare un bollettino 123?,Per configurare i bollettini 123 accedi al men...,COME IMPOSTARE LE COORDINATE DEI BOLLETTINI 123,"Per configurare un bollettino 123 in Plum, puo..."
7,7,Dove si salvano i testi personalizzati per le ...,I testi personalizzati si configurano dal menu...,COME CONFIGUARE I TESTI PERSONALIZZATI,Per salvare i testi personalizzati per le emai...
8,8,Posso scrivere testi diversi per ogni sollecito?,"Sì, la piattaforma permette di impostare testi...",COME CONFIGUARE I TESTI PERSONALIZZATI,"No, non è possibile scrivere testi diversi per..."
9,9,Come cambio i dati del mittente sulle buste?,Per modificare i dati del mittente da stampare...,COME IMPOSTARE I DATI DEL MITTENTE,"Per cambiare i dati del mittente sulle buste, ..."


In [5]:
container.init_resources()

# Instantiate every provider declared on the container and run `setup()` on the
# ones that resolve to a Qdrant datasource.
# NOTE(review): this walks the class-level `Container.providers`, not the
# `container` instance initialised above — confirm both resolve to the same objects.
for make_dependency in Container.providers.values():
    dependency = make_dependency()
    if not isinstance(dependency, QdrantDatasource):
        continue
    dependency.setup()

  self.client = QdrantClient(


In [None]:
from langsmith import Client

client = Client(api_key=settings.LANGSMITH_API_KEY)

DATASET_NAME = "Generated FAQ dataset"

# Programmatically create a dataset in LangSmith
# For other dataset creation methods, see:
# https://docs.smith.langchain.com/evaluation/how_to_guides/manage_datasets_programmatically
# https://docs.smith.langchain.com/evaluation/how_to_guides/manage_datasets_in_application
#
# Creating a dataset whose name already exists fails, so reuse the existing one
# to keep this cell re-runnable.
if client.has_dataset(dataset_name=DATASET_NAME):
    dataset = client.read_dataset(dataset_name=DATASET_NAME)
else:
    dataset = client.create_dataset(
        dataset_name=DATASET_NAME,
        description="A sample dataset generated from LLM (llama 3.2).",
    )

# Create examples: the question is the input, the reference answer is the
# expected output, and the source row id / document title go into the metadata.
examples = [
    {
        "inputs": {"question": row["question"]},
        "outputs": {"answer": row["real_answer"]},
        "metadata": {"id": row["id"], "content_used": row["content_used"]},
    }
    for _, row in golden_set.iterrows()
]

# Add examples to the dataset, but only while it is still empty, so that
# re-running the cell does not upload duplicates.
dataset_is_empty = (
    next(iter(client.list_examples(dataset_id=dataset.id, limit=1)), None) is None
)
if dataset_is_empty:
    client.create_examples(dataset_id=dataset.id, examples=examples)

In [6]:
async def correct(outputs: dict, reference_outputs: dict) -> bool:
    """Grade one answer with the judge LLM.

    Args:
        outputs: Graph output state; the content of the last entry of
            ``outputs["messages"]`` is taken as the actual answer.
        reference_outputs: Reference outputs of the example; ``"answer"`` holds
            the expected answer.

    Returns:
        True when the judge's verdict is ``CORRECT``, False otherwise.
    """
    instructions = (
        "Given an actual answer and an expected answer, determine whether"
        " the actual answer contains all of the information in the"
        " expected answer. Respond with 'CORRECT' if the actual answer"
        " does contain all of the expected information and 'INCORRECT'"
        " otherwise. Do not include anything else in your response."
    )
    # Our graph outputs a State dictionary, which in this case means
    # we'll have a 'messages' key and the final message should
    # be our actual answer.
    actual_answer = outputs["messages"][-1].content
    expected_answer = reference_outputs["answer"]
    user_msg = f"ACTUAL ANSWER: {actual_answer}\n\nEXPECTED ANSWER: {expected_answer}"
    response = await judge_llm.ainvoke(
        [
            {"role": "system", "content": instructions},
            {"role": "user", "content": user_msg},
        ]
    )
    # The judge may wrap the verdict in whitespace, quotes, markdown emphasis or
    # a trailing period; strip those before comparing. The comparison stays
    # exact (not a substring test) because "INCORRECT" contains "CORRECT".
    verdict = response.content.strip().strip("'\"`*.!").strip().upper()
    return verdict == "CORRECT"

In [7]:
from plum_chatbot.agents.agents import _handle_input
from plum_chatbot.agents.my_agent import RagAgent


def example_to_state(inputs: dict) -> dict:
    """Wrap the example's ``"question"`` as a single user message in a state dict."""
    user_message = {"role": "user", "content": inputs["question"]}
    return {"messages": [user_message]}


async def to_agent_input(inputs: dict) -> dict:
    """Build the graph invocation kwargs for one dataset example.

    The example's ``"question"`` is wrapped in a ``UserInput`` with a fresh
    random thread id and passed to ``_handle_input`` together with the agent's
    graph. Only the first element of the returned triple is kept.
    """
    user_input = UserInput(message=inputs["question"], thread_id=str(uuid4()))
    kwargs, _run_id, _thread_id = await _handle_input(user_input, agent.graph)
    return kwargs


#     input = {"messages": [HumanMessage(content=inputs["question"])]}
#     run_id = uuid4()
#     thread_id = str(uuid4())
#     user_id = str(uuid4())
#     configurable = {
#         "thread_id": thread_id,
#         "model": "rag",
#         "user_id": user_id,
#     }
#     config = RunnableConfig(
#         configurable=configurable,
#         run_id=run_id,
#     )
#     kwargs = {
#         "input": input,
#         "config": config,
#     }
#     return kwargs


# Shared agent instance; `to_agent_input` and `target` read `agent.graph` at call time.
agent = RagAgent()


# Evaluation target: a plain async function rather than a composed runnable.
# It builds the invocation kwargs for one example and unpacks them into the
# agent graph's `ainvoke`.
async def target(inputs):
    """Run the agent graph on one dataset example and return the graph's result."""
    graph_kwargs = await to_agent_input(inputs)
    graph_result = await agent.graph.ainvoke(**graph_kwargs)
    return graph_result

In [9]:
from langsmith import aevaluate

# Evaluate `target` against the LangSmith dataset named below, grading with the
# `correct` evaluator.
# NOTE(review): the saved output of this cell shows `0it`, i.e. no examples were
# processed — confirm the dataset actually contains examples before relying on
# this experiment.
experiment_results = await aevaluate(
    target,
    data="Generated FAQ dataset",
    evaluators=[correct],
    max_concurrency=4,  # optional
    experiment_prefix="llama3.2",  # optional
)

View the evaluation results for experiment: 'llama3.2-376bee6b' at:
https://smith.langchain.com/o/7c3428ad-07e0-5237-b0f4-4dd618c80caa/datasets/047acf7b-9c53-43c9-a003-bbdc71caaba6/compare?selectedSessions=4abea81e-15a5-4933-9c36-fca8699094c4




0it [00:00, ?it/s]