### Release notes

In this version, PROMPT was changed to:

```
你是一名准入判别代理,负责判断用户输入是否在系统定义的领域与可执行范围内

# 系统领域范围:
你属于实验室智能机器人, 你的名字是Talos, 这个系统是面向小分子合成及 DMPK 实验室的对话式助手,具备机器人任务布置、化学专业问答与实验室运营查询三类核心可执行。
机器人可执行/支持: TLC 点板、过柱、LC-MS 前处理与送样、旋蒸、称重入库,以及贯穿纯化流程;另支持 DMPK 稳定性测试。
专业问答: TLC 条件设计、过柱条件推荐、旋蒸条件推荐、物质属性查询。
运营查询: 实验任务进度、机器人状态、仪器状态、物料位置与状态。

# 判断依据
1. 领域为小分子合成和 DMPK 场景, 任何超出该范围, 或没有明确表明意图的输入, 视为越界和超越范围。
2. 可执行为机器人任务布置、化学专业问答与实验室运营查询三类核心可执行。
3. 任何超出该范围, 或没有明确表明意图的输入, 视为越界和超越范围。

# 输出格式
请严格按照以下 JSON 格式输出结果:
{
    "within_domain": bool,  // 用户输入是否在系统定义的领域范围内
    "within_capacity": bool, // 用户输入是否在系统定义的可执行范围内
    "feedback": str         // 对判别结果的简要说明
}

# 示例
示例 1:
用户输入: "请帮我设计一个 TLC 条件"
输出: {
    "within_domain": true,
    "within_capacity": true,
    "feedback": "用户输入在小分子合成领域内, 且符合系统能力范围."
}

示例 2:
用户输入: "你能告诉我今天的天气吗?"
输出: {
    "within_domain": false,
    "within_capacity": false,
    "feedback": "用户输入不在小分子合成和 DMPK 领域内, 超出系统能力范围."
}

示例 3:
用户输入: "帮我做一下萃取"
输出: {
    "within_domain": true,
    "within_capacity": false,
    "feedback": "用户输入在小分子合成领域内, 但不在系统可执行范围内."
}
```



## Prepare dataset

In the following cells, we read the JSON files in the `data` folder and convert each one into a pandas DataFrame. Each file's `senario` property is used as the DataFrame's name (its key in `dfs`), and each entry in `examples` becomes one row.

Fetch a DataFrame with `dfs['<senario>']`.

In [None]:
import sys

sys.path.append("../../")  # import src

In [None]:
import json
from pathlib import Path

import pandas as pd

def _find_data_dir() -> Path:
    """Return the nearest ``src/data`` folder at or above the working directory.

    Searching upwards keeps the notebook independent of any one machine's
    absolute paths, whether the kernel runs from the notebook's folder or
    from a parent folder.

    Raises:
        FileNotFoundError: if no ancestor of the working directory contains
            a ``src/data`` directory.
    """
    cwd = Path.cwd()
    for base in (cwd, *cwd.parents):
        candidate = base / "src" / "data"
        if candidate.is_dir():
            return candidate
    raise FileNotFoundError(f"Could not find a 'src/data' directory at or above {cwd}")


data_dir = _find_data_dir()

# Ground-truth labels implied by each scenario file:
# scenario name -> (target_within_domain, target_within_capacity).
SCENARIO_TARGETS: dict[str, tuple[bool, bool]] = {
    "valid_can_do": (True, True),
    "valid_cannot_do": (True, False),
    "invalid": (False, False),
}

dfs = {}
# Sorted so the load order (and therefore any error raised) is reproducible.
for json_path in sorted(data_dir.glob("*.json")):
    with json_path.open(encoding="utf-8") as f:
        obj = json.load(f)
    senario_name = obj["senario"]  # the data files spell the key "senario"

    cdf = pd.DataFrame(obj["examples"])

    if senario_name not in SCENARIO_TARGETS:
        raise ValueError(f"Unknown senario: {senario_name}")
    target_domain, target_capacity = SCENARIO_TARGETS[senario_name]
    # Scalar assignment broadcasts the expected label to every example row.
    cdf["target_within_domain"] = target_domain
    cdf["target_within_capacity"] = target_capacity

    dfs[senario_name] = cdf

# Fail with a clear message instead of a bare KeyError on the lookup below.
if not dfs:
    raise FileNotFoundError(f"No scenario JSON files found in {data_dir}")

# Example
dfs["valid_can_do"].head()

## Benchmark

Used to evaluate the performance of each node.

In [None]:
from typing import TYPE_CHECKING

from langchain_core.messages import AIMessage, AnyMessage, HumanMessage

from src.functions.admittance import WatchDogAgent

if TYPE_CHECKING:
    from src.classes.operation import OperationResponse
    from src.classes.system_state import UserAdmittance

watch_dog = WatchDogAgent()


def run_watch_dog_agent(dataset: pd.DataFrame) -> pd.DataFrame:
    within_domain_list: list[bool] = []
    within_capacity_list: list[bool] = []
    feedback_list: list[str] = []

    for _, row in dataset.iterrows():
        input_msg = []
        for msg in row["dialogue"]:
            if msg["role"] == "assistant":
                input_msg.append(AIMessage(content=msg["message"]))
            elif msg["role"] == "user":
                input_msg.append(HumanMessage(content=msg["message"]))

        res: OperationResponse[list[AnyMessage], UserAdmittance] = watch_dog.run(
            user_input=input_msg,
            stream_mode=False,
        )
        within_domain_list.append(res.output.within_domain)
        within_capacity_list.append(res.output.within_capacity)
        feedback_list.append(res.output.feedback)

    dataset = dataset.copy()
    dataset["within_domain"] = within_domain_list
    dataset["within_capacity"] = within_capacity_list
    dataset["feedback"] = feedback_list
    return dataset

Valid Can Do

In [None]:
# Evaluate the watchdog agent on the `valid_can_do` dataset; the returned copy
# carries the agent's within_domain / within_capacity / feedback columns.

eval_df = run_watch_dog_agent(dfs["valid_can_do"])

In [None]:
# Add configuration information and show
version_control = {
    "model": "gpt-5.1",
    "prompt_version": "v1",
}

# Give every row its own copy of the metadata: `[d] * n` would make all rows
# reference one shared dict, so editing a single cell would change every row.
# The length comes from the frame being written to, so it always matches.
eval_df["version_control"] = [version_control.copy() for _ in range(len(eval_df))]

eval_df.head()

valid_cannot_do

In [None]:
# Evaluate the watchdog agent on the `valid_cannot_do` dataset.
eval_valid_cannot_do = run_watch_dog_agent(dfs["valid_cannot_do"])

# Configuration that produced these results, stored alongside every row.
version_control = {
    "model": "gpt-5.1",
    "prompt_version": "v1",
}

# Give every row its own copy of the metadata: `[d] * n` would make all rows
# reference one shared dict, so editing a single cell would change every row.
eval_valid_cannot_do["version_control"] = [
    version_control.copy() for _ in range(len(eval_valid_cannot_do))
]

eval_valid_cannot_do.head()

Invalid

In [None]:
# Evaluate the watchdog agent on the `invalid` dataset.
eval_invalid = run_watch_dog_agent(dfs["invalid"])

# Configuration that produced these results, stored alongside every row.
version_control = {
    "model": "gpt-5.1",
    "prompt_version": "v1",
}

# Give every row its own copy of the metadata: `[d] * n` would make all rows
# reference one shared dict, so editing a single cell would change every row.
eval_invalid["version_control"] = [version_control.copy() for _ in range(len(eval_invalid))]

eval_invalid.head()

In [None]:
import matplotlib.pyplot as plt  # pyright: ignore[reportMissingImports]
import seaborn as sns  # pyright: ignore[reportMissingImports]
from sklearn.metrics import classification_report, confusion_matrix  # pyright: ignore[reportMissingImports]

# Stack the three per-scenario result frames into one, tagging each row with
# the scenario it came from, so metrics can be computed over all data at once.
labelled_frames = [
    frame.assign(senario=scenario)
    for scenario, frame in (
        ("valid_can_do", eval_df),
        ("valid_cannot_do", eval_valid_cannot_do),
        ("invalid", eval_invalid),
    )
]
combined_eval = pd.concat(labelled_frames, ignore_index=True)


def compute_binary_metrics(df: pd.DataFrame, target_col: str, pred_col: str) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Compare a prediction column against its target column.

    Args:
        df: Frame holding both columns.
        target_col: Name of the ground-truth column.
        pred_col: Name of the predicted column.

    Returns:
        A ``(report, confusion)`` pair. ``report`` is the transposed
        ``classification_report`` dict (computed with ``zero_division=0``).
        ``confusion`` is the confusion matrix with label order
        ``[True, False]``, rows named ``actual_*`` and columns named ``pred_*``.
    """
    actual, predicted = df[target_col], df[pred_col]

    report_frame = pd.DataFrame(
        classification_report(actual, predicted, output_dict=True, zero_division=0)
    ).transpose()

    # Fixing the label order keeps the matrix layout aligned with the
    # row/column names below.
    matrix_frame = pd.DataFrame(
        confusion_matrix(actual, predicted, labels=[True, False]),
        index=pd.Index(["actual_true", "actual_false"]),
        columns=pd.Index(["pred_true", "pred_false"]),
    )
    return report_frame, matrix_frame


# Metrics for the two binary decisions the agent makes, over all scenarios.
domain_report, domain_cm = compute_binary_metrics(
    combined_eval, target_col="target_within_domain", pred_col="within_domain"
)
capacity_report, capacity_cm = compute_binary_metrics(
    combined_eval, target_col="target_within_capacity", pred_col="within_capacity"
)

for metrics_table in (domain_report, capacity_report):
    display(metrics_table)

# One heatmap per decision, side by side.
fig, axes = plt.subplots(1, 2, figsize=(10, 4))

heatmap_specs = (
    (domain_cm, "Blues", "Within Domain Confusion Matrix"),
    (capacity_cm, "Greens", "Within Capacity Confusion Matrix"),
)
for panel, (matrix, palette, title) in zip(axes, heatmap_specs):
    sns.heatmap(matrix, annot=True, fmt="d", cmap=palette, ax=panel)
    panel.set_title(title)
    panel.set_xlabel("Predicted")
    panel.set_ylabel("Actual")

plt.tight_layout()
plt.show()