# Support Response Evals (Casper's Kitchens)

This notebook shows how to evaluate support agent responses with MLflow judges and compare the judge outcomes against the human thumbs-up/thumbs-down ratings captured in the support app.

## What this covers

1. Build an evaluation dataset from support requests and latest agent responses
2. Load human ratings (`thumbs_up` / `thumbs_down`) from Lakebase
3. Run `mlflow.genai.evaluate()` with an LLM judge
4. Compute `human_agreement_rate` between judge outcomes and app ratings
5. Review results in the Databricks UI (MLflow traces + evaluation views)

In [None]:
%pip install -U -qqqq mlflow[databricks] databricks-sdk psycopg2-binary pandas
%restart_python

In [None]:
import json
import pandas as pd
import mlflow
import psycopg2
from databricks.sdk import WorkspaceClient
from mlflow.genai.judges import make_judge

# Notebook parameters, declared as text widgets: (name, default value, label).
for widget_name, widget_default, widget_label in (
    ("CATALOG", "caspersdev", "UC Catalog"),
    ("JUDGE_MODEL", "databricks-gpt-5-mini", "Judge model endpoint"),
    ("EVAL_LIMIT", "50", "Max examples"),
):
    dbutils.widgets.text(widget_name, widget_default, widget_label)

CATALOG = dbutils.widgets.get("CATALOG")
JUDGE_MODEL = dbutils.widgets.get("JUDGE_MODEL")

# A blank EVAL_LIMIT widget falls back to the default of 50 examples.
eval_limit_text = dbutils.widgets.get("EVAL_LIMIT")
EVAL_LIMIT = int(eval_limit_text if eval_limit_text else "50")

# The experiment path is derived from the catalog name.
experiment_path = f"/Shared/{CATALOG}_support_response_evals"
experiment = mlflow.set_experiment(experiment_path)
print(f"Using experiment: {experiment.name}")

In [None]:
# Keep only the newest agent report per support request (rn = 1 after ranking
# by ts descending), attach the request text via a LEFT JOIN, and cap the
# result at EVAL_LIMIT rows, newest first.
latest_responses_sql = f"""
WITH latest_reports AS (
  SELECT support_request_id, user_id, order_id, ts, agent_response,
         ROW_NUMBER() OVER (PARTITION BY support_request_id ORDER BY ts DESC) AS rn
  FROM {CATALOG}.support.support_agent_reports_sync
)
SELECT
  l.support_request_id,
  l.user_id,
  l.order_id,
  l.ts,
  l.agent_response,
  r.request_text
FROM latest_reports l
LEFT JOIN {CATALOG}.support.raw_support_requests r
  ON r.support_request_id = l.support_request_id
WHERE l.rn = 1
ORDER BY l.ts DESC
LIMIT {EVAL_LIMIT}
"""
support_df = spark.sql(latest_responses_sql).toPandas()

print(f"Loaded {len(support_df)} support responses")
support_df.head(3)

In [None]:
# Load the latest human rating per support request from the Lakebase table
# support.response_ratings.

# Single source of truth for the ratings schema, shared by the empty fallback
# frame and the frame built from the query result.
RATING_COLUMNS = ["support_request_id", "rating", "reason_code", "feedback_notes", "actor", "created_at"]

# Most recently updated Lakebase connection settings for this catalog.
config_rows = spark.sql(f"""
SELECT endpoint_name, endpoint_host, database_name
FROM {CATALOG}.support.lakebase_v2_config
ORDER BY updated_at DESC
LIMIT 1
""").collect()
if not config_rows:
    # Fail with an actionable message instead of a bare IndexError.
    raise RuntimeError(f"No Lakebase configuration found in {CATALOG}.support.lakebase_v2_config")
config_row = config_rows[0]

endpoint_name = config_row["endpoint_name"]
endpoint_host = config_row["endpoint_host"]
database_name = config_row["database_name"]

# Connect as the current workspace user, using the generated credential's
# token as the Postgres password.
w = WorkspaceClient()
creds = w.postgres.generate_database_credential(endpoint=endpoint_name)
current_user = w.current_user.me().user_name

# Start from an empty frame so downstream cells still work when ratings
# cannot be loaded.
ratings_df = pd.DataFrame(columns=RATING_COLUMNS)

try:
    conn = psycopg2.connect(
        host=endpoint_host,
        port=5432,
        dbname=database_name,
        user=current_user,
        password=creds.token,
        sslmode="require",
    )
    try:
        with conn.cursor() as cur:
            cur.execute(
                """
                SELECT support_request_id, rating, reason_code, feedback_notes, actor, created_at
                FROM support.response_ratings
                ORDER BY created_at DESC
                """
            )
            rows = cur.fetchall()
    finally:
        # Release the connection even when the query fails; previously a
        # failing execute/fetchall skipped close() and leaked the connection.
        conn.close()
    ratings_df = pd.DataFrame(rows, columns=RATING_COLUMNS)
except Exception as e:
    # Deliberately best-effort: on any failure, report it and continue with
    # the empty frame so the rest of the notebook can still run.
    print(f"Could not load ratings yet: {e}")

# Keep only the newest rating for each support request.
if not ratings_df.empty:
    ratings_df = (
        ratings_df.sort_values("created_at", ascending=False)
        .drop_duplicates(subset=["support_request_id"], keep="first")
        .reset_index(drop=True)
    )

print(f"Loaded {len(ratings_df)} latest human ratings")
ratings_df.head(3)

In [None]:
def parse_report(raw):
    """Return the agent report as a dict, or ``{}`` when it cannot be used as one.

    Args:
        raw: The stored agent response; a dict, a JSON string, or any other value.

    Returns:
        dict: ``raw`` itself when it is already a dict; the decoded object when
        ``raw`` is a string holding a JSON object; ``{}`` for everything else
        (invalid JSON, JSON that is not an object, or a non-string value).
    """
    if isinstance(raw, dict):
        return raw
    if isinstance(raw, str):
        try:
            parsed = json.loads(raw)
        except (ValueError, RecursionError):
            # Invalid JSON (JSONDecodeError is a ValueError) or pathological nesting.
            return {}
        # Valid JSON is not necessarily an object: "null", "[1]" or "3" decode
        # to non-dicts, and callers immediately call .get() on the result.
        return parsed if isinstance(parsed, dict) else {}
    return {}

def _to_eval_record(row):
    """Turn one support_df row into an ``{"inputs": ..., "outputs": ...}`` record."""
    report = parse_report(row.get("agent_response"))
    case_inputs = {
        "support_request_id": row.get("support_request_id"),
        "order_id": row.get("order_id"),
        # Falsy request text (e.g. None from the LEFT JOIN) becomes "".
        "request_text": row.get("request_text") or "",
    }
    # Fields copied from the parsed report; absent keys come through as None.
    agent_outputs = {
        field: report.get(field)
        for field in (
            "draft_response",
            "refund_recommendation",
            "credit_recommendation",
            "decision_confidence",
            "escalation_flag",
        )
    }
    return {"inputs": case_inputs, "outputs": agent_outputs}


eval_data = [_to_eval_record(row) for _, row in support_df.iterrows()]

print(f"Prepared {len(eval_data)} eval rows")

In [None]:
# Rubric handed to the judge; it references the {{ inputs }} and {{ outputs }}
# template fields and asks for a PASS/FAIL verdict.
support_policy_instructions = """
Evaluate whether the support response is policy-compliant and actionable.

Support case input: {{ inputs }}
Agent output: {{ outputs }}

Pass criteria:
- Draft response is professional and directly addresses the complaint.
- If refund/credit is recommended, amount or rationale is coherent.
- Response does not expose internal system identifiers.
- Escalation flag and confidence are consistent with severity.

Return PASS only when all major criteria are satisfied. Otherwise return FAIL.
"""

# Judge model URI built from the JUDGE_MODEL widget value.
judge_model_uri = f"databricks:/{JUDGE_MODEL}"

policy_compliance_judge = make_judge(
    name="support_policy_compliance",
    instructions=support_policy_instructions,
    model=judge_model_uri,
)

# Score the prepared records with the single policy-compliance judge.
evaluation_scorers = [policy_compliance_judge]
evaluation_result = mlflow.genai.evaluate(data=eval_data, scorers=evaluation_scorers)

print("Evaluation completed")
evaluation_result

In [None]:
def to_pandas_table(obj):
    """Coerce a result table to a pandas DataFrame when possible.

    A pandas DataFrame is returned unchanged, an object exposing ``toPandas``
    is converted by calling it, and anything else (including None) yields None.
    """
    if isinstance(obj, pd.DataFrame):
        return obj
    return obj.toPandas() if hasattr(obj, "toPandas") else None

# Take the first table in EvaluationResult.tables that converts to a
# non-empty pandas DataFrame.
result_tables = getattr(evaluation_result, "tables", None)
if not isinstance(result_tables, dict):
    result_tables = {}
converted_tables = (to_pandas_table(tbl) for tbl in result_tables.values())
eval_df = next((tbl for tbl in converted_tables if tbl is not None and len(tbl) > 0), None)

if eval_df is None:
    raise RuntimeError("Could not extract evaluation rows from EvaluationResult.tables")

# The judge verdict column is found by name: it mentions the judge and ends
# with "value". The inputs column must be called exactly "inputs".
judge_value_col = None
for column_name in eval_df.columns:
    if "support_policy_compliance" in column_name and column_name.endswith("value"):
        judge_value_col = column_name
        break
inputs_col = "inputs" if "inputs" in eval_df.columns else None

if judge_value_col is None or inputs_col is None:
    # Show what is available so the column lookup can be adjusted.
    print("Available columns:", list(eval_df.columns))
    raise RuntimeError("Expected evaluation columns were not found")

def parse_inputs(value):
    """Return an evaluation row's inputs as a dict, or ``{}`` when unusable.

    Args:
        value: The inputs cell of an evaluation row; a dict, a JSON string, or
            any other value.

    Returns:
        dict: ``value`` itself when it is already a dict; the decoded object
        when ``value`` is a string holding a JSON object; ``{}`` for everything
        else (invalid JSON, JSON that is not an object, or a non-string value).
    """
    if isinstance(value, dict):
        return value
    if isinstance(value, str):
        try:
            parsed = json.loads(value)
        except (ValueError, RecursionError):
            # Invalid JSON (JSONDecodeError is a ValueError) or pathological nesting.
            return {}
        # JSON such as "null" or "[1]" decodes to a non-dict; the caller
        # immediately calls .get() on the result, so normalise to {}.
        return parsed if isinstance(parsed, dict) else {}
    return {}

# Recover which support request each evaluated row belongs to.
eval_df["support_request_id"] = eval_df[inputs_col].apply(lambda v: parse_inputs(v).get("support_request_id"))

# Translate the judge verdict into the app's rating vocabulary. Anything other
# than PASS/FAIL (after trimming and upper-casing) maps to NaN.
eval_df["judge_label"] = (
    eval_df[judge_value_col]
    .astype(str)
    .str.strip()
    .str.upper()
    .map({"PASS": "thumbs_up", "FAIL": "thumbs_down"})
)

if ratings_df.empty:
    print("No human ratings yet. Submit ratings in the support app, then rerun this cell.")
else:
    # NOTE(review): assumes support_request_id has a compatible dtype on both
    # sides of the join; confirm against the Lakebase column type.
    merged = eval_df[["support_request_id", "judge_label"]].merge(
        ratings_df[["support_request_id", "rating"]],
        on="support_request_id",
        how="inner",
    )
    if len(merged) == 0:
        print("No overlap yet between evaluated rows and rated rows.")
    else:
        # Rows without a PASS/FAIL verdict are not disagreements; counting them
        # as such would silently deflate the agreement rate. Exclude and report them.
        has_verdict = merged["judge_label"].notna()
        missing_verdicts = int((~has_verdict).sum())
        if missing_verdicts:
            print(f"Excluding {missing_verdicts} rated rows without a PASS/FAIL judge verdict.")
        merged = merged.loc[has_verdict].copy()

        if merged.empty:
            print("No rated rows have a PASS/FAIL judge verdict yet.")
        else:
            merged["agrees"] = merged["judge_label"] == merged["rating"]
            human_agreement_rate = float(merged["agrees"].mean())
            print(f"human_agreement_rate: {human_agreement_rate:.2%} ({merged['agrees'].sum()}/{len(merged)})")
            display(merged.head(20))

## Where to review in Databricks UI

1. **Support app UI**
   - Open a support request in the details drawer.
   - Use **Rate Agent Response** to save thumbs up/down (+ optional reason and notes).

2. **Lakebase ratings table**
   - Query `support.response_ratings` to audit latest operator feedback per request.

3. **MLflow experiment view** (`/Shared/<CATALOG>_support_response_evals`)
   - **Runs**: see each `mlflow.genai.evaluate` execution.
   - **Evaluation** tab: see scorer pass/fail values and judge rationales.
   - **Traces** tab / trace detail: inspect per-example records and add human assessments if you want judge alignment.

4. **Judge alignment (optional)**
   - After adding human assessments in trace details, align the template judge with `judge.align(...)` in a follow-up run.