In [1]:
from pathlib import Path
import json
import pandas as pd

# Root folder scanned for run sub-directories. This is a relative path, so it is
# resolved against the kernel's current working directory when it is used.
runs_dir = Path("../runs")

def latest_run_dir(runs_dir: Path) -> Path:
    """Return the most recently modified sub-directory of ``runs_dir``.

    Only immediate children that are directories are considered; plain files
    are ignored. "Most recent" is decided by each directory's ``st_mtime``.
    If several directories share the newest mtime, the first one yielded by
    ``iterdir()`` is returned.

    Args:
        runs_dir: Directory whose immediate children are scanned.

    Returns:
        The child directory with the largest modification time.

    Raises:
        FileNotFoundError: If ``runs_dir`` does not exist, or if it contains
            no sub-directories.
    """
    if not runs_dir.exists():
        raise FileNotFoundError(f"Runs directory not found: {runs_dir}")
    candidates = [p for p in runs_dir.iterdir() if p.is_dir()]
    if not candidates:
        raise FileNotFoundError(f"No run directories found under: {runs_dir}")
    # A single max() pass is enough; sorting the full list only to take its
    # first element does unnecessary work and hides the intent.
    return max(candidates, key=lambda p: p.stat().st_mtime)

# Pick the most recently modified run folder and report which one is in use.
run_dir = latest_run_dir(runs_dir)
print(f"Using run: {run_dir.name}")

# The selected run must contain an instances.json file; fail fast otherwise.
instances_path = run_dir / "instances.json"
if not instances_path.exists():
    raise FileNotFoundError(f"Missing instances.json at: {instances_path}")

# Read the whole file as UTF-8 text, then parse it as JSON.
with open(instances_path, encoding="utf-8") as f:
    data = json.loads(f.read())

# A missing or falsy "instances" entry is treated as an empty list.
instances = data.get("instances")
if not instances:
    instances = []

# Flatten nested records into columns (nested keys become dotted column names).
df = pd.json_normalize(instances)
print(df.shape)

Using run: 20260128_204323
(85, 54)


In [2]:
# Keep only rows whose canonical process is "hiring"; copy so later edits to
# df_hiring cannot touch df.
df_hiring = df.loc[df["canonical_process"].eq("hiring")].copy()

# Row counts: all instances vs. hiring instances.
(len(df), len(df_hiring))


(85, 31)

In [3]:
# Fraction of hiring rows with a non-null canonical current step id,
# rounded to two decimal places.
round(
    df_hiring["canonical_current_step_id"].notna().mean(),
    2,
)


np.float64(0.71)

In [4]:
# A raw step counts as present when `state.step` is non-null and is not blank
# once surrounding whitespace is stripped from its string form.
has_raw_step = (
    df_hiring["state.step"].notna()
    & df_hiring["state.step"].astype(str).str.strip().ne("")
)
# A canonical step counts as present when the canonical id is non-null.
has_canonical = df_hiring["canonical_current_step_id"].notna()

# One-row summary. Each value is the mean of a boolean mask, i.e. a fraction
# of hiring rows in [0, 1] (not multiplied by 100 despite the `_pct` suffix).
pd.DataFrame(
    [
        {
            "raw_step_present_pct": has_raw_step.mean(),
            "canonical_step_present_pct": has_canonical.mean(),
            "raw_but_not_canonical_pct": (has_raw_step & ~has_canonical).mean(),
            "neither_raw_nor_canonical_pct": (~(has_raw_step | has_canonical)).mean(),
        }
    ]
)


Unnamed: 0,raw_step_present_pct,canonical_step_present_pct,raw_but_not_canonical_pct,neither_raw_nor_canonical_pct
0,0.935484,0.709677,0.225806,0.064516


In [5]:
# Same one-row summary of step-presence fractions, emitted as a list of plain
# dict records so the unrounded values are visible.
pd.DataFrame(
    [
        {
            "raw_step_present_pct": has_raw_step.mean(),
            "canonical_step_present_pct": has_canonical.mean(),
            "raw_but_not_canonical_pct": (has_raw_step & ~has_canonical).mean(),
            "neither_raw_nor_canonical_pct": (~(has_raw_step | has_canonical)).mean(),
        }
    ]
).to_dict(orient="records")

[{'raw_step_present_pct': 0.9354838709677419,
  'canonical_step_present_pct': 0.7096774193548387,
  'raw_but_not_canonical_pct': 0.22580645161290322,
  'neither_raw_nor_canonical_pct': 0.06451612903225806}]

In [6]:
# Frequency of each canonical step match type among hiring rows; missing
# values are counted as their own bucket rather than dropped.
(
    df_hiring
    .loc[:, "canonical_current_step_match_type"]
    .value_counts(dropna=False)
)


canonical_current_step_match_type
alias    12
fuzzy    10
none      9
Name: count, dtype: int64

In [7]:
# Hiring rows that have a raw step but no canonical step id.
failed = df_hiring.loc[has_raw_step & ~has_canonical]

# Most frequent raw step labels among those rows (at most 30 shown).
failed["state.step"].value_counts().iloc[:30]


state.step
candidate fit assessment / potential referral       1
Drafting job description and finalizing terms       1
candidate follow-up / hold                          1
candidate hired / start date scheduled              1
HR interview coordination / challenge resolution    1
role scope confirmation                             1
Gather and send prior JD and top priorities         1
Name: count, dtype: int64

In [8]:
# Hiring rows that do have a canonical step id.
matched = df_hiring.loc[has_canonical]

# Side-by-side view of the raw label, the canonical id it was given, and the
# match type recorded for it (first 30 rows).
matched.loc[
    :,
    ["state.step", "canonical_current_step_id", "canonical_current_step_match_type"],
].head(30)


Unnamed: 0,state.step,canonical_current_step_id,canonical_current_step_match_type
0,interview scheduling / candidate responsiveness,interviews,fuzzy
1,post-interview feedback,interviews,alias
2,Client review / interview coordination and fee...,interviews,fuzzy
3,candidate rejected / not moving forward,close,fuzzy
5,awaiting candidate responses,interviews,alias
7,interview scheduled,interviews,alias
9,Client search on hold (awaiting reopening to p...,interviews,alias
10,candidate selection,offer,alias
11,candidate interview feedback shared,interviews,fuzzy
12,interview scheduled,interviews,alias


In [9]:
# Per-client fraction of hiring rows with a canonical step id, highest first,
# limited to the top 20 clients.
(
    has_canonical.rename("has_canonical")
    .groupby(df_hiring["canonical_client"])
    .mean()
    .sort_values(ascending=False)
    .head(20)
)


canonical_client
Public Relay    1.000000
Qb              1.000000
Chromatics      0.846154
Celara Labs     0.000000
Kerry           0.000000
QU              0.000000
Name: has_canonical, dtype: float64