In [1]:
from pathlib import Path
import json
import pandas as pd

# Root folder that is searched for run sub-directories (relative to the working directory).
runs_dir = Path("..") / "runs"

def latest_run_dir(runs_dir: Path) -> Path:
    """Return the most recently modified sub-directory of ``runs_dir``.

    Only immediate children that are directories are considered; plain files
    are ignored.

    Args:
        runs_dir: Directory whose immediate sub-directories are treated as runs.

    Returns:
        The sub-directory with the greatest ``st_mtime``.

    Raises:
        FileNotFoundError: If ``runs_dir`` does not exist, or if it contains
            no sub-directories.
        NotADirectoryError: If ``runs_dir`` exists but is not a directory.
    """
    if not runs_dir.exists():
        raise FileNotFoundError(f"Runs directory not found: {runs_dir}")
    if not runs_dir.is_dir():
        # Fail with an explicit message instead of letting iterdir() raise.
        raise NotADirectoryError(f"Runs path is not a directory: {runs_dir}")
    candidates = [p for p in runs_dir.iterdir() if p.is_dir()]
    if not candidates:
        raise FileNotFoundError(f"No run directories found under: {runs_dir}")
    # max() makes a single pass; on tied modification times the first
    # candidate in iteration order wins.
    return max(candidates, key=lambda p: p.stat().st_mtime)

# Pick the most recently modified run and report which one is being analysed.
run_dir = latest_run_dir(runs_dir)
print(f"Using run: {run_dir.name}")

# Stop early with a clear message when the run has no instances.json.
instances_path = run_dir / "instances.json"
if not instances_path.exists():
    raise FileNotFoundError(f"Missing instances.json at: {instances_path}")

with open(instances_path, mode="r", encoding="utf-8") as f:
    data = json.load(f)

# A missing, null, or empty "instances" entry is treated as an empty list.
instances = data.get("instances") or []

# Flatten the instance records into a DataFrame and show its (rows, columns).
df = pd.json_normalize(instances)
print(df.shape)

Using run: 20260128_213740
(85, 70)


In [10]:
# Independent copy of the rows whose canonical process is "recruiting".
df_recruiting = df.loc[df["canonical_process"].eq("recruiting")].copy()

# (total rows, recruiting rows)
(len(df), len(df_recruiting))


(85, 31)

In [11]:
# Fraction of recruiting rows that have a canonical step id, rounded to two decimals.
round(df_recruiting["canonical_current_step_id"].notna().mean(), 2)


np.float64(0.26)

In [12]:
# A raw step counts as present when it is non-null and not blank after stripping whitespace.
has_raw_step = (
    df_recruiting["state.step"].notna()
    & df_recruiting["state.step"].astype(str).str.strip().ne("")
)
# A canonical step counts as present when the canonical step id is non-null.
has_canonical = df_recruiting["canonical_current_step_id"].notna()

# One-row summary. Each value is the mean of a boolean mask, i.e. a fraction
# in [0, 1] rather than a percentage, despite the "_pct" suffix.
pd.DataFrame(
    [
        {
            "raw_step_present_pct": has_raw_step.mean(),
            "canonical_step_present_pct": has_canonical.mean(),
            "raw_but_not_canonical_pct": (has_raw_step & ~has_canonical).mean(),
            "neither_raw_nor_canonical_pct": (~(has_raw_step | has_canonical)).mean(),
        }
    ]
)


Unnamed: 0,raw_step_present_pct,canonical_step_present_pct,raw_but_not_canonical_pct,neither_raw_nor_canonical_pct
0,0.935484,0.258065,0.677419,0.064516


In [13]:
# Same one-row coverage summary, emitted as a list of plain dicts.
# Each value is the mean of a boolean mask (a fraction in [0, 1]).
pd.DataFrame(
    [
        {
            "raw_step_present_pct": has_raw_step.mean(),
            "canonical_step_present_pct": has_canonical.mean(),
            "raw_but_not_canonical_pct": (has_raw_step & ~has_canonical).mean(),
            "neither_raw_nor_canonical_pct": (~(has_raw_step | has_canonical)).mean(),
        }
    ]
).to_dict(orient="records")

[{'raw_step_present_pct': 0.9354838709677419,
  'canonical_step_present_pct': 0.25806451612903225,
  'raw_but_not_canonical_pct': 0.6774193548387096,
  'neither_raw_nor_canonical_pct': 0.06451612903225806}]

In [14]:
# Count recruiting rows per match type; missing values get their own bucket.
(
    df_recruiting["canonical_current_step_match_type"]
    .value_counts(dropna=False)
)


canonical_current_step_match_type
none     23
fuzzy     8
Name: count, dtype: int64

In [15]:
# Rows that carry a raw step string but have no canonical step id.
failed = df_recruiting.loc[has_raw_step & ~has_canonical]

# Frequency of each raw step label among those rows, limited to the first 30 entries.
failed["state.step"].value_counts().iloc[:30]


state.step
interview scheduled                                                   2
Client review / feedback on candidate CV and next steps               1
tech screen decision                                                  1
handoff/intro                                                         1
notes/documentation shared                                            1
role scope clarification                                              1
HR interview coordination and candidate challenge resolution          1
hired                                                                 1
sourcing_outreach                                                     1
candidate follow-up / decision pending                                1
candidate outreach decision                                           1
client decision / rejection                                           1
Drafting job description and finalizing terms                         1
searches ongoing / starting new searches             

In [16]:
# Rows that do have a canonical step id.
matched = df_recruiting.loc[has_canonical]

# Raw step next to the canonical id and match type, first 30 rows.
matched.loc[
    :,
    ["state.step", "canonical_current_step_id", "canonical_current_step_match_type"],
].head(30)


Unnamed: 0,state.step,canonical_current_step_id,canonical_current_step_match_type
0,interview scheduling / candidate engagement,interview-scheduling,fuzzy
1,post-interview feedback,client-interview-feedback,fuzzy
15,Offer stage (offer made; paperwork/official of...,offer-letter,fuzzy
16,offer letter drafting/amendment,offer-letter,fuzzy
18,post-offer / onboarding next steps,admin-onboarding-process,fuzzy
19,candidate fit assessment / potential referral,referral,fuzzy
61,Candidate role assignment in Lever,lever,fuzzy
72,Interview scheduling request,interview-scheduling,fuzzy


In [17]:
# Fraction of rows with a canonical step id per client, highest first (at most 20 clients).
# The mask is renamed so the resulting Series is labelled "has_canonical".
(
    has_canonical.rename("has_canonical")
    .groupby(df_recruiting["canonical_client"])
    .mean()
    .sort_values(ascending=False)
    .head(20)
)


canonical_client
Celara Labs     0.500000
Chromatics      0.307692
Kerry           0.000000
Public Relay    0.000000
QU              0.000000
Qb              0.000000
Name: has_canonical, dtype: float64