In [1]:
from pathlib import Path
import json
import pandas as pd

# Folder searched for run sub-directories. The path is relative, so it is
# resolved against the kernel's current working directory.
runs_dir = Path("../runs")

def latest_run_dir(runs_dir: Path) -> Path:
    """Return the most recently modified sub-directory of ``runs_dir``.

    Args:
        runs_dir: Directory whose immediate sub-directories are the candidates.

    Returns:
        The sub-directory with the greatest ``st_mtime``. When several share
        the same modification time, the one whose name sorts last is chosen,
        so the result does not depend on directory-listing order.

    Raises:
        FileNotFoundError: If ``runs_dir`` does not exist, or if it contains
            no sub-directories.
    """
    if not runs_dir.exists():
        raise FileNotFoundError(f"Runs directory not found: {runs_dir}")
    candidates = [p for p in runs_dir.iterdir() if p.is_dir()]
    if not candidates:
        raise FileNotFoundError(f"No run directories found under: {runs_dir}")
    # iterdir() yields entries in arbitrary order, so ties on mtime are broken
    # by name to keep the selection deterministic. max() also avoids sorting
    # the whole list just to take one element.
    return max(candidates, key=lambda p: (p.stat().st_mtime, p.name))

# Pick the most recently modified run and report which one is in use.
run_dir = latest_run_dir(runs_dir)
print(f"Using run: {run_dir.name}")

# The run must contain an instances.json file; fail early if it does not.
instances_path = run_dir / "instances.json"
if not instances_path.exists():
    raise FileNotFoundError(f"Missing instances.json at: {instances_path}")

with instances_path.open(encoding="utf-8") as f:
    data = json.load(f)

# A missing, null, or empty "instances" entry is treated as an empty list.
instances = data.get("instances")
if not instances:
    instances = []

# Flatten the nested records into columns (nested keys become dotted names,
# e.g. the "state.step" column used by later cells).
df = pd.json_normalize(instances)
print(df.shape)

Using run: 20260128_192135
(85, 54)


In [2]:
# Keep only rows whose canonical process is "hiring"; .copy() makes the subset
# an independent frame rather than a view-like slice of `df`.
df_hiring = df.loc[df["canonical_process"].eq("hiring")].copy()
# Row counts: all instances vs. hiring instances.
df.shape[0], df_hiring.shape[0]


(85, 31)

In [4]:
# Fraction of hiring rows with a non-null canonical current step id,
# rounded to two decimals.
round((df_hiring["canonical_current_step_id"].notna()).mean(), 2)


np.float64(0.52)

In [5]:
# A raw step counts as present when it is non-null and not blank after
# stripping whitespace.
has_raw_step = (
    df_hiring["state.step"].notna()
    & df_hiring["state.step"].astype(str).str.strip().ne("")
)
# A canonical step counts as present when the canonical id is non-null.
has_canonical = df_hiring["canonical_current_step_id"].notna()

# One-row summary. Note: despite the "_pct" suffix, each value is the mean of
# a boolean mask, i.e. a fraction between 0 and 1, not a percentage.
pd.DataFrame([
    {
        "raw_step_present_pct": has_raw_step.mean(),
        "canonical_step_present_pct": has_canonical.mean(),
        "raw_but_not_canonical_pct": (has_raw_step & ~has_canonical).mean(),
        "neither_raw_nor_canonical_pct": (~(has_raw_step | has_canonical)).mean(),
    }
])


Unnamed: 0,raw_step_present_pct,canonical_step_present_pct,raw_but_not_canonical_pct,neither_raw_nor_canonical_pct
0,0.935484,0.516129,0.419355,0.064516


In [6]:
# Count hiring rows per canonical step match type; dropna=False also counts
# rows where the match type is missing.
df_hiring["canonical_current_step_match_type"].value_counts(dropna=False)


canonical_current_step_match_type
none     15
fuzzy    12
alias     4
Name: count, dtype: int64

In [7]:
# Hiring rows that have a raw step label but no canonical step id.
# Relies on the `has_raw_step` / `has_canonical` masks built in an earlier cell.
unmatched_mask = has_raw_step & ~has_canonical
failed = df_hiring.loc[unmatched_mask]

# Most frequent raw step labels among the unmatched rows (top 30).
failed.loc[:, "state.step"].value_counts().head(30)


state.step
rejection communicated / awaiting feedback details                              1
Client search on hold (awaiting reopening to proceed to interview)              1
candidate selection                                                             1
post-offer onboarding / next steps                                              1
candidate fit assessment / potential referral                                   1
searches and candidate pipeline                                                 1
Drafting job description and finalizing terms before scheduling kickoff call    1
sourcing/referrals requested                                                    1
Define role scope / search parameters                                           1
notes/documentation shared                                                      1
handoff / introduction                                                          1
interview_scheduling_request                                                    1
Gathe

In [8]:
# Hiring rows that did receive a canonical step id (uses the `has_canonical`
# mask built in an earlier cell).
matched = df_hiring.loc[has_canonical]

# Show the raw label next to the canonical id and how the match was made.
matched.loc[
    :,
    [
        "state.step",
        "canonical_current_step_id",
        "canonical_current_step_match_type",
    ],
].head(30)


Unnamed: 0,state.step,canonical_current_step_id,canonical_current_step_match_type
0,interview scheduling / candidate response pending,interviews,fuzzy
1,post-interview feedback,interviews,alias
2,Client review / feedback on candidate CV and n...,interviews,fuzzy
5,awaiting candidate responses,interviews,alias
7,interview scheduled,interviews,alias
11,candidate interview feedback delivered,interviews,fuzzy
12,interview scheduling,interviews,alias
14,candidate sourcing / intake,intake,fuzzy
15,Offer extended; awaiting official offer letter...,offer,fuzzy
16,Drafting/amending offer letter terms (PTO/sick...,offer,fuzzy


In [9]:
# Per-client fraction of hiring rows with a canonical step, highest first.
# The mask is renamed so the resulting Series is labelled "has_canonical".
(
    has_canonical.rename("has_canonical")
    .groupby(df_hiring["canonical_client"])
    .mean()
    .sort_values(ascending=False)
    .head(20)
)


canonical_client
QU              1.000000
Qb              1.000000
Celara Labs     0.500000
Chromatics      0.307692
Kerry           0.000000
Public Relay    0.000000
Name: has_canonical, dtype: float64