In [1]:
from pathlib import Path
import json
import pandas as pd

# Root folder that is scanned for run sub-directories. The path is relative, so it
# resolves against the kernel's current working directory.
runs_dir = Path("../runs")

def latest_run_dir(runs_dir: Path) -> Path:
    """Return the most recently modified sub-directory of ``runs_dir``.

    Only immediate children that are directories are considered. "Most recent"
    is decided by filesystem modification time (``st_mtime``), not by name.

    Args:
        runs_dir: Directory whose immediate sub-directories are the candidate runs.

    Returns:
        The candidate with the largest modification time. On a tie, the
        candidate that ``iterdir()`` yielded first is returned.

    Raises:
        FileNotFoundError: If ``runs_dir`` does not exist, or it contains no
            sub-directories.
        NotADirectoryError: If ``runs_dir`` exists but is not a directory.
    """
    if not runs_dir.exists():
        raise FileNotFoundError(f"Runs directory not found: {runs_dir}")
    if not runs_dir.is_dir():
        # iterdir() would raise this same exception type with a bare OS message;
        # raise it up front so the message names the offending path.
        raise NotADirectoryError(f"Runs path is not a directory: {runs_dir}")
    candidates = [p for p in runs_dir.iterdir() if p.is_dir()]
    if not candidates:
        raise FileNotFoundError(f"No run directories found under: {runs_dir}")
    # max() finds the newest entry in a single pass; a full sort is not needed
    # when only the first element is used.
    return max(candidates, key=lambda p: p.stat().st_mtime)

# Select the most recently modified run and report which one is being analysed.
run_dir = latest_run_dir(runs_dir)
print(f"Using run: {run_dir.name}")

# The run's instance records are read from a single JSON file; stop early if it is absent.
instances_path = run_dir.joinpath("instances.json")
if not instances_path.exists():
    raise FileNotFoundError(f"Missing instances.json at: {instances_path}")

with open(instances_path, mode="r", encoding="utf-8") as f:
    data = json.load(f)

# A missing, null, or otherwise falsy "instances" entry is treated as an empty list.
instances = data.get("instances")
if not instances:
    instances = []

# Flatten nested records into columns (nested keys become dotted names such as "state.step").
df = pd.json_normalize(instances)
print(df.shape)

Using run: 20260129_002417
(85, 70)


In [2]:
# Keep only rows whose canonical_process is exactly "recruiting"; .copy() makes the
# subset an independent frame rather than a view-like slice of df.
df_recruiting = df.loc[df["canonical_process"].eq("recruiting")].copy()
(df.shape[0], df_recruiting.shape[0])


(85, 31)

In [3]:
# Fraction of recruiting rows with a non-null canonical_current_step_id, rounded to 2 decimals.
round(df_recruiting["canonical_current_step_id"].notna().mean(), ndigits=2)


np.float64(0.48)

In [4]:
# A raw step counts as present only when it is non-null AND not blank after trimming whitespace.
has_raw_step = (
    df_recruiting["state.step"].notna()
    & df_recruiting["state.step"].astype(str).str.strip().ne("")
)
# A canonical step counts as present when canonical_current_step_id is non-null.
has_canonical = df_recruiting["canonical_current_step_id"].notna()

# One-row summary table. Despite the "_pct" suffix, each value is the mean of a
# boolean mask, i.e. a fraction in [0, 1], not a percentage.
pd.DataFrame(
    [
        {
            "raw_step_present_pct": has_raw_step.mean(),
            "canonical_step_present_pct": has_canonical.mean(),
            "raw_but_not_canonical_pct": (has_raw_step & ~has_canonical).mean(),
            "neither_raw_nor_canonical_pct": (~(has_raw_step | has_canonical)).mean(),
        }
    ]
)


Unnamed: 0,raw_step_present_pct,canonical_step_present_pct,raw_but_not_canonical_pct,neither_raw_nor_canonical_pct
0,0.935484,0.483871,0.451613,0.064516


In [5]:
# The same one-row coverage summary, emitted as a list of plain dicts (one per row)
# so the full-precision values are visible. Values are fractions in [0, 1].
pd.DataFrame(
    [
        {
            "raw_step_present_pct": has_raw_step.mean(),
            "canonical_step_present_pct": has_canonical.mean(),
            "raw_but_not_canonical_pct": (has_raw_step & ~has_canonical).mean(),
            "neither_raw_nor_canonical_pct": (~(has_raw_step | has_canonical)).mean(),
        }
    ]
).to_dict(orient="records")

[{'raw_step_present_pct': 0.9354838709677419,
  'canonical_step_present_pct': 0.4838709677419355,
  'raw_but_not_canonical_pct': 0.45161290322580644,
  'neither_raw_nor_canonical_pct': 0.06451612903225806}]

In [6]:
# Frequency of each value in canonical_current_step_match_type across recruiting rows.
# dropna=False keeps a separate count for missing values, if there are any.
df_recruiting["canonical_current_step_match_type"].value_counts(dropna=False)


canonical_current_step_match_type
none     16
fuzzy     9
alias     6
Name: count, dtype: int64

In [7]:
# Rows that have a usable raw step but no canonical step id assigned.
failed = df_recruiting.loc[has_raw_step & ~has_canonical]
# Up to 30 of the most frequent raw step strings among those rows.
failed["state.step"].value_counts().iloc[:30]


state.step
Client review / feedback and next-step decision                       1
candidate rejected / not moving forward                               1
Client search on hold (awaiting reopening to proceed to interview)    1
candidate selection                                                   1
candidate intake / sourcing                                           1
offer paperwork pending                                               1
searches and candidate pipeline management                            1
Candidate outreach decision                                           1
tech screen decision/scheduling                                       1
candidate follow-up / timing decision                                 1
HR interview coordination and challenge resolution                    1
Define search scope / role expansion                                  1
notes/documentation shared                                            1
scheduling_interview_request                         

In [8]:
# Rows where a canonical step id was assigned.
matched = df_recruiting.loc[has_canonical]
# Raw step text next to the canonical id and match type (first 30 rows).
matched.loc[
    :,
    [
        "state.step",
        "canonical_current_step_id",
        "canonical_current_step_match_type",
    ],
].head(30)


Unnamed: 0,state.step,canonical_current_step_id,canonical_current_step_match_type
0,candidate outreach / interview scheduling,interview-scheduling,fuzzy
1,post-interview feedback,client-interview-feedback,alias
5,awaiting candidate responses,interview-scheduling,alias
7,interview scheduled,interview-scheduling,alias
11,candidate interview feedback shared,client-interview-feedback,fuzzy
12,interview scheduling,interview-scheduling,alias
16,offer letter drafting/amendment,offer-letter,fuzzy
18,post-offer onboarding / next steps,admin-onboarding-process,fuzzy
19,candidate fit assessment / potential referral,referral,fuzzy
57,Drafting job description and finalizing terms ...,role-details,fuzzy


In [9]:
# Per-client fraction of recruiting rows with a canonical step id, highest first (top 20).
# The mask is renamed so the resulting Series is labelled "has_canonical".
(
    has_canonical.rename("has_canonical")
    .groupby(df_recruiting["canonical_client"])
    .mean()
    .sort_values(ascending=False)
    .head(20)
)


canonical_client
Celara Labs     1.000000
Kerry           1.000000
Chromatics      0.384615
Public Relay    0.000000
QU              0.000000
Qb              0.000000
Name: has_canonical, dtype: float64

In [10]:
# Look up recruiting rows whose raw step text equals `val` exactly (case- and
# whitespace-sensitive), then show their instance key and evidence (first 3 rows).
val = "candidate selection"
rows = df_recruiting.loc[df_recruiting["state.step"].eq(val)]
rows.loc[:, ["instance_key", "evidence"]].head(3)


Unnamed: 0,instance_key,evidence
10,thread:199e91bdb002d6ca,"[{'message_id': 'gmail_199eb1ab091c4ebf', 'tim..."


In [11]:
# Unwrap the evidence value of the single selected row; Series.item() raises
# ValueError unless `rows` holds exactly one row.
rows["evidence"].item()

[{'message_id': 'gmail_199eb1ab091c4ebf',
  'timestamp': '2025-10-16T00:23:14',
  'event_type': 'decision',
  'confidence': 0.74,
  'snippet': 'I don’t think guido or german will work'}]

In [16]:
# Look up recruiting rows whose raw step text equals `val` exactly (case- and
# whitespace-sensitive), then show their instance key and evidence (first 3 rows).
val = "offer paperwork pending"
rows = df_recruiting.loc[df_recruiting["state.step"].eq(val)]
rows.loc[:, ["instance_key", "evidence"]].head(3)


Unnamed: 0,instance_key,evidence
15,thread:19a177a186d04786,"[{'message_id': 'gmail_19a177d95736e58a', 'tim..."


In [17]:
# Unwrap the evidence value of the single selected row; Series.item() raises
# ValueError unless `rows` holds exactly one row.
rows["evidence"].item()

[{'message_id': 'gmail_19a177d95736e58a',
  'timestamp': '2025-10-24T15:31:37',
  'event_type': 'status_update',
  'confidence': 0.86,
  'snippet': 'congratulations - we really liked you - happy to make you an offer to work with Chromatics and Celara. The official offer letter will come from Chromatics'},
 {'message_id': 'gmail_19a179961571157b',
  'timestamp': '2025-10-24T16:01:46',
  'event_type': 'status_update',
  'confidence': 0.74,
  'snippet': 'Just to say congratulations - we really liked you - happy to make you an offer to work with Chromatics and Celara. The official offer letter will come from Chromatics'},
 {'message_id': 'gmail_19a179a6003a95b7',
  'timestamp': '2025-10-24T16:02:52',
  'event_type': 'status_update',
  'confidence': 0.78,
  'snippet': 'We’re excited to have you aboard Franco.\n\nI’ll send all the paperwork by Monday.'},
 {'message_id': 'gmail_19a178b426a34769',
  'timestamp': '2025-10-24T15:46:21',
  'event_type': 'process_signal',
  'confidence': 0.32,
  '