# Role coverage counts

Loads `instances.json` from the latest run (the most recently modified directory under `../runs`) into a DataFrame and reports role coverage counts.

In [22]:
from pathlib import Path
import json
import pandas as pd
import numpy as np

In [None]:
# Directory that holds one subdirectory per run. The path is relative, so it
# is resolved against the current working directory at run time.
runs_dir = Path("../runs")

def latest_run_dir(runs_dir: Path) -> Path:
    """Return the most recently modified subdirectory of ``runs_dir``.

    "Latest" is decided by filesystem modification time (``st_mtime``),
    not by the directory name.

    Args:
        runs_dir: Directory whose immediate subdirectories are the runs.

    Returns:
        The subdirectory with the greatest ``st_mtime``.

    Raises:
        FileNotFoundError: If ``runs_dir`` does not exist, or if it exists
            but contains no subdirectories.
    """
    if not runs_dir.exists():
        raise FileNotFoundError(f"Runs directory not found: {runs_dir}")

    # Plain files directly under runs_dir are ignored.
    subdirs = [entry for entry in runs_dir.iterdir() if entry.is_dir()]
    if not subdirs:
        raise FileNotFoundError(f"No run directories found under: {runs_dir}")

    def modified_at(entry: Path) -> float:
        return entry.stat().st_mtime

    # On a tie, the first equally recent entry in iteration order wins.
    return max(subdirs, key=modified_at)

# Pick the most recently modified run directory and report which one is used.
run_dir = latest_run_dir(runs_dir)
print(f"Using run: {run_dir.name}")

# Fail early, naming the full path, if the run has no instances.json.
instances_path = run_dir / "instances.json"
if not instances_path.exists():
    raise FileNotFoundError(f"Missing instances.json at: {instances_path}")

data = json.loads(instances_path.read_text(encoding="utf-8"))

# A missing, null, or empty "instances" entry all collapse to an empty list.
instances = data.get("instances")
if not instances:
    instances = []

# json_normalize flattens nested dicts into dotted column names.
df = pd.json_normalize(instances)
print(df.shape)

# Role columns, or an all-missing stand-in when a column is absent (for
# example when the run has no instances and df has no columns at all).
# Without the stand-in, df.get() returns None and the next line fails with an
# AttributeError instead of reporting a count of 0.
_no_values = pd.Series([None] * len(df), index=df.index, dtype="object")
raw_role = df.get("candidate_role_raw", _no_values)
canon_role = df.get("canonical_role", _no_values)

# Normalise once: missing -> "", surrounding whitespace removed. Every flag
# below is derived from the same cleaned text, so a padded value such as
# "Other " cannot count as present while being missed by the "Other" check.
raw_role_text = raw_role.fillna("").str.strip()
canon_role_text = canon_role.fillna("").str.strip()

raw_role_nonempty = raw_role_text.ne("")
# NOTE(review): "present" means any non-blank string, so placeholder labels
# (e.g. "Unknown") are counted as present too -- confirm that is intended.
canon_role_present = canon_role_text.ne("")
canon_role_other = canon_role_text.eq("Other")

# Boolean sums are numpy integers; cast to plain int for a clean dict.
results = {
    "non_empty_raw_role": int(raw_role_nonempty.sum()),
    "canonical_role_present": int(canon_role_present.sum()),
    "canonical_role_other": int(canon_role_other.sum()),
}

# One-row summary table; kept as the last expression so the cell displays it.
pd.DataFrame([results])

Using run: 20260128_145046
(85, 51)


Unnamed: 0,non_empty_raw_role,canonical_role_present,canonical_role_other
0,14,85,7


In [3]:
# Show the first two rows as a quick look at the loaded columns.
df.head(2)

Unnamed: 0,instance_key,thread_ids,candidate_client,candidate_process,candidate_role,evidence,candidate_process_raw,candidate_client_raw,candidate_role_raw,canonical_process,...,steps_state.produce,steps_state.measure,steps_state.request,steps_state.search,steps_state.confirm,steps_state.staffed,steps_state.done,steps_state.retrieve,steps_state.classify,steps_state.pipeline
0,thread:199a0adc145aa0f1,[199a0adc145aa0f1],,hiring,SR AI Engineer,"[{'message_id': 'gmail_199c47725ba7ca7f', 'tim...",hiring,,SR AI Engineer,hiring,...,,,,,,,,,,
1,thread:19995af812677279,[19995af812677279],,hiring,AI Engineer,"[{'message_id': 'gmail_199a6973ef9af93a', 'tim...",hiring,,AI Engineer,hiring,...,,,,,,,,,,


In [4]:
# Every column whose name contains "role", in the frame's column order.
role_cols = list(filter(lambda name: "role" in name, df.columns))
df[role_cols].head(2)


Unnamed: 0,candidate_role,candidate_role_raw,canonical_role
0,SR AI Engineer,SR AI Engineer,AI Engineer
1,AI Engineer,AI Engineer,AI Engineer


In [5]:
# Frequency of each raw role string. value_counts() excludes missing values
# by default, so the counts can sum to fewer than len(df).
df.candidate_role_raw.value_counts()

candidate_role_raw
AI Engineer               6
SR AI Engineer            1
Data Science type role    1
Account manager           1
Chromatics role           1
Chromatics Position       1
Prod/Eng leader           1
Fullstack internal        1
.NET                      1
Name: count, dtype: int64

In [6]:
# Frequency of each canonical role label (missing values are excluded by default).
df.canonical_role.value_counts()

canonical_role
Unknown        71
AI Engineer     7
Other           7
Name: count, dtype: int64

In [7]:
# Frequency of each canonical process label. Rows with no canonical_process
# are excluded by value_counts() by default.
df.canonical_process.value_counts()

canonical_process
hiring                 31
delivery               10
sales_pipeline          8
resourcing_staffing     6
marketing               2
data_ops                2
documentation           1
Name: count, dtype: int64

In [8]:
# Keep only the rows whose canonical process is exactly "hiring".
df_hiring = df.loc[df.canonical_process.eq("hiring")]

In [9]:
# Raw role strings within the hiring subset (missing values are excluded by default).
df_hiring.candidate_role_raw.value_counts()

candidate_role_raw
AI Engineer               6
SR AI Engineer            1
Data Science type role    1
Chromatics role           1
Chromatics Position       1
Prod/Eng leader           1
Fullstack internal        1
.NET                      1
Name: count, dtype: int64

In [10]:
# Canonical role labels within the hiring subset (missing values are excluded by default).
df_hiring.canonical_role.value_counts()

canonical_role
Unknown        18
AI Engineer     7
Other           6
Name: count, dtype: int64