# Load latest run instances.json into a DataFrame

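This notebook finds the most recently modified run directory under `../runs/` and loads its `instances.json` into a pandas DataFrame. The remaining cells inspect the raw and canonical client, process, and role columns.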

In [49]:
from pathlib import Path
import json
import pandas as pd

# Directory that holds one subdirectory per run, relative to the current
# working directory (adjust if needed).
runs_dir = Path("..") / "runs"


def latest_run_dir(runs_dir: Path) -> Path:
    """Return the most recently modified run directory under ``runs_dir``.

    Only immediate subdirectories are considered; plain files are ignored.

    Args:
        runs_dir: Directory whose immediate subdirectories are individual runs.

    Returns:
        The subdirectory with the largest modification time (``st_mtime``).
        If several share that time, the one yielded first by ``iterdir()`` wins.

    Raises:
        FileNotFoundError: If ``runs_dir`` does not exist, or exists but
            contains no subdirectories.
        NotADirectoryError: If ``runs_dir`` exists but is not a directory.
    """
    if not runs_dir.exists():
        raise FileNotFoundError(f"Runs directory not found: {runs_dir}")
    if not runs_dir.is_dir():
        # Fail with a clear message instead of whatever iterdir() would raise.
        raise NotADirectoryError(f"Runs path is not a directory: {runs_dir}")
    candidates = [p for p in runs_dir.iterdir() if p.is_dir()]
    if not candidates:
        raise FileNotFoundError(f"No run directories found under: {runs_dir}")
    # A single pass with max() replaces sorting every candidate just to take the first.
    return max(candidates, key=lambda p: p.stat().st_mtime)


# Pick the newest run and report which one is being used.
run_dir = latest_run_dir(runs_dir)
print(f"Using run: {run_dir.name}")

# The run is only usable if it actually contains an instances file.
instances_path = run_dir / "instances.json"
if not instances_path.exists():
    raise FileNotFoundError(f"Missing instances.json at: {instances_path}")

# Parse the JSON payload; a missing or empty "instances" entry becomes an empty list.
data = json.loads(instances_path.read_text(encoding="utf-8"))
instances = data.get("instances") or []

# Flatten the records into one row per instance (nested keys become dotted column names).
df = pd.json_normalize(instances)
print(df.shape)
df.head()

Using run: 20260123_161428
(87, 49)


Unnamed: 0,instance_key,thread_ids,candidate_client,candidate_process,candidate_role,evidence,candidate_process_raw,candidate_client_raw,candidate_role_raw,canonical_process,...,steps_state.measure,steps_state.request,steps_state.search,steps_state.confirm,steps_state.staffed,steps_state.done,steps_state.retrieve,steps_state.classify,steps_state.pipeline,steps_state.complete
0,thread:199a0adc145aa0f1,[199a0adc145aa0f1],,hiring,SR AI Engineer,"[{'message_id': 'gmail_199c47725ba7ca7f', 'tim...",hiring,,SR AI Engineer,hiring,...,,,,,,,,,,
1,thread:19995af812677279,[19995af812677279],Chromatics AI,hiring,AI Engineer,"[{'message_id': 'gmail_199a6973ef9af93a', 'tim...",hiring,Chromatics AI,AI Engineer,hiring,...,,,,,,,,,,
2,thread:19986d695c57ee6b,[19986d695c57ee6b],CHR,hiring,AI Engineer,"[{'message_id': 'gmail_199e30814f301613', 'tim...",hiring,CHR,AI Engineer,hiring,...,,,,,,,,,,
3,thread:1981e8c0c7baaf5e,[1981e8c0c7baaf5e],Public Relay,hiring,,"[{'message_id': 'gmail_199ab67dbb1bc15d', 'tim...",hiring,Public Relay,,hiring,...,,,,,,,,,,
4,thread:199bfaeb0d858453,[199bfaeb0d858453],Chromatics.ai,Project Delivery,,"[{'message_id': 'gmail_199bfafc3cf79459', 'tim...",Project Delivery,Chromatics.ai,,delivery,...,,,,,,,,,,


In [50]:
# Count how often each raw client value occurs.
df["candidate_client_raw"].value_counts()

candidate_client_raw
Chromatics          39
Chromatics.ai        2
Tryni 2.0            2
CARPEDM              2
chromatics           2
AFAB                 2
Chromatics AI        1
CHR                  1
Public Relay         1
Celara Labs          1
Ideal Prediction     1
Celara               1
chromatics.ai        1
LRN                  1
CELARA               1
CarpeDiem            1
QU BEYOND            1
Kerry                1
Portillos            1
Portillo's           1
Name: count, dtype: int64

In [51]:
# List every column produced by json_normalize (nested keys appear as dotted names).
df.columns

Index(['instance_key', 'thread_ids', 'candidate_client', 'candidate_process',
       'candidate_role', 'evidence', 'candidate_process_raw',
       'candidate_client_raw', 'candidate_role_raw', 'canonical_process',
       'canonical_client', 'canonical_role', 'owner', 'steps_total',
       'steps_done', 'health', 'state.status', 'state.step', 'state.summary',
       'state.last_updated_at', 'state.confidence', 'steps_state.intake',
       'steps_state.screening', 'steps_state.interviews', 'steps_state.offer',
       'steps_state.close', 'steps_state.kickoff',
       'steps_state.implementation', 'steps_state.deployment',
       'steps_state.handoff', 'steps_state', 'steps_state.prospecting',
       'steps_state.qualification', 'steps_state.proposal', 'steps_state.poc',
       'steps_state.contract', 'steps_state.plan', 'steps_state.produce',
       'steps_state.publish', 'steps_state.measure', 'steps_state.request',
       'steps_state.search', 'steps_state.confirm', 'steps_state.staffe

In [52]:
# Count canonical client values. value_counts() leaves out missing values by
# default, so an empty result means no row has a non-null canonical_client.
df["canonical_client"].value_counts()

Series([], Name: count, dtype: int64)

In [53]:
# Side-by-side preview of the raw process value and its canonical form.
df.loc[:, ["candidate_process_raw", "canonical_process"]].head()

Unnamed: 0,candidate_process_raw,canonical_process
0,hiring,hiring
1,hiring,hiring
2,hiring,hiring
3,hiring,hiring
4,Project Delivery,delivery


In [54]:
# Side-by-side preview of the raw client value and its canonical form.
df.loc[:, ["candidate_client_raw", "canonical_client"]].head()

Unnamed: 0,candidate_client_raw,canonical_client
0,,
1,Chromatics AI,
2,CHR,
3,Public Relay,
4,Chromatics.ai,


In [55]:
# Side-by-side preview of the raw role value and its canonical form.
df.loc[:, ["candidate_role_raw", "canonical_role"]].head()

Unnamed: 0,candidate_role_raw,canonical_role
0,SR AI Engineer,AI Engineer
1,AI Engineer,AI Engineer
2,AI Engineer,AI Engineer
3,,Unknown
4,,Unknown


In [36]:
# # Load a specific run's instances.json into a DataFrame
# from pathlib import Path
# import json

# specific_path = Path("../runs/20260121_193930/instances.json")
# print(f"Loading: {specific_path}")
# if not specific_path.exists():
#     raise FileNotFoundError(f"{specific_path} not found. Check the path or run id.")

# with specific_path.open("r", encoding="utf-8") as f:
#     data_specific = json.load(f)

# df_specific = pd.json_normalize(data_specific.get("instances") or [])
# print(f"Loaded {len(df_specific)} rows from {specific_path}")
# df_specific.head()

In [37]:
# df_specific.candidate_client.value_counts()

In [38]:
# Which candidate_role values ended up with canonical_role "Unknown"?
# value_counts() skips missing values by default, so rows whose candidate_role
# is null do not appear in the result.
df.loc[df["canonical_role"].eq("Unknown"), "candidate_role"].value_counts()

Series([], Name: count, dtype: int64)

In [39]:
# Distinct candidate_process values on rows that have no canonical_process.
list(df.loc[df["canonical_process"].isna(), "candidate_process"].unique())

[None,
 'Deployment',
 'Stakeholder Meeting',
 'Project Handover',
 'Project Tracking',
 'Account Management',
 'Contract Renewal',
 'Proposal/Estimation']

In [40]:
# Distinct candidate_process values on rows that have no canonical_process.
# NOTE(review): this cell computes the same result as the preceding cell;
# consider removing one of them.
list(df.loc[df.canonical_process.isna(), ['candidate_process', 'canonical_process']].candidate_process.unique())

[None,
 'Deployment',
 'Stakeholder Meeting',
 'Project Handover',
 'Project Tracking',
 'Account Management',
 'Contract Renewal',
 'Proposal/Estimation']

In [41]:
# Count how many instances fall under each canonical role.
df["canonical_role"].value_counts()

canonical_role
Unknown        73
AI Engineer     8
Other           6
Name: count, dtype: int64

In [42]:
# Preview the first two rows of the loaded frame.
df.head(2)

Unnamed: 0,instance_key,thread_ids,candidate_client,candidate_process,candidate_role,evidence,candidate_process_raw,candidate_client_raw,candidate_role_raw,canonical_process,...,steps_state.measure,steps_state.request,steps_state.search,steps_state.confirm,steps_state.staffed,steps_state.done,steps_state.retrieve,steps_state.classify,steps_state.pipeline,steps_state.complete
0,thread:199a0adc145aa0f1,[199a0adc145aa0f1],,hiring,SR AI Engineer,"[{'message_id': 'gmail_199c47725ba7ca7f', 'tim...",hiring,,SR AI Engineer,hiring,...,,,,,,,,,,
1,thread:19995af812677279,[19995af812677279],Chromatics AI,hiring,AI Engineer,"[{'message_id': 'gmail_199a6973ef9af93a', 'tim...",hiring,Chromatics AI,AI Engineer,hiring,...,,,,,,,,,,


In [43]:
# Show every column for one specific instance, selected by its key.
df[df["instance_key"].eq("thread:199bfaeb0d858453")]

Unnamed: 0,instance_key,thread_ids,candidate_client,candidate_process,candidate_role,evidence,candidate_process_raw,candidate_client_raw,candidate_role_raw,canonical_process,...,steps_state.measure,steps_state.request,steps_state.search,steps_state.confirm,steps_state.staffed,steps_state.done,steps_state.retrieve,steps_state.classify,steps_state.pipeline,steps_state.complete
4,thread:199bfaeb0d858453,[199bfaeb0d858453],Chromatics.ai,Project Delivery,,"[{'message_id': 'gmail_199bfafc3cf79459', 'tim...",Project Delivery,Chromatics.ai,,delivery,...,,,,,,,,,,
