# 0) Setup

In [4]:
from datetime import datetime
from pathlib import Path

import pandas as pd
from dotenv import load_dotenv; load_dotenv()

from humaidclf import run_experiment
from humaidclf import build_token_index               # from budget.py
from humaidclf.batch import use_api_key_env           # context manager for key switching
from rules import RULES_1

# --- config ---
# Everything a reader might tune lives here; later cells only read these names.

# Root folder scanned by discover_tsvs: one sub-directory per event, each
# holding `<event>_<split>.tsv` files.
BASE = Path("Dataset/HumAID")
# Splits to look for inside every event directory.
SPLITS = ["train"]             # or ["train","dev","test"]
# Model name passed to both build_token_index and run_experiment.
MODEL = "gpt-4o-mini"
# Rules text passed to build_token_index (rules_text=) and run_experiment (rules=).
RULES = RULES_1
# Base run tag; the run cell appends "-TIER1" / "-ALT" and reuses it in index filenames.
TAG = "modeS-RULES1"
# Forwarded as run_experiment(dryrun_n=...).
# NOTE(review): presumably the row count of the "tiny sample" dry run - confirm.
DRYRUN_N = 20
# Forwarded as run_experiment(poll_secs=...).
# NOTE(review): presumably seconds between batch status polls - confirm.
POLL_SECS = 300
# Forwarded as run_experiment(do_analysis=...).
DO_ANALYSIS = True
# Forwarded as run_experiment(out_root=...); run indexes are saved under OUT_ROOT/_indexes.
OUT_ROOT = "runs"

# Token-budget inputs for build_token_index (batch_token_limit, safety_margin,
# max_output_tokens).
BATCH_TOKEN_LIMIT = 2_000_000  # Tier-1 cap
SAFETY_MARGIN = 0.90           # 10% headroom
MAX_OUTPUT_TOKENS = 40


# 1) Discover datasets (events/splits) and estimate token budgets

In [3]:
# --- discover datasets ---
def discover_tsvs(base: Path, splits: list[str]):
    """Find the ``<base>/<event>/<event>_<split>.tsv`` files for the given splits.

    Args:
        base: Directory that contains one sub-directory per event. A plain
            string path is accepted as well.
        splits: Split names to look for inside every event directory.

    Returns:
        A DataFrame with one row per TSV that exists and the columns
        ``event``, ``split`` and ``tsv`` (the path as a string). Rows follow
        the sorted event-directory order, then the order of ``splits``. The
        three columns are present even when nothing is found, so callers can
        always select them by name.

    Raises:
        OSError: Propagated from ``Path.iterdir`` (e.g. ``FileNotFoundError``
            when ``base`` does not exist).
    """
    columns = ["event", "split", "tsv"]
    base = Path(base)  # tolerate str input; a no-op for Path

    items = []
    for event_dir in sorted(p for p in base.iterdir() if p.is_dir()):
        event = event_dir.name
        for split in splits:
            tsv = event_dir / f"{event}_{split}.tsv"
            if tsv.exists():
                items.append({"event": event, "split": split, "tsv": str(tsv)})

    # Passing `columns` pins the schema: without it an empty result would be a
    # DataFrame with no columns at all.
    return pd.DataFrame(items, columns=columns)

# Collect every dataset TSV available under BASE for the configured splits.
df_sources = discover_tsvs(BASE, SPLITS)

# --- token budgeting ---
# build_token_index returns one row per dataset, including the `fits_cap`
# flag and the usage columns reported below.
budget_options = dict(
    model=MODEL,
    rules_text=RULES,
    batch_token_limit=BATCH_TOKEN_LIMIT,
    safety_margin=SAFETY_MARGIN,
    sample_size=200,
    max_output_tokens=MAX_OUTPUT_TOKENS,
)
token_index = build_token_index(df_sources, **budget_options)

display(token_index)

# Partition the datasets on the `fits_cap` flag.
fits_mask = token_index["fits_cap"]
df_fit = token_index.loc[fits_mask].reset_index(drop=True)
df_too_big = token_index.loc[~fits_mask].reset_index(drop=True)

# Show the same summary columns for both partitions.
report_columns = ["event", "split", "num_rows", "est_total_tokens", "limit_used_%"]
for heading, partition in (
    ("OK to run with Tier-1 key:", df_fit),
    ("Too big for Tier-1 (use alternate key):", df_too_big),
):
    print(heading)
    display(partition[report_columns])

Unnamed: 0,event,split,tsv,num_rows,avg_req_tokens,est_total_tokens,fits_cap,limit_used_%
8,kaikoura_earthquake_2016,train,Dataset\HumAID\kaikoura_earthquake_2016\kaikou...,1536,440,675840,True,33.8
1,canada_wildfires_2016,train,Dataset\HumAID\canada_wildfires_2016\canada_wi...,1569,439,688791,True,34.4
2,cyclone_idai_2019,train,Dataset\HumAID\cyclone_idai_2019\cyclone_idai_...,2753,461,1269133,True,63.5
4,hurricane_florence_2018,train,Dataset\HumAID\hurricane_florence_2018\hurrica...,4384,455,1994720,False,99.7
7,hurricane_maria_2017,train,Dataset\HumAID\hurricane_maria_2017\hurricane_...,5094,442,2251548,False,112.6
0,california_wildfires_2018,train,Dataset\HumAID\california_wildfires_2018\calif...,5163,451,2328513,False,116.4
3,hurricane_dorian_2019,train,Dataset\HumAID\hurricane_dorian_2019\hurricane...,5329,455,2424695,False,121.2
9,kerala_floods_2018,train,Dataset\HumAID\kerala_floods_2018\kerala_flood...,5588,460,2570480,False,128.5
5,hurricane_harvey_2017,train,Dataset\HumAID\hurricane_harvey_2017\hurricane...,6378,440,2806320,False,140.3
6,hurricane_irma_2017,train,Dataset\HumAID\hurricane_irma_2017\hurricane_i...,6579,440,2894760,False,144.7


OK to run with Tier-1 key:


Unnamed: 0,event,split,num_rows,est_total_tokens,limit_used_%
0,kaikoura_earthquake_2016,train,1536,675840,33.8
1,canada_wildfires_2016,train,1569,688791,34.4
2,cyclone_idai_2019,train,2753,1269133,63.5


Too big for Tier-1 (use alternate key):


Unnamed: 0,event,split,num_rows,est_total_tokens,limit_used_%
0,hurricane_florence_2018,train,4384,1994720,99.7
1,hurricane_maria_2017,train,5094,2251548,112.6
2,california_wildfires_2018,train,5163,2328513,116.4
3,hurricane_dorian_2019,train,5329,2424695,121.2
4,kerala_floods_2018,train,5588,2570480,128.5
5,hurricane_harvey_2017,train,6378,2806320,140.3
6,hurricane_irma_2017,train,6579,2894760,144.7


# 2) Run all datasets (sequentially)

In [5]:
# --- helpers to run a list of datasets ---
def run_list(dflist: pd.DataFrame, rules_text: str, model: str, tag: str):
    """Run ``run_experiment`` once per dataset row and tabulate the outcomes.

    Datasets are processed sequentially. A failure in one dataset is logged
    and recorded as an ``ERROR`` row so the remaining datasets still run.

    Args:
        dflist: One row per dataset; must provide the columns ``event``,
            ``split`` and ``tsv``.
        rules_text: Rules text forwarded to ``run_experiment(rules=...)``.
        model: Model name forwarded to ``run_experiment(model=...)``.
        tag: Run tag forwarded to ``run_experiment(tag=...)``.

    Returns:
        A DataFrame with one row per input dataset and the columns ``event``,
        ``split``, ``run_dir``, ``predictions_csv``, ``macro_f1``,
        ``accuracy`` and ``num_total``. Failed runs have ``run_dir == "ERROR"``,
        NaN metrics and ``num_total == 0``. The columns are present even when
        ``dflist`` is empty.

    Notes:
        Reads the module-level settings ``DRYRUN_N``, ``POLL_SECS``,
        ``OUT_ROOT`` and ``DO_ANALYSIS``.
    """
    columns = [
        "event", "split", "run_dir", "predictions_csv",
        "macro_f1", "accuracy", "num_total",
    ]
    results = []
    for _, row in dflist.iterrows():
        event, split, tsv = row["event"], row["split"], row["tsv"]
        print(f"\n=== Running {event}/{split} ({model} | {tag}) ===")
        try:
            plan, _preds, summary = run_experiment(
                dataset_path=tsv,
                rules=rules_text,
                model=model,
                tag=tag,
                dryrun_n=DRYRUN_N,
                poll_secs=POLL_SECS,
                out_root=OUT_ROOT,
                do_analysis=DO_ANALYSIS,
            )
            # A finished run without a summary must not be reported as a
            # failure just because `.get` is called on None.
            # NOTE(review): presumably summary is None when do_analysis is
            # off - confirm against run_experiment.
            metrics = summary if summary is not None else {}
            results.append({
                "event": event,
                "split": split,
                "run_dir": str(plan["dir"]),
                "predictions_csv": str(plan["predictions_csv"]),
                "macro_f1": metrics.get("macro_f1"),
                "accuracy": metrics.get("accuracy"),
                "num_total": metrics.get("num_total_with_truth"),
            })
        except Exception as e:
            # Deliberate best-effort: keep going with the other datasets.
            # The exception type is logged because str(e) alone can be
            # uninformative (e.g. KeyError prints only the missing key).
            print(f"[ERROR] {event}/{split}: {type(e).__name__}: {e}")
            results.append({
                "event": event,
                "split": split,
                "run_dir": "ERROR",
                "predictions_csv": "",
                "macro_f1": float("nan"),
                "accuracy": float("nan"),
                "num_total": 0,
            })
    # Pin the schema so an empty input still yields the documented columns.
    return pd.DataFrame(results, columns=columns)

# --- run each key tier and index its results as soon as it finishes ---
# The batches are long-running, so each tier's index CSV is written right
# after that tier completes: a crash or interrupt during the alternate-key
# phase can no longer lose the Tier-1 index.
idx_dir = Path(OUT_ROOT) / "_indexes"
idx_dir.mkdir(parents=True, exist_ok=True)
# One stamp, taken at session start, shared by both index files.
stamp = datetime.now().strftime("%Y%m%d-%H%M%S")


def _run_tier(
    datasets: pd.DataFrame,
    key_env: str,
    key_label: str,
    tag_suffix: str,
    index_prefix: str,
) -> pd.DataFrame:
    """Run `datasets` under the API key in `key_env` and save that tier's index.

    Args:
        datasets: Rows to hand to run_list (event / split / tsv).
        key_env: Name of the environment variable passed to use_api_key_env.
        key_label: Human-readable key name used in the banner line.
        tag_suffix: Appended to TAG (as "-<suffix>") to form the run tag.
        index_prefix: Middle part of the index filename ("runs_<prefix>_...").

    Returns:
        The run_list result, or an empty DataFrame when `datasets` is empty
        (in which case no key is switched and no index file is written).
    """
    if datasets.empty:
        return pd.DataFrame()
    with use_api_key_env(key_env):
        print(f">>> Using {key_label} ({key_env})")
        runs = run_list(datasets, RULES, MODEL, tag=f"{TAG}-{tag_suffix}")
        display(runs)
    # Record which key each run used.
    index_path = idx_dir / f"runs_{index_prefix}_{MODEL}_{TAG}_{stamp}.csv"
    runs.assign(key=key_env).to_csv(index_path, index=False)
    return runs


# --- 1) Use OPENAI_API_KEY_1 for smaller datasets ---
df_runs_small = _run_tier(df_fit, "OPENAI_API_KEY_1", "Tier-1 key", "TIER1", "tier1")
if df_fit.empty:
    print("No small datasets; nothing to run with the Tier-1 key.")

# --- 2) Use OPENAI_API_KEY_2 for larger datasets ---
df_runs_big = _run_tier(df_too_big, "OPENAI_API_KEY_2", "alternate key", "ALT", "alt")
if df_too_big.empty:
    print("No large datasets; nothing to run with the alternate key.")

print("Saved run indexes in:", idx_dir)

>>> Using Tier-1 key (OPENAI_API_KEY_1)

=== Running kaikoura_earthquake_2016/train (gpt-4o-mini | modeS-RULES1-TIER1) ===
Macro-F1 (tiny sample): 0.7341269841269842
[batch batch_68f83249c27c8190bdafcf0f44367b82] status = validating
[batch batch_68f83249c27c8190bdafcf0f44367b82] status = in_progress
[batch batch_68f83249c27c8190bdafcf0f44367b82] status = in_progress
[batch batch_68f83249c27c8190bdafcf0f44367b82] status = in_progress
[batch batch_68f83249c27c8190bdafcf0f44367b82] status = in_progress
[batch batch_68f83249c27c8190bdafcf0f44367b82] status = in_progress
[batch batch_68f83249c27c8190bdafcf0f44367b82] status = in_progress
[batch batch_68f83249c27c8190bdafcf0f44367b82] status = in_progress
[batch batch_68f83249c27c8190bdafcf0f44367b82] status = in_progress
[batch batch_68f83249c27c8190bdafcf0f44367b82] status = in_progress
[batch batch_68f83249c27c8190bdafcf0f44367b82] status = in_progress
[batch batch_68f83249c27c8190bdafcf0f44367b82] status = in_progress
[batch batch_68f832

Unnamed: 0,event,split,run_dir,predictions_csv,macro_f1,accuracy,num_total
0,kaikoura_earthquake_2016,train,runs\kaikoura_earthquake_2016\train\gpt-4o-min...,runs\kaikoura_earthquake_2016\train\gpt-4o-min...,0.666262,0.694661,1536
1,canada_wildfires_2016,train,runs\canada_wildfires_2016\train\gpt-4o-mini\2...,runs\canada_wildfires_2016\train\gpt-4o-mini\2...,0.586217,0.773104,1569
2,cyclone_idai_2019,train,runs\cyclone_idai_2019\train\gpt-4o-mini\20251...,runs\cyclone_idai_2019\train\gpt-4o-mini\20251...,0.578483,0.683255,2753


>>> Using alternate key (OPENAI_API_KEY_2)

=== Running hurricane_florence_2018/train (gpt-4o-mini | modeS-RULES1-ALT) ===
Macro-F1 (tiny sample): 0.7283950617283951
[batch batch_68f85ba39dec8190b5da02a8e47f52c4] status = validating
[batch batch_68f85ba39dec8190b5da02a8e47f52c4] status = in_progress
[batch batch_68f85ba39dec8190b5da02a8e47f52c4] status = in_progress
[batch batch_68f85ba39dec8190b5da02a8e47f52c4] status = in_progress
[batch batch_68f85ba39dec8190b5da02a8e47f52c4] status = finalizing
[batch batch_68f85ba39dec8190b5da02a8e47f52c4] status = finalizing
[batch batch_68f85ba39dec8190b5da02a8e47f52c4] status = completed
Saved predictions to: runs\hurricane_florence_2018\train\gpt-4o-mini\20251021-212048-modeS-RULES1-ALT\predictions.csv
Macro-F1: 0.6004112552352256

=== Running hurricane_maria_2017/train (gpt-4o-mini | modeS-RULES1-ALT) ===
Macro-F1 (tiny sample): 0.5380952380952382
[batch batch_68f862c4c0108190b2581e01c131bb8a] status = validating
[batch batch_68f862c4c0108190

Unnamed: 0,event,split,run_dir,predictions_csv,macro_f1,accuracy,num_total
0,hurricane_florence_2018,train,runs\hurricane_florence_2018\train\gpt-4o-mini...,runs\hurricane_florence_2018\train\gpt-4o-mini...,0.600411,0.73677,4384
1,hurricane_maria_2017,train,runs\hurricane_maria_2017\train\gpt-4o-mini\20...,runs\hurricane_maria_2017\train\gpt-4o-mini\20...,0.577048,0.663329,5094
2,california_wildfires_2018,train,runs\california_wildfires_2018\train\gpt-4o-mi...,runs\california_wildfires_2018\train\gpt-4o-mi...,0.615469,0.705791,5163
3,hurricane_dorian_2019,train,runs\hurricane_dorian_2019\train\gpt-4o-mini\2...,runs\hurricane_dorian_2019\train\gpt-4o-mini\2...,0.584102,0.61794,5329
4,kerala_floods_2018,train,runs\kerala_floods_2018\train\gpt-4o-mini\2025...,runs\kerala_floods_2018\train\gpt-4o-mini\2025...,0.509513,0.680923,5588
5,hurricane_harvey_2017,train,runs\hurricane_harvey_2017\train\gpt-4o-mini\2...,runs\hurricane_harvey_2017\train\gpt-4o-mini\2...,0.566796,0.667451,6378
6,hurricane_irma_2017,train,runs\hurricane_irma_2017\train\gpt-4o-mini\202...,runs\hurricane_irma_2017\train\gpt-4o-mini\202...,0.555319,0.637483,6579


Saved run indexes in: runs\_indexes
