# Setup (imports, config, labels, prompts)

In [23]:
import os, json, time, math, pathlib, datetime, requests
import pandas as pd
from dotenv import load_dotenv

# The API key is read from the environment. Either set it once per runtime:
# os.environ["OPENAI_API_KEY"] = ""
# or keep it in a `.env` file, which the call below picks up.
load_dotenv()  # Automatically loads from `.env` in current dir

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
# Fail fast when the key is missing. NOTE: `assert` is skipped under `python -O`.
assert OPENAI_API_KEY, "Please set OPENAI_API_KEY in your environment."

# Base URL and headers shared by every HTTP helper in this notebook.
OPENAI_BASE = "https://api.openai.com/v1"
# Headers for JSON requests (chat completions, batch create/get, file download).
H_JSON = {"Authorization": f"Bearer {OPENAI_API_KEY}", "Content-Type": "application/json"}
# Headers for the multipart file upload: no Content-Type here, because the upload
# helper passes `files=` and requests then sets the multipart header itself.
H_MULTI = {"Authorization": f"Bearer {OPENAI_API_KEY}"}

# === Your label set (enum names) ===
# The list is embedded verbatim in the user prompt ("Allowed labels: ...") and
# reused as the `enum` of the "label" property in SCHEMA_S.
LABELS = [
    "caution_and_advice",
    "displaced_people_and_evacuations",
    "infrastructure_and_utility_damage",
    "injured_or_dead_people",
    "missing_or_found_people",
    "not_humanitarian",
    "other_relevant_information",
    "requests_or_urgent_needs",
    "rescue_volunteering_or_donation_effort",
    "sympathy_and_support",
]

# === Prompt pieces (short, deterministic) ===
# System message sent with every request (sync dry-run and Batch alike).
# Adjacent string literals are concatenated into one single-line prompt.
SYSTEM_PROMPT = (
  "You are a precise tweet classifier for humanitarian-response content. "
  "Choose exactly one label from the allowed list that best fits the tweet. "
  "If unrelated to humanitarian contexts, choose 'not_humanitarian'. "
  "Follow the short rules and output JSON that matches the schema; no extra fields."
)

# Compact rule set: one short line per label plus a single tie-break.
# This is the default `rules` argument of sync_test_sample and
# build_requests_jsonl_S.
RULES_1 = (
  "- requests_or_urgent_needs: asking for help/supplies/SOS\n"
  "- rescue_volunteering_or_donation_effort: offering help, donation, organizing aid\n"
  "- caution_and_advice: warnings/instructions/tips\n"
  "- displaced_people_and_evacuations: evacuations, relocation, shelters\n"
  "- injured_or_dead_people: injuries, casualties, fatalities\n"
  "- missing_or_found_people: missing or found persons\n"
  "- infrastructure_and_utility_damage: damage/outages to roads/bridges/power/water/buildings\n"
  "- sympathy_and_support: prayers/condolences, no actionable info\n"
  "- other_relevant_information: on-topic but none of the above\n"
  "- not_humanitarian: unrelated to disasters/aid\n"
  "Tie-break: prefer actionable class when in doubt."
)

# Expanded rule set: the same ten labels with explicit inclusion/exclusion
# hints and four numbered tie-breakers. Not used by default; pass it
# explicitly as `rules=RULES_2` to compare against RULES_1.
RULES_2 = (
  "- requests_or_urgent_needs: ASKING for help/supplies/services (need/please help/send/urgent/SOS). If both ask and offer words appear, ASKING wins.\n"
  "- rescue_volunteering_or_donation_effort: OFFERING help, organizing rescues, donation drives, fundraisers, volunteering sign-ups.\n"
  "- caution_and_advice: Warnings, instructions, actionable tips (evacuate/avoid/boil water). If only prayers/solidarity words, do NOT use this.\n"
  "- displaced_people_and_evacuations: Evacuation orders, relocations, sheltering, families displaced.\n"
  "- injured_or_dead_people: Injuries, casualties, fatalities.\n"
  "- missing_or_found_people: People reported missing OR confirmed found/located/reunited. If not explicit, do NOT use this.\n"
  "- infrastructure_and_utility_damage: Physical damage or outages to roads, bridges, buildings, power, water, comms, caused by the disaster. If disaster context is unclear, prefer not_humanitarian or other_relevant_information.\n"
  "- sympathy_and_support: Prayers, thoughts, condolences, “stay strong”, morale support ONLY (no requests, offers, warnings).\n"
  "- other_relevant_information: On-topic situation info that fits none of the above (e.g., event stats, forecasts, timelines) AND is clearly disaster-related.\n"
  "- not_humanitarian: Unrelated to disasters/aid or unclear/no disaster context.\n"
  "Tie-breakers:\n"
  "1) ASKING vs OFFERING → ASKING wins (requests_or_urgent_needs).\n"
  "2) People vs infrastructure → if injuries/casualties/missing are present, choose the people class.\n"
  "3) Sympathy vs caution → only actionable verbs → caution_and_advice; otherwise sympathy_and_support.\n"
  "4) Infra damage needs disaster context; otherwise not_humanitarian."
)

# === Structured Outputs schema for Mode S ===
# JSON Schema attached to every request via `response_format.json_schema`.
# - "label" is mandatory and restricted to LABELS.
# - "confidence" is optional (not listed in "required"), so the parsers read it
#   with `.get("confidence", None)` and must tolerate its absence.
# NOTE(review): the requests send this schema without a `strict` flag — confirm
# whether hard schema enforcement is wanted.
SCHEMA_S = {
    "type": "object",
    "properties": {
        "label": {"type": "string", "enum": LABELS},
        "confidence": {"type": "number", "minimum": 0, "maximum": 1}
    },
    "required": ["label"],
    "additionalProperties": False
}


# Helpers: paths, TSV loader, tiny F1 (optional)

In [24]:
def load_tsv(path, id_col="tweet_id", text_col="tweet_text", label_col="class_label"):
    """
    Load a tab-separated file into a DataFrame with canonical column names.

    Every column is read as a string and missing cells become "". The columns
    named by `id_col`, `text_col` and (when present in the file) `label_col`
    are renamed to tweet_id, tweet_text and class_label. If the file has no
    label column, an all-empty class_label column is added.

    Raises AssertionError when tweet_id or tweet_text is missing after renaming.
    """
    frame = pd.read_csv(path, sep="\t", dtype=str).fillna("")

    # Only columns whose source name differs from the canonical one need renaming.
    column_map = {
        source: target
        for source, target in ((id_col, "tweet_id"), (text_col, "tweet_text"))
        if source != target
    }
    # The label column is optional, so it is mapped only when the file has it.
    if label_col and label_col != "class_label" and label_col in frame.columns:
        column_map[label_col] = "class_label"
    frame = frame.rename(columns=column_map)

    assert "tweet_id" in frame.columns and "tweet_text" in frame.columns, "Need tweet_id + tweet_text"

    if "class_label" not in frame.columns:
        frame["class_label"] = ""
    return frame

def plan_run_dirs(
    dataset_path: str,
    out_root: str = "runs",
    model: str = "gpt-4o-mini",
    tag: str = ""
):
    """
    Create the output directory for one run and return its planned file paths.

    Layout:
      <out_root>/<event>/<split>/<model>/<run_id>/
        - requests.jsonl
        - outputs.jsonl
        - predictions.csv
        - batch_meta.json
    event  = name of the dataset file's parent folder
             (e.g., 'california_wildfires_2018')
    split  = file stem with the event name and surrounding underscores removed
             (e.g., 'dev' from 'california_wildfires_2018_dev.tsv'); "data"
             when nothing is left
    run_id = local timestamp 'YYYYmmdd-HHMMSS', followed by '-<tag>' if a tag
             is given

    The directory is created immediately; the files themselves are not.
    Returns a dict with event, split, model, run_id, dir and the four file paths.
    """
    source = pathlib.Path(dataset_path)
    event = source.parent.name

    split = source.stem.replace(event, "").strip("_")
    if not split:
        split = "data"

    stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    run_id = f"{stamp}-{tag}" if tag else stamp

    run_dir = pathlib.Path(out_root).joinpath(event, split, model, run_id)
    run_dir.mkdir(parents=True, exist_ok=True)

    plan = {
        "event": event,
        "split": split,
        "model": model,
        "run_id": run_id,
        "dir": run_dir,
    }
    artifacts = (
        ("requests_jsonl", "requests.jsonl"),
        ("outputs_jsonl", "outputs.jsonl"),
        ("predictions_csv", "predictions.csv"),
        ("batch_meta_json", "batch_meta.json"),
    )
    for key, filename in artifacts:
        plan[key] = run_dir / filename
    return plan

def macro_f1(df, truth_col="class_label", pred_col="predicted_label"):
    """
    Compute macro-averaged F1 without scikit-learn.

    Rows whose truth OR prediction is the empty string are excluded before
    scoring. The class set is the union of the remaining truth and predicted
    labels; a class with no true positives contributes an F1 of 0.
    Returns NaN when no rows remain after filtering.
    """
    def _safe_div(numerator, denominator):
        # Undefined ratios (zero denominator) are scored as 0.0.
        return numerator / denominator if denominator else 0.0

    scored = df[(df[truth_col] != "") & (df[pred_col] != "")]
    if scored.empty:
        return float("nan")

    truth = scored[truth_col]
    pred = scored[pred_col]

    per_class = []
    for label in sorted(set(truth) | set(pred)):
        is_truth = truth == label
        is_pred = pred == label
        tp = (is_truth & is_pred).sum()
        fp = (~is_truth & is_pred).sum()
        fn = (is_truth & ~is_pred).sum()
        precision = _safe_div(tp, tp + fp)
        recall = _safe_div(tp, tp + fn)
        per_class.append(_safe_div(2*precision*recall, precision + recall))
    return sum(per_class)/len(per_class)


# Dry-run (sync) on a small sample (no Batch)

In [25]:
# === Helper: build the user message (ZERO-SHOT, no examples) ===
def make_user_message(tweet_text: str, rules: str, labels: list[str]) -> str:
    """
    Assemble the zero-shot user prompt for one tweet.

    The message lists the allowed labels (Python list repr), the rules with
    surrounding whitespace stripped, a one-line instruction, and finally the
    tweet wrapped in triple double-quotes.
    """
    header = f"Allowed labels: {labels}\n"
    rules_section = "Rules:\n" + f"{rules.strip()}\n"
    instruction = "Choose exactly one label. If unrelated, choose 'not_humanitarian'.\n"
    tweet_section = f'Tweet: """{tweet_text}"""'
    return "".join((header, rules_section, instruction, tweet_section))
    
# === Dry-run (sync) on a small sample ===
def sync_test_sample(
    df: pd.DataFrame,
    n: int = 5,
    rules: str = RULES_1,         # pass RULES_2 to compare
    model: str = "gpt-4o-mini",
    temperature: float = 0.0,
    seed: int = 1,
) -> pd.DataFrame:
    """
    ZERO-SHOT test on n examples to verify output format & quick quality.

    Samples up to `n` rows from `df` (reproducibly, via `seed`), sends one
    synchronous chat.completions request per row with the Mode S schema, and
    prints the macro-F1 of the sample.

    A reply that is not valid JSON (e.g. cut off at max_tokens) or is not a
    JSON object is reported on stdout and recorded with an empty
    predicted_label, so one bad reply does not discard the rows already
    collected.

    Returns a DataFrame with tweet_id, tweet_text, class_label,
    predicted_label, confidence, entropy (entropy=NaN in Mode S). The columns
    are present even when the sample is empty.

    Raises requests.HTTPError on a non-2xx API response (the error body is
    printed first).
    """
    columns = ["tweet_id", "tweet_text", "class_label", "predicted_label", "confidence", "entropy"]
    test = df.sample(min(n, len(df)), random_state=seed).copy()
    rows = []
    for _, r in test.iterrows():
        user_msg = make_user_message(str(r["tweet_text"]), rules, LABELS)
        body = {
            "model": model,
            "temperature": temperature,
            "top_p": 1,
            "max_tokens": 40,   # chat.completions uses max_tokens
            "messages": [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_msg},
            ],
            "response_format": {
                "type": "json_schema",
                "json_schema": {"name": "tweet_label", "schema": SCHEMA_S},
            },
        }
        resp = requests.post(f"{OPENAI_BASE}/chat/completions", headers=H_JSON, json=body, timeout=60)
        if resp.status_code != 200:
            print(">>> API error body:", resp.text)
        resp.raise_for_status()
        choice = resp.json()["choices"][0]["message"]
        parsed = choice.get("parsed")
        if not parsed:
            # chat.completions often returns JSON as a STRING in message.content
            content = choice.get("content", "")
            try:
                parsed = json.loads(content) if content else {}
            except json.JSONDecodeError:
                # Truncated or malformed reply: keep going, leave the label empty.
                print(f">>> Unparseable model output for tweet {r['tweet_id']}: {content!r}")
                parsed = {}
        if not isinstance(parsed, dict):
            # Valid JSON but not an object (e.g. a bare string): nothing to read.
            parsed = {}
        rows.append({
            "tweet_id": r["tweet_id"],
            "tweet_text": r["tweet_text"],
            "class_label": r.get("class_label", ""),
            "predicted_label": parsed.get("label", ""),
            "confidence": parsed.get("confidence", None),
            "entropy": float("nan"),  # Mode S has no distribution → no entropy
        })
    # Explicit columns keep the frame well-formed (and macro_f1 callable) for an empty sample.
    out = pd.DataFrame(rows, columns=columns)
    print("Macro-F1 (tiny sample):", macro_f1(out))
    return out


# Build Batch requests.jsonl

In [26]:
# === Build Batch requests.jsonl (Mode S, ZERO-SHOT) ===
def build_requests_jsonl_S(
    df: pd.DataFrame,
    out_path: str,
    rules: str = RULES_1,         # pass RULES_2 to compare
    model: str = "gpt-4o-mini",
    temperature: float = 0.0,
):
    """
    Write one Batch request per row of `df` to `out_path` (JSONL, UTF-8).

    Each line targets /v1/chat/completions with the Mode S response schema and
    a ZERO-SHOT prompt (rules only, no examples). The tweet id, stripped of
    surrounding whitespace, becomes the custom_id "tweet-<id>"; carriage
    returns in the tweet text are replaced by spaces before it is embedded.

    Returns `out_path` unchanged.
    """
    # These parts are identical for every request, so build them once.
    system_message = {"role": "system", "content": SYSTEM_PROMPT}
    response_format = {
        "type": "json_schema",
        "json_schema": {"name": "tweet_label", "schema": SCHEMA_S},
    }

    with open(out_path, "w", encoding="utf-8") as sink:
        for _, record in df.iterrows():
            tweet_id = str(record["tweet_id"]).strip()
            tweet_text = str(record["tweet_text"] or "").replace("\r", " ").strip()
            request = {
                "custom_id": f"tweet-{tweet_id}",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": model,
                    "temperature": temperature,
                    "top_p": 1,
                    "max_tokens": 40,   # chat.completions uses max_tokens
                    "messages": [
                        system_message,
                        {"role": "user", "content": make_user_message(tweet_text, rules, LABELS)},
                    ],
                    "response_format": response_format,
                },
            }
            sink.write(json.dumps(request, ensure_ascii=False))
            sink.write("\n")
    return out_path



# Batch helpers (upload → create → poll → download)

In [27]:
def upload_file_for_batch(filepath: str) -> str:
    """
    Upload a local file to the Files endpoint with purpose "batch".

    Returns the id of the uploaded file. Raises requests.HTTPError on a
    non-2xx response.
    """
    upload_url = f"{OPENAI_BASE}/files"
    with open(filepath, "rb") as handle:
        multipart = {"file": (os.path.basename(filepath), handle)}
        response = requests.post(
            upload_url,
            headers=H_MULTI,
            files=multipart,
            data={"purpose":"batch"},
            timeout=60,
        )
    response.raise_for_status()
    uploaded = response.json()
    return uploaded["id"]

def create_batch(input_file_id: str, endpoint="/v1/chat/completions", completion_window="24h") -> str:
    """
    Create a batch job for a previously uploaded request file.

    Returns the id of the new batch. Raises requests.HTTPError on a non-2xx
    response.
    """
    response = requests.post(
        f"{OPENAI_BASE}/batches",
        headers=H_JSON,
        json={
            "input_file_id": input_file_id,
            "endpoint": endpoint,
            "completion_window": completion_window,
        },
        timeout=60,
    )
    response.raise_for_status()
    batch = response.json()
    return batch["id"]

def get_batch(batch_id: str) -> dict:
    """
    Fetch the current batch object for `batch_id` as a dict.

    Raises requests.HTTPError on a non-2xx response.
    """
    batch_url = f"{OPENAI_BASE}/batches/{batch_id}"
    response = requests.get(batch_url, headers=H_JSON, timeout=60)
    response.raise_for_status()
    return response.json()

def wait_for_batch(batch_id: str, poll_secs=15) -> dict:
    """
    Poll a batch until it reaches a terminal status and return its final info.

    The status is printed on every poll, with `poll_secs` seconds of sleep in
    between. Terminal statuses are completed, failed, expired and cancelled;
    "expired" is included so that a batch which missed its completion window
    does not keep this loop running forever.

    Callers should inspect the returned dict's "status" before using the
    batch's output, since only one of the terminal statuses means success.
    """
    terminal_statuses = {"completed", "failed", "expired", "cancelled"}
    while True:
        info = get_batch(batch_id)
        status = info.get("status")
        print(f"[batch {batch_id}] status = {status}")
        if status in terminal_statuses:
            return info
        time.sleep(poll_secs)

def download_file_content(file_id: str, out_path: str) -> str:
    """
    Download the content of file `file_id` and write it to `out_path` as bytes.

    The whole response body is held in memory before it is written.
    Returns `out_path`. Raises requests.HTTPError on a non-2xx response.
    """
    content_url = f"{OPENAI_BASE}/files/{file_id}/content"
    response = requests.get(content_url, headers=H_JSON, timeout=300)
    response.raise_for_status()
    payload = response.content
    with open(out_path, "wb") as sink:
        sink.write(payload)
    return out_path


# Parse Batch outputs → DataFrame/CSV

In [28]:
# === Parse Batch outputs back to DataFrame (Mode S robust) ===
def parse_outputs_S_to_df(outputs_jsonl_path: str, source_df: pd.DataFrame) -> pd.DataFrame:
    """
    Turn a Batch outputs.jsonl file into a predictions DataFrame.

    Each non-blank line yields one row. Tweet text and truth label are looked
    up locally in `source_df` by tweet id (so the model never has to echo
    them); ids that are not found get empty text/label.

    Records that carry no usable prediction — a failed request without a
    response body, no choices, truncated or non-object JSON content — are kept
    as rows with an empty predicted_label instead of aborting the whole parse.

    Returns a DataFrame with columns tweet_id, tweet_text, class_label,
    predicted_label, confidence, entropy (entropy is always NaN in Mode S).

    Raises json.JSONDecodeError if a line of the file itself is not valid JSON.
    """
    # Map id → local fields so we don’t pay tokens to echo text/labels.
    # Ids are stripped because the request builder strips them for custom_id.
    if "class_label" in source_df.columns:
        truth_labels = source_df["class_label"]
    else:
        truth_labels = [""] * len(source_df)
    by_id = {
        str(tid).strip(): {"tweet_text": text, "class_label": label}
        for tid, text, label in zip(source_df["tweet_id"], source_df["tweet_text"], truth_labels)
    }

    rows = []
    with open(outputs_jsonl_path, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank/trailing lines
            rec = json.loads(line)
            # Remove only the leading prefix; replace() would also hit later occurrences.
            tid = str(rec.get("custom_id") or "").removeprefix("tweet-")
            parsed = _prediction_from_record(rec)
            local = by_id.get(tid, {"tweet_text": "", "class_label": ""})
            rows.append({
                "tweet_id": tid,
                "tweet_text": local["tweet_text"],
                "class_label": local["class_label"],
                "predicted_label": parsed.get("label", ""),
                "confidence": parsed.get("confidence", None),
                "entropy": float("nan"),
            })
    return pd.DataFrame(rows, columns=["tweet_id", "tweet_text", "class_label", "predicted_label", "confidence", "entropy"])


def _prediction_from_record(rec: dict) -> dict:
    """Return the parsed prediction object of one batch record, or {} if there is none."""
    try:
        message = rec["response"]["body"]["choices"][0]["message"]
    except (KeyError, IndexError, TypeError):
        # Failed request: response is null, or the body has no choices.
        return {}
    if not isinstance(message, dict):
        return {}

    parsed = message.get("parsed")
    if not parsed:
        content = message.get("content") or ""
        if isinstance(content, list):  # very rare for chat.completions
            first = content[0] if content else {}
            content = first.get("text", "") if isinstance(first, dict) else ""
        try:
            parsed = json.loads(content) if content else {}
        except (json.JSONDecodeError, TypeError):
            # Truncated or otherwise malformed model output.
            return {}
    return parsed if isinstance(parsed, dict) else {}

# TEST WITH RULES 1 (end-to-end)

In [19]:
# 0) Choose an input TSV and load it with canonical column names
dataset_path = "Dataset/HumAID/california_wildfires_2018/california_wildfires_2018_dev.tsv"
df = load_tsv(dataset_path, id_col="tweet_id", text_col="tweet_text", label_col="class_label")

# 1) Dry-run on 20 samples (RULES_1) to sanity-check the output format
demo = sync_test_sample(df, n=20, rules=RULES_1, model="gpt-4o-mini", temperature=0.0, seed=42)
# NOTE: not the cell's last expression, so the notebook does not render it.
demo.head()
# sync_test_sample already printed the macro-F1 once; this repeats it.
print("Sample macro-F1:", macro_f1(demo))

Macro-F1 (on this tiny sample only): 0.6958333333333333
Sample macro-F1: 0.6958333333333333


In [20]:
# 2) Plan run dirs and build requests.jsonl
# No `rules` argument is passed, so build_requests_jsonl_S uses its default (RULES_1).
plan = plan_run_dirs(dataset_path, out_root="runs", model="gpt-4o-mini", tag="modeS")
req_path = build_requests_jsonl_S(df, plan["requests_jsonl"], model=plan["model"], temperature=0.0)
print("Wrote JSONL to:", req_path)

Wrote JSONL to: runs\california_wildfires_2018\dev\gpt-4o-mini\20251017-192014-modeS\requests.jsonl


In [21]:
# 3) Upload → Create batch → Poll
file_id  = upload_file_for_batch(str(plan["requests_jsonl"]))
batch_id = create_batch(file_id, endpoint="/v1/chat/completions", completion_window="24h")
# Persist the ids next to the run's other files before the (long) polling starts.
with open(plan["batch_meta_json"], "w", encoding="utf-8") as f: json.dump({"file_id":file_id,"batch_id":batch_id}, f, indent=2)
# Blocks until wait_for_batch sees a terminal status; prints the status every 20 s.
info = wait_for_batch(batch_id, poll_secs=20)

[batch batch_68f2f9c60e1c819094bb025dce1cc001] status = validating
[batch batch_68f2f9c60e1c819094bb025dce1cc001] status = validating
[batch batch_68f2f9c60e1c819094bb025dce1cc001] status = validating
[batch batch_68f2f9c60e1c819094bb025dce1cc001] status = validating
[batch batch_68f2f9c60e1c819094bb025dce1cc001] status = in_progress
[batch batch_68f2f9c60e1c819094bb025dce1cc001] status = in_progress
[batch batch_68f2f9c60e1c819094bb025dce1cc001] status = in_progress
[batch batch_68f2f9c60e1c819094bb025dce1cc001] status = in_progress
[batch batch_68f2f9c60e1c819094bb025dce1cc001] status = in_progress
[batch batch_68f2f9c60e1c819094bb025dce1cc001] status = in_progress
[batch batch_68f2f9c60e1c819094bb025dce1cc001] status = in_progress
[batch batch_68f2f9c60e1c819094bb025dce1cc001] status = in_progress
[batch batch_68f2f9c60e1c819094bb025dce1cc001] status = in_progress
[batch batch_68f2f9c60e1c819094bb025dce1cc001] status = in_progress
[batch batch_68f2f9c60e1c819094bb025dce1cc001] statu

In [22]:
# 4) Download outputs and parse
# NOTE(review): presumably "output_file_id" is only usable when the batch
# completed — check info["status"] before relying on it.
out_file_id = info["output_file_id"]
download_file_content(out_file_id, str(plan["outputs_jsonl"]))
preds_df = parse_outputs_S_to_df(plan["outputs_jsonl"], df)
preds_df.to_csv(plan["predictions_csv"], index=False)
print("Saved predictions to:", plan["predictions_csv"])

# 5) (Optional) Evaluate if truth labels exist
# macro_f1 returns NaN when no row has both a truth label and a prediction.
print("Macro-F1:", macro_f1(preds_df))
preds_df.head()


Saved predictions to: runs\california_wildfires_2018\dev\gpt-4o-mini\20251017-192014-modeS\predictions.csv
Macro-F1: 0.590928846013498


Unnamed: 0,tweet_id,tweet_text,class_label,predicted_label,confidence,entropy
0,1065844591805390849,"Camp Fire leaves over 13,000 without homes thi...",displaced_people_and_evacuations,displaced_people_and_evacuations,0.9,
1,1061320723749330944,"So in a truly strange world, we have @RealJame...",not_humanitarian,not_humanitarian,0.9,
2,1063535672793944065,66 people have died and more than 600 are stil...,injured_or_dead_people,injured_or_dead_people,0.95,
3,1062711111869333504,BBC News - California wildfires: Nine dead and...,injured_or_dead_people,displaced_people_and_evacuations,0.9,
4,1064807520802197504,Death toll in California’s #CampFire has climb...,injured_or_dead_people,injured_or_dead_people,0.9,


# TEST WITH RULES 2

In [29]:
# 0) Load TSV (same dev split as the RULES_1 run)
dataset_path = "Dataset/HumAID/california_wildfires_2018/california_wildfires_2018_dev.tsv"
df = load_tsv(dataset_path, id_col="tweet_id", text_col="tweet_text", label_col="class_label")

# 1) Dry-run on 20 samples with RULES_2; seed=42 matches the RULES_1 dry-run call
demo = sync_test_sample(df, n=20, rules=RULES_2, model="gpt-4o-mini", temperature=0.0, seed=42)
# sync_test_sample already printed the macro-F1 once; the print below repeats it.
demo.head(); print("Sample macro-F1:", macro_f1(demo))

Macro-F1 (tiny sample): 0.6592592592592593
Sample macro-F1: 0.6592592592592593


In [31]:
# 2) Plan run dirs (tag with the rules name so runs are distinct)
plan = plan_run_dirs(dataset_path, out_root="runs", model="gpt-4o-mini", tag="modeS-RULES2")

# 3) Build requests.jsonl with the chosen rules (ZERO-SHOT)
# RULES_2 must be passed explicitly; the builder's default is RULES_1.
req_path = build_requests_jsonl_S(df, plan["requests_jsonl"], rules=RULES_2, model=plan["model"], temperature=0.0)

print("Wrote JSONL to:", req_path)

Wrote JSONL to: runs\california_wildfires_2018\dev\gpt-4o-mini\20251017-202955-modeS-RULES2\requests.jsonl


In [32]:
# 4) Submit batch and wait (download and parsing happen in the next cell)
file_id  = upload_file_for_batch(str(plan["requests_jsonl"]))
batch_id = create_batch(file_id, endpoint="/v1/chat/completions", completion_window="24h")
# Persist the ids next to the run's other files before the (long) polling starts.
with open(plan["batch_meta_json"], "w", encoding="utf-8") as f: json.dump({"file_id":file_id,"batch_id":batch_id}, f, indent=2)
# Blocks until wait_for_batch sees a terminal status; prints the status every 20 s.
info = wait_for_batch(batch_id, poll_secs=20)

[batch batch_68f30a1f0e208190adf5b3debfc206b9] status = validating
[batch batch_68f30a1f0e208190adf5b3debfc206b9] status = in_progress
[batch batch_68f30a1f0e208190adf5b3debfc206b9] status = in_progress
[batch batch_68f30a1f0e208190adf5b3debfc206b9] status = in_progress
[batch batch_68f30a1f0e208190adf5b3debfc206b9] status = in_progress
[batch batch_68f30a1f0e208190adf5b3debfc206b9] status = in_progress
[batch batch_68f30a1f0e208190adf5b3debfc206b9] status = in_progress
[batch batch_68f30a1f0e208190adf5b3debfc206b9] status = in_progress
[batch batch_68f30a1f0e208190adf5b3debfc206b9] status = in_progress
[batch batch_68f30a1f0e208190adf5b3debfc206b9] status = in_progress
[batch batch_68f30a1f0e208190adf5b3debfc206b9] status = in_progress
[batch batch_68f30a1f0e208190adf5b3debfc206b9] status = in_progress
[batch batch_68f30a1f0e208190adf5b3debfc206b9] status = in_progress
[batch batch_68f30a1f0e208190adf5b3debfc206b9] status = in_progress
[batch batch_68f30a1f0e208190adf5b3debfc206b9] st

In [33]:
# 5) Download outputs and parse
# NOTE(review): presumably "output_file_id" is only usable when the batch
# completed — check info["status"] before relying on it.
out_file_id = info["output_file_id"]
download_file_content(out_file_id, str(plan["outputs_jsonl"]))
preds_df = parse_outputs_S_to_df(plan["outputs_jsonl"], df)
preds_df.to_csv(plan["predictions_csv"], index=False)
print("Saved predictions to:", plan["predictions_csv"])

# 6) (Optional) Evaluate if truth labels exist
# macro_f1 returns NaN when no row has both a truth label and a prediction.
print("Macro-F1:", macro_f1(preds_df))
preds_df.head()


Saved predictions to: runs\california_wildfires_2018\dev\gpt-4o-mini\20251017-202955-modeS-RULES2\predictions.csv
Macro-F1: 0.6004880811940472


Unnamed: 0,tweet_id,tweet_text,class_label,predicted_label,confidence,entropy
0,1065844591805390849,"Camp Fire leaves over 13,000 without homes thi...",displaced_people_and_evacuations,displaced_people_and_evacuations,0.9,
1,1061320723749330944,"So in a truly strange world, we have @RealJame...",not_humanitarian,not_humanitarian,0.9,
2,1063535672793944065,66 people have died and more than 600 are stil...,injured_or_dead_people,injured_or_dead_people,0.95,
3,1062711111869333504,BBC News - California wildfires: Nine dead and...,injured_or_dead_people,displaced_people_and_evacuations,0.9,
4,1064807520802197504,Death toll in California’s #CampFire has climb...,injured_or_dead_people,injured_or_dead_people,0.9,
