# Step 5 - RDLS Classification Review & Overrides (HDX → RDLS)

**Goal:** add a *defensible* human QA loop to the Step 4 machine classification.

This notebook helps you:
1) Build a **review pack** (CSV) prioritizing low/medium-confidence RDLS candidates (excluding policy-excluded OSM by default).
2) Capture human decisions (keep / exclude / adjust components) in a structured way.
3) Convert the reviewed CSV into `config/overrides.yaml`.
4) Apply overrides to the full classification to produce a **final** classification table + final included IDs list.

## Inputs
- `derived/classification.csv` (from Step 4)
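- `policy/osm_excluded_dataset_ids.txt` (from Step 2) — Step 4 already applied it to the classification; this notebook loads it again to derive `is_osm` (when Step 4 did not export that column) and to enforce the "no OSM override" rule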
- Optional existing `config/overrides.yaml` (if you re-run)

## Outputs
- `derived/review/review_pack.csv` — edit this file (in Excel/VS Code) and fill the decision columns
- `config/overrides.yaml` — machine-readable override rules
- `derived/classification_final.csv` — Step 4 classification with overrides applied
- `derived/classification_final_summary.json`
- `derived/rdls_included_dataset_ids_final.txt`

## Override policy
- **OSM exclusion remains authoritative by default.** If an override tries to include an OSM-excluded dataset, it will be ignored unless you explicitly enable `ALLOW_OSM_OVERRIDE = True` in the config cell.


### Cell 1

In [1]:
# =========================
# Configuration & file paths
# =========================
from __future__ import annotations

import json
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import pandas as pd

try:
    import yaml  # PyYAML
except Exception as e:
    raise ImportError(
        "PyYAML is required for this notebook. Install with: pip install pyyaml"
    ) from e


# When you run the notebook from ./notebooks/, we want the project root to be the parent.
# This makes the notebook robust to being executed from different working directories.
def guess_project_root() -> Path:
    """Return the project root inferred from the current working directory.

    If the resolved working directory is named ``notebook`` or ``notebooks``
    (case-insensitive), its parent is returned; otherwise the working
    directory itself is returned.
    """
    here = Path.cwd().resolve()
    launched_from_notebooks = here.name.lower() in ("notebook", "notebooks")
    return here.parent if launched_from_notebooks else here

# Every path below is derived from the project root resolved here.
PROJECT_ROOT = guess_project_root()

# Step 1 output: root of the metadata dump
DUMP_DIR = PROJECT_ROOT.joinpath("hdx_dataset_metadata_dump")

# Step 4 output: machine classification
DERIVED_DIR = DUMP_DIR.joinpath("derived")
CLASSIFICATION_CSV = DERIVED_DIR.joinpath("classification.csv")

# Step 2 output: OSM exclusion policy
POLICY_DIR = DUMP_DIR.joinpath("policy")
OSM_EXCLUDED_IDS_TXT = POLICY_DIR.joinpath("osm_excluded_dataset_ids.txt")

# Step 5 output: review pack edited by humans
REVIEW_DIR = DERIVED_DIR.joinpath("review")
REVIEW_PACK_CSV = REVIEW_DIR.joinpath("review_pack.csv")

# Overrides config generated from the reviewed pack
CONFIG_DIR = DUMP_DIR.joinpath("config")
OVERRIDES_YAML = CONFIG_DIR.joinpath("overrides.yaml")

# Step 5 output: final artifacts
CLASSIFICATION_FINAL_CSV = DERIVED_DIR.joinpath("classification_final.csv")
CLASSIFICATION_FINAL_SUMMARY_JSON = DERIVED_DIR.joinpath("classification_final_summary.json")
RDLS_INCLUDED_IDS_FINAL_TXT = DERIVED_DIR.joinpath("rdls_included_dataset_ids_final.txt")

# Policy switch: by default humans may NOT override the OSM exclusion decision.
# (Enable only for a curated pilot track.)
ALLOW_OSM_OVERRIDE = False

# Number of records in the review pack (tune to your review capacity)
REVIEW_PACK_SIZE = 1500

# Confidence levels that are reviewed first
PRIORITIZE_CONFIDENCE = ("low", "medium")

# Make sure the output folders exist before any cell writes to them
for _out_dir in (REVIEW_DIR, CONFIG_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)

print("PROJECT_ROOT:", PROJECT_ROOT)
print("DUMP_DIR     :", DUMP_DIR)
print("CLASSIFICATION_CSV exists?", CLASSIFICATION_CSV.exists())


PROJECT_ROOT: C:\Users\benny\OneDrive\Documents\Github\hdx-metadata-crawler
DUMP_DIR     : C:\Users\benny\OneDrive\Documents\Github\hdx-metadata-crawler\hdx_dataset_metadata_dump
CLASSIFICATION_CSV exists? True


### Cell 2

In [2]:
# ======================
# Helpers: load & validate
# ======================
import re

# Columns that must be present in classification.csv; load_classification_table
# raises ValueError listing any that are missing.
REQUIRED_COLUMNS = [
    "dataset_id",
    "title",
    "organization",
    "dataset_source",
    "license_title",
    "tags",
    "groups",
    "formats",
    "excluded_by_policy",
    "rdls_candidate",
    "rdls_components",
    "confidence",
    "score_hazard",
    "score_exposure",
    "score_vulnerability_proxy",
    "score_loss_impact",
]

def _to_bool_series(s):
    """Robust bool coercion for CSV roundtrips (handles True/False/blank)."""
    return (
        s.astype(str)
        .str.strip()
        .str.lower()
        .map({"true": True, "false": False, "1": True, "0": False, "yes": True, "no": False})
        .fillna(False)
        .astype(bool)
    )

def load_classification_table(path: Path) -> pd.DataFrame:
    """Read the Step 4 classification CSV and normalize the columns used downstream.

    Args:
        path: Location of ``classification.csv``.

    Returns:
        The table with ``excluded_by_policy`` / ``rdls_candidate`` coerced to
        booleans and ``confidence`` as trimmed, lower-cased text (missing
        values become ``"unknown"``).

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If any column listed in ``REQUIRED_COLUMNS`` is absent.
    """
    if not path.exists():
        raise FileNotFoundError(f"Missing Step 4 output: {path}")

    table = pd.read_csv(path)

    absent = [name for name in REQUIRED_COLUMNS if name not in table.columns]
    if absent:
        raise ValueError(
            "classification.csv is missing required columns: "
            + ", ".join(absent)
            + "\n"
            + "Did Step 4 run completely with the expected version?"
        )

    # Only these two flags are guaranteed to exist at this point.
    for flag in ("excluded_by_policy", "rdls_candidate"):
        table[flag] = _to_bool_series(table[flag])

    confidence_text = table["confidence"].fillna("unknown").astype(str)
    table["confidence"] = confidence_text.str.strip().str.lower()
    return table


def load_osm_excluded_ids(path: Path) -> set[str]:
    """Load the OSM exclusion list as a set of dataset IDs.

    Step 4 already applied this policy; the list is loaded so that the
    "no override include" rule can be enforced by default.

    Args:
        path: Text file with one dataset ID per line.

    Returns:
        The non-blank, whitespace-trimmed lines of the file. If the file does
        not exist, a warning is printed and an empty set is returned.
    """
    if path.exists():
        lines = path.read_text(encoding="utf-8").splitlines()
        return {entry for entry in map(str.strip, lines) if entry}

    print(f"WARNING: OSM exclusion list not found: {path}. Proceeding with empty exclusion set.")
    return set()


# --- Load inputs ---
osm_excluded_ids = load_osm_excluded_ids(OSM_EXCLUDED_IDS_TXT)
df = load_classification_table(CLASSIFICATION_CSV)

# --- Make sure an `is_osm` flag exists ---
# Use the column exported by Step 4 when present; otherwise derive it from the
# Step 2 exclusion list.
df["is_osm"] = (
    _to_bool_series(df["is_osm"])
    if "is_osm" in df.columns
    else df["dataset_id"].astype(str).isin(osm_excluded_ids)
)

# Defensive: coerce the policy/candidate flags once more now that `is_osm` exists.
for _flag in ("excluded_by_policy", "rdls_candidate"):
    df[_flag] = _to_bool_series(df[_flag])

print("Rows:", len(df))
print("Unique dataset_id:", df["dataset_id"].nunique())
print("OSM excluded IDs loaded:", len(osm_excluded_ids))
print("Derived is_osm True:", int(df["is_osm"].sum()))


Rows: 26246
Unique dataset_id: 26246
OSM excluded IDs loaded: 3649
Derived is_osm True: 3649


### Cell 3

In [3]:
# ==========================================
# Build a review pack (CSV) for human labeling
# ==========================================
#
# The review pack is a subset of datasets you will manually check.
# You will edit the output CSV and fill these columns:
#
# - decision: keep | exclude | unsure
# - components_override: comma-separated list, e.g. "hazard,exposure"
# - notes: free text, why you changed it
#
# The notebook will then convert that CSV into config/overrides.yaml.

REVIEW_COLUMNS = [
    "dataset_id",
    "title",
    "organization",
    "dataset_source",
    "license_title",
    "tags",
    "groups",
    "formats",
    "url",
    "is_osm",
    "excluded_by_policy",
    "rdls_candidate",
    "rdls_components",
    "confidence",
    "score_hazard",
    "score_exposure",
    "score_vulnerability_proxy",
    "score_loss_impact",
    "top_signals",
]

# Columns filled in by human reviewers:
# decision (keep | exclude | unsure), components_override (e.g. "hazard,exposure"), notes.
HUMAN_COLUMNS = ["decision", "components_override", "notes"]

# Filter: only candidates that are currently included
eligible = df[df["rdls_candidate"] & ~df["excluded_by_policy"]].copy()

# Total score across the four component scores (used as the secondary sort key)
eligible["total_score"] = (
    eligible["score_hazard"]
    + eligible["score_exposure"]
    + eligible["score_vulnerability_proxy"]
    + eligible["score_loss_impact"]
)

# Priority subset (low/medium confidence), ordered by confidence then by
# total score descending. Confidence sorts alphabetically, which puts "low"
# ahead of "medium".
priority = eligible[eligible["confidence"].isin(PRIORITIZE_CONFIDENCE)].copy()
priority = priority.sort_values(["confidence", "total_score"], ascending=[True, False])
review_pack = priority.head(REVIEW_PACK_SIZE)

# If the priority subset is smaller than requested, top up with the remaining
# eligible rows (highest total score first) to keep the pack size stable.
if len(review_pack) < REVIEW_PACK_SIZE:
    remaining = eligible[~eligible.index.isin(review_pack.index)].sort_values(
        ["total_score"], ascending=[False]
    )
    review_pack = pd.concat(
        [review_pack, remaining.head(REVIEW_PACK_SIZE - len(review_pack))],
        ignore_index=True,
    )

# Keep only columns we want in the pack (if present)
available_cols = [c for c in REVIEW_COLUMNS if c in review_pack.columns]
review_pack = review_pack[available_cols].copy()

# Add empty human-edit fields
for col in HUMAN_COLUMNS:
    review_pack[col] = ""

# Preserve reviewer input from an existing pack. Without this, re-running the
# cell (e.g. "Run All") would overwrite the CSV with blank decision columns and
# silently discard every decision already entered.
carried_over = 0
orphaned = 0
if REVIEW_PACK_CSV.exists():
    try:
        previous = pd.read_csv(REVIEW_PACK_CSV, dtype=str, keep_default_na=False)
    except pd.errors.EmptyDataError:
        previous = pd.DataFrame()

    if "dataset_id" in previous.columns:
        previous["dataset_id"] = previous["dataset_id"].str.strip()
        for col in HUMAN_COLUMNS:
            if col not in previous.columns:
                previous[col] = ""
            previous[col] = previous[col].str.strip()

        # Only rows where a reviewer actually typed something are worth carrying over.
        has_input = (previous[HUMAN_COLUMNS] != "").any(axis=1)
        previous = previous[has_input].drop_duplicates("dataset_id", keep="last")
        prior = previous.set_index("dataset_id")

        pack_ids = review_pack["dataset_id"].astype(str).str.strip()
        in_prior = pack_ids.isin(prior.index)
        for col in HUMAN_COLUMNS:
            review_pack.loc[in_prior, col] = pack_ids[in_prior].map(prior[col])
        carried_over = int(in_prior.sum())

        # Reviewed rows that no longer qualify for the pack are appended (with the
        # metadata they had when reviewed) so their decisions are not lost.
        orphans = previous[~previous["dataset_id"].isin(set(pack_ids))]
        orphaned = len(orphans)
        if orphaned > 0:
            review_pack = pd.concat(
                [review_pack, orphans.reindex(columns=review_pack.columns)],
                ignore_index=True,
            )

review_pack.to_csv(REVIEW_PACK_CSV, index=False, encoding="utf-8")
print(f"Wrote review pack: {REVIEW_PACK_CSV} (rows={len(review_pack)})")
if carried_over or orphaned:
    print(
        f"Preserved existing reviewer input: {carried_over} row(s) carried over, "
        f"{orphaned} previously reviewed row(s) appended."
    )

# Quick QA summary for reviewers
print("\nReview pack breakdown:")
print(review_pack["confidence"].value_counts(dropna=False))


Wrote review pack: C:\Users\benny\OneDrive\Documents\Github\hdx-metadata-crawler\hdx_dataset_metadata_dump\derived\review\review_pack.csv (rows=1500)

Review pack breakdown:
confidence
medium    1500
Name: count, dtype: int64


### Cell 4

In [4]:
# ==========================================================
# Convert reviewed CSV -> overrides.yaml (after you edit the CSV)
# ==========================================================
#
# Workflow:
# 1) Open derived/review/review_pack.csv
# 2) Fill decision (keep/exclude/unsure) and optional components_override + notes
# 3) Re-run this cell to generate config/overrides.yaml
#
# Notes:
# - components_override is optional. If omitted and decision=keep, the Step 4 components remain.
# - If decision=exclude, the dataset will be excluded even if Step 4 included it.
# - If decision=keep and Step 4 excluded by policy (OSM), it will only be included when ALLOW_OSM_OVERRIDE=True.

# Accepted values of the review pack's `decision` column after trimming and
# lower-casing; the empty string stands for a row that was left blank.
VALID_DECISIONS = {"keep", "exclude", "unsure", ""}

def parse_components_list(s: str) -> List[str]:
    """Split a comma-separated component string into a clean list.

    Each item is trimmed and lower-cased, blank items are discarded, and
    duplicates are removed while keeping first-seen order.
    """
    tokens = (piece.strip().lower() for piece in str(s).split(","))
    # dict keys keep insertion order, which gives an order-preserving de-duplication
    return list(dict.fromkeys(token for token in tokens if token))

# Component labels used by the classification (they match the score_* columns).
# Anything else in components_override is almost certainly a typo, such as
# "vulnerability" instead of "vulnerability_proxy".
VALID_COMPONENTS = ("hazard", "exposure", "vulnerability_proxy", "loss_impact")

reviewed = pd.read_csv(REVIEW_PACK_CSV).fillna("")

# Fail with a clear message when the edited CSV lost one of the key columns.
missing_cols = [c for c in ("dataset_id", "decision") if c not in reviewed.columns]
if missing_cols:
    raise ValueError(
        "review_pack.csv is missing required columns: " + ", ".join(missing_cols)
    )

reviewed["decision"] = reviewed["decision"].astype(str).str.strip().str.lower()

bad = reviewed[~reviewed["decision"].isin(VALID_DECISIONS)]
if len(bad) > 0:
    raise ValueError(
        "Invalid decision value(s) found. Allowed: keep, exclude, unsure, blank.\n"
        + bad[["dataset_id", "decision"]].head(20).to_string(index=False)
    )

overrides: Dict[str, Any] = {"overrides": {}}
bad_components: List[str] = []

for _, r in reviewed.iterrows():
    dsid = str(r["dataset_id"]).strip()
    decision = str(r["decision"]).strip().lower()
    # Blank and "unsure" rows produce no override entry.
    if not dsid or not decision or decision == "unsure":
        continue

    entry: Dict[str, Any] = {"decision": decision}

    comps = parse_components_list(r.get("components_override", ""))
    unknown = [c for c in comps if c not in VALID_COMPONENTS]
    if unknown:
        # Collect every offender so the reviewer can fix them all in one pass.
        bad_components.append(f"{dsid}: {', '.join(unknown)}")
        continue
    if comps:
        entry["components"] = comps

    notes = str(r.get("notes", "")).strip()
    if notes:
        entry["notes"] = notes

    overrides["overrides"][dsid] = entry

# Same policy as for invalid decisions: refuse to write a partially valid file.
if bad_components:
    raise ValueError(
        "Invalid components_override value(s) found. Allowed: "
        + ", ".join(VALID_COMPONENTS)
        + ".\n"
        + "\n".join(bad_components[:20])
    )

# Write overrides.yaml deterministically
with OVERRIDES_YAML.open("w", encoding="utf-8") as f:
    yaml.safe_dump(overrides, f, sort_keys=True, allow_unicode=True)

print(f"Wrote overrides: {OVERRIDES_YAML} (entries={len(overrides['overrides'])})")


Wrote overrides: C:\Users\benny\OneDrive\Documents\Github\hdx-metadata-crawler\hdx_dataset_metadata_dump\config\overrides.yaml (entries=0)


### Cell 5

In [5]:
# ==========================================
# Apply overrides to the full classification
# ==========================================
#
# Produces:
# - derived/classification_final.csv
# - derived/classification_final_summary.json
# - derived/rdls_included_dataset_ids_final.txt

def load_overrides(path: Path) -> Dict[str, Dict[str, Any]]:
    """Load override entries from ``overrides.yaml``, keyed by dataset ID.

    Args:
        path: Location of the overrides YAML file.

    Returns:
        Mapping of dataset ID (always a trimmed string) to its override entry.
        Empty when the file is missing, empty, or has no ``overrides`` section.

    Raises:
        ValueError: If the document or its ``overrides`` section is not a mapping.
    """
    if not path.exists():
        print(f"WARNING: overrides file not found: {path}. Proceeding with no overrides.")
        return {}

    data = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
    if not isinstance(data, dict):
        raise ValueError(f"Invalid overrides file (expected a mapping at top level): {path}")

    raw = data.get("overrides") or {}
    if not isinstance(raw, dict):
        raise ValueError(f"Invalid overrides file ('overrides' must be a mapping): {path}")

    cleaned: Dict[str, Dict[str, Any]] = {}
    for key, entry in raw.items():
        if not isinstance(entry, dict):
            print(f"WARNING: ignoring malformed override entry for {key!r} in {path}")
            continue
        # YAML may parse hand-edited, unquoted keys as non-strings; dataset IDs
        # are looked up as strings, so normalize them here.
        cleaned[str(key).strip()] = entry
    return cleaned

overrides_map = load_overrides(OVERRIDES_YAML)

final = df.copy()

# Add columns to track overrides
final["override_decision"] = ""
final["override_components"] = ""
final["excluded_by_override"] = False

# Compare IDs as trimmed strings on both sides so a lookup cannot miss because
# of a dtype or whitespace difference between the CSV and the YAML keys.
overrides_by_id = {str(k).strip(): v for k, v in overrides_map.items()}
final_ids = final["dataset_id"].astype(str).str.strip()
has_override = final_ids.isin(list(overrides_by_id))

# Only rows that actually have an override entry are visited.
for i, dsid in zip(final.index[has_override], final_ids[has_override]):
    ov = overrides_by_id.get(dsid)
    if not isinstance(ov, dict) or not ov:
        continue

    decision = str(ov.get("decision", "")).strip().lower()
    comps = ov.get("components", None)
    comps_csv = (
        ",".join(str(c).strip().lower() for c in comps)
        if isinstance(comps, list) and comps
        else ""
    )

    if decision in {"exclude", "keep"}:
        final.at[i, "override_decision"] = decision

    if comps_csv:
        final.at[i, "override_components"] = comps_csv

    if decision == "exclude":
        final.at[i, "excluded_by_override"] = True

    if decision == "keep":
        # 'keep' means "treat as RDLS candidate" with optional component override
        final.at[i, "rdls_candidate"] = True
        if comps_csv:
            final.at[i, "rdls_components"] = comps_csv

# Final exclusion rule: policy OR override-exclude
final["final_excluded"] = final["excluded_by_policy"] | final["excluded_by_override"]

# Honor ALLOW_OSM_OVERRIDE: a 'keep' on an OSM dataset must lift the policy
# exclusion, otherwise the switch has no effect (the policy flag alone would
# keep the row excluded). Done here so later steps see the row as included.
if ALLOW_OSM_OVERRIDE:
    osm_keep = (final["override_decision"] == "keep") & (
        final["is_osm"] | final_ids.isin(osm_excluded_ids)
    )
    if osm_keep.any():
        final.loc[osm_keep, "final_excluded"] = False
        print(
            f"NOTE: {int(osm_keep.sum())} OSM dataset(s) included by override "
            "(ALLOW_OSM_OVERRIDE=True)."
        )

# =======================================
# Enforce RDLS component combination rules
# =======================================
# Policy:
# - vulnerability must co-occur with hazard or exposure
# - loss must co-occur with hazard or exposure
# Inclusive normalization approach:
# - if vulnerability-only -> add exposure
# - if loss-only -> add exposure
#
# We keep a trace column so you can audit what changed.

# Audit-trail columns for the component normalization performed below.
final["components_normalized"] = False  # set to True on rows whose components get rewritten
final["components_normalization_notes"] = ""  # ";"-joined reason codes for the rewrite

def _parse_components(s: Any) -> set[str]:
    parts = [p.strip().lower() for p in str(s).split(",") if p.strip()]
    return set(parts)

def _join_components(s: set[str]) -> str:
    # stable order
    # order = ["hazard", "exposure", "vulnerability", "loss"]
    order = ["hazard", "exposure", "vulnerability_proxy", "loss_impact"]
    return ",".join([c for c in order if c in s])

# Components that may not stand alone, paired with the audit note recorded when
# exposure is added for them. Checked in this order.
_DEPENDENT_COMPONENTS = (
    ("vulnerability_proxy", "added_exposure_for_vulnerability_proxy"),
    ("loss_impact", "added_exposure_for_loss_impact"),
)

for i, r in final.iterrows():
    # Only included RDLS candidates are normalized.
    is_candidate = bool(r.get("rdls_candidate", False))
    is_excluded = bool(r.get("final_excluded", False))
    if is_excluded or not is_candidate:
        continue

    comps = _parse_components(r.get("rdls_components", ""))

    # skip empty (should not happen for candidates, but keep safe)
    if len(comps) == 0:
        continue

    notes = []
    for dependent, reason in _DEPENDENT_COMPONENTS:
        # A dependent component requires hazard or exposure alongside it; when
        # neither is present, exposure is added (inclusive normalization).
        lacks_anchor = "hazard" not in comps and "exposure" not in comps
        if dependent in comps and lacks_anchor:
            comps.add("exposure")
            notes.append(reason)

    # Nothing changed for this row.
    if not notes:
        continue

    final.at[i, "rdls_components"] = _join_components(comps)
    final.at[i, "components_normalized"] = True
    final.at[i, "components_normalization_notes"] = ";".join(notes)

# Enforce OSM policy unless explicitly allowed
if osm_excluded_ids and not ALLOW_OSM_OVERRIDE:
    # A 'keep' override on a dataset from the OSM exclusion list is not honored.
    listed_as_osm = final["dataset_id"].isin(osm_excluded_ids)
    wants_keep = final["override_decision"] == "keep"
    mask_illegal_keep = listed_as_osm & wants_keep
    illegal = int(mask_illegal_keep.sum())
    if illegal:
        print(
            f"WARNING: {illegal} override(s) attempted to include OSM-excluded datasets. "
            "Reverting to excluded (ALLOW_OSM_OVERRIDE=False)."
        )
        final.loc[mask_illegal_keep, "final_excluded"] = True

# Final included set: an RDLS candidate that is not finally excluded
final["final_included"] = ~final["final_excluded"] & final["rdls_candidate"]

# Write the final classification table and the list of included dataset IDs
final.to_csv(CLASSIFICATION_FINAL_CSV, index=False, encoding="utf-8")
included_rows = final[final["final_included"]]
included_ids = included_rows["dataset_id"].astype(str).tolist()
RDLS_INCLUDED_IDS_FINAL_TXT.write_text("\n".join(included_ids) + "\n", encoding="utf-8")

# Assemble the summary section by section (key order is kept in the JSON output)
policy_summary = {
    "osm_excluded_ids_loaded": int(len(osm_excluded_ids)),
    "datasets_excluded_by_policy": int(final["excluded_by_policy"].sum()),
}
has_component_override = final["override_components"].astype(str) != ""
overrides_summary = {
    "override_entries_loaded": int(len(overrides_map)),
    "datasets_excluded_by_override": int(final["excluded_by_override"].sum()),
    "datasets_with_component_override": int(has_component_override.sum()),
}
rdls_summary = {
    "candidates_total": int(final["rdls_candidate"].sum()),
    "included_total": int(final["final_included"].sum()),
}
# Number of datasets with a strictly positive score, per component
score_columns = {
    "hazard": "score_hazard",
    "exposure": "score_exposure",
    "vulnerability_proxy": "score_vulnerability_proxy",
    "loss_impact": "score_loss_impact",
}
component_nonzero_counts = {
    component: int((final[column] > 0).sum())
    for component, column in score_columns.items()
}

summary = {
    "total_datasets": int(len(final)),
    "policy": policy_summary,
    "overrides": overrides_summary,
    "rdls": rdls_summary,
    "confidence_counts": final["confidence"].value_counts().to_dict(),
    "component_nonzero_counts": component_nonzero_counts,
}

CLASSIFICATION_FINAL_SUMMARY_JSON.write_text(json.dumps(summary, indent=2), encoding="utf-8")

print(f"Wrote: {CLASSIFICATION_FINAL_CSV}")
print(f"Wrote: {RDLS_INCLUDED_IDS_FINAL_TXT} ( {len(included_ids)} ids )")
print(f"Wrote: {CLASSIFICATION_FINAL_SUMMARY_JSON}")
print(summary)


Wrote: C:\Users\benny\OneDrive\Documents\Github\hdx-metadata-crawler\hdx_dataset_metadata_dump\derived\classification_final.csv
Wrote: C:\Users\benny\OneDrive\Documents\Github\hdx-metadata-crawler\hdx_dataset_metadata_dump\derived\rdls_included_dataset_ids_final.txt ( 10759 ids )
Wrote: C:\Users\benny\OneDrive\Documents\Github\hdx-metadata-crawler\hdx_dataset_metadata_dump\derived\classification_final_summary.json
{'total_datasets': 26246, 'policy': {'osm_excluded_ids_loaded': 3649, 'datasets_excluded_by_policy': 3649}, 'overrides': {'override_entries_loaded': 0, 'datasets_excluded_by_override': 0, 'datasets_with_component_override': 0}, 'rdls': {'candidates_total': 13668, 'included_total': 10759}, 'confidence_counts': {'low': 12578, 'high': 7216, 'medium': 6452}, 'component_nonzero_counts': {'hazard': 4056, 'exposure': 13671, 'vulnerability_proxy': 12952, 'loss_impact': 2745}}
