# Notebook 02: OSM Policy Exclusion Index

**Purpose**: Scan HDX dataset metadata and flag datasets derived from OpenStreetMap (OSM) for exclusion.

**Process**:
1. Load HDX dataset-level metadata JSON files
2. Detect OSM-derived datasets using multiple signals
3. Produce exclusion list (dataset UUIDs)
4. Generate audit report with detection reasons

**Author**: Benny Istanto/Risk Data Librarian/GFDRR  
**Version**: 2026.1

---

## 1. Setup

In [None]:
"""
1.1 Import Dependencies
"""

from __future__ import annotations

import csv
import json
import re
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple

# Optional: tqdm for progress bars
try:
    from tqdm.notebook import tqdm
    HAS_TQDM = True
except ImportError:
    HAS_TQDM = False
    print("Note: tqdm not installed. Install with: pip install tqdm")

print(f"Notebook started: {datetime.now().isoformat()}")
print(f"Progress bars: {'Available' if HAS_TQDM else 'Not available'}")

In [None]:
"""
1.2 Configure Paths
"""

# Project root resolution: when the working directory is a folder literally
# named 'notebook', its parent is used as the base; otherwise the working
# directory itself is the base.
NOTEBOOK_DIR = Path.cwd()
BASE_DIR = NOTEBOOK_DIR.parent if NOTEBOOK_DIR.name == 'notebook' else NOTEBOOK_DIR

# Input: HDX dataset metadata from Notebook 01
# DATASET_DIR is the folder whose *.json files are scanned in section 3.
DUMP_DIR = BASE_DIR / 'hdx_dataset_metadata_dump'
DATASET_DIR = DUMP_DIR / 'dataset_metadata'

# Output: Policy artifacts
# - OUT_IDS_TXT:    flagged dataset IDs, one per line
# - OUT_REPORT_CSV: audit report rows (flagged datasets plus per-file errors)
# - OUT_PILOT_CSV:  pilot shortlist, written only when it is non-empty
POLICY_DIR = DUMP_DIR / 'policy'
OUT_IDS_TXT = POLICY_DIR / 'osm_excluded_dataset_ids.txt'
OUT_REPORT_CSV = POLICY_DIR / 'osm_exclusion_report.csv'
OUT_PILOT_CSV = POLICY_DIR / 'osm_candidates_for_pilot.csv'

# Configuration
# USE_FAST_PREFILTER: when True, each file's raw text is checked with
# prefilter_maybe_osm() and non-matching files are skipped without parsing.
# PILOT_MAX_PER_BUCKET: passed as max_per_bucket to make_pilot_shortlist().
USE_FAST_PREFILTER = True  # Text scan before JSON parse
PILOT_MAX_PER_BUCKET = 10  # Limit candidates per org/theme

# Create output directory
# Side effect at cell execution: creates the folder (and parents) if missing.
POLICY_DIR.mkdir(parents=True, exist_ok=True)

print(f"Input: {DATASET_DIR}")
print(f"Output: {POLICY_DIR}")

## 2. Detection Logic

In [None]:
"""
2.1 Helper Functions

Utilities for robust JSON access and data extraction.
"""

def read_text(path: Path) -> str:
    """
    Return the contents of ``path`` decoded as UTF-8.

    Byte sequences that are not valid UTF-8 are dropped silently
    (``errors="ignore"``) rather than raising ``UnicodeDecodeError``.
    """
    with path.open("r", encoding="utf-8", errors="ignore") as handle:
        return handle.read()


def load_json(path: Path) -> Dict[str, Any]:
    """
    Parse the JSON document stored at ``path``.

    The file is read through ``read_text`` and handed to ``json.loads``;
    a ``json.JSONDecodeError`` propagates when the text is not valid JSON.
    """
    raw_text = read_text(path)
    return json.loads(raw_text)


def normalize_dataset_record(raw: Dict[str, Any]) -> Dict[str, Any]:
    """
    Return the dataset dict regardless of how the record was wrapped.

    - A dict that already has an ``"id"`` key is returned unchanged.
    - A dict without ``"id"`` whose ``"dataset"`` value is a dict is
      unwrapped to that inner dict.
    - Anything else (including non-dict input) is returned as-is.
    """
    if not isinstance(raw, dict) or "id" in raw:
        return raw
    wrapped = raw.get("dataset")
    return wrapped if isinstance(wrapped, dict) else raw


def norm_str(x: Any) -> str:
    """
    Convert ``x`` to a lowercase string with surrounding whitespace removed.

    Any falsy value (``None``, ``""``, ``0``, empty containers) yields ``""``.
    """
    text = str(x) if x else ""
    return text.strip().lower()


def get_org_title(ds: Dict[str, Any]) -> str:
    """
    Extract the organization title (or name) from a dataset record.

    Parameters
    ----------
    ds : Dict[str, Any]
        Dataset metadata dict

    Returns
    -------
    str
        Stripped organization label. When ``organization`` is a dict, the
        first non-blank string among ``title`` and ``name`` is used; when it
        is a plain string, that string is used. Returns ``""`` when nothing
        usable is present.
    """
    org = ds.get("organization")
    if isinstance(org, dict):
        candidates = (org.get("title"), org.get("name"))
    else:
        candidates = (org,)

    # Only real strings are accepted: calling .strip() on a number or list
    # would raise and turn the whole dataset into an ERROR row in the scan.
    for value in candidates:
        if isinstance(value, str):
            cleaned = value.strip()
            if cleaned:
                return cleaned
    return ""


def get_tags(ds: Dict[str, Any]) -> List[str]:
    """
    Extract tag names as lowercase, stripped strings.

    Parameters
    ----------
    ds : Dict[str, Any]
        Dataset metadata dict; ``tags`` may hold dicts with a ``name`` key
        or plain strings.

    Returns
    -------
    List[str]
        Tag names in their original order. Entries that are blank after
        stripping, or whose name is not a string, are skipped. Returns an
        empty list when ``tags`` is missing or not a list.
    """
    tags = ds.get("tags")
    if not isinstance(tags, list):
        return []

    out: List[str] = []
    for tag in tags:
        name = tag.get("name") if isinstance(tag, dict) else tag
        # Non-string names cannot be normalized; skipping them keeps one
        # malformed tag from failing the whole dataset.
        if not isinstance(name, str):
            continue
        cleaned = name.strip().lower()
        if cleaned:
            out.append(cleaned)
    return out


def get_resources(ds: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Return the dataset's ``resources`` list.

    A missing, empty, or non-list ``resources`` value yields an empty list.
    """
    resources = ds.get("resources")
    if resources and isinstance(resources, list):
        return resources
    return []


def get_license_title(ds: Dict[str, Any]) -> str:
    """
    Extract the dataset's license label.

    Parameters
    ----------
    ds : Dict[str, Any]
        Dataset metadata dict

    Returns
    -------
    str
        The stripped ``license_title``; falls back to ``license_id`` when the
        title is missing, blank, or not a string. Letter case is preserved.
        Returns ``""`` when neither field is usable.
    """
    for key in ("license_title", "license_id"):
        value = ds.get(key)
        # Guard against non-string values so .strip() cannot raise.
        if isinstance(value, str):
            cleaned = value.strip()
            if cleaned:
                return cleaned
    return ""


print("Helper functions defined.")

In [None]:
"""
2.2 OSM Detection Rules

Policy-based detection with traceable reasons.
"""

@dataclass(frozen=True)
class OSMDetectionResult:
    """
    Immutable outcome of running the OSM detection rules on one dataset.

    Fields cannot be reassigned after construction (``frozen=True``); the
    ``signals`` dict itself is still a regular mutable dict.

    Attributes
    ----------
    is_osm : bool
        True when the dataset is considered OSM-derived
    reasons : Tuple[str, ...]
        IDs of the detection rules that matched
    signals : Dict[str, Any]
        Metadata values kept as evidence for the audit trail
    """
    # Final verdict for the dataset.
    is_osm: bool
    # Rule IDs that fired.
    reasons: Tuple[str, ...]
    # Evidence fields captured alongside the verdict.
    signals: Dict[str, Any]


# Fast prefilter markers
# A file is parsed only when its lower-cased raw text contains at least one of
# these substrings, so this tuple must cover every signal detect_osm() can
# fire on; anything missing here becomes a silent false negative.
FAST_MARKERS = (
    "openstreetmap contributors",
    # Matches any record whose text contains this key with no space before the colon.
    '"dataset_source":',
    '"license_title": "odbl"',
    '"license_title":"odbl"',
    "open database license",
    "hotosm",
    "export.hotosm.org",
    "openstreetmap.org",
    # Bare token: covers title, notes, tag and organization mentions that the
    # longer "openstreetmap ..." markers above do not match.
    "openstreetmap",
    # The one OSM_URL_MARKERS entry that contains neither "openstreetmap"
    # nor "hotosm".
    "production-raw-data-api",
)

# URL markers for OSM/HOT (matched as substrings of resource URLs)
OSM_URL_MARKERS = (
    "openstreetmap.org",
    "hotosm.org",
    "export.hotosm.org",
    "exports-stage.hotosm.org",
    "production-raw-data-api",
)

# Organization markers (matched as substrings of the organization title)
OSM_ORG_MARKERS = (
    "humanitarian openstreetmap",
    "hotosm",
    "openstreetmap",
)

# Title markers (matched as substrings of the dataset title)
OSM_TITLE_MARKERS = (
    "openstreetmap export",
    "(openstreetmap export)",
    "openstreetmap",
)

# Notes markers (matched as substrings of the dataset notes)
# detect_osm() additionally requires "openstreetmap" in the notes, so the
# short "osm" entry cannot trigger the notes rule by itself.
OSM_NOTES_MARKERS = (
    "openstreetmap",
    "wiki.openstreetmap.org",
    "osm",
)


def prefilter_maybe_osm(text: str) -> bool:
    """
    Cheap substring check run on raw file text before JSON parsing.

    Returns True when the lower-cased text contains at least one entry of
    ``FAST_MARKERS``, False otherwise.
    """
    lowered = text.lower()
    for marker in FAST_MARKERS:
        if marker in lowered:
            return True
    return False


def detect_osm(ds: Dict[str, Any]) -> OSMDetectionResult:
    """
    Detect whether a dataset is derived from OpenStreetMap.

    Every rule that matches contributes its ID to ``reasons``.

    Strong rules (any single one flags the dataset):
    - ``dataset_source_mentions_openstreetmap``: "openstreetmap" appears in
      ``dataset_source``
    - ``odbl_license_plus_osm_cue``: the license mentions ODbL / Open Database
      License AND "openstreetmap" appears in title, notes or dataset_source
    - ``resource_url_osm_domain``: a resource URL contains an
      ``OSM_URL_MARKERS`` entry
    - ``tag_openstreetmap_present``: a tag equals "openstreetmap"

    Supporting rules (two distinct ones are required when no strong rule
    fired; ``supporting_evidence_threshold_met`` is then added):
    - ``organization_mentions_osm_or_hot``
    - ``title_mentions_osm_export``
    - ``notes_mentions_openstreetmap``

    Parameters
    ----------
    ds : Dict[str, Any]
        Dataset metadata dict

    Returns
    -------
    OSMDetectionResult
        Verdict, sorted de-duplicated reason IDs, and evidence signals
    """
    title = ds.get("title") or ""
    notes = ds.get("notes") or ""
    dataset_source = ds.get("dataset_source") or ""
    org_title = get_org_title(ds)
    license_title = get_license_title(ds)
    tags = get_tags(ds)
    # Only dict entries expose .get(); malformed entries are ignored so one
    # bad resource cannot abort detection for the whole dataset.
    resources = [r for r in get_resources(ds) if isinstance(r, dict)]

    title_l = norm_str(title)
    notes_l = norm_str(notes)
    dataset_source_l = norm_str(dataset_source)
    org_l = norm_str(org_title)
    license_l = norm_str(license_title)

    reasons: List[str] = []

    # Rule 1: dataset_source references OpenStreetMap
    if "openstreetmap" in dataset_source_l:
        reasons.append("dataset_source_mentions_openstreetmap")

    # Rule 2: ODbL license with OSM cues
    # ODbL is also used by non-OSM data, so the license alone is not enough.
    has_odbl = "odbl" in license_l or "open database license" in license_l
    has_osm_cue = any(
        "openstreetmap" in field for field in (title_l, notes_l, dataset_source_l)
    )
    if has_odbl and has_osm_cue:
        reasons.append("odbl_license_plus_osm_cue")

    # Rule 3: Resource URLs point to HOT/OSM
    for resource in resources:
        url = norm_str(resource.get("download_url") or resource.get("url") or "")
        if url and any(m in url for m in OSM_URL_MARKERS):
            reasons.append("resource_url_osm_domain")
            break

    # Rule 4: Organization suggests OSM/HOT
    if any(m in org_l for m in OSM_ORG_MARKERS):
        reasons.append("organization_mentions_osm_or_hot")

    # Rule 5: Title suggests OSM export
    if any(m in title_l for m in OSM_TITLE_MARKERS):
        reasons.append("title_mentions_osm_export")

    # Rule 6: Tags include openstreetmap (exact tag match, not substring)
    if "openstreetmap" in tags:
        reasons.append("tag_openstreetmap_present")

    # Rule 7: Notes include OSM references
    # The explicit "openstreetmap" requirement keeps short markers such as
    # "osm" from firing this rule on their own.
    if any(m in notes_l for m in OSM_NOTES_MARKERS) and "openstreetmap" in notes_l:
        reasons.append("notes_mentions_openstreetmap")

    # Policy: mark OSM if any strong rules fire
    strong = {
        "dataset_source_mentions_openstreetmap",
        "resource_url_osm_domain",
        "odbl_license_plus_osm_cue",
        "tag_openstreetmap_present",
    }
    is_osm = not strong.isdisjoint(reasons)

    # Allow supporting evidence to upgrade borderline cases
    supporting = set(reasons) - strong
    if not is_osm and len(supporting) >= 2:
        is_osm = True
        reasons.append("supporting_evidence_threshold_met")

    # Evidence sample uses the same download_url -> url fallback as Rule 3,
    # so resources that only carry "url" still leave an audit trail.
    resource_url_sample = None
    if resources:
        first = resources[0]
        resource_url_sample = first.get("download_url") or first.get("url")

    signals = {
        "dataset_source": dataset_source,
        "license_title": license_title,
        "organization": org_title,
        "tags": tags,
        "resource_url_sample": resource_url_sample,
    }

    return OSMDetectionResult(
        is_osm=is_osm,
        reasons=tuple(sorted(set(reasons))),
        signals=signals
    )


print("OSM detection rules defined.")

## 3. Scan Corpus

In [None]:
"""
3.1 Scan All Dataset Files

Process all JSON files and detect OSM datasets.
"""

def iter_json_files(folder: Path) -> Iterable[Path]:
    """
    Generate the ``*.json`` paths directly inside ``folder`` in sorted order.

    Being a generator, the existence check runs on first iteration: that is
    when ``FileNotFoundError`` is raised for a missing folder.
    """
    if not folder.exists():
        raise FileNotFoundError(f"Input folder not found: {folder}")
    for json_path in sorted(folder.glob("*.json")):
        yield json_path


def scan_folder_for_osm(input_dir: Path) -> Tuple[List[Dict[str, Any]], List[str]]:
    """
    Run OSM detection over every JSON file in a folder.

    Files that fail to load or process do not stop the scan; each failure is
    recorded as a report row whose ``reasons`` starts with ``ERROR:``.

    Parameters
    ----------
    input_dir : Path
        Folder containing dataset JSON files

    Returns
    -------
    Tuple[List[Dict], List[str]]
        (report_rows, excluded_ids) where report_rows holds one row per
        flagged dataset or failed file, and excluded_ids is the sorted,
        de-duplicated list of non-empty flagged dataset IDs
    """
    files = list(iter_json_files(input_dir))
    total = len(files)

    print(f"Scanning {total:,} JSON files in: {input_dir}")

    # Progress bar when tqdm is importable, plain list otherwise
    if HAS_TQDM:
        iterator = tqdm(files, desc="Scanning for OSM")
    else:
        iterator = files

    report_rows: List[Dict[str, Any]] = []
    flagged_ids: List[str] = []
    column_names = (
        "dataset_id", "name", "title", "organization", "dataset_source",
        "license_title", "reasons", "tags", "n_resources", "file",
    )

    for position, path in enumerate(iterator, start=1):
        # Periodic textual progress when there is no progress bar
        if not HAS_TQDM and position % 2000 == 0:
            print(f"  Processed {position:,}/{total:,}")

        try:
            if USE_FAST_PREFILTER:
                # Scan the raw text first; only matching files are parsed
                raw_text = read_text(path)
                if not prefilter_maybe_osm(raw_text):
                    continue
                raw = json.loads(raw_text)
            else:
                raw = load_json(path)

            ds = normalize_dataset_record(raw)

            ds_id = ds.get("id") or ""
            title = ds.get("title") or ""
            name = ds.get("name") or ""
            org = get_org_title(ds)

            result = detect_osm(ds)
            if not result.is_osm:
                continue

            flagged_ids.append(ds_id)
            report_rows.append({
                "dataset_id": ds_id,
                "name": name,
                "title": title,
                "organization": org,
                "dataset_source": ds.get("dataset_source"),
                "license_title": get_license_title(ds),
                "reasons": ";".join(result.reasons),
                "tags": ";".join(get_tags(ds)),
                "n_resources": len(get_resources(ds)),
                "file": str(path),
            })

        except Exception as exc:
            # Keep scanning; the failure is preserved in the audit report
            error_row: Dict[str, Any] = dict.fromkeys(column_names, "")
            error_row["reasons"] = f"ERROR:{type(exc).__name__}:{exc}"
            error_row["file"] = str(path)
            report_rows.append(error_row)

    # Drop empty IDs, remove duplicates, and sort for stable output
    excluded_ids = sorted({ds_id for ds_id in flagged_ids if ds_id})

    return report_rows, excluded_ids


# Run scan
# Executes when the cell runs. report_rows holds flagged datasets plus one
# ERROR row per file that failed; excluded_ids holds only the flagged IDs,
# so the count printed below excludes failed files.
report_rows, excluded_ids = scan_folder_for_osm(DATASET_DIR)

print(f"\nFlagged OSM-derived datasets: {len(excluded_ids):,}")

## 4. Write Outputs

In [None]:
"""
4.1 Save Exclusion List and Report
"""

def write_ids_txt(path: Path, ids: Sequence[str]) -> None:
    """
    Write each ID on its own line to a UTF-8 text file.

    Missing parent folders are created; an existing file is overwritten.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as handle:
        handle.writelines(f"{item}\n" for item in ids)


def write_report_csv(path: Path, rows: Sequence[Dict[str, Any]]) -> None:
    """
    Write the audit report as a UTF-8 CSV with a fixed column set.

    Missing parent folders are created. Each row is projected onto the
    report columns: keys absent from a row become empty cells and keys
    outside the column set are left out.
    """
    columns = [
        "dataset_id", "name", "title", "organization",
        "dataset_source", "license_title", "reasons",
        "tags", "n_resources", "file"
    ]
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=columns)
        writer.writeheader()
        writer.writerows(
            {column: row.get(column, "") for column in columns}
            for row in rows
        )


# Write outputs
# Both files are overwritten on every run. The CSV receives all report rows,
# including ERROR rows for files that failed during the scan.
write_ids_txt(OUT_IDS_TXT, excluded_ids)
write_report_csv(OUT_REPORT_CSV, report_rows)

print(f"Wrote: {OUT_IDS_TXT}")
print(f"Wrote: {OUT_REPORT_CSV}")

In [None]:
"""
4.2 Create Pilot Shortlist (Optional)

Small sample for future OSM pilot experiments.
"""

def derive_theme(tags: List[str]) -> str:
    """
    Map a dataset's tags to a single coarse theme label.

    Themes are tested in a fixed priority order (transport, health
    facilities, hydrology, boundaries/gazetteer); the first theme with a
    keyword that exactly equals one of the tags wins. Returns ``"other"``
    when nothing matches.
    """
    theme_rules = (
        ("transport", ("roads", "railways", "transportation", "aviation")),
        ("health_facilities", ("health facilities", "health")),
        ("hydrology", ("waterways", "rivers", "hydrology")),
        ("boundaries_gazetteer", ("administrative boundaries-divisions", "gazetteer")),
    )
    for theme, keywords in theme_rules:
        for keyword in keywords:
            if keyword in tags:
                return theme
    return "other"


def make_pilot_shortlist(rows: List[Dict[str, Any]], max_per_bucket: int = 10) -> List[Dict[str, Any]]:
    """
    Create small pilot shortlist grouped by org/theme.

    Rows without a ``dataset_id`` (for example scan ERROR rows) are ignored.
    The input rows are not modified: the derived theme is stored only on the
    shortlist entries that are returned.

    Parameters
    ----------
    rows : List[Dict]
        Report rows
    max_per_bucket : int
        Maximum samples per org/theme combination

    Returns
    -------
    List[Dict]
        Pilot shortlist with keys ``dataset_id``, ``title``,
        ``organization``, ``theme`` and ``reasons``. Buckets are emitted
        largest first (ties broken by the (org, theme) key); within a bucket
        entries are ordered by title.
    """
    # Group by (org, theme)
    buckets: Dict[Tuple[str, str], List[Dict[str, Any]]] = {}
    for row in rows:
        if not row.get("dataset_id"):
            continue

        # Tags arrive as a ';'-joined string in the report rows
        tags = [
            t.strip().lower()
            for t in (row.get("tags") or "").split(";")
            if t.strip()
        ]
        theme = derive_theme(tags)

        # Build a fresh entry instead of writing "theme" into the caller's row
        entry = {
            "dataset_id": row["dataset_id"],
            "title": row["title"],
            "organization": row["organization"],
            "theme": theme,
            "reasons": row["reasons"],
        }
        org_key = (row.get("organization") or "unknown").strip()
        buckets.setdefault((org_key, theme), []).append(entry)

    pilot: List[Dict[str, Any]] = []
    ordered = sorted(buckets.items(), key=lambda kv: (-len(kv[1]), kv[0]))
    for _key, items in ordered:
        items_sorted = sorted(items, key=lambda e: (e["title"] or ""))
        pilot.extend(items_sorted[:max_per_bucket])

    return pilot


# Build the shortlist from the scan report, capped per (organization, theme).
pilot_rows = make_pilot_shortlist(report_rows, max_per_bucket=PILOT_MAX_PER_BUCKET)

# The CSV is written only when there is at least one row.
# NOTE(review): when no rows are produced, a pilot file left over from an
# earlier run is not removed or truncated — confirm that is intended.
if pilot_rows:
    with OUT_PILOT_CSV.open("w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=["dataset_id", "title", "organization", "theme", "reasons"])
        w.writeheader()
        w.writerows(pilot_rows)
    print(f"Wrote: {OUT_PILOT_CSV} ({len(pilot_rows)} rows)")
else:
    print("No pilot rows produced.")

## 5. Summary

In [None]:
"""
5.1 Display Summary
"""

# Console recap of the run: counts, a sample of flagged IDs, output paths.
print(f"\n{'='*60}")
print("OSM EXCLUSION SUMMARY")
print(f"{'='*60}")
print(f"Total flagged: {len(excluded_ids):,}")
print(f"Pilot candidates: {len(pilot_rows):,}")

# excluded_ids is sorted, so this sample is the first 10 in sort order.
print(f"\nExample excluded IDs (first 10):")
for ds_id in excluded_ids[:10]:
    print(f"  - {ds_id}")

# NOTE(review): OUT_PILOT_CSV is listed unconditionally, although the pilot
# cell writes it only when pilot_rows is non-empty.
print(f"\nOutputs:")
print(f"  - {OUT_IDS_TXT}")
print(f"  - {OUT_REPORT_CSV}")
print(f"  - {OUT_PILOT_CSV}")

print(f"\nNotebook completed: {datetime.now().isoformat()}")