# 08 — OSM pilot track (controlled)

This notebook prepares a **small, curated whitelist** for OSM-related datasets/products, so you can test conversion without flooding the RDLS output.

**Outputs**
- `rdls/pilot/osm_whitelist_dataset_ids.txt`
- `rdls/docs/osm_pilot_protocol.md`


In [None]:
# ======================
# Config (edit if needed)
# ======================
from pathlib import Path
import pandas as pd

# --- Locations -------------------------------------------------------
# Root of the HDX metadata dump, made absolute from the current working
# directory; every other path in this notebook hangs off it.
DUMP_DIR = Path("../hdx_dataset_metadata_dump").resolve()
RDLS_DIR = DUMP_DIR.joinpath("rdls")

# --- Optional inputs from earlier steps (rename here if yours differ) --
OSM_EXCLUSION_REPORT = RDLS_DIR.joinpath("reports", "osm_exclusion_report.csv")
STEP5_REVIEW_PACK = RDLS_DIR.joinpath("reports", "review_pack.csv")  # only if you export it here

# --- Outputs ---------------------------------------------------------
PILOT_DIR = RDLS_DIR.joinpath("pilot")
DOCS_DIR = RDLS_DIR.joinpath("docs")
OUT_WHITELIST = PILOT_DIR.joinpath("osm_whitelist_dataset_ids.txt")
OUT_PROTOCOL = DOCS_DIR.joinpath("osm_pilot_protocol.md")

# Number of dataset IDs to keep in the pilot whitelist.
TARGET_N = 20

# Create the output folders up front so later cells can write unconditionally.
PILOT_DIR.mkdir(parents=True, exist_ok=True)
DOCS_DIR.mkdir(parents=True, exist_ok=True)

print("DUMP_DIR:", DUMP_DIR)
print("OUT_WHITELIST:", OUT_WHITELIST)


In [None]:
# ======================
# Build a pilot shortlist (best-effort)
# ======================
import random


def _clean_dataset_ids(id_column):
    """Return the usable dataset IDs from a pandas Series as a list of strings.

    Missing values are dropped *before* the string cast: casting first can turn
    NaN into the literal text "nan", which ``dropna()`` then no longer removes,
    so a bogus "nan" ID would end up in the whitelist. Surrounding whitespace is
    stripped and IDs that are empty afterwards are discarded.
    """
    ids = id_column.dropna().astype(str).str.strip()
    return ids[ids != ""].tolist()


candidates = []

# Preferred source: the Step 5 review pack, restricted to rows flagged as
# RDLS candidates.
if STEP5_REVIEW_PACK.exists():
    df = pd.read_csv(STEP5_REVIEW_PACK)
    # Heuristic: prefer OSM-tagged candidates that are NOT excluded by policy
    cols = set(df.columns)
    if {"dataset_id", "rdls_candidate"}.issubset(cols):
        # `== True` is deliberate: it also rejects NaN in a partly-filled column.
        sub = df[df["rdls_candidate"] == True].copy()
        # If you have explicit OSM flag/label, filter further; otherwise keep as is.
        candidates = _clean_dataset_ids(sub["dataset_id"])

# Fallback source: the OSM exclusion report. It is consulted whenever the review
# pack produced nothing (file missing, required columns missing, or no candidate
# rows), not only when the review pack file is absent.
if not candidates and OSM_EXCLUSION_REPORT.exists():
    df = pd.read_csv(OSM_EXCLUSION_REPORT)
    if "dataset_id" in df.columns:
        candidates = _clean_dataset_ids(df["dataset_id"])

# Fallback: empty -> stop early
if not candidates:
    print("No candidate list found. Provide STEP5_REVIEW_PACK or OSM_EXCLUSION_REPORT.")
else:
    # Deduplicate and sample; sorting first makes the seeded sample reproducible
    # regardless of the row order in the input file.
    candidates = sorted(set(candidates))
    if len(candidates) > TARGET_N:
        random.seed(42)
        pick = random.sample(candidates, TARGET_N)
    else:
        pick = candidates

    # One dataset ID per line, with a trailing newline.
    OUT_WHITELIST.write_text("\n".join(pick) + "\n", encoding="utf-8")
    print("Wrote whitelist IDs:", len(pick))
    print("First 10:", pick[:10])


In [None]:
# ======================
# Write a simple pilot protocol (Markdown)
# ======================
# The protocol text is an f-string: `{OUT_WHITELIST}` is interpolated, so the
# whitelist path appears in the document exactly as the config cell built it
# (an absolute path, because DUMP_DIR is resolved). The schema path is a fixed
# literal.
# NOTE(review): this cell runs unconditionally, so the protocol is written even
# when the shortlist cell found no candidates and wrote no whitelist.
protocol = f"""# OSM Pilot Protocol

## Purpose
Test the RDLS translation pipeline on a **small controlled set** of OSM-related datasets/products.

## Inputs
- Whitelist: `{OUT_WHITELIST}`
- RDLS schema: `rdls/schema/rdls_schema_v0.3.json`

## Pilot principles
1. Generate RDLS metadata only for the curated whitelist (or, preferably, for your own derived OSM products).
2. Deduplicate by (ISO3, theme, date) if multiple HDX datasets represent the same logical product.
3. No override-includes for datasets that are excluded by policy, unless explicitly agreed.

## How to use
1. Run Step 06 with an argument/flag to restrict to the whitelist IDs.
2. Validate outputs (Step 07).
3. Review the outputs manually; iterate mapping rules only after you agree on policy.

"""
# Overwrites any existing protocol file on every run.
OUT_PROTOCOL.write_text(protocol, encoding="utf-8")
print("Wrote:", OUT_PROTOCOL)
