# Step 6 - Translate HDX Metadata to RDLS v0.3 JSON

**Purpose:** Transform HDX dataset-level metadata exports into RDLS v0.3 metadata records.

**Process:**
1. Load classification results and included dataset IDs from Step 5
2. Build RDLS-compliant records with proper attributions, resources, and spatial info
3. Apply component gating rules (vulnerability and loss records must also carry hazard or exposure)
4. Validate against JSON Schema and write outputs

**Author**: Benny Istanto/Risk Data Librarian/GFDRR  
**Version**: 2026.1

---

## Inputs
- Step 5 outputs:
  - `hdx_dataset_metadata_dump/derived/classification_final.csv`
  - `hdx_dataset_metadata_dump/derived/rdls_included_dataset_ids_final.txt`
- HDX dataset metadata JSON corpus:
  - `hdx_dataset_metadata_dump/dataset_metadata/*.json`
- RDLS v0.3 assets:
  - `rdls_schema_v0.3.json`
  - `rdls_template_v03.json`

## Outputs
- `hdx_dataset_metadata_dump/rdls/records/*.json` — one RDLS record per included HDX dataset
- `hdx_dataset_metadata_dump/rdls/index/rdls_index.jsonl` — index of written records
- `hdx_dataset_metadata_dump/rdls/reports/translation_blocked.csv` — datasets blocked by policy/required-field gaps
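- `hdx_dataset_metadata_dump/rdls/reports/schema_validation.csv` — JSON Schema validation results
- `hdx_dataset_metadata_dump/rdls/reports/translation_qa.csv` — per-dataset QA summary of the translation run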

## Strictness & Policy
- **Schema-first:** required RDLS fields are always populated; optional fields omitted unless safely filled
- **No extra fields:** output contains only fields defined in the RDLS schema
- **No invented content:** missing values → absent optional fields (not empty strings)
- **Open codelists:** values may be kept as-is if not in suggestions
- **Component combination rule:**
  - hazard-only and exposure-only are allowed
  - vulnerability must accompany hazard or exposure
  - loss must accompany hazard or exposure

## 1. Setup and Configuration

In [None]:
"""
Setup: Import libraries and configure paths.

Configuration Options:
    MAX_DATASETS: Limit number of datasets to process (None for all, 50 for testing)
    OUTPUT_MODE: 'in_place' or 'run_folder' for versioned outputs
    SKIP_EXISTING: Resume-safe mode to skip already processed records
    WRITE_PRETTY_JSON: Pretty-print JSON for readability
"""
from __future__ import annotations

import json
import re
from copy import deepcopy
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple

import pandas as pd

# --- Progress bars: real tqdm when installed, otherwise a pass-through ---
try:
    from tqdm.auto import tqdm
except ImportError:
    TQDM_AVAILABLE = False

    def tqdm(iterable, **kwargs):
        """Stand-in for tqdm: ignore all options and hand back the iterable as-is."""
        return iterable
else:
    TQDM_AVAILABLE = True

print(f"tqdm available: {TQDM_AVAILABLE}")


@dataclass
class TranslationConfig:
    """
    Configuration for HDX to RDLS translation.

    Attributes:
        dump_dir: Root directory for HDX metadata dump
        max_datasets: Limit number of datasets (None for all)
        output_mode: 'in_place' or 'run_folder'
        clean_before_run: Clean existing outputs before writing (in_place only)
        skip_existing: Skip already processed records
        write_pretty_json: Pretty-print JSON output
        auto_repair_components: Auto-add exposure for standalone V/L

    Raises:
        ValueError: If output_mode is not one of the two supported values,
            if clean_before_run is combined with output_mode='run_folder',
            or if max_datasets is negative.
    """
    dump_dir: Path = field(default_factory=lambda: (Path("..") / "hdx_dataset_metadata_dump").resolve())
    max_datasets: Optional[int] = 50  # Set to None for production
    output_mode: str = "in_place"  # "in_place" | "run_folder"
    clean_before_run: bool = True
    skip_existing: bool = False
    write_pretty_json: bool = True
    auto_repair_components: bool = True

    def __post_init__(self):
        """Validate configuration."""
        # Reject unknown modes up front: downstream code treats anything that is
        # not "run_folder" as in-place, so a typo would otherwise silently
        # enable in-place writing (and cleaning).
        if self.output_mode not in ("in_place", "run_folder"):
            raise ValueError(
                f"output_mode must be 'in_place' or 'run_folder', got {self.output_mode!r}"
            )
        if self.output_mode == "run_folder" and self.clean_before_run:
            raise ValueError("clean_before_run cannot be True when output_mode='run_folder'")
        # A negative limit would slice from the end of the ID list rather than cap it.
        if self.max_datasets is not None and self.max_datasets < 0:
            raise ValueError(f"max_datasets must be None or >= 0, got {self.max_datasets!r}")


# Initialize configuration (defaults; edit TranslationConfig arguments to change behaviour)
config = TranslationConfig()

# --- Resolve paths ---
# Inputs produced by earlier steps, all rooted at the dump directory.
DUMP_DIR = config.dump_dir
DATASET_DIR = (DUMP_DIR / "dataset_metadata").resolve()
DERIVED_DIR = (DUMP_DIR / "derived").resolve()
CLASSIFICATION_FINAL_CSV = (DERIVED_DIR / "classification_final.csv").resolve()
INCLUDED_IDS_TXT = (DERIVED_DIR / "rdls_included_dataset_ids_final.txt").resolve()

# RDLS assets (schema and template live under <dump>/rdls/)
RDLS_DIR = (DUMP_DIR / "rdls").resolve()
RDLS_SCHEMA_PATH = (RDLS_DIR / "schema" / "rdls_schema_v0.3.json").resolve()
RDLS_TEMPLATE_PATH = (RDLS_DIR / "template" / "rdls_template_v03.json").resolve()

# Resolve run directory:
#   run_folder -> a fresh timestamped folder under rdls/runs/
#   otherwise  -> write directly under rdls/
if config.output_mode == "run_folder":
    RUN_ID = datetime.now().strftime("%Y%m%d_%H%M%S")
    RDLS_RUN_DIR = (RDLS_DIR / "runs" / RUN_ID).resolve()
else:
    RUN_ID = "in_place"
    RDLS_RUN_DIR = RDLS_DIR

# Output directories
OUT_RECORDS_DIR = RDLS_RUN_DIR / "records"
OUT_INDEX_DIR = RDLS_RUN_DIR / "index"
OUT_REPORTS_DIR = RDLS_RUN_DIR / "reports"

# Output files
OUT_INDEX_JSONL = OUT_INDEX_DIR / "rdls_index.jsonl"
OUT_BLOCKED_CSV = OUT_REPORTS_DIR / "translation_blocked.csv"
OUT_VALIDATION_CSV = OUT_REPORTS_DIR / "schema_validation.csv"
OUT_QA_CSV = OUT_REPORTS_DIR / "translation_qa.csv"

# Ensure output folders exist (side effect: creates directories on disk)
for p in [OUT_RECORDS_DIR, OUT_INDEX_DIR, OUT_REPORTS_DIR]:
    p.mkdir(parents=True, exist_ok=True)

# Echo the effective settings so the notebook output records how the run was configured
print(f"Configuration:")
print(f"  DUMP_DIR: {DUMP_DIR}")
print(f"  DATASET_DIR: {DATASET_DIR}")
print(f"  OUTPUT_MODE: {config.output_mode}")
print(f"  RUN_ID: {RUN_ID}")
print(f"  MAX_DATASETS: {config.max_datasets}")
print(f"  SKIP_EXISTING: {config.skip_existing}")

## 2. Mapping Configuration

In [None]:
"""
Mapping configurations for HDX to RDLS translation.

Includes:
    - Hazard filename aliases
    - Resource format mapping (HDX -> RDLS)
    - License mapping (HDX -> RDLS)
    - Hazard keywords for inference
"""

# --- Hazard inference / filename alias ---
# Optional renaming applied to a hazard type before it is slugified into a
# filename suffix (see hazard_suffix_for_filename).
HAZARD_FILENAME_ALIASES: Dict[str, str] = {
    "strong_wind": "windstorm",
    # Add more aliases as needed
}

# --- Resource format mapping (HDX -> RDLS data_format enum) ---
# Keys are upper-case because map_data_format upper-cases the HDX format
# string before looking it up.
HDX_FORMAT_TO_RDLS: Dict[str, str] = {
    "CSV": "CSV (csv)",
    "XLS": "Excel (xlsx)",
    "XLSX": "Excel (xlsx)",
    "EXCEL": "Excel (xlsx)",
    "JSON": "JSON (json)",
    "GEOJSON": "GeoJSON (geojson)",
    "SHP": "Shapefile (shp)",
    "SHAPEFILE": "Shapefile (shp)",
    "GPKG": "GeoPackage (gpkg)",
    "GEOPACKAGE": "GeoPackage (gpkg)",
    "KML": "KML (kml)",
    "PDF": "PDF (pdf)",
    "NC": "NetCDF (nc)",
    "NETCDF": "NetCDF (nc)",
    "TIF": "GeoTIFF (tif)",
    "TIFF": "GeoTIFF (tif)",
    "COG": "Cloud Optimized GeoTIFF (cog)",
    "PARQUET": "Parquet (parquet)",
    "XML": "XML (xml)",
}

# --- License mapping (HDX -> RDLS) ---
# Keys are lower-case; map_license consults this table as its last resort,
# after lower-casing and whitespace-normalising the HDX title.
HDX_LICENSE_TO_RDLS: Dict[str, str] = {
    "public domain": "PDDL-1.0",
    "odbl": "ODbL-1.0",
    "cc-by-4.0": "CC-BY-4.0",
    "cc by 4.0": "CC-BY-4.0",
    "cc-by": "CC-BY-4.0",
    "cc0": "CC0-1.0",
    "cc0-1.0": "CC0-1.0",
    "copyright": "Copyright",
}

# --- Hazard keyword to type mapping ---
# Lower-case keyword -> hazard type label; consumed by infer_hazard_types.
HAZARD_KEYWORDS_TO_TYPE: Dict[str, str] = {
    # Hydro
    "flood": "flood", "flooding": "flood", "river flood": "flood",
    "flash flood": "flood", "inundation": "flood",
    # Drought
    "drought": "drought", "dry spell": "drought", "water scarcity": "drought",
    # Storms / wind
    "cyclone": "windstorm", "hurricane": "windstorm", "typhoon": "windstorm",
    "windstorm": "windstorm", "storm": "windstorm",
    # Heat / wildfire
    "heatwave": "heat", "extreme heat": "heat",
    "wildfire": "wildfire", "fire": "wildfire",
    # Seismic
    "earthquake": "earthquake", "tsunami": "tsunami",
    # Landslide
    "landslide": "landslide", "mudslide": "landslide", "avalanche": "landslide",
}

# --- Risk components mapping (Step 5 -> RDLS enum) ---
# Accepts both the Step 5 labels (vulnerability_proxy, loss_impact) and the
# RDLS names themselves, so already-normalised input passes through unchanged.
COMPONENT_MAP: Dict[str, str] = {
    "hazard": "hazard",
    "exposure": "exposure",
    "vulnerability_proxy": "vulnerability",
    "loss_impact": "loss",
    "vulnerability": "vulnerability",
    "loss": "loss",
}

print(f"Format mappings loaded: {len(HDX_FORMAT_TO_RDLS)} entries")
print(f"Hazard keywords loaded: {len(HAZARD_KEYWORDS_TO_TYPE)} entries")

## 3. Load RDLS Schema and Template

In [None]:
"""
Load RDLS schema and template for validation and record building.

Raises:
    FileNotFoundError: If schema or template files are missing.
"""

def safe_load_json(path: Path) -> Dict[str, Any]:
    """Open *path* as UTF-8 text and return the parsed JSON document."""
    with path.open("r", encoding="utf-8") as handle:
        return json.load(handle)


# Validate schema/template existence (fail fast with a clear path in the message)
if not RDLS_SCHEMA_PATH.exists():
    raise FileNotFoundError(f"RDLS schema not found: {RDLS_SCHEMA_PATH}")

if not RDLS_TEMPLATE_PATH.exists():
    raise FileNotFoundError(f"RDLS template not found: {RDLS_TEMPLATE_PATH}")

rdls_schema = safe_load_json(RDLS_SCHEMA_PATH)
rdls_template = safe_load_json(RDLS_TEMPLATE_PATH)

# RDLS dataset object allowed and required keys.
# Taken from the schema's top-level "properties" / "required"; the record
# builder uses RDLS_ALLOWED_KEYS to drop any field the schema does not define.
RDLS_ALLOWED_KEYS = set(rdls_schema["properties"].keys())
RDLS_REQUIRED_KEYS = list(rdls_schema.get("required", []))

print(f"RDLS required keys: {RDLS_REQUIRED_KEYS}")
print(f"RDLS allowed keys count: {len(RDLS_ALLOWED_KEYS)}")

## 4. Load Step 5 Outputs

In [None]:
"""
Load classification results and build dataset index.

Loads:
    - classification_final.csv from Step 5
    - included dataset IDs list
    - Builds dataset_id -> file path index
"""

def read_ids_txt(path: Path) -> List[str]:
    """
    Collect the non-blank lines of a text file as dataset IDs.

    Parameters:
        path: Path to the IDs file (one ID per line, UTF-8)

    Returns:
        IDs in file order, each stripped of surrounding whitespace

    Raises:
        FileNotFoundError: If the file does not exist
    """
    if not path.exists():
        raise FileNotFoundError(f"Missing IDs list: {path}")

    ids: List[str] = []
    for raw_line in path.read_text(encoding="utf-8").splitlines():
        cleaned = raw_line.strip()
        if cleaned:
            ids.append(cleaned)
    return ids


# Validate inputs exist (all three come from earlier pipeline steps)
for path, name in [
    (CLASSIFICATION_FINAL_CSV, "Classification CSV"),
    (INCLUDED_IDS_TXT, "Included IDs list"),
    (DATASET_DIR, "Dataset metadata folder"),
]:
    if not path.exists():
        raise FileNotFoundError(f"Missing Step 5 output: {name} at {path}")

# Load classification data
df = pd.read_csv(CLASSIFICATION_FINAL_CSV)
included_ids = read_ids_txt(INCLUDED_IDS_TXT)

# Fast lookups: index by dataset_id but keep the column too (drop=False),
# so rows can be fetched with df.loc[...] and still expose row["dataset_id"].
df = df.set_index("dataset_id", drop=False)
included_set = set(included_ids)

print(f"Classification rows: {len(df):,}")
print(f"Included IDs: {len(included_ids):,}")
# NOTE(review): IDs missing from the classification index are only counted
# here, not removed from included_ids.
print(f"Included IDs in classification: {sum(did in df.index for did in included_ids):,}")

# Build dataset_id -> file path mapping (avoid N glob calls).
# The ID is the file stem up to the first "__"; files without "__" use the
# whole stem. Files are visited in sorted order, so if two files share an ID
# the later one wins.
print("\nBuilding dataset file index...")
dataset_file_index: Dict[str, Path] = {}
for fp in tqdm(sorted(DATASET_DIR.glob("*.json")), desc="Indexing files"):
    stem = fp.stem
    dataset_uuid = stem.split("__", 1)[0] if "__" in stem else stem
    dataset_file_index[dataset_uuid] = fp

print(f"Dataset files indexed: {len(dataset_file_index):,}")

# Apply MAX_DATASETS limit for testing (keeps the first N IDs in file order)
if config.max_datasets is not None:
    included_ids = included_ids[:config.max_datasets]
    included_set = set(included_ids)
    print(f"\nTEST MODE: Limited to {len(included_ids)} datasets")

## 5. Helper Functions

In [None]:
"""
Helper functions for parsing, slugifying, and mapping.

Includes:
    - Text parsing utilities
    - ISO3 country inference
    - Hazard type inference
    - License and format mapping
"""

def slugify_token(s: str, max_len: int = 32) -> str:
    """Lower-case *s*, squash non-alphanumerics to underscores, and cap the length.

    Returns "unknown" when nothing usable is left.
    """
    lowered = (s or "").strip().lower()
    underscored = re.sub(r"_+", "_", re.sub(r"[^a-z0-9]+", "_", lowered))
    # Strip again after truncation: the cut may land right after an underscore.
    token = underscored.strip("_")[:max_len].strip("_")
    return token if token else "unknown"


def split_semicolon_list(s: Any) -> List[str]:
    """Turn a ';' or ',' delimited value (or a list) into a list of trimmed, non-empty strings.

    None and float NaN produce an empty list; any other non-list value is
    converted with str() before splitting.
    """
    if s is None:
        return []
    if isinstance(s, float) and pd.isna(s):
        return []

    if isinstance(s, list):
        candidates = (str(item) for item in s)
    else:
        candidates = re.split(r"[;,]", str(s).strip())

    trimmed = (piece.strip() for piece in candidates)
    return [piece for piece in trimmed if piece]


def looks_like_url(s: str) -> bool:
    """Return True when *s*, ignoring surrounding whitespace, begins with http:// or https:// (any case)."""
    candidate = s.strip() if s else ""
    return re.match(r"^https?://", candidate, flags=re.I) is not None


# --- ISO3 inference ---
def try_import_pycountry():
    """Return the pycountry module if it can be imported, otherwise None."""
    try:
        import pycountry as module
    except ImportError:
        module = None
    return module

_pycountry = try_import_pycountry()

# Hand-maintained spellings mapped straight to ISO3 codes.
COMMON_COUNTRY_FIXES: Dict[str, str] = {
    "cote d'ivoire": "CIV",
    "ivory coast": "CIV",
    "democratic republic of the congo": "COD",
    "dr congo": "COD",
    "republic of the congo": "COG",
    "congo, rep.": "COG",
    "congo, dem. rep.": "COD",
    "lao pdr": "LAO",
    "viet nam": "VNM",
    "korea, rep.": "KOR",
    "korea, dem. rep.": "PRK",
    "syrian arab republic": "SYR",
    "iran, islamic republic of": "IRN",
    "tanzania, united republic of": "TZA",
    "venezuela, bolivarian republic of": "VEN",
    "bolivia, plurinational state of": "BOL",
    "moldova, republic of": "MDA",
    "palestine": "PSE",
    "russia": "RUS",
    "united states": "USA",
    "united kingdom": "GBR",
}


def _norm_country_key(s: str) -> str:
    """Reduce a country name to a lower-case, punctuation-free lookup key."""
    text = (s or "").strip().lower()
    # Brackets and sentence punctuation become spaces
    text = re.sub(r"[\(\)\[\]\{\}\.\,\;\:]", " ", text)
    # Spell out ampersands so "a & b" and "a and b" produce the same key
    text = text.replace("&", " and ")
    # Anything other than a-z, 0-9, whitespace, hyphen or apostrophe becomes a space
    text = re.sub(r"[^a-z0-9\s\-']", " ", text)
    # Collapse whitespace runs left behind by the substitutions
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()


# Same table, keyed by the normalised spelling so lookups can normalise first.
COMMON_COUNTRY_FIXES_NORM = dict(
    zip(map(_norm_country_key, COMMON_COUNTRY_FIXES), COMMON_COUNTRY_FIXES.values())
)

# Optional country-name -> ISO3 lookup CSV; the loader returns an empty table when it is absent
COUNTRY_ISO3_CSV = (DUMP_DIR / "config" / "country_name_to_iso3.csv")

def load_country_iso3_table(path: Path) -> Dict[str, str]:
    """
    Load country name to ISO3 mapping from CSV (best effort).

    The name column may be called name / country / country_name and the code
    column iso3 / alpha_3 / code (matched case-insensitively). Rows with a
    blank name, a blank code, or a code that is not exactly three characters
    are skipped.

    Parameters:
        path: Location of the CSV file

    Returns:
        Mapping of normalised country name -> upper-case ISO3 code. Empty when
        the file is missing, lacks the expected columns, or cannot be read.
    """
    if not path.exists():
        return {}
    try:
        df_iso = pd.read_csv(path)
        cols = {c.lower(): c for c in df_iso.columns}
        name_col = cols.get("name") or cols.get("country") or cols.get("country_name")
        iso3_col = cols.get("iso3") or cols.get("alpha_3") or cols.get("code")
        if not name_col or not iso3_col:
            return {}

        table: Dict[str, str] = {}
        for raw_name, raw_iso3 in zip(df_iso[name_col], df_iso[iso3_col]):
            # Blank CSV cells arrive as NaN; str(NaN) is "nan", which would
            # otherwise pass the length-3 check and yield a bogus "NAN" code.
            if pd.isna(raw_name) or pd.isna(raw_iso3):
                continue
            name = str(raw_name).strip()
            iso3 = str(raw_iso3).strip().upper()
            if name and len(iso3) == 3:
                table[_norm_country_key(name)] = iso3
        return table
    except Exception as exc:
        # The table is optional, so never abort the run - but say why it was
        # ignored instead of failing silently.
        print(f"WARNING: could not load country ISO3 table {path}: {exc}")
        return {}

# Loaded once at cell execution; consulted by country_name_to_iso3 after the hand-written fixes
COUNTRY_ISO3_TABLE = load_country_iso3_table(COUNTRY_ISO3_CSV)


def country_name_to_iso3(name: str) -> Optional[str]:
    """
    Resolve a country name (or an ISO3 code) to an upper-case ISO3 code.

    Resolution order: three-letter alphabetic input is returned upper-cased
    as-is, then the hand-written fixes, then the optional CSV table, then
    pycountry when it is installed.

    Parameters:
        name: Country name or ISO3 code

    Returns:
        ISO3 code, or None when nothing matches
    """
    candidate = (name or "").strip()
    if not candidate:
        return None

    # Three letters are taken to be an ISO3 code already (not verified)
    if candidate.isalpha() and len(candidate) == 3:
        return candidate.upper()

    lookup_key = _norm_country_key(candidate)
    for table in (COMMON_COUNTRY_FIXES_NORM, COUNTRY_ISO3_TABLE):
        if lookup_key in table:
            return table[lookup_key]

    if _pycountry is None:
        return None

    # Last resort: pycountry lookup on the original (un-normalised) text
    try:
        return getattr(_pycountry.countries.lookup(candidate), "alpha_3", None)
    except Exception:
        return None


def infer_spatial(groups: List[str]) -> Dict[str, Any]:
    """Infer RDLS spatial block from HDX country groups."""
    iso3s = sorted(set(filter(None, [country_name_to_iso3(g) for g in groups])))
    
    if len(iso3s) == 1:
        return {"scale": "national", "countries": iso3s}
    if len(iso3s) > 1:
        return {"scale": "regional", "countries": iso3s}
    return {"scale": "global"}


def infer_hazard_types(
    tags: List[str],
    title: str = "",
    notes: str = "",
    keyword_map: Optional[Dict[str, str]] = None,
) -> List[str]:
    """
    Infer hazard types from tags and text content.

    Keywords are matched as whole words (case-insensitive, with an optional
    plural "s"/"es"), so "ceasefire" no longer counts as "fire" and
    "brainstorm" no longer counts as "storm". Underscores and punctuation
    count as word separators, and multi-word keywords tolerate any amount of
    whitespace between their words.

    Parameters:
        tags: Tag strings attached to the dataset
        title: Dataset title
        notes: Dataset description
        keyword_map: Optional keyword -> hazard type mapping; defaults to
            HAZARD_KEYWORDS_TO_TYPE

    Returns:
        Sorted list of distinct hazard types whose keywords were found
    """
    mapping = HAZARD_KEYWORDS_TO_TYPE if keyword_map is None else keyword_map
    text = " ".join([*tags, title or "", notes or ""]).lower()

    hits = set()
    for keyword, hazard_type in mapping.items():
        words = keyword.lower().split()
        if not words:
            continue  # an empty keyword would match everywhere
        body = r"\s+".join(re.escape(word) for word in words)
        # [^\W_] means "letter or digit": the keyword must not be glued to one
        # on either side, while "_" still acts as a separator.
        pattern = rf"(?<![^\W_]){body}(?:e?s)?(?![^\W_])"
        if re.search(pattern, text):
            hits.add(hazard_type)
    return sorted(hits)


def hazard_suffix_for_filename(hazard_types: List[str]) -> str:
    """Return the filename suffix for the given hazard types.

    Empty input gives "", more than one type gives "_multihazard", and a
    single type gives "_" plus its (possibly aliased) slug.
    """
    if not hazard_types:
        return ""
    if len(hazard_types) == 1:
        only = hazard_types[0]
        label = HAZARD_FILENAME_ALIASES.get(only, only)
        return "_" + slugify_token(label, max_len=24)
    return "_multihazard"


def parse_components(s: Any) -> List[str]:
    """Map a delimited component string to RDLS component names.

    Unrecognised labels are dropped and duplicates are removed, keeping the
    order of first appearance.
    """
    mapped = (
        COMPONENT_MAP.get(token.strip().lower())
        for token in split_semicolon_list(s)
    )
    # dict.fromkeys keeps first-seen order while discarding repeats
    return list(dict.fromkeys(name for name in mapped if name))


def choose_prefix(risk_data_type: List[str]) -> str:
    """Pick the RDLS filename prefix, preferring loss > vulnerability > exposure > hazard."""
    present = set(risk_data_type)
    ranked = (
        ("loss", "rdls_lss-"),
        ("vulnerability", "rdls_vln-"),
        ("exposure", "rdls_exp-"),
    )
    for component, prefix in ranked:
        if component in present:
            return prefix
    # Hazard-only, or nothing recognised
    return "rdls_hzd-"


def map_license(hdx_license_title: str) -> str:
    """
    Map HDX license strings to RDLS schema suggestions.

    Plain "CC BY" is only chosen when no SA / NC / ND modifier is present.
    Modifiers are detected as whole tokens (or their spelled-out forms such as
    "ShareAlike", "NonCommercial", "NoDerivs"), not as raw substrings, so
    unrelated words like "Licence" or "standard" no longer block the match.

    Parameters:
        hdx_license_title: License title from HDX

    Returns:
        RDLS-compatible license identifier; "" for blank input; the original
        (stripped) title when nothing matches
    """
    raw = (hdx_license_title or "").strip()
    if not raw:
        return ""

    key = re.sub(r"\s+", " ", raw.lower().strip())

    # Pattern mappings
    if re.search(r"\bcc0\b", key) or ("public domain" in key and "cc0" in key):
        return "CC0-1.0"
    if "odbl" in key or "open database license" in key:
        return "ODbL-1.0"
    if "pddl" in key or "public domain dedication" in key:
        return "PDDL-1.0"

    # Creative Commons variants
    k2 = key.replace("creative commons", "cc")

    has_modifier = re.search(
        r"\b(?:sa|nc|nd)\b|\bshare\W?alike|\bnon\W?commercial|\bno\W?deriv", k2
    )

    if re.search(r"\bcc\s*by\b", k2) and not has_modifier:
        # Unversioned CC BY defaults to 4.0
        return "CC-BY-4.0" if "4.0" in k2 or "v4" in k2 else ("CC-BY-3.0" if "3.0" in k2 else "CC-BY-4.0")
    if "by-sa" in k2 or re.search(r"\bcc\s*by\s*sa\b", k2):
        return "CC-BY-SA-4.0" if "4.0" in k2 else ("CC-BY-SA-3.0" if "3.0" in k2 else "CC-BY-SA-4.0")
    if "by-nc" in k2 or re.search(r"\bcc\s*by\s*nc\b", k2):
        return "CC-BY-NC-4.0" if "4.0" in k2 else ("CC-BY-NC-3.0" if "3.0" in k2 else "CC-BY-NC-4.0")

    # Last resort: exact-title table, else keep the HDX wording unchanged
    return HDX_LICENSE_TO_RDLS.get(re.sub(r"\s+", " ", k2).strip(), raw)


def map_data_format(hdx_fmt: str, url: str = "") -> Optional[str]:
    """
    Map HDX format to RDLS data_format enum.

    The declared HDX format wins when it is in HDX_FORMAT_TO_RDLS. Otherwise
    the format is guessed from the file extension of the URL. The guess looks
    at the URL *path* first, so a query string or fragment
    ("file.csv?download=1") no longer hides the extension, and then falls
    back to the whole URL as before.

    Parameters:
        hdx_fmt: Format label declared on the HDX resource
        url: Download URL used for the extension guess

    Returns:
        RDLS data_format value, or None when no mapping applies
    """
    from urllib.parse import urlsplit

    s = (hdx_fmt or "").strip().upper()
    if s in HDX_FORMAT_TO_RDLS:
        return HDX_FORMAT_TO_RDLS[s]

    # Guess from URL extension
    u = (url or "").strip().lower()
    try:
        path = urlsplit(u).path
    except ValueError:
        # Malformed URL (e.g. a broken IPv6 literal): use the raw text
        path = u

    # Order matters: ".geojson" must be tested before ".json".
    ext_map = [
        (".geojson", "GeoJSON (geojson)"), (".json", "JSON (json)"),
        (".csv", "CSV (csv)"), (".xlsx", "Excel (xlsx)"), (".xls", "Excel (xlsx)"),
        (".shp", "Shapefile (shp)"), (".zip", "Shapefile (shp)"),
        (".tif", "GeoTIFF (tif)"), (".tiff", "GeoTIFF (tif)"),
        (".nc", "NetCDF (nc)"), (".pdf", "PDF (pdf)"),
        (".parquet", "Parquet (parquet)"), (".gpkg", "GeoPackage (gpkg)"),
    ]
    for candidate in (path, u):
        for ext, rdls in ext_map:
            if candidate.endswith(ext):
                return rdls
    return None


# Cell-completion marker for the notebook output
print("Helper functions loaded successfully.")

## 6. Component Gating Logic

In [None]:
"""
Component gating logic for RDLS risk_data_type validation.

Rules:
    - vulnerability must co-occur with hazard or exposure
    - loss must co-occur with hazard or exposure
    - risk_data_type must be non-empty
"""

@dataclass(frozen=True)
class ComponentGateResult:
    """
    Outcome of the component gating check.

    The dataclass is frozen, so fields cannot be reassigned after creation;
    the risk_data_type list itself is still a mutable list.

    Attributes:
        ok: Whether validation passed
        reasons: Tuple of reason codes (failure reasons and/or auto-repair notes)
        risk_data_type: Validated/repaired risk data type list
    """
    ok: bool
    reasons: Tuple[str, ...]
    risk_data_type: List[str]


def apply_component_gate(components: List[str]) -> ComponentGateResult:
    """
    Check a component list against the RDLS combination rules.

    Vulnerability and loss each need hazard or exposure alongside them. When
    config.auto_repair_components is on, a missing companion is repaired by
    adding exposure and noted in the reasons; otherwise the result fails.

    Parameters:
        components: List of risk components

    Returns:
        ComponentGateResult with validation status, reason codes, and the
        sorted (possibly repaired) component list
    """
    recognised = {"hazard", "exposure", "vulnerability", "loss"}.intersection(components or [])

    if not recognised:
        return ComponentGateResult(
            ok=False,
            reasons=("empty_or_unrecognized_components",),
            risk_data_type=[],
        )

    # (dependent component, reason when auto-repaired, reason when rejected)
    rules = (
        ("vulnerability", "auto_added_exposure_for_vulnerability", "vulnerability_without_hazard_or_exposure"),
        ("loss", "auto_added_exposure_for_loss", "loss_without_hazard_or_exposure"),
    )

    notes: List[str] = []
    passed = True
    for dependent, repaired_reason, rejected_reason in rules:
        if dependent not in recognised or recognised & {"hazard", "exposure"}:
            continue
        if config.auto_repair_components:
            recognised.add("exposure")
            notes.append(repaired_reason)
        else:
            passed = False
            notes.append(rejected_reason)

    return ComponentGateResult(ok=passed, reasons=tuple(notes), risk_data_type=sorted(recognised))


# Record in the notebook output whether standalone vulnerability/loss will be auto-repaired
print(f"Component gating configured (auto_repair={config.auto_repair_components})")

## 7. RDLS Record Builder

In [None]:
"""
Build RDLS dataset records from HDX metadata.

Creates minimal, schema-safe records with:
    - Required attributions (publisher, creator, contact)
    - Resources with proper format mapping
    - Spatial information inferred from groups
"""

def build_attributions(hdx: Dict[str, Any], dataset_id: str, dataset_page_url: str) -> List[Dict[str, Any]]:
    """
    Assemble the publisher, creator and contact-point attributions.

    Parameters:
        hdx: HDX dataset metadata
        dataset_id: Dataset UUID (not read by this function)
        dataset_page_url: HDX dataset landing page URL

    Returns:
        List of exactly three attribution objects
    """
    publisher_name = (hdx.get("organization") or "").strip() or "Unknown publisher"
    creator_name = (hdx.get("dataset_source") or "").strip() or publisher_name

    # A source that is itself a URL doubles as the creator link
    creator_link = dataset_page_url
    if looks_like_url(creator_name):
        creator_link = creator_name

    publisher = {
        "id": "attribution_publisher",
        "role": "publisher",
        "entity": {"name": publisher_name, "url": dataset_page_url},
    }
    creator = {
        "id": "attribution_creator",
        "role": "creator",
        "entity": {"name": creator_name, "url": creator_link},
    }
    contact = {
        "id": "attribution_contact",
        "role": "contact_point",
        "entity": {"name": publisher_name, "url": dataset_page_url},
    }
    return [publisher, creator, contact]


def build_resources(hdx: Dict[str, Any], dataset_id: str) -> List[Dict[str, Any]]:
    """
    Convert HDX resources into RDLS resource objects.

    The HDX metadata export is always the first entry. HDX resources without a
    download URL or without a mappable format are left out, and when two
    entries end up with the same id only the first is kept.

    Parameters:
        hdx: HDX dataset metadata
        dataset_id: Dataset UUID

    Returns:
        List of resource objects (at least the metadata export)
    """
    collected: List[Dict[str, Any]] = [
        {
            "id": "hdx_dataset_metadata_json",
            "title": "HDX dataset metadata (JSON)",
            "description": "Dataset-level metadata exported from HDX.",
            "data_format": "JSON (json)",
            "access_modality": "file_download",
            "download_url": f"https://data.humdata.org/dataset/{dataset_id}/download_metadata?format=json",
        }
    ]

    for res in hdx.get("resources", []) or []:
        res_id = (res.get("id") or "").strip()
        res_name = (res.get("name") or "").strip() or res_id[:8] or "resource"
        res_desc = (res.get("description") or "").strip() or f"HDX resource: {res_name}"
        download = (res.get("download_url") or "").strip()
        rdls_format = map_data_format(res.get("format") or "", download)

        if not (download and rdls_format):
            continue

        collected.append({
            "id": f"hdx_res_{res_id[:8] or slugify_token(res_name, 8)}",
            "title": res_name,
            "description": res_desc,
            "data_format": rdls_format,
            "access_modality": "file_download",
            "download_url": download,
        })

    # Keep the first occurrence of each id
    unique: List[Dict[str, Any]] = []
    used_ids = set()
    for item in collected:
        if item["id"] in used_ids:
            continue
        used_ids.add(item["id"])
        unique.append(item)
    return unique


def _first_text(*candidates: Any) -> str:
    """Return the first candidate that is neither None, NaN, nor blank, as a stripped string ("" if none)."""
    for value in candidates:
        if value is None or (isinstance(value, float) and pd.isna(value)):
            continue
        text = str(value).strip()
        if text:
            return text
    return ""


def build_rdls_record(
    hdx: Dict[str, Any],
    class_row: pd.Series,
) -> Tuple[Optional[Dict[str, Any]], Dict[str, Any]]:
    """
    Build RDLS record from HDX metadata and classification.

    Values taken from the classification row may be NaN when the CSV cell was
    blank. NaN is truthy, so a plain ``a or b`` fallback would pick it and
    yield the text "nan" (or crash on ``.strip()``); the fallbacks here skip
    None, NaN and blank strings instead.

    Parameters:
        hdx: HDX dataset metadata
        class_row: Classification row from Step 5

    Returns:
        Tuple of (rdls_record or None if blocked, info dict). The info dict
        always has dataset_id, blocked, blocked_reasons and risk_data_type;
        for written records it also carries naming and QA fields.
    """
    dataset_id = str(class_row["dataset_id"])
    title = _first_text(hdx.get("title"), class_row.get("title"))
    notes = _first_text(hdx.get("notes"))

    # Parse and validate components
    components = parse_components(class_row.get("rdls_components"))
    gate = apply_component_gate(components)

    if not gate.ok:
        return None, {
            "dataset_id": dataset_id,
            "blocked": True,
            "blocked_reasons": ";".join(gate.reasons),
            "risk_data_type": ";".join(gate.risk_data_type),
        }

    # Infer spatial from groups
    groups = split_semicolon_list(class_row.get("groups"))
    spatial = infer_spatial(groups)

    # Infer hazard types for naming
    tags = split_semicolon_list(class_row.get("tags"))
    hazard_types = infer_hazard_types(tags, title=title, notes=notes)

    dataset_page_url = f"https://data.humdata.org/dataset/{dataset_id}"

    # Build entity token for naming: prefer the HDX slug, else a slug of the title
    hdx_slug = slugify_token(str(hdx.get("name") or ""), max_len=48)
    title_slug = slugify_token(title, max_len=48)
    dataset_slug = hdx_slug if hdx_slug != "unknown" else title_slug

    org_text = _first_text(class_row.get("organization"), hdx.get("organization")) or "unknown"
    org_token = slugify_token(org_text, max_len=20)
    # The country token is only used when exactly one country was resolved
    iso3_tok = str(spatial["countries"][0]).lower() if spatial.get("countries") and len(spatial["countries"]) == 1 else ""

    # Compose identifier
    parts = [p for p in [org_token, iso3_tok, dataset_slug] if p]
    entity_token = "_".join(parts)

    prefix = choose_prefix(gate.risk_data_type) + "hdx_"
    # Hazard suffix only applies to records that carry hazard or loss
    hz_suffix = hazard_suffix_for_filename(hazard_types) if ({"hazard", "loss"} & set(gate.risk_data_type)) else ""

    stem_base = f"{prefix}{entity_token}{hz_suffix}"
    stem = stem_base

    # Collision-proofing: if a file with this name is already on disk,
    # disambiguate with the first 8 characters of the dataset id
    out_path = OUT_RECORDS_DIR / f"{stem}.json"
    if out_path.exists():
        stem = f"{stem_base}__{dataset_id[:8]}"

    # Map license ("Custom" stands in when neither source has a title)
    license_raw = _first_text(class_row.get("license_title"), hdx.get("license_title"))
    license_mapped = map_license(license_raw or "Custom")

    # Build RDLS dataset record
    rdls_ds: Dict[str, Any] = {
        "id": stem,
        "title": title or f"HDX dataset {dataset_id}",
        "description": notes or None,
        "risk_data_type": gate.risk_data_type,
        "spatial": spatial,
        "license": license_mapped,
        "attributions": build_attributions(hdx, dataset_id, dataset_page_url),
        "resources": build_resources(hdx, dataset_id),
        "links": [{
            "href": rdls_schema.get("$id") or "https://docs.riskdatalibrary.org/en/latest/reference/rdls_schema/",
            "rel": "describedby",
        }],
    }

    # Remove None values and filter to allowed keys
    rdls_ds = {k: v for k, v in rdls_ds.items() if v is not None and k in RDLS_ALLOWED_KEYS}

    # Wrap in top-level structure
    rdls_record = {"datasets": [rdls_ds]}

    # "orgtoken" and "organization_token" carry the same value; both are kept
    # so existing consumers of either key keep working.
    info = {
        "dataset_id": dataset_id,
        "rdls_id": stem,
        "filename": f"{stem}.json",
        "risk_data_type": ";".join(gate.risk_data_type),
        "spatial_scale": spatial.get("scale", ""),
        "countries_count": len(spatial.get("countries", []) or []),
        "license_raw": license_raw,
        "orgtoken": org_token,
        "hazard_suffix": hz_suffix.lstrip("_"),
        "organization_token": org_token,
        "iso3": iso3_tok,
        "hazard_types": ";".join(hazard_types),
        "blocked": False,
        "blocked_reasons": "",
    }

    return rdls_record, info


# Cell-completion marker for the notebook output
print("Record builder functions loaded.")

## 8. Clean Previous Outputs (Optional)

In [None]:
"""
Clean previous outputs if configured.

Only cleans when OUTPUT_MODE='in_place' and CLEAN_BEFORE_RUN=True.
Includes safety guardrails to prevent accidental deletion.
"""

def _safe_clean_folder(folder: Path, pattern: str) -> int:
    """
    Safely delete files matching pattern in folder.

    Parameters:
        folder: Directory to clean
        pattern: Glob pattern for files to delete

    Returns:
        Number of files deleted

    Raises:
        ValueError: If the folder is not located inside RDLS_DIR or is not
            named records / index / reports
    """
    folder = folder.resolve()
    root = RDLS_DIR.resolve()

    # Safety: only clean inside RDLS_DIR. Compare path components rather than
    # string prefixes: ".../rdls_backup/records" starts with ".../rdls" as
    # text but is not inside it.
    try:
        folder.relative_to(root)
    except ValueError:
        raise ValueError(f"Refusing to clean outside rdls/: {folder}") from None

    # Safety: only allow expected folder names
    if folder.name not in {"records", "index", "reports"}:
        raise ValueError(f"Unexpected folder name for cleaning: {folder.name}")

    n = 0
    for f in folder.glob(pattern):
        try:
            f.unlink()
            n += 1
        except OSError as e:
            # Best effort: report and carry on with the remaining files
            print(f"WARNING: failed to delete {f}: {e}")
    return n


# Remove earlier outputs only for in-place runs that asked for it; each folder
# is cleaned with the single file pattern it is expected to contain.
if config.output_mode == "in_place" and config.clean_before_run:
    print("Cleaning previous outputs...")
    removed_records = _safe_clean_folder(OUT_RECORDS_DIR, "*.json")
    removed_index = _safe_clean_folder(OUT_INDEX_DIR, "*.jsonl")
    removed_reports = _safe_clean_folder(OUT_REPORTS_DIR, "*.csv")
    print(f"Cleaned: records={removed_records}, index={removed_index}, reports={removed_reports}")
else:
    print("No cleaning performed (clean_before_run=False or output_mode=run_folder)")

## 9. Validate and Write Records

In [None]:
"""
Process all datasets: build records, validate, and write outputs.

Outputs:
    - Individual RDLS JSON records
    - Index JSONL file
    - Blocked datasets report
    - Validation results report
    - QA summary report
"""

# --- JSON Schema validation setup ---
def try_import_jsonschema():
    """Return the jsonschema module if it can be imported, otherwise None."""
    try:
        import jsonschema as module
    except ImportError:
        module = None
    return module

_jsonschema = try_import_jsonschema()
# Stays None when jsonschema is missing or the validator cannot be built;
# validate_record treats None as "validation skipped".
validator = None

if _jsonschema is not None:
    try:
        # NOTE(review): the Draft 2020-12 validator is used unconditionally -
        # confirm this matches the draft the RDLS schema declares.
        validator = _jsonschema.Draft202012Validator(rdls_schema)
        print("jsonschema validation enabled (Draft2020-12)")
    except Exception as e:
        print(f"WARNING: jsonschema init failed: {e}")
else:
    print("WARNING: jsonschema not installed; validation will be skipped")


def validate_record(rec: Dict[str, Any]) -> Tuple[bool, str]:
    """Check the first dataset of *rec* against the schema.

    Returns (True, "") when it is valid or when no validator is available;
    otherwise (False, message) where message joins up to ten
    "path: error" entries with " | ".
    """
    if validator is None:
        return True, ""

    found = list(validator.iter_errors(rec["datasets"][0]))
    found.sort(key=lambda err: err.path)
    if not found:
        return True, ""

    parts = []
    for err in found[:10]:
        location = ".".join(map(str, err.path))
        parts.append(f"{location}: {err.message}")
    return False, " | ".join(parts)


def append_jsonl(path: Path, obj: Dict[str, Any]) -> None:
    """Serialize *obj* as one JSON line and append it to the file at *path*.

    Non-ASCII characters are written as-is (UTF-8) rather than escaped.
    """
    line = json.dumps(obj, ensure_ascii=False)
    with path.open(mode="a", encoding="utf-8") as handle:
        handle.write(f"{line}\n")


# Start from an empty index file; entries are appended as records are written.
OUT_INDEX_JSONL.write_text("", encoding="utf-8")

# Row buffers, one per report (blocked / schema validation / QA).
blocked_rows: List[Dict[str, Any]] = []
validation_rows: List[Dict[str, Any]] = []
qa_rows: List[Dict[str, Any]] = []

# Running tallies for the end-of-run summary.
written = skipped_existing = blocked = validated_ok = 0

print(f"\nProcessing {len(included_ids):,} datasets...")

def _qa_entry(dataset_id: Any, status: str, reason: str, details: Dict[str, Any]) -> Dict[str, Any]:
    """Build one QA report row.

    Fields missing from *details* fall back to an empty string
    (``0`` for ``countries_count``).
    """
    return {
        "dataset_id": dataset_id,
        "output_id": details.get("rdls_id", ""),
        "filename": details.get("filename", ""),
        "risk_data_type": details.get("risk_data_type", ""),
        "spatial_scale": details.get("spatial_scale", ""),
        "countries_count": details.get("countries_count", 0),
        "license_raw": details.get("license_raw", ""),
        "orgtoken": details.get("orgtoken", ""),
        "hazard_suffix": details.get("hazard_suffix", ""),
        "status": status,
        "reason": reason,
    }


for dataset_id in tqdm(included_ids, desc="Translating to RDLS"):
    hdx_path = dataset_file_index.get(dataset_id)

    # 1) No HDX JSON available for this ID -> blocked
    if hdx_path is None or not hdx_path.exists():
        blocked += 1
        blocked_rows.append({
            "dataset_id": dataset_id,
            "status": "blocked_missing_hdx_dataset_json",
            "reason": "missing_hdx_dataset_json",
            "risk_data_type": "",
        })
        qa_rows.append(
            _qa_entry(dataset_id, "blocked_missing_hdx_dataset_json", "missing_hdx_dataset_json", {})
        )
        continue

    # 2) Load the HDX metadata plus its classification row and build the record
    hdx = safe_load_json(hdx_path)
    row = df.loc[dataset_id]
    rdls_rec, info = build_rdls_record(hdx, row)

    # 3) Builder declined to produce a record -> blocked by policy
    if rdls_rec is None:
        blocked += 1
        reason = info.get("blocked_reasons") or "blocked_by_policy"
        rdt = info.get("risk_data_type") or ""
        blocked_rows.append({
            "dataset_id": dataset_id,
            "status": "blocked_by_policy",
            "reason": reason,
            "risk_data_type": rdt,
        })
        qa_rows.append(_qa_entry(dataset_id, "blocked_by_policy", reason, {"risk_data_type": rdt}))
        continue

    out_path = OUT_RECORDS_DIR / info["filename"]

    # 4) Optionally leave an already-written record untouched
    # NOTE(review): skipped records are not appended to the index file, so the
    # index can list fewer records than exist on disk — confirm this is intended.
    if config.skip_existing and out_path.exists():
        skipped_existing += 1
        qa_rows.append(_qa_entry(dataset_id, "skipped_existing", "", info))
        continue

    # 5) Schema validation: the outcome is reported but does not stop the write
    is_valid, detail = validate_record(rdls_rec)
    validation_rows.append({
        "dataset_id": dataset_id,
        "rdls_id": info["rdls_id"],
        "filename": info["filename"],
        "valid": is_valid,
        "message": detail,
    })
    validated_ok += int(is_valid)

    # 6) Write the record (pretty-printed on request) and index it
    json_indent = 2 if config.write_pretty_json else None
    payload = json.dumps(rdls_rec, indent=json_indent, ensure_ascii=False) + "\n"
    out_path.write_text(payload, encoding="utf-8")

    append_jsonl(OUT_INDEX_JSONL, info)
    written += 1

    qa_rows.append(_qa_entry(dataset_id, "written", "", info))

# Summary banner followed by the run tallies, one line each
print(
    "\n" + "=" * 50,
    "TRANSLATION COMPLETE",
    "=" * 50,
    f"Written: {written:,}",
    f"Skipped (existing): {skipped_existing:,}",
    f"Blocked: {blocked:,}",
    f"Schema valid: {validated_ok:,} of {len(validation_rows):,}",
    sep="\n",
)

## 10. Save Reports

In [None]:
"""
Save translation reports to CSV files.
"""

def _write_report(rows, columns, path):
    """Build a DataFrame with the given column order, save it as CSV, and return it."""
    frame = pd.DataFrame(rows, columns=columns)
    frame.to_csv(path, index=False)
    print(f"Wrote: {path}")
    return frame


# Blocked datasets report
blocked_df = _write_report(
    blocked_rows,
    ["dataset_id", "status", "reason", "risk_data_type"],
    OUT_BLOCKED_CSV,
)

# Schema validation report
val_df = _write_report(
    validation_rows,
    ["dataset_id", "rdls_id", "filename", "valid", "message"],
    OUT_VALIDATION_CSV,
)

# QA report
qa_df = _write_report(
    qa_rows,
    [
        "dataset_id", "output_id", "filename", "risk_data_type",
        "spatial_scale", "countries_count", "license_raw", "orgtoken",
        "hazard_suffix", "status", "reason",
    ],
    OUT_QA_CSV,
)

print(f"\nIndex file: {OUT_INDEX_JSONL}")

## 11. QA Summary

In [None]:
"""
Quick QA summary of translation results.
"""
from pandas.errors import EmptyDataError


def safe_read_csv(path: Path) -> pd.DataFrame:
    """Load a CSV file, falling back to an empty DataFrame when pandas
    raises ``EmptyDataError`` for it."""
    try:
        frame = pd.read_csv(path)
    except EmptyDataError:
        frame = pd.DataFrame()
    return frame


print("\n" + "=" * 50)
print("QA SUMMARY")
print("=" * 50)

# Index stats: lines in the index file vs. record files present on disk
idx_lines = OUT_INDEX_JSONL.read_text(encoding="utf-8").strip().splitlines()
print(f"Index lines: {len(idx_lines):,}")
print(f"Records on disk: {sum(1 for _ in OUT_RECORDS_DIR.glob('*.json')):,}")

# Blocked stats
if OUT_BLOCKED_CSV.exists():
    blocked_df = safe_read_csv(OUT_BLOCKED_CSV)
    print(f"\nBlocked datasets: {len(blocked_df):,}")
    if "reason" in blocked_df.columns and not blocked_df.empty:
        print("Top blocked reasons:")
        top_reasons = blocked_df["reason"].value_counts().head(5)
        for reason, count in top_reasons.items():
            print(f"  - {reason}: {count:,}")

# Validation stats
if OUT_VALIDATION_CSV.exists():
    val_df = safe_read_csv(OUT_VALIDATION_CSV)
    if not val_df.empty:
        # Messages of the rows whose "valid" flag is False
        failures = val_df.loc[val_df["valid"].eq(False), "message"]
        print(f"\nValidation failures: {len(failures):,}")
        if not failures.empty:
            print("Top validation errors:")
            top_errors = failures.value_counts().head(5)
            for msg, count in top_errors.items():
                # Only the first 60 characters of each message are shown
                print(f"  - {msg[:60]}...: {count:,}")

# QA status
if OUT_QA_CSV.exists():
    qa_df = safe_read_csv(OUT_QA_CSV)
    if "status" in qa_df.columns and not qa_df.empty:
        print("\nQA Status Distribution:")
        status_counts = qa_df["status"].value_counts()
        for status, count in status_counts.items():
            print(f"  - {status}: {count:,}")

print("\n" + "=" * 50)
print("Step 6 complete. Proceed to Step 7 for validation and packaging.")
print("=" * 50)