# Notebook 03: Define RDLS Mapping Model

**Purpose**: Create mapping configuration for classifying HDX datasets into RDLS components.

**Process**:
1. Scan corpus to collect tag/org/format statistics
2. Generate weighted mapping configs (tags → RDLS components)
3. Create keyword pattern rules for title/notes
4. Produce review sample for calibration

**Author**: Benny Istanto/Risk Data Librarian/GFDRR  
**Version**: 2026.1

---

## 1. Setup

In [1]:
"""
1.1 Import Dependencies
"""

from __future__ import annotations

import csv
import json
import random
import re
from collections import Counter, defaultdict
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple

# Optional: tqdm for progress bars
try:
    from tqdm.notebook import tqdm
    HAS_TQDM = True
except ImportError:
    HAS_TQDM = False
    print("Note: tqdm not installed. Install with: pip install tqdm")

print(f"Notebook started: {datetime.now().isoformat()}")
print(f"Progress bars: {'Available' if HAS_TQDM else 'Not available'}")

Notebook started: 2026-02-10T21:11:41.780502
Progress bars: Available


In [2]:
"""
1.2 Configure Paths
"""

# Resolve the project root: step up one level when the notebook is launched
# from inside the 'notebook' folder, otherwise use the working directory as-is.
NOTEBOOK_DIR = Path.cwd()
if NOTEBOOK_DIR.name == 'notebook':
    BASE_DIR = NOTEBOOK_DIR.parent
else:
    BASE_DIR = NOTEBOOK_DIR

# Inputs: crawler dump, per-dataset JSON metadata, and the OSM exclusion policy file
DUMP_DIR = BASE_DIR.joinpath('hdx_dataset_metadata_dump')
DATASET_DIR = DUMP_DIR.joinpath('dataset_metadata')
POLICY_DIR = DUMP_DIR.joinpath('policy')
OSM_EXCLUDED_IDS_TXT = POLICY_DIR.joinpath('osm_excluded_dataset_ids.txt')

# Outputs: YAML mapping configs and pipeline reference materials (mapping docs, samples)
CONFIG_DIR = DUMP_DIR.joinpath('config')
REFERENCE_DIR = DUMP_DIR.joinpath('reference')

# Tunables
RANDOM_SEED = 42          # seeds the review-sample draw
SAMPLE_SIZE = 400         # size of the random part of the review sample
MAX_NOTES_CHARS = 350     # truncation length for the notes snippet column
OVERWRITE_CONFIG = True   # rewrite config/reference files even if they exist

# ── Output cleanup mode ───────────────────────────────────────────────
# One of "replace", "prompt", "skip", "abort". Currently "replace": previous
# outputs are deleted and regenerated on every run. Switch to "skip" to keep
# hand-curated config/reference files, which are semi-static.
CLEANUP_MODE = "replace"

# Make sure both output folders exist before anything is written.
for _out_dir in (CONFIG_DIR, REFERENCE_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)

for _label, _location in (
    ("Dataset dir", DATASET_DIR),
    ("Config dir", CONFIG_DIR),
    ("Reference dir", REFERENCE_DIR),
):
    print(f"{_label}: {_location}")


Dataset dir: /mnt/c/Users/benny/OneDrive/Documents/Github/hdx-metadata-crawler/hdx_dataset_metadata_dump/dataset_metadata
Config dir: /mnt/c/Users/benny/OneDrive/Documents/Github/hdx-metadata-crawler/hdx_dataset_metadata_dump/config
Reference dir: /mnt/c/Users/benny/OneDrive/Documents/Github/hdx-metadata-crawler/hdx_dataset_metadata_dump/reference


In [3]:
"""
1.3 Clean Previous Outputs

Remove stale output files from previous runs (controlled by CLEANUP_MODE).
CLEANUP_MODE is set in section 1.2 and is currently "replace", so outputs are
regenerated on every run; switch it to "skip" to keep hand-curated config files.
"""

def clean_previous_outputs(output_dir, patterns, label, mode="replace"):
    """
    Remove previous output files matching the given glob patterns.

    Parameters
    ----------
    output_dir : Path
        Directory containing old outputs.
    patterns : list[str]
        Glob patterns to match.
    label : str
        Human-readable label for log messages.
    mode : str
        One of: "replace" (auto-delete), "prompt" (ask user),
        "skip" (keep old files), "abort" (error if stale files exist).

    Returns
    -------
    dict  with keys 'deleted' (int) and 'skipped' (bool)

    Raises
    ------
    ValueError
        If ``mode`` is not one of the four supported values.
    RuntimeError
        If ``mode`` is "abort" and matching files exist, or the user
        chooses to abort at the prompt.
    """
    # Reject unknown modes up front: otherwise a typo such as "skp" would fall
    # through to the destructive "replace" branch and delete files silently.
    valid_modes = ('replace', 'prompt', 'skip', 'abort')
    if mode not in valid_modes:
        raise ValueError(
            f'Output cleanup [{label}]: unknown mode {mode!r}; '
            f'expected one of {valid_modes}.'
        )

    result = {'deleted': 0, 'skipped': False}

    # Group matches by pattern so the summary can report per-pattern counts.
    targets = {}
    for pattern in patterns:
        matches = sorted(output_dir.glob(pattern))
        if matches:
            targets[pattern] = matches
    total = sum(len(files) for files in targets.values())

    if total == 0:
        print(f'Output cleanup [{label}]: Directory is clean.')
        return result

    summary = [
        f'  {pattern:40s}: {len(files):,} files'
        for pattern, files in targets.items()
    ]

    if mode == 'skip':
        print(f'Output cleanup [{label}]: SKIPPED ({total:,} existing files kept)')
        result['skipped'] = True
        return result

    if mode == 'abort':
        raise RuntimeError(
            f'Output cleanup [{label}]: ABORT -- {total:,} stale files found. '
            f'Delete manually or change CLEANUP_MODE.'
        )

    if mode == 'prompt':
        print(f'Output cleanup [{label}]: Found {total:,} existing output files:')
        for line in summary:
            print(line)
        choice = input('Choose [R]eplace / [S]kip / [A]bort: ').strip().lower()
        if choice in ('s', 'skip'):
            print('  Skipped.')
            result['skipped'] = True
            return result
        elif choice in ('a', 'abort'):
            raise RuntimeError('User chose to abort.')
        elif choice not in ('r', 'replace', ''):
            # Any unrecognised answer (and a bare Enter) means Replace.
            print('  Unknown choice, defaulting to Replace.')

    # Mode: replace (also reached from "prompt" when the user picks Replace)
    print(f'Output cleanup [{label}]:')
    for line in summary:
        print(line)
    for files in targets.values():
        for f in files:
            try:
                f.unlink()
                result['deleted'] += 1
            except OSError as e:
                # Best effort: report and continue with the remaining files.
                print(f'  WARNING: Could not delete {f.name}: {e}')
    deleted_count = result['deleted']
    print(f'  Cleaned {deleted_count:,} files. Ready for fresh output.')
    print()
    return result

# ── Run cleanup ────────────────────────────────────────────────────────
# Clear the three YAML mapping configs this notebook regenerates in section 5.
# Patterns are exact file names, so nothing else in CONFIG_DIR is matched.
clean_previous_outputs(
    CONFIG_DIR,
    patterns=[
        "tag_to_rdls_component.yaml",
        "keyword_to_rdls_component.yaml",
        "org_hints.yaml",
    ],
    label="NB 03 Config Files",
    mode=CLEANUP_MODE,
)

# Clear the reference documents (rules write-up and review sample CSV).
# This call is the cell's last expression, so its returned dict is what the
# notebook shows as the cell result.
clean_previous_outputs(
    REFERENCE_DIR,
    patterns=[
        "mapping_rules.md",
        "samples_for_mapping.csv",
    ],
    label="NB 03 Reference Files",
    mode=CLEANUP_MODE,
)


Output cleanup [NB 03 Config Files]:
  tag_to_rdls_component.yaml              : 1 files
  keyword_to_rdls_component.yaml          : 1 files
  org_hints.yaml                          : 1 files
  Cleaned 3 files. Ready for fresh output.

Output cleanup [NB 03 Reference Files]:
  mapping_rules.md                        : 1 files
  samples_for_mapping.csv                 : 1 files
  Cleaned 2 files. Ready for fresh output.



{'deleted': 2, 'skipped': False}

## 2. Helper Functions

In [4]:
"""
2.1 JSON and Data Extraction Helpers
"""

def iter_json_files(folder: Path) -> Iterable[Path]:
    """Lazily yield the ``*.json`` files directly inside *folder* in sorted order.

    Raises FileNotFoundError (on first iteration) when *folder* does not exist.
    """
    if folder.exists():
        json_paths = list(folder.glob("*.json"))
        json_paths.sort()  # stable ordering keeps downstream sampling deterministic
        for json_path in json_paths:
            yield json_path
        return
    raise FileNotFoundError(f"Dataset folder not found: {folder}")


def read_json(path: Path) -> Dict[str, Any]:
    """Parse the JSON document stored at *path*.

    The file is decoded as UTF-8; undecodable bytes are dropped rather than
    raising.
    """
    with path.open("r", encoding="utf-8", errors="ignore") as handle:
        return json.load(handle)


def normalize_dataset_record(raw: Dict[str, Any]) -> Dict[str, Any]:
    """Return the dataset dict, unwrapping a ``{'dataset': {...}}`` envelope.

    A record that already carries an ``id`` (or is not a dict at all) is
    returned unchanged; the envelope is only unwrapped when its ``dataset``
    value is itself a dict.
    """
    if not isinstance(raw, dict) or "id" in raw:
        return raw
    wrapped = raw.get("dataset")
    return wrapped if isinstance(wrapped, dict) else raw


def get_org_title(ds: Dict[str, Any]) -> str:
    """Return the publishing organization's display name, stripped.

    For a dict-valued ``organization`` the ``title`` is preferred and ``name``
    is the fallback; any other value is treated as the name itself. Missing or
    empty values give ``""``.
    """
    org = ds.get("organization")
    if not isinstance(org, dict):
        label = org if org else ""
        return label.strip()

    label = org.get("title")
    if not label:
        label = org.get("name")
    if not label:
        label = ""
    return label.strip()


def get_tags(ds: Dict[str, Any]) -> List[str]:
    """Extract tags as stripped, lowercase, non-empty strings.

    Each entry of ``ds["tags"]`` may be a dict with a ``name`` key or a plain
    string. Entries that are neither, whose name is not a string, or whose
    name is empty after stripping are skipped so that blank tags never reach
    the tag statistics. Order and duplicates are preserved.

    Returns an empty list when ``tags`` is missing, falsy, or not a list.
    """
    tags = ds.get("tags") or []
    if not isinstance(tags, list):
        return []

    out: List[str] = []
    for t in tags:
        name = t.get("name") if isinstance(t, dict) else t
        if not isinstance(name, str):
            continue
        # Normalise first, then filter, so whitespace-only names are dropped too.
        name = name.strip().lower()
        if name:
            out.append(name)
    return out


def get_resource_formats(ds: Dict[str, Any]) -> List[str]:
    """Return the lowercase ``format`` of every resource dict, in order.

    Non-dict resources and blank formats are skipped; duplicates are kept so
    callers can count occurrences.
    """
    cleaned = (
        (res.get("format") or "").strip().lower()
        for res in (ds.get("resources") or [])
        if isinstance(res, dict)
    )
    return [fmt for fmt in cleaned if fmt]


def short_text(s: str, max_len: int) -> str:
    """Collapse whitespace runs in *s* and cut it to *max_len* characters.

    An ellipsis character is appended when text was cut, so a truncated
    result is ``max_len + 1`` characters long. ``None`` is treated as "".
    """
    collapsed = re.sub(r"\s+", " ", (s or "").strip())
    head = collapsed[:max_len]
    if len(collapsed) > max_len:
        return head + "…"
    return head


def load_excluded_ids(path: Path) -> set:
    """Read the OSM exclusion list: one dataset ID per line, blanks ignored.

    A missing file is not an error; a warning is printed and an empty set is
    returned.
    """
    if path.exists():
        raw_text = path.read_text(encoding="utf-8", errors="ignore")
        stripped_lines = (ln.strip() for ln in raw_text.splitlines())
        return {ln for ln in stripped_lines if ln}

    print(f"WARNING: OSM exclusion list not found: {path}")
    return set()


print("Helper functions defined.")

Helper functions defined.


## 3. Scan Corpus

In [5]:
"""
3.1 Load Exclusion List and Scan Files

Collect statistics on tags, organizations, and formats.
"""

# Dataset IDs on the OSM exclusion list are left out of every statistic below.
excluded_ids = load_excluded_ids(OSM_EXCLUDED_IDS_TXT)
print(f"Loaded {len(excluded_ids):,} excluded OSM dataset IDs")

# Frequency tables for the three metadata facets
tag_counter, org_counter, fmt_counter = Counter(), Counter(), Counter()

# Flattened per-dataset records, reused later to draw the review sample
records_for_sampling: List[Dict[str, Any]] = []

# Materialise the file list once so the total is known before scanning
files = list(iter_json_files(DATASET_DIR))
total = len(files)
kept = skipped_osm = 0

print(f"Scanning {total:,} JSON files...")

if HAS_TQDM:
    iterator = tqdm(files, desc="Scanning corpus")
else:
    iterator = files

for i, path in enumerate(iterator, start=1):
    # Plain-text heartbeat only when no progress bar is available
    if i % 5000 == 0 and not HAS_TQDM:
        print(f"  Processed {i:,}/{total:,}")

    raw = read_json(path)
    ds = normalize_dataset_record(raw)

    ds_id = (ds.get("id") or "").strip()
    if ds_id == "":
        # No usable identifier: the record is dropped and not counted anywhere
        continue
    if ds_id in excluded_ids:
        skipped_osm += 1
        continue

    kept += 1
    tags, org, fmts = get_tags(ds), get_org_title(ds), get_resource_formats(ds)

    tag_counter.update(tags)
    fmt_counter.update(fmts)
    if org:
        org_counter[org] += 1

    record = {
        "dataset_id": ds_id,
        "title": ds.get("title") or "",
        "name": ds.get("name") or "",
        "organization": org,
        "dataset_source": ds.get("dataset_source") or "",
        "license_title": ds.get("license_title") or ds.get("license_id") or "",
        "tags": tags,
        "formats": fmts,
        "notes": ds.get("notes") or "",
    }
    records_for_sampling.append(record)

print(f"\n{'='*60}")
print("CORPUS SUMMARY (non-OSM)")
print(f"{'='*60}")
for _caption, _value in (
    ("Total files", total),
    ("Excluded (OSM)", skipped_osm),
    ("Kept (non-OSM)", kept),
    ("Unique tags", len(tag_counter)),
    ("Unique orgs", len(org_counter)),
    ("Unique formats", len(fmt_counter)),
):
    print(f"{_caption}: {_value:,}")

Loaded 3,649 excluded OSM dataset IDs
Scanning 26,246 JSON files...


Scanning corpus:   0%|          | 0/26246 [00:00<?, ?it/s]


CORPUS SUMMARY (non-OSM)
Total files: 26,246
Excluded (OSM): 3,649
Kept (non-OSM): 22,597
Unique tags: 142
Unique orgs: 357
Unique formats: 47


In [6]:
"""
3.2 Display Top Statistics
"""

# Number of tags to list; organizations and formats use a fixed top 20.
TOP_N = 40

_RULE = "=" * 60  # horizontal rule framing each section heading

print("\n" + _RULE)
print(f"TOP {TOP_N} TAGS")
print(_RULE)
for tag_name, tag_count in tag_counter.most_common(TOP_N):
    print(f"  {tag_name:<45} {tag_count:>6,}")

print("\n" + _RULE)
print("TOP 20 ORGANIZATIONS")
print(_RULE)
for org_name, org_count in org_counter.most_common(20):
    print(f"  {org_name:<55} {org_count:>6,}")

print("\n" + _RULE)
print("TOP 20 FORMATS")
print(_RULE)
for fmt_name, fmt_count in fmt_counter.most_common(20):
    print(f"  {fmt_name:<15} {fmt_count:>8,}")


TOP 40 TAGS
  hxl                                            9,764
  indicators                                     7,387
  geodata                                        5,761
  health                                         2,925
  baseline population                            2,301
  food security                                  2,147
  economics                                      2,113
  education                                      2,092
  development                                    1,496
  environment                                    1,483
  demographics                                   1,393
  facilities-infrastructure                      1,349
  conflict-violence                              1,265
  climate-weather                                1,169
  population                                     1,141
  internally displaced persons-idp               1,134
  socioeconomics                                 1,099
  nutrition                                      1,0

## 4. Create Review Sample

In [7]:
"""
4.1 Generate Review Sample CSV

Random sample plus examples for top tags.
"""

# Seed the global RNG so the review sample is reproducible across runs.
random.seed(RANDOM_SEED)

# Random part of the sample (capped at the number of available records)
sample = random.sample(
    records_for_sampling,
    k=min(SAMPLE_SIZE, len(records_for_sampling)),
)

# Top up with one extra record per top tag.
# NOTE: a record is added for each tag whenever some record outside the
# sample carries it, whether or not the sample already contains that tag.
top_tags = [tag_name for tag_name, _ in tag_counter.most_common(40)]
present_ids = set()
present_ids.update(rec["dataset_id"] for rec in sample)

for tag in top_tags:
    extra = next(
        (
            rec
            for rec in records_for_sampling
            if rec["dataset_id"] not in present_ids and tag in rec["tags"]
        ),
        None,
    )
    if extra is not None:
        sample.append(extra)
        present_ids.add(extra["dataset_id"])

# Write the sample as CSV; list-valued fields are joined with ';'
out_sample_csv = REFERENCE_DIR / "samples_for_mapping.csv"

csv_columns = [
    "dataset_id", "title", "organization", "dataset_source",
    "license_title", "tags", "formats", "notes_snippet",
]

with out_sample_csv.open("w", newline="", encoding="utf-8") as f:
    w = csv.DictWriter(f, fieldnames=csv_columns)
    w.writeheader()
    w.writerows(
        {
            "dataset_id": rec["dataset_id"],
            "title": rec["title"],
            "organization": rec["organization"],
            "dataset_source": rec["dataset_source"],
            "license_title": rec["license_title"],
            "tags": ";".join(rec["tags"]),
            # formats are de-duplicated and sorted; tags keep their original order
            "formats": ";".join(sorted(set(rec["formats"]))),
            "notes_snippet": short_text(rec["notes"], MAX_NOTES_CHARS),
        }
        for rec in sample
    )

print(f"Wrote: {out_sample_csv}")
print(f"Sample rows: {len(sample)}")

Wrote: /mnt/c/Users/benny/OneDrive/Documents/Github/hdx-metadata-crawler/hdx_dataset_metadata_dump/reference/samples_for_mapping.csv
Sample rows: 440


## 5. Generate Mapping Configs

In [8]:
"""
5.1 Minimal YAML Writer

Safe YAML serializer without external dependencies.
"""

def yaml_escape(s: str) -> str:
    """Return *s* as a YAML scalar, double-quoting it only when required.

    A string is quoted when emitting it as a plain scalar would change its
    meaning or break parsing:

    - it is empty or has leading/trailing whitespace;
    - it contains a structural or control character (``: # { } [ ] ,``,
      newline, carriage return, tab);
    - it starts with a YAML indicator (``! & * | > ' " % @ `` or a
      ``- `` / ``? `` entry marker);
    - it spells a YAML boolean/null keyword (``true``, ``false``, ``null``,
      ``~``, and the YAML 1.1 forms ``yes``/``no``/``on``/``off``).

    Inside the quoted form, backslashes, double quotes and the control
    characters above are escaped so the value round-trips; this matters for
    regex patterns such as ``\\bfoo: bar\\b`` where an unescaped ``\\b`` would
    be read back as a backspace. Strings that need no quoting (including
    regexes made only of backslashes and word characters) are returned
    unchanged. Numeric-looking strings are deliberately left plain because
    callers pass stringified numbers through this function.
    """
    structural = (":", "#", "{", "}", "[", "]", ",", "\n", "\r", "\t")
    leading_indicators = "!&*|>'\"%@`"
    reserved_words = {"true", "false", "null", "~", "yes", "no", "on", "off"}

    needs_quotes = (
        s == ""
        or s.strip() != s
        or any(ch in s for ch in structural)
        or s[0] in leading_indicators
        or s in ("-", "?")
        or s.startswith(("- ", "? "))
        or s.lower() in reserved_words
    )
    if not needs_quotes:
        return s

    # Backslash must be escaped first so later replacements are not doubled.
    escaped = (
        s.replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
        .replace("\t", "\\t")
    )
    return '"' + escaped + '"'


def dump_yaml(obj: Any, indent: int = 0) -> str:
    """Serialize nested dicts/lists/scalars to a block-style YAML string.

    Parameters
    ----------
    obj : Any
        A dict, list, or scalar. Dict keys are emitted in sorted order for
        deterministic output; list order is preserved.
    indent : int
        Current nesting depth (two spaces per level).

    Returns
    -------
    str
        YAML text without a trailing newline.

    Notes
    -----
    - Empty containers are written in flow form (``{}`` / ``[]``); a bare
      ``key:`` would be read back as null.
    - ``None`` and ``bool`` are written as ``null`` / ``true`` / ``false`` so
      they keep their type; every other scalar goes through ``yaml_escape``.
    - Keys of mixed types that cannot be ordered are sorted by their string
      form instead of raising ``TypeError``.
    """
    sp = "  " * indent

    def _scalar(value: Any) -> str:
        # bool is checked explicitly: str(True) would be quoted and become a string.
        if value is None:
            return "null"
        if isinstance(value, bool):
            return "true" if value else "false"
        return yaml_escape(str(value))

    def _empty(container: Any) -> str:
        return "{}" if isinstance(container, dict) else "[]"

    if isinstance(obj, dict):
        if not obj:
            return f"{sp}{{}}"
        try:
            keys = sorted(obj.keys())
        except TypeError:
            keys = sorted(obj.keys(), key=str)
        lines = []
        for k in keys:
            v = obj[k]
            key = yaml_escape(str(k))
            if isinstance(v, (dict, list)):
                if v:
                    lines.append(f"{sp}{key}:")
                    lines.append(dump_yaml(v, indent + 1))
                else:
                    lines.append(f"{sp}{key}: {_empty(v)}")
            else:
                lines.append(f"{sp}{key}: {_scalar(v)}")
        return "\n".join(lines)

    if isinstance(obj, list):
        if not obj:
            return f"{sp}[]"
        lines = []
        for item in obj:
            if isinstance(item, (dict, list)):
                if item:
                    # Nested container: bare dash, then the child one level deeper.
                    lines.append(f"{sp}-")
                    lines.append(dump_yaml(item, indent + 1))
                else:
                    lines.append(f"{sp}- {_empty(item)}")
            else:
                lines.append(f"{sp}- {_scalar(item)}")
        return "\n".join(lines)

    return f"{sp}{_scalar(obj)}"


def write_yaml(path: Path, obj: Any, overwrite: bool = False) -> None:
    """Serialize *obj* with ``dump_yaml`` and save it to *path* as UTF-8.

    An existing file is left untouched unless *overwrite* is true. Parent
    directories are created as needed, and the outcome (written or skipped)
    is printed.
    """
    target_missing = not path.exists()
    if target_missing or overwrite:
        path.parent.mkdir(parents=True, exist_ok=True)
        rendered = dump_yaml(obj)
        path.write_text(rendered + "\n", encoding="utf-8")
        print(f"Wrote: {path}")
    else:
        print(f"SKIP (exists): {path}")


print("YAML writer defined.")

YAML writer defined.


In [9]:
"""
5.2 Define Mapping Rules

Tag weights, keyword patterns, and organization hints.
"""

# Tag to RDLS component mapping (weights: 1=weak, 5=strong)
# Outer keys are the four RDLS components; inner keys are tag strings in the
# stripped, lowercase form produced by get_tags(). A tag appears under at most
# one component here.
tag_to_rdls = {
    # Hazard: specific peril tags score highest, generic/contextual ones lower
    "hazard": {
        "flooding": 5,
        "drought": 5,
        "cyclones-hurricanes-typhoons": 5,
        "earthquake-tsunami": 5,
        "climate hazards": 4,
        "hydrology": 3,
        "natural disasters": 3,
        "hazards and risk": 3,
        "forecasting": 2,
        "topography": 2,
    },
    # Loss / impact: observed or estimated consequences of events
    "loss_impact": {
        "damage assessment": 5,
        "casualties": 5,
        "fatalities": 5,
        "mortality": 4,
        "affected population": 4,
        "affected area": 4,
        "people in need-pin": 3,
        "severity": 3,
    },
    # Exposure: assets and people that can be affected
    "exposure": {
        "facilities-infrastructure": 5,
        "populated places-settlements": 4,
        "population": 4,
        "roads": 4,
        "railways": 3,
        "ports": 3,
        "aviation": 3,
        "points of interest-poi": 3,
        "health facilities": 4,
        "education facilities-schools": 4,
        "energy": 3,
        "gazetteer": 2,
        # 'geodata' is among the most frequent tags in the scanned corpus, so it
        # is kept as weak evidence only
        "geodata": 2,
    },
    # Vulnerability proxy: socio-economic indicator themes
    "vulnerability_proxy": {
        "demographics": 4,
        "poverty": 4,
        "socioeconomics": 4,
        "disability": 3,
        "gender": 3,
        "food security": 3,
        "health": 2,
        "education": 2,
        "livelihoods": 2,
        "nutrition": 2,
    },
}

# Keyword patterns for title/notes (regex)
# Each pattern is anchored with \b word boundaries; "(s)?"-style groups accept
# plural forms. Patterns carry no weights.
# NOTE(review): patterns are lowercase - presumably the consumer matches
# case-insensitively; confirm in the classification notebook.
keyword_to_rdls = {
    "hazard": [
        r"\bflood(s|ing)?\b",
        r"\bdrought\b",
        r"\bcyclone(s)?\b",
        r"\bhurricane(s)?\b",
        r"\btyphoon(s)?\b",
        r"\bearthquake(s)?\b",
        r"\btsunami\b",
        r"\breturn period\b",
        # NOTE(review): singular only - "hazards" will not match this pattern
        r"\bhazard\b",
    ],
    "loss_impact": [
        r"\bdamage\b",
        r"\bloss(es)?\b",
        # NOTE(review): "cost" and "affected" are generic words; watch for false
        # positives when calibrating against the review sample
        r"\bcost(s)?\b",
        r"\bfatalit(y|ies)\b",
        r"\bcasualt(y|ies)\b",
        r"\baffected\b",
    ],
    "exposure": [
        r"\bairport(s)?\b",
        r"\broad(s)?\b",
        r"\bbridge(s)?\b",
        r"\bport(s)?\b",
        r"\bhospital(s)?\b",
        r"\bschool(s)?\b",
        r"\bfacilit(y|ies)\b",
        r"\binfrastructure\b",
        r"\bbuildings?\b",
        r"\bsettlement(s)?\b",
    ],
    "vulnerability_proxy": [
        r"\bpoverty\b",
        r"\bdisabilit(y|ies)\b",
        r"\bmalnutrition\b",
        r"\bfood security\b",
        r"\bhealth indicator(s)?\b",
        r"\bdemographic(s)?\b",
        r"\bvulnerability\b",
        r"\bhousehold(s)?\b",
    ],
}

# Organization hints
# Maps an organization title to {component: weight}.
# NOTE(review): keys presumably have to match the title returned by
# get_org_title() - verify the exact HDX organization titles and the matching
# rule used by the consumer.
org_hints = {
    "World Bank Group": {"vulnerability_proxy": 2},
    "The DHS Program": {"vulnerability_proxy": 4},
    "Food and Agriculture Organization": {"vulnerability_proxy": 3},
    "UNICEF": {"vulnerability_proxy": 3},
}

print("Mapping rules defined.")

Mapping rules defined.


In [10]:
"""
5.3 Write Mapping Config Files
"""

# Destination files inside the config folder
path_tag_yaml = CONFIG_DIR.joinpath("tag_to_rdls_component.yaml")
path_kw_yaml = CONFIG_DIR.joinpath("keyword_to_rdls_component.yaml")
path_org_yaml = CONFIG_DIR.joinpath("org_hints.yaml")

# Serialize each mapping to its file; existing files are replaced only when
# OVERWRITE_CONFIG is true.
for _target, _mapping in (
    (path_tag_yaml, tag_to_rdls),
    (path_kw_yaml, keyword_to_rdls),
    (path_org_yaml, org_hints),
):
    write_yaml(_target, _mapping, overwrite=OVERWRITE_CONFIG)

Wrote: /mnt/c/Users/benny/OneDrive/Documents/Github/hdx-metadata-crawler/hdx_dataset_metadata_dump/config/tag_to_rdls_component.yaml
Wrote: /mnt/c/Users/benny/OneDrive/Documents/Github/hdx-metadata-crawler/hdx_dataset_metadata_dump/config/keyword_to_rdls_component.yaml
Wrote: /mnt/c/Users/benny/OneDrive/Documents/Github/hdx-metadata-crawler/hdx_dataset_metadata_dump/config/org_hints.yaml


In [11]:
"""
5.4 Write Mapping Rules Documentation
"""

rules_md = REFERENCE_DIR / "mapping_rules.md"

# Cite the exclusion list relative to the dump folder, matching the
# `config/...` and `reference/...` paths used elsewhere in the document.
# An absolute path would bake this machine's directory layout into a shared
# reference file. Fall back to the absolute form if the list lives elsewhere.
try:
    osm_list_ref = OSM_EXCLUDED_IDS_TXT.relative_to(DUMP_DIR).as_posix()
except ValueError:
    osm_list_ref = OSM_EXCLUDED_IDS_TXT.as_posix()

# Write when the file is missing, or always when OVERWRITE_CONFIG is set.
if (not rules_md.exists()) or OVERWRITE_CONFIG:
    content = f"""# Mapping Rules (Draft)

This document describes the mapping used to translate HDX dataset metadata into RDLS components.

## Components

- **hazard**: Datasets describing hazard events/intensity/footprints or hazard model inputs
- **exposure**: Datasets describing exposed elements (population, facilities, infrastructure)
- **vulnerability_proxy**: Indicator datasets used as proxies for vulnerability/sensitivity
- **loss_impact**: Datasets describing observed/estimated impacts (damage, fatalities)

## Evidence Sources (Priority Order)

1. **HDX tags** (weighted)
2. **Keywords** in title/notes (regex patterns)
3. **Organization hints** (helps with indicator series)

## OSM Policy

OSM-derived datasets are excluded using the exclusion list in:
`{osm_list_ref}`

## Configuration Files

- `config/tag_to_rdls_component.yaml` - Tag weights
- `config/keyword_to_rdls_component.yaml` - Keyword patterns
- `config/org_hints.yaml` - Organization hints

## Next Steps

1. Review `reference/samples_for_mapping.csv` to calibrate weights
2. Expand tag mappings for common tags in corpus
3. Add organization mappings for frequent publishers

---
*Generated by HDX-RDLS Pipeline Notebook 03*
"""
    rules_md.write_text(content, encoding="utf-8")
    print(f"Wrote: {rules_md}")
else:
    print(f"SKIP (exists): {rules_md}")

Wrote: /mnt/c/Users/benny/OneDrive/Documents/Github/hdx-metadata-crawler/hdx_dataset_metadata_dump/reference/mapping_rules.md


## 6. Summary

In [12]:
"""
6.1 Display Summary
"""

_rule = "=" * 60  # horizontal rule framing the heading

print("\n" + _rule)
print("MAPPING CONFIG COMPLETE")
print(_rule)

# Headline numbers gathered during the corpus scan
print("\nCorpus statistics:")
print(f"  Non-OSM datasets: {kept:,}")
print(f"  Unique tags: {len(tag_counter):,}")
print(f"  Unique organizations: {len(org_counter):,}")

# Every file this notebook produced, in the order they were written about above
print("\nOutputs:")
for _artifact in (path_tag_yaml, path_kw_yaml, path_org_yaml, rules_md, out_sample_csv):
    print(f"  - {_artifact}")

print("\nNext: Run Notebook 04 to classify datasets using these configs.")
print(f"\nNotebook completed: {datetime.now().isoformat()}")


MAPPING CONFIG COMPLETE

Corpus statistics:
  Non-OSM datasets: 22,597
  Unique tags: 142
  Unique organizations: 357

Outputs:
  - /mnt/c/Users/benny/OneDrive/Documents/Github/hdx-metadata-crawler/hdx_dataset_metadata_dump/config/tag_to_rdls_component.yaml
  - /mnt/c/Users/benny/OneDrive/Documents/Github/hdx-metadata-crawler/hdx_dataset_metadata_dump/config/keyword_to_rdls_component.yaml
  - /mnt/c/Users/benny/OneDrive/Documents/Github/hdx-metadata-crawler/hdx_dataset_metadata_dump/config/org_hints.yaml
  - /mnt/c/Users/benny/OneDrive/Documents/Github/hdx-metadata-crawler/hdx_dataset_metadata_dump/reference/mapping_rules.md
  - /mnt/c/Users/benny/OneDrive/Documents/Github/hdx-metadata-crawler/hdx_dataset_metadata_dump/reference/samples_for_mapping.csv

Next: Run Notebook 04 to classify datasets using these configs.

Notebook completed: 2026-02-10T21:15:43.194697


## End of Code