# Notebook 10: RDLS Exposure Block Extractor

**Purpose**: Extract and populate RDLS v0.3 Exposure component blocks from HDX metadata.

**Input**:
- HDX dataset metadata JSON files
- Signal Dictionary (`config/signal_dictionary.yaml`)
- RDLS Schema (`rdls/schema/rdls_schema_v0.3.json`)

**Output**:
- Exposure block extractions with confidence scores
- Updated RDLS records with populated exposure blocks

**RDLS Exposure Block Structure**:
```json
"exposure": [
  {
    "id": "...",
    "category": "agriculture|buildings|infrastructure|population|natural_environment|economic_indicator|development_index",
    "taxonomy": "GED4ALL|MOVER|GLIDE|EMDAT|USGS_EHP|OED|HAZUS|EMS-98|PAGER|CDC-SVI|INFORM|Custom",
    "metrics": [
      {
        "id": "...",
        "dimension": "structure|content|product|disruption|population|index",
        "quantity_kind": "area|count|monetary|length|time",
        "currency": "USD (only when quantity_kind = monetary)"
      }
    ]
  }
]
```

**Schema Reference**:
- `category`: closed codelist (7 values)
- `taxonomy`: closed codelist (12 values)
- `dimension` (metric_dimension): closed codelist (6 values, all lowercase)
- `quantity_kind`: open codelist (5 suggested values, extensible)
- `currency`: ISO 4217 three-letter code, mandatory when quantity_kind = monetary

**Author**: Benny Istanto/Risk Data Librarian/GFDRR  
**Version**: 2026.2

---

## 1. Setup and Configuration

In [1]:
"""
1.1 Import Dependencies
"""

import json
import os
import re
import yaml
from pathlib import Path
from collections import defaultdict, Counter
from datetime import datetime
from typing import Dict, List, Tuple, Optional, Any, Set
from dataclasses import dataclass, field, asdict
from copy import deepcopy

import pandas as pd
import numpy as np

try:
    from tqdm.notebook import tqdm
    HAS_TQDM = True
except ImportError:
    HAS_TQDM = False

# Show every column and allow up to 120 characters per cell when DataFrames are displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 120)

# Record when this notebook run started.
print(f"Notebook started: {datetime.now().isoformat()}")

Notebook started: 2026-02-10T09:01:07.221770


In [2]:
"""
1.2 Define Paths
"""

NOTEBOOK_DIR = Path.cwd()
# When the notebook is launched from a folder named 'notebook', the project
# root is its parent; otherwise the working directory itself is used.
BASE_DIR = NOTEBOOK_DIR.parent if NOTEBOOK_DIR.name == 'notebook' else NOTEBOOK_DIR

# Inputs
DATASET_METADATA_DIR = BASE_DIR / 'hdx_dataset_metadata_dump' / 'dataset_metadata'
SIGNAL_DICT_PATH = BASE_DIR / 'hdx_dataset_metadata_dump' / 'config' / 'signal_dictionary.yaml'
RDLS_SCHEMA_PATH = BASE_DIR / 'hdx_dataset_metadata_dump' / 'rdls' / 'schema' / 'rdls_schema_v0.3.json'

# Output folder (created if missing)
OUTPUT_DIR = BASE_DIR / 'hdx_dataset_metadata_dump' / 'rdls' / 'extracted'
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# -- Output cleanup mode ------------------------------------------------
# Controls what happens to old output files when this notebook is re-run.
#   "replace" - Auto-delete old outputs and continue (default)
#   "prompt"  - Show what will be deleted, ask user to confirm
#   "skip"    - Keep old files, write new on top (may leave orphans)
#   "abort"   - Stop if old outputs exist (for CI/automated runs)
CLEANUP_MODE = "replace"

# Fail fast on missing inputs. The schema file is checked as well because the
# next cell opens it; without this check a missing schema would only surface
# there as a bare FileNotFoundError.
assert DATASET_METADATA_DIR.exists(), f"Not found: {DATASET_METADATA_DIR}"
assert SIGNAL_DICT_PATH.exists(), f"Not found: {SIGNAL_DICT_PATH}"
assert RDLS_SCHEMA_PATH.exists(), f"Not found: {RDLS_SCHEMA_PATH}"

print(f"Base: {BASE_DIR}")
print(f"Output: {OUTPUT_DIR}")
print(f"Cleanup mode: {CLEANUP_MODE}")

Base: /mnt/c/Users/benny/OneDrive/Documents/Github/hdx-metadata-crawler
Output: /mnt/c/Users/benny/OneDrive/Documents/Github/hdx-metadata-crawler/hdx_dataset_metadata_dump/rdls/extracted
Cleanup mode: replace


In [3]:
"""
1.3 Load Signal Dictionary and RDLS Schema Constants
"""

# Signal dictionary (YAML). Its 'exposure_category' section supplies the
# category regex patterns that ExposureExtractor compiles.
with open(SIGNAL_DICT_PATH, 'r', encoding='utf-8') as f:
    SIGNAL_DICT = yaml.safe_load(f)

# RDLS JSON schema; only the codelist enums under '$defs' are read below.
with open(RDLS_SCHEMA_PATH, 'r', encoding='utf-8') as f:
    RDLS_SCHEMA = json.load(f)

# --- Extract valid codelists from schema ---
# Each lookup falls back to an empty set when the '$defs' entry or its 'enum'
# list is missing; the triplet validation further down then fails its asserts.
VALID_EXPOSURE_CATEGORIES = set(
    RDLS_SCHEMA.get('$defs', {}).get('exposure_category', {}).get('enum', [])
)
VALID_METRIC_DIMENSIONS = set(
    RDLS_SCHEMA.get('$defs', {}).get('metric_dimension', {}).get('enum', [])
)
VALID_TAXONOMIES = set(
    RDLS_SCHEMA.get('$defs', {}).get('taxonomy', {}).get('enum', [])
)

# Quantity kind is an OPEN codelist - these are suggestions
QUANTITY_KIND_SUGGESTIONS = ['area', 'count', 'monetary', 'length', 'time']

# --- Valid Metric Triplets ---
# Constrains which (category, dimension, quantity_kind) combinations are allowed.
# Derived from RDLS codelist definitions and domain logic.
# First entry in each list is the DEFAULT for that category.
# Each value is a list of (dimension, quantity_kind) pairs keyed by category.
VALID_TRIPLETS = {
    # agriculture -> structure/area, product/monetary, product/count
    'agriculture': [
        ('structure', 'area'),       # hectares of cultivated land
        ('product',   'monetary'),   # value of crop yield
        ('product',   'count'),      # head of livestock
    ],
    # buildings -> structure/count, structure/monetary, structure/area, content/monetary
    'buildings': [
        ('structure', 'count'),      # number of buildings
        ('structure', 'monetary'),   # replacement cost of building stock
        ('structure', 'area'),       # total floor area
        ('content',   'monetary'),   # value of contents/inventory
    ],
    # infrastructure -> structure/length, structure/monetary, disruption/time, disruption/monetary
    'infrastructure': [
        ('structure',  'length'),    # km of road/railway
        ('structure',  'monetary'),  # construction cost of utility network
        ('disruption', 'time'),      # duration of service outage
        ('disruption', 'monetary'),  # economic loss from service interruption
    ],
    # population -> population/count  (the only valid combination)
    'population': [
        ('population', 'count'),     # number of people
    ],
    # natural_environment -> structure/area  (the only valid combination)
    'natural_environment': [
        ('structure', 'area'),       # sq km of protected forest/vegetation
    ],
    # economic_indicator -> product/monetary, index/count
    'economic_indicator': [
        ('product', 'monetary'),     # total GDP exposure
        ('index',   'count'),        # inflation rate, unemployment rate
    ],
    # development_index -> index/count  (composite scores, unitless/normalized)
    'development_index': [
        ('index', 'count'),          # HDI score, SVI score
    ],
}

# CATEGORY_DEFAULT_METRICS = first entry from each VALID_TRIPLETS list
# Maps category -> (dimension, quantity_kind) used when nothing better is detected.
CATEGORY_DEFAULT_METRICS = {
    cat: triplets[0] for cat, triplets in VALID_TRIPLETS.items()
}

# Validate all triplets use valid schema values
# Only category and dimension are checked; quantity_kind is not, since it is
# an open codelist (see QUANTITY_KIND_SUGGESTIONS above).
for cat, triplets in VALID_TRIPLETS.items():
    assert cat in VALID_EXPOSURE_CATEGORIES, f"Invalid category: {cat}"
    for dim, qty in triplets:
        assert dim in VALID_METRIC_DIMENSIONS, f"Invalid dimension: {dim} for {cat}"

# --- Currency detection patterns (ISO 4217) ---
# Set of three-letter codes accepted by ExposureExtractor._detect_currency when
# it falls back to scanning the text for bare upper-case three-letter tokens.
COMMON_CURRENCIES = {
    'USD', 'EUR', 'GBP', 'JPY', 'CHF', 'CAD', 'AUD', 'CNY', 'INR', 'BRL',
    'ZAR', 'MXN', 'SGD', 'HKD', 'NOK', 'SEK', 'DKK', 'NZD', 'THB', 'IDR',
    'PHP', 'MYR', 'KRW', 'TRY', 'RUB', 'PLN', 'CZK', 'HUF', 'CLP', 'COP',
    'PEN', 'ARS', 'EGP', 'NGN', 'KES', 'GHS', 'TZS', 'UGX', 'ETB', 'BDT',
    'PKR', 'LKR', 'MMK', 'VND', 'KHR', 'LAK', 'NPR', 'AFN', 'IQD', 'SYP',
}

# Summary of what was loaded, for a quick visual sanity check.
print(f"Schema constants loaded:")
print(f"  Valid exposure categories ({len(VALID_EXPOSURE_CATEGORIES)}): {sorted(VALID_EXPOSURE_CATEGORIES)}")
print(f"  Valid metric dimensions ({len(VALID_METRIC_DIMENSIONS)}): {sorted(VALID_METRIC_DIMENSIONS)}")
print(f"  Valid taxonomies ({len(VALID_TAXONOMIES)}): {sorted(VALID_TAXONOMIES)}")
print(f"  Category defaults: {len(CATEGORY_DEFAULT_METRICS)} mappings")
print(f"  Valid triplets: {sum(len(v) for v in VALID_TRIPLETS.values())} across {len(VALID_TRIPLETS)} categories")

Schema constants loaded:
  Valid exposure categories (7): ['agriculture', 'buildings', 'development_index', 'economic_indicator', 'infrastructure', 'natural_environment', 'population']
  Valid metric dimensions (6): ['content', 'disruption', 'index', 'population', 'product', 'structure']
  Valid taxonomies (12): ['CDC-SVI', 'Custom', 'EMDAT', 'EMS-98', 'GED4ALL', 'GLIDE', 'HAZUS', 'INFORM', 'MOVER', 'OED', 'PAGER', 'USGS_EHP']
  Category defaults: 7 mappings
  Valid triplets: 16 across 7 categories


## 2. Exposure Extraction Classes

In [4]:
"""
2.1 Data Classes for Exposure Extraction
"""

@dataclass
class ExtractionMatch:
    """
    Single pattern match with confidence.

    Produced by the extractor's category scan: one instance per exposure
    category that matched in a given tier.
    """
    # Matched exposure category name.
    value: str
    # Pattern confidence multiplied by the tier weight; may later be raised by
    # the corroboration boost (capped at 1.0) when tiers are merged.
    confidence: float
    # Name of the text field that matched, e.g. 'title', 'tags', 'resource[0]'.
    source_field: str
    # Exact substring the regex matched.
    matched_text: str
    # Source text of the regex that produced the match.
    pattern: str

@dataclass
class MetricExtraction:
    """
    Extracted metric information.

    One (dimension, quantity_kind) pair inferred for an exposure category.
    """
    # Metric dimension (lowercase, intended to match the schema's metric_dimension enum).
    dimension: str
    # Quantity kind such as 'count', 'area', 'monetary', 'length' or 'time'.
    quantity_kind: str
    # Confidence assigned by the metric inference step.
    confidence: float
    # Free-text provenance tag, e.g. 'for_<category>' or
    # 'for_<category>_currency_<code>'.
    source_hint: str = ""

@dataclass
class ExposureExtraction:
    """
    Aggregate result of one exposure extraction run over a dataset.

    Holds the detected category matches, the inferred metrics, an optional
    taxonomy hint and the overall confidence score.
    """
    categories: List[ExtractionMatch] = field(default_factory=list)
    metrics: List[MetricExtraction] = field(default_factory=list)
    taxonomy_hint: Optional[str] = None
    overall_confidence: float = 0.0

    def to_dict(self) -> Dict[str, Any]:
        """
        Return a plain-dict view of the extraction.

        Category and metric entries are converted with ``asdict``; the two
        scalar fields are copied through unchanged.
        """
        payload: Dict[str, Any] = {}
        payload['categories'] = list(map(asdict, self.categories))
        payload['metrics'] = list(map(asdict, self.metrics))
        payload['taxonomy_hint'] = self.taxonomy_hint
        payload['overall_confidence'] = self.overall_confidence
        return payload

print("Data classes defined.")

Data classes defined.


In [5]:
"""
2.2 Metric Inference Patterns

Patterns for detecting metric dimensions, quantity kinds, taxonomies, and currencies.
All dimension keys use lowercase to match RDLS schema metric_dimension enum.
"""

# --- Metric dimension inference patterns ---
# Keys MUST match schema enum: structure, content, product, disruption, population, index
# NOTE(review): most alternations are closed by \b, so only the listed word
# form matches (e.g. 'building' does not match inside 'buildings') - confirm
# that singular-only matching is intended.
METRIC_DIMENSION_PATTERNS = {
    'structure': [
        r'\b(building|structure|footprint|floor.?area)\b',
        r'\b(construction|built|asset|facility)\b',
        r'\b(road|bridge|railway|rail.?line|highway)\b',
        r'\b(airport|port|harbor|terminal|pipeline)\b',
        r'\b(power.?line|electricity.?grid|water.?supply)\b',
        r'\b(hospital|school|health.?center|clinic)\b',
    ],
    'content': [
        r'\b(content|inventory|equipment|furnishing)\b',
        r'\b(stock|goods|material|supply)\b',
        r'\b(land.?cover|land.?use|vegetation)\b',
        r'\b(ecosystem|habitat|species)\b',
    ],
    'product': [
        r'\b(crop|harvest|yield|production)\b',
        r'\b(output|commodity|livestock|cattle)\b',
        r'\b(food.?production|agricultural.?output)\b',
    ],
    'disruption': [
        r'\b(disruption|downtime|outage|interruption)\b',
        r'\b(delay|loss.?of.?function|service.?disruption)\b',
        # 'closures?' replaces the stem 'closur': followed by \b, the stem only
        # matched the literal token 'closur' and never 'closure'/'closures'.
        r'\b(business.?interruption|closures?)\b',
    ],
    'population': [
        r'\b(population[\s._-]?(?:count|density|data|distribution|grid|layer|estimate))\b',
        r'\b((?:census|demographic)[\s._-]?(?:data|survey|layer))\b',
        r'\b(household[\s._-]?(?:survey|count|data|size))\b',
        r'\b((?:displaced|refugee|idp)[\s._-]?(?:population|count|data|number))\b',
        r'\b(population[\s._-]?(?:exposure|at[\s._-]?risk|affected|vulnerable))\b',
    ],
    'index': [
        r'\b(index|indicator|score|ranking)\b',
        r'\b(hdi|svi|inform.?risk|poverty.?index)\b',
        r'\b(vulnerability.?index|resilience.?index)\b',
        r'\b(development.?index|risk.?index)\b',
        r'\b(gdp|gni|economic.?indicator)\b',
    ],
}

# --- Quantity kind inference patterns ---
# quantity_kind is an OPEN codelist, but these are the standard values
QUANTITY_KIND_PATTERNS = {
    'count': [
        r'\b(count[\s._-]?(?:of|data|per))\b',
        r'\b(number[\s._-]?of[\s._-]?(?:building|structure|house|people|person|household))\b',
        r'\b(total[\s._-]?(?:count|number|population|building|structure))\b',
    ],
    'area': [
        r'\b(area|hectare|acre|sq\.?\s*(?:m|km|ft))\b',
        r'\b(square|coverage|extent|footprint)\b',
    ],
    'length': [
        r'\b(length|distance|km|kilometer|mile)\b',
        r'\b(route|corridor|line|network)\b',
    ],
    'monetary': [
        # '$' is matched outside the \b...\b group. A word boundary next to
        # '$' exists only where a word character touches it, so '\$' inside
        # the group missed ordinary text such as ' $100'.
        r'\b(value|cost|price|worth|usd|eur)\b|\$',
        r'\b(economic|financial|monetary|budget)\b',
        r'\b(replacement|rebuild|damage.?cost)\b',
        r'\b(gdp|gni|income|expenditure)\b',
    ],
    'time': [
        r'\b(duration|time|hours|days|weeks)\b',
        r'\b(downtime|turnaround|recovery.?time)\b',
    ],
}

# --- Taxonomy detection patterns ---
# Keys MUST match schema taxonomy enum exactly
TAXONOMY_PATTERNS = {
    'GED4ALL':  [r'\b(ged4all|gem.?taxonomy)\b'],
    'MOVER':    [r'\bmover\b'],
    'GLIDE':    [r'\bglide\b'],
    'EMDAT':    [r'\b(em[\-\s]?dat|emdat)\b'],
    'USGS_EHP': [r'\b(usgs.?ehp|usgs.?earthquake.?hazard)\b'],
    'OED':      [r'\boed\b'],
    'HAZUS':    [r'\b(hazus|fema.?taxonomy)\b'],
    'EMS-98':   [r'\b(ems[\-\s]?98|european.?macroseismic)\b'],
    'PAGER':    [r'\b(pager|usgs.?pager)\b'],
    'CDC-SVI':  [r'\b(cdc[\-\s]?svi|social.?vulnerability.?index)\b'],
    'INFORM':   [r'\binform\s+(?:risk|index|severity)\b'],
    'Custom':   [],  # Fallback only, not detected by pattern
}

# --- Currency detection patterns ---
# Each entry is (regex, ISO 4217 code); entries are tried in list order.
# Currency names accept a trailing plural 's' where English uses one, and
# 'US$' is matched without a trailing \b so that 'US$ 100' or 'US$)' count.
CURRENCY_PATTERNS = [
    (r'\b(USD|US\s*Dollars?|United\s*States\s*Dollars?)\b|\bUS\s*\$', 'USD'),
    (r'\b(EUR|Euros?)\b', 'EUR'),
    (r'\b(GBP|British\s*Pounds?)\b', 'GBP'),
    (r'\b(JPY|Japanese\s*Yen)\b', 'JPY'),
    (r'\b(CHF|Swiss\s*Francs?)\b', 'CHF'),
    (r'\b(AUD|Australian\s*Dollars?)\b', 'AUD'),
    (r'\b(CAD|Canadian\s*Dollars?)\b', 'CAD'),
    (r'\b(CNY|RMB|Chinese\s*Yuan)\b', 'CNY'),
    (r'\b(INR|Indian\s*Rupees?)\b', 'INR'),
    (r'\b(BRL|Brazilian\s*Real)\b', 'BRL'),
]

# NOTE: All 7 exposure categories (including economic_indicator and
# development_index) are now in signal_dictionary.yaml.

print("Metric inference patterns defined:")
print(f"  Dimension patterns: {len(METRIC_DIMENSION_PATTERNS)} dimensions (all lowercase)")
print(f"  Quantity kind patterns: {len(QUANTITY_KIND_PATTERNS)} kinds")
print(f"  Taxonomy patterns: {len(TAXONOMY_PATTERNS)} schemes")
print(f"  Currency patterns: {len(CURRENCY_PATTERNS)} currencies")
print(f"  Note: All 7 exposure categories now in signal_dictionary.yaml")

Metric inference patterns defined:
  Dimension patterns: 6 dimensions (all lowercase)
  Quantity kind patterns: 5 kinds
  Taxonomy patterns: 12 schemes
  Currency patterns: 10 currencies
  Note: All 7 exposure categories now in signal_dictionary.yaml


In [6]:
"""
2.3 Exposure Extractor Class

Extracts RDLS Exposure block components from HDX metadata.

Architecture: 3-tier cascade
  Tier 1 (title, name, tags)  - full confidence, always authoritative
  Tier 2 (individual resources) - 0.85 weight, can add new categories
  Tier 3 (notes, methodology)  - 0.6 weight, corroboration ONLY
                                  (never adds new categories except as
                                   fallback when Tiers 1+2 find nothing)

Key constraints:
- All 7 exposure category patterns loaded from signal_dictionary.yaml
- Metric inference constrained to VALID_TRIPLETS (category->dimension->quantity_kind)
- Scoped metric inference per category source field
"""

from dataclasses import dataclass as _dc, field as _fld

@_dc
class TieredFields:
    """
    Structured text fields preserving tier hierarchy.

    Keeps the text of one HDX record split by field so the extractor can scan
    each confidence tier separately instead of one concatenated blob.
    """
    # Tier 1 fields: dataset title, dataset name, and tag names joined by spaces.
    title: str = ""
    name: str = ""
    tags: str = ""
    # Tier 2 field: one dict per resource.
    resources: list = _fld(default_factory=list)   # List[Dict] with 'name','description','format'
    # Tier 3 fields: free-text notes and methodology.
    notes: str = ""
    methodology: str = ""


class ExposureExtractor:
    """
    Extracts RDLS Exposure block components from HDX metadata
    using a 3-tier confidence cascade.
    """

    CONFIDENCE_MAP = {'high': 0.9, 'medium': 0.7, 'low': 0.5}

    # Tier-level confidence multipliers
    TIER1_CONFIDENCE = 1.0     # title, name, tags
    TIER2_CONFIDENCE = 0.85    # individual resource name+description
    TIER3_CONFIDENCE = 0.6     # notes, methodology
    CORROBORATION_BOOST = 0.05

    def __init__(self, signal_dict: Dict[str, Any]):
        """
        Store the signal dictionary and pre-compile every regex pattern.

        Parameters
        ----------
        signal_dict : Dict[str, Any]
            Parsed signal dictionary; its 'exposure_category' section is read
            when the patterns are compiled.
        """
        self.signal_dict = signal_dict
        self._compile_patterns()

    # ------------------------------------------------------------------
    # Pattern compilation
    # ------------------------------------------------------------------
    def _compile_patterns(self) -> None:
        """
        Populate the compiled-regex lookup tables used during extraction.

        Fills ``category_patterns`` (from the signal dictionary, restricted to
        schema-valid categories), ``dimension_patterns``, ``quantity_patterns``,
        ``taxonomy_patterns`` (schemes with an empty pattern list are left out)
        and ``currency_compiled``. Everything is compiled case-insensitively.
        """
        def compile_all(raw_patterns):
            return [re.compile(raw, re.IGNORECASE) for raw in raw_patterns]

        self.category_patterns = {}
        self.dimension_patterns = {}
        self.quantity_patterns = {}
        self.taxonomy_patterns = {}
        self.currency_compiled = []

        # Exposure categories: patterns and confidence level come from the
        # signal dictionary; entries outside the schema codelist are ignored.
        category_section = self.signal_dict.get('exposure_category', {})
        for name, settings in category_section.items():
            if name not in VALID_EXPOSURE_CATEGORIES:
                continue
            raw_patterns = settings.get('patterns', [])
            level = settings.get('confidence', 'medium')
            score = self.CONFIDENCE_MAP.get(level, 0.7)
            self.category_patterns[name] = {
                'compiled': compile_all(raw_patterns),
                'confidence': score,
            }

        # Metric dimensions: keys are checked against the schema enum first.
        for dim_name, raw_patterns in METRIC_DIMENSION_PATTERNS.items():
            assert dim_name in VALID_METRIC_DIMENSIONS, f"Bad dim key: {dim_name}"
            self.dimension_patterns[dim_name] = compile_all(raw_patterns)

        # Quantity kinds.
        self.quantity_patterns.update(
            (kind, compile_all(raw_patterns))
            for kind, raw_patterns in QUANTITY_KIND_PATTERNS.items()
        )

        # Taxonomies: a scheme with no patterns cannot be detected, so skip it.
        self.taxonomy_patterns.update(
            (scheme, compile_all(raw_patterns))
            for scheme, raw_patterns in TAXONOMY_PATTERNS.items()
            if raw_patterns
        )

        # Currencies: keep each (compiled regex, ISO code) pair in list order.
        self.currency_compiled.extend(
            (re.compile(raw, re.IGNORECASE), code)
            for raw, code in CURRENCY_PATTERNS
        )

    # ------------------------------------------------------------------
    # Text extraction
    # ------------------------------------------------------------------
    def _extract_tiered_fields(self, hdx_record: Dict[str, Any]) -> TieredFields:
        """
        Extract text fields from an HDX record, preserving tier structure.

        Every field is normalised so that a key which is present but null
        behaves exactly like a missing key: text fields become '' and the
        tag/resource collections become empty. Without this, a null 'tags'
        or 'resources' value raised TypeError, and a null resource name or
        description was later interpolated as the literal text 'None'.

        Parameters
        ----------
        hdx_record : Dict[str, Any]
            HDX dataset metadata record.

        Returns
        -------
        TieredFields
            Title, name and space-joined tag names (tier 1), per-resource
            dicts with 'name', 'description' and 'format' (tier 2), and
            notes plus methodology (tier 3).
        """
        # Tags may be dicts carrying a 'name' key or plain strings; any other
        # entry type is ignored.
        tag_names = []
        for t in hdx_record.get('tags') or []:
            if isinstance(t, dict):
                tag_names.append(t.get('name') or '')
            elif isinstance(t, str):
                tag_names.append(t)

        # Non-dict resource entries are skipped, mirroring the tag handling.
        resources_list = [
            {
                'name': r.get('name') or '',
                'description': r.get('description') or '',
                'format': r.get('format') or '',
            }
            for r in hdx_record.get('resources') or []
            if isinstance(r, dict)
        ]

        return TieredFields(
            title=hdx_record.get('title') or '',
            name=hdx_record.get('name') or '',
            tags=' '.join(tag_names),
            resources=resources_list,
            notes=hdx_record.get('notes') or '',
            methodology=hdx_record.get('methodology_other') or '',
        )

    # ------------------------------------------------------------------
    # Tier scanning helpers
    # ------------------------------------------------------------------
    def _scan_fields(
        self,
        field_texts: Dict[str, str],
        tier_weight: float,
    ) -> Dict[str, ExtractionMatch]:
        """
        Run every category pattern over a set of named text fields.

        For each pattern the fields are tried in the order given and the
        search stops at the first field that matches. A category's entry is
        replaced only by a match with strictly higher weighted confidence
        (pattern confidence times ``tier_weight``).

        Returns a mapping of category name to its best ExtractionMatch;
        categories without any match are absent.
        """
        found: Dict[str, ExtractionMatch] = {}
        for cat_name, spec in self.category_patterns.items():
            for regex in spec['compiled']:
                for label, body in field_texts.items():
                    if not body:
                        # Empty or missing field text: nothing to search.
                        continue
                    hit = regex.search(body)
                    if hit is None:
                        continue
                    score = spec['confidence'] * tier_weight
                    current = found.get(cat_name)
                    if current is None or score > current.confidence:
                        found[cat_name] = ExtractionMatch(
                            value=cat_name,
                            confidence=score,
                            source_field=label,
                            matched_text=hit.group(0),
                            pattern=regex.pattern,
                        )
                    # One matching field per pattern is enough.
                    break
        return found

    def _scan_tier1(self, fields: TieredFields) -> Dict[str, ExtractionMatch]:
        """Tier 1: title, name, tags â€” full confidence."""
        return self._scan_fields(
            {'title': fields.title, 'name': fields.name, 'tags': fields.tags},
            self.TIER1_CONFIDENCE,
        )

    def _scan_tier2(self, fields: TieredFields) -> Dict[str, ExtractionMatch]:
        """Tier 2: individual resources â€” 0.85 weight, can add new categories."""
        merged: Dict[str, ExtractionMatch] = {}
        for i, res in enumerate(fields.resources):
            combined = f"{res.get('name', '')} {res.get('description', '')}"
            if not combined.strip():
                continue
            hits = self._scan_fields(
                {f'resource[{i}]': combined},
                self.TIER2_CONFIDENCE,
            )
            for cat, match in hits.items():
                if cat not in merged or match.confidence > merged[cat].confidence:
                    merged[cat] = match
        return merged

    def _scan_tier3(self, fields: TieredFields) -> Dict[str, ExtractionMatch]:
        """Tier 3: notes, methodology â€” corroboration only."""
        return self._scan_fields(
            {'notes': fields.notes, 'methodology': fields.methodology},
            self.TIER3_CONFIDENCE,
        )

    # ------------------------------------------------------------------
    # Tier merging
    # ------------------------------------------------------------------
    def _merge_tiers(
        self,
        tier1: Dict[str, ExtractionMatch],
        tier2: Dict[str, ExtractionMatch],
        tier3: Dict[str, ExtractionMatch],
    ) -> List[ExtractionMatch]:
        """
        Merge tiers into final category list.

        Rules:
          1. Tier 1 always included.
          2. Tier 2 adds new categories OR boosts existing.
          3. Tier 3 ONLY boosts (never adds), except as
             fallback when Tiers 1+2 found nothing.
        """
        final: Dict[str, ExtractionMatch] = {}

        # Tier 1 â€” always accepted
        for cat, match in tier1.items():
            final[cat] = match

        # Tier 2 â€” add or boost
        for cat, match in tier2.items():
            if cat in final:
                # Corroborate: keep higher confidence + boost
                best = final[cat]
                final[cat] = ExtractionMatch(
                    value=cat,
                    confidence=min(best.confidence + self.CORROBORATION_BOOST, 1.0),
                    source_field=best.source_field,
                    matched_text=best.matched_text,
                    pattern=best.pattern,
                )
            else:
                # New category from resource â€” allow it
                final[cat] = match

        # Tier 3 â€” corroborate only (or fallback)
        if final:
            # Have categories already â€” Tier 3 can only boost
            for cat, match in tier3.items():
                if cat in final:
                    best = final[cat]
                    final[cat] = ExtractionMatch(
                        value=cat,
                        confidence=min(best.confidence + self.CORROBORATION_BOOST, 1.0),
                        source_field=best.source_field,
                        matched_text=best.matched_text,
                        pattern=best.pattern,
                    )
                # else: discard â€” notes cannot introduce new categories
        else:
            # Fallback: Tiers 1+2 found nothing, allow Tier 3
            for cat, match in tier3.items():
                final[cat] = match

        return list(final.values())

    # ------------------------------------------------------------------
    # Metric inference (scoped by tier source)
    # ------------------------------------------------------------------
    def _detect_dimensions_for_category(
        self,
        all_text: str,
        category: str,
    ) -> List[Tuple[str, float]]:
        """
        Pick a single metric dimension for a category from the given text.

        Returns a one-element list of (dimension, confidence):
          - the category's default dimension at 0.85 if the text matches it,
          - otherwise the first matching dimension at 0.8,
          - otherwise the category's default dimension at 0.5.
        """
        hits = [
            dim_name
            for dim_name, regexes in self.dimension_patterns.items()
            if any(rx.search(all_text) for rx in regexes)
        ]
        fallback_dim = CATEGORY_DEFAULT_METRICS.get(category, ('content', 'count'))[0]

        if not hits:
            return [(fallback_dim, 0.5)]
        if fallback_dim in hits:
            return [(fallback_dim, 0.85)]
        return [(hits[0], 0.8)]

    def _detect_quantity_kind(
        self,
        all_text: str,
        category: str,
    ) -> Tuple[str, float]:
        """Detect quantity kind from text with category fallback."""
        for kind, patterns in self.quantity_patterns.items():
            for pattern in patterns:
                if pattern.search(all_text):
                    return (kind, 0.8)
        default_qty = CATEGORY_DEFAULT_METRICS.get(category, ('content', 'count'))[1]
        return (default_qty, 0.5)

    def _detect_currency(self, all_text: str) -> str:
        """Detect ISO 4217 currency code."""
        for pattern, code in self.currency_compiled:
            if pattern.search(all_text):
                return code
        for word in re.findall(r'\b([A-Z]{3})\b', all_text):
            if word in COMMON_CURRENCIES:
                return word
        return ""

    def _build_scoped_text(
        self,
        fields: TieredFields,
        source_field: str,
    ) -> str:
        """Build scoped text for metric inference from source field + title/tags."""
        scoped_parts: List[str] = []
        if source_field.startswith('resource['):
            try:
                idx = int(source_field.split('[')[1].rstrip(']'))
                res = fields.resources[idx]
                scoped_parts.append(res.get('name', ''))
                scoped_parts.append(res.get('description', ''))
            except (IndexError, ValueError):
                pass
        elif source_field == 'title':
            scoped_parts.append(fields.title)
        elif source_field == 'name':
            scoped_parts.append(fields.name)
        elif source_field == 'tags':
            scoped_parts.append(fields.tags)
        elif source_field == 'notes':
            scoped_parts.append(fields.notes)
        elif source_field == 'methodology':
            scoped_parts.append(fields.methodology)
        # Always include title + tags for context
        if source_field != 'title':
            scoped_parts.append(fields.title)
        if source_field != 'tags':
            scoped_parts.append(fields.tags)
        return ' '.join(p for p in scoped_parts if p)

    def _infer_metrics(
        self,
        fields: TieredFields,
        categories: List[ExtractionMatch],
    ) -> Dict[str, List[MetricExtraction]]:
        """
        Infer metrics PER CATEGORY, constrained to VALID_TRIPLETS.

        Logic:
        1. Build scoped text from the source field where category was matched
        2. Detect dimension and quantity_kind from text patterns
        3. Validate (category, dimension, quantity_kind) against VALID_TRIPLETS
        4. If valid -> use it; if dimension valid but qty wrong -> fix qty;
           otherwise -> fall back to category default
        5. Currency detection is global (dataset-level signal)

        Parameters
        ----------
        fields : TieredFields
            Tiered text fields of the dataset.
        categories : List[ExtractionMatch]
            Final category matches; each match's ``source_field`` selects the
            text used for that category's inference.

        Returns
        -------
        Dict[str, List[MetricExtraction]]
            Metrics keyed by category name (at least one metric per category).
        """
        # Global text for currency detection only
        global_parts = [fields.title, fields.name, fields.tags,
                        fields.notes, fields.methodology]
        for r in fields.resources:
            global_parts.append(r.get('name', ''))
            global_parts.append(r.get('description', ''))
        global_text = ' '.join(p for p in global_parts if p)
        currency = self._detect_currency(global_text)

        metrics_by_category: Dict[str, List[MetricExtraction]] = {}

        for cat_match in categories:
            category = cat_match.value
            allowed = VALID_TRIPLETS.get(category, [])
            default_dim, default_qty = CATEGORY_DEFAULT_METRICS.get(
                category, ('content', 'count')
            )

            scoped_text = self._build_scoped_text(fields, cat_match.source_field)

            # Detect dimension and quantity from text patterns
            dims = self._detect_dimensions_for_category(scoped_text, category)
            detected_qty, detected_qty_conf = self._detect_quantity_kind(
                scoped_text, category
            )

            category_metrics = []
            for dim, dim_conf in dims:
                # Start every dimension from the quantity detected in the
                # text, so a correction made for one dimension cannot leak
                # into the next one.
                qty_kind, qty_conf = detected_qty, detected_qty_conf

                # --- Validate against VALID_TRIPLETS ---
                if (dim, qty_kind) not in allowed:
                    # First quantity kind allowed for this dimension, if any.
                    allowed_qty = next((q for d, q in allowed if d == dim), None)
                    if allowed_qty is not None:
                        # Dimension valid but qty wrong - use the allowed qty
                        qty_kind = allowed_qty
                        qty_conf = 0.5
                    else:
                        # Dimension not valid for this category - use default
                        dim = default_dim
                        qty_kind = default_qty
                        dim_conf = 0.5
                        qty_conf = 0.5

                # The detected currency is recorded only on monetary metrics.
                if qty_kind == 'monetary' and currency:
                    hint = f'for_{category}_currency_{currency}'
                else:
                    hint = f'for_{category}'

                category_metrics.append(MetricExtraction(
                    dimension=dim,
                    quantity_kind=qty_kind,
                    confidence=min(dim_conf, qty_conf),
                    source_hint=hint,
                ))

            # Defensive fallback in case no dimension was produced at all.
            if not category_metrics:
                category_metrics.append(MetricExtraction(
                    dimension=default_dim,
                    quantity_kind=default_qty,
                    confidence=0.4,
                    source_hint=f'default_for_{category}',
                ))

            metrics_by_category[category] = category_metrics

        return metrics_by_category

    # ------------------------------------------------------------------
    # Taxonomy detection
    # ------------------------------------------------------------------
    def _detect_taxonomy(self, fields: TieredFields) -> Optional[str]:
        """Detect taxonomy scheme from all text."""
        all_parts = [fields.title, fields.name, fields.tags,
                     fields.notes, fields.methodology]
        for r in fields.resources:
            all_parts.append(r.get('name', ''))
            all_parts.append(r.get('description', ''))
        all_text = ' '.join(p for p in all_parts if p)

        for taxonomy, patterns in self.taxonomy_patterns.items():
            for pattern in patterns:
                if pattern.search(all_text):
                    if taxonomy in VALID_TAXONOMIES:
                        return taxonomy
        return None

    # ------------------------------------------------------------------
    # Main entry point
    # ------------------------------------------------------------------
    def extract(self, hdx_record: Dict[str, Any]) -> ExposureExtraction:
        """
        Run the full 3-tier exposure extraction on one HDX record.

        Detects categories per tier and merges them, infers metrics per
        category, detects a taxonomy hint and a dataset-level currency, and
        averages all category and metric confidences into the overall score
        (0.0 when there is nothing to average).

        Besides its declared fields, the returned object carries three extra
        attributes: ``_metrics_by_category``, ``_currency`` and ``_tier_info``.
        """
        fields = self._extract_tiered_fields(hdx_record)

        # Category detection, tier by tier, then merged.
        tier1_hits = self._scan_tier1(fields)
        tier2_hits = self._scan_tier2(fields)
        tier3_hits = self._scan_tier3(fields)
        categories = self._merge_tiers(tier1_hits, tier2_hits, tier3_hits)

        # Metrics per category, plus a flat list across all categories.
        per_category = self._infer_metrics(fields, categories)
        flat_metrics = [
            metric for group in per_category.values() for metric in group
        ]

        taxonomy = self._detect_taxonomy(fields)

        # Currency is a dataset-level signal, so every text field is searched.
        corpus_parts = [fields.title, fields.name, fields.tags,
                        fields.notes, fields.methodology]
        for resource in fields.resources:
            corpus_parts += [resource.get('name', ''), resource.get('description', '')]
        currency = self._detect_currency(' '.join(filter(None, corpus_parts)))

        # Overall confidence: mean over category and metric confidences.
        scores = [match.confidence for match in categories]
        scores += [metric.confidence for metric in flat_metrics]
        overall = np.mean(scores) if scores else 0.0

        result = ExposureExtraction(
            categories=categories,
            metrics=flat_metrics,
            taxonomy_hint=taxonomy,
            overall_confidence=overall,
        )
        # Extra context attached as plain attributes (not dataclass fields).
        result._metrics_by_category = per_category
        result._currency = currency
        result._tier_info = {
            'tier1_cats': list(tier1_hits.keys()),
            'tier2_cats': list(tier2_hits.keys()),
            'tier3_cats': list(tier3_hits.keys()),
        }
        return result


# Initialize
# Building the extractor compiles every regex once; the prints below summarize
# the resulting configuration.
exposure_extractor = ExposureExtractor(SIGNAL_DICT)
print(f"ExposureExtractor initialized (3-tier cascade).")
print(f"  - Categories: {len(exposure_extractor.category_patterns)} "
      f"({sorted(exposure_extractor.category_patterns.keys())})")
print(f"  - Tier weights: T1={ExposureExtractor.TIER1_CONFIDENCE}, "
      f"T2={ExposureExtractor.TIER2_CONFIDENCE}, "
      f"T3={ExposureExtractor.TIER3_CONFIDENCE}")
print(f"  - Corroboration boost: {ExposureExtractor.CORROBORATION_BOOST}")
print(f"  - Dimension patterns: {len(exposure_extractor.dimension_patterns)} dimensions")
# Schemes with an empty pattern list (the 'Custom' fallback) are not compiled,
# so this count can be lower than the number of taxonomy codes.
print(f"  - Taxonomy patterns: {len(exposure_extractor.taxonomy_patterns)} schemes")


ExposureExtractor initialized (3-tier cascade).
  - Categories: 7 (['agriculture', 'buildings', 'development_index', 'economic_indicator', 'infrastructure', 'natural_environment', 'population'])
  - Tier weights: T1=1.0, T2=0.85, T3=0.6
  - Corroboration boost: 0.05
  - Dimension patterns: 6 dimensions
  - Taxonomy patterns: 11 schemes


## 3. RDLS Exposure Block Builder

In [7]:
"""
3.1 Build RDLS Exposure Block

Generates schema-compliant RDLS v0.3 exposure array.
Key compliance rules:
- dimension values MUST be lowercase (schema metric_dimension enum)
- currency field included when quantity_kind = monetary
- Each exposure item gets category-specific metrics (not duplicated)
- All values validated against schema codelists
"""

def build_exposure_block(
    extraction: ExposureExtraction,
    dataset_id: str
) -> Optional[List[Dict[str, Any]]]:
    """
    Assemble the RDLS ``exposure`` array for a single dataset.

    One exposure item is produced per detected category that is present in
    VALID_EXPOSURE_CATEGORIES. Each item carries the metrics recorded for that
    category, or a single default metric taken from CATEGORY_DEFAULT_METRICS
    when none were recorded.

    Parameters
    ----------
    extraction : ExposureExtraction
        Extraction result. The optional private attributes
        ``_metrics_by_category`` and ``_currency`` are read when present.
    dataset_id : str
        Dataset identifier; its first 8 characters become the id prefix
        ('unknown' is used when the identifier is empty or None).

    Returns
    -------
    Optional[List[Dict[str, Any]]]
        The exposure items, or None when no category was detected or none of
        the detected categories is schema-valid.

    Notes
    -----
    Item and metric ids use the 1-based position of the category in
    ``extraction.categories``, so ids may have gaps when an invalid category
    is dropped. For monetary metrics the ``currency`` key is always written;
    when no currency was detected its value is the empty string.
    """
    if not extraction.categories:
        return None

    # Per-category metrics and the dataset-level currency are attached by the extractor.
    per_category = getattr(extraction, '_metrics_by_category', {})
    currency = getattr(extraction, '_currency', '')

    id_prefix = dataset_id[:8] if dataset_id else 'unknown'
    exposure_items = []

    for item_no, cat in enumerate(extraction.categories, start=1):
        category = cat.value

        # Categories outside the schema codelist are dropped silently.
        if category not in VALID_EXPOSURE_CATEGORIES:
            continue

        exposure_item = {
            'id': f"exposure_{id_prefix}_{item_no}",
            'category': category,
        }

        # The taxonomy hint is dataset-wide, so every item receives the same value.
        if extraction.taxonomy_hint and extraction.taxonomy_hint in VALID_TAXONOMIES:
            exposure_item['taxonomy'] = extraction.taxonomy_hint

        detected = per_category.get(category, [])

        if not detected:
            # Nothing recorded for this category: emit its default metric.
            default_dim, default_qty = CATEGORY_DEFAULT_METRICS.get(
                category, ('content', 'count')
            )
            fallback = {
                'id': f"metric_{id_prefix}_{item_no}_1",
                'dimension': default_dim,
                'quantity_kind': default_qty,
            }
            if default_qty == 'monetary':
                fallback['currency'] = currency
            exposure_item['metrics'] = [fallback]
        else:
            built = []
            for metric_no, m in enumerate(detected, start=1):
                # The schema enum is lowercase; unknown dimensions fall back
                # to the category's default dimension.
                dim = m.dimension.lower()
                if dim not in VALID_METRIC_DIMENSIONS:
                    dim = CATEGORY_DEFAULT_METRICS.get(category, ('content', 'count'))[0]

                entry = {
                    'id': f"metric_{id_prefix}_{item_no}_{metric_no}",
                    'dimension': dim,
                    'quantity_kind': m.quantity_kind,
                }

                if m.quantity_kind == 'monetary':
                    # A currency embedded in the metric's source_hint (after the
                    # last '_currency_' marker) wins over the dataset-level one.
                    own_currency = ''
                    if hasattr(m, 'source_hint') and '_currency_' in m.source_hint:
                        own_currency = m.source_hint.rpartition('_currency_')[2]
                    entry['currency'] = own_currency or currency

                built.append(entry)
            exposure_item['metrics'] = built

        exposure_items.append(exposure_item)

    return exposure_items or None

for _line in (
    "Exposure block builder defined with schema compliance:",
    "  - dimension values: lowercase only (Issue 1)",
    "  - no 'Other' dimension (Issue 2)",
    "  - per-category metrics (Issue 4)",
    "  - currency field when quantity_kind = monetary (Issue 6)",
    "  - all values validated against schema codelists",
):
    print(_line)

Exposure block builder defined with schema compliance:
  - dimension values: lowercase only (Issue 1)
  - no 'Other' dimension (Issue 2)
  - per-category metrics (Issue 4)
  - currency field when quantity_kind = monetary (Issue 6)
  - all values validated against schema codelists


## 4. Test Extraction

In [8]:
"""
4.1 Load Comprehensive Sample Records for Testing

Curated samples across all 7 RDLS exposure categories to test:
- Issue 1: lowercase dimension values
- Issue 2: No "Other" dimension
- Issue 3: All 7 categories detected
- Issue 4: Per-category metrics
- Issue 5: Smart category-specific defaults
- Issue 6: Currency detection for monetary metrics
- Issue 7: Expanded taxonomy detection
"""

EXPOSURE_TEST_SAMPLES = {
    'buildings': [
        ('00a27905-efd2-4ce5-b5d2-6c862ad7d852__ai-building-footprint-in-izabal-guatemala.json',
         'AI building footprints - structure/area metric expected'),
        ('04c4bf06-511e-4009-9c03-4ec97334d944__hurricane-melissa-building-damage-assessment-in-jamaica.json',
         'Building damage assessment - post-disaster, monetary possible'),
        ('04c95b2c-7472-4516-80f7-b6ed950a5255__sri-lanka-floods-building-damage-assessment-in-colombo.json',
         'Flood building damage - cross-category test (buildings + population)'),
        ('01bb1f79-8e9f-4bb7-bc33-8693a7a3dd7c__hotosm-bgd-buildings.json',
         'OSM building export - Bangladesh, structure/count'),
        ('0453224c-92d5-4eed-a93c-5a465ef85883__hotosm-esp-buildings.json',
         'OSM building export - Spain, structure/count'),
    ],
    'infrastructure': [
        ('acf1231c-4248-4834-8e1f-e6d6939b0c6e__urban-transport-world.json',
         'Urban transport accessibility - global SDG indicator'),
        ('49c342a1-274f-4416-931a-c026841b128c__schools-in-syria-2019.json',
         'Schools/facilities - infrastructure as health/education assets'),
        ('7a565edc-67e7-4e12-ad9c-8c0756a558ed__world-bank-infrastructure-indicators-for-czech-republic.json',
         'World Bank infrastructure indicators'),
        ('fe9c7f0b-4024-4a2c-a98d-78ceee623497__hotosm-brb-roads.json',
         'OSM roads - Barbados, structure/length metric expected'),
        ('0bcaa7d8-84fd-4d7b-a831-25aaa6480167__world-bank-infrastructure-indicators-for-korea-dem-peoples-rep.json',
         'World Bank infrastructure indicators - DPRK'),
        ('dece59ba-c7c9-4c59-8481-abfcf059dddd__uganda-energy-gas-facilities.json',
         'Energy/Gas facilities - tiered cascade regression test'),
    ],
    'population': [
        ('6ffa577e-1f50-44a2-8f49-6b302e1da1a0__nepal-population-census-2011-cod.json',
         'Nepal census - population/count, demographic data'),
        ('60967843-7a89-4c4a-b76a-543bc0a4e2bc__saint-vincent-and-the-grenadines-gridded-population-dataset.json',
         'Gridded population - spatial, area-based'),
        ('1d7193a5-52ed-4e89-9d14-fb62f2ca8e9c__1999-2013-tally-of-internaly-displaced-persons-resulting-from-natural-disasters.json',
         'Displaced persons tally - population/count'),
        ('5ddc6df3-7d8d-438a-9867-70dbc04f91b4__spatialized-100m-school-age-population-grid-for-togo-by-educational-level-and-se.json',
         'School-age population grid - 100m resolution'),
        ('0a3aff95-e37e-48f4-844b-a105cd2325e8__dominica-gridded-population-dataset.json',
         'Gridded population - Caribbean island'),
    ],
    'agriculture': [
        ('80fafa94-f78f-4cfd-ad92-7cb4c7c2d7bc__horn-of-africa-food-security.json',
         'Food security data - FEWSNET, multi-country'),
        ('76654673-8c03-4431-a947-fd0fb853a70b__ethiopia-food-security.json',
         'Ethiopia food security - product/area expected'),
        ('1b9da567-1e83-40b0-9953-6f57c396a23c__guinea-food-security.json',
         'Guinea food security'),
        ('8aca8a6f-10a7-48e3-ae2c-807117303649__agricultural-and-food-production-index-indice-de-la-production-agricole-et-alime.json',
         'Agricultural production index - product/count'),
        ('7322e8fe-b7b4-4b3c-b445-a069ccb7f95e__african-development-bank-food-security-prices-monthly-december-2011.json',
         'Food prices - may detect monetary/currency'),
    ],
    'natural_environment': [
        ('00710e3e-c8ef-46c2-9cb0-d87070600bd7__geodata-of-landcover-classification-kalobeyei-turkana-county-kenya-july-01-2015.json',
         'Land cover classification - content/area expected'),
        ('342ad68f-501b-4f34-948e-a2d195018f95__world-bank-environment-indicators-for-czech-republic.json',
         'Environment indicators - may overlap with index'),
        ('31618b9d-0b3d-4ffd-9689-1b91ee80287b__world-bank-environment-indicators-for-turkey.json',
         'Environment indicators - Turkey'),
    ],
    'economic_indicator': [
        ('a3b57464-fa02-49c3-a8a0-3f26e12a7ebf__multi-hazard-average-annual-loss.json',
         'Multi-hazard AAL - monetary/USD expected (Issue 6 test)'),
        ('23efa9ad-5b20-43e1-bf2b-0c90226ff956__impact-data-casualties-and-damage-typhoon-haiyan-yolanda.json',
         'Typhoon damage costs - monetary expected'),
        ('290602c1-cd52-482f-b938-1af750e57bfe__world-bank-trade-indicators-for-turkey.json',
         'Trade indicators - economic, monetary/USD'),
        ('37894f3f-62ee-408f-9341-0572f2cea977__world-bank-financial-sector-indicators-for-czech-republic.json',
         'Financial sector indicators'),
        ('4fdcd4dc-5c2f-43af-a1e4-93c9b6539a27__wfp-food-prices.json',
         'WFP food prices - currency detection test'),
    ],
    'development_index': [
        ('0166def4-bac0-4d33-a929-f5596d18e178__indonesia-population-living-below-the-poverty-line-2007-2018.json',
         'Poverty line data - development_index/count'),
        ('0e5412f2-7c17-4ce9-8928-458865c5f7e9__education-index.json',
         'Education index - index/count expected'),
        ('9403e827-55cb-40d9-9d2b-192cf66f5726__indonesia-gender-empowerment-index-2010-2017.json',
         'Gender empowerment index'),
        ('2ab7d812-be62-48cb-bd94-407fd522e21e__kenya-health-spending-per-capita-per-county.json',
         'Health spending per capita - development/monetary'),
        ('03519575-043a-4414-92c4-203afe52b004__indicadores-del-sidih.json',
         'SIDIH indicators - development indices'),
    ],
}

# --- Load all samples ---
# A record id listed under several expected categories is kept under the
# first one encountered and counted in `skipped`; missing files only warn.
sample_records = []
sample_metadata = []
loaded_ids = set()
skipped = 0

for expected_cat, samples in EXPOSURE_TEST_SAMPLES.items():
    for filename, note in samples:
        filepath = DATASET_METADATA_DIR / filename
        if not filepath.exists():
            print(f"  WARNING: Not found: {filename[:60]}...")
            continue

        with open(filepath, 'r', encoding='utf-8') as f:
            record = json.load(f)

        rid = record.get('id', '')
        if rid in loaded_ids:
            skipped += 1
            continue

        loaded_ids.add(rid)
        sample_records.append(record)
        # sample_metadata stays index-aligned with sample_records.
        sample_metadata.append(dict(
            expected_category=expected_cat,
            note=note,
            filename=filename,
        ))

print(f"Loaded {len(sample_records)} unique sample records across {len(EXPOSURE_TEST_SAMPLES)} categories.")
if skipped:
    print(f"  ({skipped} duplicates across categories)")
print("\nSamples per expected category:")
for cat, samples in EXPOSURE_TEST_SAMPLES.items():
    print(f"  {cat}: {len(samples)} samples")

Loaded 34 unique sample records across 7 categories.

Samples per expected category:
  buildings: 5 samples
  infrastructure: 6 samples
  population: 5 samples
  agriculture: 5 samples
  natural_environment: 3 samples
  economic_indicator: 5 samples
  development_index: 5 samples


In [9]:
"""
4.2 Run Extraction and Validate All 7 Issues
"""

print("=" * 90)
print("COMPREHENSIVE EXPOSURE EXTRACTION TEST RESULTS")
print(f"Testing {len(sample_records)} samples across {len(EXPOSURE_TEST_SAMPLES)} categories")
print("=" * 90)

# One entry per sample: the raw record, its extraction, and the expectation
# it is checked against. Consumed by the structural verification cell.
extraction_results = []

for record, meta in zip(sample_records, sample_metadata):
    extraction = exposure_extractor.extract(record)

    # A missing or null title must not break the slicing below.
    title = record.get('title') or ''

    result = {
        'id': record.get('id'),
        'title': title[:70],
        'record': record,
        'extraction': extraction,
        'expected_category': meta['expected_category'],
        'note': meta['note'],
    }
    extraction_results.append(result)

    # Check if expected category was detected
    detected_cats = [c.value for c in extraction.categories]
    expected_found = meta['expected_category'] in detected_cats
    status = "MATCH" if expected_found else "MISS"

    # Separator is the box-drawing character U+2500 (was mis-encoded as three characters).
    print(f"\n{'─' * 90}")
    print(f"[{status}] [{meta['expected_category']}] {title[:75]}")
    print(f"  Note: {meta['note']}")

    if extraction.categories:
        cats_str = ', '.join(f"{c.value}({c.confidence:.1f})" for c in extraction.categories)
        print(f"  Categories: {cats_str}")
    else:
        print("  Categories: None detected")

    # Show per-category metrics (private attribute attached by the extractor)
    metrics_by_cat = getattr(extraction, '_metrics_by_category', {})
    if metrics_by_cat:
        for cat, metrics in metrics_by_cat.items():
            for m in metrics:
                print(f"  Metric [{cat}]: dim={m.dimension}, qty={m.quantity_kind} "
                      f"(conf={m.confidence:.1f}, {m.source_hint})")

    currency = getattr(extraction, '_currency', '')
    if currency:
        print(f"  Currency: {currency}")
    if extraction.taxonomy_hint:
        print(f"  Taxonomy: {extraction.taxonomy_hint}")

# --- Summary ---
print(f"\n{'=' * 90}")
print("EXTRACTION SUMMARY BY CATEGORY")
print(f"{'=' * 90}")

for expected_cat in EXPOSURE_TEST_SAMPLES:
    cat_results = [r for r in extraction_results if r['expected_category'] == expected_cat]
    detected = sum(1 for r in cat_results
                   if expected_cat in [c.value for c in r['extraction'].categories])
    total = len(cat_results)

    # One '#' per sample whose expected category was detected, '.' per miss.
    bar = "#" * detected + "." * (total - detected)
    print(f"  {expected_cat:22s} [{bar}] {detected}/{total} detected")

COMPREHENSIVE EXPOSURE EXTRACTION TEST RESULTS
Testing 34 samples across 7 categories

──────────────────────────────────────────────────────────────────────────────────────────
[MATCH] [buildings] AI building footprint in Izabal, Guatemala
  Note: AI building footprints - structure/area metric expected
  Categories: buildings(1.0), infrastructure(0.9)
  Metric [buildings]: dim=structure, qty=area (conf=0.8, for_buildings)
  Metric [infrastructure]: dim=structure, qty=length (conf=0.5, for_infrastructure)

──────────────────────────────────────────────────────────────────────────────────────────
[MATCH] [buildings] Hurricane Melissa

In [10]:
"""
4.3 Build Exposure Blocks and Verify Structural Compliance

Verifies all 7 issues are fixed in the generated RDLS exposure blocks.
"""

print("=" * 90)
print("RDLS EXPOSURE BLOCK STRUCTURAL VERIFICATION")
print("=" * 90)

# Compliance counters
total_blocks = 0
total_items = 0
total_metrics = 0
uppercase_dims = 0
invalid_dims = 0
other_dims = 0
monetary_with_currency = 0
monetary_without_currency = 0
categories_seen = set()

for result in extraction_results:
    if not result['extraction'].categories:
        continue

    exposure_block = build_exposure_block(
        result['extraction'],
        result['id']
    )
    if not exposure_block:
        continue

    total_blocks += 1

    # Separator is the box-drawing character U+2500 (was mis-encoded as three characters).
    print(f"\n{'─' * 90}")
    print(f"[{result['expected_category']}] {result['title']}")

    for item in exposure_block:
        total_items += 1
        categories_seen.add(item['category'])

        taxonomy_str = f", taxonomy={item.get('taxonomy', '-')}" if 'taxonomy' in item else ""
        print(f"  {item['id']}: category={item['category']}{taxonomy_str}")

        for metric in item['metrics']:
            total_metrics += 1
            dim = metric['dimension']
            qty = metric['quantity_kind']
            currency_str = f", currency={metric.get('currency', '')}" if 'currency' in metric else ""

            # Check compliance
            if dim != dim.lower():
                uppercase_dims += 1
            if dim.lower() not in VALID_METRIC_DIMENSIONS:
                invalid_dims += 1
            if dim.lower() == 'other':
                other_dims += 1
            if qty == 'monetary':
                # build_exposure_block writes the 'currency' key for every
                # monetary metric, even when no currency was detected (empty
                # string). Test the value, not the key, so that an empty
                # currency is reported as missing.
                if metric.get('currency'):
                    monetary_with_currency += 1
                else:
                    monetary_without_currency += 1

            print(f"    {metric['id']}: dim={dim}, qty={qty}{currency_str}")

    # Show full JSON for first 2 blocks (truncated to keep the output readable)
    if total_blocks <= 2:
        print("\n  Full JSON preview:")
        print(json.dumps(exposure_block, indent=2)[:1500])

# --- Final compliance report ---
print(f"\n{'=' * 90}")
print("STRUCTURAL COMPLIANCE REPORT")
print(f"{'=' * 90}")
print(f"  Total exposure blocks built:        {total_blocks}")
print(f"  Total exposure items:               {total_items}")
print(f"  Total metrics:                      {total_metrics}")
print(f"  Unique categories detected:         {sorted(categories_seen)}")
print("")
print(f"  Issue 1 - Lowercase dimensions:     {'PASS' if uppercase_dims == 0 else f'FAIL ({uppercase_dims} uppercase)'}")
print(f"  Issue 2 - No 'Other' dimension:     {'PASS' if other_dims == 0 else f'FAIL ({other_dims} found)'}")
print(f"  Issue 3 - All valid dimensions:     {'PASS' if invalid_dims == 0 else f'FAIL ({invalid_dims} invalid)'}")
print(f"  Issue 4 - Categories detected:      {len(categories_seen)}/7 ({sorted(categories_seen)})")
print(f"  Issue 6 - Monetary with currency:   {monetary_with_currency} (without: {monetary_without_currency})")
print(f"  Missing categories:                 {VALID_EXPOSURE_CATEGORIES - categories_seen or 'none'}")

RDLS EXPOSURE BLOCK STRUCTURAL VERIFICATION

──────────────────────────────────────────────────────────────────────────────────────────
[buildings] AI building footprint in Izabal, Guatemala
  exposure_00a27905_1: category=buildings
    metric_00a27905_1_1: dim=structure, qty=area
  exposure_00a27905_2: category=infrastructure
    metric_00a27905_2_1: dim=structure, qty=length

  Full JSON preview:
[
  {
    "id": "exposure_00a27905_1",
    "category": "buildings",
    "metrics": [
      {
        "id": "metric_00a27905_1_1",
        "dimension": "structure",
        "quantity_kind": "area"
      }
    ]
  },
  {
    "id": "exposure_00a27905_2",
    "category": "infrastructure",
    "metrics": [
      {
        "id": "metric_00a27905_2_1",
        "dimension": "structure",
        "quantity_kind": "length"
  

## 5. Batch Processing

In [11]:
"""
5.1 Process All Records
"""

def process_exposure_extraction(
    metadata_dir: Path,
    extractor: ExposureExtractor,
    limit: Optional[int] = None
) -> pd.DataFrame:
    """
    Run exposure extraction over every ``*.json`` file in a directory.

    Parameters
    ----------
    metadata_dir : Path
        Directory holding one HDX dataset metadata JSON file per dataset.
    extractor : ExposureExtractor
        Extractor whose ``extract(record)`` is called for each record.
    limit : Optional[int]
        When truthy, only the first ``limit`` files (in sorted path order)
        are processed; None or 0 processes all files.

    Returns
    -------
    pd.DataFrame
        One row per file with the columns id, title, organization, categories,
        category_count, has_exposure, taxonomy, overall_confidence and
        extraction. A file that fails to load or extract still yields a
        complete row (no categories, has_exposure False, confidence 0.0,
        extraction None) plus an ``error`` column holding the message, so
        boolean columns never contain NaN.
    """
    # Sorted so that `limit` picks a reproducible subset and row order is
    # stable between runs (glob order is not guaranteed).
    json_files = sorted(metadata_dir.glob('*.json'))
    if limit:
        json_files = json_files[:limit]

    results = []
    iterator = tqdm(json_files, desc="Extracting") if HAS_TQDM else json_files

    for filepath in iterator:
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                record = json.load(f)

            extraction = extractor.extract(record)

            results.append({
                'id': record.get('id'),
                'title': record.get('title'),
                'organization': record.get('organization'),
                'categories': [c.value for c in extraction.categories],
                'category_count': len(extraction.categories),
                'has_exposure': len(extraction.categories) > 0,
                'taxonomy': extraction.taxonomy_hint,
                'overall_confidence': extraction.overall_confidence,
                'extraction': extraction
            })

        except Exception as e:
            # Batch boundary: one bad file must not abort the corpus run.
            # The row keeps the full schema so downstream masks on
            # `has_exposure` / `overall_confidence` stay NaN-free.
            results.append({
                'id': filepath.stem,
                'title': None,
                'organization': None,
                'categories': [],
                'category_count': 0,
                'has_exposure': False,
                'taxonomy': None,
                'overall_confidence': 0.0,
                'extraction': None,
                'error': str(e),
            })

    return pd.DataFrame(results)

# Process
# None (or 0) runs the full corpus; a positive int caps the number of files.
PROCESS_LIMIT = None

print(f"Processing {PROCESS_LIMIT or 'all'} records...")
df_exposure = process_exposure_extraction(
    metadata_dir=DATASET_METADATA_DIR,
    extractor=exposure_extractor,
    limit=PROCESS_LIMIT,
)

Processing all records...


Extracting:   0%|          | 0/26246 [00:00<?, ?it/s]

In [12]:
"""
5.2 Extraction Statistics
"""

print("=" * 60)
print("EXPOSURE EXTRACTION STATISTICS")
print("=" * 60)

total = len(df_exposure)

# Rows produced by the error path of the batch run carry no `has_exposure`
# value (NaN). Comparing against True yields a strict boolean mask, which
# boolean indexing requires; NaN rows simply count as "no exposure".
exposure_mask = df_exposure['has_exposure'].eq(True)
with_exposure = exposure_mask.sum()
# Guard the percentage against an empty result frame.
exposure_pct = with_exposure / total * 100 if total else 0.0

print(f"\nTotal records: {total:,}")
print(f"With exposure signals: {with_exposure:,} ({exposure_pct:.1f}%)")

# Category distribution (a record contributes once per detected category)
cat_counts = Counter()
for cats in df_exposure['categories'].dropna():
    cat_counts.update(cats)

print("\nCategory Distribution:")
for cat, count in cat_counts.most_common():
    print(f"  {cat}: {count}")

# Confidence distribution, restricted to records with exposure signals
conf = df_exposure.loc[exposure_mask, 'overall_confidence']
print("\nConfidence (exposure records):")
print(f"  Mean: {conf.mean():.2f}")
print(f"  High (>=0.8): {(conf >= 0.8).sum()}")

EXPOSURE EXTRACTION STATISTICS

Total records: 26,246
With exposure signals: 21,717 (82.7%)

Category Distribution:
  population: 12760
  infrastructure: 8647
  economic_indicator: 6577
  agriculture: 5041
  natural_environment: 2746
  buildings: 1562
  development_index: 1248

Confidence (exposure records):
  Mean: 0.67
  High (>=0.8): 254


## 6. Export Results

In [13]:
"""
6.0 Clean Previous Outputs

Removes stale output files before writing new ones.
Controlled by CLEANUP_MODE in cell 1.2 above.
"""

def clean_previous_outputs(output_dir, patterns, label, mode="replace"):
    """
    Remove previous output files matching the given glob patterns.

    Parameters
    ----------
    output_dir : Path
        Directory containing old outputs.
    patterns : list[str]
        Glob patterns to match. A file matched by more than one pattern is
        counted and deleted once, under the first pattern that matched it.
    label : str
        Human-readable label for log messages.
    mode : str
        One of: "replace" (auto-delete), "prompt" (ask user),
        "skip" (keep old files), "abort" (error if stale files exist).

    Returns
    -------
    dict  with keys 'deleted' (int) and 'skipped' (bool)

    Raises
    ------
    ValueError
        If ``mode`` is not one of the four supported values. This is checked
        before any file is touched, so a misspelled mode can never fall
        through to deletion.
    RuntimeError
        In "abort" mode when matching files exist, or when the user chooses
        abort at the prompt.
    """
    valid_modes = ('replace', 'prompt', 'skip', 'abort')
    if mode not in valid_modes:
        raise ValueError(
            f'Output cleanup [{label}]: unknown mode {mode!r}; '
            f'expected one of {valid_modes}.'
        )

    result = {'deleted': 0, 'skipped': False}
    targets = {}
    claimed = set()  # files already assigned to an earlier pattern
    for pattern in patterns:
        matches = [p for p in sorted(output_dir.glob(pattern)) if p not in claimed]
        if matches:
            targets[pattern] = matches
            claimed.update(matches)
    total = len(claimed)

    if total == 0:
        print(f'Output cleanup [{label}]: Directory is clean.')
        return result

    summary = [
        f'  {pattern:40s}: {len(files):,} files'
        for pattern, files in targets.items()
    ]

    if mode == 'skip':
        print(f'Output cleanup [{label}]: SKIPPED ({total:,} existing files kept)')
        result['skipped'] = True
        return result

    if mode == 'abort':
        raise RuntimeError(
            f'Output cleanup [{label}]: ABORT -- {total:,} stale files found. '
            f'Delete manually or change CLEANUP_MODE.'
        )

    if mode == 'prompt':
        print(f'Output cleanup [{label}]: Found {total:,} existing output files:')
        for line in summary:
            print(line)
        choice = input('Choose [R]eplace / [S]kip / [A]bort: ').strip().lower()
        if choice in ('s', 'skip'):
            print('  Skipped.')
            result['skipped'] = True
            return result
        elif choice in ('a', 'abort'):
            raise RuntimeError('User chose to abort.')
        elif choice not in ('r', 'replace', ''):
            print(f'  Unknown choice "{choice}", defaulting to Replace.')

    # Mode: replace (or "prompt" answered with replace)
    print(f'Output cleanup [{label}]:')
    for line in summary:
        print(line)
    for files in targets.values():
        for f in files:
            try:
                f.unlink()
                result['deleted'] += 1
            except OSError as e:
                # Best effort: report and continue with the remaining files.
                print(f'  WARNING: Could not delete {f.name}: {e}')
    deleted_count = result['deleted']
    print(f'  Cleaned {deleted_count:,} files. Ready for fresh output.')
    print()
    return result


# -- Run cleanup for NB 10 Exposure Extraction outputs --
# Covers the per-dataset exposure JSONs and the two CSV exports. The call is
# the cell's last expression, so its result dict is shown as the cell output.
clean_previous_outputs(
    output_dir=OUTPUT_DIR,
    patterns=[
        "rdls_exp-hdx_*.json",
        "exposure_extraction_results.csv",
        "exposure_extraction_high_confidence.csv",
    ],
    label="NB 10 Exposure Extraction",
    mode=CLEANUP_MODE,
)


Output cleanup [NB 10 Exposure Extraction]:
  rdls_exp-hdx_*.json                     : 20,892 files
  exposure_extraction_results.csv         : 1 files
  exposure_extraction_high_confidence.csv : 1 files
  Cleaned 20,894 files. Ready for fresh output.



{'deleted': 20894, 'skipped': False}

In [14]:
"""
6.1 Export Extraction Results
"""

# Prepare export: flat, CSV-friendly view (the `extraction` objects are left out)
export_df = df_exposure.loc[:, [
    'id', 'title', 'organization', 'categories', 'category_count',
    'taxonomy', 'overall_confidence', 'has_exposure'
]].copy()

# Category lists become pipe-separated strings; anything that is not a list
# is written as an empty string.
export_df['categories'] = export_df['categories'].map(
    lambda cats: '|'.join(cats) if isinstance(cats, list) else ''
)

output_file = OUTPUT_DIR / 'exposure_extraction_results.csv'
export_df.to_csv(output_file, index=False)
print(f"Saved: {output_file}")

# High confidence: exposure detected and overall confidence of at least 0.8
high_conf = export_df[
    export_df['has_exposure'] & (export_df['overall_confidence'] >= 0.8)
]
high_conf_file = OUTPUT_DIR / 'exposure_extraction_high_confidence.csv'
high_conf.to_csv(high_conf_file, index=False)
print(f"Saved: {high_conf_file} ({len(high_conf)} records)")

Saved: /mnt/c/Users/benny/OneDrive/Documents/Github/hdx-metadata-crawler/hdx_dataset_metadata_dump/rdls/extracted/exposure_extraction_results.csv
Saved: /mnt/c/Users/benny/OneDrive/Documents/Github/hdx-metadata-crawler/hdx_dataset_metadata_dump/rdls/extracted/exposure_extraction_high_confidence.csv (254 records)


In [15]:
"""
6.2 Generate RDLS Exposure Block JSONs for All Flagged Datasets

Create RDLS JSON records with exposure blocks for ALL datasets
where exposure was detected (not just a sample).
These JSONs are consumed by NB 12 for HEVL integration.
"""

# Select every record with exposure detected AND overall confidence >= 0.5.
# `.eq(True)` gives a strict boolean mask even if `has_exposure` holds NaN
# for rows that failed during batch processing.
all_exposure = df_exposure[
    df_exposure['has_exposure'].eq(True) &
    (df_exposure['overall_confidence'] >= 0.5)
].copy()

print(f"Generating RDLS exposure block JSONs for {len(all_exposure):,} datasets...")

generated = 0
skipped = 0
collisions = 0
written_ids = set()  # 8-character id prefixes already used as a file name

iterator = tqdm(all_exposure.iterrows(), total=len(all_exposure), desc="Building exposure JSONs") if HAS_TQDM else all_exposure.iterrows()

for idx, row in iterator:
    extraction = row['extraction']
    exposure_block = build_exposure_block(extraction, row['id'])

    if not exposure_block:
        skipped += 1
        continue

    # Same prefix rule as build_exposure_block, so a missing id does not crash here.
    short_id = row['id'][:8] if row['id'] else 'unknown'

    # File names use only the first 8 characters of the dataset id, so two
    # datasets can map to the same file. The later one still overwrites the
    # earlier one (unchanged behaviour), but it is now reported instead of
    # being lost silently.
    if short_id in written_ids:
        collisions += 1
        print(f"  WARNING: id prefix '{short_id}' already written; "
              f"overwriting with dataset {row['id']}")
    written_ids.add(short_id)

    rdls_record = {
        'datasets': [{
            'id': f"rdls_exp-hdx_{short_id}",
            'title': row['title'],
            'risk_data_type': ['exposure'],
            'exposure': exposure_block,
            'links': [{
                'href': 'https://docs.riskdatalibrary.org/en/0__3__0/rdls_schema.json',
                'rel': 'describedby'
            }]
        }]
    }

    output_path = OUTPUT_DIR / f"rdls_exp-hdx_{short_id}.json"
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(rdls_record, f, indent=2, ensure_ascii=False)

    generated += 1

print(f"\nDone.")
print(f"  Generated: {generated:,} exposure block JSONs")
print(f"  Skipped (no valid block): {skipped:,}")
if collisions:
    print(f"  WARNING: {collisions:,} files overwritten due to colliding 8-character id prefixes")
print(f"  Output: {OUTPUT_DIR}")

Generating RDLS exposure block JSONs for 20,530 datasets...


Building exposure JSONs:   0%|          | 0/20530 [00:00<?, ?it/s]


Done.
  Generated: 20,530 exposure block JSONs
  Skipped (no valid block): 0
  Output: /mnt/c/Users/benny/OneDrive/Documents/Github/hdx-metadata-crawler/hdx_dataset_metadata_dump/rdls/extracted


In [16]:
# Closing timestamp for the run log; datetime.now() is naive local time.
print(f"\nNotebook completed: {datetime.now().isoformat()}")


Notebook completed: 2026-02-10T09:20:21.230865


## End of Code