# Notebook 10: RDLS Exposure Block Extractor

**Purpose**: Extract and populate RDLS v0.3 Exposure component blocks from HDX metadata.

**Input**:
- HDX dataset metadata JSON files
- Signal Dictionary (`config/signal_dictionary.yaml`)
- RDLS Schema (`rdls/schema/rdls_schema_v0.3.json`)

**Output**:
- Exposure extraction results with confidence scores (CSV files in the output directory)
- Sample RDLS records (JSON) with populated exposure blocks for the highest-confidence datasets

**RDLS Exposure Block Structure**:
```json
"exposure": [
  {
    "id": "...",
    "category": "buildings|infrastructure|population|agriculture|natural_environment",
    "taxonomy": "GED4ALL|...",
    "metrics": [
      {
        "id": "...",
        "dimension": "Structure|Content|Product|Other",
        "quantity_kind": "count|area|length|monetary|..."
      }
    ]
  }
]
```

**Author**: Benny Istanto/Risk Data Librarian/GFDRR  
**Version**: 2026.1

---

## 1. Setup and Configuration

In [None]:
"""
1.1 Import Dependencies
"""

import json
import os
import re
import yaml
from pathlib import Path
from collections import defaultdict, Counter
from datetime import datetime
from typing import Dict, List, Tuple, Optional, Any, Set
from dataclasses import dataclass, field, asdict
from copy import deepcopy

import pandas as pd
import numpy as np

# tqdm is optional: HAS_TQDM records whether the notebook progress bar could be
# imported, so later cells can fall back to plain iteration when it is missing.
try:
    from tqdm.notebook import tqdm
    HAS_TQDM = True
except ImportError:
    HAS_TQDM = False

# Show every column and allow wider cell text when DataFrames are displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 120)

# Timestamp the start of the run.
print(f"Notebook started: {datetime.now().isoformat()}")

In [None]:
"""
1.2 Define Paths
"""

# Resolve the project root: when the working directory is named 'notebook',
# the root is its parent; otherwise the working directory itself is the root.
NOTEBOOK_DIR = Path.cwd()
BASE_DIR = NOTEBOOK_DIR.parent if NOTEBOOK_DIR.name == 'notebook' else NOTEBOOK_DIR

# Inputs
DATASET_METADATA_DIR = BASE_DIR / 'hdx_dataset_metadata_dump' / 'dataset_metadata'
SIGNAL_DICT_PATH = BASE_DIR / 'hdx_dataset_metadata_dump' / 'config' / 'signal_dictionary.yaml'
RDLS_SCHEMA_PATH = BASE_DIR / 'hdx_dataset_metadata_dump' / 'rdls' / 'schema' / 'rdls_schema_v0.3.json'

# Output
OUTPUT_DIR = BASE_DIR / 'hdx_dataset_metadata_dump' / 'rdls' / 'extracted'

# Fail fast on every required input (the schema file included) *before*
# creating the output directory, so a wrong BASE_DIR does not leave a stray
# directory tree behind. An explicit raise is used instead of `assert` because
# assertions are stripped when Python runs with -O.
for _required_path in (DATASET_METADATA_DIR, SIGNAL_DICT_PATH, RDLS_SCHEMA_PATH):
    if not _required_path.exists():
        raise FileNotFoundError(f"Not found: {_required_path}")

OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print(f"Base: {BASE_DIR}")
print(f"Output: {OUTPUT_DIR}")

In [None]:
"""
1.3 Load Signal Dictionary and RDLS Schema
"""

# Read the YAML signal dictionary (safe loader: plain data only).
with open(SIGNAL_DICT_PATH, 'r', encoding='utf-8') as signal_file:
    SIGNAL_DICT = yaml.safe_load(signal_file)

# Read the RDLS JSON schema.
with open(RDLS_SCHEMA_PATH, 'r', encoding='utf-8') as schema_file:
    RDLS_SCHEMA = json.load(schema_file)

# Pull the allowed enum values out of the schema's '$defs' section; each
# lookup degrades to an empty list when the definition is absent.
_schema_defs = RDLS_SCHEMA.get('$defs', {})
VALID_EXPOSURE_CATEGORIES = _schema_defs.get('exposure_category', {}).get('enum', [])
VALID_METRIC_DIMENSIONS = _schema_defs.get('metric_dimension', {}).get('enum', [])

print("Signal Dictionary loaded.")
print(f"Valid exposure categories: {VALID_EXPOSURE_CATEGORIES}")
print(f"Valid metric dimensions: {VALID_METRIC_DIMENSIONS}")

## 2. Exposure Extraction Classes

In [None]:
"""
2.1 Data Classes for Exposure Extraction
"""

@dataclass
class ExtractionMatch:
    """Single pattern match with confidence.

    Records which label a regex hit stands for, where the hit was found and
    the confidence attached to it.
    """
    value: str  # label the match stands for (the exposure category name)
    confidence: float  # confidence score assigned to the match
    source_field: str  # name of the metadata text field in which the pattern matched
    matched_text: str  # exact substring the regex matched
    pattern: str  # source text of the regex that produced the match

@dataclass
class MetricExtraction:
    """Extracted metric information.

    One (dimension, quantity kind) pair proposed for an exposure item.
    """
    dimension: str  # metric dimension, e.g. 'Structure', 'Content', 'Product', 'Other'
    quantity_kind: str  # quantity kind, e.g. 'count', 'area', 'length', 'monetary'
    confidence: float  # confidence score assigned to this metric
    source_hint: str = ""  # how the metric was derived ('detected_from_text' or 'default_for_<category>')

@dataclass
class ExposureExtraction:
    """Aggregated exposure findings for one dataset.

    Bundles the matched categories, the proposed metrics, an optional
    taxonomy hint and a single overall confidence score.
    """
    categories: List[ExtractionMatch] = field(default_factory=list)
    metrics: List[MetricExtraction] = field(default_factory=list)
    taxonomy_hint: Optional[str] = None
    overall_confidence: float = 0.0

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict view with nested dataclasses expanded via ``asdict``."""
        category_dicts = list(map(asdict, self.categories))
        metric_dicts = list(map(asdict, self.metrics))
        return dict(
            categories=category_dicts,
            metrics=metric_dicts,
            taxonomy_hint=self.taxonomy_hint,
            overall_confidence=self.overall_confidence,
        )

print("Data classes defined.")

In [None]:
"""
2.2 Metric Inference Patterns

Additional patterns specific to exposure metrics.
"""

# Metric dimension inference patterns.
# Maps each metric dimension to the regex strings that signal it. The
# patterns are compiled case-insensitively by ExposureExtractor, which walks
# the dimensions in the order written here and keeps the first one that
# matches, so earlier entries take priority.
METRIC_DIMENSION_PATTERNS = {
    'Structure': [
        r'\b(building|structure|footprint|floor.?area)\b',
        r'\b(construction|built|asset)\b',
    ],
    'Content': [
        r'\b(content|inventory|equipment|furnishing)\b',
        r'\b(stock|goods|material)\b',
    ],
    'Product': [
        r'\b(crop|harvest|yield|production)\b',
        r'\b(output|commodity)\b',
    ],
    'Other': [
        r'\b(other|miscellaneous|general)\b',
    ],
}

# Quantity kind inference patterns.
# Maps each quantity kind to the regex strings that signal it. The patterns
# are compiled case-insensitively by ExposureExtractor, which walks the kinds
# in the order written here and keeps the first one that matches, so earlier
# entries take priority.
QUANTITY_KIND_PATTERNS = {
    'count': [
        r'\b(count|number|quantity|total)\b',
        r'\b(units|items|pieces)\b',
    ],
    'area': [
        r'\b(area|hectare|acre|sq\.?\s*(?:m|km|ft))\b',
        r'\b(square|coverage|extent)\b',
    ],
    'length': [
        r'\b(length|distance|km|kilometer|mile)\b',
        r'\b(route|corridor|line)\b',
    ],
    'monetary': [
        # '$' is matched outside the \b...\b group: '$' is not a word
        # character, so "\b\$\b" could only match a dollar sign squeezed
        # between two word characters and never the usual "$100".
        r'(?:\b(value|cost|price|worth|usd|eur)\b|\$)',
        r'\b(economic|financial|monetary)\b',
        r'\b(replacement|rebuild)\b',
    ],
    'weight': [
        r'\b(weight|mass|ton|kg|kilogram)\b',
        r'\b(tonnage|cargo)\b',
    ],
}

# Taxonomy hints.
# Maps a taxonomy name to the regex strings whose presence in the metadata
# text suggests that taxonomy. ExposureExtractor compiles them
# case-insensitively and returns the first taxonomy that matches, in the
# order written here.
TAXONOMY_PATTERNS = {
    'GED4ALL': [r'\b(ged4all|gem.?taxonomy)\b'],
    'HAZUS': [r'\b(hazus|fema.?taxonomy)\b'],
    'PAGER': [r'\b(pager|usgs.?pager)\b'],
}

print("Metric inference patterns defined.")

In [None]:
"""
2.3 Exposure Extractor Class
"""

class ExposureExtractor:
    """
    Extracts RDLS Exposure block components from HDX metadata.

    Identifies:
    - Exposure categories (buildings, infrastructure, population, etc.)
    - Metric dimensions and quantity kinds
    - Taxonomy hints

    Parameters
    ----------
    signal_dict : Dict[str, Any]
        Loaded signal dictionary. Only its ``exposure_category`` section is
        read: a mapping of category name to a config holding ``patterns``
        (regex strings) and ``confidence`` ('high', 'medium' or 'low').
        ``None`` (what an empty YAML file loads as) is treated as empty.
    """

    CONFIDENCE_MAP = {'high': 0.9, 'medium': 0.7, 'low': 0.5}

    # Fallback (dimension, quantity_kind) pairs per category, used when the
    # text does not reveal both a dimension and a quantity kind. Only the
    # first pair of each list is currently emitted.
    DEFAULT_METRICS = {
        'buildings': [('Structure', 'count'), ('Structure', 'area'), ('Structure', 'monetary')],
        'infrastructure': [('Structure', 'count'), ('Structure', 'length')],
        'population': [('Other', 'count')],
        'agriculture': [('Product', 'area'), ('Product', 'monetary')],
        'natural_environment': [('Other', 'area')],
    }

    def __init__(self, signal_dict: Dict[str, Any]):
        self.signal_dict = signal_dict or {}
        self._compile_patterns()

    # ------------------------------------------------------------------
    # Small private helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _compile_all(patterns) -> List[Any]:
        """Compile regex strings case-insensitively; ``None`` yields an empty list."""
        return [re.compile(p, re.IGNORECASE) for p in (patterns or [])]

    @staticmethod
    def _as_text(value: Any) -> str:
        """Coerce a metadata value to text, mapping ``None`` to ''."""
        if value is None:
            return ''
        return value if isinstance(value, str) else str(value)

    @staticmethod
    def _tag_names(tags: Any) -> List[str]:
        """Return tag names from a list of plain strings and/or tag dicts.

        Tag dicts are read through their 'name' (or 'display_name') key.
        NOTE(review): that is the shape CKAN-style APIs commonly return;
        confirm against the actual metadata dump.
        """
        if not tags:
            return []
        if isinstance(tags, str):
            return [tags]
        names = []
        for tag in tags:
            if isinstance(tag, dict):
                name = tag.get('name') or tag.get('display_name')
            else:
                name = tag
            if name:
                names.append(str(name))
        return names

    @staticmethod
    def _first_label(pattern_map: Dict[str, List[Any]], text: str) -> Optional[str]:
        """Return the first key of ``pattern_map`` with a pattern matching ``text``."""
        for label, patterns in pattern_map.items():
            if any(pattern.search(text) for pattern in patterns):
                return label
        return None

    @staticmethod
    def _first_hit(compiled_patterns: List[Any], text_fields: Dict[str, str]):
        """Return ``(field_name, pattern, match)`` for the first hit, or ``None``.

        Patterns are tried in order; for each pattern the fields are tried
        in dict order. Empty fields are skipped.
        """
        for compiled in compiled_patterns:
            for field_name, text in text_fields.items():
                if not text:
                    continue
                match = compiled.search(text)
                if match:
                    return field_name, compiled, match
        return None

    # ------------------------------------------------------------------
    # Pattern setup
    # ------------------------------------------------------------------

    def _compile_patterns(self) -> None:
        """Pre-compile regex patterns."""
        self.category_patterns = {}

        # Exposure category patterns come from the signal dictionary. Missing
        # or null sections/values are tolerated and treated as empty.
        category_section = self.signal_dict.get('exposure_category') or {}
        for category, config in category_section.items():
            config = config or {}
            confidence = self.CONFIDENCE_MAP.get(config.get('confidence', 'medium'), 0.7)
            self.category_patterns[category] = {
                'compiled': self._compile_all(config.get('patterns')),
                'confidence': confidence,
            }

        # Dimension, quantity-kind and taxonomy patterns come from the
        # module-level tables.
        self.dimension_patterns = {
            dimension: self._compile_all(patterns)
            for dimension, patterns in METRIC_DIMENSION_PATTERNS.items()
        }
        self.quantity_patterns = {
            kind: self._compile_all(patterns)
            for kind, patterns in QUANTITY_KIND_PATTERNS.items()
        }
        self.taxonomy_patterns = {
            taxonomy: self._compile_all(patterns)
            for taxonomy, patterns in TAXONOMY_PATTERNS.items()
        }

    # ------------------------------------------------------------------
    # Extraction steps
    # ------------------------------------------------------------------

    def _extract_text_fields(self, hdx_record: Dict[str, Any]) -> Dict[str, str]:
        """Extract text fields from HDX record.

        Every returned value is a ``str``: keys that are absent *or* present
        with a null value both become ''. Tags may be plain strings or tag
        dicts. Null resource names/descriptions contribute nothing instead of
        the literal text "None".
        """
        fields = {
            'title': self._as_text(hdx_record.get('title')),
            'name': self._as_text(hdx_record.get('name')),
            'notes': self._as_text(hdx_record.get('notes')),
            'tags': ' '.join(self._tag_names(hdx_record.get('tags'))),
            'methodology': self._as_text(hdx_record.get('methodology_other')),
        }

        resources = hdx_record.get('resources') or []
        fields['resources'] = ' '.join(
            f"{self._as_text(r.get('name'))} {self._as_text(r.get('description'))}"
            for r in resources
            if isinstance(r, dict)
        )

        return fields

    def _match_categories(
        self,
        text_fields: Dict[str, str]
    ) -> List[ExtractionMatch]:
        """Match exposure categories.

        Each category yields at most one match: the first of its patterns
        (in configured order) that hits any non-empty field. All patterns of
        a category share one confidence, so later hits could never replace
        the first one; the search therefore stops at the first hit.
        """
        matches = []

        for category, config in self.category_patterns.items():
            hit = self._first_hit(config['compiled'], text_fields)
            if hit is None:
                continue
            field_name, compiled, match = hit
            matches.append(ExtractionMatch(
                value=category,
                confidence=config['confidence'],
                source_field=field_name,
                matched_text=match.group(0),
                pattern=compiled.pattern
            ))

        return matches

    def _infer_metrics(
        self,
        text_fields: Dict[str, str],
        categories: List[ExtractionMatch]
    ) -> List[MetricExtraction]:
        """
        Infer metrics based on detected categories and text patterns.

        When the combined text reveals both a dimension and a quantity kind,
        that single pair is returned once (confidence 0.8); it does not
        depend on the category, so it is not repeated per category.
        Otherwise each category contributes the first entry of its default
        list (confidence 0.5), tagged ``default_for_<category>``. No
        categories means no metrics.
        """
        if not categories:
            return []

        all_text = ' '.join(text_fields.values()).lower()

        detected_dimension = self._first_label(self.dimension_patterns, all_text)
        detected_quantity = self._first_label(self.quantity_patterns, all_text)

        if detected_dimension and detected_quantity:
            return [MetricExtraction(
                dimension=detected_dimension,
                quantity_kind=detected_quantity,
                confidence=0.8,
                source_hint='detected_from_text'
            )]

        metrics = []
        for cat_match in categories:
            category = cat_match.value
            # Just the first default of the category, if it has any.
            for dim, qty in self.DEFAULT_METRICS.get(category, [])[:1]:
                metrics.append(MetricExtraction(
                    dimension=dim,
                    quantity_kind=qty,
                    confidence=0.5,
                    source_hint=f'default_for_{category}'
                ))

        return metrics

    def _detect_taxonomy(self, text_fields: Dict[str, str]) -> Optional[str]:
        """Detect taxonomy scheme from text; returns the first match or None."""
        all_text = ' '.join(text_fields.values()).lower()
        return self._first_label(self.taxonomy_patterns, all_text)

    def extract(self, hdx_record: Dict[str, Any]) -> ExposureExtraction:
        """
        Extract exposure information from HDX record.

        Parameters
        ----------
        hdx_record : Dict[str, Any]
            HDX metadata record

        Returns
        -------
        ExposureExtraction
            Extraction results. ``overall_confidence`` is the mean of all
            category and metric confidences (0.0 when there are none),
            returned as a built-in ``float``.
        """
        text_fields = self._extract_text_fields(hdx_record)

        categories = self._match_categories(text_fields)
        metrics = self._infer_metrics(text_fields, categories)
        taxonomy = self._detect_taxonomy(text_fields)

        # Calculate confidence
        confidences = [c.confidence for c in categories]
        confidences.extend(m.confidence for m in metrics)
        overall_confidence = float(np.mean(confidences)) if confidences else 0.0

        return ExposureExtraction(
            categories=categories,
            metrics=metrics,
            taxonomy_hint=taxonomy,
            overall_confidence=overall_confidence
        )

# Initialize the shared extractor from the loaded signal dictionary and report
# how many exposure categories it compiled patterns for.
exposure_extractor = ExposureExtractor(SIGNAL_DICT)
print(f"ExposureExtractor initialized.")
print(f"  - Categories: {len(exposure_extractor.category_patterns)}")

## 3. RDLS Exposure Block Builder

In [None]:
"""
3.1 Build RDLS Exposure Block
"""

def _metrics_for_category(metrics, category: str) -> list:
    """Select the metrics that apply to ``category``, without repeated pairs.

    A metric whose ``source_hint`` is ``default_for_<other category>`` is a
    fallback chosen for a different category and is left out. Metrics with
    any other hint (e.g. detected from text) apply to every category.
    Repeated (dimension, quantity_kind) pairs are kept once, first wins.
    """
    own_default = f'default_for_{category}'
    selected = []
    seen_pairs = set()
    for metric in metrics:
        hint = getattr(metric, 'source_hint', '') or ''
        if hint.startswith('default_for_') and hint != own_default:
            continue
        pair = (metric.dimension, metric.quantity_kind)
        if pair in seen_pairs:
            continue
        seen_pairs.add(pair)
        selected.append(metric)
    return selected


def build_exposure_block(
    extraction: ExposureExtraction,
    dataset_id: str
) -> Optional[List[Dict[str, Any]]]:
    """
    Build RDLS exposure block from extraction results.

    One exposure item is produced per matched category. Each item carries
    the detected taxonomy (if any) and the metrics that apply to that
    category; when none apply, a minimal 'Other'/'count' metric is used.

    Parameters
    ----------
    extraction : ExposureExtraction
        Extraction results
    dataset_id : str
        Dataset identifier. Only its first 8 characters are used in the
        generated ids; a missing/empty id falls back to 'unknown'.

    Returns
    -------
    Optional[List[Dict[str, Any]]]
        RDLS exposure array or None when no category was matched
    """
    if not extraction.categories:
        return None

    # Tolerate a missing or non-string id instead of failing on the slice.
    id_prefix = str(dataset_id or 'unknown')[:8]

    exposure_items = []

    for i, cat in enumerate(extraction.categories, start=1):
        exposure_item = {
            'id': f"exposure_{id_prefix}_{i}",
            'category': cat.value
        }

        # Add taxonomy if detected
        if extraction.taxonomy_hint:
            exposure_item['taxonomy'] = extraction.taxonomy_hint

        # Add the metrics relevant to this category
        applicable = _metrics_for_category(extraction.metrics or [], cat.value)
        if applicable:
            exposure_item['metrics'] = [
                {
                    'id': f"metric_{id_prefix}_{i}_{j}",
                    'dimension': m.dimension,
                    'quantity_kind': m.quantity_kind
                }
                for j, m in enumerate(applicable, start=1)
            ]
        else:
            # Minimal required metric
            exposure_item['metrics'] = [{
                'id': f"metric_{id_prefix}_{i}_1",
                'dimension': 'Other',
                'quantity_kind': 'count'
            }]

        exposure_items.append(exposure_item)

    return exposure_items

print("Exposure block builder defined.")

## 4. Test Extraction

In [None]:
"""
4.1 Load Sample Records
"""

# Find exposure-related files. Each glob is sorted so the sample is the same
# on every run (glob order is otherwise filesystem-dependent).
exposure_files = sorted(DATASET_METADATA_DIR.glob('*exposure*.json'))[:10]
population_files = sorted(DATASET_METADATA_DIR.glob('*population*.json'))[:5]
building_files = sorted(DATASET_METADATA_DIR.glob('*building*.json'))[:5]

# A file name can satisfy more than one glob; dict.fromkeys drops the repeats
# while keeping first-seen order.
sample_files = list(dict.fromkeys(exposure_files + population_files + building_files))

sample_records = []
for filepath in sample_files:
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            sample_records.append(json.load(f))
    except (OSError, ValueError) as exc:
        # Unreadable or malformed files are skipped, but reported rather than
        # silently swallowed. JSON and decoding errors are ValueError subclasses.
        print(f"Skipped {filepath.name}: {exc}")

print(f"Loaded {len(sample_records)} sample records.")

In [None]:
"""
4.2 Run Extraction on Samples
"""

print("=" * 80)
print("EXPOSURE EXTRACTION TEST RESULTS")
print("=" * 80)

# Print a readable summary of the extraction for the first ten samples.
for record in sample_records[:10]:
    extraction = exposure_extractor.extract(record)

    # 'title' can be present with a null value; `or ''` keeps the slice safe.
    title = record.get('title') or ''

    print(f"\n{'─' * 80}")
    print(f"Title: {title[:70]}")
    print(f"Confidence: {extraction.overall_confidence:.2f}")

    if extraction.categories:
        print("Categories:")
        for cat in extraction.categories:
            print(f"  - {cat.value} (conf: {cat.confidence:.1f}, match: '{cat.matched_text}')")
    else:
        print("Categories: None detected")

    if extraction.metrics:
        print("Metrics:")
        for m in extraction.metrics:
            print(f"  - {m.dimension}/{m.quantity_kind} (conf: {m.confidence:.1f})")

    if extraction.taxonomy_hint:
        print(f"Taxonomy: {extraction.taxonomy_hint}")

In [None]:
"""
4.3 Generate Sample RDLS Exposure Blocks
"""

print("\n" + "=" * 80)
print("GENERATED RDLS EXPOSURE BLOCKS")
print("=" * 80)

# Build and print the RDLS exposure block for the first five samples that
# have at least one detected category.
for record in sample_records[:5]:
    extraction = exposure_extractor.extract(record)

    if not extraction.categories:
        continue

    # 'id' and 'title' can be present with null values; fall back explicitly
    # so the slices below cannot fail on None.
    dataset_id = record.get('id') or 'unknown'
    title = record.get('title') or ''

    exposure_block = build_exposure_block(extraction, dataset_id)

    if exposure_block:
        print(f"\n{'─' * 80}")
        print(f"Dataset: {title[:60]}")
        print(json.dumps(exposure_block, indent=2))

## 5. Batch Processing

In [None]:
"""
5.1 Process All Records
"""

def process_exposure_extraction(
    metadata_dir: Path,
    extractor: ExposureExtractor,
    limit: Optional[int] = None
) -> pd.DataFrame:
    """
    Process all HDX records for exposure extraction.

    Parameters
    ----------
    metadata_dir : Path
        Directory containing one ``*.json`` metadata file per dataset
    extractor : ExposureExtractor
        Extractor applied to every record
    limit : Optional[int]
        Maximum number of files to process; a falsy value (None or 0)
        processes all files

    Returns
    -------
    pd.DataFrame
        One row per file, always with the same columns. Files that fail to
        load or extract get ``has_exposure=False``, no categories, a NaN
        confidence and the error message in ``error``; successful rows have
        ``error`` set to None.
    """
    result_columns = [
        'id', 'title', 'organization', 'categories', 'category_count',
        'has_exposure', 'taxonomy', 'overall_confidence', 'extraction', 'error'
    ]

    # Sorted so that a limited run always covers the same files.
    json_files = sorted(metadata_dir.glob('*.json'))
    if limit:
        json_files = json_files[:limit]

    results = []
    iterator = tqdm(json_files, desc="Extracting") if HAS_TQDM else json_files

    for filepath in iterator:
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                record = json.load(f)

            extraction = extractor.extract(record)

            results.append({
                'id': record.get('id'),
                'title': record.get('title'),
                'organization': record.get('organization'),
                'categories': [c.value for c in extraction.categories],
                'category_count': len(extraction.categories),
                'has_exposure': len(extraction.categories) > 0,
                'taxonomy': extraction.taxonomy_hint,
                'overall_confidence': extraction.overall_confidence,
                'extraction': extraction,
                'error': None
            })

        except Exception as e:
            # Per-file boundary: record the failure and keep going. The row
            # carries the full schema so downstream boolean masks and column
            # selections keep working.
            results.append({
                'id': filepath.stem,
                'title': None,
                'organization': None,
                'categories': [],
                'category_count': 0,
                'has_exposure': False,
                'taxonomy': None,
                'overall_confidence': float('nan'),
                'extraction': None,
                'error': str(e)
            })

    df = pd.DataFrame(results, columns=result_columns)
    # Guarantee usable dtypes even when there are no rows at all.
    df['has_exposure'] = df['has_exposure'].astype(bool)
    df['overall_confidence'] = df['overall_confidence'].astype(float)
    return df

# Process the corpus (or the first PROCESS_LIMIT files of it) and keep the
# per-dataset results in df_exposure for the statistics and export cells.
PROCESS_LIMIT = 1000  # Set to None for full corpus

print(f"Processing {PROCESS_LIMIT or 'all'} records...")
df_exposure = process_exposure_extraction(DATASET_METADATA_DIR, exposure_extractor, limit=PROCESS_LIMIT)

In [None]:
"""
5.2 Extraction Statistics
"""

print("=" * 60)
print("EXPOSURE EXTRACTION STATISTICS")
print("=" * 60)

total = len(df_exposure)

# Rows for files that failed to process may carry NaN in 'has_exposure', and
# an empty run may lack the column altogether. `.eq(True)` yields a clean
# boolean mask (NaN -> False) that is safe to use for indexing.
if 'has_exposure' in df_exposure.columns:
    exposure_mask = df_exposure['has_exposure'].eq(True)
else:
    exposure_mask = pd.Series(False, index=df_exposure.index, dtype=bool)

with_exposure = int(exposure_mask.sum())
# Avoid dividing by zero when nothing was processed.
exposure_share = (with_exposure / total * 100) if total else 0.0

print(f"\nTotal records: {total:,}")
print(f"With exposure signals: {with_exposure:,} ({exposure_share:.1f}%)")

# Category distribution
cat_counts = Counter()
if 'categories' in df_exposure.columns:
    for cats in df_exposure['categories'].dropna():
        cat_counts.update(cats)

print(f"\nCategory Distribution:")
for cat, count in cat_counts.most_common():
    print(f"  {cat}: {count}")

# Confidence distribution
print(f"\nConfidence (exposure records):")
if with_exposure:
    conf = df_exposure.loc[exposure_mask, 'overall_confidence']
    print(f"  Mean: {conf.mean():.2f}")
    print(f"  High (>=0.8): {(conf >= 0.8).sum()}")
else:
    print("  No exposure records.")

## 6. Export Results

In [None]:
"""
6.1 Export Extraction Results
"""

# Keep only the flat, CSV-friendly columns (the extraction objects stay out).
export_df = df_exposure.loc[:, [
    'id', 'title', 'organization', 'categories', 'category_count',
    'taxonomy', 'overall_confidence', 'has_exposure'
]].copy()

# Flatten each category list into one pipe-separated string; anything that
# is not a list becomes an empty string.
export_df['categories'] = export_df['categories'].apply(
    lambda cats: '|'.join(cats) if isinstance(cats, list) else ''
)

# Full result set
output_file = OUTPUT_DIR / 'exposure_extraction_results.csv'
export_df.to_csv(output_file, index=False)
print(f"Saved: {output_file}")

# Subset: records with exposure signals and confidence of at least 0.8
is_high_confidence = export_df['has_exposure'] & (df_exposure['overall_confidence'] >= 0.8)
high_conf = export_df[is_high_confidence]
high_conf_file = OUTPUT_DIR / 'exposure_extraction_high_confidence.csv'
high_conf.to_csv(high_conf_file, index=False)
print(f"Saved: {high_conf_file} ({len(high_conf)} records)")

In [None]:
"""
6.2 Generate Sample RDLS Records
"""

# Pick up to ten records with exposure signals and confidence >= 0.8.
# `.eq(True)` turns a possibly NaN-bearing 'has_exposure' column into a clean
# boolean mask.
high_conf_mask = (
    df_exposure['has_exposure'].eq(True) &
    (df_exposure['overall_confidence'] >= 0.8)
)
top_records = df_exposure[high_conf_mask].nlargest(10, 'overall_confidence')

print(f"\nGenerating {len(top_records)} sample RDLS exposure records...")

for idx, row in top_records.iterrows():
    # A record without a usable string id falls back to 'unknown' so the
    # slices below cannot fail. NOTE: ids are shortened to 8 characters, so
    # records sharing a prefix (or the fallback) write to the same file.
    raw_id = row['id']
    dataset_id = raw_id if isinstance(raw_id, str) and raw_id else 'unknown'
    short_id = dataset_id[:8]

    extraction = row['extraction']
    exposure_block = build_exposure_block(extraction, dataset_id)

    if not exposure_block:
        continue

    # Minimal RDLS record wrapping the exposure block.
    rdls_record = {
        'datasets': [{
            'id': f"rdls_exp-hdx_{short_id}",
            'title': row['title'],
            'risk_data_type': ['exposure'],
            'exposure': exposure_block,
            'links': [{
                'href': 'https://docs.riskdatalibrary.org/en/0__3__0/rdls_schema.json',
                'rel': 'describedby'
            }]
        }]
    }

    output_path = OUTPUT_DIR / f"rdls_exp-hdx_{short_id}.json"
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(rdls_record, f, indent=2)

    print(f"  Created: {output_path.name}")

print(f"\nDone. Output: {OUTPUT_DIR}")

In [None]:
# Timestamp the end of the run.
print(f"\nNotebook completed: {datetime.now().isoformat()}")