# Notebook 09: RDLS Hazard Block Extractor

**Purpose**: Extract and populate RDLS v0.3 Hazard component blocks from HDX metadata using the Signal Dictionary.

**Input**:
- HDX dataset metadata JSON files
- Signal Dictionary (`config/signal_dictionary.yaml`)
- RDLS Schema (`rdls/schema/rdls_schema_v0.3.json`)

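**Output**:
- Hazard block extractions with confidence scores (`hazard_extraction_results.csv`, `hazard_extraction_high_confidence.csv`)
- Extraction QA report (summary statistics printed in Section 5.2)
- Sample RDLS records with populated hazard blocks for the top high-confidence datasets (`rdls_hzd-hdx_<id>.json`)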

**RDLS Hazard Block Structure**:
```json
"hazard": {
  "event_sets": [{
    "id": "...",
    "analysis_type": "probabilistic|deterministic|empirical",
    "hazards": [{
      "id": "...",
      "type": "flood|earthquake|...",
      "hazard_process": "fluvial_flood|ground_motion|...",
      "intensity_measure": "..."
    }],
    "events": [{
      "occurrence": {
        "probabilistic": { "return_period": 100 },
        "empirical": { "temporal": {...} }
      }
    }]
  }]
}
```

**Author**: Benny Istanto/Risk Data Librarian/GFDRR  
**Version**: 2026.1

---

## 1. Setup and Configuration

In [None]:
"""
1.1 Import Dependencies
"""

import json
import os
import re
import yaml
from pathlib import Path
from collections import defaultdict
from datetime import datetime
from typing import Dict, List, Tuple, Optional, Any, Union
from dataclasses import dataclass, field, asdict
from copy import deepcopy

import pandas as pd
import numpy as np

try:
    from tqdm.notebook import tqdm
    HAS_TQDM = True
except ImportError:
    HAS_TQDM = False

# Display settings: show every DataFrame column and allow cell text up to
# 120 characters before pandas truncates it.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 120)

# Record when this run started.
print(f"Notebook started: {datetime.now().isoformat()}")

In [None]:
"""
1.2 Define Paths
"""

# Repository root: when the current working directory is named 'notebook',
# step up one level; otherwise the working directory itself is the base.
NOTEBOOK_DIR = Path.cwd()
BASE_DIR = NOTEBOOK_DIR.parent if NOTEBOOK_DIR.name == 'notebook' else NOTEBOOK_DIR

# Input paths (all located under BASE_DIR / 'hdx_dataset_metadata_dump')
DATASET_METADATA_DIR = BASE_DIR / 'hdx_dataset_metadata_dump' / 'dataset_metadata'
SIGNAL_DICT_PATH = BASE_DIR / 'hdx_dataset_metadata_dump' / 'config' / 'signal_dictionary.yaml'
RDLS_SCHEMA_PATH = BASE_DIR / 'hdx_dataset_metadata_dump' / 'rdls' / 'schema' / 'rdls_schema_v0.3.json'
RDLS_TEMPLATE_PATH = BASE_DIR / 'hdx_dataset_metadata_dump' / 'rdls' / 'template' / 'rdls_template_v03.json'

# Output paths (the directory is created on first run, including parents)
OUTPUT_DIR = BASE_DIR / 'hdx_dataset_metadata_dump' / 'rdls' / 'extracted'
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Verify paths: fail early with the offending path in the message.
# NOTE(review): RDLS_TEMPLATE_PATH is defined above but not checked here.
assert DATASET_METADATA_DIR.exists(), f"Not found: {DATASET_METADATA_DIR}"
assert SIGNAL_DICT_PATH.exists(), f"Not found: {SIGNAL_DICT_PATH}"
assert RDLS_SCHEMA_PATH.exists(), f"Not found: {RDLS_SCHEMA_PATH}"

print(f"Base: {BASE_DIR}")
print(f"Output: {OUTPUT_DIR}")

In [None]:
"""
1.3 Load Signal Dictionary

The Signal Dictionary contains pattern-to-codelist mappings.
"""

def load_signal_dictionary(path: Path) -> Dict[str, Any]:
    """
    Load and parse the Signal Dictionary YAML.

    Parameters
    ----------
    path : Path
        Path to signal_dictionary.yaml

    Returns
    -------
    Dict[str, Any]
        Parsed signal dictionary. An empty YAML document yields an empty
        dictionary instead of None.

    Raises
    ------
    ValueError
        If the top-level YAML node is not a mapping.
    """
    with open(path, 'r', encoding='utf-8') as f:
        signal_dict = yaml.safe_load(f)

    # safe_load returns None for an empty document; callers expect a mapping
    # (they call .keys() / .get() on the result).
    if signal_dict is None:
        return {}

    if not isinstance(signal_dict, dict):
        raise ValueError(
            f"Signal dictionary must be a YAML mapping, got "
            f"{type(signal_dict).__name__}: {path}"
        )

    return signal_dict

# Parse the dictionary once; later cells read it through SIGNAL_DICT.
SIGNAL_DICT = load_signal_dictionary(SIGNAL_DICT_PATH)

# List the top-level sections as a quick sanity check of the loaded file.
print("Signal Dictionary loaded successfully.")
print(f"Sections: {list(SIGNAL_DICT.keys())}")

## 2. Core Extraction Classes

In [None]:
"""
2.1 Data Classes for Extraction Results

Strongly-typed containers for extraction outputs.
"""

@dataclass
class ExtractionMatch:
    """
    One pattern hit together with the confidence attached to it.

    Attributes
    ----------
    value : str
        RDLS codelist value the pattern maps to.
    confidence : float
        Score between 0.0 and 1.0.
    source_field : str
        HDX field in which the hit was found.
    matched_text : str
        The text fragment that matched.
    pattern : str
        The pattern that produced the hit.
    """
    value: str
    confidence: float
    source_field: str
    matched_text: str
    pattern: str

@dataclass
class HazardExtraction:
    """
    All hazard-related findings for one dataset.

    Holds the matched hazard and process types, the selected analysis type
    (if any), the return periods and intensity measures found, and an
    aggregate confidence score.
    """
    hazard_types: List[ExtractionMatch] = field(default_factory=list)
    process_types: List[ExtractionMatch] = field(default_factory=list)
    analysis_type: Optional[ExtractionMatch] = None
    return_periods: List[int] = field(default_factory=list)
    intensity_measures: List[str] = field(default_factory=list)
    overall_confidence: float = 0.0

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict view suitable for JSON serialization."""
        # Matches are dataclasses; flatten each one to a dict.
        analysis = None
        if self.analysis_type:
            analysis = asdict(self.analysis_type)

        return dict(
            hazard_types=list(map(asdict, self.hazard_types)),
            process_types=list(map(asdict, self.process_types)),
            analysis_type=analysis,
            return_periods=self.return_periods,
            intensity_measures=self.intensity_measures,
            overall_confidence=self.overall_confidence,
        )

@dataclass
class RDLSHazardBlock:
    """
    Container for an RDLS hazard block, i.e. a list of event sets.
    """
    event_sets: List[Dict[str, Any]] = field(default_factory=list)

    def to_rdls(self) -> Dict[str, Any]:
        """Wrap the event sets under the 'event_sets' key of a hazard block."""
        return dict(event_sets=self.event_sets)

print("Data classes defined.")

In [None]:
"""
2.2 Hazard Extractor Class

Main extraction engine using Signal Dictionary patterns.
"""

class HazardExtractor:
    """
    Extracts RDLS Hazard block components from HDX metadata.

    Uses pattern matching against the Signal Dictionary to identify:
    - Hazard types (flood, earthquake, etc.)
    - Hazard process types (fluvial_flood, ground_motion, etc.)
    - Analysis types (probabilistic, deterministic, empirical)
    - Return periods (numeric values)
    - Intensity measures (not implemented yet; always an empty list)

    Parameters
    ----------
    signal_dict : Dict[str, Any]
        Loaded signal dictionary. The sections read are 'hazard_type',
        'process_type', 'analysis_type' and 'return_period'; missing or
        empty sections are treated as containing no patterns.
    """

    # Confidence score mappings
    CONFIDENCE_MAP = {'high': 0.9, 'medium': 0.7, 'low': 0.5}

    # Score used when an entry has no confidence label or an unknown one
    DEFAULT_CONFIDENCE = 0.7

    def __init__(self, signal_dict: Dict[str, Any]):
        self.signal_dict = signal_dict if signal_dict is not None else {}
        self._compile_patterns()

    def _compile_section(
        self,
        section: str,
        extra_keys: Tuple[str, ...] = ()
    ) -> Dict[str, Dict[str, Any]]:
        """
        Compile every pattern of one codelist section of the dictionary.

        Parameters
        ----------
        section : str
            Top-level key of the signal dictionary (e.g. 'hazard_type')
        extra_keys : Tuple[str, ...]
            Config keys copied verbatim into each compiled entry

        Returns
        -------
        Dict[str, Dict[str, Any]]
            Codelist value -> {'compiled': [...], 'confidence': float, ...}

        Raises
        ------
        re.error
            If a pattern in the section is not a valid regular expression.
        """
        compiled_section = {}
        for value_name, config in (self.signal_dict.get(section) or {}).items():
            # A YAML key with no body parses to None; treat it as "no patterns".
            config = config or {}
            label = config.get('confidence', 'medium')
            entry = {
                'compiled': [
                    re.compile(p, re.IGNORECASE)
                    for p in (config.get('patterns') or [])
                ],
                'confidence': self.CONFIDENCE_MAP.get(label, self.DEFAULT_CONFIDENCE),
            }
            for key in extra_keys:
                entry[key] = config.get(key)
            compiled_section[value_name] = entry
        return compiled_section

    def _compile_patterns(self) -> None:
        """Pre-compile regex patterns for efficiency."""
        # Local import keeps this class independent of the file's import cell.
        import warnings

        self.hazard_patterns = self._compile_section('hazard_type')
        self.process_patterns = self._compile_section(
            'process_type', extra_keys=('parent_hazard',)
        )
        self.analysis_patterns = self._compile_section('analysis_type')

        # Return-period patterns are best effort: an invalid one is skipped,
        # but reported so a broken dictionary entry does not go unnoticed.
        self.return_period_patterns = []
        rp_config = self.signal_dict.get('return_period') or {}
        for pattern in (rp_config.get('patterns') or []):
            try:
                self.return_period_patterns.append(re.compile(pattern, re.IGNORECASE))
            except re.error as exc:
                warnings.warn(
                    f"Skipping invalid return period pattern {pattern!r}: {exc}"
                )

    @staticmethod
    def _as_text(value: Any) -> str:
        """Return *value* as a string, mapping None to the empty string."""
        if value is None:
            return ''
        return value if isinstance(value, str) else str(value)

    def _extract_text_fields(self, hdx_record: Dict[str, Any]) -> Dict[str, str]:
        """
        Extract all text fields from HDX record for pattern matching.

        Null values are normalised to empty strings so that every value in
        the result is a real string.

        Parameters
        ----------
        hdx_record : Dict[str, Any]
            HDX metadata record

        Returns
        -------
        Dict[str, str]
            Field name -> text content mapping
        """
        as_text = self._as_text

        # Tags may be plain strings or dicts carrying the label under 'name'
        # (presumably the CKAN tag shape -- verify against the metadata dump).
        raw_tags = hdx_record.get('tags') or []
        if isinstance(raw_tags, str):
            raw_tags = [raw_tags]
        tag_names = []
        for tag in raw_tags:
            if isinstance(tag, dict):
                tag = tag.get('name') or tag.get('display_name')
            tag_text = as_text(tag)
            if tag_text:
                tag_names.append(tag_text)

        fields = {
            'title': as_text(hdx_record.get('title')),
            'name': as_text(hdx_record.get('name')),
            'notes': as_text(hdx_record.get('notes')),
            'tags': ' '.join(tag_names),
            'methodology': as_text(hdx_record.get('methodology_other')),
        }

        # Add resource names and descriptions; null values must not leak the
        # literal text "None" into the searchable string.
        fields['resources'] = ' '.join(
            f"{as_text(r.get('name'))} {as_text(r.get('description'))}"
            for r in (hdx_record.get('resources') or [])
            if isinstance(r, dict)
        )

        return fields

    @staticmethod
    def _first_hit(compiled_patterns, text_fields: Dict[str, str]):
        """
        Return (field_name, pattern, match) for the first hit, else None.

        Patterns are tried in order, and for each pattern the fields are
        scanned in order.
        """
        for compiled_pattern in compiled_patterns:
            for field_name, text in text_fields.items():
                if not text:
                    continue
                match = compiled_pattern.search(text)
                if match:
                    return field_name, compiled_pattern, match
        return None

    def _match_patterns(
        self,
        text_fields: Dict[str, str],
        patterns: Dict[str, Dict]
    ) -> "List[ExtractionMatch]":
        """
        Match text against pattern dictionary.

        Parameters
        ----------
        text_fields : Dict[str, str]
            Field name -> text content
        patterns : Dict[str, Dict]
            Pattern configuration dictionary

        Returns
        -------
        List[ExtractionMatch]
            At most one match per codelist value, in dictionary order
        """
        found = []

        for value_name, config in patterns.items():
            # All patterns of one value share one confidence, so the first
            # hit is the one to keep and the remaining patterns can be skipped.
            hit = self._first_hit(config['compiled'], text_fields)
            if hit is None:
                continue
            field_name, compiled_pattern, match = hit
            found.append(ExtractionMatch(
                value=value_name,
                confidence=config['confidence'],
                source_field=field_name,
                matched_text=match.group(0),
                pattern=compiled_pattern.pattern
            ))

        return found

    def _extract_return_periods(self, text_fields: Dict[str, str]) -> List[int]:
        """
        Extract numeric return period values from text.

        Parameters
        ----------
        text_fields : Dict[str, str]
            Field name -> text content

        Returns
        -------
        List[int]
            Sorted list of unique return period values inside the valid
            range configured in the dictionary (default 1 to 100000)
        """
        rp_values = set()
        rp_config = self.signal_dict.get('return_period') or {}
        valid_range = rp_config.get('valid_range') or {}
        valid_min = valid_range.get('min', 1)
        valid_max = valid_range.get('max', 100000)

        # Coerce defensively: a None value would make str.join raise.
        all_text = ' '.join(self._as_text(text) for text in text_fields.values())

        for pattern in self.return_period_patterns:
            for match in pattern.finditer(all_text):
                # Try each capture group; non-numeric groups are ignored.
                for group in match.groups():
                    if not group:
                        continue
                    try:
                        value = int(group)
                    except ValueError:
                        continue
                    if valid_min <= value <= valid_max:
                        rp_values.add(value)

        return sorted(rp_values)

    def extract(self, hdx_record: Dict[str, Any]) -> "HazardExtraction":
        """
        Extract hazard information from HDX record.

        Parameters
        ----------
        hdx_record : Dict[str, Any]
            HDX metadata record

        Returns
        -------
        HazardExtraction
            Extraction results with confidence scores
        """
        text_fields = self._extract_text_fields(hdx_record)

        # Extract each component
        hazard_types = self._match_patterns(text_fields, self.hazard_patterns)
        process_types = self._match_patterns(text_fields, self.process_patterns)
        analysis_matches = self._match_patterns(text_fields, self.analysis_patterns)
        return_periods = self._extract_return_periods(text_fields)

        # Select best analysis type match
        analysis_type = None
        if analysis_matches:
            analysis_type = max(analysis_matches, key=lambda x: x.confidence)

        # If return periods found but no analysis type, assume probabilistic
        if return_periods and not analysis_type:
            analysis_type = ExtractionMatch(
                value='probabilistic',
                confidence=0.8,
                source_field='inferred',
                matched_text='return_period_present',
                pattern='inferred_from_rp'
            )

        # Overall confidence is the mean over hazard-type scores, the
        # analysis-type score and a fixed 0.9 when return periods exist.
        # Process types do not contribute.
        confidences = [m.confidence for m in hazard_types]
        if analysis_type:
            confidences.append(analysis_type.confidence)
        if return_periods:
            confidences.append(0.9)  # Return periods are strong signals

        # float() converts numpy's float64 to the plain float the result declares.
        overall_confidence = float(np.mean(confidences)) if confidences else 0.0

        return HazardExtraction(
            hazard_types=hazard_types,
            process_types=process_types,
            analysis_type=analysis_type,
            return_periods=return_periods,
            intensity_measures=[],  # TODO: Implement intensity extraction
            overall_confidence=overall_confidence
        )

# Initialize extractor: patterns from SIGNAL_DICT are compiled once here and
# reused by every later extract() call.
extractor = HazardExtractor(SIGNAL_DICT)
print(f"HazardExtractor initialized.")
# Number of codelist values that have compiled patterns, per category.
print(f"  - Hazard types: {len(extractor.hazard_patterns)}")
print(f"  - Process types: {len(extractor.process_patterns)}")
print(f"  - Analysis types: {len(extractor.analysis_patterns)}")

## 3. RDLS Hazard Block Builder

In [None]:
"""
3.1 Build RDLS Hazard Block from Extraction

Convert extraction results into RDLS-compliant hazard block structure.
"""

def build_hazard_block(
    extraction: "HazardExtraction",
    dataset_id: str
) -> Optional[Dict[str, Any]]:
    """
    Build RDLS hazard block from extraction results.

    Parameters
    ----------
    extraction : HazardExtraction
        Extraction results from HazardExtractor
    dataset_id : str
        Dataset identifier; its first 8 characters are embedded in the
        generated IDs. A missing (None/empty) identifier falls back to
        'unknown' instead of raising.

    Returns
    -------
    Optional[Dict[str, Any]]
        RDLS hazard block with a single event set, or None if no hazard
        type was extracted
    """
    # Need at least one hazard type to create a block
    if not extraction.hazard_types:
        return None

    short_id = str(dataset_id)[:8] if dataset_id else 'unknown'

    # Build hazards array
    hazards = []
    for position, hazard_match in enumerate(extraction.hazard_types, start=1):
        hazard_entry = {
            'id': f"hazard_{short_id}_{position}",
            'type': hazard_match.value
        }

        # Attach the first process type whose name contains the hazard type
        # (e.g. 'fluvial_flood' for 'flood'). Process types named differently
        # from their hazard (e.g. 'ground_motion') are not linked here.
        related_process = next(
            (pt.value for pt in extraction.process_types
             if hazard_match.value in pt.value),
            None
        )
        if related_process is not None:
            hazard_entry['hazard_process'] = related_process

        hazards.append(hazard_entry)

    # Build event set
    event_set = {
        'id': f"event_set_{short_id}",
        'hazards': hazards
    }

    # Add analysis type if available
    if extraction.analysis_type:
        event_set['analysis_type'] = extraction.analysis_type.value

    # Build events from return periods
    if extraction.return_periods:
        events = [
            {
                'id': f"event_rp{rp}_{short_id}",
                'calculation_method': 'simulated',
                # Each event gets its own copy of the first hazard so that
                # editing one event cannot silently change the others.
                'hazard': dict(hazards[0]),
                'occurrence': {
                    'probabilistic': {
                        'return_period': rp
                    }
                }
            }
            for rp in extraction.return_periods
        ]

        event_set['events'] = events
        event_set['event_count'] = len(events)
        event_set['occurrence_range'] = (
            f"Return periods: {min(extraction.return_periods)} "
            f"to {max(extraction.return_periods)} years"
        )

    return {
        'event_sets': [event_set]
    }

print("Hazard block builder defined.")

## 4. Test Extraction on Sample Records

In [None]:
"""
4.1 Load Sample HDX Records for Testing

Load records with known hazard content for validation.
"""

# Sample records with expected hazard content
SAMPLE_FILES = [
    '0ab99df0-17d4-4582-9e16-790308905993__tsunami-hazard-run-up-rp-500-years.json',  # Tsunami, RP500
    '02265908-5038-4021-bb65-d2b123a1431c__gar15-global-exposure-dataset-for-papua-new-guinea.json',  # Exposure
    '0454eb6a-d0df-4025-9e69-43fa918beb0c__bangladesh-level-1-exposure-data.json',  # Exposure
]

# Also find some flood-related files. glob() yields files in arbitrary order,
# so sort before slicing to get the same five files on every run.
flood_files = sorted(DATASET_METADATA_DIR.glob('*flood*.json'))[:5]
cyclone_files = sorted(DATASET_METADATA_DIR.glob('*cyclone*.json'))[:5]
earthquake_files = sorted(DATASET_METADATA_DIR.glob('*earthquake*.json'))[:5]

# Load samples: the hand-picked files first (skipped when absent from the
# dump), then the globbed ones. A file whose name matches several keywords is
# loaded only once.
sample_records = []
loaded_paths = set()

candidate_paths = [DATASET_METADATA_DIR / filename for filename in SAMPLE_FILES]
candidate_paths += flood_files + cyclone_files + earthquake_files

for filepath in candidate_paths:
    if filepath in loaded_paths or not filepath.exists():
        continue
    loaded_paths.add(filepath)
    with open(filepath, 'r', encoding='utf-8') as f:
        sample_records.append(json.load(f))

print(f"Loaded {len(sample_records)} sample records for testing.")

In [None]:
"""
4.2 Run Extraction on Samples
"""

print("=" * 80)
print("HAZARD EXTRACTION TEST RESULTS")
print("=" * 80)

# One entry per sample record: id, truncated title and the extraction object.
extraction_results = []

for record in sample_records:
    extraction = extractor.extract(record)

    # The title key may be present with a null value; normalise before
    # slicing so a null title cannot raise TypeError.
    title = record.get('title') or ''

    result = {
        'id': record.get('id'),
        'title': title[:70],
        'extraction': extraction
    }
    extraction_results.append(result)

    # Display results
    print(f"\n{'─' * 80}")
    print(f"Title: {title[:75]}")
    print(f"ID: {record.get('id')}")
    print(f"\nExtraction Results (confidence: {extraction.overall_confidence:.2f}):")

    if extraction.hazard_types:
        print("  Hazard Types:")
        for ht in extraction.hazard_types:
            print(f"    - {ht.value} (conf: {ht.confidence:.1f}, from: {ht.source_field}, match: '{ht.matched_text}')")
    else:
        print("  Hazard Types: None detected")

    if extraction.process_types:
        print("  Process Types:")
        for pt in extraction.process_types:
            print(f"    - {pt.value} (conf: {pt.confidence:.1f})")

    if extraction.analysis_type:
        print(f"  Analysis Type: {extraction.analysis_type.value} (conf: {extraction.analysis_type.confidence:.1f})")

    if extraction.return_periods:
        print(f"  Return Periods: {extraction.return_periods}")

In [None]:
"""
4.3 Build RDLS Hazard Blocks for Samples
"""

print("\n" + "=" * 80)
print("GENERATED RDLS HAZARD BLOCKS")
print("=" * 80)

# Print the generated hazard block for every sample that has a hazard type.
for result in extraction_results:
    if not result['extraction'].hazard_types:
        continue

    hazard_block = build_hazard_block(
        result['extraction'],
        result['id']
    )
    if not hazard_block:
        continue

    print(f"\n{'─' * 80}")
    print(f"Dataset: {result['title']}")
    print(f"\nHazard Block:")
    print(json.dumps(hazard_block, indent=2))

## 5. Batch Processing

In [None]:
"""
5.1 Process All HDX Records

Run extraction on full corpus and collect statistics.
"""

def process_all_records(
    metadata_dir: Path,
    extractor: HazardExtractor,
    limit: Optional[int] = None
) -> Tuple[pd.DataFrame, List[Dict]]:
    """
    Process all HDX records and extract hazard information.

    Files are processed in sorted filename order, so a given ``limit``
    always selects the same files. A file that cannot be read, parsed or
    extracted does not abort the batch: it produces a row with empty
    extraction fields and the failure text in the 'error' column.

    Parameters
    ----------
    metadata_dir : Path
        Directory containing HDX JSON files
    extractor : HazardExtractor
        Initialized extractor instance
    limit : Optional[int]
        Maximum files to process (None = all)

    Returns
    -------
    Tuple[pd.DataFrame, List[Dict]]
        - Summary DataFrame with extraction statistics; every row has the
          same columns, including 'error' (None on success)
        - List of full extraction results (the rows as dictionaries)
    """
    def _blank_row(record_id: Any) -> Dict[str, Any]:
        # Shared row shape, so success and error rows give identical columns.
        return {
            'id': record_id,
            'title': None,
            'organization': None,
            'hazard_types': [],
            'process_types': [],
            'analysis_type': None,
            'return_periods': [],
            'overall_confidence': None,
            'has_hazard': False,
            'has_return_period': False,
            'extraction': None,
            'error': None,
        }

    # glob() order is arbitrary; sort so that `limit` is reproducible.
    json_files = sorted(metadata_dir.glob('*.json'))
    if limit is not None:
        json_files = json_files[:limit]

    results = []
    iterator = tqdm(json_files, desc="Extracting") if HAS_TQDM else json_files

    for filepath in iterator:
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                record = json.load(f)

            extraction = extractor.extract(record)

            row = _blank_row(record.get('id'))
            row.update({
                'title': record.get('title'),
                'organization': record.get('organization'),
                'hazard_types': [m.value for m in extraction.hazard_types],
                'process_types': [m.value for m in extraction.process_types],
                'analysis_type': extraction.analysis_type.value if extraction.analysis_type else None,
                'return_periods': extraction.return_periods,
                'overall_confidence': extraction.overall_confidence,
                'has_hazard': len(extraction.hazard_types) > 0,
                'has_return_period': len(extraction.return_periods) > 0,
                'extraction': extraction,
            })

        except Exception as e:
            # Batch boundary: one malformed file must not stop the run. The
            # failure is recorded on the row rather than swallowed.
            row = _blank_row(filepath.stem)
            row['error'] = f"{type(e).__name__}: {e}"

        results.append(row)

    # Create summary DataFrame; explicit columns keep the schema stable even
    # when no file was processed.
    df = pd.DataFrame(results, columns=list(_blank_row(None)))

    return df, results

# Process (set limit for testing)
PROCESS_LIMIT = 1000  # Set to None for full corpus

print(f"Processing {'all' if PROCESS_LIMIT is None else PROCESS_LIMIT} records...")
# df_results is the tabular summary; full_results is the list of row
# dictionaries returned alongside it.
df_results, full_results = process_all_records(DATASET_METADATA_DIR, extractor, limit=PROCESS_LIMIT)

In [None]:
"""
5.2 Extraction Statistics Summary
"""

print("=" * 60)
print("EXTRACTION STATISTICS")
print("=" * 60)

# Rows for files that failed to load may carry no flag (NaN). .eq(True) maps
# those to False, giving a clean boolean mask that is safe for indexing.
has_hazard_mask = df_results['has_hazard'].eq(True)
has_rp_mask = df_results['has_return_period'].eq(True)

total = len(df_results)
with_hazard = int(has_hazard_mask.sum())
with_rp = int(has_rp_mask.sum())


def _percent_of_total(count):
    """Share of all processed records in percent; 0.0 for an empty run."""
    return count / total * 100 if total else 0.0


print(f"\nTotal records processed: {total:,}")
print(f"Records with hazard extraction: {with_hazard:,} ({_percent_of_total(with_hazard):.1f}%)")
print(f"Records with return periods: {with_rp:,} ({_percent_of_total(with_rp):.1f}%)")

# Hazard type distribution (each record may contribute several types)
hazard_counts = {}
for hazards in df_results['hazard_types'].dropna():
    for h in hazards:
        hazard_counts[h] = hazard_counts.get(h, 0) + 1

print(f"\nHazard Type Distribution:")
for hazard, count in sorted(hazard_counts.items(), key=lambda x: x[1], reverse=True):
    print(f"  {hazard}: {count}")

# Confidence distribution, restricted to records with at least one hazard type
print(f"\nConfidence Score Distribution:")
conf_bins = df_results.loc[has_hazard_mask, 'overall_confidence']
print(f"  Mean: {conf_bins.mean():.2f}")
print(f"  Median: {conf_bins.median():.2f}")
print(f"  High (>=0.8): {(conf_bins >= 0.8).sum()}")
print(f"  Medium (0.5-0.8): {((conf_bins >= 0.5) & (conf_bins < 0.8)).sum()}")
print(f"  Low (<0.5): {(conf_bins < 0.5).sum()}")

## 6. Export Results

In [None]:
"""
6.1 Export Extraction Summary
"""

# Prepare export DataFrame
export_df = df_results[[
    'id', 'title', 'organization', 'hazard_types', 'process_types',
    'analysis_type', 'return_periods', 'overall_confidence', 'has_hazard'
]].copy()

# Convert lists to pipe-separated strings; non-list cells (e.g. NaN on rows
# that failed to load) become empty strings.
for list_column in ('hazard_types', 'process_types', 'return_periods'):
    export_df[list_column] = export_df[list_column].apply(
        lambda x: '|'.join(map(str, x)) if isinstance(x, list) else ''
    )

# Save
output_file = OUTPUT_DIR / 'hazard_extraction_results.csv'
export_df.to_csv(output_file, index=False)
print(f"Saved: {output_file}")

# Save high-confidence extractions. .eq(True) turns missing flags into False
# explicitly instead of relying on how pandas coerces NaN in a logical AND.
high_conf_mask = (
    df_results['has_hazard'].eq(True)
    & (df_results['overall_confidence'] >= 0.8)
)
high_conf = export_df[high_conf_mask]
high_conf_file = OUTPUT_DIR / 'hazard_extraction_high_confidence.csv'
high_conf.to_csv(high_conf_file, index=False)
print(f"Saved: {high_conf_file} ({len(high_conf)} records)")

In [None]:
"""
6.2 Generate Sample RDLS Records with Hazard Blocks

Create complete RDLS JSON records for high-confidence extractions.
"""

# Select top 10 high-confidence records. .eq(True) turns missing flags (rows
# that failed to load) into False explicitly.
top_records = df_results[
    df_results['has_hazard'].eq(True) &
    (df_results['overall_confidence'] >= 0.8)
].nlargest(10, 'overall_confidence')

print(f"\nGenerating {len(top_records)} sample RDLS records...")

for idx, row in top_records.iterrows():
    extraction = row['extraction']
    hazard_block = build_hazard_block(extraction, row['id'])

    if hazard_block:
        # First 8 characters of the dataset id, used in both the record id
        # and the file name. str() guards against a non-string id.
        short_id = str(row['id'])[:8]

        # Create minimal RDLS record
        rdls_record = {
            'datasets': [{
                'id': f"rdls_hzd-hdx_{short_id}",
                'title': row['title'],
                'risk_data_type': ['hazard'],
                'hazard': hazard_block,
                'links': [{
                    'href': 'https://docs.riskdatalibrary.org/en/0__3__0/rdls_schema.json',
                    'rel': 'describedby'
                }]
            }]
        }

        # Save
        output_path = OUTPUT_DIR / f"rdls_hzd-hdx_{short_id}.json"
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(rdls_record, f, indent=2)

        print(f"  Created: {output_path.name}")

print(f"\nSample RDLS records saved to: {OUTPUT_DIR}")

## 7. Next Steps

This notebook demonstrates the Hazard extraction pipeline. Next steps:

1. **Notebook 10**: Exposure Block Extractor (similar pattern-based approach)
2. **Notebook 11**: Vulnerability/Loss Block Extractor (more complex, lower coverage expected)
3. **Notebook 12**: Integration - merge extractions with existing general metadata
4. **Notebook 13**: Validation against RDLS schema and QA reporting

In [None]:
print(f"\nNotebook completed: {datetime.now().isoformat()}")