# Notebook 12: RDLS HEVL Integration

**Purpose**: Integrate HEVL extractions with existing general metadata to produce complete RDLS v0.3 records.

**Process**:
1. Load existing RDLS general metadata (from Notebook 06)
2. Load HEVL extractions (from Notebooks 09-11)
3. Merge extractions into complete RDLS records
4. Apply component gating rules (V/L require H or E)
5. Generate final RDLS JSON files

**Author**: Benny Istanto/Risk Data Librarian/GFDRR  
**Version**: 2026.1

---

## 1. Setup

In [None]:
"""
1.1 Import Dependencies
"""

import json
import re
import yaml
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Optional, Any, Tuple
from copy import deepcopy

import pandas as pd
import numpy as np

try:
    from tqdm.notebook import tqdm
    HAS_TQDM = True
except ImportError:
    HAS_TQDM = False

pd.set_option('display.max_columns', None)

print(f"Notebook started: {datetime.now().isoformat()}")

In [None]:
"""
1.2 Configure Paths
"""

NOTEBOOK_DIR = Path.cwd()

# When the kernel runs inside a folder named 'notebook', the project root is
# its parent; otherwise the working directory itself is used as the root.
if NOTEBOOK_DIR.name == 'notebook':
    BASE_DIR = NOTEBOOK_DIR.parent
else:
    BASE_DIR = NOTEBOOK_DIR

# --- Inputs -----------------------------------------------------------------
DATASET_METADATA_DIR = BASE_DIR.joinpath('hdx_dataset_metadata_dump', 'dataset_metadata')
SIGNAL_DICT_PATH = BASE_DIR.joinpath('hdx_dataset_metadata_dump', 'config', 'signal_dictionary.yaml')
RDLS_TEMPLATE_PATH = BASE_DIR.joinpath('hdx_dataset_metadata_dump', 'rdls', 'template', 'rdls_template_v03.json')
RDLS_SCHEMA_PATH = BASE_DIR.joinpath('hdx_dataset_metadata_dump', 'rdls', 'schema', 'rdls_schema_v0.3.json')

# --- Extraction results written by the previous notebooks -------------------
EXTRACTED_DIR = BASE_DIR.joinpath('hdx_dataset_metadata_dump', 'rdls', 'extracted')
HAZARD_RESULTS = EXTRACTED_DIR.joinpath('hazard_extraction_results.csv')
EXPOSURE_RESULTS = EXTRACTED_DIR.joinpath('exposure_extraction_results.csv')
VL_RESULTS = EXTRACTED_DIR.joinpath('vuln_loss_extraction_results.csv')

# --- Classification table (Notebook 05) -------------------------------------
CLASSIFICATION_FILE = BASE_DIR.joinpath('hdx_dataset_metadata_dump', 'derived', 'classification_final.csv')

# --- Outputs ----------------------------------------------------------------
OUTPUT_DIR = BASE_DIR.joinpath('hdx_dataset_metadata_dump', 'rdls', 'integrated')
# Create the output folder (and any missing parents) up front.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print(f"Base: {BASE_DIR}")
print(f"Output: {OUTPUT_DIR}")

In [None]:
"""
1.3 Load Resources
"""

# Signal dictionary (YAML), parsed with the safe loader.
SIGNAL_DICT = yaml.safe_load(SIGNAL_DICT_PATH.read_text(encoding='utf-8'))

# RDLS record template (JSON).
RDLS_TEMPLATE = json.loads(RDLS_TEMPLATE_PATH.read_text(encoding='utf-8'))

# RDLS JSON schema, loaded for validation.
RDLS_SCHEMA = json.loads(RDLS_SCHEMA_PATH.read_text(encoding='utf-8'))

print("Resources loaded.")

## 2. Load Extraction Results

In [None]:
"""
2.1 Load HEVL Extraction CSVs
"""

def load_extraction_csv(path: Path) -> Optional[pd.DataFrame]:
    """
    Read one extraction CSV into a DataFrame, or report that it is missing.

    Parameters
    ----------
    path : Path
        Location of the CSV file.

    Returns
    -------
    Optional[pd.DataFrame]
        The parsed table, or None when nothing exists at `path`.
    """
    # Guard clause: a missing file is reported on stdout, not raised.
    if not path.exists():
        print(f"Not found: {path.name}")
        return None

    frame = pd.read_csv(path)
    print(f"Loaded {path.name}: {len(frame)} records")
    return frame

# Load the three HEVL extraction tables in order (hazard, exposure,
# vulnerability/loss); each entry is None when its CSV is absent.
df_hazard, df_exposure, df_vl = map(
    load_extraction_csv,
    (HAZARD_RESULTS, EXPOSURE_RESULTS, VL_RESULTS),
)

# The classification table is optional and loaded the same way.
df_classification = load_extraction_csv(CLASSIFICATION_FILE)

In [None]:
"""
2.2 Merge Extraction Results
"""

def merge_extractions(
    hazard_df: Optional[pd.DataFrame],
    exposure_df: Optional[pd.DataFrame],
    vl_df: Optional[pd.DataFrame]
) -> pd.DataFrame:
    """
    Merge HEVL extractions into a single DataFrame keyed on 'id'.

    Parameters
    ----------
    hazard_df : Optional[pd.DataFrame]
        Hazard extraction results; must contain 'id', 'title', 'organization',
        'hazard_types', 'analysis_type', 'return_periods', 'has_hazard'.
    exposure_df : Optional[pd.DataFrame]
        Exposure extraction results; must contain 'id', 'categories',
        'taxonomy', 'has_exposure'.
    vl_df : Optional[pd.DataFrame]
        Vulnerability/loss extraction results; must contain 'id',
        'has_vulnerability', 'has_socioeconomic', 'has_loss', 'loss_types',
        'loss_hazard'.

    Returns
    -------
    pd.DataFrame
        Outer join of the supplied tables on 'id'. The five 'has_*' flag
        columns are always present with a real bool dtype (missing values and
        flags from tables that were not supplied become False). An empty
        DataFrame is returned when all three inputs are None.

    Raises
    ------
    KeyError
        If a supplied table lacks one of the required columns.
    """
    frames = []

    # Tables are joined in the fixed order hazard -> exposure -> V/L.
    if hazard_df is not None:
        frames.append(
            hazard_df[['id', 'title', 'organization', 'hazard_types',
                       'analysis_type', 'return_periods', 'has_hazard']]
            .rename(columns={'analysis_type': 'hazard_analysis_type'})
        )

    if exposure_df is not None:
        frames.append(
            exposure_df[['id', 'categories', 'taxonomy', 'has_exposure']]
            .rename(columns={'categories': 'exposure_categories',
                             'taxonomy': 'exposure_taxonomy'})
        )

    if vl_df is not None:
        frames.append(
            vl_df[['id', 'has_vulnerability', 'has_socioeconomic',
                   'has_loss', 'loss_types', 'loss_hazard']].copy()
        )

    if not frames:
        return pd.DataFrame()

    # Outer join keeps datasets that appear in only some of the tables.
    merged = frames[0]
    for frame in frames[1:]:
        merged = merged.merge(frame, on='id', how='outer')

    # The outer join leaves NaN in flag columns for ids missing from a table,
    # which turns them into object/float columns. Normalise every flag to a
    # real bool column (NaN -> False) so that `.sum()`, `|` and truthiness
    # checks downstream behave predictably. Flags belonging to a table that
    # was not supplied are created as all-False.
    bool_cols = ['has_hazard', 'has_exposure', 'has_vulnerability',
                 'has_socioeconomic', 'has_loss']
    for col in bool_cols:
        if col in merged.columns:
            merged[col] = (
                merged[col]
                .map(lambda v: False if pd.isna(v) else bool(v))
                .astype(bool)
            )
        else:
            merged[col] = False

    return merged

# Combine whichever extraction tables were loaded (None entries are skipped
# by merge_extractions) and report the size and columns of the result.
df_merged = merge_extractions(df_hazard, df_exposure, df_vl)
print(f"\nMerged DataFrame: {len(df_merged)} records")
print(f"Columns: {list(df_merged.columns)}")

In [None]:
"""
2.3 Analyze HEVL Coverage
"""

print("=" * 60)
print("HEVL EXTRACTION COVERAGE")
print("=" * 60)

if len(df_merged) > 0:
    total = len(df_merged)

    def _component_flags(column: str) -> pd.Series:
        """Return `column` of df_merged as a bool Series (all False if absent)."""
        # `DataFrame.get(column, False)` would hand back a plain bool when the
        # column is missing, which has no `.apply`/`.sum`; build a Series
        # instead. NaN is treated as "component not detected".
        if column in df_merged.columns:
            return (
                df_merged[column]
                .map(lambda v: False if pd.isna(v) else bool(v))
                .astype(bool)
            )
        return pd.Series(False, index=df_merged.index, dtype=bool)

    h_flags = _component_flags('has_hazard')
    e_flags = _component_flags('has_exposure')
    v_flags = _component_flags('has_vulnerability')
    l_flags = _component_flags('has_loss')

    # Count each component
    h_count = int(h_flags.sum())
    e_count = int(e_flags.sum())
    v_count = int(v_flags.sum())
    l_count = int(l_flags.sum())

    print(f"\nTotal records: {total:,}")
    print(f"\nComponent Coverage:")
    print(f"  Hazard (H):       {h_count:>6,} ({h_count/total*100:>5.1f}%)")
    print(f"  Exposure (E):     {e_count:>6,} ({e_count/total*100:>5.1f}%)")
    print(f"  Vulnerability (V):{v_count:>6,} ({v_count/total*100:>5.1f}%)")
    print(f"  Loss (L):         {l_count:>6,} ({l_count/total*100:>5.1f}%)")

    # HEVL combinations: a four-character code such as 'HE--', one position
    # per component, '-' where the component was not detected.
    df_merged['hevl_combo'] = (
        h_flags.map(lambda x: 'H' if x else '-') +
        e_flags.map(lambda x: 'E' if x else '-') +
        v_flags.map(lambda x: 'V' if x else '-') +
        l_flags.map(lambda x: 'L' if x else '-')
    )

    print(f"\nHEVL Combinations:")
    for combo, count in df_merged['hevl_combo'].value_counts().head(10).items():
        print(f"  {combo}: {count:>6,}")

## 3. RDLS Integration Logic

In [None]:
"""
3.1 Component Gating Rules

RDLS business rules:
- Vulnerability data should be associated with Hazard and/or Exposure
- Loss data should be associated with Hazard and/or Exposure
"""

def apply_gating_rules(row: pd.Series) -> Dict[str, Any]:
    """
    Apply RDLS component gating rules.

    Hazard and exposure are always passed through. Vulnerability and loss are
    only included when the same record also has hazard or exposure; otherwise
    the record is marked as blocked and the reason is recorded.

    Parameters
    ----------
    row : pd.Series
        Row from merged DataFrame. The flags 'has_hazard', 'has_exposure',
        'has_vulnerability' and 'has_loss' are read; a missing or NaN/None
        flag counts as False.

    Returns
    -------
    Dict[str, Any]
        Keys 'include_hazard', 'include_exposure', 'include_vulnerability',
        'include_loss', 'blocked' (all plain bool) and 'block_reason'
        (str, or None when the record is not blocked).
    """
    def _flag(name: str) -> bool:
        # NaN is truthy in Python, so an unfilled flag would otherwise read
        # as "present"; coerce missing values to False explicitly.
        value = row.get(name, False)
        return False if pd.isna(value) else bool(value)

    has_h = _flag('has_hazard')
    has_e = _flag('has_exposure')
    has_v = _flag('has_vulnerability')
    has_l = _flag('has_loss')

    # V and L both need H or E as context.
    has_context = has_h or has_e

    # One reason per gated component that lacks context, in V-then-L order.
    reasons = []
    if has_v and not has_context:
        reasons.append('vulnerability_without_hazard_or_exposure')
    if has_l and not has_context:
        reasons.append('loss_without_hazard_or_exposure')

    return {
        'include_hazard': has_h,
        'include_exposure': has_e,
        'include_vulnerability': has_v and has_context,
        'include_loss': has_l and has_context,
        'blocked': bool(reasons),
        'block_reason': '_and_'.join(reasons) if reasons else None
    }

# Apply gating
if len(df_merged) > 0:
    gating_results = df_merged.apply(apply_gating_rules, axis=1)

    # Reuse df_merged's index so the column-wise concat lines up row-for-row
    # even if the index is not a clean 0..n-1 range.
    df_gating = pd.DataFrame(gating_results.tolist(), index=df_merged.index)

    # Drop gating columns left over from an earlier execution of this cell;
    # otherwise re-running it would create duplicate column names and
    # `row.get('blocked')` would return a Series instead of a scalar.
    df_merged = pd.concat(
        [df_merged.drop(columns=df_gating.columns, errors='ignore'), df_gating],
        axis=1
    )

    print("Gating rules applied.")
    print(f"Blocked records: {df_merged['blocked'].sum()}")

In [None]:
"""
3.2 Determine Primary risk_data_type
"""

def determine_risk_data_types(row: pd.Series) -> List[str]:
    """
    List the RDLS risk_data_type values enabled for a record.

    Each 'include_*' gating flag that is truthy on `row` contributes its
    component name; the output order is always hazard, exposure,
    vulnerability, loss.
    """
    flag_to_component = (
        ('include_hazard', 'hazard'),
        ('include_exposure', 'exposure'),
        ('include_vulnerability', 'vulnerability'),
        ('include_loss', 'loss'),
    )
    return [
        component
        for flag, component in flag_to_component
        if row.get(flag, False)
    ]

def determine_filename_prefix(risk_types: List[str]) -> str:
    """
    Pick the RDLS filename prefix for a record from its primary component.

    The first component found in `risk_types`, checked in the priority order
    hazard > exposure > vulnerability > loss, decides the prefix. When none
    of the four is present the prefix is 'rdls_unk'.
    """
    prefix_by_priority = (
        ('hazard', 'rdls_hzd'),
        ('exposure', 'rdls_exp'),
        ('vulnerability', 'rdls_vln'),
        ('loss', 'rdls_lss'),
    )
    for component, prefix in prefix_by_priority:
        if component in risk_types:
            return prefix
    return 'rdls_unk'

print("Risk data type logic defined.")

## 4. RDLS Record Builder

In [None]:
"""
4.1 HDX to RDLS General Metadata Mapper
"""

def map_hdx_to_rdls_general(
    hdx_record: Dict[str, Any],
    risk_data_types: List[str]
) -> Dict[str, Any]:
    """
    Map HDX metadata to RDLS general metadata fields.

    Parameters
    ----------
    hdx_record : Dict[str, Any]
        Original HDX metadata. The keys read are 'id', 'title', 'notes',
        'groups', 'license_title', 'organization', 'dataset_source' and
        'resources'; keys that are absent, null or empty fall back to the
        defaults used below.
    risk_data_types : List[str]
        Determined risk_data_type values

    Returns
    -------
    Dict[str, Any]
        RDLS dataset record (general metadata only)
    """
    # `x.get(key) or default` (rather than `x.get(key, default)`) so that a
    # key that is present but null/empty in the HDX JSON also gets the default.
    dataset_id = str(hdx_record.get('id') or '')
    # NOTE(review): ids are shortened to their first 8 characters, so two
    # datasets sharing that prefix would receive the same RDLS id — confirm
    # this is acceptable for the full corpus.
    short_id = dataset_id[:8]
    prefix = determine_filename_prefix(risk_data_types)

    rdls = {
        'id': f"{prefix}-hdx_{short_id}",
        'title': hdx_record.get('title') or '',
        'description': hdx_record.get('notes') or '',
        'risk_data_type': risk_data_types,
    }

    # Version (default)
    rdls['version'] = '1'

    # Spatial
    # NOTE(review): assumes 'groups' is a list of plain group-name strings;
    # verify against the metadata dump format.
    groups = hdx_record.get('groups') or []
    if groups:
        # Try to determine scale from groups
        if 'World' in groups:
            rdls['spatial'] = {'scale': 'global'}
        elif len(groups) > 3:
            rdls['spatial'] = {'scale': 'regional'}
        else:
            rdls['spatial'] = {'scale': 'national'}

    # License mapping
    # NOTE(review): target codes are assumed to match the RDLS license
    # codelist — confirm the exact spelling of each code against the schema.
    license_map = {
        'Creative Commons Attribution International': 'CC-BY-4.0',
        'Creative Commons Attribution for Intergovernmental Organisations': 'CC-BY-IGO-3.0',
        'Public Domain': 'CC0-1.0',
        'Other': 'Other',
    }
    hdx_license = hdx_record.get('license_title', '')
    # Unmapped titles are passed through unchanged; a missing title is 'Other'.
    rdls['license'] = license_map.get(hdx_license, hdx_license or 'Other')

    # Attributions
    # NOTE(review): assumes 'organization' is a display-name string.
    org = hdx_record.get('organization') or ''
    # Creator falls back to the organization, then 'Unknown', also when
    # 'dataset_source' is present but empty.
    creator = hdx_record.get('dataset_source') or org or 'Unknown'
    rdls['attributions'] = [
        {
            'id': 'attribution_publisher',
            'role': 'publisher',
            'entity': {'name': org or 'HDX'}
        },
        {
            'id': 'attribution_creator',
            'role': 'creator',
            'entity': {'name': creator}
        },
        {
            'id': 'attribution_contact',
            'role': 'contact_point',
            'entity': {'name': org or 'HDX'}
        }
    ]

    # Resources: one RDLS resource per HDX resource, numbered from 1.
    resources = [
        {
            'id': f"resource_{short_id}_{position}",
            'title': res.get('name', ''),
            'description': res.get('description', ''),
            'data_format': res.get('format', ''),
            # The access URL points at the dataset landing page (full id).
            'access_url': f"https://data.humdata.org/dataset/{dataset_id}",
            'download_url': res.get('download_url', '')
        }
        for position, res in enumerate(hdx_record.get('resources') or [], start=1)
    ]

    if resources:
        rdls['resources'] = resources

    # Links (schema reference)
    rdls['links'] = [{
        'href': 'https://docs.riskdatalibrary.org/en/0__3__0/rdls_schema.json',
        'rel': 'describedby'
    }]

    return rdls

print("General metadata mapper defined.")

In [None]:
"""
4.2 HEVL Block Builders (Simplified versions)
"""

def _parse_return_periods(raw: Any) -> List[int]:
    """Parse a '|'-separated return-period value into non-negative ints."""
    periods = []
    for token in str(raw).split('|'):
        # Going through float() accepts both '100' and '100.0'; the latter is
        # what str() yields when pandas read the CSV column as float64.
        try:
            value = float(token.strip())
        except ValueError:
            continue
        if value.is_integer() and value >= 0:
            periods.append(int(value))
    return periods


def build_hazard_block_simple(row: pd.Series, dataset_id: str) -> Optional[Dict]:
    """
    Build hazard block from extraction row.

    Parameters
    ----------
    row : pd.Series
        Extraction row; reads 'hazard_types' ('|'-separated string),
        'hazard_analysis_type' and 'return_periods' ('|'-separated values).
    dataset_id : str
        Dataset id; its first 8 characters are embedded in generated ids.

    Returns
    -------
    Optional[Dict]
        {'event_sets': [event_set]} with one hazard entry per hazard type and,
        when return periods are available, one event per return period (each
        attached to the first hazard). None when 'hazard_types' is missing,
        not a string, or contains no non-empty token.
    """
    hazard_types = row.get('hazard_types', '')
    # NaN/None/non-string values carry no usable hazard types.
    if not isinstance(hazard_types, str):
        return None

    # Ignore blank tokens such as those produced by a trailing '|'.
    types = [token.strip() for token in hazard_types.split('|') if token.strip()]
    if not types:
        return None

    short_id = dataset_id[:8]
    hazards = [
        {'id': f"hazard_{short_id}_{position}", 'type': hazard_type}
        for position, hazard_type in enumerate(types, start=1)
    ]

    event_set = {
        'id': f"event_set_{short_id}",
        'hazards': hazards
    }

    # Add analysis type if available
    analysis = row.get('hazard_analysis_type')
    if analysis and not pd.isna(analysis):
        event_set['analysis_type'] = analysis

    # Add return periods if available
    rp_str = row.get('return_periods', '')
    if rp_str and not pd.isna(rp_str):
        rps = _parse_return_periods(rp_str)
        if rps:
            event_set['events'] = [
                {
                    'id': f"event_rp{rp}_{short_id}",
                    'calculation_method': 'simulated',
                    # Copy so events do not alias the entry in 'hazards'.
                    'hazard': dict(hazards[0]),
                    'occurrence': {
                        'probabilistic': {'return_period': rp}
                    }
                }
                for rp in rps
            ]

    return {'event_sets': [event_set]}


def build_exposure_block_simple(row: pd.Series, dataset_id: str) -> Optional[List[Dict]]:
    """
    Build exposure block from extraction row.

    Parameters
    ----------
    row : pd.Series
        Extraction row; reads 'exposure_categories' ('|'-separated string)
        and 'exposure_taxonomy'.
    dataset_id : str
        Dataset id; its first 8 characters are embedded in generated ids.

    Returns
    -------
    Optional[List[Dict]]
        One exposure item per category, each with a single count metric and,
        when available, the row's taxonomy. None when 'exposure_categories'
        is missing, not a string, or contains no non-empty token.
    """
    categories = row.get('exposure_categories', '')
    # NaN/None/non-string values carry no usable categories.
    if not isinstance(categories, str):
        return None

    # Ignore blank tokens such as those produced by a trailing or doubled '|'.
    cats = [token.strip() for token in categories.split('|') if token.strip()]
    if not cats:
        return None

    short_id = dataset_id[:8]

    # The taxonomy is a property of the row, so resolve it once for all items.
    taxonomy = row.get('exposure_taxonomy')
    has_taxonomy = bool(taxonomy) and not pd.isna(taxonomy)

    exposure_items = []
    for position, cat in enumerate(cats, start=1):
        item = {
            'id': f"exposure_{short_id}_{position}",
            'category': cat,
            'metrics': [{
                'id': f"metric_{short_id}_{position}",
                # NOTE(review): confirm 'Structure'/'Other' are valid values
                # of the RDLS metric dimension codelist.
                'dimension': 'Structure' if cat == 'buildings' else 'Other',
                'quantity_kind': 'count'
            }]
        }

        if has_taxonomy:
            item['taxonomy'] = taxonomy

        exposure_items.append(item)

    return exposure_items

print("HEVL block builders defined.")

In [None]:
"""
4.3 Complete RDLS Record Builder
"""

def build_complete_rdls_record(
    hdx_record: Dict[str, Any],
    extraction_row: pd.Series
) -> Tuple[Optional[Dict], Optional[str]]:
    """
    Assemble one RDLS record: general metadata plus the gated HEVL blocks.

    Parameters
    ----------
    hdx_record : Dict[str, Any]
        HDX metadata for the dataset.
    extraction_row : pd.Series
        Merged extraction row carrying the gating columns ('blocked',
        'block_reason', 'include_*') and the extraction fields.

    Returns
    -------
    Tuple[Optional[Dict], Optional[str]]
        ({'datasets': [dataset]}, None) on success, or (None, reason) when
        the row is blocked or no component is enabled.
    """
    # Rows rejected by the gating rules never produce a record.
    if extraction_row.get('blocked', False):
        return None, extraction_row.get('block_reason', 'unknown')

    risk_types = determine_risk_data_types(extraction_row)
    if not risk_types:
        return None, 'no_hevl_signals'

    dataset_id = hdx_record.get('id', '')
    rdls_dataset = map_hdx_to_rdls_general(hdx_record, risk_types)

    # Hazard and exposure have real builders; a block is attached only when
    # its builder returns something non-empty.
    built_components = (
        ('include_hazard', 'hazard', build_hazard_block_simple),
        ('include_exposure', 'exposure', build_exposure_block_simple),
    )
    for flag, key, builder in built_components:
        if not extraction_row.get(flag, False):
            continue
        block = builder(extraction_row, dataset_id)
        if block:
            rdls_dataset[key] = block

    # Vulnerability and loss are not built yet: a placeholder with a note
    # marks their presence for later manual enrichment.
    placeholder_components = (
        ('include_vulnerability', 'vulnerability', {
            'functions': {},
            '_note': 'Vulnerability data detected - requires manual enrichment'
        }),
        ('include_loss', 'loss', {
            'losses': [],
            '_note': 'Loss data detected - requires manual enrichment'
        }),
    )
    for flag, key, placeholder in placeholder_components:
        if extraction_row.get(flag, False):
            rdls_dataset[key] = placeholder

    return {'datasets': [rdls_dataset]}, None

print("Complete record builder defined.")

## 5. Generate Integrated RDLS Records

In [None]:
"""
5.1 Process Records
"""

def process_integration(
    merged_df: pd.DataFrame,
    metadata_dir: Path,
    output_dir: Path,
    limit: Optional[int] = None
) -> Dict[str, Any]:
    """
    Process all records and generate integrated RDLS files.

    For every row with at least one HEVL flag set, the matching HDX metadata
    file ('<id>__*.json' in `metadata_dir`) is loaded, a complete RDLS record
    is built and written to `output_dir`. Rows that yield no record are
    collected and saved to 'integration_blocked.csv' in `output_dir`.

    Parameters
    ----------
    merged_df : pd.DataFrame
        Merged extraction table including the gating columns.
    metadata_dir : Path
        Directory holding the HDX metadata JSON files.
    output_dir : Path
        Directory receiving the RDLS JSON files and the blocked report.
    limit : Optional[int]
        Maximum number of candidate rows to process; None (or 0) processes
        all of them.

    Returns
    -------
    Dict[str, Any]
        Counters 'total_candidates', 'success', 'blocked', 'errors', the
        per-component counts under 'by_type', and 'error_details': a list of
        {'id', 'error'} dicts, one per counted error.
    """
    def _as_bool(value: Any) -> bool:
        # NaN/None means "not detected".
        return False if pd.isna(value) else bool(value)

    # Filter to records with any HEVL signal. The mask is built column by
    # column so that a missing flag column simply contributes nothing instead
    # of breaking the `|` chain.
    hevl_mask = pd.Series(False, index=merged_df.index, dtype=bool)
    for col in ('has_hazard', 'has_exposure', 'has_vulnerability', 'has_loss'):
        if col in merged_df.columns:
            hevl_mask = hevl_mask | merged_df[col].map(_as_bool).astype(bool)

    df_to_process = merged_df[hevl_mask].copy()

    if limit:
        df_to_process = df_to_process.head(limit)

    stats = {
        'total_candidates': len(df_to_process),
        'success': 0,
        'blocked': 0,
        'errors': 0,
        'by_type': {'hazard': 0, 'exposure': 0, 'vulnerability': 0, 'loss': 0},
        'error_details': []
    }

    blocked_records = []

    iterator = tqdm(df_to_process.iterrows(), total=len(df_to_process), desc="Integrating") if HAS_TQDM else df_to_process.iterrows()

    for idx, row in iterator:
        dataset_id = row.get('id', '')
        if not dataset_id:
            continue

        # Find HDX metadata file; sorting makes the choice deterministic when
        # several files match the same id.
        hdx_files = sorted(metadata_dir.glob(f"{dataset_id}__*.json"))
        if not hdx_files:
            stats['errors'] += 1
            stats['error_details'].append(
                {'id': dataset_id, 'error': 'metadata_file_not_found'}
            )
            continue

        try:
            with open(hdx_files[0], 'r', encoding='utf-8') as f:
                hdx_record = json.load(f)

            rdls_record, block_reason = build_complete_rdls_record(hdx_record, row)

            if rdls_record:
                # Determine filename
                # NOTE(review): the name uses only the first 8 characters of
                # the id, so datasets sharing that prefix would overwrite each
                # other's file — confirm this cannot happen in the corpus.
                risk_types = rdls_record['datasets'][0].get('risk_data_type', [])
                prefix = determine_filename_prefix(risk_types)
                filename = f"{prefix}-hdx_{dataset_id[:8]}.json"

                # Save
                output_path = output_dir / filename
                with open(output_path, 'w', encoding='utf-8') as f:
                    json.dump(rdls_record, f, indent=2)

                stats['success'] += 1
                for rt in risk_types:
                    stats['by_type'][rt] = stats['by_type'].get(rt, 0) + 1
            else:
                stats['blocked'] += 1
                blocked_records.append({
                    'id': dataset_id,
                    'title': row.get('title', ''),
                    'reason': block_reason
                })

        except Exception as e:
            # Batch boundary: one bad record must not abort the run, but the
            # failure is recorded so it can be diagnosed afterwards.
            stats['errors'] += 1
            stats['error_details'].append(
                {'id': dataset_id, 'error': f"{type(e).__name__}: {e}"}
            )

    # Save blocked records report
    if blocked_records:
        blocked_df = pd.DataFrame(blocked_records)
        blocked_df.to_csv(output_dir / 'integration_blocked.csv', index=False)

    return stats

# Process
# PROCESS_LIMIT caps how many candidate rows are integrated in this run.
PROCESS_LIMIT = 100  # Set to None for full processing

# Only run when the merge step produced rows; otherwise point the user at the
# extraction notebooks.
if len(df_merged) > 0:
    print(f"Processing up to {PROCESS_LIMIT or 'all'} records...")
    stats = process_integration(df_merged, DATASET_METADATA_DIR, OUTPUT_DIR, limit=PROCESS_LIMIT)
    
    # Print the counters returned by process_integration.
    print(f"\n{'='*60}")
    print("INTEGRATION COMPLETE")
    print(f"{'='*60}")
    print(f"Total candidates: {stats['total_candidates']}")
    print(f"Successfully created: {stats['success']}")
    print(f"Blocked: {stats['blocked']}")
    print(f"Errors: {stats['errors']}")
    print(f"\nBy component type:")
    for t, count in stats['by_type'].items():
        print(f"  {t}: {count}")
else:
    print("No merged data to process. Run notebooks 09-11 first.")

## 6. Generate Index and Summary

In [None]:
"""
6.1 Generate RDLS Index
"""

def generate_rdls_index(output_dir: Path) -> pd.DataFrame:
    """
    Generate index of all RDLS records.

    Every 'rdls_*.json' file in `output_dir` is read (in sorted filename
    order) and summarised by the first entry of its 'datasets' list. Files
    that cannot be read or do not have the expected structure are skipped
    with a message on stdout. The index is written to 'rdls_index.csv' and
    'rdls_index.jsonl' in `output_dir`.

    Parameters
    ----------
    output_dir : Path
        Directory containing the RDLS JSON files; also receives the index.

    Returns
    -------
    pd.DataFrame
        One row per indexed file (empty when no file could be indexed).
    """
    records = []

    # Sorted so the index order is stable across runs and platforms.
    for filepath in sorted(output_dir.glob('rdls_*.json')):
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)

            dataset = data.get('datasets', [{}])[0]

            record = {
                'filename': filepath.name,
                'id': dataset.get('id', ''),
                'title': dataset.get('title', ''),
                'risk_data_type': '|'.join(dataset.get('risk_data_type', [])),
                'has_hazard': 'hazard' in dataset,
                'has_exposure': 'exposure' in dataset,
                'has_vulnerability': 'vulnerability' in dataset,
                'has_loss': 'loss' in dataset,
                'license': dataset.get('license', ''),
                'resource_count': len(dataset.get('resources', []))
            }
        except (OSError, ValueError, LookupError, AttributeError, TypeError) as exc:
            # Best effort: unreadable, malformed (ValueError covers JSON and
            # decoding errors) or unexpectedly shaped files are skipped, but
            # visibly rather than silently.
            print(f"Skipped {filepath.name}: {type(exc).__name__}: {exc}")
            continue

        records.append(record)

    df = pd.DataFrame(records)

    # Save index
    df.to_csv(output_dir / 'rdls_index.csv', index=False)

    # Also save as JSONL
    with open(output_dir / 'rdls_index.jsonl', 'w', encoding='utf-8') as f:
        for record in records:
            f.write(json.dumps(record) + '\n')

    return df

# Build the index over everything currently in OUTPUT_DIR and report where
# the CSV version was written.
df_index = generate_rdls_index(OUTPUT_DIR)
print(f"Generated index with {len(df_index)} records.")
print(f"Saved to: {OUTPUT_DIR / 'rdls_index.csv'}")

In [None]:
"""
6.2 Summary Report
"""

# Summarise the index; skipped entirely when no record was indexed.
if len(df_index) > 0:
    print("=" * 60)
    print("INTEGRATED RDLS RECORDS SUMMARY")
    print("=" * 60)
    print(f"\nTotal records: {len(df_index)}")
    print(f"\nBy primary component:")
    # Counts are per distinct '|'-joined risk_data_type combination.
    print(df_index['risk_data_type'].value_counts())
    print(f"\nComponent coverage:")
    # The has_* columns record whether the block key exists in the dataset.
    print(f"  With hazard block: {df_index['has_hazard'].sum()}")
    print(f"  With exposure block: {df_index['has_exposure'].sum()}")
    print(f"  With vulnerability block: {df_index['has_vulnerability'].sum()}")
    print(f"  With loss block: {df_index['has_loss'].sum()}")

In [None]:
# Final timestamp and output location for the run log.
print(f"\nNotebook completed: {datetime.now().isoformat()}")
print(f"Output directory: {OUTPUT_DIR}")