# Notebook 13: RDLS Validation and Quality Assurance

**Purpose**: Validate integrated RDLS records against the v0.3 JSON schema and produce QA reports.

**Process**:
1. Load all generated RDLS records
2. Validate against RDLS v0.3 JSON schema
3. Check HEVL block completeness
4. Generate validation reports
5. Produce final distribution package

**Author**: Benny Istanto/Risk Data Librarian/GFDRR  
**Version**: 2026.1

---

## 1. Setup

In [None]:
"""
1.1 Import Dependencies
"""

import json
import shutil
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Optional, Any, Tuple

import pandas as pd
import numpy as np

try:
    import jsonschema
    from jsonschema import validate, ValidationError, Draft7Validator
    HAS_JSONSCHEMA = True
except ImportError:
    HAS_JSONSCHEMA = False
    print("Warning: jsonschema not installed. Schema validation will be skipped.")
    print("Install with: pip install jsonschema")

try:
    from tqdm.notebook import tqdm
    HAS_TQDM = True
except ImportError:
    HAS_TQDM = False

# Display every column and up to 100 characters per cell when DataFrames are shown.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 100)

# Log the start time and whether the optional jsonschema import succeeded.
print(f"Notebook started: {datetime.now().isoformat()}")
print(f"JSON Schema validation: {'Available' if HAS_JSONSCHEMA else 'Not available'}")

In [None]:
"""
1.2 Configure Paths
"""

# Resolve the project root: when run from inside a 'notebook' folder, step up one level.
NOTEBOOK_DIR = Path.cwd()
if NOTEBOOK_DIR.name == 'notebook':
    BASE_DIR = NOTEBOOK_DIR.parent
else:
    BASE_DIR = NOTEBOOK_DIR

# Every RDLS artefact lives under this common folder.
_RDLS_ROOT = BASE_DIR / 'hdx_dataset_metadata_dump' / 'rdls'

# RDLS schema
RDLS_SCHEMA_PATH = _RDLS_ROOT / 'schema' / 'rdls_schema_v0.3.json'

# Input: integrated records
INTEGRATED_DIR = _RDLS_ROOT / 'integrated'
EXTRACTED_DIR = _RDLS_ROOT / 'extracted'

# Output: reports and dist
REPORTS_DIR = _RDLS_ROOT / 'reports'
DIST_DIR = _RDLS_ROOT / 'dist'

# Make sure both output folders exist before anything is written to them.
for _output_dir in (REPORTS_DIR, DIST_DIR):
    _output_dir.mkdir(parents=True, exist_ok=True)

print(f"Base: {BASE_DIR}")
print(f"Reports: {REPORTS_DIR}")
print(f"Dist: {DIST_DIR}")

In [None]:
"""
1.3 Load RDLS Schema
"""

# Read the schema file as UTF-8 text and parse it into a dict.
RDLS_SCHEMA = json.loads(RDLS_SCHEMA_PATH.read_text(encoding='utf-8'))

_schema_id = RDLS_SCHEMA.get('$id', 'unknown')
print(f"RDLS Schema loaded: {_schema_id}")

## 2. Load RDLS Records

In [None]:
"""
2.1 Find All RDLS Records
"""

def find_rdls_records(*directories: Path) -> List[Path]:
    """
    Collect the RDLS JSON files found directly inside the given directories.

    Parameters
    ----------
    *directories : Path
        Folders to scan (non-recursively) for files matching ``rdls_*.json``.
        Folders that do not exist are ignored.

    Returns
    -------
    List[Path]
        De-duplicated matching paths in sorted order.
    """
    unique_paths = {
        candidate
        for folder in directories
        if folder.exists()
        for candidate in folder.glob('rdls_*.json')
    }
    return sorted(unique_paths)

# Gather candidate record files from both the integrated and the extracted folders.
rdls_files = find_rdls_records(INTEGRATED_DIR, EXTRACTED_DIR)
print(f"Found {len(rdls_files)} RDLS record files.")

In [None]:
"""
2.2 Load Records
"""

def load_rdls_records(files: List[Path]) -> List[Dict[str, Any]]:
    """
    Parse each file as JSON and wrap the outcome in a record dict.

    Parameters
    ----------
    files : List[Path]
        Paths of the JSON files to read (UTF-8).

    Returns
    -------
    List[Dict[str, Any]]
        One entry per input path, in input order. Every entry has
        ``filepath`` and ``filename``; successfully parsed files add ``data``,
        while files that could not be read or parsed add ``load_error``
        (the exception message) instead.
    """
    loaded_entries = []

    for source in files:
        try:
            with open(source, 'r', encoding='utf-8') as handle:
                payload = json.load(handle)
        except Exception as exc:
            # Best-effort loader: keep going and report the failure per file.
            entry = {
                'filepath': source,
                'filename': source.name,
                'load_error': str(exc),
            }
        else:
            entry = {
                'filepath': source,
                'filename': source.name,
                'data': payload,
            }
        loaded_entries.append(entry)

    return loaded_entries

rdls_records = load_rdls_records(rdls_files)

# Each entry carries either a 'data' key (parsed) or a 'load_error' key (failed).
loaded = sum('data' in entry for entry in rdls_records)
errors = sum('load_error' in entry for entry in rdls_records)

print(f"Loaded: {loaded}")
print(f"Load errors: {errors}")

## 3. Schema Validation

In [None]:
"""
3.1 Validate Against RDLS Schema
"""

def validate_rdls_record(data: Dict[str, Any], schema: Dict[str, Any]) -> Tuple[bool, List[str]]:
    """
    Validate an RDLS record against a JSON schema.

    The validator class is chosen from the dialect the schema declares in its
    ``$schema`` keyword, so dialect-specific keywords are evaluated under the
    right rules. Draft 7 is used when the schema declares no (or an unknown)
    dialect, which matches the previous hard-coded behaviour.

    Parameters
    ----------
    data : Dict[str, Any]
        Parsed record to validate.
    schema : Dict[str, Any]
        Parsed JSON schema.

    Returns
    -------
    Tuple[bool, List[str]]
        (is_valid, list_of_error_messages). Each message has the form
        ``"<path/to/value>: <validator message>"``. When jsonschema is not
        installed the record is reported as valid with a single explanatory
        message. Any exception raised while validating is reported as an
        invalid result rather than propagated.
    """
    if not HAS_JSONSCHEMA:
        return True, ['Schema validation skipped - jsonschema not installed']

    try:
        # Imported here because this line is only reached when jsonschema is installed.
        from jsonschema.validators import validator_for

        # Honour the schema's declared dialect; fall back to Draft 7 otherwise.
        validator_cls = validator_for(schema, default=Draft7Validator)
        validator = validator_cls(schema)

        errors = []
        for error in validator.iter_errors(data):
            path = '/'.join(str(p) for p in error.absolute_path)
            errors.append(f"{path}: {error.message}")

        return len(errors) == 0, errors

    except Exception as e:
        return False, [f"Validation exception: {str(e)}"]

print("Schema validator defined.")

In [None]:
"""
3.2 Run Schema Validation
"""

validation_results = []

iterator = tqdm(rdls_records, desc="Validating") if HAS_TQDM else rdls_records

for record in iterator:
    result = {
        'filename': record['filename'],
        'filepath': str(record['filepath'])
    }

    if 'load_error' in record:
        result['status'] = 'load_error'
        result['errors'] = [record['load_error']]
    else:
        data = record['data']
        is_valid, errors = validate_rdls_record(data, RDLS_SCHEMA)
        result['status'] = 'valid' if is_valid else 'invalid'
        result['errors'] = errors

        # Extract basic info from the first dataset. Records reaching this point
        # may be schema-invalid, so every shape assumption is checked: a malformed
        # record must end up in the report, not crash the run.
        datasets = data.get('datasets') if isinstance(data, dict) else None
        if isinstance(datasets, list) and datasets and isinstance(datasets[0], dict):
            ds = datasets[0]
            result['id'] = ds.get('id', '')

            title = ds.get('title', '')
            result['title'] = '' if title is None else str(title)[:80]

            risk_types = ds.get('risk_data_type', [])
            if isinstance(risk_types, str):
                # A bare string would otherwise be joined character by character.
                risk_types = [risk_types]
            elif not isinstance(risk_types, list):
                risk_types = []
            result['risk_data_type'] = '|'.join(str(item) for item in risk_types)

    validation_results.append(result)

# Fixed column list keeps the frame usable (and the CSV layout stable) even when
# no records were found or no record exposes dataset metadata.
df_validation = pd.DataFrame(
    validation_results,
    columns=['filename', 'filepath', 'status', 'errors', 'id', 'title', 'risk_data_type'],
)

print(f"\n{'='*60}")
print("SCHEMA VALIDATION RESULTS")
print(f"{'='*60}")
print(f"\nTotal records: {len(df_validation)}")
print(f"\nStatus distribution:")
print(df_validation['status'].value_counts())

In [None]:
"""
3.3 Analyze Validation Errors
"""

invalid_records = df_validation.loc[df_validation['status'] == 'invalid']

if invalid_records.empty:
    print("\nAll records passed schema validation!")
else:
    _rule = '=' * 60
    print(f"\n{_rule}")
    print(f"VALIDATION ERRORS ({len(invalid_records)} records)")
    print(f"{_rule}")

    # Flatten the per-record error lists into one list of messages.
    all_errors = [
        message
        for record_errors in invalid_records['errors']
        if isinstance(record_errors, list)
        for message in record_errors
    ]

    # Identical messages are counted together; most frequent first.
    error_counts = pd.Series(all_errors).value_counts()

    print(f"\nTop 10 error types:")
    for error, count in error_counts.head(10).items():
        print(f"  [{count}] {error[:80]}")

## 4. HEVL Completeness Check

In [None]:
"""
4.1 Check HEVL Block Completeness
"""

def _as_dict(value: Any) -> Dict[str, Any]:
    """Return *value* when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _list_len(value: Any) -> int:
    """Return the length of *value* when it is a list, otherwise 0."""
    return len(value) if isinstance(value, list) else 0


def check_hevl_completeness(data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Check completeness of the HEVL blocks of a record's first dataset.

    Only ``data['datasets'][0]`` is inspected. The function is tolerant of
    malformed input: a block or field with an unexpected JSON type is treated
    as empty instead of raising, so schema-invalid records can still be
    summarised.

    Parameters
    ----------
    data : Dict[str, Any]
        Parsed record.

    Returns
    -------
    Dict[str, Any]
        ``{'error': 'No datasets array'}`` when the record has no usable
        first dataset. Otherwise the four ``has_*_block`` flags (key presence
        in the dataset) plus, for each block that is present, its detail
        fields: ``hazard_event_sets`` (and, when at least one event set
        exists, ``hazard_has_analysis_type``, ``hazard_has_events``,
        ``hazard_hazards_count`` for the first one); ``exposure_items`` (and,
        when non-empty, ``exposure_has_metrics``, ``exposure_has_taxonomy``
        for the first item); ``vuln_has_functions``,
        ``vuln_has_socioeconomic``; ``loss_items``.
    """
    datasets = data.get('datasets') if isinstance(data, dict) else None
    if not isinstance(datasets, list) or not datasets or not isinstance(datasets[0], dict):
        return {'error': 'No datasets array'}

    ds = datasets[0]
    result = {
        'has_hazard_block': 'hazard' in ds,
        'has_exposure_block': 'exposure' in ds,
        'has_vulnerability_block': 'vulnerability' in ds,
        'has_loss_block': 'loss' in ds,
    }

    # Hazard completeness (details taken from the first event set only)
    if result['has_hazard_block']:
        event_sets = _as_dict(ds['hazard']).get('event_sets')
        result['hazard_event_sets'] = _list_len(event_sets)
        if result['hazard_event_sets']:
            es = _as_dict(event_sets[0])
            result['hazard_has_analysis_type'] = 'analysis_type' in es
            result['hazard_has_events'] = _list_len(es.get('events')) > 0
            result['hazard_hazards_count'] = _list_len(es.get('hazards'))

    # Exposure completeness (details taken from the first exposure item only)
    if result['has_exposure_block']:
        exposure = ds['exposure']
        result['exposure_items'] = _list_len(exposure)
        if result['exposure_items']:
            first_item = _as_dict(exposure[0])
            result['exposure_has_metrics'] = 'metrics' in first_item
            result['exposure_has_taxonomy'] = 'taxonomy' in first_item

    # Vulnerability completeness
    if result['has_vulnerability_block']:
        vuln = _as_dict(ds['vulnerability'])
        result['vuln_has_functions'] = bool(vuln.get('functions'))
        result['vuln_has_socioeconomic'] = 'socio_economic' in vuln

    # Loss completeness
    if result['has_loss_block']:
        result['loss_items'] = _list_len(_as_dict(ds['loss']).get('losses'))

    return result

# Run the completeness check on every record that loaded successfully,
# tagging each result with the file it came from.
completeness_results = [
    {**check_hevl_completeness(record['data']), 'filename': record['filename']}
    for record in rdls_records
    if 'data' in record
]

df_completeness = pd.DataFrame(completeness_results)

_rule = '=' * 60
print(f"{_rule}")
print("HEVL BLOCK COMPLETENESS")
print(f"{_rule}")
print(f"\nRecords analyzed: {len(df_completeness)}")
print(f"\nBlock presence:")
for col in ('has_hazard_block', 'has_exposure_block', 'has_vulnerability_block', 'has_loss_block'):
    if col not in df_completeness:
        continue
    count = df_completeness[col].sum()
    print(f"  {col}: {count} ({count/len(df_completeness)*100:.1f}%)")

In [None]:
"""
4.2 Hazard Block Quality
"""

# Select the records that carry a hazard block. The column is absent when no
# analysed record produced HEVL flags (e.g. nothing loaded); in that case use
# an empty selection instead of indexing the frame with a scalar, which raises.
if 'has_hazard_block' in df_completeness.columns:
    hazard_records = df_completeness[df_completeness['has_hazard_block'].eq(True)]
else:
    hazard_records = df_completeness.iloc[0:0]

if len(hazard_records) > 0:
    print(f"\n{'='*60}")
    print(f"HAZARD BLOCK QUALITY ({len(hazard_records)} records)")
    print(f"{'='*60}")

    if 'hazard_has_analysis_type' in hazard_records:
        print(f"\nWith analysis_type: {hazard_records['hazard_has_analysis_type'].sum()}")
    if 'hazard_has_events' in hazard_records:
        print(f"With events: {hazard_records['hazard_has_events'].sum()}")
    if 'hazard_hazards_count' in hazard_records:
        print(f"Avg hazards per record: {hazard_records['hazard_hazards_count'].mean():.1f}")

## 5. Generate Reports

In [None]:
"""
5.1 Save Validation Report
"""

def _errors_to_text(value):
    """Join a list of error messages with '|'; stringify anything else."""
    return '|'.join(value) if isinstance(value, list) else str(value)


# Prepare validation report: work on a copy whose 'errors' column is flattened
# to a single string per row so it fits in one CSV cell.
validation_export = df_validation.assign(
    errors=df_validation['errors'].apply(_errors_to_text)
)

validation_file = REPORTS_DIR / 'schema_validation_report.csv'
validation_export.to_csv(validation_file, index=False)
print(f"Saved: {validation_file}")

# Save completeness report
completeness_file = REPORTS_DIR / 'hevl_completeness_report.csv'
df_completeness.to_csv(completeness_file, index=False)
print(f"Saved: {completeness_file}")

In [None]:
"""
5.2 Generate Summary Markdown Report
"""

def _percentage(part: int, whole: int) -> float:
    """Return part/whole as a percentage, or 0.0 when *whole* is zero."""
    return part / whole * 100 if whole else 0.0


def _block_coverage(completeness_df: pd.DataFrame, column: str) -> Tuple[int, float]:
    """Return (count, percentage of all rows) where *column* is True; (0, 0.0) if absent."""
    if column not in completeness_df.columns:
        return 0, 0.0
    # .eq(True) maps missing values (rows without HEVL flags) to False.
    count = int(completeness_df[column].eq(True).sum())
    return count, _percentage(count, len(completeness_df))


def generate_summary_report(
    validation_df: pd.DataFrame,
    completeness_df: pd.DataFrame,
    output_path: Path
) -> None:
    """
    Generate the markdown summary report and write it to *output_path*.

    Parameters
    ----------
    validation_df : pd.DataFrame
        Schema validation results; the ``status`` column is counted for the
        values ``'valid'`` and ``'invalid'``.
    completeness_df : pd.DataFrame
        HEVL completeness results; the ``has_*_block`` columns are summarised.
        Coverage percentages are relative to all rows of this frame.
    output_path : Path
        Destination file (written as UTF-8, overwritten if present).

    Notes
    -----
    Empty frames or missing columns yield zero counts and ``0.0%`` rather
    than ``nan%`` or an exception.
    """
    total = len(validation_df)
    if 'status' in validation_df.columns:
        status = validation_df['status']
        valid = int((status == 'valid').sum())
        invalid = int((status == 'invalid').sum())
    else:
        valid = invalid = 0

    valid_pct = _percentage(valid, total)
    invalid_pct = _percentage(invalid, total)

    hazard_n, hazard_pct = _block_coverage(completeness_df, 'has_hazard_block')
    exposure_n, exposure_pct = _block_coverage(completeness_df, 'has_exposure_block')
    vuln_n, vuln_pct = _block_coverage(completeness_df, 'has_vulnerability_block')
    loss_n, loss_pct = _block_coverage(completeness_df, 'has_loss_block')

    report = f"""# RDLS Validation and QA Report

**Generated**: {datetime.now().isoformat()}

## Summary

| Metric | Value |
|--------|-------|
| Total Records | {total} |
| Schema Valid | {valid} ({valid_pct:.1f}%) |
| Schema Invalid | {invalid} ({invalid_pct:.1f}%) |

## HEVL Block Coverage

| Component | Records | Percentage |
|-----------|---------|------------|
| Hazard | {hazard_n} | {hazard_pct:.1f}% |
| Exposure | {exposure_n} | {exposure_pct:.1f}% |
| Vulnerability | {vuln_n} | {vuln_pct:.1f}% |
| Loss | {loss_n} | {loss_pct:.1f}% |

## Validation Status

### Valid Records
Records that pass RDLS v0.3 JSON schema validation.

### Invalid Records
Records with schema validation errors. See `schema_validation_report.csv` for details.

## Files Generated

- `schema_validation_report.csv` - Detailed validation results
- `hevl_completeness_report.csv` - HEVL block completeness analysis
- `rdls_index.csv` - Index of all RDLS records

## Next Steps

1. Review invalid records and fix schema errors
2. Enrich records with manual data where needed
3. Re-run validation after corrections

---
*Report generated by HDX-RDLS Pipeline Notebook 13*
"""

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(report)

    print(f"Saved: {output_path}")

# Write the markdown summary next to the CSV reports.
summary_file = REPORTS_DIR / 'rdls_validation_summary.md'
generate_summary_report(df_validation, df_completeness, summary_file)

## 6. Create Distribution Package

In [None]:
"""
6.1 Copy Valid Records to Dist
"""

def create_distribution(
    validation_df: pd.DataFrame,
    rdls_records: List[Dict],
    dist_dir: Path,
    include_invalid: bool = False
) -> Dict[str, int]:
    """
    Copy eligible record files into ``dist_dir / 'records'``.

    A record is eligible when it was loaded successfully (has a ``data`` key)
    and its row in *validation_df* has status ``'valid'`` - or any status when
    *include_invalid* is True.

    Records are matched to validation rows by ``filepath`` when the frame has
    that column, so two files with the same name in different source folders
    are judged independently. Frames without a ``filepath`` column fall back
    to matching by ``filename``.

    Parameters
    ----------
    validation_df : pd.DataFrame
        Validation results with ``filename``, ``status`` and (optionally)
        ``filepath`` columns. An empty frame selects nothing.
    rdls_records : List[Dict]
        Record dicts with ``filepath`` and ``filename`` keys.
    dist_dir : Path
        Distribution root; the ``records`` subfolder is created if needed.
    include_invalid : bool
        Copy records regardless of validation status.

    Returns
    -------
    Dict[str, int]
        ``{'copied': n, 'skipped': m}``.

    Notes
    -----
    Files are stored under their bare filename, so when several copied
    records share a filename the one copied last is the one that remains.
    """
    records_dir = dist_dir / 'records'
    records_dir.mkdir(parents=True, exist_ok=True)

    stats = {'copied': 0, 'skipped': 0}

    # Prefer the unambiguous full path as the matching key.
    key_column = 'filepath' if 'filepath' in validation_df.columns else 'filename'

    eligible_keys = set()
    if key_column in validation_df.columns:
        if include_invalid:
            eligible_rows = validation_df
        else:
            eligible_rows = validation_df[validation_df['status'] == 'valid']
        # Compare as strings so Path objects and plain strings match.
        eligible_keys = set(eligible_rows[key_column].astype(str))

    for record in rdls_records:
        if 'data' in record and str(record[key_column]) in eligible_keys:
            dest = records_dir / record['filename']
            shutil.copy2(record['filepath'], dest)
            stats['copied'] += 1
        else:
            stats['skipped'] += 1

    return stats

# Create distribution (include all for now): with include_invalid=True every
# successfully loaded record is copied, whatever its schema validation status.
dist_stats = create_distribution(df_validation, rdls_records, DIST_DIR, include_invalid=True)

# Report how many record files were copied versus skipped.
print(f"\n{'='*60}")
print("DISTRIBUTION PACKAGE")
print(f"{'='*60}")
print(f"Records copied: {dist_stats['copied']}")
print(f"Records skipped: {dist_stats['skipped']}")
print(f"Location: {DIST_DIR}")

In [None]:
"""
6.2 Copy Reports and Index
"""

# Copy reports to dist
dist_reports = DIST_DIR / 'reports'
dist_reports.mkdir(parents=True, exist_ok=True)

for report_file in REPORTS_DIR.glob('*'):
    # copy2 only handles regular files; skip any subdirectory the glob matches.
    if report_file.is_file():
        shutil.copy2(report_file, dist_reports / report_file.name)

# Copy index if exists
for source_dir in [INTEGRATED_DIR, EXTRACTED_DIR]:
    for index_file in source_dir.glob('rdls_index.*'):
        if index_file.is_file():
            shutil.copy2(index_file, DIST_DIR / index_file.name)

print(f"Reports copied to: {dist_reports}")

In [None]:
"""
6.3 Create ZIP Archive (Optional)
"""

import zipfile

def create_zip_archive(dist_dir: Path, output_name: str = 'rdls_hdx_package') -> Path:
    """
    Pack every file under *dist_dir* into a dated, deflate-compressed ZIP.

    The archive is written next to *dist_dir* (in its parent folder) as
    ``<output_name>_<YYYYMMDD>.zip``; member names are paths relative to
    *dist_dir*. Directories themselves are not stored as entries.

    Parameters
    ----------
    dist_dir : Path
        Folder whose files are archived recursively.
    output_name : str
        Archive name prefix placed before the date stamp.

    Returns
    -------
    Path
        Location of the archive that was written.
    """
    date_stamp = datetime.now().strftime('%Y%m%d')
    archive_path = dist_dir.parent / f"{output_name}_{date_stamp}.zip"

    members = (entry for entry in dist_dir.rglob('*') if entry.is_file())

    with zipfile.ZipFile(archive_path, 'w', zipfile.ZIP_DEFLATED) as archive:
        for member in members:
            archive.write(member, member.relative_to(dist_dir))

    return archive_path

# Create ZIP of the distribution folder and report its size in KB (bytes / 1024).
zip_path = create_zip_archive(DIST_DIR)
print(f"\nZIP archive created: {zip_path}")
print(f"Size: {zip_path.stat().st_size / 1024:.1f} KB")

## 7. Final Summary

In [None]:
"""
7.1 Pipeline Summary
"""

# Closing banner. The arrow was previously stored as mis-decoded bytes and
# printed as garbage characters; it is restored to the intended symbol.
print("="*70)
print("HDX → RDLS PIPELINE COMPLETE")
print("="*70)

# Recap of record counts, HEVL coverage and output locations. The .get(...)
# default keeps each coverage line at 0 when a flag column is absent.
print(f"""
SUMMARY
-------
Total RDLS records generated: {len(rdls_records)}
Schema valid: {(df_validation['status'] == 'valid').sum()}
Schema invalid: {(df_validation['status'] == 'invalid').sum()}

HEVL COVERAGE
-------------
Records with Hazard block: {df_completeness.get('has_hazard_block', pd.Series([False])).sum()}
Records with Exposure block: {df_completeness.get('has_exposure_block', pd.Series([False])).sum()}
Records with Vulnerability block: {df_completeness.get('has_vulnerability_block', pd.Series([False])).sum()}
Records with Loss block: {df_completeness.get('has_loss_block', pd.Series([False])).sum()}

OUTPUT LOCATIONS
----------------
Distribution: {DIST_DIR}
Reports: {REPORTS_DIR}
ZIP Archive: {zip_path}

NEXT STEPS
----------
1. Review validation errors in schema_validation_report.csv
2. Enrich high-priority records manually using RDLS Metadata Editor
3. Merge with manually curated RDLS records
4. Publish to RDLS catalog
""")

print(f"\nNotebook completed: {datetime.now().isoformat()}")