# Notebook 05: RDLS Classification Review & Overrides

**Purpose**: Provide a human QA loop for the machine classification from Notebook 04.

**Process**:
1. Build a review pack (CSV) prioritizing low/medium confidence candidates
2. Capture human decisions (keep/exclude/adjust) in structured format
3. Convert reviewed CSV to `config/overrides.yaml`
4. Apply overrides to produce final classification table

**Author**: Benny Istanto/Risk Data Librarian/GFDRR  
**Version**: 2026.1

---

## 1. Setup

In [1]:
"""
1.1 Import Dependencies
"""

from __future__ import annotations

import json
import re
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import pandas as pd

# PyYAML for config files
try:
    import yaml
except ImportError as e:
    raise ImportError("Missing dependency: pyyaml. Install with: pip install pyyaml") from e

# Optional: tqdm for progress bars
try:
    from tqdm.notebook import tqdm
    HAS_TQDM = True
except ImportError:
    HAS_TQDM = False
    print("Note: tqdm not installed. Install with: pip install tqdm")

print(f"Notebook started: {datetime.now().isoformat()}")
print(f"Progress bars: {'Available' if HAS_TQDM else 'Not available'}")

Notebook started: 2026-02-10T21:42:18.365470
Progress bars: Available


In [2]:
"""
1.2 Configure Paths
"""

NOTEBOOK_DIR = Path.cwd()
BASE_DIR = NOTEBOOK_DIR.parent if NOTEBOOK_DIR.name == 'notebook' else NOTEBOOK_DIR

# Input/Output directories
DUMP_DIR = BASE_DIR / 'hdx_dataset_metadata_dump'
DERIVED_DIR = DUMP_DIR / 'derived'
POLICY_DIR = DUMP_DIR / 'policy'
CONFIG_DIR = DUMP_DIR / 'config'
REVIEW_DIR = DERIVED_DIR / 'review'

# Input files from Notebook 04
CLASSIFICATION_CSV = DERIVED_DIR / 'classification.csv'
OSM_EXCLUDED_IDS_TXT = POLICY_DIR / 'osm_excluded_dataset_ids.txt'

# Output files
REVIEW_PACK_CSV = REVIEW_DIR / 'review_pack.csv'
OVERRIDES_YAML = CONFIG_DIR / 'overrides.yaml'
CLASSIFICATION_FINAL_CSV = DERIVED_DIR / 'classification_final.csv'
CLASSIFICATION_FINAL_SUMMARY_JSON = DERIVED_DIR / 'classification_final_summary.json'
RDLS_INCLUDED_IDS_FINAL_TXT = DERIVED_DIR / 'rdls_included_dataset_ids_final.txt'

# Create directories
REVIEW_DIR.mkdir(parents=True, exist_ok=True)
CONFIG_DIR.mkdir(parents=True, exist_ok=True)

print(f"Input: {CLASSIFICATION_CSV}")
print(f"Output: {REVIEW_DIR}")

# ── Output cleanup mode ───────────────────────────────────────────────
# Default "skip" because review/override files contain human decisions.
CLEANUP_MODE = "replace"


Input: /mnt/c/Users/benny/OneDrive/Documents/Github/hdx-metadata-crawler/hdx_dataset_metadata_dump/derived/classification.csv
Output: /mnt/c/Users/benny/OneDrive/Documents/Github/hdx-metadata-crawler/hdx_dataset_metadata_dump/derived/review


In [3]:
"""
1.3 Clean Previous Outputs

Remove stale output files from previous runs (controlled by CLEANUP_MODE).
Default is "skip" because review/override files contain human decisions.
"""

def clean_previous_outputs(output_dir, patterns, label, mode="replace"):
    """
    Remove previous output files matching the given glob patterns.

    Parameters
    ----------
    output_dir : Path
        Directory containing old outputs.
    patterns : list[str]
        Glob patterns to match.
    label : str
        Human-readable label for log messages.
    mode : str
        One of: "replace" (auto-delete), "prompt" (ask user),
        "skip" (keep old files), "abort" (error if stale files exist).

    Returns
    -------
    dict  with keys 'deleted' (int) and 'skipped' (bool)
    """
    result = {'deleted': 0, 'skipped': False}
    targets = {}
    for pattern in patterns:
        matches = sorted(output_dir.glob(pattern))
        if matches:
            targets[pattern] = matches
    total = sum(len(files) for files in targets.values())

    if total == 0:
        print(f'Output cleanup [{label}]: Directory is clean.')
        return result

    summary = []
    for pattern, files in targets.items():
        summary.append(f'  {pattern:40s}: {len(files):,} files')

    if mode == 'skip':
        print(f'Output cleanup [{label}]: SKIPPED ({total:,} existing files kept)')
        result['skipped'] = True
        return result

    if mode == 'abort':
        raise RuntimeError(
            f'Output cleanup [{label}]: ABORT -- {total:,} stale files found. '
            f'Delete manually or change CLEANUP_MODE.'
        )

    if mode == 'prompt':
        print(f'Output cleanup [{label}]: Found {total:,} existing output files:')
        for line in summary:
            print(line)
        choice = input('Choose [R]eplace / [S]kip / [A]bort: ').strip().lower()
        if choice in ('s', 'skip'):
            print('  Skipped.')
            result['skipped'] = True
            return result
        elif choice in ('a', 'abort'):
            raise RuntimeError('User chose to abort.')
        elif choice not in ('r', 'replace', ''):
            print(f'  Unknown choice, defaulting to Replace.')

    # Mode: replace (default)
    print(f'Output cleanup [{label}]:')
    for line in summary:
        print(line)
    for pattern, files in targets.items():
        for f in files:
            try:
                f.unlink()
                result['deleted'] += 1
            except Exception as e:
                print(f'  WARNING: Could not delete {f.name}: {e}')
    deleted_count = result['deleted']
    print(f'  Cleaned {deleted_count:,} files. Ready for fresh output.')
    print()
    return result

# ── Run cleanup ────────────────────────────────────────────────────────
clean_previous_outputs(
    REVIEW_DIR,
    patterns=["review_pack.csv"],
    label="NB 05 Review Pack",
    mode=CLEANUP_MODE,
)

clean_previous_outputs(
    CONFIG_DIR,
    patterns=["overrides.yaml"],
    label="NB 05 Overrides Config",
    mode=CLEANUP_MODE,
)

clean_previous_outputs(
    DERIVED_DIR,
    patterns=[
        "classification_final.csv",
        "classification_final_summary.json",
        "rdls_included_dataset_ids_final.txt",
    ],
    label="NB 05 Final Classification",
    mode=CLEANUP_MODE,
)


Output cleanup [NB 05 Review Pack]:
  review_pack.csv                         : 1 files
  Cleaned 1 files. Ready for fresh output.

Output cleanup [NB 05 Overrides Config]:
  overrides.yaml                          : 1 files
  Cleaned 1 files. Ready for fresh output.

Output cleanup [NB 05 Final Classification]:
  classification_final.csv                : 1 files
  classification_final_summary.json       : 1 files
  rdls_included_dataset_ids_final.txt     : 1 files
  Cleaned 3 files. Ready for fresh output.



{'deleted': 3, 'skipped': False}

In [4]:
"""
1.3 Configuration Parameters
"""

# Policy: do NOT allow humans to override OSM exclusion by default
ALLOW_OSM_OVERRIDE = False

# Review pack size (tune based on review capacity)
REVIEW_PACK_SIZE = 1500

# Prioritize low/medium confidence for review
PRIORITIZE_CONFIDENCE = ('low', 'medium')

# Required columns in classification CSV
REQUIRED_COLUMNS = [
    'dataset_id', 'title', 'organization', 'dataset_source', 'license_title',
    'tags', 'groups', 'formats', 'excluded_by_policy', 'rdls_candidate',
    'rdls_components', 'confidence', 'score_hazard', 'score_exposure',
    'score_vulnerability_proxy', 'score_loss_impact',
]

print(f"Review pack size: {REVIEW_PACK_SIZE}")
print(f"Allow OSM override: {ALLOW_OSM_OVERRIDE}")

Review pack size: 1500
Allow OSM override: False


## 2. Load Data

In [5]:
"""
2.1 Helper Functions
"""

def _to_bool_series(s: pd.Series) -> pd.Series:
    """Robust bool coercion for CSV roundtrips."""
    return (
        s.astype(str)
        .str.strip()
        .str.lower()
        .map({'true': True, 'false': False, '1': True, '0': False, 'yes': True, 'no': False})
        .fillna(False)
        .astype(bool)
    )


def load_classification_table(path: Path) -> pd.DataFrame:
    """
    Load and validate classification CSV.
    
    Parameters
    ----------
    path : Path
        Path to classification CSV
        
    Returns
    -------
    pd.DataFrame
        Classification dataframe
    """
    if not path.exists():
        raise FileNotFoundError(f"Missing Step 4 output: {path}")
    
    df = pd.read_csv(path)
    
    missing = [c for c in REQUIRED_COLUMNS if c not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns: {', '.join(missing)}")
    
    # Coerce boolean columns
    for col in ['excluded_by_policy', 'rdls_candidate']:
        df[col] = _to_bool_series(df[col])
    
    df['confidence'] = df['confidence'].fillna('unknown').astype(str).str.strip().str.lower()
    
    return df


def load_osm_excluded_ids(path: Path) -> set:
    """Load OSM exclusion list."""
    if not path.exists():
        print(f"WARNING: OSM exclusion list not found: {path}")
        return set()
    
    ids = set()
    for line in path.read_text(encoding='utf-8').splitlines():
        s = line.strip()
        if s:
            ids.add(s)
    return ids


print("Helper functions defined.")

Helper functions defined.


In [6]:
"""
2.2 Load Classification and OSM Exclusion Data
"""

osm_excluded_ids = load_osm_excluded_ids(OSM_EXCLUDED_IDS_TXT)
df = load_classification_table(CLASSIFICATION_CSV)

# Derive is_osm column if not present
if 'is_osm' not in df.columns:
    df['is_osm'] = df['dataset_id'].astype(str).isin(osm_excluded_ids)
else:
    df['is_osm'] = _to_bool_series(df['is_osm'])

print(f"Total rows: {len(df):,}")
print(f"Unique dataset_id: {df['dataset_id'].nunique():,}")
print(f"OSM excluded IDs: {len(osm_excluded_ids):,}")
print(f"OSM flagged in data: {df['is_osm'].sum():,}")

Total rows: 26,246
Unique dataset_id: 26,246
OSM excluded IDs: 3,649
OSM flagged in data: 3,649


## 3. Build Review Pack

In [7]:
"""
3.1 Generate Review Pack CSV

Creates a subset for manual review prioritizing low/medium confidence candidates.
"""

REVIEW_COLUMNS = [
    'dataset_id', 'title', 'organization', 'dataset_source', 'license_title',
    'tags', 'groups', 'formats', 'is_osm', 'excluded_by_policy', 'rdls_candidate',
    'rdls_components', 'confidence', 'score_hazard', 'score_exposure',
    'score_vulnerability_proxy', 'score_loss_impact', 'top_signals',
]

# Filter: only candidates that are currently included
eligible = df[(df['rdls_candidate'] == True) & (df['excluded_by_policy'] == False)].copy()

# Calculate total score
eligible['total_score'] = (
    eligible['score_hazard'] + eligible['score_exposure'] +
    eligible['score_vulnerability_proxy'] + eligible['score_loss_impact']
)

# Prioritize low/medium confidence
priority = eligible[eligible['confidence'].isin(PRIORITIZE_CONFIDENCE)].copy()
priority = priority.sort_values(['confidence', 'total_score'], ascending=[True, False])
review_pack = priority.head(REVIEW_PACK_SIZE)

# Top up with high confidence if needed
if len(review_pack) < REVIEW_PACK_SIZE:
    remaining = eligible[~eligible.index.isin(review_pack.index)].sort_values('total_score', ascending=False)
    review_pack = pd.concat([review_pack, remaining.head(REVIEW_PACK_SIZE - len(review_pack))], ignore_index=True)

# Keep only available columns
available_cols = [c for c in REVIEW_COLUMNS if c in review_pack.columns]
review_pack = review_pack[available_cols].copy()

# Add empty human-edit fields
review_pack['decision'] = ''  # keep | exclude | unsure
review_pack['components_override'] = ''  # e.g. hazard,exposure
review_pack['notes'] = ''

review_pack.to_csv(REVIEW_PACK_CSV, index=False, encoding='utf-8')

print(f"Wrote: {REVIEW_PACK_CSV}")
print(f"Review pack size: {len(review_pack):,}")
print(f"\nConfidence breakdown:")
print(review_pack['confidence'].value_counts())

Wrote: /mnt/c/Users/benny/OneDrive/Documents/Github/hdx-metadata-crawler/hdx_dataset_metadata_dump/derived/review/review_pack.csv
Review pack size: 1,500

Confidence breakdown:
confidence
medium    1500
Name: count, dtype: int64


## 4. Convert to Overrides YAML

In [8]:
"""
4.1 Parse Reviewed CSV and Generate overrides.yaml

Workflow:
1. Open review_pack.csv in Excel/VS Code
2. Fill decision (keep/exclude/unsure) and optional components_override
3. Re-run this cell to generate overrides.yaml
"""

VALID_DECISIONS = {'keep', 'exclude', 'unsure', ''}

def parse_components_list(s: str) -> List[str]:
    """Parse comma-separated components list."""
    parts = [p.strip().lower() for p in str(s).split(',') if p.strip()]
    # Deduplicate preserving order
    seen = set()
    out = []
    for p in parts:
        if p not in seen:
            out.append(p)
            seen.add(p)
    return out


# Load reviewed CSV
reviewed = pd.read_csv(REVIEW_PACK_CSV).fillna('')
reviewed['decision'] = reviewed['decision'].astype(str).str.strip().str.lower()

# Validate decisions
bad = reviewed[~reviewed['decision'].isin(VALID_DECISIONS)]
if len(bad) > 0:
    raise ValueError(
        f"Invalid decision values found. Allowed: keep, exclude, unsure, blank.\n"
        + bad[['dataset_id', 'decision']].head(20).to_string(index=False)
    )

# Build overrides dict
overrides: Dict[str, Any] = {'overrides': {}}

for _, r in reviewed.iterrows():
    dsid = str(r['dataset_id']).strip()
    decision = str(r['decision']).strip().lower()
    
    if not dsid or not decision or decision == 'unsure':
        continue
    
    entry: Dict[str, Any] = {'decision': decision}
    
    comps = parse_components_list(r.get('components_override', ''))
    if comps:
        entry['components'] = comps
    
    notes = str(r.get('notes', '')).strip()
    if notes:
        entry['notes'] = notes
    
    overrides['overrides'][dsid] = entry

# Write YAML
with OVERRIDES_YAML.open('w', encoding='utf-8') as f:
    yaml.safe_dump(overrides, f, sort_keys=True, allow_unicode=True)

print(f"Wrote: {OVERRIDES_YAML}")
print(f"Override entries: {len(overrides['overrides']):,}")

Wrote: /mnt/c/Users/benny/OneDrive/Documents/Github/hdx-metadata-crawler/hdx_dataset_metadata_dump/config/overrides.yaml
Override entries: 0


## 5. Apply Overrides

In [9]:
"""
5.1 Apply Overrides to Full Classification
"""

def load_overrides(path: Path) -> Dict[str, Dict[str, Any]]:
    """Load overrides YAML file."""
    if not path.exists():
        print(f"WARNING: overrides file not found: {path}")
        return {}
    data = yaml.safe_load(path.read_text(encoding='utf-8')) or {}
    return data.get('overrides', {}) or {}


def _parse_components(s: Any) -> set:
    """Parse components string to set."""
    parts = [p.strip().lower() for p in str(s).split(',') if p.strip()]
    return set(parts)


def _join_components(s: set) -> str:
    """Join components in standard order."""
    order = ['hazard', 'exposure', 'vulnerability_proxy', 'loss_impact']
    return ','.join([c for c in order if c in s])


# Load overrides
overrides_map = load_overrides(OVERRIDES_YAML)
print(f"Loaded {len(overrides_map):,} overrides")

# Create final dataframe
final = df.copy()

# Add override tracking columns
final['override_decision'] = ''
final['override_components'] = ''
final['excluded_by_override'] = False

# Apply overrides
for i, r in final.iterrows():
    dsid = r['dataset_id']
    ov = overrides_map.get(dsid)
    if not ov:
        continue
    
    decision = str(ov.get('decision', '')).strip().lower()
    comps = ov.get('components', None)
    
    if decision in {'exclude', 'keep'}:
        final.at[i, 'override_decision'] = decision
    
    if isinstance(comps, list) and comps:
        final.at[i, 'override_components'] = ','.join([str(c).lower() for c in comps])
    
    if decision == 'exclude':
        final.at[i, 'excluded_by_override'] = True
    
    if decision == 'keep':
        final.at[i, 'rdls_candidate'] = True
        if isinstance(comps, list) and comps:
            final.at[i, 'rdls_components'] = ','.join([str(c).lower() for c in comps])

# Final exclusion: policy OR override-exclude
final['final_excluded'] = final['excluded_by_policy'] | final['excluded_by_override']

print("Overrides applied.")

Loaded 0 overrides
Overrides applied.


In [10]:
"""
5.2 Apply Component Normalization Rules

Rules:
- vulnerability_proxy requires hazard or exposure
- loss_impact requires hazard or exposure
If violated, auto-add exposure.
"""

# M5: Component dependency enforcement -- V/L candidates must co-occur with H or E.
# If vulnerability_proxy or loss_impact appears alone, exposure is auto-added.
final['components_normalized'] = False
final['components_normalization_notes'] = ''

_norm_vuln_count = 0
_norm_loss_count = 0

for i, r in final.iterrows():
    if not bool(r.get('rdls_candidate', False)):
        continue
    if bool(r.get('final_excluded', False)):
        continue
    
    comps = _parse_components(r.get('rdls_components', ''))
    if not comps:
        continue
    
    notes = []
    
    # vulnerability_proxy requires hazard or exposure
    if 'vulnerability_proxy' in comps and not ({'hazard', 'exposure'} & comps):
        comps.add('exposure')
        notes.append('added_exposure_for_vulnerability_proxy')
        _norm_vuln_count += 1
    
    # loss_impact requires hazard or exposure
    if 'loss_impact' in comps and not ({'hazard', 'exposure'} & comps):
        comps.add('exposure')
        notes.append('added_exposure_for_loss_impact')
        _norm_loss_count += 1
    
    if notes:
        final.at[i, 'rdls_components'] = _join_components(comps)
        final.at[i, 'components_normalized'] = True
        final.at[i, 'components_normalization_notes'] = ';'.join(notes)

# Enforce OSM policy if not allowing overrides
if not ALLOW_OSM_OVERRIDE and len(osm_excluded_ids) > 0:
    mask_illegal = final['dataset_id'].isin(osm_excluded_ids) & (final['override_decision'] == 'keep')
    illegal_count = int(mask_illegal.sum())
    if illegal_count > 0:
        print(f"WARNING: {illegal_count} override(s) tried to include OSM-excluded datasets. Reverting.")
        final.loc[mask_illegal, 'final_excluded'] = True

# Final included set
final['final_included'] = final['rdls_candidate'] & (~final['final_excluded'])

print(f"Component normalization complete.")
_total_norm = int(final['components_normalized'].sum())
print(f"Component normalization: {_total_norm} datasets had components added "
      f"({_norm_vuln_count} for vulnerability_proxy, {_norm_loss_count} for loss_impact)")


Component normalization complete.
Component normalization: 2837 datasets had components added (2481 for vulnerability_proxy, 356 for loss_impact)


## 6. Write Outputs

In [11]:
"""
6.1 Save Final Classification and Summary
"""

# Write final classification
final.to_csv(CLASSIFICATION_FINAL_CSV, index=False, encoding='utf-8')
print(f"Wrote: {CLASSIFICATION_FINAL_CSV}")

# Write included IDs
included_ids = final.loc[final['final_included'], 'dataset_id'].astype(str).tolist()
RDLS_INCLUDED_IDS_FINAL_TXT.write_text('\n'.join(included_ids) + '\n', encoding='utf-8')
print(f"Wrote: {RDLS_INCLUDED_IDS_FINAL_TXT} ({len(included_ids):,} IDs)")

# Generate summary
summary = {
    'total_datasets': int(len(final)),
    'policy': {
        'osm_excluded_ids_loaded': int(len(osm_excluded_ids)),
        'datasets_excluded_by_policy': int(final['excluded_by_policy'].sum()),
    },
    'overrides': {
        'override_entries_loaded': int(len(overrides_map)),
        'datasets_excluded_by_override': int(final['excluded_by_override'].sum()),
        'datasets_with_component_override': int((final['override_components'].astype(str) != '').sum()),
    },
    'rdls': {
        'candidates_total': int(final['rdls_candidate'].sum()),
        'included_total': int(final['final_included'].sum()),
    },
    'confidence_counts': final['confidence'].value_counts().to_dict(),
    'component_nonzero_counts': {
        'hazard': int((final['score_hazard'] > 0).sum()),
        'exposure': int((final['score_exposure'] > 0).sum()),
        'vulnerability_proxy': int((final['score_vulnerability_proxy'] > 0).sum()),
        'loss_impact': int((final['score_loss_impact'] > 0).sum()),
    },
}

CLASSIFICATION_FINAL_SUMMARY_JSON.write_text(json.dumps(summary, indent=2), encoding='utf-8')
print(f"Wrote: {CLASSIFICATION_FINAL_SUMMARY_JSON}")

print(f"\n{'='*60}")
print("FINAL CLASSIFICATION SUMMARY")
print(f"{'='*60}")
print(f"Total datasets: {summary['total_datasets']:,}")
print(f"RDLS candidates: {summary['rdls']['candidates_total']:,}")
print(f"Final included: {summary['rdls']['included_total']:,}")
print(f"Excluded by OSM policy: {summary['policy']['datasets_excluded_by_policy']:,}")
print(f"Excluded by override: {summary['overrides']['datasets_excluded_by_override']:,}")

print(f"\nNext: Run Notebook 06 to translate to RDLS schema.")
print(f"\nNotebook completed: {datetime.now().isoformat()}")

Wrote: /mnt/c/Users/benny/OneDrive/Documents/Github/hdx-metadata-crawler/hdx_dataset_metadata_dump/derived/classification_final.csv
Wrote: /mnt/c/Users/benny/OneDrive/Documents/Github/hdx-metadata-crawler/hdx_dataset_metadata_dump/derived/rdls_included_dataset_ids_final.txt (13,152 IDs)
Wrote: /mnt/c/Users/benny/OneDrive/Documents/Github/hdx-metadata-crawler/hdx_dataset_metadata_dump/derived/classification_final_summary.json

FINAL CLASSIFICATION SUMMARY
Total datasets: 26,246
RDLS candidates: 16,224
Final included: 13,152
Excluded by OSM policy: 3,649
Excluded by override: 0

Next: Run Notebook 06 to translate to RDLS schema.

Notebook completed: 2026-02-10T21:42:47.361452


## End of Code