# Notebook 04: RDLS Candidate Classification

**Purpose**: Classify HDX datasets into RDLS components using weighted scoring.

**Process**:
1. Load mapping configs from Notebook 03
2. Score each dataset using tags, keywords, and org hints
3. Assign RDLS components (hazard, exposure, vulnerability_proxy, loss_impact)
4. Apply the OSM exclusion policy from Notebook 02 (excluded datasets are flagged in the output, not dropped)

**Author**: Benny Istanto/Risk Data Librarian/GFDRR  
**Version**: 2026.1

---

## 1. Setup

In [None]:
"""
1.1 Import Dependencies
"""

from __future__ import annotations

import json
import re
from collections import Counter, defaultdict
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple

import pandas as pd

# PyYAML for config files
try:
    import yaml
except ImportError as e:
    raise ImportError("Missing dependency: pyyaml. Install with: pip install pyyaml") from e

# Optional: tqdm for progress bars
try:
    from tqdm.notebook import tqdm
    HAS_TQDM = True
except ImportError:
    HAS_TQDM = False
    print("Note: tqdm not installed. Install with: pip install tqdm")

print(f"Notebook started: {datetime.now().isoformat()}")
print(f"Progress bars: {'Available' if HAS_TQDM else 'Not available'}")

In [None]:
"""
1.2 Configure Paths
"""

# Project root: step up one level when the notebook is launched from a
# folder literally named 'notebook'; otherwise use the working directory.
NOTEBOOK_DIR = Path.cwd()
if NOTEBOOK_DIR.name == 'notebook':
    BASE_DIR = NOTEBOOK_DIR.parent
else:
    BASE_DIR = NOTEBOOK_DIR

# --- Inputs -----------------------------------------------------------
DUMP_DIR = BASE_DIR.joinpath('hdx_dataset_metadata_dump')
DATASET_DIR = DUMP_DIR.joinpath('dataset_metadata')
POLICY_DIR = DUMP_DIR.joinpath('policy')
CONFIG_DIR = DUMP_DIR.joinpath('config')

# Policy file (Notebook 02) and mapping configs (Notebook 03)
OSM_EXCLUDED_IDS_TXT = POLICY_DIR.joinpath('osm_excluded_dataset_ids.txt')
TAG_MAP_YAML = CONFIG_DIR.joinpath('tag_to_rdls_component.yaml')
KEYWORD_MAP_YAML = CONFIG_DIR.joinpath('keyword_to_rdls_component.yaml')
ORG_HINTS_YAML = CONFIG_DIR.joinpath('org_hints.yaml')

# --- Outputs ----------------------------------------------------------
# The derived folder is created up front so later cells can write into it.
DERIVED_DIR = DUMP_DIR.joinpath('derived')
DERIVED_DIR.mkdir(parents=True, exist_ok=True)

OUT_CLASSIFICATION_CSV = DERIVED_DIR.joinpath('classification.csv')
OUT_SUMMARY_JSON = DERIVED_DIR.joinpath('classification_summary.json')
OUT_INCLUDED_IDS_TXT = DERIVED_DIR.joinpath('rdls_included_dataset_ids.txt')
OUT_ERRORS_JSONL = DERIVED_DIR.joinpath('errors_classification.jsonl')

print(
    f"Dataset dir: {DATASET_DIR}",
    f"Config dir: {CONFIG_DIR}",
    f"Output dir: {DERIVED_DIR}",
    sep='\n',
)

In [None]:
"""
1.3 Classification Configuration
"""

# RDLS component categories
# Every score dictionary and every `score_<component>` output column is keyed
# by these names.
RDLS_COMPONENTS = ('hazard', 'exposure', 'vulnerability_proxy', 'loss_impact')

# Scoring thresholds
# NOTE: CONF_MED equals CANDIDATE_MIN_SCORE, so with these values every dataset
# that qualifies as a candidate is rated at least 'medium' confidence.
KEYWORD_HIT_WEIGHT = 1    # Weight per keyword pattern match
CANDIDATE_MIN_SCORE = 4   # Minimum score to be RDLS candidate
CONF_HIGH = 7             # Score >= this = high confidence
CONF_MED = 4              # Score >= this = medium confidence

# Echo the active settings so a saved notebook run records what was used.
print(f"RDLS Components: {RDLS_COMPONENTS}")
print(f"Candidate threshold: {CANDIDATE_MIN_SCORE}")
print(f"High confidence threshold: {CONF_HIGH}")

## 2. Load Configuration

In [None]:
"""
2.1 Load Mapping Configs from Notebook 03
"""

def load_yaml(path: Path) -> Dict[str, Any]:
    """
    Load a YAML config file whose top level is a mapping.

    Parameters
    ----------
    path : Path
        Location of the YAML file.

    Returns
    -------
    Dict[str, Any]
        The parsed top-level mapping, or an empty dict when the document
        is empty (or otherwise parses to a falsy value).

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist.
    ValueError
        If the document parses to something other than a mapping
        (for example a list or a bare scalar).
    """
    if not path.exists():
        raise FileNotFoundError(f"Missing config file: {path}")
    with path.open('r', encoding='utf-8') as f:
        # An empty document parses to None; treat it as "no entries".
        data = yaml.safe_load(f) or {}
    # Callers use .get()/.items() on the result, so fail here with the file
    # name instead of with an AttributeError far from the cause.
    if not isinstance(data, dict):
        raise ValueError(
            f"Config file must contain a top-level mapping, "
            f"got {type(data).__name__}: {path}"
        )
    return data


# Read the three mapping configs produced by Notebook 03
tag_weights: Dict[str, Dict[str, int]] = load_yaml(TAG_MAP_YAML)
keyword_patterns: Dict[str, List[str]] = load_yaml(KEYWORD_MAP_YAML)
org_hints: Dict[str, Dict[str, int]] = load_yaml(ORG_HINTS_YAML)

# Pre-compile every keyword regex once, case-insensitively. A component that
# is absent from the YAML, or listed with no patterns, gets an empty list.
compiled_keywords: Dict[str, List[re.Pattern]] = {}
for comp in RDLS_COMPONENTS:
    pats = keyword_patterns.get(comp) or []
    compiled_keywords[comp] = [re.compile(p, re.IGNORECASE) for p in pats]

print(f"Loaded tag maps: {list(tag_weights)}")
print(f"Loaded keyword maps: {list(keyword_patterns)}")
print(f"Loaded org hints: {len(org_hints)}")

In [None]:
"""
2.2 Load OSM Exclusion List from Notebook 02
"""

# IDs listed here are flagged as excluded by policy during classification.
osm_excluded: set = set()

if not OSM_EXCLUDED_IDS_TXT.exists():
    # A missing list is not fatal: classification proceeds with no exclusions.
    print(f"WARNING: OSM exclusion list not found: {OSM_EXCLUDED_IDS_TXT}")
else:
    with OSM_EXCLUDED_IDS_TXT.open('r', encoding='utf-8') as f:
        # One ID per line; surrounding whitespace and blank lines are ignored.
        osm_excluded.update(entry for entry in map(str.strip, f) if entry)
    print(f"Loaded {len(osm_excluded):,} OSM excluded IDs")

## 3. Classification Logic

In [None]:
"""
3.1 Helper Functions
"""

def iter_json_files(folder: Path) -> Iterable[Path]:
    """Yield the ``*.json`` files directly inside ``folder`` in sorted order.

    This is a generator, so the ``FileNotFoundError`` for a missing folder is
    raised when iteration starts, not when the function is called.
    """
    if folder.exists():
        json_paths = list(folder.glob('*.json'))
        json_paths.sort()
        for json_path in json_paths:
            yield json_path
        return
    raise FileNotFoundError(f"Dataset folder not found: {folder}")


def safe_load_json(path: Path) -> Dict[str, Any]:
    """Read ``path`` as UTF-8 text and return the parsed JSON document.

    Read and parse errors are not caught here; they propagate to the caller.
    """
    raw = path.read_text(encoding='utf-8')
    return json.loads(raw)


def as_list(x: Any) -> List[Any]:
    """Return ``x`` unchanged if it is a list, ``[]`` for None, else ``[x]``."""
    if isinstance(x, list):
        return x
    return [] if x is None else [x]


def normalize_text(s: Any) -> str:
    """Return ``str(s)`` with surrounding whitespace removed.

    Any falsy input (None, '', 0, empty containers) yields an empty string.
    """
    return str(s).strip() if s else ''


def extract_formats(resources: Any) -> List[str]:
    """Return the distinct upper-cased resource formats, in first-seen order.

    Entries that are not dicts, or whose ``format`` is missing or blank,
    contribute nothing.
    """
    # A dict keeps insertion order, so its keys double as an ordered set.
    ordered = {}
    for res in as_list(resources):
        raw = res.get('format') if isinstance(res, dict) else ''
        label = normalize_text(raw)
        if label:
            ordered.setdefault(label.upper(), None)
    return list(ordered)


print("Helper functions defined.")

In [None]:
"""
3.2 Classification Dataclass and Function
"""

@dataclass
class Classification:
    """
    Classification result for a dataset.

    Instances are built by ``classify_dataset``; the notebook then flattens
    them into one row of the classification table.
    """
    # Accumulated score for each RDLS component name.
    scores: Dict[str, int]
    # Components assigned to the dataset (may be empty).
    components: List[str]
    # True when the best component score reaches CANDIDATE_MIN_SCORE.
    rdls_candidate: bool
    # One of 'high', 'medium' or 'low', derived from the best component score.
    confidence: str
    # Human-readable descriptions of the strongest scoring signals (debug aid).
    top_signals: List[str]


def classify_dataset(meta: Dict[str, Any]) -> Classification:
    """
    Classify a dataset into RDLS components.

    Three kinds of evidence are added into one score per component:
    tag weights, keyword regex hits on the title and notes, and
    organization hints matched as substrings of the organization text.

    Parameters
    ----------
    meta : Dict[str, Any]
        Dataset metadata. The keys read are ``title``, ``notes``,
        ``organization`` and ``tags``; missing keys are treated as empty.

    Returns
    -------
    Classification
        Scores per component, the assigned components, the candidate flag,
        a confidence label and up to eight of the strongest signals.

    Notes
    -----
    ``components`` can be non-empty while ``rdls_candidate`` is False: when
    no component reaches ``CANDIDATE_MIN_SCORE`` but some score is positive,
    the single best component is still reported.
    """
    # Extract text fields
    title = normalize_text(meta.get('title'))
    notes = normalize_text(meta.get('notes'))
    org = normalize_text(meta.get('organization'))
    text = f"{title}\n{notes}".strip()

    # Extract and normalize tags (blank tags are dropped)
    tags = [normalize_text(t) for t in as_list(meta.get('tags'))]
    tags_lower = [t.lower() for t in tags if t]

    # Initialize scores; each signal is stored as (magnitude, description)
    scores = {c: 0 for c in RDLS_COMPONENTS}
    signals: List[Tuple[int, str]] = []

    # 1) Tag weights
    for comp, weights in tag_weights.items():
        # A component declared with no entries in YAML parses to None.
        if comp not in RDLS_COMPONENTS or not weights:
            continue
        for t in tags_lower:
            w = weights.get(t)
            if w:
                delta = int(w)
                scores[comp] += delta
                # Show the value actually added, with an explicit sign, so a
                # negative weight reads "(-2)" rather than "(+-2)".
                signals.append((abs(delta), f"tag:{t}({delta:+d})→{comp}"))

    # 2) Keyword pattern matches: each matching pattern counts once,
    #    however many times it occurs in the text.
    if text:
        for comp in RDLS_COMPONENTS:
            hits = 0
            for pat in compiled_keywords.get(comp, []):
                if pat.search(text):
                    hits += 1
                    signals.append((KEYWORD_HIT_WEIGHT, f"kw:{pat.pattern}(+{KEYWORD_HIT_WEIGHT})→{comp}"))
            if hits:
                scores[comp] += hits * KEYWORD_HIT_WEIGHT

    # 3) Organization hints (case-insensitive substring match)
    org_norm = org.lower()
    for hint, comp_weights in org_hints.items():
        # Skip blank hints and hints declared with no weights (None in YAML).
        if not hint or not comp_weights:
            continue
        # str() guards against YAML keys that parse as non-strings.
        if str(hint).lower() in org_norm:
            for comp, w in comp_weights.items():
                if comp in RDLS_COMPONENTS and w:
                    delta = int(w)
                    scores[comp] += delta
                    signals.append((abs(delta), f"org:{hint}({delta:+d})→{comp}"))

    # Determine components
    max_score = max(scores.values()) if scores else 0
    components = [c for c, s in scores.items() if s >= CANDIDATE_MIN_SCORE]

    # Fallback: include best component if any signal exists
    # (ties resolve to the first component in RDLS_COMPONENTS order)
    if not components and max_score > 0:
        best = [c for c, s in scores.items() if s == max_score]
        components = best[:1]

    rdls_candidate = max_score >= CANDIDATE_MIN_SCORE

    # Determine confidence
    if max_score >= CONF_HIGH:
        confidence = 'high'
    elif max_score >= CONF_MED:
        confidence = 'medium'
    else:
        confidence = 'low'

    # Top signals for debugging: strongest first, equal magnitudes keep
    # the order in which they were found (sorted() is stable).
    signals_sorted = [s for _, s in sorted(signals, key=lambda x: x[0], reverse=True)]
    top_signals = signals_sorted[:8]

    return Classification(
        scores=scores,
        components=components,
        rdls_candidate=rdls_candidate,
        confidence=confidence,
        top_signals=top_signals
    )


print("Classification function defined.")

## 4. Run Classification

In [None]:
"""
4.1 Process All Datasets
"""

# Materialize the file list up front so the total is known for progress output.
files = list(iter_json_files(DATASET_DIR))
total = len(files)

print(f"Processing {total:,} dataset files...")

rows: List[Dict[str, Any]] = []
errors = 0

# Reset errors log so it only ever describes the current run
if OUT_ERRORS_JSONL.exists():
    OUT_ERRORS_JSONL.unlink()

# Create iterator with progress bar
iterator = tqdm(files, desc="Classifying datasets") if HAS_TQDM else files

for i, fp in enumerate(iterator, start=1):
    try:
        meta = safe_load_json(fp)

        dataset_id = normalize_text(meta.get('id'))
        cls = classify_dataset(meta)

        excluded_by_policy = dataset_id in osm_excluded
        resources = as_list(meta.get('resources'))
        formats = extract_formats(resources)

        row = {
            'dataset_id': dataset_id,
            'name': normalize_text(meta.get('name')),
            'title': normalize_text(meta.get('title')),
            'organization': normalize_text(meta.get('organization')),
            'dataset_source': normalize_text(meta.get('dataset_source')),
            'license_title': normalize_text(meta.get('license_title')),
            'dataset_date': normalize_text(meta.get('dataset_date')),
            'last_modified': normalize_text(meta.get('last_modified')),
            'data_update_frequency': normalize_text(meta.get('data_update_frequency')),
            'groups': ';'.join([normalize_text(g) for g in as_list(meta.get('groups')) if normalize_text(g)]),
            # Join the normalized text, exactly as for groups: joining the raw
            # value raises TypeError for any non-string tag entry.
            'tags': ';'.join([normalize_text(t) for t in as_list(meta.get('tags')) if normalize_text(t)]),
            'resource_count': len(resources),
            'formats': ';'.join(formats),
            **{f'score_{k}': int(v) for k, v in cls.scores.items()},
            'rdls_components': ';'.join(cls.components),
            'rdls_candidate': bool(cls.rdls_candidate),
            'confidence': cls.confidence,
            'excluded_by_policy': bool(excluded_by_policy),
            'exclusion_reason': 'osm_policy' if excluded_by_policy else '',
            'top_signals': ' | '.join(cls.top_signals),
            'source_file': fp.name,
        }
        rows.append(row)

    except Exception as e:
        # Best effort: one unreadable or malformed file must not abort the
        # run, so the failure is counted and logged and the loop continues.
        errors += 1
        with OUT_ERRORS_JSONL.open('a', encoding='utf-8') as ef:
            ef.write(json.dumps({'file': fp.name, 'error': str(e)}, ensure_ascii=False) + '\n')

    # Progress for non-tqdm
    if not HAS_TQDM and i % 5000 == 0:
        print(f"  Processed {i:,}/{total:,}")

print(f"\nTotal rows: {len(rows):,}")
print(f"Errors: {errors:,}")

## 5. Write Outputs

In [None]:
"""
5.1 Save Classification Results
"""

df = pd.DataFrame(rows)

# Stable column order
base_cols = [
    'dataset_id', 'name', 'title', 'organization', 'dataset_source', 'license_title',
    'dataset_date', 'last_modified', 'data_update_frequency', 'groups', 'tags',
    'resource_count', 'formats',
    'score_hazard', 'score_exposure', 'score_vulnerability_proxy', 'score_loss_impact',
    'rdls_components', 'rdls_candidate', 'confidence',
    'excluded_by_policy', 'exclusion_reason',
    'top_signals', 'source_file'
]

# Guarantee every expected column exists (e.g. when no rows were produced)
for c in base_cols:
    if c not in df.columns:
        df[c] = ''

df = df[base_cols]
df.to_csv(OUT_CLASSIFICATION_CSV, index=False, encoding='utf-8')
print(f"Wrote: {OUT_CLASSIFICATION_CSV}")

# Included IDs = RDLS candidate AND not excluded.
# The explicit == comparisons are kept on purpose: they also behave correctly
# when a column is the '' placeholder added above rather than real booleans.
included_mask = (df['rdls_candidate'] == True) & (df['excluded_by_policy'] == False)

# A dataset without an 'id' is recorded with an empty string, which dropna()
# does not remove; filter blanks so the ID file never contains empty lines.
included_ids = [
    dataset_id
    for dataset_id in df.loc[included_mask, 'dataset_id'].dropna().tolist()
    if str(dataset_id).strip()
]

OUT_INCLUDED_IDS_TXT.write_text(
    '\n'.join(included_ids) + ('\n' if included_ids else ''),
    encoding='utf-8'
)
print(f"Wrote: {OUT_INCLUDED_IDS_TXT} ({len(included_ids):,} IDs)")

In [None]:
"""
5.2 Generate Summary Statistics
"""

# Row masks shared by the counts below
candidate_mask = df['rdls_candidate'] == True
allowed_mask = df['excluded_by_policy'] == False

# Assemble the summary section by section; key order is the JSON output order.
summary = {
    'total_datasets': int(len(df)),
    'errors': int(errors),
}
summary['policy'] = {
    'osm_excluded_ids_loaded': int(len(osm_excluded)),
    'datasets_excluded_by_policy': int(df['excluded_by_policy'].sum()),
}
summary['rdls'] = {
    'candidates_total': int(df['rdls_candidate'].sum()),
    'included_total': int((candidate_mask & allowed_mask).sum()),
}
summary['confidence_counts'] = df['confidence'].value_counts(dropna=False).to_dict()
summary['component_nonzero_counts'] = {
    comp: int((df[f'score_{comp}'] > 0).sum()) for comp in RDLS_COMPONENTS
}

summary_text = json.dumps(summary, indent=2, ensure_ascii=False)
OUT_SUMMARY_JSON.write_text(summary_text, encoding='utf-8')
print(f"Wrote: {OUT_SUMMARY_JSON}")

# Console report mirroring the JSON content
banner = '=' * 60
print(f"\n{banner}")
print("CLASSIFICATION SUMMARY")
print(banner)
print(f"Total datasets: {summary['total_datasets']:,}")
print(f"RDLS candidates: {summary['rdls']['candidates_total']:,}")
print(f"Included (after policy): {summary['rdls']['included_total']:,}")
print(f"Excluded by OSM policy: {summary['policy']['datasets_excluded_by_policy']:,}")

print("\nConfidence distribution:")
for conf, count in summary['confidence_counts'].items():
    print(f"  {conf}: {count:,}")

print("\nComponent coverage (non-zero scores):")
for comp, count in summary['component_nonzero_counts'].items():
    print(f"  {comp}: {count:,}")

## 6. Summary

In [None]:
"""
6.1 Display Top Candidates
"""

# Best component score per dataset, stored as an extra column on df
score_columns = [f'score_{c}' for c in RDLS_COMPONENTS]
df['score_max'] = df[score_columns].max(axis=1)

# Highest-scoring datasets among those not excluded by policy
print("\nTop 15 RDLS Candidates (by score):")
not_excluded_rows = df[df['excluded_by_policy'] == False]
ranked_rows = not_excluded_rows.sort_values('score_max', ascending=False)
top_candidates = ranked_rows.head(15)

display_cols = ['dataset_id', 'title', 'rdls_components', 'score_max', 'confidence']
print(top_candidates[display_cols].to_string(index=False))

# Recap of the files written by this notebook
print("\nOutputs:")
print(
    f"  - {OUT_CLASSIFICATION_CSV}",
    f"  - {OUT_SUMMARY_JSON}",
    f"  - {OUT_INCLUDED_IDS_TXT}",
    sep='\n',
)

print("\nNext: Run Notebook 05 to apply manual overrides.")
print(f"\nNotebook completed: {datetime.now().isoformat()}")