# Notebook 01: HDX Metadata Crawler

**Purpose**: Crawl the Humanitarian Data Exchange (HDX) catalogue and download dataset-level metadata as JSON.

**Process**:
1. Enumerate datasets using CKAN Action API (`package_search`)
2. Download dataset-level metadata via HDX export endpoint
3. Fall back to CKAN `package_show` if export fails
4. Write audit logs as JSON Lines files: a manifest of successfully downloaded datasets and an error log for failed ones

**Author**: Benny Istanto/Risk Data Librarian/GFDRR  
**Version**: 2026.1  

---

## 1. Setup

In [None]:
"""
1.1 Import Dependencies

Core libraries for HTTP requests, JSON handling, and file operations.
"""

from __future__ import annotations

import json
import random
import re
import time
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple

import requests

# tqdm is an optional dependency: when it cannot be imported the notebook
# still runs and HAS_TQDM tells later cells to report progress with print().
try:
    from tqdm.notebook import tqdm
except ImportError:
    HAS_TQDM = False
    print("Note: tqdm not installed. Progress will be shown via print statements.")
    print("Install with: pip install tqdm")
else:
    HAS_TQDM = True

# Record the start time and the progress style in the notebook output.
print("Notebook started: " + datetime.now().isoformat())
print("Progress bars: " + ("Available" if HAS_TQDM else "Not available"))

In [None]:
"""
1.2 Configuration

All configurable parameters are centralized here for easy adjustment.
"""

@dataclass
class CrawlerConfig:
    """
    Configuration for HDX metadata crawler.

    Attributes
    ----------
    base_url : str
        HDX base URL. A trailing slash is removed on construction so that
        URLs built from it never contain a double slash.
    rows_per_page : int
        Number of datasets per API page (max typically 1000)
    requests_per_second : float
        Rate limiting (be polite to the server)
    max_retries : int
        Maximum retry attempts for failed requests
    timeout : int
        HTTP request timeout in seconds
    max_datasets : Optional[int]
        Limit for testing (None = crawl all)
    add_slug_to_filename : bool
        Include human-readable slug in filenames
    slug_max_length : int
        Maximum length of slug in filename
    """
    base_url: str = "https://data.humdata.org"
    rows_per_page: int = 500
    requests_per_second: float = 2.0
    max_retries: int = 6
    timeout: int = 60
    max_datasets: Optional[int] = None  # Set to e.g. 100 for testing
    add_slug_to_filename: bool = True
    slug_max_length: int = 80

    def __post_init__(self) -> None:
        """Strip trailing slashes from ``base_url``.

        Every URL in the crawler is built as ``f"{base_url}/..."``, so a
        value such as ``"https://host/"`` would otherwise yield
        ``"https://host//api/3/action"``.
        """
        self.base_url = self.base_url.rstrip("/")

    @property
    def ckan_api_url(self) -> str:
        """CKAN Action API endpoint (``<base_url>/api/3/action``)."""
        return f"{self.base_url}/api/3/action"

# Build the configuration object shared by the rest of the notebook.
# max_datasets=None crawls everything; set a small number (e.g., 50) for testing.
config = CrawlerConfig(max_datasets=None)

# Echo the effective settings so they are captured in the notebook output.
print("Base URL:", config.base_url)
print("Rate limit:", config.requests_per_second, "req/sec")
print("Max datasets:", config.max_datasets or 'All')

In [None]:
"""
1.3 Configure Paths

Output directories for metadata JSON files and audit logs.
"""

# When the notebook is launched from a folder named 'notebook', outputs are
# written one level up; otherwise they are written under the working directory.
NOTEBOOK_DIR = Path.cwd()
if NOTEBOOK_DIR.name == 'notebook':
    BASE_DIR = NOTEBOOK_DIR.parent
else:
    BASE_DIR = NOTEBOOK_DIR

# Output directories: one JSON file per dataset goes into DATASET_META_DIR.
OUT_DIR = BASE_DIR.joinpath('hdx_dataset_metadata_dump')
DATASET_META_DIR = OUT_DIR.joinpath('dataset_metadata')

# Audit logs (JSON Lines): successful downloads and failures respectively.
MANIFEST_PATH = OUT_DIR.joinpath('manifest_datasets.jsonl')
ERRORS_PATH = OUT_DIR.joinpath('errors_datasets.jsonl')

# Make sure both output folders exist before any cell tries to write to them.
OUT_DIR.mkdir(parents=True, exist_ok=True)
DATASET_META_DIR.mkdir(parents=True, exist_ok=True)

print("Base directory:", BASE_DIR)
print("Output directory:", OUT_DIR)
print("Dataset metadata:", DATASET_META_DIR)

## 2. HTTP Client

In [None]:
"""
2.1 HTTP Session and Helpers

Robust HTTP client with rate limiting, retries, and bot-check detection.
"""

class HDXClient:
    """
    HTTP client for HDX API with built-in politeness and error handling.

    Features:
    - Rate limiting to respect server resources
    - Exponential backoff on failures
    - Bot-check page detection
    - Session persistence for connection reuse
    """

    # HTTP status codes treated as transient and therefore retried.
    _RETRYABLE_STATUS = (429, 500, 502, 503, 504)
    # Upper bound, in seconds, for a single exponential-backoff sleep.
    _MAX_BACKOFF_SECONDS = 60

    def __init__(self, config: CrawlerConfig):
        """
        Initialize HTTP client.

        Parameters
        ----------
        config : CrawlerConfig
            Crawler configuration object
        """
        self.config = config
        # -inf means "no request made yet", so the first request is never delayed.
        self._last_request_time = float("-inf")

        # Initialize session with appropriate headers
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "hdx-metadata-crawler/1.0 (RDLS pipeline)",
            "Accept": "application/json,text/plain,*/*",
        })

    def _looks_like_bot_check(self, text: str) -> bool:
        """Detect bot-check/captcha pages by their tell-tale phrases."""
        t = text.lower()
        return ("verify that you're not a robot" in t) or ("javascript is disabled" in t)

    def _rate_limit(self) -> None:
        """
        Enforce rate limiting between requests.

        Sleeps just long enough to keep the request rate at or below
        ``config.requests_per_second``. A non-positive rate disables
        limiting. A monotonic clock is used so that system clock
        adjustments cannot shorten or lengthen the pause.
        """
        if self.config.requests_per_second <= 0:
            return

        min_interval = 1.0 / self.config.requests_per_second
        elapsed = time.monotonic() - self._last_request_time

        if elapsed < min_interval:
            time.sleep(min_interval - elapsed)

        self._last_request_time = time.monotonic()

    def _backoff_delay(self, attempt: int) -> float:
        """Return the capped, jittered exponential delay for a 0-based attempt."""
        return min(self._MAX_BACKOFF_SECONDS, (2 ** attempt) + random.random())

    def get_json(self, url: str, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """
        GET JSON with retries and error handling.

        Connection errors and HTTP 429/5xx responses are retried up to
        ``config.max_retries`` attempts with exponential backoff (or the
        server's numeric ``Retry-After``). No sleep is performed after the
        final attempt, and the last failure cause is included in the error.

        Parameters
        ----------
        url : str
            URL to fetch
        params : Optional[Dict[str, Any]]
            Query parameters

        Returns
        -------
        Dict[str, Any]
            Parsed JSON response

        Raises
        ------
        RuntimeError
            If the request fails after all retries, the server returns a
            non-retryable HTTP error (>= 400), a bot-check page is detected,
            or the body is not valid JSON.
        """
        max_retries = self.config.max_retries
        last_error = "no attempts were made"

        for attempt in range(max_retries):
            is_last_attempt = attempt == max_retries - 1
            self._rate_limit()

            try:
                response = self.session.get(url, params=params, timeout=self.config.timeout)
            except requests.RequestException as exc:
                last_error = f"{type(exc).__name__}: {exc}"
                # Sleeping after the final attempt would only delay the error.
                if not is_last_attempt:
                    time.sleep(self._backoff_delay(attempt))
                continue

            # Retry on transient errors
            if response.status_code in self._RETRYABLE_STATUS:
                last_error = f"HTTP {response.status_code}"
                if not is_last_attempt:
                    retry_after = response.headers.get("Retry-After")
                    # Only the delta-seconds form is honoured; an HTTP-date
                    # value falls back to exponential backoff.
                    if retry_after and retry_after.isdigit():
                        time.sleep(int(retry_after))
                    else:
                        time.sleep(self._backoff_delay(attempt))
                continue

            # Hard failure
            if response.status_code >= 400:
                raise RuntimeError(f"HTTP {response.status_code} for {response.url}")

            # Check for bot-check pages (only inspected when the server did
            # not declare a JSON content type).
            content_type = (response.headers.get("Content-Type") or "").lower()
            if "json" not in content_type and self._looks_like_bot_check(response.text[:5000]):
                raise RuntimeError(f"Bot-check page returned for {response.url}")

            try:
                return response.json()
            except ValueError as exc:
                # JSON decoding errors are ValueError subclasses.
                raise RuntimeError(
                    f"Non-JSON response for {response.url}: {response.text[:200]}"
                ) from exc

        raise RuntimeError(
            f"Failed after {max_retries} retries: {url} (last error: {last_error})"
        )

    def ckan_action(self, action: str, **params: Any) -> Dict[str, Any]:
        """
        Call CKAN Action API.

        Parameters
        ----------
        action : str
            CKAN action name (e.g., 'package_search')
        **params : Any
            Action parameters

        Returns
        -------
        Dict[str, Any]
            Action result (the ``result`` member of the API envelope)

        Raises
        ------
        RuntimeError
            If the request fails, the payload is not a JSON object, or the
            API envelope does not report ``success``.
        """
        url = f"{self.config.ckan_api_url}/{action}"
        response = self.get_json(url, params=params)

        # A non-object payload (e.g. a JSON list) cannot be an API envelope.
        if not isinstance(response, dict):
            raise RuntimeError(
                f"CKAN action failed: {action} params={params} error={response!r}"
            )

        if not response.get("success", False):
            raise RuntimeError(
                f"CKAN action failed: {action} params={params} error={response.get('error')}"
            )

        return response["result"]

# Initialize the single shared client from the notebook-wide `config`; the
# crawler functions defined below reach it through the module-level name `client`.
client = HDXClient(config)
print("HTTP client initialized.")

## 3. Crawler Logic

In [None]:
"""
3.1 Utility Functions

Helper functions for filename generation and file I/O.
"""

# Matches every run of characters that is not an ASCII letter or digit.
_SLUG_RE = re.compile(r"[^a-zA-Z0-9]+")

def slugify(s: str) -> str:
    """
    Turn arbitrary text into a lowercase, hyphen-separated slug.

    Each run of characters other than ASCII letters and digits becomes a
    single hyphen; leading and trailing hyphens are removed.

    Parameters
    ----------
    s : str
        Input string (a falsy value such as None is treated as empty)

    Returns
    -------
    str
        Slugified string, possibly empty
    """
    text = s.strip() if s else ""
    hyphenated = _SLUG_RE.sub("-", text)
    trimmed = hyphenated.strip("-")
    return trimmed.lower()


def dataset_filename(dataset_id: str, dataset_name: str = "") -> Path:
    """
    Build the output path of a dataset's metadata JSON file.

    The file lives in DATASET_META_DIR and is named ``<dataset_id>.json``.
    When ``config.add_slug_to_filename`` is enabled and the name yields a
    non-empty slug, the name becomes ``<dataset_id>__<slug>.json`` with the
    slug cut to ``config.slug_max_length`` characters.

    Parameters
    ----------
    dataset_id : str
        HDX dataset UUID
    dataset_name : str
        Optional dataset name for readable slug

    Returns
    -------
    Path
        Output file path
    """
    if not config.add_slug_to_filename:
        return DATASET_META_DIR / f"{dataset_id}.json"

    readable = slugify(dataset_name)[:config.slug_max_length]
    stem = f"{dataset_id}__{readable}" if readable else dataset_id
    return DATASET_META_DIR / f"{stem}.json"


def write_json(path: Path, obj: Any) -> None:
    """
    Write object as a JSON file, atomically.

    The data is first written to a sibling ``<name>.tmp`` file and then
    renamed onto ``path``. The crawler treats an existing output file as
    "already downloaded", so a half-written file left behind by an
    interruption or a serialization error must never appear under the
    final name.

    Parameters
    ----------
    path : Path
        Output file path (parent directories are created if missing)
    obj : Any
        Object to serialize

    Raises
    ------
    TypeError, ValueError
        If ``obj`` cannot be serialized to JSON
    OSError
        If the file cannot be written or renamed
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = path.with_name(path.name + ".tmp")

    try:
        with tmp_path.open("w", encoding="utf-8") as f:
            json.dump(obj, f, ensure_ascii=False, indent=2)
        # Same-directory rename: the final name only ever holds a complete file.
        tmp_path.replace(path)
    except BaseException:
        # Remove the partial temp file (also on KeyboardInterrupt), then re-raise.
        try:
            tmp_path.unlink()
        except FileNotFoundError:
            pass
        raise


def append_jsonl(path: Path, obj: Dict[str, Any]) -> None:
    """
    Add one record to a JSON Lines file.

    The file is opened in append mode (created if missing) and the object
    is written as a single line of JSON followed by a newline. Non-ASCII
    characters are written as-is in UTF-8.

    Parameters
    ----------
    path : Path
        Output JSONL file path
    obj : Dict[str, Any]
        Object to append
    """
    with path.open("a", encoding="utf-8") as log_file:
        record = json.dumps(obj, ensure_ascii=False)
        log_file.write(f"{record}\n")


print("Utility functions defined.")

In [None]:
"""
3.2 Dataset Iterator

Paginated iteration over all HDX datasets using CKAN API.
"""

def iter_datasets(query: str = "*:*") -> Iterable[Dict[str, Any]]:
    """
    Iterate over all datasets matching query.

    Uses CKAN package_search with pagination, most recently modified
    datasets first. Stops when a page comes back empty, when the offset
    reaches the reported total, or when ``config.max_datasets`` datasets
    have been yielded.

    Parameters
    ----------
    query : str
        CKAN search query (default: all datasets)

    Yields
    ------
    Dict[str, Any]
        Dataset metadata dict
    """
    start = 0
    yielded = 0

    while True:
        result = client.ckan_action(
            "package_search",
            q=query,
            rows=config.rows_per_page,
            start=start,
            sort="metadata_modified desc",
            facet="false",
        )

        count = result.get("count", 0)
        datasets = result.get("results", [])

        if not datasets:
            break

        for ds in datasets:
            yield ds
            yielded += 1

            if config.max_datasets is not None and yielded >= config.max_datasets:
                return

        # Advance by the number of rows actually returned, not the number
        # requested: if the server caps `rows` below config.rows_per_page,
        # stepping by the requested size would silently skip datasets.
        start += len(datasets)
        if start >= count:
            break


print("Dataset iterator defined.")

In [None]:
"""
3.3 Metadata Downloader

Download dataset metadata with fallback strategy.
"""

def download_dataset_metadata(dataset_id: str) -> Tuple[Dict[str, Any], str]:
    """
    Fetch the dataset-level metadata for one dataset.

    The HDX ``download_metadata`` export is requested first. If that call
    raises for any reason, the dataset is fetched through CKAN
    ``package_show`` instead and wrapped in a dict that records why the
    fallback was needed. An error from the fallback call itself propagates
    to the caller.

    Parameters
    ----------
    dataset_id : str
        HDX dataset UUID

    Returns
    -------
    Tuple[Dict[str, Any], str]
        (metadata_dict, source_label), where source_label is
        "download_metadata" or "ckan_package_show_fallback"
    """
    export_url = f"{config.base_url}/dataset/{dataset_id}/download_metadata"

    try:
        exported = client.get_json(export_url, params={"format": "json"})
    except Exception as export_error:
        # Export unavailable: fall back to the CKAN API and keep the reason.
        package = client.ckan_action("package_show", id=dataset_id)
        fallback_payload = {
            "_fallback_reason": str(export_error),
            "_note": "Fallback used: CKAN package_show (may differ from HDX export).",
            "dataset": package,
        }
        return fallback_payload, "ckan_package_show_fallback"
    else:
        return exported, "download_metadata"


print("Metadata downloader defined.")

## 4. Run Crawler

In [None]:
"""
4.1 Main Crawl Function

Resume-safe crawler with progress tracking.
"""

@dataclass
class CrawlStats:
    """
    Statistics for crawl run.

    All counters start at zero and are incremented by run_dataset_crawl
    while it iterates over the datasets.
    """
    # Datasets taken from the iterator, including skipped and failed ones.
    total_datasets: int = 0
    # Datasets whose metadata file and manifest entry were both written.
    downloaded: int = 0
    # Datasets skipped because their output file already exists (resume).
    skipped_existing: int = 0
    # Datasets whose download or write raised; each is logged to the errors file.
    errors: int = 0
    # Downloads whose source label contains "fallback".
    fallbacks: int = 0


def _crawl_one_dataset(ds: Dict[str, Any], stats: CrawlStats) -> None:
    """
    Download and record a single dataset, updating ``stats`` in place.

    Datasets without an id are ignored. Datasets whose output file already
    exists are counted as skipped. Any exception raised while downloading
    or writing is counted and appended to the errors log rather than
    propagated, so one bad dataset does not stop the crawl.

    Parameters
    ----------
    ds : Dict[str, Any]
        Dataset dict as yielded by iter_datasets
    stats : CrawlStats
        Counters to update
    """
    ds_id = ds.get("id")
    ds_name = ds.get("name") or ""
    ds_title = ds.get("title") or ""

    if not ds_id:
        return

    out_path = dataset_filename(ds_id, ds_name)

    # Resume-safe: skip existing
    if out_path.exists():
        stats.skipped_existing += 1
        return

    try:
        meta, source = download_dataset_metadata(ds_id)
        write_json(out_path, meta)

        if "fallback" in source:
            stats.fallbacks += 1

        # Log success
        append_jsonl(MANIFEST_PATH, {
            "dataset_id": ds_id,
            "dataset_name": ds_name,
            "dataset_title": ds_title,
            "metadata_source": source,
            "metadata_file": str(out_path),
            "metadata_url": f"{config.base_url}/dataset/{ds_id}/download_metadata?format=json",
            "timestamp": datetime.now().isoformat(),
        })

        stats.downloaded += 1

    except Exception as e:
        # Crawl-level boundary: record the failure and carry on.
        stats.errors += 1
        append_jsonl(ERRORS_PATH, {
            "dataset_id": ds_id,
            "dataset_name": ds_name,
            "dataset_title": ds_title,
            "error": str(e),
            "timestamp": datetime.now().isoformat(),
        })


def run_dataset_crawl(query: str = "*:*") -> CrawlStats:
    """
    Crawl HDX datasets and download metadata.

    Resume-safe: skips existing files. Progress is shown with tqdm when it
    is available; otherwise a line is printed every 500 datasets, including
    datasets that were skipped, so resumed runs still report progress.

    Parameters
    ----------
    query : str
        CKAN search query

    Returns
    -------
    CrawlStats
        Crawl statistics
    """
    stats = CrawlStats()

    # Get total count for progress bar
    initial_result = client.ckan_action("package_search", q=query, rows=0)
    total_count = initial_result.get("count", 0)

    if config.max_datasets:
        total_count = min(total_count, config.max_datasets)

    print(f"Total datasets to process: {total_count:,}")

    # Create iterator with optional progress bar
    dataset_iter = iter_datasets(query)
    if HAS_TQDM:
        dataset_iter = tqdm(dataset_iter, total=total_count, desc="Crawling datasets")

    for ds in dataset_iter:
        stats.total_datasets += 1
        _crawl_one_dataset(ds, stats)

        # Progress update for non-tqdm. The per-dataset work lives in a
        # helper so that skipped datasets can no longer bypass this line.
        if not HAS_TQDM and stats.total_datasets % 500 == 0:
            print(f"  Processed {stats.total_datasets:,} datasets...")

    return stats


print("Crawler function ready.")

In [None]:
"""
4.2 Execute Crawl

Run the full dataset metadata crawl.
"""

# Full crawl of all public datasets
stats = run_dataset_crawl(query="*:*")

# Final report: a banner followed by one line per counter.
print("\n" + "=" * 60)
print("CRAWL COMPLETE")
print("=" * 60)
print(
    f"Total processed: {stats.total_datasets:,}",
    f"Downloaded: {stats.downloaded:,}",
    f"Skipped (existing): {stats.skipped_existing:,}",
    f"Fallbacks used: {stats.fallbacks:,}",
    f"Errors: {stats.errors:,}",
    sep="\n",
)

## 5. Summary

In [None]:
"""
5.1 Verify Output

Quick sanity check of downloaded files.
"""

# Every metadata file currently present in the dump directory.
json_files = list(DATASET_META_DIR.glob("*.json"))

print("\n" + "=" * 60)
print("OUTPUT SUMMARY")
print("=" * 60)
print(
    f"Dataset metadata files: {len(json_files):,}",
    f"Manifest: {MANIFEST_PATH}",
    f"Errors: {ERRORS_PATH}",
    sep="\n",
)

# List a few file names (alphabetical order) as a quick visual check.
if json_files:
    print("\nSample files (first 5):")
    print("\n".join(f"  - {sample.name}" for sample in sorted(json_files)[:5]))

print("\nNotebook completed: " + datetime.now().isoformat())