# Temporary Fix - Rebuild `manifest_datasets.jsonl` from Dataset JSON Folder

## Why this notebook exists
Your Step 1 run produced a complete dataset JSON dump, but the manifest write occasionally failed
(e.g., Windows file lock / permission issues), leaving entries missing from `manifest_datasets.jsonl`.

This notebook **rebuilds a fresh, complete manifest** directly from the dataset JSON files already downloaded,
without re-crawling HDX.

## What it does
1. Scans `hdx_dataset_metadata_dump/dataset_metadata/*.json`
2. Reads each dataset JSON and extracts key fields:
   - `dataset_id`, `dataset_name`, `dataset_title`
   - `metadata_source` (assumed `download_metadata` unless a fallback marker is detected)
   - `metadata_file` (relative path)
   - `metadata_url` (reconstructed from dataset_id)
3. Writes a brand-new:
   - `hdx_dataset_metadata_dump/manifest_datasets.jsonl`

Optionally, it can also:
- write `manifest_datasets.csv` for quick inspection in Excel
- write `manifest_datasets_duplicates.jsonl` if any duplicate IDs are found

## Inputs
- `hdx_dataset_metadata_dump/dataset_metadata/` (26k+ JSON files)

## Outputs
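- `hdx_dataset_metadata_dump/manifest_datasets.jsonl` (complete, rebuilt)
- optional: `hdx_dataset_metadata_dump/manifest_datasets.csv`
- optional: `hdx_dataset_metadata_dump/manifest_datasets_duplicates.jsonl` (written only when duplicate IDs are found)
- `hdx_dataset_metadata_dump/manifest_datasets_bad_files.txt` (written only when some files cannot be parsed or have no dataset `id`)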


In [1]:
"""
temporary_fixed.ipynb

Rebuild a complete HDX dataset manifest (JSONL) from an existing dataset metadata dump.

Design principles:
- No network calls; purely local rebuild
- Deterministic output ordering
- Audit-friendly: detects duplicates and malformed files
- Minimal dependencies: standard library only

Author: <YOUR NAME/ORG>
License: <YOUR LICENSE>
"""

from __future__ import annotations

import csv
import json
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple


In [2]:
# ---------------------------------------------------------------
# Configuration: every tunable path and switch lives in this cell
# ---------------------------------------------------------------

# Dump directory produced by Step 1 (relative path, resolved against the
# current working directory).
DUMP_DIR = Path("../hdx_dataset_metadata_dump")

# Sub-folder holding the dataset-level metadata JSON files.
DATASET_DIR = DUMP_DIR.joinpath("dataset_metadata")

# Rebuilt manifest; an existing file at this path is overwritten.
OUT_MANIFEST_JSONL = DUMP_DIR.joinpath("manifest_datasets.jsonl")

# Optional CSV copy of the manifest.
WRITE_CSV = True
OUT_MANIFEST_CSV = DUMP_DIR.joinpath("manifest_datasets.csv")

# Optional report of rows whose dataset_id was already seen.
WRITE_DUPLICATES_JSONL = True
OUT_DUPLICATES_JSONL = DUMP_DIR.joinpath("manifest_datasets_duplicates.jsonl")

# Site root used when reconstructing each row's metadata_url.
BASE_URL = "https://data.humdata.org"

print("DATASET_DIR:", DATASET_DIR.resolve())


DATASET_DIR: C:\Users\benny\OneDrive\Documents\Github\hdx-metadata-crawler\hdx_dataset_metadata_dump\dataset_metadata


In [3]:
# =========================
# Helpers
# =========================

def iter_json_files(folder: Path) -> Iterable[Path]:
    """Lazily yield the ``*.json`` files directly inside ``folder`` in sorted order.

    Sub-folders are not searched. Because this is a generator, the
    ``FileNotFoundError`` for a missing folder is raised on first iteration,
    not when the function is called.
    """
    if folder.exists():
        json_paths = list(folder.glob("*.json"))
        json_paths.sort()  # sorted paths keep the scan order deterministic
        for json_path in json_paths:
            yield json_path
        return
    raise FileNotFoundError(f"Dataset folder not found: {folder}")

def read_json(path: Path) -> Optional[Dict[str, Any]]:
    """Read a JSON file and return its top-level object.

    Args:
        path: file to read.

    Returns:
        The parsed dict, or None when the file cannot be read, is not valid
        JSON, or its top-level value is not a JSON object (list, string,
        number, ...).
    """
    try:
        # "utf-8-sig" strips a leading byte-order mark if present; with plain
        # "utf-8" the BOM reaches json.loads, which rejects it.
        # errors="ignore" keeps the best-effort decoding of stray invalid bytes.
        text = path.read_text(encoding="utf-8-sig", errors="ignore")
        parsed = json.loads(text)
    except (OSError, ValueError, RecursionError):
        # OSError: unreadable file; ValueError: malformed JSON (JSONDecodeError
        # is a subclass); RecursionError: pathologically deep nesting.
        return None
    # Honour the declared return type: callers use .get() on the result.
    return parsed if isinstance(parsed, dict) else None

def normalize_dataset_record(raw: Dict[str, Any]) -> Dict[str, Any]:
    """Return the dataset record, unwrapping a ``{'dataset': {...}}`` envelope if needed.

    A value that already has a top-level ``id`` (or is not a dict at all) is
    returned unchanged; otherwise a dict found under ``dataset`` is returned.
    """
    if not isinstance(raw, dict) or "id" in raw:
        return raw
    wrapped = raw.get("dataset")
    return wrapped if isinstance(wrapped, dict) else raw

def reconstruct_metadata_url(dataset_id: str) -> str:
    """Build the ``download_metadata`` JSON URL for a dataset id under ``BASE_URL``."""
    url_template = "{}/dataset/{}/download_metadata?format=json"
    return url_template.format(BASE_URL, dataset_id)

def detect_metadata_source(raw: Dict[str, Any]) -> str:
    """Classify a raw record as a direct export or a fallback wrapper.

    Returns ``"ckan_package_show_fallback"`` when the record is a dict with a
    ``dataset`` key plus at least one helper marker (``_fallback_reason`` or
    ``_note``); returns ``"download_metadata"`` in every other case.
    """
    # Step 1 normally stores the HDX export as-is (id at top level); on
    # fallback it wrapped the record and added the helper fields checked here.
    if not isinstance(raw, dict):
        return "download_metadata"
    has_marker = any(key in raw for key in ("_fallback_reason", "_note"))
    if has_marker and "dataset" in raw:
        return "ckan_package_show_fallback"
    return "download_metadata"


In [4]:
# =========================
# Build manifest rows
# =========================

def build_manifest(dataset_dir: Path) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Path]]:
    """
    Build manifest rows from every ``*.json`` file directly inside ``dataset_dir``.

    Args:
      dataset_dir: folder holding the dataset JSON files.

    Returns:
      rows: one row per unique dataset_id (the first file encountered for that
            id, in sorted path order), sorted by dataset_id
      duplicates: rows built from later files whose dataset_id had already been
                  seen, sorted by dataset_id
      bad_files: files that could not be read/parsed, whose dataset record is
                 not a JSON object, or that have no non-empty ``id``

    Raises:
      FileNotFoundError: if ``dataset_dir`` does not exist (from iter_json_files).
    """
    rows: List[Dict[str, Any]] = []
    duplicates: List[Dict[str, Any]] = []
    bad_files: List[Path] = []
    seen_ids = set()  # only membership matters, so a set is enough

    files = list(iter_json_files(dataset_dir))
    total = len(files)
    print(f"Scanning {total:,} dataset JSON files...")

    for i, path in enumerate(files, start=1):
        if i % 1000 == 0 or i == total:
            print(f"  processed {i:,}/{total:,}")

        raw = read_json(path)
        if raw is None:
            bad_files.append(path)
            continue

        ds = normalize_dataset_record(raw)
        # Valid JSON is not necessarily an object: a top-level list/string/number
        # has no fields to read. Report such files as bad instead of crashing
        # on .get() and aborting the whole scan.
        if not isinstance(ds, dict):
            bad_files.append(path)
            continue

        dataset_id = ds.get("id") or ""
        dataset_name = ds.get("name") or ""
        dataset_title = ds.get("title") or ""

        # A record without an id cannot be keyed or given a URL.
        if not dataset_id:
            bad_files.append(path)
            continue

        # Classified from the raw (possibly wrapped) record, not the unwrapped one.
        source = detect_metadata_source(raw)

        # Prefer a dump-relative path; fall back to the path as given when the
        # file is not located under DUMP_DIR.
        rel_path = path.relative_to(DUMP_DIR).as_posix() if path.is_relative_to(DUMP_DIR) else path.as_posix()

        row = {
            "dataset_id": dataset_id,
            "dataset_name": dataset_name,
            "dataset_title": dataset_title,
            "metadata_source": source,
            "metadata_file": rel_path,
            "metadata_url": reconstruct_metadata_url(dataset_id),
        }

        if dataset_id in seen_ids:
            duplicates.append(row)
        else:
            seen_ids.add(dataset_id)
            rows.append(row)

    # Deterministic ordering: dataset_id ascending
    rows.sort(key=lambda r: r["dataset_id"])
    duplicates.sort(key=lambda r: r["dataset_id"])

    return rows, duplicates, bad_files

# Rebuild the manifest from the configured dataset folder.
rows, duplicates, bad_files = build_manifest(DATASET_DIR)

# Three-line summary; labels are padded so the counts line up.
print(
    f"\nManifest rows: {len(rows):,}",
    f"Duplicate IDs:  {len(duplicates):,}",
    f"Bad files:      {len(bad_files):,}",
    sep="\n",
)


Scanning 26,246 dataset JSON files...
  processed 1,000/26,246
  processed 2,000/26,246
  processed 3,000/26,246
  processed 4,000/26,246
  processed 5,000/26,246
  processed 6,000/26,246
  processed 7,000/26,246
  processed 8,000/26,246
  processed 9,000/26,246
  processed 10,000/26,246
  processed 11,000/26,246
  processed 12,000/26,246
  processed 13,000/26,246
  processed 14,000/26,246
  processed 15,000/26,246
  processed 16,000/26,246
  processed 17,000/26,246
  processed 18,000/26,246
  processed 19,000/26,246
  processed 20,000/26,246
  processed 21,000/26,246
  processed 22,000/26,246
  processed 23,000/26,246
  processed 24,000/26,246
  processed 25,000/26,246
  processed 26,000/26,246
  processed 26,246/26,246

Manifest rows: 26,246
Duplicate IDs:  0
Bad files:      0


In [5]:
# =========================
# Write manifest (JSONL + optional CSV)
# =========================

def write_jsonl(path: Path, rows: List[Dict[str, Any]]) -> None:
    """Write ``rows`` to ``path`` as JSON Lines (one JSON object per line).

    Args:
        path: output file; parent folders are created if missing and an
            existing file is overwritten.
        rows: JSON-serialisable dicts, written in the given order.

    The file is UTF-8 with non-ASCII characters kept literal
    (``ensure_ascii=False``). Every line ends in ``\\n``.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    # newline="\n" disables platform newline translation: without it, text
    # mode on Windows writes "\r\n", so the same input would give different
    # bytes on different machines.
    with path.open("w", encoding="utf-8", newline="\n") as f:
        for r in rows:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")

def write_csv(path: Path, rows: List[Dict[str, Any]]) -> None:
    """Write ``rows`` to ``path`` as a UTF-8 CSV file with a header row.

    Args:
        path: output file; parent folders are created if missing and an
            existing file is overwritten.
        rows: dicts to write; the column order comes from the keys of the
            first row.

    Nothing is written (and no folder is created) when ``rows`` is empty,
    because there is no row to take the header from.
    """
    if not rows:
        return
    # Same as write_jsonl: make sure the target folder exists, so a fresh
    # output location does not fail with FileNotFoundError.
    path.parent.mkdir(parents=True, exist_ok=True)
    # newline="" lets the csv module control line endings itself.
    with path.open("w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=list(rows[0].keys()))
        w.writeheader()
        w.writerows(rows)

# The JSONL manifest is always written.
write_jsonl(OUT_MANIFEST_JSONL, rows)
print("Wrote:", OUT_MANIFEST_JSONL.resolve())

# CSV copy, controlled by the WRITE_CSV switch.
if WRITE_CSV:
    write_csv(OUT_MANIFEST_CSV, rows)
    print("Wrote:", OUT_MANIFEST_CSV.resolve())

# Duplicates report: written only when enabled and there is something to report.
if WRITE_DUPLICATES_JSONL:
    if duplicates:
        write_jsonl(OUT_DUPLICATES_JSONL, duplicates)
        print("Wrote:", OUT_DUPLICATES_JSONL.resolve())
    else:
        print("No duplicates found; duplicates file not written.")

# Debugging aid: one POSIX-style path per line for every file that was skipped.
if bad_files:
    bad_list = DUMP_DIR / "manifest_datasets_bad_files.txt"
    with bad_list.open("w", encoding="utf-8") as f:
        for p in bad_files:
            print(p.as_posix(), file=f)
    print("Wrote:", bad_list.resolve())


Wrote: C:\Users\benny\OneDrive\Documents\Github\hdx-metadata-crawler\hdx_dataset_metadata_dump\manifest_datasets.jsonl
Wrote: C:\Users\benny\OneDrive\Documents\Github\hdx-metadata-crawler\hdx_dataset_metadata_dump\manifest_datasets.csv
No duplicates found; duplicates file not written.


In [6]:
# ------------------------------------------
# Sanity checks on the freshly built manifest
# ------------------------------------------

# 1) Show a few rows from each end of the sorted manifest.
for heading, sample in (("\nFirst 3 rows:", rows[:3]), ("\nLast 3 rows:", rows[-3:])):
    print(heading)
    for r in sample:
        print(r)

# 2) Compare the number of JSON files on disk with the number of unique rows.
n_json = sum(1 for _ in iter_json_files(DATASET_DIR))
print(f"\nJSON files in folder: {n_json:,}")
print(f"Manifest unique rows:  {len(rows):,}")
if len(rows) != n_json:
    print("NOTE: Folder file count != unique manifest rows. See duplicates/bad files outputs above.")



First 3 rows:
{'dataset_id': '00035653-2e52-4716-9121-7d6f6d9f961b', 'dataset_name': 'rasp4', 'dataset_title': 'Risk Assessment Site Priority (RASP)', 'metadata_source': 'download_metadata', 'metadata_file': 'dataset_metadata/00035653-2e52-4716-9121-7d6f6d9f961b__rasp4.json', 'metadata_url': 'https://data.humdata.org/dataset/00035653-2e52-4716-9121-7d6f6d9f961b/download_metadata?format=json'}
{'dataset_id': '0004678f-50cb-4a07-8969-5a74bc492efb', 'dataset_name': 'financiamiento-y-personas-alcanzadas-por-el-cluster-salud-colombia', 'dataset_title': 'Financiamiento y personas alcanzadas por el Clúster salud - Colombia 2021', 'metadata_source': 'download_metadata', 'metadata_file': 'dataset_metadata/0004678f-50cb-4a07-8969-5a74bc492efb__financiamiento-y-personas-alcanzadas-por-el-cluster-salud-colombia.json', 'metadata_url': 'https://data.humdata.org/dataset/0004678f-50cb-4a07-8969-5a74bc492efb/download_metadata?format=json'}
{'dataset_id': '00090357-0df0-4e33-9bc8-aa3ce425ef09', 'datas