# 07 — Validate RDLS JSON and package for delivery

This notebook validates the RDLS JSON records produced in **Step 06**, produces QA reports, and packages the deliverable bundle.

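**Outputs** (with `OUTPUT_MODE="run_folder"` these are written under `rdls/runs/<RUN_ID>/` instead of `rdls/`)
- `rdls/reports/schema_validation_full.csv`
- `rdls/reports/rdls_validation_summary.md`
- `rdls/reports/rdls_missing_fields.csv`
- `rdls/reports/rdls_duplicates.csv`
- `rdls/dist/rdls_metadata_bundle.zip`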


In [1]:
# ======================
# Config (edit if needed)
# ======================
from pathlib import Path
import json
import csv
import hashlib
from typing import Any, Dict, List, Tuple, Optional

# Folder holding the HDX metadata dump. The relative path is resolved against the
# kernel's current working directory when this cell runs; the default assumes the
# notebook is run from /notebooks.
DUMP_DIR = Path("../hdx_dataset_metadata_dump").resolve()

# RDLS root (created in Notebook 06)
RDLS_DIR = (DUMP_DIR / "rdls").resolve()

# -------------------------
# Output mode compatibility
# -------------------------
# - "in_place"  : validate/package rdls/{records,index,reports}
# - "run_folder": validate/package rdls/runs/<RUN_ID>/{records,index,reports}
# Any other value makes resolve_rdls_run_dir() raise ValueError.
OUTPUT_MODE = "in_place"  # "in_place" | "run_folder"

# Only used when OUTPUT_MODE="run_folder".
# If None/blank, the notebook will try:
#  1) rdls/runs/_latest.txt  (if present and it names an existing run folder)
#  2) most recently modified directory under rdls/runs/
#     (entries whose name starts with "_" are ignored)
RUN_ID: Optional[str] = None

def resolve_rdls_run_dir(rdls_dir: Path, output_mode: str, run_id: Optional[str]) -> Path:
    """Return the folder whose records/index/reports this notebook should process.

    Args:
        rdls_dir: RDLS root folder.
        output_mode: ``"in_place"`` to use ``rdls_dir`` itself, or
            ``"run_folder"`` to use one run folder under ``rdls_dir / "runs"``.
        run_id: Name of the run folder to use in ``"run_folder"`` mode.
            ``None``, an empty string, or a whitespace-only string means
            "pick a run automatically"; surrounding whitespace is ignored.

    Returns:
        The resolved (absolute) folder path.

    Raises:
        ValueError: ``output_mode`` is not one of the two supported values.
        FileNotFoundError: the requested run folder does not exist (or is not
            a directory), the runs folder is missing, or it contains no run
            folders.
    """
    rdls_dir = rdls_dir.resolve()
    if output_mode == "in_place":
        return rdls_dir

    if output_mode != "run_folder":
        raise ValueError(f"Unknown OUTPUT_MODE: {output_mode}")

    runs_dir = rdls_dir / "runs"
    latest_ptr = runs_dir / "_latest.txt"

    # 1) explicit RUN_ID
    # Strip first so a blank/whitespace-only value falls through to
    # auto-detection instead of being looked up as a folder name.
    requested = (run_id or "").strip()
    if requested:
        candidate = (runs_dir / requested).resolve()
        # is_dir() rather than exists(): a regular file is not a run folder.
        if not candidate.is_dir():
            raise FileNotFoundError(f"RUN_ID folder not found: {candidate}")
        return candidate

    # 2) _latest.txt pointer (best effort: a stale or empty pointer is ignored)
    if latest_ptr.is_file():
        rid = latest_ptr.read_text(encoding="utf-8").strip()
        if rid:
            candidate = (runs_dir / rid).resolve()
            if candidate.is_dir():
                return candidate

    # 3) newest run folder fallback
    if not runs_dir.is_dir():
        raise FileNotFoundError(f"Runs folder not found: {runs_dir}")

    # Entries starting with "_" (such as the pointer file) are never runs.
    candidates = [p for p in runs_dir.iterdir() if p.is_dir() and not p.name.startswith("_")]
    if not candidates:
        raise FileNotFoundError(
            f"No run folders found under: {runs_dir}. "
            "Either set OUTPUT_MODE='in_place' or provide RUN_ID."
        )
    newest = max(candidates, key=lambda p: p.stat().st_mtime)
    return newest.resolve()

# Folder selected by OUTPUT_MODE / RUN_ID; every run-specific path hangs off it.
RDLS_RUN_DIR = resolve_rdls_run_dir(RDLS_DIR, OUTPUT_MODE, RUN_ID)

# Inputs: records and index come from the selected run dir, while the schema
# is always read from the RDLS root.
RECORDS_DIR = RDLS_RUN_DIR.joinpath("records")
INDEX_JSONL = RDLS_RUN_DIR.joinpath("index", "rdls_index.jsonl")
SCHEMA_JSON = RDLS_DIR.joinpath("schema", "rdls_schema_v0.3.json")

# Outputs: reports and the zip bundle are written inside the selected run dir.
REPORTS_DIR = RDLS_RUN_DIR.joinpath("reports")
DIST_DIR = RDLS_RUN_DIR.joinpath("dist")

# Create the output folders up front so later cells can write without checks.
REPORTS_DIR.mkdir(parents=True, exist_ok=True)
DIST_DIR.mkdir(parents=True, exist_ok=True)

# Echo the effective configuration, one "NAME: path" line per setting.
print(
    f"DUMP_DIR: {DUMP_DIR}",
    f"RDLS_DIR: {RDLS_DIR}",
    f"OUTPUT_MODE: {OUTPUT_MODE}",
    f"RDLS_RUN_DIR: {RDLS_RUN_DIR}",
    f"RECORDS_DIR: {RECORDS_DIR}",
    f"INDEX_JSONL: {INDEX_JSONL}",
    f"SCHEMA_JSON: {SCHEMA_JSON}",
    f"DIST_DIR: {DIST_DIR}",
    sep="\n",
)


DUMP_DIR: C:\Users\benny\OneDrive\Documents\Github\hdx-metadata-crawler\hdx_dataset_metadata_dump
RDLS_DIR: C:\Users\benny\OneDrive\Documents\Github\hdx-metadata-crawler\hdx_dataset_metadata_dump\rdls
OUTPUT_MODE: in_place
RDLS_RUN_DIR: C:\Users\benny\OneDrive\Documents\Github\hdx-metadata-crawler\hdx_dataset_metadata_dump\rdls
RECORDS_DIR: C:\Users\benny\OneDrive\Documents\Github\hdx-metadata-crawler\hdx_dataset_metadata_dump\rdls\records
INDEX_JSONL: C:\Users\benny\OneDrive\Documents\Github\hdx-metadata-crawler\hdx_dataset_metadata_dump\rdls\index\rdls_index.jsonl
SCHEMA_JSON: C:\Users\benny\OneDrive\Documents\Github\hdx-metadata-crawler\hdx_dataset_metadata_dump\rdls\schema\rdls_schema_v0.3.json
DIST_DIR: C:\Users\benny\OneDrive\Documents\Github\hdx-metadata-crawler\hdx_dataset_metadata_dump\rdls\dist


In [2]:
# ======================
# Load schema + optional jsonschema validator
# ======================
import pandas as pd

def safe_load_json(path: Path) -> Dict[str, Any]:
    """Read and parse a UTF-8 encoded JSON file.

    The file is decoded as ``utf-8-sig`` so a leading byte-order mark (as
    written by some Windows editors) is skipped instead of making
    ``json.loads`` fail; files without a BOM decode exactly as plain UTF-8.

    Args:
        path: File to read.

    Returns:
        The parsed JSON value. The annotation reflects the expected case (a
        JSON object); nothing here enforces it, so a file whose top-level
        value is an array or scalar is returned as-is.

    Raises:
        OSError: the file cannot be read.
        UnicodeDecodeError: the file is not valid UTF-8.
        json.JSONDecodeError: the content is not valid JSON.
    """
    return json.loads(path.read_text(encoding="utf-8-sig"))

# Parse the schema file once; the parsed dict is reused for the validator
# and for the required-field list.
rdls_schema = safe_load_json(SCHEMA_JSON)

def try_import_jsonschema():
    """Return the ``jsonschema`` module, or ``None`` if importing it fails.

    Any exception raised by the import (not only ``ImportError``) is treated
    as "not available", so the caller can fall back to skipping validation.
    """
    try:
        import jsonschema as module  # type: ignore
    except Exception:
        module = None
    return module

# Build the shared validator. It stays None when jsonschema is missing or
# when constructing the validator raises; downstream code checks for None.
_jsonschema = try_import_jsonschema()
validator = None
if _jsonschema is None:
    print("WARNING: jsonschema not installed; schema validation will be skipped.")
else:
    try:
        validator = _jsonschema.Draft202012Validator(rdls_schema)  # type: ignore
    except Exception as init_error:
        validator = None
        print("WARNING: jsonschema available but validator init failed:", init_error)
    else:
        print("jsonschema validation enabled (Draft2020-12).")

def validate_dataset_obj(dataset_obj: Dict[str, Any], max_errors: int = 10) -> Tuple[bool, str]:
    """Validate the RDLS *dataset object* (not the outer wrapper).

    Uses the module-level ``validator``.

    Args:
        dataset_obj: The dataset object to check.
        max_errors: Maximum number of individual errors spelled out in the
            message; any remainder is summarised as ``(+N more)``.

    Returns:
        ``(True, "")`` when no errors are found, otherwise ``(False, message)``
        where ``message`` joins ``"<path>: <error>"`` items with ``" | "``.
        Errors on the object itself (empty path) are labelled ``<root>``.
    """
    if validator is None:
        # NOTE(review): with no validator every record is reported as valid
        # although nothing was checked; the only signal is the warning printed
        # when the validator was set up.
        return True, ""
    errors = sorted(validator.iter_errors(dataset_obj), key=lambda e: e.path)
    if not errors:
        return True, ""
    msgs = []
    for e in errors[:max(0, max_errors)]:
        # An empty path means the error is about the dataset object itself.
        path = ".".join(str(p) for p in e.path) or "<root>"
        msgs.append(f"{path}: {e.message}")
    hidden = len(errors) - len(msgs)
    if hidden > 0:
        # Make the truncation visible instead of silently dropping errors.
        msgs.append(f"(+{hidden} more)")
    return False, " | ".join(msgs)

# Determine required fields from schema.
# Only the schema's top-level "required" list is read (empty list when the key
# is absent); "required" lists inside nested sub-schemas are not included.
REQUIRED_FIELDS = rdls_schema.get("required", [])
print("Required fields (schema):", REQUIRED_FIELDS)


jsonschema validation enabled (Draft2020-12).
Required fields (schema): ['id', 'title', 'risk_data_type', 'attributions', 'spatial', 'license', 'resources']


In [3]:
# ======================
# Validate all records + compute duplicates
# ======================
import zipfile

def iter_record_files(folder: Path) -> List[Path]:
    """Return the ``*.json`` entries directly inside ``folder``, sorted.

    The search is not recursive.

    Raises:
        FileNotFoundError: ``folder`` does not exist.
    """
    if folder.exists():
        matches = list(folder.glob("*.json"))
        matches.sort()
        return matches
    raise FileNotFoundError(f"Records folder not found: {folder}")

def sha256_file(path: Path) -> str:
    """Return the hex SHA-256 digest of the file's bytes.

    The file is read in 1 MiB pieces so it is never held in memory whole.
    """
    chunk_size = 1024 * 1024
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            block = stream.read(chunk_size)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()

record_files = iter_record_files(RECORDS_DIR)
print("Record files:", len(record_files))

# Column layouts are fixed up front so the DataFrames, and the CSVs written
# from them, keep their headers even when a report has zero rows.
VALIDATION_COLUMNS = ["filename", "rdls_id", "valid", "message"]
MISSING_COLUMNS = ["filename", "rdls_id", "missing_fields"]
DUPLICATE_COLUMNS = ["type", "rdls_id", "filename_a", "filename_b"]

rows_validation: List[Dict[str, Any]] = []
rows_missing: List[Dict[str, Any]] = []
rows_duplicates: List[Dict[str, Any]] = []

seen_ids: Dict[str, str] = {}         # rdls id -> first filename seen with it
seen_hash: Dict[str, str] = {}        # sha256 -> first filename seen with it

valid_ok = 0
invalid = 0

for fp in record_files:
    try:
        rec = safe_load_json(fp)
    except Exception as e:
        # Unreadable/unparseable files are reported as invalid and skipped.
        invalid += 1
        rows_validation.append({
            "filename": fp.name,
            "rdls_id": "",
            "valid": False,
            "message": f"json_parse_error: {e}",
        })
        continue

    # Support either outer wrapper {'datasets':[...]} or raw dataset object.
    # NOTE(review): for a wrapper holding several datasets only the first one
    # is checked.
    if isinstance(rec, dict) and isinstance(rec.get("datasets"), list) and rec["datasets"]:
        ds = rec["datasets"][0]
    else:
        ds = rec

    # Valid JSON whose top-level value (or first dataset) is not an object
    # cannot be inspected with .get(); report it instead of crashing the run.
    if not isinstance(ds, dict):
        invalid += 1
        rows_validation.append({
            "filename": fp.name,
            "rdls_id": "",
            "valid": False,
            "message": f"not_a_json_object: expected a dataset object, got {type(ds).__name__}",
        })
        continue

    # A JSON null id must count as "no id", not as the literal string "None"
    # (which would flag every null-id record as a duplicate of the others).
    raw_id = ds.get("id")
    rdls_id = "" if raw_id is None else str(raw_id).strip()

    ok, msg = validate_dataset_obj(ds)
    rows_validation.append({
        "filename": fp.name,
        "rdls_id": rdls_id,
        "valid": ok,
        "message": msg,
    })
    if ok:
        valid_ok += 1
    else:
        invalid += 1

    # Missing required fields (treat empty string / empty list / empty dict as missing too)
    missing_fields = []
    for k in REQUIRED_FIELDS:
        v = ds.get(k, None)
        if v is None:
            missing_fields.append(k)
        elif isinstance(v, str) and not v.strip():
            missing_fields.append(k)
        elif isinstance(v, (list, dict)) and len(v) == 0:
            missing_fields.append(k)
    if missing_fields:
        rows_missing.append({
            "filename": fp.name,
            "rdls_id": rdls_id,
            "missing_fields": ";".join(missing_fields),
        })

    # Duplicate ids: every later file is paired with the first file that used the id.
    if rdls_id:
        if rdls_id in seen_ids:
            rows_duplicates.append({
                "type": "duplicate_id",
                "rdls_id": rdls_id,
                "filename_a": seen_ids[rdls_id],
                "filename_b": fp.name,
            })
        else:
            seen_ids[rdls_id] = fp.name

    # Identical-content duplicates (byte-for-byte, via file hash)
    file_hash = sha256_file(fp)
    if file_hash in seen_hash:
        rows_duplicates.append({
            "type": "duplicate_content_hash",
            "rdls_id": rdls_id,
            "filename_a": seen_hash[file_hash],
            "filename_b": fp.name,
        })
    else:
        seen_hash[file_hash] = fp.name

df_val = pd.DataFrame(rows_validation, columns=VALIDATION_COLUMNS)
df_missing = pd.DataFrame(rows_missing, columns=MISSING_COLUMNS)
df_dups = pd.DataFrame(rows_duplicates, columns=DUPLICATE_COLUMNS)

print("Valid:", valid_ok, "Invalid:", invalid)
print("Files with missing required fields:", len(df_missing))
print("Duplicates found:", len(df_dups))

OUT_VALIDATION = REPORTS_DIR / "schema_validation_full.csv"
OUT_MISSING = REPORTS_DIR / "rdls_missing_fields.csv"
OUT_DUPS = REPORTS_DIR / "rdls_duplicates.csv"

df_val.to_csv(OUT_VALIDATION, index=False)
df_missing.to_csv(OUT_MISSING, index=False)
df_dups.to_csv(OUT_DUPS, index=False)

print("Wrote:", OUT_VALIDATION)
print("Wrote:", OUT_MISSING)
print("Wrote:", OUT_DUPS)


Record files: 50
Valid: 50 Invalid: 0
Files with missing required fields: 0
Duplicates found: 0
Wrote: C:\Users\benny\OneDrive\Documents\Github\hdx-metadata-crawler\hdx_dataset_metadata_dump\rdls\reports\schema_validation_full.csv
Wrote: C:\Users\benny\OneDrive\Documents\Github\hdx-metadata-crawler\hdx_dataset_metadata_dump\rdls\reports\rdls_missing_fields.csv
Wrote: C:\Users\benny\OneDrive\Documents\Github\hdx-metadata-crawler\hdx_dataset_metadata_dump\rdls\reports\rdls_duplicates.csv


In [6]:
# ======================
# Create a human-readable summary (Markdown)
# ======================
from datetime import datetime, timezone

# Count schema results defensively: when no record files were found the
# validation frame may have no "valid" column at all, and indexing it would
# raise KeyError before any summary is written.
if "valid" in df_val.columns:
    valid_flags = df_val["valid"].astype(bool)
    n_schema_valid = int(valid_flags.sum())
    n_schema_invalid = int((~valid_flags).sum())
else:
    n_schema_valid = 0
    n_schema_invalid = 0

# Header section: one bullet per headline number.
summary_lines: List[str] = [
    "# RDLS Validation Summary",
    "",
    f"- Run timestamp: {datetime.now(timezone.utc).isoformat()}",
    f"- Records folder: `{RECORDS_DIR}`",
    f"- Total JSON files: **{len(record_files)}**",
    f"- Schema valid: **{n_schema_valid}**",
    f"- Schema invalid: **{n_schema_invalid}**",
    f"- Records missing required fields: **{len(df_missing)}**",
    f"- Duplicates detected: **{len(df_dups)}**",
    "",
]

if not df_missing.empty:
    summary_lines.append("## Top missing required fields")
    # "missing_fields" holds ";"-joined names; split into one row per name
    # and count how many records lack each field (top 20 only).
    missing_tokens = (
        df_missing["missing_fields"]
        .fillna("")
        .astype(str)
        .str.split(";")
        .explode()
    )
    field_counts = missing_tokens[missing_tokens != ""].value_counts().head(20)
    summary_lines.extend(f"- `{name}`: {int(count)}" for name, count in field_counts.items())
    summary_lines.append("")

if not df_dups.empty:
    summary_lines.append("## Duplicate signals")
    # One bullet per duplicate type, with the number of rows of that type.
    type_counts = df_dups["type"].value_counts()
    summary_lines.extend(f"- `{name}`: {int(count)}" for name, count in type_counts.items())
    summary_lines.append("")

OUT_MD = REPORTS_DIR / "rdls_validation_summary.md"
OUT_MD.write_text("\n".join(summary_lines) + "\n", encoding="utf-8")
print("Wrote:", OUT_MD)


Wrote: C:\Users\benny\OneDrive\Documents\Github\hdx-metadata-crawler\hdx_dataset_metadata_dump\rdls\reports\rdls_validation_summary.md


In [7]:
# ======================
# Package deliverable bundle (zip)
# ======================
import zipfile

# Bundle destination: a fixed file name inside DIST_DIR.
OUT_ZIP = DIST_DIR / "rdls_metadata_bundle.zip"

def add_folder_to_zip(z: zipfile.ZipFile, folder: Path, arc_prefix: str) -> None:
    """Add every regular file under ``folder`` (recursively) to the archive ``z``.

    Files are added in sorted path order. Each archive name is ``arc_prefix``
    joined with the file's path relative to ``folder``; directories themselves
    are not written as entries.
    """
    prefix = Path(arc_prefix)
    files = [entry for entry in folder.rglob("*") if entry.is_file()]
    files.sort()
    for entry in files:
        relative = entry.relative_to(folder)
        z.write(entry, arcname=str(prefix / relative))

# Write the bundle in a fixed order: records, then the index file (only when
# it exists), then reports.
# NOTE(review): the records folder is added recursively with every file type,
# whereas validation only looked at top-level *.json files - confirm that
# packaging unvalidated extras is intended.
with zipfile.ZipFile(OUT_ZIP, mode="w", compression=zipfile.ZIP_DEFLATED) as z:
    add_folder_to_zip(z, RECORDS_DIR, "records")
    if INDEX_JSONL.exists():
        z.write(INDEX_JSONL, arcname="index/rdls_index.jsonl")
    add_folder_to_zip(z, REPORTS_DIR, "reports")

print("Wrote:", OUT_ZIP)
# Size is reported in MiB (bytes / 1024**2), rounded to two decimals.
zip_size_bytes = OUT_ZIP.stat().st_size
print("Zip size (MB):", round(zip_size_bytes / (1024 * 1024), 2))


Wrote: C:\Users\benny\OneDrive\Documents\Github\hdx-metadata-crawler\hdx_dataset_metadata_dump\rdls\dist\rdls_metadata_bundle.zip
Zip size (MB): 0.14
