# Step 7 - Validate RDLS JSON and Package for Delivery

**Purpose:** Validate RDLS JSON records produced in Step 6, generate QA reports, and package the deliverable bundle.

**Process:**
1. Load and validate all RDLS JSON records against the schema
2. Detect missing required fields and duplicates
3. Generate human-readable validation summary
4. Package records, index, and reports into a distributable ZIP

**Author**: Benny Istanto/Risk Data Librarian/GFDRR  
**Version**: 2026.1

---

## Inputs
- `rdls/records/*.json` — RDLS records from Step 6
- `rdls/index/rdls_index.jsonl` — Index file from Step 6
- `rdls/schema/rdls_schema_v0.3.json` — RDLS JSON Schema

## Outputs
- `rdls/reports/rdls_validation_summary.md` — Human-readable summary
- `rdls/reports/schema_validation_full.csv` — Per-record schema validation results
- `rdls/reports/rdls_missing_fields.csv` — Records with missing required fields
- `rdls/reports/rdls_duplicates.csv` — Duplicate ID and content detection
- `rdls/dist/rdls_metadata_bundle.zip` — Deliverable package

## 1. Setup and Configuration

In [None]:
"""
Setup: Import libraries and configure paths.

Configuration Options:
    OUTPUT_MODE: 'in_place' or 'run_folder' to match Step 6
    RUN_ID: Specific run ID for run_folder mode (None for auto-detect)
"""
from __future__ import annotations

import csv
import hashlib
import json
import zipfile
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import pandas as pd

# --- tqdm with graceful fallback ---
# Progress bars are optional: if tqdm cannot be imported, a pass-through
# stand-in with the same call shape is defined so loops keep working.
try:
    from tqdm.auto import tqdm
except ImportError:
    TQDM_AVAILABLE = False

    def tqdm(iterable, **kwargs):
        """Stand-in for tqdm: hand the iterable back untouched, ignoring options."""
        return iterable
else:
    TQDM_AVAILABLE = True

print(f"tqdm available: {TQDM_AVAILABLE}")


@dataclass
class ValidationConfig:
    """
    Settings that drive RDLS validation and packaging.

    Attributes:
        dump_dir: Root directory of the HDX metadata dump. Defaults to
            ``../hdx_dataset_metadata_dump`` resolved to an absolute path.
        output_mode: Either 'in_place' or 'run_folder' (must match Step 6).
        run_id: Run ID to use in run_folder mode; None means auto-detect.
    """
    dump_dir: Path = field(
        default_factory=lambda: Path("..", "hdx_dataset_metadata_dump").resolve()
    )
    output_mode: str = "in_place"  # "in_place" | "run_folder"
    run_id: Optional[str] = None


# Initialize configuration (all defaults; pass arguments here to override them)
config = ValidationConfig()

# --- Resolve paths ---
# The RDLS tree is the "rdls" folder directly under the dump root.
DUMP_DIR = config.dump_dir
RDLS_DIR = DUMP_DIR.joinpath("rdls").resolve()


def resolve_rdls_run_dir(rdls_dir: Path, output_mode: str, run_id: Optional[str]) -> Path:
    """
    Locate the directory that holds the RDLS outputs to validate.

    In 'in_place' mode this is the resolved ``rdls_dir`` itself. In
    'run_folder' mode a folder under ``rdls_dir/runs`` is selected, trying in
    order: the explicit ``run_id``, the name stored in ``runs/_latest.txt``,
    and finally the most recently modified subfolder whose name does not
    start with an underscore.

    Parameters:
        rdls_dir: Base RDLS directory
        output_mode: 'in_place' or 'run_folder'
        run_id: Specific run ID (optional)

    Returns:
        Resolved path to run directory

    Raises:
        ValueError: If ``output_mode`` is neither 'in_place' nor 'run_folder'.
        FileNotFoundError: If the requested run folder, the runs folder, or
            any eligible run folder cannot be found.
    """
    base = rdls_dir.resolve()

    if output_mode not in ("in_place", "run_folder"):
        raise ValueError(f"Unknown OUTPUT_MODE: {output_mode}")
    if output_mode == "in_place":
        return base

    runs_root = base / "runs"

    # 1) An explicit run ID takes priority and must exist.
    if run_id:
        explicit = (runs_root / run_id).resolve()
        if explicit.exists():
            return explicit
        raise FileNotFoundError(f"RUN_ID folder not found: {explicit}")

    # 2) Follow the _latest.txt pointer; a blank or stale pointer is ignored.
    pointer = runs_root / "_latest.txt"
    if pointer.exists():
        pointed_name = pointer.read_text(encoding="utf-8").strip()
        pointed_dir = (runs_root / pointed_name).resolve() if pointed_name else None
        if pointed_dir is not None and pointed_dir.exists():
            return pointed_dir

    # 3) Fall back to the most recently modified run folder.
    if not runs_root.exists():
        raise FileNotFoundError(f"Runs folder not found: {runs_root}")

    run_dirs = [
        entry
        for entry in runs_root.iterdir()
        if entry.is_dir() and not entry.name.startswith("_")
    ]
    if not run_dirs:
        raise FileNotFoundError(
            f"No run folders found under: {runs_root}. "
            "Either set OUTPUT_MODE='in_place' or provide RUN_ID."
        )
    newest = max(run_dirs, key=lambda entry: entry.stat().st_mtime)
    return newest.resolve()


# Resolve the run directory according to the configured output mode.
RDLS_RUN_DIR = resolve_rdls_run_dir(RDLS_DIR, config.output_mode, config.run_id)

# Input paths
RECORDS_DIR = RDLS_RUN_DIR.joinpath("records")
INDEX_JSONL = RDLS_RUN_DIR.joinpath("index", "rdls_index.jsonl")
SCHEMA_JSON = RDLS_DIR.joinpath("schema", "rdls_schema_v0.3.json")  # Schema lives at root

# Output paths
REPORTS_DIR = RDLS_RUN_DIR.joinpath("reports")
DIST_DIR = RDLS_RUN_DIR.joinpath("dist")

# Ensure output directories exist (safe to re-run: existing folders are kept)
REPORTS_DIR.mkdir(parents=True, exist_ok=True)
DIST_DIR.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations for a visual check.
print(f"Configuration:")
print(f"  DUMP_DIR: {DUMP_DIR}")
print(f"  RDLS_DIR: {RDLS_DIR}")
print(f"  OUTPUT_MODE: {config.output_mode}")
print(f"  RDLS_RUN_DIR: {RDLS_RUN_DIR}")
print(f"  RECORDS_DIR: {RECORDS_DIR}")
print(f"  INDEX_JSONL: {INDEX_JSONL}")
print(f"  SCHEMA_JSON: {SCHEMA_JSON}")
print(f"  DIST_DIR: {DIST_DIR}")

## 2. Load RDLS Schema

In [None]:
"""
Load RDLS schema and initialize JSON Schema validator.
"""

def safe_load_json(path: Path) -> Dict[str, Any]:
    """
    Read a UTF-8 encoded JSON file and return the parsed content.

    Parameters:
        path: Path to the JSON file

    Returns:
        The value produced by the JSON parser
    """
    with path.open("r", encoding="utf-8") as handle:
        return json.load(handle)


# Load schema (stop immediately with a clear error when the file is absent)
if SCHEMA_JSON.exists():
    rdls_schema = safe_load_json(SCHEMA_JSON)
else:
    raise FileNotFoundError(f"RDLS schema not found: {SCHEMA_JSON}")

# Initialize JSON Schema validator
def try_import_jsonschema():
    """
    Import the optional jsonschema library.

    Returns:
        The jsonschema module, or None when it cannot be imported
    """
    try:
        import jsonschema as module
    except ImportError:
        module = None
    return module

_jsonschema = try_import_jsonschema()
validator = None  # stays None whenever schema validation cannot be set up

if _jsonschema is None:
    print("WARNING: jsonschema not installed; validation will be skipped")
else:
    try:
        validator = _jsonschema.Draft202012Validator(rdls_schema)
    except Exception as exc:
        # Best-effort: a failed setup is reported and validation is skipped.
        print(f"WARNING: jsonschema init failed: {exc}")
    else:
        print("jsonschema validation enabled (Draft2020-12)")

# Required fields from schema (top-level "required" list; empty if absent)
REQUIRED_FIELDS = rdls_schema.get("required", [])
print(f"Required fields (schema): {REQUIRED_FIELDS}")

## 3. Validation Functions

In [None]:
"""
Validation helper functions.
"""

def validate_dataset_obj(dataset_obj: Dict[str, Any]) -> Tuple[bool, str]:
    """
    Check one RDLS dataset object against the loaded schema.

    Parameters:
        dataset_obj: The RDLS dataset object (not the wrapper)

    Returns:
        Tuple of (is_valid, error_message). The message is empty for a valid
        object; otherwise it joins up to ten "path: message" entries with " | ".
    """
    # With no validator available, every object is reported as valid.
    if validator is None:
        return True, ""

    found = list(validator.iter_errors(dataset_obj))
    if not found:
        return True, ""

    # Order errors by their location in the document for stable output.
    found.sort(key=lambda err: err.path)

    # Only the first ten errors are reported to keep the message short.
    parts = []
    for err in found[:10]:
        location = ".".join(map(str, err.path))
        parts.append(f"{location}: {err.message}")
    return False, " | ".join(parts)


def sha256_file(path: Path) -> str:
    """
    Hash a file's bytes with SHA-256, reading it in 1 MiB pieces.

    Parameters:
        path: Path to file

    Returns:
        Hex digest of SHA-256 hash
    """
    digest = hashlib.sha256()
    block_size = 1024 * 1024
    with path.open("rb") as stream:
        while True:
            block = stream.read(block_size)
            if not block:  # empty read marks end of file
                break
            digest.update(block)
    return digest.hexdigest()


def iter_record_files(folder: Path) -> List[Path]:
    """
    List the ``*.json`` files directly inside a folder, in sorted order.

    Parameters:
        folder: Directory containing records

    Returns:
        Sorted list of JSON file paths

    Raises:
        FileNotFoundError: If ``folder`` does not exist
    """
    if folder.exists():
        matches = list(folder.glob("*.json"))
        matches.sort()
        return matches
    raise FileNotFoundError(f"Records folder not found: {folder}")


# Cell feedback: confirms that the helper definitions in this cell were executed.
print("Validation functions loaded.")

## 4. Validate All Records

In [None]:
"""
Validate all RDLS records and detect duplicates.

Checks:
    - JSON Schema validation
    - Missing required fields
    - Duplicate IDs
    - Duplicate content (hash-based)
"""

# Get record files
record_files = iter_record_files(RECORDS_DIR)
print(f"Record files found: {len(record_files):,}")

# Tracking structures: one dict per finding, later turned into report tables
rows_validation: List[Dict[str, Any]] = []
rows_missing: List[Dict[str, Any]] = []
rows_duplicates: List[Dict[str, Any]] = []

seen_ids: Dict[str, str] = {}      # rdls_id -> first filename that used it
seen_hash: Dict[str, str] = {}     # sha256 -> first filename with that content

# Counters
valid_ok = 0
invalid = 0

print(f"\nValidating records...")

for fp in tqdm(record_files, desc="Validating"):
    # Try to load JSON. This is the per-file boundary of a batch run, so any
    # failure (I/O, encoding, syntax) is recorded against the file and the
    # loop moves on instead of aborting the whole validation.
    try:
        rec = safe_load_json(fp)
    except Exception as e:
        invalid += 1
        rows_validation.append({
            "filename": fp.name,
            "rdls_id": "",
            "valid": False,
            "message": f"json_parse_error: {e}",
        })
        continue

    # Extract dataset object (support wrapper or raw).
    # For a wrapper, only the first entry of "datasets" is checked.
    if isinstance(rec, dict) and "datasets" in rec and isinstance(rec["datasets"], list) and rec["datasets"]:
        ds = rec["datasets"][0]
    else:
        ds = rec

    # A dataset must be a JSON object. Arrays or scalars would raise
    # AttributeError on the field lookups below and stop the run, so they are
    # reported as invalid and skipped.
    if not isinstance(ds, dict):
        invalid += 1
        rows_validation.append({
            "filename": fp.name,
            "rdls_id": "",
            "valid": False,
            "message": f"not_a_json_object: got {type(ds).__name__}",
        })
        continue

    # A null id is treated as absent; str(None) would otherwise yield the
    # literal "None" and trigger false duplicate-id reports.
    raw_id = ds.get("id")
    rdls_id = "" if raw_id is None else str(raw_id).strip()

    # Validate against schema
    ok, msg = validate_dataset_obj(ds)
    rows_validation.append({
        "filename": fp.name,
        "rdls_id": rdls_id,
        "valid": ok,
        "message": msg,
    })

    if ok:
        valid_ok += 1
    else:
        invalid += 1

    # Check missing required fields: absent/null, blank strings, and empty
    # lists/dicts all count as missing.
    missing_fields = []
    for k in REQUIRED_FIELDS:
        v = ds.get(k)
        if v is None:
            missing_fields.append(k)
        elif isinstance(v, str) and not v.strip():
            missing_fields.append(k)
        elif isinstance(v, (list, dict)) and not v:
            missing_fields.append(k)

    if missing_fields:
        rows_missing.append({
            "filename": fp.name,
            "rdls_id": rdls_id,
            "missing_fields": ";".join(missing_fields),
        })

    # Check for duplicate IDs (each later file is paired with the first one seen)
    if rdls_id:
        if rdls_id in seen_ids:
            rows_duplicates.append({
                "type": "duplicate_id",
                "rdls_id": rdls_id,
                "filename_a": seen_ids[rdls_id],
                "filename_b": fp.name,
            })
        else:
            seen_ids[rdls_id] = fp.name

    # Check for duplicate content (hash of the raw file bytes)
    file_hash = sha256_file(fp)
    if file_hash in seen_hash:
        rows_duplicates.append({
            "type": "duplicate_content_hash",
            "rdls_id": rdls_id,
            "filename_a": seen_hash[file_hash],
            "filename_b": fp.name,
        })
    else:
        seen_hash[file_hash] = fp.name

# Summary
print(f"\n" + "="*50)
print(f"VALIDATION COMPLETE")
print(f"="*50)
print(f"Valid: {valid_ok:,}")
print(f"Invalid: {invalid:,}")
print(f"Records with missing required fields: {len(rows_missing):,}")
print(f"Duplicates detected: {len(rows_duplicates):,}")

## 5. Save Validation Reports

In [None]:
"""
Save validation results to CSV reports.
"""

# Create DataFrames.
# Column names are passed explicitly so that a table with no rows still has
# its columns: the CSV is then written with a header line, and column lookups
# such as df_val["valid"] keep working when nothing was found.
df_val = pd.DataFrame(rows_validation, columns=["filename", "rdls_id", "valid", "message"])
df_missing = pd.DataFrame(rows_missing, columns=["filename", "rdls_id", "missing_fields"])
df_dups = pd.DataFrame(rows_duplicates, columns=["type", "rdls_id", "filename_a", "filename_b"])

# Output paths
OUT_VALIDATION = REPORTS_DIR / "schema_validation_full.csv"
OUT_MISSING = REPORTS_DIR / "rdls_missing_fields.csv"
OUT_DUPS = REPORTS_DIR / "rdls_duplicates.csv"

# Save reports (row index omitted; it carries no information here)
df_val.to_csv(OUT_VALIDATION, index=False)
print(f"Wrote: {OUT_VALIDATION}")

df_missing.to_csv(OUT_MISSING, index=False)
print(f"Wrote: {OUT_MISSING}")

df_dups.to_csv(OUT_DUPS, index=False)
print(f"Wrote: {OUT_DUPS}")

## 6. Generate Human-Readable Summary

In [None]:
"""
Generate Markdown summary of validation results.
"""

# Count schema results defensively: when no record files were found the
# validation table can lack a "valid" column, and indexing it would raise
# KeyError before any summary is written.
if "valid" in df_val.columns:
    valid_mask = df_val["valid"].astype(bool)
    schema_valid_count = int(valid_mask.sum())
    schema_invalid_count = int((~valid_mask).sum())
else:
    schema_valid_count = 0
    schema_invalid_count = 0

# Header section of the Markdown report
summary_lines: List[str] = [
    "# RDLS Validation Summary",
    "",
    f"- **Run timestamp:** {datetime.now(timezone.utc).isoformat()}",
    f"- **Records folder:** `{RECORDS_DIR}`",
    f"- **Total JSON files:** **{len(record_files):,}**",
    f"- **Schema valid:** **{schema_valid_count:,}**",
    f"- **Schema invalid:** **{schema_invalid_count:,}**",
    f"- **Records missing required fields:** **{len(df_missing):,}**",
    f"- **Duplicates detected:** **{len(df_dups):,}**",
    "",
]

# Missing fields breakdown: split the ";"-joined names into one row per field
# and list the 20 most frequent.
if not df_missing.empty:
    summary_lines.append("## Top Missing Required Fields")
    summary_lines.append("")
    tmp = df_missing.copy()
    tmp["missing_fields"] = tmp["missing_fields"].fillna("").astype(str)
    exploded = tmp["missing_fields"].str.split(";").explode()
    vc = exploded[exploded != ""].value_counts().head(20)
    for k, v in vc.items():
        summary_lines.append(f"- `{k}`: {int(v):,}")
    summary_lines.append("")

# Duplicates breakdown: count per duplicate type
if not df_dups.empty:
    summary_lines.append("## Duplicate Signals")
    summary_lines.append("")
    vc = df_dups["type"].value_counts()
    for k, v in vc.items():
        summary_lines.append(f"- `{k}`: {int(v):,}")
    summary_lines.append("")

# Write summary
OUT_MD = REPORTS_DIR / "rdls_validation_summary.md"
OUT_MD.write_text("\n".join(summary_lines) + "\n", encoding="utf-8")
print(f"Wrote: {OUT_MD}")

# Display summary: first 15 lines only, with Markdown emphasis/backticks removed
print("\n" + "="*50)
print("VALIDATION SUMMARY")
print("="*50)
for line in summary_lines[:15]:
    if line.strip():
        print(line.replace("**", "").replace("`", ""))

## 7. Package Deliverable Bundle

In [None]:
"""
Create distributable ZIP bundle containing:
    - records/*.json
    - index/rdls_index.jsonl
    - reports/*.csv, *.md
"""

# Path of the deliverable ZIP archive inside the dist folder.
OUT_ZIP = DIST_DIR / "rdls_metadata_bundle.zip"


def add_folder_to_zip(z: zipfile.ZipFile, folder: Path, arc_prefix: str) -> int:
    """
    Recursively copy every file under a folder into an open ZIP archive.

    Files are added in sorted path order; each archive name is ``arc_prefix``
    joined with the file's path relative to ``folder``. Directories themselves
    are not added.

    Parameters:
        z: ZipFile object
        folder: Source folder
        arc_prefix: Prefix for archive paths

    Returns:
        Number of files added
    """
    prefix = Path(arc_prefix)
    files = [entry for entry in sorted(folder.rglob("*")) if entry.is_file()]
    for entry in files:
        z.write(entry, arcname=str(prefix / entry.relative_to(folder)))
    return len(files)


print("Creating deliverable bundle...")

# Opening in "w" mode replaces any bundle left over from a previous run.
with zipfile.ZipFile(OUT_ZIP, "w", compression=zipfile.ZIP_DEFLATED) as z:
    # Add records
    records_count = add_folder_to_zip(z, RECORDS_DIR, "records")
    print(f"  Added {records_count:,} record files")

    # Add index. The index is an expected part of the bundle, so its absence
    # is reported instead of being silently left out.
    if INDEX_JSONL.exists():
        z.write(INDEX_JSONL, arcname="index/rdls_index.jsonl")
        print(f"  Added index file")
    else:
        print(f"  WARNING: index file not found, bundle has no index: {INDEX_JSONL}")

    # Add reports
    reports_count = add_folder_to_zip(z, REPORTS_DIR, "reports")
    print(f"  Added {reports_count:,} report files")

# Report size (bytes converted to MiB)
zip_size_mb = OUT_ZIP.stat().st_size / (1024 * 1024)

print(f"\n" + "="*50)
print(f"PACKAGING COMPLETE")
print(f"="*50)
print(f"Wrote: {OUT_ZIP}")
print(f"Bundle size: {zip_size_mb:.2f} MB")

## 8. Final QA Summary

In [None]:
"""
Final QA summary and pipeline completion status.
"""

print("\n" + "="*60)
print("HDX-RDLS PIPELINE COMPLETE")
print("="*60)

# Recap of the counters collected during validation
print(f"\nValidation Results:")
print(f"  - Total records: {len(record_files):,}")
print(f"  - Schema valid: {valid_ok:,}")
print(f"  - Schema invalid: {invalid:,}")
print(f"  - Missing required fields: {len(rows_missing):,}")
print(f"  - Duplicates: {len(rows_duplicates):,}")

# Locations of everything written by the earlier cells
print(f"\nOutput Files:")
print(f"  - Validation report: {OUT_VALIDATION}")
print(f"  - Missing fields report: {OUT_MISSING}")
print(f"  - Duplicates report: {OUT_DUPS}")
print(f"  - Summary (Markdown): {OUT_MD}")
print(f"  - Deliverable bundle: {OUT_ZIP}")

# Quality gate.
# validate_dataset_obj reports every record as valid when no validator exists,
# so a zero `invalid` count only proves schema validity if checks actually ran.
schema_checked = validator is not None
if schema_checked and invalid == 0 and not rows_missing:
    print(f"\n{'='*60}")
    print("QUALITY GATE: PASSED")
    print(f"All {valid_ok:,} records are schema-valid with no missing required fields.")
    print(f"{'='*60}")
else:
    print(f"\n{'='*60}")
    print("QUALITY GATE: REVIEW NEEDED")
    if not schema_checked:
        print("  - schema validation was skipped (jsonschema validator unavailable)")
    if invalid > 0:
        print(f"  - {invalid:,} records failed schema validation")
    if rows_missing:
        print(f"  - {len(rows_missing):,} records have missing required fields")
    print("Review the reports above for details.")
    print(f"{'='*60}")

print(f"\nPipeline execution complete.")