In [6]:
from pathlib import Path
import json, pandas as pd, numpy as np

# ---------- Config ----------
# AOI is the filename prefix: every expected table is named "<AOI>_<suffix>".
AOI = "huambo"  # change if needed
# Primary project outputs path
B1 = Path("/mnt/c/Users/benny/OneDrive/Documents/Github/ago-lobitocorridor-analysis/outputs/tables")
# Fallback for ad-hoc uploads provided in this session
B2 = Path("/mnt/d/temp/wbg/iso3/ago/lobitocorridor/outputs/tables")

# Directories are probed in list order, so B1 takes precedence over B2.
SEARCH_DIRS = [B1, B2]

def resolve(fname: str) -> Path:
    """Locate *fname* in the configured search directories.

    The directories in SEARCH_DIRS are tried in order and the first path that
    exists is returned. If none of them holds the file, the path under B1 is
    returned anyway so that later messages still show a concrete location.
    """
    hit = next(
        (folder / fname for folder in SEARCH_DIRS if (folder / fname).exists()),
        None,
    )
    return hit if hit is not None else B1 / fname

def ok(x):
    """Translate a truthy check result into "PASS" and a falsy one into "FAIL"."""
    if x:
        return "PASS"
    return "FAIL"
def exists_nonempty(p: Path) -> bool:
    """Report whether *p* exists and is larger than zero bytes.

    Any exception raised while probing the path is treated as "not usable"
    and results in False.
    """
    try:
        if not p.exists():
            return False
        return p.stat().st_size > 0
    except Exception:
        return False

# ---------- Known / expected filenames ----------
# Logical key -> filename suffix. Each suffix is appended to "<AOI>_" and then
# resolved against the search directories; insertion order is report order.
_suffix_by_key = {
    # Step 00–10 outputs
    "iso": "kpis_isochrones.csv",
    "risk": "roads_flood_risk_summary.csv",
    "muni": "municipality_indicators.csv",
    "corr": "corr_with_rural_poverty.csv",
    "prof": "municipality_profiles.csv",
    "rank": "priority_muni_rank.csv",
    "scn_meta": "priority_scenarios.meta.json",
    "scn_sum": "priority_scenarios_summary.csv",
    "site": "site_audit_points.csv",
    "proj": "project_kpis.csv",
    "lookup": "admin2_lookup.csv",
    "admin2_rank": "priority_admin2_rank.csv",
    "clusters": "priority_clusters.csv",
    "catch_kpi": "catchments_kpis.csv",
    "clust_syn": "cluster_synergies.csv",
    "site_syn": "site_synergies.csv",
    "od_grav": "od_gravity.csv",
    "od_zone": "od_zone_attrs.csv",
    "od_agents": "od_agents.csv",
}
paths = {key: resolve(f"{AOI}_{suffix}") for key, suffix in _suffix_by_key.items()}

# One line per expected file: its key, whether it was found, and where we looked.
print("=== File presence check (project outputs + /mnt/data fallback) ===")
for k, p in paths.items():
    status = "FOUND" if p.exists() else "MISSING"
    print(f"{k:12s} ->", status, f"| {p}")
print()

# ------------------------------
# 1) Isochrones (Step 05/08)
# ------------------------------
try:
    p = paths["iso"]
    if not exists_nonempty(p):
        print("[Isochrones] MISSING or EMPTY")
    else:
        iso = pd.read_csv(p)
        exp = {"aoi", "travel_cut_min", "pop_within", "cells_within", "area_km2_within"}
        schema_ok = exp.issubset(iso.columns)
        print("[Isochrones]", ok(schema_ok), f"shape={iso.shape}")
        # Ordered by travel-time cutoff, population and cell counts must never
        # drop from one row to the next (a 1e-6 tolerance absorbs float noise).
        iso_s = iso.sort_values("travel_cut_min")
        monotone = [
            (iso_s[col].diff().fillna(0) >= -1e-6).all()
            for col in ("pop_within", "cells_within")
        ]
        print("  monotonic pop/cells:", ok(all(monotone)))
        print(iso.head(3).to_string(index=False))
except Exception as e:
    print("[Isochrones] ERROR:", e)

# ------------------------------
# 2) Roads × Flood
# ------------------------------
try:
    p = paths["risk"]
    if exists_nonempty(p):
        risk = pd.read_csv(p)
        if not risk.empty:
            # Only the first row of the summary table is inspected.
            r = risk.iloc[0]

            def fnum(k):
                """Return field *k* of the first row as a float; NaN if absent or unparseable."""
                try:
                    return float(r.get(k, np.nan))
                except (TypeError, ValueError):
                    # Narrow on purpose: a bare ``except`` would also swallow
                    # KeyboardInterrupt / SystemExit.
                    return np.nan

            # NaN fails both comparisons, so a missing or unparseable
            # percentage is reported as FAIL rather than silently passing.
            ok_range = all(
                0 <= fnum(k) <= 100
                for k in ("risk_pct_of_roads", "near_prio_pct_of_risk")
            )
            print("[Roads×Flood] ranges:", ok(ok_range))
            for k in ["total_road_cells", "total_risk_cells", "risk_near_priority_cells"]:
                print(f"  {k}:", r.get(k, "<missing>"))
            print("  method:", r.get("notes", ""))
            print("  fraction_min:", r.get("flood_exceed_fraction_min", np.nan))
        else:
            print("[Roads×Flood] EMPTY")
    else:
        print("[Roads×Flood] MISSING or EMPTY")
except Exception as e:
    print("[Roads×Flood] ERROR:", e)

# ------------------------------
# 3) Municipality indicators
# ------------------------------
# Stays None when the indicators table cannot be loaded; the correlation check
# further down reads it to bound the sample size.
muni = None
try:
    p = paths["muni"]
    if not exists_nonempty(p):
        print("[Muni indicators] MISSING or EMPTY")
    else:
        muni = pd.read_csv(p)
        # Use whichever identifier columns are present to test row uniqueness.
        idc = [c for c in ("ADM2CD_c", "NAM_1", "NAM_2") if c in muni.columns]
        n_unique = None
        if idc:
            n_unique = len(muni[idc].drop_duplicates())
        print("[Muni indicators]", ok(n_unique == len(muni)),
              f"rows={len(muni)} unique_adm2={n_unique}")
        # Average, over numeric columns, of each column's share of missing values.
        num = muni.select_dtypes(include="number")
        if num.empty:
            na_mean = np.nan
        else:
            na_mean = float(num.isna().mean().mean())
        print("  mean numeric NA share:", f"{na_mean:.2%}")
        print("  sample cols:", ", ".join(muni.columns[:10]), "...")
except Exception as e:
    print("[Muni indicators] ERROR:", e)

# ------------------------------
# 4) Correlations
# ------------------------------
try:
    p = paths["corr"]
    if not exists_nonempty(p):
        print("[Muni correlations] MISSING or EMPTY")
    else:
        corr = pd.read_csv(p)
        required = {"theme", "var", "r", "p", "n"}
        if corr.empty or not required.issubset(corr.columns):
            print("[Muni correlations] present but missing cols or EMPTY")
        else:
            # The largest sample size must not exceed the number of rows in
            # the indicators table (check is waived when that table is absent).
            n_max = int(pd.to_numeric(corr["n"], errors="coerce").max())
            n_rows = None if muni is None else len(muni)
            within_bound = n_rows is None or n_max <= n_rows
            print("[Muni correlations] n<=#ADM2:", ok(within_bound),
                  f"n_max={n_max} #ADM2={n_rows}")
            # Show the five strongest correlations by absolute r.
            abs_r = pd.to_numeric(corr["r"], errors="coerce").abs()
            top = corr.assign(absr=abs_r).sort_values("absr", ascending=False).head(5)
            print(top[["theme", "var", "r", "p", "n"]].to_string(index=False))
except Exception as e:
    print("[Muni correlations] ERROR:", e)

# ------------------------------
# 5) Profiles
# ------------------------------
try:
    p = paths["prof"]
    if not exists_nonempty(p):
        print("[Profiles] MISSING or EMPTY")
    else:
        prof = pd.read_csv(p)
        has_q = "poverty_quintile" in prof.columns
        print("[Profiles] quintile present:", ok(has_q), f"rows={len(prof)}")
        if has_q:
            # Missing quintiles are counted too (dropna=False).
            quintile_counts = prof["poverty_quintile"].value_counts(dropna=False)
            print("  quintile counts:\n", quintile_counts)
except Exception as e:
    print("[Profiles] ERROR:", e)

# ------------------------------
# 6) Priority rank (legacy)
# ------------------------------
try:
    p = paths["rank"]
    if not exists_nonempty(p):
        print("[Priority rank] MISSING or EMPTY")
    else:
        rank = pd.read_csv(p)
        need = {"ADM2CD_c", "NAM_1", "NAM_2", "score", "rank", "selected"}
        print("[Priority rank] columns:", ok(need.issubset(rank.columns)), f"shape={rank.shape}")
        if "rank" in rank.columns and rank["rank"].notna().any():
            # Non-null ranks, cast to int, must form an unbroken run min..max.
            rseq = rank["rank"].dropna().astype(int).sort_values().tolist()
            contig = rseq == list(range(rseq[0], rseq[-1] + 1))
            print("  contiguous ranks:", ok(contig))
except Exception as e:
    print("[Priority rank] ERROR:", e)

# ------------------------------
# 7) Scenarios meta/summary
# ------------------------------
try:
    # Scenario ids declared in the meta JSON; stays empty when the file is
    # absent or is not a list of objects, which disables the id comparison.
    scn_ids_meta = []
    pm = paths["scn_meta"]
    if exists_nonempty(pm):
        # Decode explicitly as UTF-8 (the JSON interchange encoding) instead of
        # depending on the platform's locale default.
        meta = json.loads(pm.read_text(encoding="utf-8"))
        if isinstance(meta, list):
            scn_ids_meta = [d.get("id") for d in meta if isinstance(d, dict)]
    ps = paths["scn_sum"]
    if exists_nonempty(ps):
        scn = pd.read_csv(ps)
        scn_ids_sum = sorted(scn["scenario_id"].unique()) if "scenario_id" in scn else []
        # PASS when there is no meta to compare with, or when both id sets agree.
        scn_ok = not scn_ids_meta or (set(scn_ids_meta) == set(scn_ids_sum))
        print("[Scenarios] ids match meta:", ok(scn_ok), f"count={len(scn_ids_sum)}")
        # min / mean / max for whichever of the summary metrics are present.
        for k in ["overlap_pct_vs_baseline", "jaccard_vs_baseline", "selected_cells", "selected_km2"]:
            if k in scn:
                print(f"  {k}: min={scn[k].min():.2f} mean={scn[k].mean():.2f} max={scn[k].max():.2f}")
    else:
        print("[Scenarios] MISSING or EMPTY")
except Exception as e:
    print("[Scenarios] ERROR:", e)

# ------------------------------
# 8) Site audit points
# ------------------------------
try:
    p = paths["site"]
    if not exists_nonempty(p):
        print("[Site audit points] MISSING or EMPTY")
    else:
        site = pd.read_csv(p)
        # Case-insensitive search for one X-like and one Y-like column name.
        cols = {c.lower() for c in site.columns}
        has_x = not cols.isdisjoint(("x", "lon", "longitude"))
        has_y = not cols.isdisjoint(("y", "lat", "latitude"))
        has_xy = has_x and has_y
        print("[Site audit points] has XY:", ok(has_xy), f"shape={site.shape}")
except Exception as e:
    print("[Site audit points] ERROR:", e)

# ------------------------------
# 9) Project KPIs
# ------------------------------
try:
    p = paths["proj"]
    if not exists_nonempty(p):
        print("[Project KPIs] MISSING or EMPTY")
    else:
        # Presence-only check: load the table and report its dimensions.
        proj = pd.read_csv(p)
        print("[Project KPIs] shape:", proj.shape)
except Exception as e:
    print("[Project KPIs] ERROR:", e)

# ------------------------------
# 10) Admin2 Lookup
# ------------------------------
# Stays None when the lookup table cannot be loaded.
lookup = None
try:
    p = paths["lookup"]
    if exists_nonempty(p):
        lookup = pd.read_csv(p)
        need_cols = {"lab", "ADM2CD_c", "NAM_1", "NAM_2"}
        has_cols = need_cols.issubset(lookup.columns)
        print("[Admin2 Lookup] columns:", ok(has_cols), f"shape={lookup.shape}")
        if has_cols:
            # has_cols guarantees every column used below is present.
            is_unique = lookup["ADM2CD_c"].is_unique
            print("  unique ADM2CD_c:", ok(is_unique))

            # Labels are expected to be exactly 1..N, with no gaps or repeats.
            expected_labs = list(range(1, len(lookup) + 1))
            actual_labs = sorted(lookup["lab"].tolist())
            labs_sequential = (actual_labs == expected_labs)
            print("  sequential lab:", ok(labs_sequential))

            # Stringify before joining: a missing province is read as NaN (a
            # float) and would otherwise raise TypeError, aborting this check.
            provinces = lookup["NAM_1"].unique()
            print(f"  provinces: {', '.join(map(str, provinces))}")

            n_municipalities = lookup["NAM_2"].nunique()
            print(f"  municipalities: {n_municipalities}")
            print(lookup.head(3).to_string(index=False))
    else:
        print("[Admin2 Lookup] MISSING or EMPTY")
except Exception as e:
    print("[Admin2 Lookup] ERROR:", e)

# ------------------------------
# 11) Priority Admin2 Rank
# ------------------------------
# Stays None when the admin2 rank table cannot be loaded; the cross-file
# validation at the end of the script tests for that.
admin2_rank = None
try:
    p = paths["admin2_rank"]
    if not exists_nonempty(p):
        print("[Priority Admin2 Rank] MISSING or EMPTY")
    else:
        admin2_rank = pd.read_csv(p)
        need_cols = {"ADM2CD_c", "NAM_1", "NAM_2", "score", "rank", "selected", "share_selected"}
        has_cols = need_cols.issubset(admin2_rank.columns)
        print("[Priority Admin2 Rank] columns:", ok(has_cols), f"shape={admin2_rank.shape}")
        if has_cols:
            # Every column referenced below is in need_cols, so no further
            # per-column presence tests are needed.
            if admin2_rank["rank"].notna().any():
                rseq = sorted(admin2_rank["rank"].dropna().astype(int))
                lowest, highest = min(rseq), max(rseq)
                contig = rseq == list(range(lowest, highest + 1))
                print("  contiguous ranks:", ok(contig))
                print(f"  rank range: {lowest} to {highest}")

            score_min = admin2_rank["score"].min()
            score_max = admin2_rank["score"].max()
            score_range_ok = all(0 <= bound <= 1 for bound in (score_min, score_max))
            print("  score range [0-1]:", ok(score_range_ok), f"[{score_min:.4f}, {score_max:.4f}]")

            n_selected = admin2_rank["selected"].sum()
            n_total = len(admin2_rank)
            pct_selected = (n_selected / n_total) * 100
            print(f"  selected municipalities: {n_selected}/{n_total} ({pct_selected:.1f}%)")

            share_min = admin2_rank["share_selected"].min()
            share_max = admin2_rank["share_selected"].max()
            share_range_ok = all(0 <= bound <= 1 for bound in (share_min, share_max))
            print("  share_selected range [0-1]:", ok(share_range_ok), f"[{share_min:.4f}, {share_max:.4f}]")

            top5 = admin2_rank.sort_values("rank").head(5)
            print("  Top 5 priority municipalities:")
            print(top5[["NAM_2", "score", "rank", "selected"]].to_string(index=False))
except Exception as e:
    print("[Priority Admin2 Rank] ERROR:", e)

# ------------------------------
# 12) Priority clusters (schema-agnostic summary)
# ------------------------------
try:
    p = paths["clusters"]
    if not exists_nonempty(p):
        print("[Priority clusters] MISSING or EMPTY")
    else:
        cl = pd.read_csv(p)
        print("[Priority clusters] present:", ok(True), f"shape={cl.shape}")
        # No fixed schema is enforced: preview the well-known columns when any
        # exist, otherwise just list the first dozen column names.
        known = ("cluster_id", "cells", "km2", "score_mean", "selected")
        common = [c for c in known if c in cl.columns]
        if not common:
            print("  columns:", ", ".join(cl.columns[:12]), "...")
        else:
            print("  sample cols:", ", ".join(common))
            print(cl[common].head(5).to_string(index=False))
except Exception as e:
    print("[Priority clusters] ERROR:", e)

# ------------------------------
# 13) Catchments KPIs
# ------------------------------
try:
    if not exists_nonempty(paths["catch_kpi"]):
        print("[Catchments KPIs] MISSING or EMPTY")
    else:
        ck = pd.read_csv(paths["catch_kpi"])
        need = {"site_index", "thresh_min"}
        print("[Catchments KPIs] columns:", ok(need.issubset(ck.columns)), f"shape={ck.shape}")

        # Coerce to numeric so sorting and differencing work even if the
        # writer emitted these columns as text.
        ck["thresh_min"] = pd.to_numeric(ck["thresh_min"], errors="coerce")
        if "area_km2" not in ck.columns:
            print("  area_km2 missing → skip monotonicity")
        else:
            ck["area_km2"] = pd.to_numeric(ck["area_km2"], errors="coerce")
            ck_sorted = ck.sort_values(["site_index", "thresh_min"])

            def _never_shrinks(areas):
                """True when the series has no step down beyond the 1e-6 tolerance."""
                return (areas.diff().fillna(0) >= -1e-6).all()

            # One boolean per site: does area grow (or hold) as the threshold rises?
            mono_series = (
                ck_sorted
                .groupby("site_index", group_keys=False)["area_km2"]
                .apply(_never_shrinks)
            )
            if len(mono_series):
                mono_pct = 100.0 * float(mono_series.mean())
            else:
                mono_pct = float("nan")
            print(f"  monotone area by site: {ok(bool(mono_series.all()))} | {mono_pct:.0f}% sites OK")
except Exception as e:
    print("[Catchments KPIs] ERROR:", e)


# ------------------------------
# 14) Synergies (clusters & sites)
# ------------------------------
def _summarize_synergy(name, pth):
    """Print a short, schema-tolerant summary of one synergy table.

    Args:
        name: Label placed in square brackets at the start of each message.
        pth: Path of the CSV file to summarise.

    Nothing is returned. Any exception is caught and reported as an ERROR line.
    """
    try:
        if not exists_nonempty(pth):
            print(f"[{name}] MISSING or EMPTY")
            return

        df = pd.read_csv(pth)
        print(f"[{name}] present:", ok(True), f"shape={df.shape}")

        # Lower-cased name -> actual column name; the first occurrence wins.
        by_lower = {}
        for col in df.columns:
            by_lower.setdefault(col.lower(), col)

        if {"site_index", "cluster_id"}.issubset(by_lower):
            # Number of distinct sites per cluster, then count / mean / max of that.
            si = by_lower["site_index"]
            ci = by_lower["cluster_id"]
            g = df.groupby(ci)[si].nunique().describe()[["count", "mean", "max"]]
            print(f"  sites per cluster (count/mean/max): {g.to_dict()}")

        if {"km2", "beneficiaries"}.issubset(by_lower):
            k2 = by_lower["km2"]
            ben = by_lower["beneficiaries"]
            print(f"  totals → km2={df[k2].sum():.1f}, beneficiaries={int(df[ben].sum()):,}")

        print("  columns:", ", ".join(df.columns[:12]), "...")
    except Exception as e:
        print(f"[{name}] ERROR:", e)

# Same summary for both synergy tables: clusters first, then sites.
for _label, _key in (("Cluster synergies", "clust_syn"), ("Site synergies", "site_syn")):
    _summarize_synergy(_label, paths[_key])

# ------------------------------
# 15) OD-Lite
# ------------------------------
try:
    pz = paths["od_zone"]
    grav = paths["od_grav"]
    ag = paths["od_agents"]

    # --- Zone attributes: need coordinates and some recognisable id column ---
    if exists_nonempty(pz):
        Z = pd.read_csv(pz)
        has_xy = {"lon", "lat"}.issubset(Z.columns)
        has_id = any(c in Z.columns for c in ["ADM2CD_c", "adm2cd_c", "id", "lab"])
        print("[OD zone attrs] has lon/lat:", ok(has_xy), "| has zone id:", ok(has_id), f"shape={Z.shape}")
    else:
        Z = None
        print("[OD zone attrs] MISSING or EMPTY")

    # --- Gravity table ---
    if exists_nonempty(grav):
        G = pd.read_csv(grav)
        need = {"oi", "dj", "flow", "dist_km"}
        print("[OD gravity] columns:", ok(need.issubset(G.columns)), f"rows={len(G)}")
        if need.issubset(G.columns):
            # Flows must be non-negative (tiny negative float noise tolerated).
            nonneg = (G["flow"] >= -1e-9).all()
            print("  non-negative flows:", ok(nonneg))
            # Quick stats: total flow and flow-weighted mean distance.
            total = G["flow"].sum()
            mean_d = np.average(G["dist_km"], weights=G["flow"]) if total > 0 else np.nan
            print(f"  total trips={total:,.0f} | flow-weighted mean dist={mean_d:,.1f} km")
            # Optional symmetry statistic. sqrt(rows) is used as a rough size
            # hint to avoid huge pivots (presumably the table is a full
            # origin × destination matrix — TODO confirm).
            n_hint = int(np.sqrt(len(G)))
            if n_hint <= 300:
                piv = G.pivot_table(index="oi", columns="dj", values="flow", aggfunc="sum").fillna(0.0)
                # Put both axes on the union of zone ids. Without this the
                # pivot can be non-square (F - F^T then raises and the agents
                # check below is skipped) or square but misaligned.
                zone_ids = piv.index.union(piv.columns)
                piv = piv.reindex(index=zone_ids, columns=zone_ids, fill_value=0.0)
                asym = np.abs(piv.values - piv.values.T).mean()
                print(f"  mean asymmetry |F - F^T| = {asym:,.2f}")
    else:
        print("[OD gravity] MISSING or EMPTY")

    # --- Agents: schema plus share of coordinates inside valid lon/lat bounds ---
    if exists_nonempty(ag):
        A = pd.read_csv(ag)
        need = {"oi", "dj", "o_lon", "o_lat", "d_lon", "d_lat"}
        print("[OD agents] columns:", ok(need.issubset(A.columns)), f"N={len(A)}")
        for k in ["o_lon", "d_lon"]:
            if k in A:
                in_lon = A[k].between(-180, 180).mean()
                print(f"  {k} in [-180,180]: {in_lon:.2%}")
        for k in ["o_lat", "d_lat"]:
            if k in A:
                in_lat = A[k].between(-90, 90).mean()
                print(f"  {k} in [-90,90]: {in_lat:.2%}")
    else:
        print("[OD agents] MISSING or EMPTY")
except Exception as e:
    print("[OD] ERROR:", e)

# ------------------------------
# 16) CROSS-FILE VALIDATION
# ------------------------------
print("\n" + "=" * 80)
print("CROSS-FILE VALIDATION")
print("=" * 80)

try:
    # Use the same "exists and non-empty" test as the per-file checks, so a
    # zero-byte file is skipped instead of making read_csv raise and aborting
    # every remaining cross check. The lookup is loaded once and reused.
    have_lookup = exists_nonempty(paths["lookup"])
    if have_lookup:
        lookup = pd.read_csv(paths["lookup"])

    # Lookup ↔ Admin2 Rank: both tables must carry the same set of ADM2 codes.
    if have_lookup and exists_nonempty(paths["admin2_rank"]):
        admin2_rank = pd.read_csv(paths["admin2_rank"])
        lookup_codes = set(lookup.get("ADM2CD_c", pd.Series(dtype=str)))
        rank_codes = set(admin2_rank.get("ADM2CD_c", pd.Series(dtype=str)))
        codes_match = (lookup_codes == rank_codes)
        print("[Lookup ↔ Admin2 Rank] ADM2CD_c match:", ok(codes_match))
        if not codes_match:
            missing_in_rank = sorted(lookup_codes - rank_codes)[:10]
            missing_in_lookup = sorted(rank_codes - lookup_codes)[:10]
            if missing_in_rank:
                print(f"  Missing in rank (first 10): {missing_in_rank}")
            if missing_in_lookup:
                print(f"  Missing in lookup (first 10): {missing_in_lookup}")

    # Lookup ↔ Muni Indicators: number of distinct ADM2 codes vs lookup rows.
    if have_lookup and exists_nonempty(paths["muni"]):
        muni = pd.read_csv(paths["muni"])
        n_lookup = len(lookup)
        n_muni_unique = len(muni[["ADM2CD_c"]].drop_duplicates()) if "ADM2CD_c" in muni.columns else None
        count_match = (n_muni_unique == n_lookup) if n_muni_unique is not None else False
        print("[Lookup ↔ Muni Indicators] count match:", ok(count_match),
              f"lookup={n_lookup} muni_unique={n_muni_unique}")

    # Check if admin2_rank matches the older rank file structure (intentional alias)
    if admin2_rank is not None and exists_nonempty(paths["rank"]):
        old_rank = pd.read_csv(paths["rank"])
        common_cols = set(admin2_rank.columns) & set(old_rank.columns)
        if len(common_cols) >= 5:
            print("[Admin2 Rank ↔ Priority Rank] OK: same schema by design (legacy alias maintained).")

    # OD zones ↔ lookup (count consistency if both exist)
    if have_lookup and exists_nonempty(paths["od_zone"]):
        Z = pd.read_csv(paths["od_zone"])
        # The first recognisable id column, in this order of preference.
        z_id = next((c for c in ["ADM2CD_c", "adm2cd_c", "lab", "id"] if c in Z.columns), None)
        if z_id is not None:
            # Report the number actually compared (distinct zone ids), not the
            # raw row count, so the printed figures explain the PASS/FAIL.
            n_zones = len(set(Z[z_id]))
            n_match = n_zones == len(lookup)
            print("[OD zones ↔ Lookup] zone count matches:", ok(n_match), f"zones={n_zones} lookup={len(lookup)}")
        else:
            print("[OD zones ↔ Lookup] SKIP (no recognizable zone id column)")

except Exception as e:
    print("[Cross-validation] ERROR:", e)

print("\n" + "=" * 80)
print("VALIDATION COMPLETE")
print("=" * 80)


=== File presence check (project outputs + /mnt/data fallback) ===
iso          -> FOUND | /mnt/c/Users/benny/OneDrive/Documents/Github/ago-lobitocorridor-analysis/outputs/tables/huambo_kpis_isochrones.csv
risk         -> FOUND | /mnt/c/Users/benny/OneDrive/Documents/Github/ago-lobitocorridor-analysis/outputs/tables/huambo_roads_flood_risk_summary.csv
muni         -> FOUND | /mnt/c/Users/benny/OneDrive/Documents/Github/ago-lobitocorridor-analysis/outputs/tables/huambo_municipality_indicators.csv
corr         -> FOUND | /mnt/c/Users/benny/OneDrive/Documents/Github/ago-lobitocorridor-analysis/outputs/tables/huambo_corr_with_rural_poverty.csv
prof         -> FOUND | /mnt/c/Users/benny/OneDrive/Documents/Github/ago-lobitocorridor-analysis/outputs/tables/huambo_municipality_profiles.csv
rank         -> FOUND | /mnt/c/Users/benny/OneDrive/Documents/Github/ago-lobitocorridor-analysis/outputs/tables/huambo_priority_muni_rank.csv
scn_meta     -> FOUND | /mnt/c/Users/benny/OneDrive/Documents/Git