In [None]:
from pathlib import Path
import sys

# The project root is taken to be the parent of the current working
# directory; it must be on sys.path before the pipeline package is imported.
_project_root = Path().resolve().parent
sys.path.insert(0, str(_project_root))

from python_pipeline_scripts import utils, runner

# Load the pipeline configuration from <project root>/config/config.yaml.
config = utils.load_config(_project_root / 'config' / 'config.yaml')



%load_ext autoreload
%autoreload 1

### working for diffuse loads

In [None]:
# Jupyter single-cell: Monte Carlo pipeline using generic ops with mean/extreme/random modes and per-realization provenance files.
from pathlib import Path
import sys
from itertools import product
import random
import geopandas as gpd

# Ensure the project root (parent of this folder) is importable
sys.path.insert(0, str(Path().resolve().parent))

from python_pipeline_scripts import utils
from python_pipeline_scripts.raster_agg import raster_zonal_aggregation_to_gpkg
from python_pipeline_scripts.transforms.soil_chm import (
    read_n_p_means_from_csv_to_df,
    transform_apply_ops,
    transform_split_with_bounds,
    transform_write_chm_from_df,
)

from python_pipeline_scripts.transforms.soil_chm import (
    read_n_p_means_from_csv_to_df,
    transform_apply_ops,
    transform_split_with_bounds,
    transform_write_chm_from_df,
)

from python_pipeline_scripts.transforms.point_dat import (
    read_population_by_subbasin_csv_to_df,
    transform_interpolate_years_wide,
    transform_build_point_load_timeseries,
    transform_write_point_dat_from_df,
)

from python_pipeline_scripts.mc_engine import run_monte_carlo
from python_pipeline_scripts.provenance_report import summarize_run, realization_report, build_upstream_inputs

DEBUG = True  # set False to silence extra transform prints


def _resolve(p: str, base: Path) -> Path:
    """Return *p* as a Path, anchoring relative inputs at *base*.

    Absolute inputs are returned as given (they are not resolved); relative
    inputs are joined onto ``base`` and then resolved.
    """
    candidate = Path(p)
    if candidate.is_absolute():
        return candidate
    return (base / candidate).resolve()

# ------------------------- 0) Resolve paths -------------------------
# NOTE: script_dir and zones_fp are machine-specific absolute paths. The two
# relative paths below are anchored at script_dir by _resolve().
script_dir = Path(r"C:\Users\Usuario\OneDrive - UNIVERSIDAD DE HUELVA\Granada\TrabajoFM\scripts\script DIFFUSE loads - input .chm")

output_gpkg = r"..\..\Genil GEO_INFO_POOL\Input Data\Diffuse loads\Soil chemical composition\python calculated hru stats\hru_chem_stats.gpkg"
raster_folder = r"..\..\Genil_ArcGIS_Ruben\ArcGIS _ base de suelo quimico _ ruben _ 30-05-25\raster\temp_rasters"
zones_fp = r"C:\Users\Usuario\OneDrive - UNIVERSIDAD DE HUELVA\Archivos de Cesar Ruben Fernandez De Villaran San Juan - swat_cubillas\cubillas_hru\Watershed\Shapes\hru1.shp"

output_gpkg_p = _resolve(output_gpkg, script_dir)
raster_folder_p = _resolve(raster_folder, script_dir)
zones_fp_p = Path(zones_fp)

# Config path is relative to the current working directory.
cfg = utils.load_config(Path("../config/config.yaml"))

# Aggregate every "*_rediam.tif" raster over the HRU polygons (mean per zone)
# and write the result to the GeoPackage. The return value is not needed here.
_ = raster_zonal_aggregation_to_gpkg(
    raster_folder=raster_folder_p,
    zones_fp=zones_fp_p,
    zone_field="HRU_GIS",
    label_field="OBJECTID",
    output_gpkg=output_gpkg_p,
    files_end_with="_rediam.tif",
    stat_operation="mean",
    raster_alias="full_name",
    zone_meaning="HRU",
    overwrite_cache=False,
    write_manifest=True,  # emit .manifest.json
    config=cfg,
)

# Export the aggregated layer (without geometry) to a ';'-separated CSV that
# sits next to the GeoPackage.
layer_name = "values_by_hru"
gdf = gpd.read_file(output_gpkg_p, layer=layer_name)
# Swap only the file extension: a plain str.replace(".gpkg", ".csv") would
# also rewrite any ".gpkg" occurring earlier in the path. csv_output stays a
# str because later code passes it on as one.
csv_output = str(output_gpkg_p.with_suffix(".csv"))
gdf.drop(columns="geometry").to_csv(csv_output, sep=";", index=False)
print(f"Exported to CSV: {csv_output}")

# -------------------- 1) Monte Carlo spec (single dict) --------------------
# Declarative description of the run. Each entry of "transforms" carries a
# "type" that section 2 below maps onto a transform function; the
# "mean"/"lower"/"upper" values are what the override builders in section 3
# turn into per-realization parameters.
# NOTE: in 'extreme' mode every entry whose lower and upper differ doubles the
# number of realizations (the builder takes the product of all corners).
# MODE: 'extreme', 'random', or 'mean'
MC_SPEC = {
    "mode": "mean",   # 'mean' | 'extreme' | 'random'
    "draws": 2,       # used only when mode == 'random'
    "transforms": [
        # 0) Base conversions via generic ops (unbounded, deterministic)
        {"type": "ops", "name": "compute_base_ops", "ops": [
            {
                # presumably percent -> mg/kg (x 10 000) -- TODO confirm units
                "src": "mean_Nitrogeno_total_porcent_resample_Rediam",
                "out": "N_total_mg_kg",
                "op": "mul",
                #"factor": 10_000.0,      # deterministic (superseded by "mean" below)
                "mean": 10_000.0, "lower": None, "upper": None,
                "source": "deterministic",
            },
            {
                # presumably mg P2O5 / 100 g -> mg elemental P / kg
                # (x 10 for the mass basis, x 0.4364 for P2O5 -> P) -- TODO confirm
                "src": "mean_Fosforo_mg_100g_P205_rediam",
                "out": "P_element_mg_kg",
                "op": "mul",
                #"factor": 10.0 * 0.4364, # deterministic (superseded by "mean" below)
                "mean": 10.0 * 0.4364, "lower": None, "upper": None,
                "source": "deterministic",
            },
        ]},


        # DISABLED: optional scaling step. While it stays commented out, the
        # *_scaled columns do not exist and the later steps read the unscaled
        # N_total_mg_kg / P_element_mg_kg columns directly.
        # # 1) Scaling via generic ops (bounded; per-realization picks factor)
        # {"type": "ops", "name": "scale_ops", "ops": [
        #     {
        #         "src": "N_total_mg_kg",
        #         "out": "N_total_mg_kg_scaled",
        #         "op": "mul",
        #         "mean": 1.0, "lower": 0.8, "upper": 1.2,
        #     },
        #     {
        #         "src": "P_element_mg_kg",
        #         "out": "P_element_mg_kg_scaled",
        #         "op": "mul",
        #         "mean": 1.0, "lower": 0.6, "upper": 1.4,
        #     },
        # ]},

        # 2) Split N_total_mg_kg into pools (bounded ratios)
        {"type": "split", "src": "N_total_mg_kg", "renormalize": True,
         "outputs": [
             {"name": "Soil NO3 [mg/kg]", "mean": 0.02, "lower": 0.018, "upper": 0.022},
             {"name": "Soil organic N [mg/kg]", "mean": 0.98, "lower": 0.978, "upper": 0.982},
         ]},

        # 2b) Derive Soil organic P from N (bounded ratio)
        # NOTE(review): "source" says "deterministic" but lower/upper differ,
        # so the extreme/random builders will vary this factor and overwrite
        # "source" -- confirm which of the two is intended.
        {"type": "ops", "name": "derive_p_org_from_n", "ops": [
            {
                "src": "N_total_mg_kg",
                "out": "Soil organic P [mg/kg]",
                "op": "mul",
                "mean": 0.125, "lower": 0.1, "upper": 0.3,
                "source": "deterministic",
            },
        ]},


        # 3) Split P_element_mg_kg into pools. Only one pool is active and its
        #    ratio is fixed at 1.0 (lower == upper), so it never varies.
        {"type": "split", "src": "P_element_mg_kg", "renormalize": True,
         "outputs": [
             {"name": "Soil labile P [mg/kg]", "mean": 1.0, "lower": 1.0, "upper": 1.0},
             # Disabled: organic P is derived from N in step 2b instead.
             #{"name": "Soil organic P [mg/kg]", "mean": 0.0, "lower": 0.0, "upper": 0.0},
         ]},



        # 4) Write CHM files (label_map maps each column onto the identical label)
        {"type": "write_chm", "id_col": "HRU_GIS",
         "label_map": {
             "Soil NO3 [mg/kg]": "Soil NO3 [mg/kg]",
             "Soil organic N [mg/kg]": "Soil organic N [mg/kg]",
             "Soil labile P [mg/kg]": "Soil labile P [mg/kg]",
             "Soil organic P [mg/kg]": "Soil organic P [mg/kg]",
         }},


        # TODO: read year_start / year_end from the .cio file so they always
        # match the simulation period instead of being hard-coded here.
        # Interpolate to full yearly range (optional; place before build_point):
        {"type": "interpolate_years_wide", "id_col": "GRIDCODE", "year_start": 1970, "year_end": 2021},

        # Build point-source timeseries from population.
        # Each mgL_values entry is {mean, lower, upper}; entries with
        # lower == upper (NO3YR, NO2YR) are never varied by the extreme builder.
        {
            "type": "build_point",
            "id_col": "GRIDCODE",
            "wastewater_lppd": 150.0,
            "mgL_values": {
            "ORGNYR": {"mean": 15, "lower": 12, "upper": 18},
            "ORGPYR": {"mean": 3,  "lower": 2.5, "upper": 3.5},
            "NO3YR":  {"mean": 0,  "lower": 0,   "upper": 0},
            "NH3YR":  {"mean": 25, "lower": 20,  "upper": 30},
            "NO2YR":  {"mean": 0,  "lower": 0,   "upper": 0},
            "MINPYR": {"mean": 5,  "lower": 4,   "upper": 6},
            "SEDYR":  {"mean": 720,"lower": 600, "upper": 800},
            "CBODYR": {"mean": 220,"lower": 180, "upper": 260},
            "DISOXYR":{"mean": 2.5,"lower": 2.0, "upper": 3.0},
            "CHLAYR": {"mean": 0.001,"lower": 0.0,"upper": 0.002}
            },
            "out_columns": ["YEAR","FLOYR","SEDYR","ORGNYR","ORGPYR","NO3YR","NH3YR","NO2YR","MINPYR","CBODYR","DISOXYR","CHLAYR"]
        },

        # TODO: read start_year / end_year from the .cio file so they always
        # match the simulation period instead of being hard-coded here.
        # Write .dat files:
        {"type": "write_point_dat",
        "id_col": "GRIDCODE",
        "columns_order": ["YEAR","FLOYR","SEDYR","ORGNYR","ORGPYR","NO3YR","NH3YR","NO2YR","MINPYR","CBODYR","DISOXYR","CHLAYR"],
        "start_year": 1970, "end_year": 2021}


    ]
}

# ------------------ 2) Build transforms & defaults ------------------
# Machine-specific SWAT directories: the base TxtInOut that realizations are
# derived from, plus the roots for realization workspaces and results.
base_txtinout = r"C:\SWAT\RSWAT\cubillas\cubillas_set_219_ruben\cubillas_BASE_set-219\TxtInOut_1"
base_txtinout_p = Path(base_txtinout)

realizations_root = r"C:\SWAT\RSWAT\cubillas\mc_realizations"
realizations_root_p = Path(realizations_root)

results_root = r"C:\SWAT\RSWAT\cubillas\mc_results"
results_root_p = Path(results_root)

# The manifest is expected next to the GeoPackage, named
# "<gpkg file name>.manifest.json".
manifest_file = output_gpkg_p.with_name(f"{output_gpkg_p.name}.manifest.json")

# Describe the upstream artefacts (rasters, zones, GeoPackage, CSV) for the
# provenance report.
upstream = build_upstream_inputs(
    raster_folder=raster_folder_p,
    pattern="*_rediam.tif",
    zones_fp=zones_fp_p,
    gpkg_path=output_gpkg_p,
    csv_path=Path(csv_output),
)

# Parallel lists: transforms[i] is called with transforms_params[i] as its
# default parameters.
transforms = []
transforms_params = []


def _manifest_source():
    """Manifest path as a string, or None when the manifest file is absent."""
    return str(manifest_file) if manifest_file.exists() else None


def _ops_params(t):
    """Default parameters for a generic "ops" transform."""
    return {
        "ops": t["ops"],
        "input_source": _manifest_source(),
        "debug": DEBUG,
    }


def _split_params(t):
    """Default parameters for a "split" transform; outputs are tagged with the run mode."""
    split_outputs = []
    for out in t["outputs"]:
        split_outputs.append({
            "name": out["name"],
            "mean": out["mean"],
            "lower": out["lower"],
            "upper": out["upper"],
            "source": MC_SPEC["mode"],
        })
    return {
        "src": t["src"],
        "renormalize": bool(t.get("renormalize", True)),
        "outputs": split_outputs,
        "input_source": _manifest_source(),
        "debug": DEBUG,
    }


def _write_chm_params(t):
    """Default parameters for the CHM writer."""
    return {"id_col": t["id_col"], "label_map": t["label_map"]}


def _interpolate_params(t):
    """Point loads: interpolate decade -> year (wide table)."""
    return {
        "id_col": t.get("id_col", "GRIDCODE"),
        "year_start": int(t["year_start"]),
        "year_end": int(t["year_end"]),
        "keep_existing": bool(t.get("keep_existing", True)),
    }


def _build_point_params(t):
    """Point loads: build timeseries from population and mg/L specs."""
    return {
        "id_col": t.get("id_col", "GRIDCODE"),
        "wastewater_lppd": float(t.get("wastewater_lppd", 150.0)),
        "mgL_values": t["mgL_values"],        # dict[var] = {mean, lower, upper}
        "out_columns": t.get("out_columns"),  # optional ordered list
        "round_to": int(t.get("round_to", 6)),
    }


def _write_point_dat_params(t):
    """Point loads: write rcyr_<subbasin>.dat per subbasin."""
    return {
        "id_col": t.get("id_col", "GRIDCODE"),
        "columns_order": t.get("columns_order"),
        "start_year": t.get("start_year"),
        "end_year": t.get("end_year"),
    }


# Spec "type" -> (transform function, builder of its default parameters).
# NOTE: the legacy "compute_base" transform is intentionally not registered;
# base conversions are expressed as "ops" entries in MC_SPEC instead.
_TRANSFORM_REGISTRY = {
    "ops": (transform_apply_ops, _ops_params),
    "split": (transform_split_with_bounds, _split_params),
    "write_chm": (transform_write_chm_from_df, _write_chm_params),
    "interpolate_years_wide": (transform_interpolate_years_wide, _interpolate_params),
    "build_point": (transform_build_point_load_timeseries, _build_point_params),
    "write_point_dat": (transform_write_point_dat_from_df, _write_point_dat_params),
}

for t in MC_SPEC["transforms"]:
    ttype = t["type"]
    if ttype not in _TRANSFORM_REGISTRY:
        raise ValueError(f"Unknown transform type: {ttype}")
    transform_fn, make_params = _TRANSFORM_REGISTRY[ttype]
    transforms.append(transform_fn)
    transforms_params.append(make_params(t))

# ------------------ 3) Build per_realization overrides ------------------
def _op_param_name(kind: str) -> str:
    """Return the override key that carries the numeric parameter of an op kind.

    "mul"/"relative" use "factor", "add"/"absolute" use "delta", "set" uses
    "value"; any other kind falls back to "factor".
    """
    param_by_kind = {
        **dict.fromkeys(("mul", "relative"), "factor"),
        **dict.fromkeys(("add", "absolute"), "delta"),
        "set": "value",
    }
    try:
        return param_by_kind[kind]
    except KeyError:
        return "factor"

def _has_bounds(lo, hi) -> bool:
    """Report whether ``lo``/``hi`` describe a non-degenerate numeric range.

    Returns False when either bound is None, when both bounds are equal after
    conversion to float, or when a bound cannot be converted to float.
    """
    if lo is None or hi is None:
        return False
    try:
        return float(hi) != float(lo)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "no usable bounds"; anything else
        # (e.g. KeyboardInterrupt) must propagate instead of being swallowed.
        return False

def build_extreme_overrides(MC_SPEC):
    """Enumerate every lower/upper corner of the bounded parameters in the spec.

    For each transform a list of candidate override dicts is built; the result
    is the Cartesian product of those lists, one inner list (one override per
    transform, in spec order) per realization.

    * "ops": each op with distinct bounds contributes a lower and an upper
      variant; an "ops" transform without any bounded op contributes a single
      empty override.
    * "split": each output contributes its lower ratio and, unless lower and
      upper coincide, its upper ratio.
    * "build_point": each variable contributes lower/upper concentrations when
      bounded, otherwise its mean (0.0 when no mean is given).
    * any other type contributes a single empty override.
    """
    from itertools import product

    def _ops_options(spec):
        per_op_choices = []
        bounded_seen = False
        for op in spec["ops"]:
            param = _op_param_name(op.get("op", "mul"))
            lower, upper = op.get("lower"), op.get("upper")
            if not _has_bounds(lower, upper):
                per_op_choices.append([dict(op)])
                continue
            bounded_seen = True
            per_op_choices.append([
                {**op, param: float(lower), "source": "extreme_lower"},
                {**op, param: float(upper), "source": "extreme_upper"},
            ])
        if not bounded_seen:
            return [{}]
        return [
            {"ops": [dict(choice) for choice in combo]}
            for combo in product(*per_op_choices)
        ]

    def _split_options(spec):
        per_output_choices = []
        for out in spec["outputs"]:
            lower, upper = float(out["lower"]), float(out["upper"])
            name = out["name"]
            picks = [{"name": name, "ratio": lower, "source": "extreme_lower"}]
            if not (abs(upper - lower) < 1e-12):
                picks.append({"name": name, "ratio": upper, "source": "extreme_upper"})
            per_output_choices.append(picks)
        return [{"outputs": list(combo)} for combo in product(*per_output_choices)]

    def _point_options(spec):
        per_var_choices = []
        for var, meta in spec["mgL_values"].items():
            lower, upper = meta.get("lower"), meta.get("upper")
            if _has_bounds(lower, upper):
                per_var_choices.append([
                    {var: {"mgL": float(lower), "source": "extreme_lower"}},
                    {var: {"mgL": float(upper), "source": "extreme_upper"}},
                ])
            else:
                per_var_choices.append([
                    {var: {"mgL": float(meta.get("mean", 0.0)), "source": "mean"}},
                ])
        return [
            {"mgL_values": {k: v for piece in combo for k, v in piece.items()}}
            for combo in product(*per_var_choices)
        ]

    options_per_transform = []
    for spec in MC_SPEC["transforms"]:
        kind = spec["type"]
        if kind == "ops":
            options_per_transform.append(_ops_options(spec))
        elif kind == "split":
            options_per_transform.append(_split_options(spec))
        elif kind == "build_point":
            options_per_transform.append(_point_options(spec))
        else:
            options_per_transform.append([{}])

    return [list(corner) for corner in product(*options_per_transform)]

def build_random_overrides(MC_SPEC, n_draws, seed=0):
    """Draw ``n_draws`` sets of per-transform overrides, uniformly within bounds.

    A private ``random.Random(seed)`` is used, so equal seeds give equal
    draws. Each draw is a list with one override dict per transform, in spec
    order:

    * "ops": ops with both bounds set get a uniform sample for their
      parameter; other ops are copied unchanged.
    * "split": each output ratio is sampled uniformly, or pinned to the lower
      bound when lower and upper coincide.
    * "build_point": each concentration is sampled uniformly when both bounds
      are set, otherwise its mean is used (0.0 when no mean is given).
    * any other type gets an empty override.
    """
    import random

    rng = random.Random(seed)

    def _sample_ops(spec):
        sampled = []
        for op in spec["ops"]:
            param = _op_param_name(op.get("op", "mul"))
            lower, upper = op.get("lower"), op.get("upper")
            if lower is None or upper is None:
                sampled.append(dict(op))
            else:
                sampled.append({
                    **op,
                    param: rng.uniform(float(lower), float(upper)),
                    "source": "random",
                })
        return {"ops": sampled}

    def _sample_split(spec):
        sampled = []
        for out in spec["outputs"]:
            lower, upper = float(out["lower"]), float(out["upper"])
            if abs(upper - lower) < 1e-12:
                ratio, tag = lower, "lower_equals_upper"
            else:
                ratio, tag = rng.uniform(lower, upper), "random"
            sampled.append({"name": out["name"], "ratio": ratio, "source": tag})
        return {"outputs": sampled}

    def _sample_point(spec):
        concentrations = {}
        for var, meta in spec["mgL_values"].items():
            lower, upper = meta.get("lower"), meta.get("upper")
            if lower is not None and upper is not None:
                concentrations[var] = {
                    "mgL": rng.uniform(float(lower), float(upper)),
                    "source": "random",
                }
            else:
                concentrations[var] = {
                    "mgL": float(meta.get("mean", 0.0)),
                    "source": "mean",
                }
        return {"mgL_values": concentrations}

    def _sample(spec):
        kind = spec["type"]
        if kind == "ops":
            return _sample_ops(spec)
        if kind == "split":
            return _sample_split(spec)
        if kind == "build_point":
            return _sample_point(spec)
        return {}

    # The RNG is consumed draw by draw, and transform by transform within a draw.
    return [
        [_sample(spec) for spec in MC_SPEC["transforms"]]
        for _ in range(n_draws)
    ]

def build_mean_overrides(MC_SPEC):
    """Build the single realization that pins every parameter to its mean.

    Returns a one-element list whose only item holds one override dict per
    transform, in spec order:

    * "ops": each op's parameter is set to ``float(mean)``; when the op has no
      mean, whatever the op already stores under that parameter key is kept.
    * "split": each output ratio is its mean.
    * "build_point": each concentration is its mean (0.0 when missing).
    * any other type gets an empty override.
    """
    def _mean_override(spec):
        kind = spec["type"]
        if kind == "ops":
            pinned = []
            for op in spec["ops"]:
                param = _op_param_name(op.get("op", "mul"))
                mean = op.get("mean")
                value = op.get(param) if mean is None else float(mean)
                pinned.append({**op, param: value, "source": "mean"})
            return {"ops": pinned}
        if kind == "split":
            return {"outputs": [
                {"name": out["name"], "ratio": float(out["mean"]), "source": "mean"}
                for out in spec["outputs"]
            ]}
        if kind == "build_point":
            return {"mgL_values": {
                var: {"mgL": float(meta.get("mean", 0.0)), "source": "mean"}
                for var, meta in spec["mgL_values"].items()
            }}
        return {}

    return [[_mean_override(spec) for spec in MC_SPEC["transforms"]]]

# Pick the override builder that matches the requested mode. The builders are
# defined above as build_<mode>_overrides; the previous "*_from_spec" names
# did not exist and raised NameError.
MODE = MC_SPEC["mode"].strip().lower()
if MODE == "extreme":
    per_realization_params = build_extreme_overrides(MC_SPEC)
elif MODE == "random":
    # "draws" is only meaningful in random mode; default to a single draw.
    per_realization_params = build_random_overrides(MC_SPEC, MC_SPEC.get("draws", 1), seed=0)
elif MODE == "mean":
    per_realization_params = build_mean_overrides(MC_SPEC)
else:
    raise ValueError(f"Unknown mode: {MODE}")

# ------------------ 4) Run MC (write CHMs only) ------------------
# With RUN_MODEL False the executable is not launched (exe_path is None).
RUN_MODEL = False

results = run_monte_carlo(
    # One realization per override set produced in section 3.
    N=len(per_realization_params),
    base_txtinout=base_txtinout_p,
    realization_root=realizations_root_p,
    results_root=results_root_p,
    link_file_regexes=[r"^[0-9]+\.chm$"],
    outputs_to_copy=["output.std", "*.rch"],  # unused when RUN_MODEL=False
    # Input table: per-HRU N and P means read back from the exported CSV.
    aggregator=lambda: read_n_p_means_from_csv_to_df(
        csv_output,
        id_col="HRU_GIS",
        n_col="mean_Nitrogeno_total_porcent_resample_Rediam",
        p_col="mean_Fosforo_mg_100g_P205_rediam",
    ),
    transforms=transforms,
    transforms_params=transforms_params,
    per_realization_params=per_realization_params,
    exe_path=None,
    seed=0,
    expect_plus=False,
    config=cfg,
    include_base_run=False,
    create_workspace_copy=True,
    force_recreate_workspace=True,
    report=True,
    run_model=RUN_MODEL,
    # Provenance: upstream artefacts plus the manifest, when it exists.
    upstream_inputs=upstream,
    manifest_file=manifest_file if manifest_file.exists() else None,
    auto_attach_manifest=True,
)

ok = len([r for r in results if r.success])
# -1 stands in for "no run" when nothing was returned.
run_id = -1 if not results else results[0].run_id
print(f"Created {len(results)} realizations; {ok} succeeded. run_id={run_id}")
for r in results:
    print(f"- {r.name}: id={r.realization_id} run_id={r.run_id} success={r.success} folder={r.folder}")

# ------------------ 5) Concise run + realization summaries ------------------
print("\n=== Monte Carlo run summary ===")
print(summarize_run(run_id))

# Detailed report for the first realization only, as a spot check.
if results:
    one_id = results[0].realization_id
    print(f"\n=== Single realization report (id={one_id}) ===")
    print(realization_report(one_id))


2025-09-02 17:32:26,256 | INFO | python_pipeline_scripts.raster_agg | Zonal aggregation start | zones=C:\Users\Usuario\OneDrive - UNIVERSIDAD DE HUELVA\Archivos de Cesar Ruben Fernandez De Villaran San Juan - swat_cubillas\cubillas_hru\Watershed\Shapes\hru1.shp | rasters_dir=C:\Users\Usuario\OneDrive - UNIVERSIDAD DE HUELVA\Granada\TrabajoFM\Genil_ArcGIS_Ruben\ArcGIS _ base de suelo quimico _ ruben _ 30-05-25\raster\temp_rasters
2025-09-02 17:32:26,756 | INFO | python_pipeline_scripts.raster_agg | Valid HRU features: 878 | bounds=[ 438912.5        4123462.50012207  470812.5        4158362.50012207]
2025-09-02 17:32:26,760 | INFO | python_pipeline_scripts.raster_agg | Found 2 raster(s) matching '_rediam.tif'
2025-09-02 17:32:26,762 | INFO | python_pipeline_scripts.raster_agg | Reprojected (cached) -> C:\Users\Usuario\OneDrive - UNIVERSIDAD DE HUELVA\Granada\TrabajoFM\Genil_ArcGIS_Ruben\ArcGIS _ base de suelo quimico _ ruben _ 30-05-25\raster\temp_rasters\temp_cache\Fosforo_mg_100g_P205_

Exported to CSV: C:\Users\Usuario\OneDrive - UNIVERSIDAD DE HUELVA\Granada\TrabajoFM\Genil GEO_INFO_POOL\Input Data\Diffuse loads\Soil chemical composition\python calculated hru stats\hru_chem_stats.csv


NameError: name 'build_mean_overrides_from_spec' is not defined