# In [None]:
import geopandas as gpd, pandas as pd, numpy as np, sys, os, time
from fiona import listlayers
from tqdm.auto import tqdm


# === Parameters ===
# Input datasets. The first layer reported by fiona is read from each file.
basins_gpkg = "Cuencas_n5.shp"
rivers_gpkg = "rivers_RAISG_corrected2.gpkg"
basin_layer, river_layer = (
    listlayers(path)[0] for path in (basins_gpkg, rivers_gpkg)
)

# Output GeoPackage (file + layer name) and the companion CSV table.
out_gpkg = "principal_river_by_basin1.gpkg"
out_layer = "principal1"
out_csv = "principal1_river_lengths.csv"

# EPSG code both layers are reprojected to before any processing.
target_epsg = 4326

# Tolerance for simplifying basin polygons; 0 disables simplification.
simplify_tolerance_m = 0

# Name of the GeoSeries method evaluated as river.<predicate>(basin):
# "intersects" | "within" | "crosses" | "overlaps" | "touches" | "contains"
predicate = "intersects"

# VERBOSE lists the candidate rivers before each prompt;
# LOG_EVERY_N is the number of basins between timing log lines.
VERBOSE = False
LOG_EVERY_N = 50

# === Load and reproject ===
# Reads the basin and river layers into `b` and `r`, aborts if either has
# no CRS, then reprojects both to `target_epsg`.
t0 = time.perf_counter()
tqdm.write("Cargando datos…")
b = gpd.read_file(basins_gpkg, layer=basin_layer, encoding="latin1")
r = gpd.read_file(rivers_gpkg, layer=river_layer)

# A layer without a CRS cannot be reprojected, so stop with a clear message.
for layer_name, layer in (("basins", b), ("rivers", r)):
    if layer.crs is None:
        sys.exit(f"{layer_name} sin CRS")

# Report the CRS actually used instead of a hard-coded EPSG code, so the
# message stays correct if `target_epsg` is changed.
tqdm.write(f"Reproyectando a EPSG:{target_epsg}…")
b = b.to_crs(target_epsg)
r = r.to_crs(target_epsg)

# === Column normalisation ===
# Basins: keep only BASIN_ID + geometry. When the layer has no BASIN_ID
# column, the current index values become the identifier. rename_axis()
# makes this work whether or not the index already carries a name.
if "BASIN_ID" not in b.columns:
    b = b.rename_axis("BASIN_ID").reset_index()
b = b[["BASIN_ID", "geometry"]]

# Rivers: MAIN_RIV, ORD_CLAS and geometry are mandatory.
need = ["MAIN_RIV", "ORD_CLAS", "geometry"]
missing = [col for col in need if col not in r.columns]
if missing:
    sys.exit(f"Faltan columnas en ríos: {missing}")

# Discharge is optional. Use a float NaN placeholder (not pd.NA, which
# creates an object-dtype column) so later mean()/sort operations stay numeric.
if "DIS_AV_CMS" not in r.columns:
    r["DIS_AV_CMS"] = np.nan
r = r[["MAIN_RIV", "ORD_CLAS", "DIS_AV_CMS", "geometry"]]

if simplify_tolerance_m > 0:
    tqdm.write(f"Simplificando cuencas (tol={simplify_tolerance_m} m)…")
    tolerance = simplify_tolerance_m
    # simplify() works in CRS units. In a geographic CRS those are degrees,
    # so convert metres with the ~111.32 km-per-degree approximation.
    if b.crs is not None and b.crs.is_geographic:
        tolerance = simplify_tolerance_m / 111_320.0
    b["geometry"] = b.geometry.simplify(tolerance, preserve_topology=True)

# === Spatial join with a progress bar (uses the rivers' spatial index) ===
# For every basin, collect the rivers satisfying river.<predicate>(basin)
# and tag them with that basin's BASIN_ID. Results are stacked in `tagged`.
tqdm.write("Emparejando ríos–cuencas con progreso…")
si = r.sindex
parts = []
join_cols = ["BASIN_ID", "MAIN_RIV", "ORD_CLAS", "DIS_AV_CMS", "geometry"]
basin_iter = tqdm(
    zip(b["BASIN_ID"], b.geometry), total=len(b), desc="sjoin progreso"
)
for bid, geom in basin_iter:
    # Coarse pass: bounding-box candidates from the spatial index.
    idx = list(si.query(geom))
    if len(idx) == 0:
        continue
    nearby = r.iloc[idx]
    # Exact pass: keep only rivers for which the chosen predicate holds.
    matched = nearby[getattr(nearby.geometry, predicate)(geom)]
    if matched.empty:
        continue
    parts.append(matched.assign(BASIN_ID=bid)[join_cols])

if len(parts) == 0:
    sys.exit("No hay ríos que intersecten cuencas.")

stacked = pd.concat(parts, ignore_index=True)
tagged = gpd.GeoDataFrame(stacked, geometry="geometry", crs=r.crs)
tqdm.write(f"sjoin (progresivo) listo: {len(tagged)} emparejamientos.")


# === Minimum ORD_CLAS per basin and candidate rivers ===
# `cands` keeps, for each basin, only the river rows whose ORD_CLAS equals
# the smallest ORD_CLAS found in that basin.
tqdm.write("Calculando ORD_CLAS mínimo por cuenca…")
min_ord = (
    tagged.groupby("BASIN_ID")["ORD_CLAS"]
    .min()
    .rename("min_ord_in_basin")
)
tagged = tagged.merge(min_ord, on="BASIN_ID")
is_lowest_order = tagged["ORD_CLAS"] == tagged["min_ord_in_basin"]
cands = tagged[is_lowest_order].copy()
tqdm.write(f"Candidatos (mín ORD por cuenca): {len(cands)}")

# === Per-basin loop with progress; default choice = highest mean discharge ===
# For each basin the candidate rivers are clipped to the basin polygon,
# aggregated per MAIN_RIV and ranked by mean discharge, then by length.
# One record per basin goes to `rows`; the dissolved clipped geometry of the
# chosen river goes to `geoms` (not available in the fallback branch).


def _length_km(gdf):
    """Return the length of every geometry in *gdf* in kilometres.

    GeoSeries.length is expressed in CRS units, which are degrees for a
    geographic CRS such as EPSG:4326. In that case the geodesic length on
    the CRS ellipsoid is used instead; otherwise the planar length is kept
    (NOTE(review): assumes a projected CRS has metre units — confirm).
    """
    crs = gdf.crs
    if crs is not None and crs.is_geographic:
        geod = crs.get_geod()
        metres = [
            0.0 if (g is None or g.is_empty) else geod.geometry_length(g)
            for g in gdf.geometry
        ]
        return pd.Series(metres, index=gdf.index, dtype="float64") / 1000
    return gdf.length / 1000


def _summarise_by_river(df, length_col):
    """Aggregate *df* per MAIN_RIV: summed length, mean discharge, min order."""
    return df.groupby("MAIN_RIV", as_index=False).agg(
        len_km_in_basin=(length_col, "sum"),
        dis_av_cms=("DIS_AV_CMS", "mean"),
        ord_clas=("ORD_CLAS", "min"),
    )


def _rank(agg):
    """Sort candidates by discharge, then length, both descending."""
    ranked = agg.sort_values(
        ["dis_av_cms", "len_km_in_basin"], ascending=[False, False]
    )
    return ranked.reset_index(drop=True)


def _fields(agg, pos):
    """Return (MAIN_RIV, length km, discharge, order) of the row at *pos*."""
    rec = agg.iloc[pos]
    return (
        rec["MAIN_RIV"],
        rec["len_km_in_basin"],
        rec["dis_av_cms"],
        rec["ord_clas"],
    )


# Make sure discharge is numeric so that mean() and sorting behave.
cands["DIS_AV_CMS"] = pd.to_numeric(cands["DIS_AV_CMS"], errors="coerce")
cands["len_km_total"] = _length_km(cands)
rows = []
geoms = []
basin_geom = dict(zip(b["BASIN_ID"], b.geometry))
time_clip = time_group = time_choose = 0.0
cuencas = sorted(cands["BASIN_ID"].unique().tolist())
# Row positions of each basin, computed once instead of re-filtering `cands`
# with a boolean mask on every iteration.
rows_by_basin = cands.groupby("BASIN_ID").indices
tqdm.write(f"Procesando {len(cuencas)} cuencas…")

for idx, bid in enumerate(tqdm(cuencas, desc="Cuencas")):
    grp = cands.iloc[rows_by_basin[bid]]
    g_basin = basin_geom[bid]

    tA = time.perf_counter()
    clipped = gpd.clip(grp, g_basin)
    time_clip += time.perf_counter() - tA

    if clipped.empty:
        # Nothing is left after clipping: rank the candidates using their
        # total (unclipped) length and record the basin without a geometry.
        tB = time.perf_counter()
        agg = _summarise_by_river(grp, "len_km_total")
        time_group += time.perf_counter() - tB

        tC = time.perf_counter()
        agg = _rank(agg)
        river_id, length_km, discharge, order = _fields(agg, 0)
        time_choose += time.perf_counter() - tC

        rows.append({
            "BASIN_ID": bid,
            "MAIN_RIV": river_id,
            "len_km_in_basin": length_km,
            "dis_av_cms": discharge,
            "ord_clas_elegido": int(order),
            "modo": "fallback_total",
        })
    else:
        clipped["len_km_in_basin"] = _length_km(clipped)

        tB = time.perf_counter()
        agg = _summarise_by_river(clipped, "len_km_in_basin")
        time_group += time.perf_counter() - tB

        tC = time.perf_counter()
        if len(agg) == 1:
            # Single candidate: no choice to make.
            river_id, length_km, discharge, order = _fields(agg, 0)
            mode = "auto"
        else:
            agg = _rank(agg)
            if VERBOSE:
                tqdm.write(f"Cuenca {bid}: {len(agg)} candidatos (min ORD={int(agg['ord_clas'].min())})")
                for i, row in agg.iterrows():
                    d = 0 if pd.isna(row.dis_av_cms) else row.dis_av_cms
                    tqdm.write(f"  [{i}] MAIN_RIV={row.MAIN_RIV} | Q={d:.1f} m³/s | L={row.len_km_in_basin:.2f} km")
            # NOTE(review): the prompt is shown even when VERBOSE is False,
            # i.e. without the candidate listing — confirm this is intended.
            try:
                s = input(f"Cuenca {bid}: Elige índice (Enter=mayor caudal): ").strip()
            except EOFError:
                # No stdin available: fall back to the default choice.
                s = ""
            # Anything that is not a valid in-range index selects row 0.
            # isdecimal() rejects characters such as "²" that int() cannot
            # parse.
            ch = int(s) if (s.isdecimal() and int(s) < len(agg)) else 0
            river_id, length_km, discharge, order = _fields(agg, ch)
            mode = "manual" if ch != 0 else "auto"
        time_choose += time.perf_counter() - tC

        rows.append({
            "BASIN_ID": bid,
            "MAIN_RIV": river_id,
            "len_km_in_basin": length_km,
            "dis_av_cms": discharge,
            "ord_clas_elegido": int(order),
            "modo": mode,
        })
        # Merge all clipped pieces of the chosen river into one geometry.
        gsel = clipped[clipped["MAIN_RIV"] == river_id]
        ggeom = gsel.dissolve(by="MAIN_RIV").geometry.iloc[0]
        geoms.append((bid, river_id, length_km, discharge, int(order), ggeom))

    if (idx + 1) % LOG_EVERY_N == 0:
        tqdm.write(f"[{idx+1}/{len(cuencas)}] t_clip={time_clip:,.1f}s t_group={time_group:,.1f}s t_choose={time_choose:,.1f}s")

# Tabular result: one record per processed basin.
principal_tbl = pd.DataFrame(rows)

# Spatial result: one dissolved river geometry per basin that had a non-empty
# clip. An empty frame with the same columns is built when there are none.
pg_columns = [
    "BASIN_ID",
    "MAIN_RIV",
    "len_km_in_basin",
    "dis_av_cms",
    "ord_clas_min_basin",
    "geometry",
]
if geoms:
    pg_records = [dict(zip(pg_columns, rec)) for rec in geoms]
    pg = gpd.GeoDataFrame(pg_records, geometry="geometry", crs=b.crs)
else:
    pg = gpd.GeoDataFrame(columns=pg_columns, geometry="geometry", crs=b.crs)

# Remove any previous output GeoPackage before writing (presumably so that
# layers from an earlier run do not linger in the file). A failed removal is
# reported but does not abort the run. Only OSError is caught, so
# KeyboardInterrupt and programming errors are no longer swallowed.
if os.path.exists(out_gpkg):
    try:
        os.remove(out_gpkg)
    except OSError as exc:
        tqdm.write(f"Aviso: no se pudo eliminar {out_gpkg}: {exc}")

tqdm.write("Escribiendo GPKG y CSV…")
pg.to_file(out_gpkg, layer=out_layer, driver="GPKG")
principal_tbl.to_csv(out_csv, index=False)
tqdm.write("¡Listo!")
