In [19]:
# BLOQUE 1: imports y utilidades
from pathlib import Path
import pandas as pd
import numpy as np

pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 160)

def norm_emp_id(x):
    if pd.isna(x): return x
    s = str(x).strip()
    return s.zfill(4) if s.isdigit() and len(s) <= 4 else s

def make_trip_uid(df):
    """UID de viaje dependiente de bus: par (mean_id, trip_id)."""
    a = df.get("mean_id").astype(str)
    b = df.get("trip_id").astype(str)
    return (a + "§" + b)


In [20]:
# BLOQUE 2 (actualizado): paths, lectura de trips y catálogos (línea)
PATH_TRIPS = Path("data/processed/gps_match_trips.parquet")
PATH_POINTS = Path("data/processed/gps_match_points.parquet")  # opcional
PATH_EOTS  = Path("data/raw/eots.csv")
PATH_RUTAS = Path("data/raw/catalogo_rutas_cid.csv")
OUT_DIR    = Path("data/processed")
OUT_DIR.mkdir(parents=True, exist_ok=True)

trips = pd.read_parquet(PATH_TRIPS, engine="pyarrow").copy()

# Normalizaciones mínimas
for c in ("agency_id","route_id","ruta_hex","mean_id"):
    if c in trips.columns:
        trips[c] = trips[c].astype(str).str.upper().str.strip()
if "agency_id" in trips.columns:
    trips["agency_id"] = trips["agency_id"].apply(norm_emp_id)

# Asegurar numéricos
for c in ("pts_en_declared","pts_trip","ratio"):
    if c in trips.columns:
        trips[c] = pd.to_numeric(trips[c], errors="coerce")

# Cargar catálogo de rutas para traer LINEA desde ruta_hex
rutas_cat = pd.read_csv(PATH_RUTAS, dtype=str)
rutas_cat.columns = [c.strip().lower() for c in rutas_cat.columns]
if "ruta_hex" in rutas_cat.columns:
    rutas_cat["ruta_hex"] = rutas_cat["ruta_hex"].astype(str).str.upper().str.strip()
else:
    raise ValueError("catalogo_rutas_cid.csv no tiene columna 'ruta_hex'.")

linea_cols = ["ruta_hex"]
for opt in ["linea","ramal","origen","destino","identificacion"]:
    if opt in rutas_cat.columns:
        linea_cols.append(opt)
rutas_dim = rutas_cat[linea_cols].drop_duplicates()

# Merge: agregar columna 'linea' a trips usando ruta_hex
trips = trips.merge(rutas_dim, on="ruta_hex", how="left")

# Hora si existe
if "hora" in trips.columns:
    trips["hora"] = trips["hora"].astype("Int64")

print("shape:", trips.shape)
display(trips.head(3)[["agency_id","mean_id","trip_id","ruta_hex","linea","route_id","pts_en_declared","pts_trip","ratio","hora"]])


shape: (521, 16)


Unnamed: 0,agency_id,mean_id,trip_id,ruta_hex,linea,route_id,pts_en_declared,pts_trip,ratio,hora
0,5,005DD,-1,NONE,,001D,0,77,0.0,10
1,5,005DD,0,00B1,34,001D,0,110,0.0,10
2,5,005DF,0,008C,23-33,001D,0,196,0.0,10


In [21]:
# BLOQUE 3 (actualizado): columnas derivadas y trip_uid
trips = trips.copy()

# Ratio (si no venía)
if "ratio" not in trips.columns and {"pts_en_declared","pts_trip"}.issubset(trips.columns):
    trips["ratio"] = trips["pts_en_declared"] / trips["pts_trip"]

# Cumplimiento por ratio
trips["trip_match"] = trips["ratio"] >= 0.60

# Consistencia declarada vs ejecutada (route_id vs ruta_hex)
trips["route_match"] = (trips.get("route_id").astype(str) == trips.get("ruta_hex").astype(str))

# Puntos dentro/fuera
trips["pts_in"]  = trips.get("pts_en_declared", pd.Series([np.nan]*len(trips)))
trips["pts_out"] = trips.get("pts_trip", pd.Series([np.nan]*len(trips))) - trips["pts_in"]

# UID de viaje dependiente del bus
trips["trip_uid"] = make_trip_uid(trips)

print("OK: añadidas columnas ['trip_match','route_match','pts_in','pts_out','trip_uid']")
display(trips.head(3)[["agency_id","mean_id","trip_id","trip_uid","linea","ruta_hex","ratio","trip_match"]])


OK: añadidas columnas ['trip_match','route_match','pts_in','pts_out','trip_uid']


Unnamed: 0,agency_id,mean_id,trip_id,trip_uid,linea,ruta_hex,ratio,trip_match
0,5,005DD,-1,005DD§-1,,NONE,0.0,False
1,5,005DD,0,005DD§0,34,00B1,0.0,False
2,5,005DF,0,005DF§0,23-33,008C,0.0,False


In [22]:
# BLOQUE 4 (igual idea): métricas base y sanity check
def count_ge(th):
    return int((trips["ratio"] >= th).sum())

print("Filas totales en trips:", len(trips))
print("Viajes OK (ratio>=0.60):", int(trips["trip_match"].sum()))
print("Viajes con route_id == ruta_hex:", int(trips["route_match"].sum()))
print()

print("ratio describe():")
display(trips["ratio"].describe(percentiles=[.1,.25,.5,.75,.9,.95,.99]))

for th in (0.60, 0.30, 0.10, 0.05):
    print(f"≥ {th:.2f}: {count_ge(th)}")


Filas totales en trips: 521
Viajes OK (ratio>=0.60): 190
Viajes con route_id == ruta_hex: 12

ratio describe():


count    521.000000
mean       0.379510
std        0.465588
min        0.000000
10%        0.000000
25%        0.000000
50%        0.000000
75%        1.000000
90%        1.000000
95%        1.000000
99%        1.000000
max        1.000000
Name: ratio, dtype: float64

≥ 0.60: 190
≥ 0.30: 212
≥ 0.10: 222
≥ 0.05: 225


In [23]:
len(trips), trips[["agency_id","mean_id","trip_id"]].drop_duplicates().shape


(521, (521, 3))

In [24]:
# BLOQUE 5 (actualizado): KPIs por empresa + LINEA + hora
# Importante: total_trips = nunique de trip_uid (par mean_id+trip_id)

# Dimension principal: línea (no ruta_hex)
group_cols = []
if "agency_id" in trips.columns: group_cols.append("agency_id")
if "linea" in trips.columns:     group_cols.append("linea")
elif "ruta_hex" in trips.columns: group_cols.append("ruta_hex")  # fallback si línea faltase
if "hora" in trips.columns:      group_cols.append("hora")

if not group_cols:
    raise ValueError("No hay columnas suficientes para agrupar (se espera agency_id y linea/ruta_hex).")

facts = (
    trips.groupby(group_cols, dropna=False)
         .agg(
             total_trips   = ("trip_uid", "nunique"),     # clave corregida
             trips_ok      = ("trip_match", "sum"),
             trips_route_match = ("route_match", "sum"),
             total_pts_in  = ("pts_in", "sum"),
             total_pts_out = ("pts_out", "sum")
         )
         .reset_index()
)

facts["pct_trips_ok"]    = (facts["trips_ok"] / facts["total_trips"]).replace([np.inf,-np.inf], np.nan).round(3)
facts["pct_route_match"] = (facts["trips_route_match"] / facts["total_trips"]).replace([np.inf,-np.inf], np.nan).round(3)
facts["pct_pts_in"]      = (facts["total_pts_in"] / (facts["total_pts_in"] + facts["total_pts_out"])).replace([np.inf,-np.inf], np.nan).round(3)

print("Preview KPIs (ordenado por total_trips):")
display(facts.sort_values("total_trips", ascending=False).head(10))


Preview KPIs (ordenado por total_trips):


Unnamed: 0,agency_id,linea,hora,total_trips,trips_ok,trips_route_match,total_pts_in,total_pts_out,pct_trips_ok,pct_route_match,pct_pts_in
16,7,23-33,10,54,20,0,3469,5794,0.37,0.0,0.375
4,5,23-33,10,46,23,0,3765,3061,0.5,0.0,0.552
57,20,96,10,45,10,0,703,3807,0.222,0.0,0.156
11,7,12,10,27,17,0,2526,1586,0.63,0.0,0.614
53,20,23-33,10,25,12,0,2826,2923,0.48,0.0,0.492
56,20,58,10,25,9,0,1996,2849,0.36,0.0,0.412
43,17,23-33,10,19,0,0,0,3489,0.0,0.0,0.0
51,20,128,10,19,5,0,966,1957,0.263,0.0,0.33
46,17,44,10,17,0,6,0,2233,0.0,0.353,0.0
63,25,111,10,16,11,4,1919,745,0.688,0.25,0.72


In [25]:
# BLOQUE 6 (actualizado): consistencia declarada vs ejecutada con vista de línea
# Si bien la consistencia se calcula sobre route_id vs ruta_hex, mostramos también la 'linea' asociada a la ruta ejecutada.

have_route_cols = {"route_id","ruta_hex"}.issubset(trips.columns)
if have_route_cols:
    # traer la línea ejecutada por ruta_hex
    # (ya la tenemos en trips['linea'], pero la conservamos en la tabla de pares)
    pairs_cols = ["agency_id","route_id","ruta_hex","linea","trip_uid","trip_match"]
    keep = [c for c in pairs_cols if c in trips.columns]

    base_pairs = trips[keep].drop_duplicates(subset=["agency_id","route_id","ruta_hex","linea","trip_uid"])

    pairs = (
        base_pairs.groupby(["agency_id","route_id","ruta_hex","linea"], dropna=False)
                  .agg(total_trips=("trip_uid","nunique"),
                       trips_ok=("trip_match","sum"))
                  .reset_index()
                  .sort_values("total_trips", ascending=False)
    )

    mismatch = pairs[pairs["route_id"] != pairs["ruta_hex"]]
    print("Top 15 combinaciones declarada≠ejecutada:")
    display(mismatch.head(15))

    # Indicador global de coincidencia declarada-ejecutada
    total_pairs_trips = int(pairs["total_trips"].sum())
    exact_pairs = int(pairs.loc[pairs["route_id"] == pairs["ruta_hex"], "total_trips"].sum())
    pct_global = (exact_pairs / total_pairs_trips) if total_pairs_trips else np.nan
    print(f"Coincidencia declarada=ejecutada (por trips_uid únicos): {exact_pairs:,}/{total_pairs_trips:,} ({pct_global:.1%} si no es NaN)")

else:
    pairs = pd.DataFrame()
    print("No hay columnas 'route_id' y/o 'ruta_hex' para la tabla de consistencia.")


Top 15 combinaciones declarada≠ejecutada:


Unnamed: 0,agency_id,route_id,ruta_hex,linea,total_trips,trips_ok
15,5,0021,008C,23-33,15,15
165,17,015E,008C,23-33,8,0
246,22,0110,008C,23-33,7,7
171,17,015E,01BB,44,7,0
5,5,001E,008C,23-33,7,0
211,20,0152,0124,128,6,3
1,5,001D,008C,23-33,5,0
8,5,001F,008C,23-33,5,0
154,14,00B9,008C,23-33,5,0
115,7,0099,008C,23-33,5,5


Coincidencia declarada=ejecutada (por trips_uid únicos): 12/521 (2.3% si no es NaN)


In [26]:
# BLOQUE 7 (actualizado): guardar KPIs y pares
facts.to_parquet(OUT_DIR / "kpi_facts_notebook.parquet", engine="pyarrow", index=False)
facts.to_csv(OUT_DIR / "kpi_facts_notebook.csv", index=False)
if not pairs.empty:
    pairs.to_parquet(OUT_DIR / "kpi_pairs_decl_vs_exec.parquet", engine="pyarrow", index=False)
    pairs.to_csv(OUT_DIR / "kpi_pairs_decl_vs_exec.csv", index=False)

print("Guardado en:")
print(" -", (OUT_DIR / "kpi_facts_notebook.parquet").resolve())
print(" -", (OUT_DIR / "kpi_facts_notebook.csv").resolve())
if not pairs.empty:
    print(" -", (OUT_DIR / "kpi_pairs_decl_vs_exec.parquet").resolve())
    print(" -", (OUT_DIR / "kpi_pairs_decl_vs_exec.csv").resolve())


Guardado en:
 - D:\x\OneDrive\Escritorio\Python\cumplimiento-rutas-h3\data\processed\kpi_facts_notebook.parquet
 - D:\x\OneDrive\Escritorio\Python\cumplimiento-rutas-h3\data\processed\kpi_facts_notebook.csv
 - D:\x\OneDrive\Escritorio\Python\cumplimiento-rutas-h3\data\processed\kpi_pairs_decl_vs_exec.parquet
 - D:\x\OneDrive\Escritorio\Python\cumplimiento-rutas-h3\data\processed\kpi_pairs_decl_vs_exec.csv


In [27]:
# BLOQUE A: diccionarios de nombres (empresa y buses)

# 1) Empresa: agency_id -> empresa_nombre (desde eots.csv)
empresa_dim = None
try:
    eots = pd.read_csv(PATH_EOTS, dtype=str)
    # heurística para columnas de id y nombre
    col_id_emp = next((c for c in eots.columns if any(k in c.lower() for k in ["eot_id","cod_catalogo","agency","id_eot_vmt_hex"])), None)
    col_nom_emp = next((c for c in eots.columns if any(k in c.lower() for k in ["nombre","permisionario","razon"])), None)
    if col_id_emp and col_nom_emp:
        eots["agency_id"] = eots[col_id_emp].astype(str).str.strip()
        eots["agency_id"] = eots["agency_id"].apply(lambda s: s.zfill(4) if s.isdigit() and len(s) <= 4 else s)
        eots["empresa_nombre"] = eots[col_nom_emp].astype(str).str.strip()
        empresa_dim = eots[["agency_id","empresa_nombre"]].dropna().drop_duplicates()
except Exception as e:
    print("Aviso: no pude construir el diccionario de empresa desde eots.csv:", e)

# 2) Bus label: por empresa, mapear mean_id -> "Bus 1", "Bus 2", ...
#    Creamos un mapping estable usando el orden natural de mean_id
if "agency_id" not in trips.columns or "mean_id" not in trips.columns:
    raise ValueError("Se requieren las columnas 'agency_id' y 'mean_id' en 'trips'.")

bus_key = trips[["agency_id","mean_id"]].drop_duplicates().sort_values(["agency_id","mean_id"])
bus_key["bus_ordinal"] = bus_key.groupby("agency_id").cumcount() + 1
bus_key["bus_label"] = "Bus " + bus_key["bus_ordinal"].astype(str)

# 3) Aplicar nombres a TRIPS y FACTS
trips_named = trips.copy()
facts_named = facts.copy()

# empresa_nombre en trips y facts
if empresa_dim is not None:
    trips_named = trips_named.merge(empresa_dim, on="agency_id", how="left")
    facts_named = facts_named.merge(empresa_dim, on="agency_id", how="left")
else:
    # fallback: usar agency_id como nombre
    trips_named["empresa_nombre"] = trips_named["agency_id"]
    if "empresa_nombre" not in facts_named.columns:
        facts_named["empresa_nombre"] = facts_named["agency_id"]

# bus_label en trips (si luego haces KPIs por bus)
trips_named = trips_named.merge(bus_key[["agency_id","mean_id","bus_label"]], on=["agency_id","mean_id"], how="left")

print("Listo: agregados empresa_nombre y bus_label.")
display(trips_named.head(3)[["agency_id","empresa_nombre","mean_id","bus_label","linea","ratio","trip_match"]])


Listo: agregados empresa_nombre y bus_label.


Unnamed: 0,agency_id,empresa_nombre,mean_id,bus_label,linea,ratio,trip_match
0,5,1° DE DICIEMBRE SRL,005DD,Bus 1,,0.0,False
1,5,1° DE DICIEMBRE SRL,005DD,Bus 1,34,0.0,False
2,5,1° DE DICIEMBRE SRL,005DF,Bus 2,23-33,0.0,False


In [28]:
# BLOQUE B: interfaz interactiva con etiquetas amigables
import ipywidgets as widgets
from IPython.display import display, clear_output

# Usaremos facts_named como base de exploración
df_display = facts_named.copy()

# Opciones de filtros con etiquetas: (label, value)
# Empresa: mostrar nombres; el value real sigue siendo agency_id para filtrar sin ambigüedad
emp_pairs = []
for ag, name in (
    df_display[["agency_id","empresa_nombre"]]
    .fillna({"empresa_nombre": ""})
    .drop_duplicates()
    .sort_values(["empresa_nombre","agency_id"])
    .itertuples(index=False, name=None)
):
    label = f"{name or ag} ({ag})" if name and name != ag else ag
    emp_pairs.append((label, ag))

# Línea (si existe). Mostramos la etiqueta tal cual; value = la misma línea
lin_pairs = []
if "linea" in df_display.columns:
    lin_vals = sorted(df_display["linea"].dropna().astype(str).unique().tolist())
    lin_pairs = [(lv, lv) for lv in lin_vals]

# Hora (si existe)
hor_pairs = []
if "hora" in df_display.columns:
    hora_vals = sorted(df_display["hora"].dropna().astype(int).unique().tolist())
    hor_pairs = [(str(h), h) for h in hora_vals]

# Widgets (usamos SelectMultiple para poder combinar)
w_emp = widgets.SelectMultiple(options=[("(todas)", None)] + emp_pairs, description="Empresa", rows=min(8, len(emp_pairs)+1))
w_lin = widgets.SelectMultiple(options=[("(todas)", None)] + lin_pairs, description="Línea", rows=min(10, len(lin_pairs)+1))
w_hor = widgets.SelectMultiple(options=[("(todas)", None)] + hor_pairs, description="Hora", rows=min(8, len(hor_pairs)+1))

btn_export = widgets.Button(description="Exportar selección (CSV)")
out = widgets.Output()

def current_selection(df):
    sel = df.copy()
    # Empresa
    emp_selected = [v for (lbl, v) in w_emp.value if v is not None] if isinstance(w_emp.value, tuple) else list(w_emp.value)
    emp_selected = [v for v in emp_selected if v is not None]
    if emp_selected:
        sel = sel[sel["agency_id"].isin(emp_selected)]
    # Línea
    lin_selected = [v for (lbl, v) in w_lin.value if v is not None] if isinstance(w_lin.value, tuple) else list(w_lin.value)
    lin_selected = [v for v in lin_selected if v is not None]
    if lin_selected and "linea" in sel.columns:
        sel = sel[sel["linea"].isin(lin_selected)]
    # Hora
    hor_selected = [v for (lbl, v) in w_hor.value if v is not None] if isinstance(w_hor.value, tuple) else list(w_hor.value)
    hor_selected = [v for v in hor_selected if v is not None]
    if hor_selected and "hora" in sel.columns:
        sel = sel[sel["hora"].isin(hor_selected)]
    return sel

def render(_=None):
    with out:
        clear_output(wait=True)
        df_f = current_selection(df_display)

        if df_f.empty:
            print("Sin resultados para la selección.")
            return

        # KPIs agregados de la selección
        tot = int(df_f["total_trips"].sum())
        ok  = int(df_f["trips_ok"].sum()) if "trips_ok" in df_f else 0
        rm  = int(df_f["trips_route_match"].sum()) if "trips_route_match" in df_f else 0
        pin = int(df_f["total_pts_in"].sum()) if "total_pts_in" in df_f else 0
        pout= int(df_f["total_pts_out"].sum()) if "total_pts_out" in df_f else 0

        def pct(a, b): 
            return (a / b) if b else np.nan

        print("=== KPIs de la selección ===")
        print(f"- Total trips: {tot:,}")
        print(f"- Trips OK (ratio ≥ 0.60): {ok:,}  ({pct(ok, tot):.1%} si no es NaN)")
        if "trips_route_match" in df_f:
            print(f"- Coincidencia declarada=ejecutada (trips): {rm:,}  ({pct(rm, tot):.1%} si no es NaN)")
        if "total_pts_in" in df_f and "total_pts_out" in df_f:
            print(f"- Puntos dentro: {pin:,}  ({pct(pin, pin+pout):.1%} si no es NaN)")
            print(f"- Puntos fuera:  {pout:,} ({pct(pout, pin+pout):.1%} si no es NaN)")

        # Tabla detallada con empresa_nombre visible
        cols = [c for c in ["empresa_nombre","agency_id","linea","hora","total_trips","trips_ok","pct_trips_ok","trips_route_match","pct_route_match","total_pts_in","total_pts_out","pct_pts_in"] if c in df_f.columns]
        if cols:
            df_show = df_f[cols].sort_values(["empresa_nombre","linea","hora"], na_position="last").reset_index(drop=True)
            display(df_show.head(30))
        else:
            print("(Sin columnas esperadas para resumen)")

def on_export(_):
    df_f = current_selection(df_display)
    out_path = OUT_DIR / "kpi_selection.csv"
    df_f.to_csv(out_path, index=False)
    with out:
        print(f"\nExportado: {out_path.resolve()}")

w_emp.observe(render, names="value")
w_lin.observe(render, names="value")
w_hor.observe(render, names="value")
btn_export.on_click(on_export)

display(widgets.HBox([w_emp, w_lin, w_hor], layout=widgets.Layout(justify_content='space-between')))
display(btn_export)
display(out)

render()


HBox(children=(SelectMultiple(description='Empresa', options=(('(todas)', None), ('000A', '000A'), ('000B', '0…

Button(description='Exportar selección (CSV)', style=ButtonStyle())

Output()

In [29]:
# BLOQUE C (opcional): KPI por bus con etiquetas "Bus n"
grp_bus = [c for c in ["agency_id","linea","hora","mean_id","bus_label"] if c in trips_named.columns]
if grp_bus:
    base = trips_named[grp_bus + ["trip_uid","trip_match","pts_in","pts_out"]].drop_duplicates(subset=grp_bus + ["trip_uid"])
    facts_bus = (
        base.groupby(["agency_id","linea","hora","mean_id","bus_label"], dropna=False)
            .agg(total_trips=("trip_uid","nunique"),
                 trips_ok=("trip_match","sum"),
                 total_pts_in=("pts_in","sum"),
                 total_pts_out=("pts_out","sum"))
            .reset_index()
    )
    facts_bus["pct_trips_ok"] = (facts_bus["trips_ok"] / facts_bus["total_trips"]).replace([np.inf,-np.inf], np.nan).round(3)
    facts_bus["pct_pts_in"]   = (facts_bus["total_pts_in"] / (facts_bus["total_pts_in"] + facts_bus["total_pts_out"])).replace([np.inf,-np.inf], np.nan).round(3)

    print("Vista por bus (muestra):")
    display(facts_bus.sort_values(["agency_id","linea","hora","bus_label"]).head(20))
else:
    print("No hay columnas suficientes para KPI por bus (se requiere agency_id, linea, mean_id).")


Vista por bus (muestra):


Unnamed: 0,agency_id,linea,hora,mean_id,bus_label,total_trips,trips_ok,total_pts_in,total_pts_out,pct_trips_ok,pct_pts_in
1,5,101,10,005FB,Bus 28,1,0,0,182,0.0,0.0
2,5,101,10,00600,Bus 33,1,1,197,0,1.0,1.0
3,5,101,10,00679,Bus 51,1,1,201,0,1.0,1.0
0,5,101,10,005E6,Bus 9,1,0,0,174,0.0,0.0
4,5,111,10,005FD,Bus 30,1,1,205,0,1.0,1.0
5,5,111,10,005FF,Bus 32,1,1,186,0,1.0,1.0
6,5,111,10,006E1,Bus 58,1,1,163,18,1.0,0.901
7,5,111,10,006E4,Bus 61,1,1,199,0,1.0,1.0
8,5,111,10,006E6,Bus 63,1,1,200,0,1.0,1.0
9,5,111,10,006E7,Bus 64,1,1,116,4,1.0,0.967


In [33]:
# AÑADIR CAMPOS DE RUTA A trips DESDE catalogo_rutas_cid.csv

from pathlib import Path
import pandas as pd

PATH_RUTAS = Path("data/raw/catalogo_rutas_cid.csv")

rutas_cat = pd.read_csv(PATH_RUTAS, dtype=str)
rutas_cat.columns = [c.strip().lower() for c in rutas_cat.columns]

if "ruta_hex" not in rutas_cat.columns:
    raise ValueError("catalogo_rutas_cid.csv no tiene columna 'ruta_hex'.")

rutas_cat["ruta_hex"] = rutas_cat["ruta_hex"].astype(str).str.upper().str.strip()
extra_cols = [c for c in ["linea","ramal","origen","destino","identificacion"] if c in rutas_cat.columns]
rutas_dim = rutas_cat[["ruta_hex"] + extra_cols].drop_duplicates()

# Merge a trips (si aún no se hizo)
if not set(extra_cols).issubset(trips.columns):
    trips = trips.merge(rutas_dim, on="ruta_hex", how="left")


In [34]:
# DIMENSIÓN LÍNEA PARA DESCRIPTORES Y MERGE A FACTS/FACTS_NAMED

desc_cols = [c for c in ["ramal","origen","destino","identificacion"] if c in trips.columns]

# Construimos una tabla por (agency_id, linea) con el primer no-nulo de cada descriptor
def first_non_null(s: pd.Series):
    s = s.dropna()
    return s.iloc[0] if len(s) else pd.NA

if "linea" in trips.columns and desc_cols:
    linea_dim = (
        trips.dropna(subset=["linea"])
             .groupby(["agency_id","linea"], dropna=False)[desc_cols]
             .agg(first_non_null)
             .reset_index()
    )
    # facts/facts_named pueden existir según tu flujo; aplicamos a ambos si están
    if "facts" in globals():
        facts = facts.merge(linea_dim, on=["agency_id","linea"], how="left")
    if "facts_named" in globals():
        facts_named = facts_named.merge(linea_dim, on=["agency_id","linea"], how="left")
    else:
        facts_named = facts.copy()
else:
    # Si no hay línea o no hay descriptores, aseguremos facts_named
    if "facts_named" not in globals():
        facts_named = facts.copy()


In [35]:
# VISTA INTERACTIVA: Empresa, Línea, Hora + Ramal, Origen, Destino, Identificación

import ipywidgets as widgets
from IPython.display import display, clear_output
import numpy as np
import pandas as pd

df_display = facts_named.copy() if "facts_named" in globals() else facts.copy()

# Asegurar empresa_nombre
if "empresa_nombre" not in df_display.columns:
    df_display["empresa_nombre"] = df_display.get("agency_id", pd.Series([""]*len(df_display)))

# Opciones base
empresas = sorted(df_display["empresa_nombre"].dropna().astype(str).unique().tolist())
lineas   = sorted(df_display["linea"].dropna().astype(str).unique().tolist()) if "linea" in df_display.columns else []
horas    = sorted(df_display["hora"].dropna().astype(int).unique().tolist()) if "hora" in df_display.columns else []

# Opciones de descriptores
ramales = sorted(df_display["ramal"].dropna().astype(str).unique().tolist()) if "ramal" in df_display.columns else []
origenes = sorted(df_display["origen"].dropna().astype(str).unique().tolist()) if "origen" in df_display.columns else []
destinos = sorted(df_display["destino"].dropna().astype(str).unique().tolist()) if "destino" in df_display.columns else []
idents   = sorted(df_display["identificacion"].dropna().astype(str).unique().tolist()) if "identificacion" in df_display.columns else []

# Widgets (Dropdowns)
emp_select  = widgets.Dropdown(options=["(todas)"] + empresas, description="Empresa:")
lin_select  = widgets.Dropdown(options=["(todas)"] + lineas, description="Línea:")
hor_select  = widgets.Dropdown(options=["(todas)"] + horas, description="Hora:")

ram_select  = widgets.Dropdown(options=["(todos)"] + ramales, description="Ramal:")
ori_select  = widgets.Dropdown(options=["(todos)"] + origenes, description="Origen:")
des_select  = widgets.Dropdown(options=["(todos)"] + destinos, description="Destino:")
idn_select  = widgets.Dropdown(options=["(todos)"] + idents, description="Identif:")

out = widgets.Output()

def update_view(_=None):
    with out:
        clear_output(wait=True)
        df_f = df_display.copy()

        # Filtros esenciales
        if emp_select.value != "(todas)":
            df_f = df_f[df_f["empresa_nombre"] == emp_select.value]
        if lin_select.value != "(todas)" and "linea" in df_f.columns:
            df_f = df_f[df_f["linea"] == lin_select.value]
        if hor_select.value != "(todas)" and "hora" in df_f.columns:
            df_f = df_f[df_f["hora"] == hor_select.value]

        # Filtros de descriptores
        if "ramal" in df_f.columns and ram_select.value != "(todos)":
            df_f = df_f[df_f["ramal"] == ram_select.value]
        if "origen" in df_f.columns and ori_select.value != "(todos)":
            df_f = df_f[df_f["origen"] == ori_select.value]
        if "destino" in df_f.columns and des_select.value != "(todos)":
            df_f = df_f[df_f["destino"] == des_select.value]
        if "identificacion" in df_f.columns and idn_select.value != "(todos)":
            df_f = df_f[df_f["identificacion"] == idn_select.value]

        if df_f.empty:
            print("Sin resultados para la selección.")
            return

        # Métricas agregadas
        def safe_sum(col):
            return int(df_f[col].sum()) if col in df_f.columns and not df_f.empty else 0

        total_trips = safe_sum("total_trips")
        trips_ok    = safe_sum("trips_ok")
        trips_rm    = safe_sum("trips_route_match") if "trips_route_match" in df_f.columns else 0
        pts_in      = safe_sum("total_pts_in") if "total_pts_in" in df_f.columns else 0
        pts_out     = safe_sum("total_pts_out") if "total_pts_out" in df_f.columns else 0

        mean_pct_ok = df_f["pct_trips_ok"].mean() if "pct_trips_ok" in df_f.columns else np.nan
        print(f"Total trips: {total_trips:,}")
        print(f"Trips OK (≥0.6): {trips_ok:,}  |  {mean_pct_ok:.1%} promedio" if pd.notna(mean_pct_ok) else f"Trips OK (≥0.6): {trips_ok:,}")
        if "total_pts_in" in df_f.columns and "total_pts_out" in df_f.columns:
            print(f"Puntos dentro: {pts_in:,} / fuera: {pts_out:,}")
        if "pct_route_match" in df_f.columns:
            mean_pct_rm = df_f["pct_route_match"].mean()
            print(f"Coincidencia declarada=ejecutada (promedio filas): {mean_pct_rm:.1%}" if pd.notna(mean_pct_rm) else "Coincidencia declarada=ejecutada: s/d")

        # Tabla detalle (incluye descriptores)
        cols_show = [
            c for c in ["empresa_nombre","agency_id","linea","ramal","origen","destino","identificacion","hora",
                        "total_trips","trips_ok","pct_trips_ok",
                        "trips_route_match","pct_route_match",
                        "total_pts_in","total_pts_out","pct_pts_in"]
            if c in df_f.columns
        ]
        if cols_show:
            display(
                df_f[cols_show]
                .sort_values(["empresa_nombre","linea","hora"], na_position="last")
                .reset_index(drop=True)
                .head(30)
            )
        else:
            print("No hay columnas esperadas para mostrar el resumen.")

# Eventos
for w in [emp_select, lin_select, hor_select, ram_select, ori_select, des_select, idn_select]:
    w.observe(update_view, names="value")

display(widgets.HBox([emp_select, lin_select, hor_select]))
display(widgets.HBox([ram_select, ori_select, des_select, idn_select]))
display(out)
update_view()


HBox(children=(Dropdown(description='Empresa:', options=('(todas)', '1° DE DICIEMBRE SRL', 'ALDANA SA', 'CIUDA…

HBox(children=(Dropdown(description='Ramal:', options=('(todos)', '1', '2', '3', '32', '4', '5', '6', '8'), va…

Output()

In [37]:
pip install h3


Collecting h3
  Downloading h3-4.3.1-cp313-cp313-win_amd64.whl.metadata (18 kB)
Downloading h3-4.3.1-cp313-cp313-win_amd64.whl (783 kB)
   ---------------------------------------- 0.0/783.8 kB ? eta -:--:--
   ------------- -------------------------- 262.1/783.8 kB ? eta -:--:--
   ---------------------------------------- 783.8/783.8 kB 5.0 MB/s  0:00:00
Installing collected packages: h3
Successfully installed h3-4.3.1
Note: you may need to restart the kernel to use updated packages.


In [None]:
# BLOQUE Q0: imports, paths y helpers para H3 -> polígono y lectura de listas

from pathlib import Path
import json, ast
import pandas as pd
import numpy as np
import geopandas as gpd
from shapely.geometry import Point, Polygon
import h3

PATH_RUTAS_H3 = Path("data/processed/rutas_h3.parquet")       # contiene ruta_hex + h3_list
PATH_TRIPS    = Path("data/processed/gps_match_trips.parquet") 
PATH_POINTS   = Path("data/processed/gps_match_points.parquet")# puntos crudos por trip
PATH_RUTAS_CAT= Path("data/raw/catalogo_rutas_cid.csv")

OUT_DIR = Path("data/processed/qgis_from_selection")
OUT_DIR.mkdir(parents=True, exist_ok=True)

CRS = "EPSG:4326"

def parse_h3_list(val):
    """Normaliza distintos formatos de entrada a una lista de celdas H3.

    Soporta:
    - list/tuple
    - pandas.Series
    - numpy.ndarray
    - string JSON / literal Python (ej: "[\"8a1...\", \"8a2...\"]")
    - valores nulos
    """
    # None explícito
    if val is None:
        return []

    # ya es lista/tuple
    if isinstance(val, (list, tuple)):
        return list(val)

    # pandas Series o numpy array -> convertir a lista y filtrar nulos
    if isinstance(val, (pd.Series, np.ndarray)):
        try:
            lst = list(val.tolist())
            return [x for x in lst if pd.notna(x)]
        except Exception:
            return []

    # escalares con NA
    try:
        if pd.isna(val):
            return []
    except Exception:
        # pd.isna puede fallar para algunos tipos (ej arrays); ya manejados arriba
        pass

    s = str(val).strip()
    if not s:
        return []

    # intentar JSON
    try:
        parsed = json.loads(s)
        if isinstance(parsed, (list, tuple)):
            return list(parsed)
        else:
            return [parsed]
    except Exception:
        try:
            parsed = ast.literal_eval(s)
            if isinstance(parsed, (list, tuple)):
                return list(parsed)
            else:
                return [parsed]
        except Exception:
            return []


def boundary_to_polygon(hcell: str) -> Polygon:
    if hasattr(h3, "h3_to_geo_boundary"):            # v3
        coords = h3.h3_to_geo_boundary(hcell, geo_json=True)  # [(lat, lng), ...]
    elif hasattr(h3, "cell_to_boundary"):            # v4
        coords = h3.cell_to_boundary(hcell)                   # [(lat, lng), ...]
    else:
        raise RuntimeError("No hay funciones de boundary disponibles en 'h3'.")
    ring = [(lng, lat) for lat, lng in coords]
    return Polygon(ring)


In [None]:
# BLOQUE Q1: selección actual basada en los widgets (facts y trips)


def current_selection_facts():
    df = facts_named.copy() if "facts_named" in globals() else facts.copy()
    # esenciales
    if emp_select.value != "(todas)":
        df = df[df["empresa_nombre"] == emp_select.value]
    if "linea" in df.columns and lin_select.value != "(todas)":
        df = df[df["linea"] == lin_select.value]
    if "hora" in df.columns and hor_select.value != "(todas)":
        df = df[df["hora"] == hor_select.value]
    # descriptores
    if "ramal" in df.columns and ram_select.value != "(todos)":
        df = df[df["ramal"] == ram_select.value]
    if "origen" in df.columns and ori_select.value != "(todos)":
        df = df[df["origen"] == ori_select.value]
    if "destino" in df.columns and des_select.value != "(todos)":
        df = df[df["destino"] == des_select.value]
    if "identificacion" in df.columns and idn_select.value != "(todos)":
        df = df[df["identificacion"] == idn_select.value]
    return df

def current_selection_trips():
    df = trips_named.copy()
    # esenciales
    if emp_select.value != "(todas)":
        df = df[df["empresa_nombre"] == emp_select.value]
    if "linea" in df.columns and lin_select.value != "(todas)":
        df = df[df["linea"] == lin_select.value]
    if "hora" in df.columns and hor_select.value != "(todas)":
        df = df[df["hora"] == hor_select.value]
    # descriptores
    if "ramal" in df.columns and ram_select.value != "(todos)":
        df = df[df["ramal"] == ram_select.value]
    if "origen" in df.columns and ori_select.value != "(todos)":
        df = df[df["origen"] == ori_select.value]
    if "destino" in df.columns and des_select.value != "(todos)":
        df = df[df["destino"] == des_select.value]
    if "identificacion" in df.columns and idn_select.value != "(todos)":
        df = df[df["identificacion"] == idn_select.value]
    # bus específico (opcional)
    if "bus_label" in df.columns and bus_select.value and bus_select.value != "(todos)":
        df = df[df["bus_label"] == bus_select.value]
    return df

sel_facts  = current_selection_facts()
sel_trips  = current_selection_trips()

print("Selection facts:", sel_facts.shape)
print("Selection trips:", sel_trips.shape)


Selection facts: (67, 16)
Selection trips: (521, 22)


In [40]:
# BLOQUE Q2: export CSV de la selección actual
csv_facts = OUT_DIR / "selection_kpis.csv"
csv_trips = OUT_DIR / "selection_trips.csv"

sel_facts.to_csv(csv_facts, index=False)
sel_trips.to_csv(csv_trips, index=False)

print("Exportados:")
print(" -", csv_facts.resolve())
print(" -", csv_trips.resolve())


Exportados:
 - D:\x\OneDrive\Escritorio\Python\cumplimiento-rutas-h3\data\processed\qgis_from_selection\selection_kpis.csv
 - D:\x\OneDrive\Escritorio\Python\cumplimiento-rutas-h3\data\processed\qgis_from_selection\selection_trips.csv


In [43]:
# BLOQUE Q3: construir capa de polígonos H3 de las rutas ejecutadas en la selección

# 1) ruta_hex objetivo: usar los ruta_hex presentes en los trips filtrados
ruta_hex_sel = sorted(sel_trips.get("ruta_hex", pd.Series([], dtype=str)).dropna().astype(str).str.upper().unique().tolist())

# Fallback opcional: si quisieras derivar desde la línea (por si faltara ruta_hex)
if not ruta_hex_sel and "linea" in sel_trips.columns and PATH_RUTAS_CAT.exists():
    cat = pd.read_csv(PATH_RUTAS_CAT, dtype=str)
    cat.columns = [c.strip().lower() for c in cat.columns]
    cat["ruta_hex"] = cat["ruta_hex"].astype(str).str.upper().str.strip()
    ruta_hex_sel = sorted(
        cat[cat["linea"].isin(sel_trips["linea"].dropna().astype(str))]["ruta_hex"].dropna().unique().tolist()
    )

if not ruta_hex_sel:
    raise ValueError("No hay ruta_hex en la selección. Verificá filtros o columnas.")

# 2) leer rutas_h3.parquet y filtrar
rutas_h3 = pd.read_parquet(PATH_RUTAS_H3, engine="pyarrow").copy()
if "ruta_hex" not in rutas_h3.columns:
    raise ValueError("rutas_h3.parquet debe tener columna 'ruta_hex'.")
if "h3_list" not in rutas_h3.columns:
    alt = next((c for c in ["h3_cells","h3_hexes"] if c in rutas_h3.columns), None)
    if alt: rutas_h3 = rutas_h3.rename(columns={alt:"h3_list"})
    else: raise ValueError("No se encontró 'h3_list' (ni h3_cells/h3_hexes) en rutas_h3.parquet.")

rutas_h3["ruta_hex"] = rutas_h3["ruta_hex"].astype(str).str.upper().str.strip()
rutas_h3_sel = rutas_h3[rutas_h3["ruta_hex"].isin(ruta_hex_sel)].copy()

# 3) poligonizar H3
polys = []
for _, row in rutas_h3_sel.iterrows():
    rhex = row["ruta_hex"]
    hlist = parse_h3_list(row["h3_list"])
    for hcell in hlist:
        try:
            poly = boundary_to_polygon(str(hcell))
            polys.append((rhex, str(hcell), poly))
        except Exception:
            pass

routes_hex_gdf = gpd.GeoDataFrame(polys, columns=["ruta_hex","h3","geometry"], crs=CRS)
print("Polígonos H3 construidos:", len(routes_hex_gdf))


Polígonos H3 construidos: 547


In [44]:
# BLOQUE Q4: centroides de trips (primer punto por trip) y split OK/Fail

# Identificadores de los trips seleccionados (trip_uid o el par agency_id/mean_id/trip_id)
if "trip_uid" in sel_trips.columns:
    trip_uids = set(sel_trips["trip_uid"].astype(str))
else:
    # construir por si acaso
    trip_uids = set(sel_trips["mean_id"].astype(str) + "§" + sel_trips["trip_id"].astype(str))

# cargar puntos
pts = pd.read_parquet(PATH_POINTS, engine="pyarrow").copy()
# normalizaciones mínimas de join
for c in ("agency_id","mean_id","trip_id","ruta_hex"):
    if c in pts.columns: pts[c] = pts[c].astype(str).str.upper().str.strip()

# timestamp
if "fecha_hora" in pts.columns and not pd.api.types.is_datetime64_any_dtype(pts["fecha_hora"]):
    pts["fecha_hora"] = pd.to_datetime(pts["fecha_hora"], errors="coerce", utc=True)

# construir trip_uid en puntos
pts["trip_uid"] = pts["mean_id"].astype(str) + "§" + pts["trip_id"].astype(str)

# filtrar puntos de la selección
pts_sel = pts[pts["trip_uid"].isin(trip_uids)].copy()

# primer punto por trip (orden temporal)
if "fecha_hora" in pts_sel.columns:
    pts_sel = pts_sel.sort_values(["trip_uid","fecha_hora"])
first_pts = pts_sel.groupby("trip_uid", as_index=False).first()

# adjuntar flag trip_match desde sel_trips
mini = sel_trips[["trip_uid","trip_match"]].drop_duplicates()
centroids = first_pts.merge(mini, on="trip_uid", how="left")

# convertir a GeoDataFrame
centroids_gdf = gpd.GeoDataFrame(
    centroids,
    geometry=gpd.points_from_xy(centroids["longitude"].astype(float), centroids["latitude"].astype(float)),
    crs=CRS
)
ok_gdf   = centroids_gdf[centroids_gdf["trip_match"] == True].copy()
fail_gdf = centroids_gdf[centroids_gdf["trip_match"] == False].copy()

print("Trips OK:", len(ok_gdf), " | Trips Fail:", len(fail_gdf))


Trips OK: 178  | Trips Fail: 270


In [None]:
# BLOQUE Q5: puntos crudos filtrados (con sampling opcional)

# Parámetros para limitar el tamaño
POINTS_MAX = 200_000      # máximo de puntos a exportar
SAMPLE_FRAC = 0.1         # fracción si hay demasiados; se ignora si entra en el MAX

gps_points_sel = pts_sel.copy()

# muestreo si excede el máximo
if len(gps_points_sel) > POINTS_MAX:
    n = int(min(len(gps_points_sel) * SAMPLE_FRAC, POINTS_MAX))
    gps_points_sel = gps_points_sel.sample(n=n, random_state=42)

gps_points_gdf = gpd.GeoDataFrame(
    gps_points_sel,
    geometry=gpd.points_from_xy(gps_points_sel["longitude"].astype(float), gps_points_sel["latitude"].astype(float)),
    crs=CRS
)

print("Puntos crudos a exportar:", len(gps_points_gdf))


Puntos crudos a exportar: 81458


In [46]:
# BLOQUE Q6: escribir GeoPackage con las capas de la selección

gpkg_path = OUT_DIR / "selection_layers.gpkg"

# QGIS reescribe capas si existen; para evitar residuo, podés borrar el archivo previo
if gpkg_path.exists():
    gpkg_path.unlink()

routes_hex_gdf.to_file(gpkg_path, layer="routes_h3_h8", driver="GPKG")
ok_gdf.to_file(gpkg_path, layer="gps_trips_ok", driver="GPKG")
fail_gdf.to_file(gpkg_path, layer="gps_trips_fail", driver="GPKG")
gps_points_gdf.to_file(gpkg_path, layer="gps_points", driver="GPKG")

print("GeoPackage escrito en:", gpkg_path.resolve())
print("Capas:")
print(" - routes_h3_h8")
print(" - gps_trips_ok")
print(" - gps_trips_fail")
print(" - gps_points")


GeoPackage escrito en: D:\x\OneDrive\Escritorio\Python\cumplimiento-rutas-h3\data\processed\qgis_from_selection\selection_layers.gpkg
Capas:
 - routes_h3_h8
 - gps_trips_ok
 - gps_trips_fail
 - gps_points


In [47]:
# BLOQUE Q7 (opcional): exportar ids mínimos de la selección para reproducir en otros flujos

ids_path = OUT_DIR / "selection_ids.json"
payload = {
    "agency_ids": sorted(sel_trips["agency_id"].dropna().unique().tolist()),
    "lineas": sorted(sel_trips.get("linea", pd.Series([], dtype=str)).dropna().unique().tolist()),
    "ruta_hex": sorted(sel_trips.get("ruta_hex", pd.Series([], dtype=str)).dropna().unique().tolist()),
    "trip_uids": sorted(list(trip_uids)),
}
import json
with open(ids_path, "w", encoding="utf-8") as f:
    json.dump(payload, f, ensure_ascii=False, indent=2)
print("IDs exportados en:", ids_path.resolve())


IDs exportados en: D:\x\OneDrive\Escritorio\Python\cumplimiento-rutas-h3\data\processed\qgis_from_selection\selection_ids.json


In [48]:
# BLOQUE Q8: hexágonos de densidad (H3) para los puntos filtrados (gps_points_gdf)
# Crea capa adicional: gps_points_hexcounts (Polygon) con "count" por celda H3.

import math

H3_RES = 8  # cambia si quieres otra resolución (8 está bien para urbano)
use_existing_h3 = "h3" in gps_points_gdf.columns

def latlon_to_h3(lat, lon, res=H3_RES):
    if hasattr(h3, "geo_to_h3"):      # v3
        return h3.geo_to_h3(lat, lon, res)
    elif hasattr(h3, "latlng_to_cell"): # v4
        return h3.latlng_to_cell(lat, lon, res)
    else:
        raise RuntimeError("No hay función para convertir lat/lon a H3.")

pts_df = gps_points_gdf.copy()
if not use_existing_h3:
    pts_df["h3"] = [
        latlon_to_h3(float(lat), float(lon), H3_RES)
        for lat, lon in zip(pts_df["latitude"], pts_df["longitude"])
    ]

hex_counts = (
    pts_df.groupby("h3", dropna=True)
          .size()
          .reset_index(name="count")
)

def boundary_to_polygon_any(hcell: str):
    if hasattr(h3, "h3_to_geo_boundary"):
        coords = h3.h3_to_geo_boundary(hcell, geo_json=True)
    else:
        coords = h3.cell_to_boundary(hcell)
    ring = [(lng, lat) for lat, lng in coords]
    return Polygon(ring)

hex_polys = []
for _, row in hex_counts.iterrows():
    try:
        poly = boundary_to_polygon_any(str(row["h3"]))
        hex_polys.append((row["h3"], int(row["count"]), poly))
    except Exception:
        pass

gps_points_hexcounts = gpd.GeoDataFrame(hex_polys, columns=["h3","count","geometry"], crs=CRS)

# Escribir como capa adicional al mismo GPKG
gps_points_hexcounts.to_file(gpkg_path, layer="gps_points_hexcounts", driver="GPKG")
print("Hex densidad creados:", len(gps_points_hexcounts))


Hex densidad creados: 199


In [51]:
pip install fiona

Collecting fiona
  Downloading fiona-1.10.1-cp313-cp313-win_amd64.whl.metadata (58 kB)
Collecting attrs>=19.2.0 (from fiona)
  Downloading attrs-25.4.0-py3-none-any.whl.metadata (10 kB)
Collecting click~=8.0 (from fiona)
  Downloading click-8.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting click-plugins>=1.0 (from fiona)
  Downloading click_plugins-1.1.1.2-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting cligj>=0.5 (from fiona)
  Downloading cligj-0.7.2-py3-none-any.whl.metadata (5.0 kB)
Downloading fiona-1.10.1-cp313-cp313-win_amd64.whl (24.5 MB)
   ---------------------------------------- 0.0/24.5 MB ? eta -:--:--
   --- ------------------------------------ 1.8/24.5 MB 10.9 MB/s eta 0:00:03
   ----- ---------------------------------- 3.1/24.5 MB 9.6 MB/s eta 0:00:03
   ------- -------------------------------- 4.5/24.5 MB 7.8 MB/s eta 0:00:03
   ----------- ---------------------------- 6.8/24.5 MB 8.9 MB/s eta 0:00:02
   --------------- ------------------------ 9.2/24.5 MB 9.4 MB/s

In [52]:
# BLOQUE Q9: proyecto QGIS (.qgs) que referencia el GPKG + estilos verde/rojo

project_path = OUT_DIR / "selection_project.qgs"

def _qgis_layer_vector(layer_id, layer_name, geom_type, gpkg_path, gpkg_layer, symbol_xml):
    # symbol_xml: bloque <renderer-v2>...</renderer-v2>
    ds = f"{gpkg_path.resolve().as_posix()}|layername={gpkg_layer}"
    return f"""
    <maplayer type="vector" geometry="{geom_type}">
      <id>{layer_id}</id>
      <datasource>{ds}</datasource>
      <layername>{layer_name}</layername>
      <provider>ogr</provider>
      {symbol_xml}
    </maplayer>
    """

def _renderer_single_symbol_polygon_outline(outline_color="#7f7f7f", outline_width="0.5", fill_color="255,255,255,0"):
    return f"""
    <renderer-v2 type="singleSymbol" symbollevels="0" forceraster="0" enableorderby="0">
      <symbols>
        <symbol name="0" alpha="1" clip_to_extent="1" type="fill">
          <layer pass="0" class="SimpleFill" enabled="1" locked="0">
            <prop k="color" v="{fill_color}"/>
            <prop k="outline_color" v="{outline_color}"/>
            <prop k="outline_width" v="{outline_width}"/>
            <prop k="style" v="no"/>
          </layer>
        </symbol>
      </symbols>
    </renderer-v2>
    """

def _renderer_single_symbol_point(color="0,158,115,255", size="2.2"):
    return f"""
    <renderer-v2 type="singleSymbol" symbollevels="0" forceraster="0" enableorderby="0">
      <symbols>
        <symbol name="0" alpha="1" clip_to_extent="1" type="marker">
          <layer pass="0" class="SimpleMarker" enabled="1" locked="0">
            <prop k="color" v="{color}"/>
            <prop k="outline_color" v="0,0,0,0"/>
            <prop k="name" v="circle"/>
            <prop k="size" v="{size}"/>
          </layer>
        </symbol>
      </symbols>
    </renderer-v2>
    """

def _renderer_single_symbol_polygon_fill(color="220,220,220,120", outline_color="160,160,160,120", outline_width="0.2"):
    return f"""
    <renderer-v2 type="singleSymbol" symbollevels="0" forceraster="0" enableorderby="0">
      <symbols>
        <symbol name="0" alpha="1" clip_to_extent="1" type="fill">
          <layer pass="0" class="SimpleFill" enabled="1" locked="0">
            <prop k="color" v="{color}"/>
            <prop k="outline_color" v="{outline_color}"/>
            <prop k="outline_width" v="{outline_width}"/>
            <prop k="style" v="solid"/>
          </layer>
        </symbol>
      </symbols>
    </renderer-v2>
    """

# Estilos 
# verde OK  -> "0,158,115,255"  (#1b9e77 aprox)
# rojo FAIL -> "217,95,2,255"   (#d95f02 aprox)
# gris puntos -> "120,120,120,180"
# rutas: sin relleno, borde gris
# hex densidad (si existe): relleno gris claro

layers_xml = []

# routes_h3_h8 (Polygon, outline)
layers_xml.append(
    _qgis_layer_vector(
        layer_id="routes_h3_h8",
        layer_name="routes_h3_h8",
        geom_type="Polygon",
        gpkg_path=gpkg_path,
        gpkg_layer="routes_h3_h8",
        symbol_xml=_renderer_single_symbol_polygon_outline(outline_color="120,120,120,200", outline_width="0.6")
    )
)

# gps_trips_ok (Point, green)
layers_xml.append(
    _qgis_layer_vector(
        layer_id="gps_trips_ok",
        layer_name="gps_trips_ok",
        geom_type="Point",
        gpkg_path=gpkg_path,
        gpkg_layer="gps_trips_ok",
        symbol_xml=_renderer_single_symbol_point(color="0,158,115,255", size="2.4")
    )
)

# gps_trips_fail (Point, red)
layers_xml.append(
    _qgis_layer_vector(
        layer_id="gps_trips_fail",
        layer_name="gps_trips_fail",
        geom_type="Point",
        gpkg_path=gpkg_path,
        gpkg_layer="gps_trips_fail",
        symbol_xml=_renderer_single_symbol_point(color="217,95,2,255", size="2.4")
    )
)

# gps_points (Point, grey small)
layers_xml.append(
    _qgis_layer_vector(
        layer_id="gps_points",
        layer_name="gps_points",
        geom_type="Point",
        gpkg_path=gpkg_path,
        gpkg_layer="gps_points",
        symbol_xml=_renderer_single_symbol_point(color="120,120,120,160", size="1.6")
    )
)

# gps_points_hexcounts (Polygon fill), si existe
try:
    # Intento rápido: si la capa existe, añadirla
    import fiona
    with fiona.Env():
        with fiona.open(gpkg_path, layer="gps_points_hexcounts") as _:
            layers_xml.append(
                _qgis_layer_vector(
                    layer_id="gps_points_hexcounts",
                    layer_name="gps_points_hexcounts",
                    geom_type="Polygon",
                    gpkg_path=gpkg_path,
                    gpkg_layer="gps_points_hexcounts",
                    symbol_xml=_renderer_single_symbol_polygon_fill(color="200,200,200,120", outline_color="150,150,150,120", outline_width="0.3")
                )
            )
except Exception:
    pass

project_xml = f"""<?xml version="1.0" encoding="UTF-8"?>
<qgis projectname="SelectionProject" version="3.44.4" simplifyMaxScale="1" simplifyLocal="1">
  <layer-tree-group name="Selección actual">
    <layer-tree-layer name="gps_trips_fail" id="gps_trips_fail"/>
    <layer-tree-layer name="gps_trips_ok" id="gps_trips_ok"/>
    <layer-tree-layer name="gps_points" id="gps_points"/>
    <layer-tree-layer name="routes_h3_h8" id="routes_h3_h8"/>
    <layer-tree-layer name="gps_points_hexcounts" id="gps_points_hexcounts"/>
  </layer-tree-group>
  <projectlayers>
    {''.join(layers_xml)}
  </projectlayers>
</qgis>
"""

with open(project_path, "w", encoding="utf-8") as f:
    f.write(project_xml)

print("Proyecto QGIS creado en:", project_path.resolve())
print("Abre en QGIS y deberías ver:")
print(" - routes_h3_h8 (polígono, borde gris)")
print(" - gps_trips_ok (punto verde)")
print(" - gps_trips_fail (punto rojo)")
print(" - gps_points (punto gris)")
print(" - gps_points_hexcounts (polígono gris claro, si lo generaste)")


Proyecto QGIS creado en: D:\x\OneDrive\Escritorio\Python\cumplimiento-rutas-h3\data\processed\qgis_from_selection\selection_project.qgs
Abre en QGIS y deberías ver:
 - routes_h3_h8 (polígono, borde gris)
 - gps_trips_ok (punto verde)
 - gps_trips_fail (punto rojo)
 - gps_points (punto gris)
 - gps_points_hexcounts (polígono gris claro, si lo generaste)


In [None]:
# BOTÓN: regenerar GPKG + proyecto QGIS (.qgs) según la selección actual de filtros
# - Usa los mismos widgets: emp_select, lin_select, hor_select, ram_select, ori_select, des_select, idn_select, bus_select
# - Requiere: facts_named/facts, trips_named, PATH_* y helpers parse_h3_list / boundary_to_polygon del bloque Q0
# - Genera: data/processed/qgis_from_selection/selection_layers.gpkg + selection_project.qgs

import ipywidgets as widgets
from IPython.display import display, clear_output
from pathlib import Path
import pandas as pd
import numpy as np
import geopandas as gpd
from shapely.geometry import Polygon, Point
import json, ast, os, fiona, h3
import time

# Parámetros configurables desde la UI
POINTS_MAX_DEFAULT = 200_000
SAMPLE_FRAC_DEFAULT = 0.10
H3_RES_DEFAULT = 8

# Widgets de control
w_points_max   = widgets.IntText(value=POINTS_MAX_DEFAULT, description="Máx. puntos:", layout=widgets.Layout(width="200px"))
w_sample_frac  = widgets.BoundedFloatText(value=SAMPLE_FRAC_DEFAULT, min=0.01, max=1.0, step=0.01, description="Frac. sample:", layout=widgets.Layout(width="220px"))
w_h3_res       = widgets.BoundedIntText(value=H3_RES_DEFAULT, min=4, max=12, step=1, description="H3 res:", layout=widgets.Layout(width="180px"))
w_hex_density  = widgets.Checkbox(value=True, description="Crear hex de densidad (gps_points_hexcounts)")
btn_run        = widgets.Button(description="Generar GPKG + Proyecto QGIS", button_style="success")
out_qgis       = widgets.Output()

display(widgets.HBox([w_points_max, w_sample_frac, w_h3_res, w_hex_density]))
display(btn_run)
display(out_qgis)

# Helpers mínimos (por si no están en la sesión)
def _parse_h3_list(val):
    if isinstance(val, list): return val
    if pd.isna(val): return []
    s = str(val).strip()
    try:
        return json.loads(s)
    except Exception:
        try:
            return ast.literal_eval(s)
        except Exception:
            return []

def _boundary_to_polygon(hcell: str) -> Polygon:
    if hasattr(h3, "h3_to_geo_boundary"):            # v3
        coords = h3.h3_to_geo_boundary(hcell, geo_json=True)
    elif hasattr(h3, "cell_to_boundary"):            # v4
        coords = h3.cell_to_boundary(hcell)
    else:
        raise RuntimeError("No hay funciones de boundary disponibles en 'h3'.")
    ring = [(lng, lat) for lat, lng in coords]
    return Polygon(ring)

def _latlon_to_h3(lat, lon, res: int):
    if hasattr(h3, "geo_to_h3"):       # v3
        return h3.geo_to_h3(lat, lon, res)
    elif hasattr(h3, "latlng_to_cell"): # v4
        return h3.latlng_to_cell(lat, lon, res)
    else:
        raise RuntimeError("No hay función para lat/lon → H3.")

def _current_selection_facts():
    base = facts_named.copy() if "facts_named" in globals() else facts.copy()
    if emp_select.value != "(todas)": base = base[base["empresa_nombre"] == emp_select.value]
    if "linea" in base.columns and lin_select.value != "(todas)": base = base[base["linea"] == lin_select.value]
    if "hora"  in base.columns and hor_select.value != "(todas)": base = base[base["hora"] == hor_select.value]
    if "ramal" in base.columns and ram_select.value != "(todos)": base = base[base["ramal"] == ram_select.value]
    if "origen" in base.columns and ori_select.value != "(todos)": base = base[base["origen"] == ori_select.value]
    if "destino" in base.columns and des_select.value != "(todos)": base = base[base["destino"] == des_select.value]
    if "identificacion" in base.columns and idn_select.value != "(todos)": base = base[base["identificacion"] == idn_select.value]
    return base

def _current_selection_trips():
    base = trips_named.copy()
    if emp_select.value != "(todas)": base = base[base["empresa_nombre"] == emp_select.value]
    if "linea" in base.columns and lin_select.value != "(todas)": base = base[base["linea"] == lin_select.value]
    if "hora"  in base.columns and hor_select.value != "(todas)": base = base[base["hora"] == hor_select.value]
    if "ramal" in base.columns and ram_select.value != "(todos)": base = base[base["ramal"] == ram_select.value]
    if "origen" in base.columns and ori_select.value != "(todos)": base = base[base["origen"] == ori_select.value]
    if "destino" in base.columns and des_select.value != "(todos)": base = base[base["destino"] == des_select.value]
    if "identificacion" in base.columns and idn_select.value != "(todos)": base = base[base["identificacion"] == idn_select.value]
    if "bus_label" in base.columns and bus_select.value and bus_select.value != "(todos)":
        base = base[base["bus_label"] == bus_select.value]
    return base

def _write_qgs(gpkg_path: Path, out_path: Path):
    # Minimal XML con estilos: verde OK, rojo FAIL, gris para puntos, rutas borde gris
    def _layer(layer_id, layer_name, geom_type, gpkg_layer, symbol_xml):
        ds = f"{gpkg_path.resolve().as_posix()}|layername={gpkg_layer}"
        return f"""
    <maplayer type="vector" geometry="{geom_type}">
      <id>{layer_id}</id>
      <datasource>{ds}</datasource>
      <layername>{layer_name}</layername>
      <provider>ogr</provider>
      {symbol_xml}
    </maplayer>
"""
    def _renderer_polygon_outline(outline_color="120,120,120,200", outline_width="0.6"):
        return f"""
    <renderer-v2 type="singleSymbol">
      <symbols>
        <symbol type="fill" name="0">
          <layer class="SimpleFill">
            <prop k="color" v="255,255,255,0"/>
            <prop k="outline_color" v="{outline_color}"/>
            <prop k="outline_width" v="{outline_width}"/>
            <prop k="style" v="no"/>
          </layer>
        </symbol>
      </symbols>
    </renderer-v2>
"""
    def _renderer_point(color, size="2.4"):
        return f"""
    <renderer-v2 type="singleSymbol">
      <symbols>
        <symbol type="marker" name="0">
          <layer class="SimpleMarker">
            <prop k="color" v="{color}"/>
            <prop k="outline_color" v="0,0,0,0"/>
            <prop k="name" v="circle"/>
            <prop k="size" v="{size}"/>
          </layer>
        </symbol>
      </symbols>
    </renderer-v2>
"""
    def _renderer_polygon_fill(color="200,200,200,120", outline_color="150,150,150,120", outline_width="0.3"):
        return f"""
    <renderer-v2 type="singleSymbol">
      <symbols>
        <symbol type="fill" name="0">
          <layer class="SimpleFill">
            <prop k="color" v="{color}"/>
            <prop k="outline_color" v="{outline_color}"/>
            <prop k="outline_width" v="{outline_width}"/>
            <prop k="style" v="solid"/>
          </layer>
        </symbol>
      </symbols>
    </renderer-v2>
"""

    layers_xml = []
    layers_xml.append(_layer("routes_h3_h8","routes_h3_h8","Polygon","routes_h3_h8",_renderer_polygon_outline()))
    layers_xml.append(_layer("gps_trips_ok","gps_trips_ok","Point","gps_trips_ok",_renderer_point("0,158,115,255")))  # verde
    layers_xml.append(_layer("gps_trips_fail","gps_trips_fail","Point","gps_trips_fail",_renderer_point("217,95,2,255")))  # rojo
    layers_xml.append(_layer("gps_points","gps_points","Point","gps_points",_renderer_point("120,120,120,160","1.6")))
    # hex de densidad si existe
    try:
        with fiona.open(gpkg_path, layer="gps_points_hexcounts") as _:
            layers_xml.append(_layer("gps_points_hexcounts","gps_points_hexcounts","Polygon","gps_points_hexcounts",_renderer_polygon_fill()))
    except Exception:
        pass

    xml = f"""<?xml version="1.0" encoding="UTF-8"?>
<qgis projectname="SelectionProject" version="3.44.4">
  <layer-tree-group name="Selección actual">
    <layer-tree-layer name="gps_trips_fail" id="gps_trips_fail"/>
    <layer-tree-layer name="gps_trips_ok" id="gps_trips_ok"/>
    <layer-tree-layer name="gps_points" id="gps_points"/>
    <layer-tree-layer name="routes_h3_h8" id="routes_h3_h8"/>
    <layer-tree-layer name="gps_points_hexcounts" id="gps_points_hexcounts"/>
  </layer-tree-group>
  <projectlayers>
    {''.join(layers_xml)}
  </projectlayers>
</qgis>
"""
    out_path.write_text(xml, encoding="utf-8")

def _run_export(_=None):
    with out_qgis:
        clear_output(wait=True)
        t0 = time.time()
        print("Generando selección para QGIS...")

        # 1) Selección actual
        sel_facts = _current_selection_facts()
        sel_trips = _current_selection_trips()
        print(" - facts:", sel_facts.shape, "| trips:", sel_trips.shape)

        if sel_trips.empty:
            print("No hay trips en la selección actual. Ajusta filtros.")
            return

        # 2) Paths/constantes
        out_dir = Path("data/processed/qgis_from_selection"); out_dir.mkdir(parents=True, exist_ok=True)
        gpkg_path = out_dir / "selection_layers.gpkg"
        qgs_path  = out_dir / "selection_project.qgs"

        # 3) Rutas H3 (polígonos) desde rutas_h3.parquet, filtrando por ruta_hex usados
        if "ruta_hex" not in sel_trips.columns:
            print("No existe columna 'ruta_hex' en trips seleccionados. Aborto.")
            return
        ruta_hex_sel = sorted(sel_trips["ruta_hex"].dropna().astype(str).str.upper().unique().tolist())
        if not ruta_hex_sel:
            print("Selección sin rutas ejecutadas (ruta_hex). Aborto.")
            return

        rutas_h3 = pd.read_parquet(PATH_RUTAS_H3, engine="pyarrow")
        if "ruta_hex" not in rutas_h3.columns: 
            print("rutas_h3.parquet sin 'ruta_hex'. Aborto."); return
        if "h3_list" not in rutas_h3.columns:
            alt = next((c for c in ["h3_cells","h3_hexes"] if c in rutas_h3.columns), None)
            if alt: rutas_h3 = rutas_h3.rename(columns={alt:"h3_list"})
            else: print("No se encontró 'h3_list' (ni alternas) en rutas_h3.parquet. Aborto."); return

        rutas_h3["ruta_hex"] = rutas_h3["ruta_hex"].astype(str).str.upper().str.strip()
        rutas_h3_sel = rutas_h3[rutas_h3["ruta_hex"].isin(ruta_hex_sel)].copy()

        polys = []
        for _, row in rutas_h3_sel.iterrows():
            rr = row["ruta_hex"]
            for hc in _parse_h3_list(row["h3_list"]):
                try:
                    polys.append((rr, str(hc), _boundary_to_polygon(str(hc))))
                except Exception:
                    pass
        routes_hex_gdf = gpd.GeoDataFrame(polys, columns=["ruta_hex","h3","geometry"], crs="EPSG:4326")
        print(" - Hex de rutas:", len(routes_hex_gdf))

        # 4) Puntos crudos (primer punto por trip + todos los puntos filtrados con muestreo)
        pts = pd.read_parquet(PATH_POINTS, engine="pyarrow").copy()
        for c in ("agency_id","mean_id","trip_id","ruta_hex"):
            if c in pts.columns: pts[c] = pts[c].astype(str).str.upper().str.strip()
        if "fecha_hora" in pts.columns and not pd.api.types.is_datetime64_any_dtype(pts["fecha_hora"]):
            pts["fecha_hora"] = pd.to_datetime(pts["fecha_hora"], errors="coerce", utc=True)
        pts["trip_uid"] = pts["mean_id"].astype(str) + "§" + pts["trip_id"].astype(str)

        if "trip_uid" in sel_trips.columns:
            trip_uids = set(sel_trips["trip_uid"].astype(str))
        else:
            trip_uids = set(sel_trips["mean_id"].astype(str) + "§" + sel_trips["trip_id"].astype(str))

        pts_sel = pts[pts["trip_uid"].isin(trip_uids)].copy()
        if "fecha_hora" in pts_sel.columns:
            pts_sel = pts_sel.sort_values(["trip_uid","fecha_hora"])
        first_pts = pts_sel.groupby("trip_uid", as_index=False).first()

        mini = sel_trips[["trip_uid","trip_match"]].drop_duplicates()
        centroids = first_pts.merge(mini, on="trip_uid", how="left")
        centroids_gdf = gpd.GeoDataFrame(
            centroids,
            geometry=gpd.points_from_xy(centroids["longitude"].astype(float), centroids["latitude"].astype(float)),
            crs="EPSG:4326"
        )
        ok_gdf   = centroids_gdf[centroids_gdf["trip_match"] == True].copy()
        fail_gdf = centroids_gdf[centroids_gdf["trip_match"] == False].copy()
        print(" - Trips OK/Fail:", len(ok_gdf), "/", len(fail_gdf))

        # puntos crudos filtrados con muestreo
        POINTS_MAX = int(w_points_max.value)
        SAMPLE_FRAC = float(w_sample_frac.value)
        gps_points_sel = pts_sel.copy()
        if len(gps_points_sel) > POINTS_MAX:
            n = int(min(len(gps_points_sel) * SAMPLE_FRAC, POINTS_MAX))
            gps_points_sel = gps_points_sel.sample(n=n, random_state=42)
        gps_points_gdf = gpd.GeoDataFrame(
            gps_points_sel,
            geometry=gpd.points_from_xy(gps_points_sel["longitude"].astype(float), gps_points_sel["latitude"].astype(float)),
            crs="EPSG:4326"
        )
        print(" - Puntos crudos exportados:", len(gps_points_gdf))

        # 5) Escribir GPKG
        if gpkg_path.exists(): gpkg_path.unlink()
        routes_hex_gdf.to_file(gpkg_path, layer="routes_h3_h8", driver="GPKG")
        ok_gdf.to_file(gpkg_path, layer="gps_trips_ok", driver="GPKG")
        fail_gdf.to_file(gpkg_path, layer="gps_trips_fail", driver="GPKG")
        gps_points_gdf.to_file(gpkg_path, layer="gps_points", driver="GPKG")

        # 6) Hex de densidad (opcional)
        if w_hex_density.value:
            H3_RES = int(w_h3_res.value)
            pts_df = gps_points_gdf.copy()
            if "h3" not in pts_df.columns:
                pts_df["h3"] = [
                    _latlon_to_h3(float(lat), float(lon), H3_RES)
                    for lat, lon in zip(pts_df["latitude"], pts_df["longitude"])
                ]
            hex_counts = pts_df.groupby("h3").size().reset_index(name="count")
            hex_polys = []
            for _, row in hex_counts.iterrows():
                try:
                    poly = _boundary_to_polygon(str(row["h3"]))
                    hex_polys.append((row["h3"], int(row["count"]), poly))
                except Exception:
                    pass
            gps_points_hexcounts = gpd.GeoDataFrame(hex_polys, columns=["h3","count","geometry"], crs="EPSG:4326")
            gps_points_hexcounts.to_file(gpkg_path, layer="gps_points_hexcounts", driver="GPKG")
            print(" - Hex densidad:", len(gps_points_hexcounts))

        # 7) Proyecto QGIS .qgs
        _write_qgs(gpkg_path, qgs_path)

        dt = time.time() - t0
        print("\nListo.")
        print("GeoPackage:", gpkg_path.resolve())
        print("Proyecto QGIS:", qgs_path.resolve())
        print(f"Tiempo total: {dt:.1f}s")

btn_run.on_click(_run_export)


HBox(children=(IntText(value=200000, description='Máx. puntos:', layout=Layout(width='200px')), BoundedFloatTe…

Button(button_style='success', description='Generar GPKG + Proyecto QGIS', style=ButtonStyle())

Output()

In [None]:
# ============================================================
# Selección desde TUS widgets + Export a QGIS (GPKG + .QGS)
# Verde = OK, Rojo = FAIL | Incluye hex de rutas y (opcional) densidad
# ============================================================

import ipywidgets as widgets
from IPython.display import display, clear_output
from pathlib import Path
import pandas as pd
import numpy as np
import geopandas as gpd
from shapely.geometry import Polygon
import json, ast, fiona, h3
import time

# ---------- Config de archivos ----------
PATH_RUTAS_H3 = Path("data/processed/rutas_h3.parquet")         # ruta_hex + h3_list
PATH_POINTS   = Path("data/processed/gps_match_points.parquet")  # puntos crudos
OUT_DIR       = Path("data/processed/qgis_from_selection")
OUT_DIR.mkdir(parents=True, exist_ok=True)

CRS = "EPSG:4326"

# ---------- Helpers robustos ----------
def parse_h3_list(val):
    """Acepta list/tuple/set/ndarray/Series o strings (JSON/lista Python)."""
    # Contenedores
    if isinstance(val, (list, tuple, set)):
        return list(val)
    if isinstance(val, (np.ndarray, pd.Series)):
        return list(val.tolist())
    # Nulos
    if val is None:
        return []
    try:
        # NaN escalar (float)
        if isinstance(val, float) and np.isnan(val):
            return []
    except Exception:
        pass
    # String -> JSON o literal_eval
    s = str(val).strip()
    if not s or s.lower() in {"nan", "none", "null"}:
        return []
    try:
        x = json.loads(s)
        if isinstance(x, (list, tuple, set, np.ndarray, pd.Series)):
            return list(x)
    except Exception:
        pass
    try:
        x = ast.literal_eval(s)
        if isinstance(x, (list, tuple, set, np.ndarray, pd.Series)):
            return list(x)
    except Exception:
        pass
    # Último recurso: un solo hex string
    return [s]

def boundary_to_polygon(hcell: str) -> Polygon:
    """Convierte celda H3 a polígono (v3/v4)."""
    if hasattr(h3, "h3_to_geo_boundary"):          # v3
        coords = h3.h3_to_geo_boundary(hcell, geo_json=True)  # [(lat, lng), ...]
    elif hasattr(h3, "cell_to_boundary"):          # v4
        coords = h3.cell_to_boundary(hcell)                   # [(lat, lng), ...]
    else:
        raise RuntimeError("No hay funciones boundary en h3.")
    ring = [(lng, lat) for lat, lng in coords]
    return Polygon(ring)

def latlon_to_h3(lat, lon, res: int):
    if hasattr(h3, "geo_to_h3"):       # v3
        return h3.geo_to_h3(lat, lon, res)
    elif hasattr(h3, "latlng_to_cell"):  # v4
        return h3.latlng_to_cell(lat, lon, res)
    else:
        raise RuntimeError("No hay función lat/lon → H3.")

# ============================================================
# TUS WIDGETS (idénticos a los que estás usando)
# ============================================================
df_display = facts_named.copy() if "facts_named" in globals() else facts.copy()
if "empresa_nombre" not in df_display.columns:
    df_display["empresa_nombre"] = df_display.get("agency_id", pd.Series([""]*len(df_display)))

empresas = sorted(df_display["empresa_nombre"].dropna().astype(str).unique().tolist())
lineas   = sorted(df_display["linea"].dropna().astype(str).unique().tolist()) if "linea" in df_display.columns else []
horas    = sorted(df_display["hora"].dropna().astype(int).unique().tolist()) if "hora" in df_display.columns else []

ramales  = sorted(df_display["ramal"].dropna().astype(str).unique().tolist()) if "ramal" in df_display.columns else []
origenes = sorted(df_display["origen"].dropna().astype(str).unique().tolist()) if "origen" in df_display.columns else []
destinos = sorted(df_display["destino"].dropna().astype(str).unique().tolist()) if "destino" in df_display.columns else []
idents   = sorted(df_display["identificacion"].dropna().astype(str).unique().tolist()) if "identificacion" in df_display.columns else []

emp_select  = widgets.Dropdown(options=["(todas)"] + empresas, description="Empresa:")
lin_select  = widgets.Dropdown(options=["(todas)"] + lineas,   description="Línea:")
hor_select  = widgets.Dropdown(options=["(todas)"] + horas,    description="Hora:")

ram_select  = widgets.Dropdown(options=["(todos)"] + ramales,  description="Ramal:")
ori_select  = widgets.Dropdown(options=["(todos)"] + origenes, description="Origen:")
des_select  = widgets.Dropdown(options=["(todos)"] + destinos, description="Destino:")
idn_select  = widgets.Dropdown(options=["(todos)"] + idents,   description="Identif:")

out = widgets.Output()

def update_view(_=None):
    with out:
        clear_output(wait=True)
        df_f = df_display.copy()

        if emp_select.value != "(todas)":
            df_f = df_f[df_f["empresa_nombre"] == emp_select.value]
        if lin_select.value != "(todas)" and "linea" in df_f.columns:
            df_f = df_f[df_f["linea"] == lin_select.value]
        if hor_select.value != "(todas)" and "hora" in df_f.columns:
            df_f = df_f[df_f["hora"] == hor_select.value]

        if "ramal" in df_f.columns and ram_select.value != "(todos)":
            df_f = df_f[df_f["ramal"] == ram_select.value]
        if "origen" in df_f.columns and ori_select.value != "(todos)":
            df_f = df_f[df_f["origen"] == ori_select.value]
        if "destino" in df_f.columns and des_select.value != "(todos)":
            df_f = df_f[df_f["destino"] == des_select.value]
        if "identificacion" in df_f.columns and idn_select.value != "(todos)":
            df_f = df_f[df_f["identificacion"] == idn_select.value]

        if df_f.empty:
            print("Sin resultados para la selección.")
            return

        def safe_sum(col):
            return int(df_f[col].sum()) if col in df_f.columns and not df_f.empty else 0

        total_trips = safe_sum("total_trips")
        trips_ok    = safe_sum("trips_ok")
        pts_in      = safe_sum("total_pts_in")  if "total_pts_in"  in df_f.columns else 0
        pts_out     = safe_sum("total_pts_out") if "total_pts_out" in df_f.columns else 0

        mean_pct_ok = df_f["pct_trips_ok"].mean() if "pct_trips_ok" in df_f.columns else np.nan
        print(f"Total trips: {total_trips:,}")
        print(f"Trips OK (≥0.6): {trips_ok:,}  |  {mean_pct_ok:.1%} promedio" if pd.notna(mean_pct_ok) else f"Trips OK (≥0.6): {trips_ok:,}")
        if "total_pts_in" in df_f.columns and "total_pts_out" in df_f.columns:
            print(f"Puntos dentro: {pts_in:,} / fuera: {pts_out:,}")

        cols_show = [
            c for c in ["empresa_nombre","agency_id","linea","ramal","origen","destino","identificacion","hora",
                        "total_trips","trips_ok","pct_trips_ok",
                        "total_pts_in","total_pts_out","pct_pts_in"]
            if c in df_f.columns
        ]
        if cols_show:
            display(
                df_f[cols_show]
                .sort_values(["empresa_nombre","linea","hora"], na_position="last")
                .reset_index(drop=True)
                .head(30)
            )
        else:
            print("No hay columnas esperadas para mostrar el resumen.")

for w in [emp_select, lin_select, hor_select, ram_select, ori_select, des_select, idn_select]:
    w.observe(update_view, names="value")

display(widgets.HBox([emp_select, lin_select, hor_select]))
display(widgets.HBox([ram_select, ori_select, des_select, idn_select]))
display(out)
update_view()

# ============================================================
# Botón: generar GPKG + proyecto QGIS desde ESTA selección
# ============================================================

# Controles extra
w_points_max  = widgets.IntText(value=200_000, description="Máx. puntos:", layout=widgets.Layout(width="200px"))
w_sample_frac = widgets.BoundedFloatText(value=0.10, min=0.01, max=1.0, step=0.01, description="Frac. sample:", layout=widgets.Layout(width="220px"))
w_h3_res      = widgets.BoundedIntText(value=8, min=4, max=12, step=1, description="H3 res:", layout=widgets.Layout(width="180px"))
w_hex_density = widgets.Checkbox(value=True, description="Hex densidad (gps_points_hexcounts)")
btn_run       = widgets.Button(description="Generar QGIS (GPKG + .QGS)", button_style="success")
out_qgis      = widgets.Output()

display(widgets.HBox([w_points_max, w_sample_frac, w_h3_res, w_hex_density]))
display(btn_run)
display(out_qgis)

def _current_selection_facts():
    base = df_display.copy()
    if emp_select.value != "(todas)":
        base = base[base["empresa_nombre"] == emp_select.value]
    if "linea" in base.columns and lin_select.value != "(todas)":
        base = base[base["linea"] == lin_select.value]
    if "hora" in base.columns and hor_select.value != "(todas)":
        base = base[base["hora"] == hor_select.value]
    if "ramal" in base.columns and ram_select.value != "(todos)":
        base = base[base["ramal"] == ram_select.value]
    if "origen" in base.columns and ori_select.value != "(todos)":
        base = base[base["origen"] == ori_select.value]
    if "destino" in base.columns and des_select.value != "(todos)":
        base = base[base["destino"] == des_select.value]
    if "identificacion" in base.columns and idn_select.value != "(todos)":
        base = base[base["identificacion"] == idn_select.value]
    return base

def _current_selection_trips():
    if "trips_named" not in globals():
        raise RuntimeError("Se requiere 'trips_named' en memoria.")
    base = trips_named.copy()
    if emp_select.value != "(todas)":
        base = base[base["empresa_nombre"] == emp_select.value]
    if "linea" in base.columns and lin_select.value != "(todas)":
        base = base[base["linea"] == lin_select.value]
    if "hora" in base.columns and hor_select.value != "(todas)":
        base = base[base["hora"] == hor_select.value]
    if "ramal" in base.columns and ram_select.value != "(todos)":
        base = base[base["ramal"] == ram_select.value]
    if "origen" in base.columns and ori_select.value != "(todos)":
        base = base[base["origen"] == ori_select.value]
    if "destino" in base.columns and des_select.value != "(todos)":
        base = base[base["destino"] == des_select.value]
    if "identificacion" in base.columns and idn_select.value != "(todos)":
        base = base[base["identificacion"] == idn_select.value]
    return base

def _write_qgs(gpkg_path: Path, out_path: Path):
    # Estilos: verde OK, rojo FAIL, gris puntos, rutas = borde gris
    def _layer(layer_id, layer_name, geom_type, gpkg_layer, symbol_xml):
        ds = f"{gpkg_path.resolve().as_posix()}|layername={gpkg_layer}"
        return f"""
    <maplayer type="vector" geometry="{geom_type}">
      <id>{layer_id}</id>
      <datasource>{ds}</datasource>
      <layername>{layer_name}</layername>
      <provider>ogr</provider>
      {symbol_xml}
    </maplayer>
"""
    def _renderer_polygon_outline(outline_color="120,120,120,200", outline_width="0.6"):
        return f"""
    <renderer-v2 type="singleSymbol">
      <symbols>
        <symbol type="fill" name="0">
          <layer class="SimpleFill">
            <prop k="color" v="255,255,255,0"/>
            <prop k="outline_color" v="{outline_color}"/>
            <prop k="outline_width" v="{outline_width}"/>
            <prop k="style" v="no"/>
          </layer>
        </symbol>
      </symbols>
    </renderer-v2>
"""
    def _renderer_point(color, size="2.4"):
        return f"""
    <renderer-v2 type="singleSymbol">
      <symbols>
        <symbol type="marker" name="0">
          <layer class="SimpleMarker">
            <prop k="color" v="{color}"/>
            <prop k="outline_color" v="0,0,0,0"/>
            <prop k="name" v="circle"/>
            <prop k="size" v="{size}"/>
          </layer>
        </symbol>
      </symbols>
    </renderer-v2>
"""
    def _renderer_polygon_fill(color="200,200,200,120", outline_color="150,150,150,120", outline_width="0.3"):
        return f"""
    <renderer-v2 type="singleSymbol">
      <symbols>
        <symbol type="fill" name="0">
          <layer class="SimpleFill">
            <prop k="color" v="{color}"/>
            <prop k="outline_color" v="{outline_color}"/>
            <prop k="outline_width" v="{outline_width}"/>
            <prop k="style" v="solid"/>
          </layer>
        </symbol>
      </symbols>
    </renderer-v2>
"""
    layers_xml = []
    layers_xml.append(_layer("routes_h3_h8","routes_h3_h8","Polygon","routes_h3_h8",_renderer_polygon_outline()))
    layers_xml.append(_layer("gps_trips_ok","gps_trips_ok","Point","gps_trips_ok",_renderer_point("0,158,115,255")))   # verde
    layers_xml.append(_layer("gps_trips_fail","gps_trips_fail","Point","gps_trips_fail",_renderer_point("217,95,2,255"))) # rojo
    layers_xml.append(_layer("gps_points","gps_points","Point","gps_points",_renderer_point("120,120,120,160","1.6")))
    try:
        with fiona.open(gpkg_path, layer="gps_points_hexcounts") as _:
            layers_xml.append(_layer("gps_points_hexcounts","gps_points_hexcounts","Polygon","gps_points_hexcounts",_renderer_polygon_fill()))
    except Exception:
        pass

    xml = f"""<?xml version="1.0" encoding="UTF-8"?>
<qgis projectname="SelectionProject" version="3.40">
  <layer-tree-group name="Selección actual">
    <layer-tree-layer name="gps_trips_fail" id="gps_trips_fail"/>
    <layer-tree-layer name="gps_trips_ok" id="gps_trips_ok"/>
    <layer-tree-layer name="gps_points" id="gps_points"/>
    <layer-tree-layer name="routes_h3_h8" id="routes_h3_h8"/>
    <layer-tree-layer name="gps_points_hexcounts" id="gps_points_hexcounts"/>
  </layer-tree-group>
  <projectlayers>
    {''.join(layers_xml)}
  </projectlayers>
</qgis>
"""
    out_path.write_text(xml, encoding="utf-8")

def _run_export(_=None):
    with out_qgis:
        clear_output(wait=True)
        t0 = time.time()
        print("Generando selección para QGIS...")

        # 1) Selección
        sel_facts = _current_selection_facts()
        sel_trips = _current_selection_trips()
        print(" - facts:", sel_facts.shape, "| trips:", sel_trips.shape)
        if sel_trips.empty:
            print("No hay trips en la selección actual.")
            return

        gpkg_path = OUT_DIR / "selection_layers.gpkg"
        qgs_path  = OUT_DIR / "selection_project.qgs"
        if gpkg_path.exists():
            gpkg_path.unlink()

        # 2) Hex de rutas desde rutas_h3.parquet, filtrando por ruta_hex de la selección
        if "ruta_hex" not in sel_trips.columns:
            raise ValueError("En trips seleccionados falta 'ruta_hex'.")
        ruta_hex_sel = sorted(sel_trips["ruta_hex"].dropna().astype(str).str.upper().unique().tolist())
        rutas_h3 = pd.read_parquet(PATH_RUTAS_H3, engine="pyarrow")
        if "h3_list" not in rutas_h3.columns:
            alt = next((c for c in ["h3_cells","h3_hexes"] if c in rutas_h3.columns), None)
            if alt:
                rutas_h3 = rutas_h3.rename(columns={alt: "h3_list"})
            else:
                raise ValueError("rutas_h3.parquet no posee 'h3_list' (ni alternas).")
        rutas_h3["ruta_hex"] = rutas_h3["ruta_hex"].astype(str).str.upper().str.strip()
        rutas_h3_sel = rutas_h3[rutas_h3["ruta_hex"].isin(ruta_hex_sel)].copy()

        polys = []
        for _, row in rutas_h3_sel.iterrows():
            rhex = row["ruta_hex"]
            for hcell in parse_h3_list(row["h3_list"]):
                try:
                    polys.append((rhex, str(hcell), boundary_to_polygon(str(hcell))))
                except Exception:
                    pass
        routes_hex_gdf = gpd.GeoDataFrame(polys, columns=["ruta_hex","h3","geometry"], crs=CRS)
        print(" - Hex de rutas:", len(routes_hex_gdf))
        routes_hex_gdf.to_file(gpkg_path, layer="routes_h3_h8", driver="GPKG")

        # 3) Puntos crudos: primer punto por trip para OK/FAIL y sampleo para gps_points
        pts = pd.read_parquet(PATH_POINTS, engine="pyarrow").copy()
        for c in ("agency_id","mean_id","trip_id","ruta_hex"):
            if c in pts.columns:
                pts[c] = pts[c].astype(str).str.upper().str.strip()
        if "fecha_hora" in pts.columns and not pd.api.types.is_datetime64_any_dtype(pts["fecha_hora"]):
            pts["fecha_hora"] = pd.to_datetime(pts["fecha_hora"], errors="coerce", utc=True)
        pts["trip_uid"] = pts["mean_id"].astype(str) + "§" + pts["trip_id"].astype(str)

        if "trip_uid" in sel_trips.columns:
            trip_uids = set(sel_trips["trip_uid"].astype(str))
        else:
            trip_uids = set(sel_trips["mean_id"].astype(str) + "§" + sel_trips["trip_id"].astype(str))

        pts_sel = pts[pts["trip_uid"].isin(trip_uids)].copy()
        if "fecha_hora" in pts_sel.columns:
            pts_sel = pts_sel.sort_values(["trip_uid","fecha_hora"])
        first_pts = pts_sel.groupby("trip_uid", as_index=False).first()

        mini = sel_trips[["trip_uid","trip_match"]].drop_duplicates()
        centroids = first_pts.merge(mini, on="trip_uid", how="left")
        centroids_gdf = gpd.GeoDataFrame(
            centroids,
            geometry=gpd.points_from_xy(centroids["longitude"].astype(float), centroids["latitude"].astype(float)),
            crs=CRS
        )
        ok_gdf   = centroids_gdf[centroids_gdf["trip_match"] == True].copy()
        fail_gdf = centroids_gdf[centroids_gdf["trip_match"] == False].copy()
        ok_gdf.to_file(gpkg_path, layer="gps_trips_ok", driver="GPKG")
        fail_gdf.to_file(gpkg_path, layer="gps_trips_fail", driver="GPKG")
        print(" - Trips OK/Fail:", len(ok_gdf), "/", len(fail_gdf))

        # gps_points con sampleo
        POINTS_MAX  = int(w_points_max.value)
        SAMPLE_FRAC = float(w_sample_frac.value)
        gps_points_sel = pts_sel.copy()
        if len(gps_points_sel) > POINTS_MAX:
            n = int(min(len(gps_points_sel) * SAMPLE_FRAC, POINTS_MAX))
            gps_points_sel = gps_points_sel.sample(n=n, random_state=42)
        gps_points_gdf = gpd.GeoDataFrame(
            gps_points_sel,
            geometry=gpd.points_from_xy(gps_points_sel["longitude"].astype(float), gps_points_sel["latitude"].astype(float)),
            crs=CRS
        )
        gps_points_gdf.to_file(gpkg_path, layer="gps_points", driver="GPKG")
        print(" - Puntos crudos exportados:", len(gps_points_gdf))

        # 4) Hex de densidad (opcional)
        if w_hex_density.value:
            H3_RES = int(w_h3_res.value)
            pts_df = gps_points_gdf.copy()
            if "h3" not in pts_df.columns:
                pts_df["h3"] = [
                    latlon_to_h3(float(lat), float(lon), H3_RES)
                    for lat, lon in zip(pts_df["latitude"], pts_df["longitude"])
                ]
            hex_counts = pts_df.groupby("h3").size().reset_index(name="count")
            hex_polys = []
            for _, row in hex_counts.iterrows():
                try:
                    hex_polys.append((row["h3"], int(row["count"]), boundary_to_polygon(str(row["h3"]))))
                except Exception:
                    pass
            gps_points_hexcounts = gpd.GeoDataFrame(hex_polys, columns=["h3","count","geometry"], crs=CRS)
            gps_points_hexcounts.to_file(gpkg_path, layer="gps_points_hexcounts", driver="GPKG")
            print(" - Hex densidad:", len(gps_points_hexcounts))

        # 5) Proyecto QGIS .QGS con estilos
        _write_qgs(gpkg_path, OUT_DIR / "selection_project.qgs")

        dt = time.time() - t0
        print("\nListo.")
        print("GeoPackage:", gpkg_path.resolve())
        print("Proyecto QGIS:", (OUT_DIR / 'selection_project.qgs').resolve())
        print(f"Tiempo total: {dt:.1f}s")

btn_run.on_click(_run_export)


HBox(children=(Dropdown(description='Empresa:', options=('(todas)', '1° DE DICIEMBRE SRL', 'ALDANA SA', 'CIUDA…

HBox(children=(Dropdown(description='Ramal:', options=('(todos)', '1', '2', '3', '32', '4', '5', '6', '8'), va…

Output()

HBox(children=(IntText(value=200000, description='Máx. puntos:', layout=Layout(width='200px')), BoundedFloatTe…

Button(button_style='success', description='Generar QGIS (GPKG + .QGS)', style=ButtonStyle())

Output()