In [1]:
# --- BLOQUE 1: imports y rutas ---
from pathlib import Path
import numpy as np
import pandas as pd

# Paths
PATH_TRIPS  = Path("data/processed/gps_match_trips.parquet")
PATH_POINTS = Path("data/processed/gps_match_points.parquet")
OUT_DIR     = Path("data/processed")
OUT_DIR.mkdir(parents=True, exist_ok=True)

pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 160)

def norm_emp_id(x):
    if pd.isna(x): return x
    s = str(x).strip()
    return s.zfill(4) if s.isdigit() and len(s) <= 4 else s


In [2]:
# --- BLOQUE 2: leer trips y revisar estructura ---
trips = pd.read_parquet(PATH_TRIPS, engine="pyarrow")

print("shape:", trips.shape)
print("\nDtypes:\n", trips.dtypes)
print("\nPrimeras filas:")
display(trips.head(5))


shape: (6431, 11)

Dtypes:
 agency_id           object
mean_id             object
trip_id              int64
route_id            object
ruta_hex            object
pts_en_declared      int64
pts_trip             int64
ratio              float64
trip_match            bool
route_id_match        bool
hora                 int64
dtype: object

Primeras filas:


Unnamed: 0,agency_id,mean_id,trip_id,route_id,ruta_hex,pts_en_declared,pts_trip,ratio,trip_match,route_id_match,hora
0,5,005DD,0,001C,001C,0,1,0.0,False,False,7
1,5,005DD,0,01FB,01FB,0,1,0.0,False,False,7
2,5,005DD,0,271C,271C,0,74,0.0,False,False,7
3,5,005DD,0,001D,001D,0,74,0.0,False,False,9
4,5,005DD,0,01FC,01FC,0,1,0.0,False,False,10


In [None]:
# --- BLOQUE 3: normalizar e inferir columnas clave ---
trips = trips.copy()

# Normalizaciones mÃ­nimas
for c in ("agency_id", "route_id", "ruta_hex", "mean_id"):
    if c in trips.columns:
        trips[c] = trips[c].astype(str).str.upper().str.strip()
if "agency_id" in trips.columns:
    trips["agency_id"] = trips["agency_id"].apply(norm_emp_id)

# Asegurar numÃ©ricos
for c in ("pts_en_declared", "pts_trip"):
    if c in trips.columns:
        trips[c] = pd.to_numeric(trips[c], errors="coerce")

# Ratio: usar el existente si viene, si no calcular
if "ratio" not in trips.columns:
    trips["ratio"] = trips["pts_en_declared"] / trips["pts_trip"]

# trip_match: asegurar que refleja ratio >= 0.60
trips["trip_match"] = trips["ratio"] >= 0.60

# route_match explÃ­cito (declarada vs ejecutada)
trips["route_match"] = (trips.get("route_id").astype(str) == trips.get("ruta_hex").astype(str))

# puntos dentro / fuera (para KPIs de cobertura)
trips["pts_in"]  = trips["pts_en_declared"]
trips["pts_out"] = trips["pts_trip"] - trips["pts_en_declared"]

# Hora: si ya existe la conservamos; si no, quedarÃ¡ NaN
if "hora" in trips.columns:
    trips["hora"] = trips["hora"].astype("Int64")

print("OK: columnas derivadas agregadas â†’ ['trip_match','route_match','pts_in','pts_out']")
display(trips.head(3))


OK: columnas derivadas agregadas â†’ ['trip_match','route_match','pts_in','pts_out']


Unnamed: 0,agency_id,mean_id,trip_id,route_id,ruta_hex,pts_en_declared,pts_trip,ratio,trip_match,route_id_match,hora,route_match,pts_in,pts_out
0,5,005DD,-1,001D,NONE,0,77,0.0,False,False,10,False,0,77
1,5,005DD,0,001D,00B1,0,110,0.0,False,False,10,False,0,110
2,5,005DF,0,001D,008C,0,196,0.0,False,False,10,False,0,196


In [4]:
# --- BLOQUE 4: mÃ©tricas base y sanity check ---
from IPython.display import display

def count_ge(th): 
    return int((trips["ratio"] >= th).sum())

print("Trips total:", len(trips))
print("Trips OK (ratio >= 0.60):", int(trips["trip_match"].sum()))
print("Trips con route_id_match (segÃºn umbral):", int(trips["route_id_match"].sum()))
print("Trips con route_id == ruta_hex (literal):", int((trips["route_id"].astype(str) == trips["ruta_hex"].astype(str)).sum()))
print()

print("ðŸ“Š ratio.describe():")
display(trips["ratio"].describe(percentiles=[.1, .25, .5, .75, .9, .95, .99]))

print("DistribuciÃ³n por umbral de ratio:")
for th in (0.60, 0.30, 0.10, 0.05):
    print(f"  â‰¥ {th:.2f}: {count_ge(th)} trips")


Trips total: 6431
Trips OK (ratio >= 0.60): 1527
Trips con route_id_match (segÃºn umbral): 1527
Trips con route_id == ruta_hex (literal): 6431

ðŸ“Š ratio.describe():


count    6431.000000
mean        0.496836
std         0.477828
min         0.000000
10%         0.000000
25%         0.000000
50%         0.533333
75%         1.000000
90%         1.000000
95%         1.000000
99%         1.000000
max         1.000000
Name: ratio, dtype: float64

DistribuciÃ³n por umbral de ratio:
  â‰¥ 0.60: 3166 trips
  â‰¥ 0.30: 3379 trips
  â‰¥ 0.10: 3465 trips
  â‰¥ 0.05: 3503 trips


In [5]:
# --- BLOQUE 5: KPIs por empresa + ruta + hora ---
group_cols = [c for c in ["agency_id", "ruta_hex", "hora"] if c in trips.columns]
if not group_cols:
    group_cols = ["agency_id", "ruta_hex"]  # fallback sin hora

facts = (
    trips.groupby(group_cols, dropna=False)
         .agg(
             total_trips = ("trip_id", "nunique"),
             trips_ok    = ("trip_match", "sum"),
             trips_route_match = ("route_match", "sum"),
             total_pts_in  = ("pts_in", "sum"),
             total_pts_out = ("pts_out", "sum")
         )
         .reset_index()
)

facts["pct_trips_ok"]     = (facts["trips_ok"] / facts["total_trips"]).replace([np.inf,-np.inf], np.nan).round(3)
facts["pct_route_match"]  = (facts["trips_route_match"] / facts["total_trips"]).replace([np.inf,-np.inf], np.nan).round(3)
facts["pct_pts_in"]       = (facts["total_pts_in"] / (facts["total_pts_in"] + facts["total_pts_out"])).replace([np.inf,-np.inf], np.nan).round(3)

print("Preview KPIs (top por total_trips):")
display(facts.sort_values("total_trips", ascending=False).head(10))


Preview KPIs (top por total_trips):


Unnamed: 0,agency_id,ruta_hex,hora,total_trips,trips_ok,trips_route_match,total_pts_in,total_pts_out,pct_trips_ok,pct_route_match,pct_pts_in
3,5,0055,10,2,4,0,596,0,2.0,0.0,1.0
4,5,008C,10,2,23,0,3765,3061,11.5,0.0,0.552
5,5,00B1,10,2,0,0,0,424,0.0,0.0,0.0
15,7,00E6,10,2,6,0,635,1182,3.0,0.0,0.349
12,7,003A,10,2,10,0,1272,370,5.0,0.0,0.775
13,7,0051,10,2,0,0,55,851,0.0,0.0,0.061
14,7,008C,10,2,20,0,3469,5794,10.0,0.0,0.375
8,5,011E,10,2,11,0,1823,22,5.5,0.0,0.988
56,20,0108,10,2,2,0,162,2275,1.0,0.0,0.066
57,20,010B,10,2,8,0,541,1532,4.0,0.0,0.261


In [6]:
# --- BLOQUE 6: consistencia route_id vs ruta_hex ---
pairs = (
    trips.groupby(["agency_id","route_id","ruta_hex"], dropna=False)
         .agg(total_trips=("trip_id","nunique"),
              trips_ok=("trip_match","sum"))
         .reset_index()
         .sort_values("total_trips", ascending=False)
)

print("Top 15 combinaciones declaradaâ‰ ejecutada (si las hay):")
mismatch = pairs[pairs["route_id"] != pairs["ruta_hex"]]
display(mismatch.head(15))

print("Resumen coincidencias (por agencia):")
res_ag = (
    (trips["route_match"])
    .groupby(trips["agency_id"])
    .agg(["sum","count"])
    .rename(columns={"sum":"trips_route_match","count":"total_trips"})
)
res_ag["pct_route_match"] = (res_ag["trips_route_match"] / res_ag["total_trips"]).round(3)
display(res_ag.sort_values("pct_route_match"))


Top 15 combinaciones declaradaâ‰ ejecutada (si las hay):


Unnamed: 0,agency_id,route_id,ruta_hex,total_trips,trips_ok
246,22,0110,008C,2,7
122,7,016C,008C,2,0
5,5,001E,008C,2,0
110,7,0098,00E6,2,3
1,5,001D,008C,2,0
87,7,0045,0051,2,0
50,7,0036,008C,2,3
108,7,0098,003A,2,2
27,5,0027,011E,2,4
52,7,0036,00EE,2,2


Resumen coincidencias (por agencia):


Unnamed: 0_level_0,trips_route_match,total_trips,pct_route_match
agency_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0005,0,95,0.0
0007,0,142,0.0
000A,0,31,0.0
000B,0,2,0.0
0014,0,14,0.0
0020,0,138,0.0
002D,0,8,0.0
0017,6,57,0.105
0022,2,17,0.118
0025,4,17,0.235


In [None]:
# --- BLOQUE 7: exportar ---
SAVE = True
if SAVE:
    out_facts = OUT_DIR / "kpi_facts_notebook.parquet"
    facts.to_parquet(out_facts, engine="pyarrow", index=False)

    out_pairs = OUT_DIR / "kpi_pairs_decl_vs_exec.parquet"
    pairs.to_parquet(out_pairs, engine="pyarrow", index=False)

    # CSVs por si querÃ©s abrir rÃ¡pido
    facts.to_csv(OUT_DIR / "kpi_facts_notebook.csv", index=False)
    pairs.to_csv(OUT_DIR / "kpi_pairs_decl_vs_exec.csv", index=False)

    print("Guardados:")
    print(" -", out_facts)
    print(" -", out_pairs)


Guardados:
 - data\processed\kpi_facts_notebook.parquet
 - data\processed\kpi_pairs_decl_vs_exec.parquet


In [None]:
# --- BLOQUE 8 : KPI por bus ---
group_bus = [c for c in ["agency_id","ruta_hex","hora","mean_id"] if c in trips.columns]
if group_bus:
    facts_bus = (
        trips.groupby(group_bus, dropna=False)
             .agg(total_trips=("trip_id","nunique"),
                  trips_ok=("trip_match","sum"),
                  total_pts_in=("pts_in","sum"),
                  total_pts_out=("pts_out","sum"))
             .reset_index()
    )
    facts_bus["pct_trips_ok"] = (facts_bus["trips_ok"] / facts_bus["total_trips"]).replace([np.inf,-np.inf], np.nan).round(3)
    facts_bus["pct_pts_in"]   = (facts_bus["total_pts_in"] / (facts_bus["total_pts_in"] + facts_bus["total_pts_out"])).replace([np.inf,-np.inf], np.nan).round(3)

    print("Preview KPI por bus:")
    display(facts_bus.sort_values("pct_trips_ok").head(10))


Preview KPI por bus:


Unnamed: 0,agency_id,ruta_hex,hora,mean_id,total_trips,trips_ok,total_pts_in,total_pts_out,pct_trips_ok,pct_pts_in
29,5,008C,10,005F7,1,0,52,130,0.0,0.286
467,25,011E,10,00332,1,0,0,176,0.0,0.0
468,25,011E,10,004B7,1,0,0,184,0.0,0.0
472,25,011E,10,00527,1,0,0,185,0.0,0.0
473,25,011E,10,0052C,1,0,0,16,0.0,0.0
474,25,011E,10,0052D,1,0,0,184,0.0,0.0
0,5,001E,10,005E3,1,0,0,44,0.0,0.0
318,17,01BB,10,007EA,1,0,0,200,0.0,0.0
319,17,01BB,10,007ED,1,0,0,100,0.0,0.0
288,17,008C,10,007F0,1,0,0,182,0.0,0.0
