In [1]:
!python wikiroutes_leafletgrab.py --url "https://wikiroutes.info/es/lima?routes=154193" --out data_wikiroutes --headless 0

{
  "route_folder": "data_wikiroutes\\route_154193",
  "trips_detected": 2,
  "line_segments_total": 2,
  "points_total": 334
}


In [None]:
!python sync_wr_indexes.py

CWD: D:\ARCHIVOS\OneDrive\Documents\UNI\Cursos adicionales\Rutas\pipeline\scripts\wikiroutes
ROOT: D:\ARCHIVOS\OneDrive\Documents\UNI\Cursos adicionales\Rutas
OUT_ROOT: D:\ARCHIVOS\OneDrive\Documents\UNI\Cursos adicionales\Rutas\data\processed\transporte  exists=True
LISTA_RUTAS_CSV: D:\ARCHIVOS\OneDrive\Documents\UNI\Cursos adicionales\Rutas\config\lista_rutas.csv  exists=True
WR_MAP_JSON: D:\ARCHIVOS\OneDrive\Documents\UNI\Cursos adicionales\Rutas\config\wr_map.json
WR_OVERRIDES_JSON: D:\ARCHIVOS\OneDrive\Documents\UNI\Cursos adicionales\Rutas\config\wr_overrides.json
Folders route_* detectados: 1576

Resumen:
  folders_total: 1576
  ok_folders: 1576
  skip_missing_route_json: 0
  skip_no_trip_files: 0
  fallback_display_id_used: 133
  unmatched_lista_rutas: 1407

OK. Rutas en wr_map.json: 3143
OK. Entradas en wr_overrides.json: 3152

Dump sin display_id extraíble (fallback route_id): D:\ARCHIVOS\OneDrive\Documents\UNI\Cursos adicionales\Rutas\config\no_display_id_skips.json
Total fa

: 

In [9]:
# wikiroutes_catalog_runner.py
from __future__ import annotations

import json
import re
import time
from pathlib import Path
from typing import Dict, List
from urllib.parse import urljoin

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

from wikiroutes_leafletgrab import scrape_route, make_driver, DEFAULT_BASE

CATALOG_URL = "https://wikiroutes.info/es/lima/catalog"

# Carpeta donde se guardan las carpetas route_XXXXX
OUT_ROOT = Path("data/processed/transporte")

# JSON de salida
WR_MAP_JSON = Path("config/wr_map.json")
WR_OVERRIDES_JSON = Path("config/wr_overrides.json")


def normalizar(texto: str) -> str:
    return " ".join(texto.split())


def esperar_chips(driver, timeout: int = 60) -> None:
    """Espera a que haya al menos un chip de ruta en la página."""
    wait = WebDriverWait(driver, timeout)
    wait.until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, "a.city.tag-btn.tag-btn--float")
        )
    )


def expandir_listas(driver, timeout: int = 60) -> None:
    """
    Hace clic en todos los botones visibles de 'Expandir la lista'
    hasta que ya no quede ninguno visible.
    """
    wait = WebDriverWait(driver, timeout)

    try:
        wait.until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, "div.button-more.expandFullList")
            )
        )
    except TimeoutException:
        # No hay botones de "Expandir la lista"
        return

    while True:
        botones = [
            b
            for b in driver.find_elements(
                By.CSS_SELECTOR, "div.button-more.expandFullList"
            )
            if b.is_displayed()
        ]
        if not botones:
            break

        for b in botones:
            try:
                driver.execute_script("arguments[0].click();", b)
            except Exception:
                pass

        time.sleep(1)


def obtener_links_rutas(driver) -> List[Dict[str, str]]:
    """
    Devuelve una lista de dicts {name, url} para las rutas visibles,
    leyendo los <a.city.tag-btn.tag-btn--float>.
    """
    chips = driver.find_elements(By.CSS_SELECTOR, "a.city.tag-btn.tag-btn--float")
    rutas = []
    vistos = set()

    for chip in chips:
        if not chip.is_displayed():
            continue

        txt = normalizar(chip.text)
        href = chip.get_attribute("href") or ""
        if not href:
            data_href = chip.get_attribute("data-href") or ""
            href = data_href

        if not href:
            continue

        href_abs = urljoin(DEFAULT_BASE, href)
        if href_abs in vistos:
            continue

        vistos.add(href_abs)
        rutas.append({"name": txt, "url": href_abs})

    return rutas


def cargar_json_si_existe(path: Path):
    if not path.exists():
        return {}
    with path.open("r", encoding="utf-8") as f:
        return json.load(f)


def guardar_json(path: Path, data) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2, sort_keys=True)


def extraer_display_id(title: str) -> str:
    """
    Intenta extraer el código de ruta (1244, 201A, etc.) del título.
    E.g. '1244 · Villa Las Palmas → Pan de Azúcar' -> '1244'
    """
    if not title:
        return ""
    m = re.match(r"\s*([0-9A-Za-z]+)", title)
    return m.group(1) if m else title


def obtener_color_desde_geojson(route_dir: Path) -> str | None:
    """
    Lee route_track.geojson y devuelve el primer color encontrado.
    """
    gpath = route_dir / "route_track.geojson"
    if not gpath.exists():
        return None

    try:
        with gpath.open("r", encoding="utf-8") as f:
            fc = json.load(f)
    except Exception:
        return None

    for feat in fc.get("features", []):
        props = feat.get("properties", {}) or {}
        color = props.get("color")
        if color:
            return color
    return None


def actualizar_wr_jsons(
    route_dir: Path,
    wr_map: Dict,
    wr_overrides: Dict,
) -> None:
    """
    Actualiza los dicts wr_map y wr_overrides para una ruta ya scrapeada.
    Espera que en route_dir haya route.json, summary.json y route_track.geojson.
    """
    meta_path = route_dir / "route.json"
    summary_path = route_dir / "summary.json"

    if not meta_path.exists():
        print(f"[WARN] No se encontró {meta_path}, se omite esta ruta.")
        return

    with meta_path.open("r", encoding="utf-8") as f:
        meta = json.load(f)

    title = meta.get("title") or ""
    route_id = meta.get("route_id") or ""
    url = meta.get("url") or ""
    city = meta.get("city") or ""

    display_id = extraer_display_id(title)
    color = obtener_color_desde_geojson(route_dir) or "#000000"

    trips_detected = 1
    if summary_path.exists():
        with summary_path.open("r", encoding="utf-8") as f:
            summary = json.load(f)
        trips_detected = max(1, int(summary.get("trips_detected", 1)))

    route_folder_key = route_dir.name  # p.ej. 'route_154193'

    wr_overrides[route_folder_key] = {
        "display_id": display_id,
        "color": color,
        "name": title,
    }

    if route_id:
        wr_overrides[route_id] = {
            "display_id": display_id,
            "color": color,
        }

    routes_map = wr_map.setdefault("routes", {})

    for trip in range(1, trips_detected + 1):
        if trips_detected == 1:
            suffix_key = ""
            suffix_label = ""
        else:
            if trip == 1 and trips_detected == 2:
                suffix_key = "-ida"
                suffix_label = " (ida)"
            elif trip == 2 and trips_detected == 2:
                suffix_key = "-vuelta"
                suffix_label = " (vuelta)"
            else:
                suffix_key = f"-trip{trip}"
                suffix_label = f" (trip {trip})"

        map_key = f"{display_id}{suffix_key}"
        folder_rel = route_dir.as_posix()

        routes_map[map_key] = {
            "folder": folder_rel,
            "trip": trip,
            "color": color,
            "name": title + suffix_label,
            "url": url,
            "city": city,
        }


def main():
    OUT_ROOT.mkdir(parents=True, exist_ok=True)

    driver = make_driver(headless=False, lang="es-ES")
    wait = WebDriverWait(driver, 60)

    try:
        driver.get(CATALOG_URL)
        time.sleep(3)

        # Asegurar pestaña "Todas las rutas"
        try:
            tab_todas = wait.until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "span#tabs-btn-1"))
            )
            driver.execute_script("arguments[0].click();", tab_todas)
        except TimeoutException:
            pass

        # Esperar a que haya chips
        esperar_chips(driver)

        # 1) Expandir todas las listas en el estado inicial (solo activas)
        expandir_listas(driver)

        # 2) Rutas activas visibles
        rutas_activas = obtener_links_rutas(driver)
        print(f"Total rutas activas: {len(rutas_activas)}")

        # 3) Click en el label "Mostrar rutas inactivas"
        label_inactivas = wait.until(
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR, "label[for='checkboxShowInactive']")
            )
        )
        driver.execute_script("arguments[0].click();", label_inactivas)
        time.sleep(2)

        # 4) Volver a expandir listas (ahora con inactivas)
        expandir_listas(driver)

        # 5) Rutas totales (activas + inactivas)
        rutas_totales = obtener_links_rutas(driver)
        print(f"Total rutas totales (activas + inactivas): {len(rutas_totales)}")

        set_activas = {r["name"] for r in rutas_activas}
        rutas_inactivas = [r for r in rutas_totales if r["name"] not in set_activas]
        print(f"Total rutas inactivas: {len(rutas_inactivas)}")

    finally:
        driver.quit()

    # Cargar JSON existentes (si hay) para acumular
    wr_map = cargar_json_si_existe(WR_MAP_JSON)
    wr_overrides = cargar_json_si_existe(WR_OVERRIDES_JSON)

    # Para cada ruta total, ejecutamos el scraper de Leaflet
    for idx, ruta in enumerate(rutas_totales, start=1):
        nombre = ruta["name"]
        url = ruta["url"]
        print(f"\n[{idx}/{len(rutas_totales)}] Scrapeando ruta: {nombre} -> {url}")

        try:
            out_dir = scrape_route(url, OUT_ROOT, headless=True)
        except Exception as e:
            print(f"[ERROR] Falló scrape_route para {url}: {e}")
            continue

        try:
            actualizar_wr_jsons(out_dir, wr_map, wr_overrides)
        except Exception as e:
            print(f"[WARN] No se pudo actualizar JSONs para {out_dir}: {e}")

    guardar_json(WR_MAP_JSON, wr_map)
    guardar_json(WR_OVERRIDES_JSON, wr_overrides)

    print("\nListo. JSON generados/actualizados:")
    print(f"  - {WR_MAP_JSON}")
    print(f"  - {WR_OVERRIDES_JSON}")


if __name__ == "__main__":
    main()


Total rutas activas: 418
Total rutas totales (activas + inactivas): 1596
Total rutas inactivas: 1175

[1/1596] Scrapeando ruta: 001p -> https://wikiroutes.info/es/lima?routes=90415
{
  "route_folder": "data\\processed\\transporte\\route_90415",
  "trips_detected": 2,
  "line_segments_total": 2,
  "points_total": 197
}

[2/1596] Scrapeando ruta: 1 (Expreso San Isidro) -> https://wikiroutes.info/es/lima?routes=52587
{
  "route_folder": "data\\processed\\transporte\\route_52587",
  "trips_detected": 1,
  "line_segments_total": 1,
  "points_total": 24
}

[3/1596] Scrapeando ruta: 2 (Expreso San Isidro) -> https://wikiroutes.info/es/lima?routes=52610
{
  "route_folder": "data\\processed\\transporte\\route_52610",
  "trips_detected": 1,
  "line_segments_total": 1,
  "points_total": 28
}

[4/1596] Scrapeando ruta: 010p -> https://wikiroutes.info/es/lima?routes=75063
{
  "route_folder": "data\\processed\\transporte\\route_75063",
  "trips_detected": 2,
  "line_segments_total": 2,
  "points_tot