# Transit on the State Highway Network (SHN)
* [GH Issue](https://github.com/cal-itp/data-analyses/issues/1477)


In [None]:
import geopandas as gpd
import google.auth
import numpy as np
import pandas as pd

credentials, project = google.auth.default()

import gcsfs

fs = gcsfs.GCSFileSystem()

In [None]:
from calitp_data_analysis import geography_utils, utils
from segment_speed_utils import gtfs_schedule_wrangling, helpers
from shared_utils import (
    catalog_utils,
    dask_utils,
    gtfs_utils_v2,
    portfolio_utils,
    publish_utils,
    rt_dates,
    rt_utils,
)
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS

In [None]:
# Notebook display settings: show up to 100 columns, print floats with two
# decimals, and remove the limits on displayed rows and column width.
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [None]:
# Combine the 2024, 2023 and 2025 date collections from rt_dates, in that order
# (presumably lists of date strings -- confirm in shared_utils.rt_dates).
analysis_date_list = rt_dates.y2024_dates + rt_dates.y2023_dates + rt_dates.y2025_dates

In [None]:
# Single date of interest; none of the functions defined in this notebook
# section take it as an argument.
analysis_date = "2025-03-12"

## Load in Operator Routes - Fix the way `shortest_longest` and `percentile_groups` are created in `merge_operator_data`

In [None]:
def process_transit_routes() -> gpd.GeoDataFrame:
    """
    Load operator route geometries and keep the latest version of each route.

    Reads the operator routes table from RT_SCHED_GCS, narrows it to the
    columns needed downstream, and inner-joins it against the output of
    `publish_utils.filter_to_recent_date` on operator, route_id and
    service_date. A `route_length_feet` column is added, holding the
    geometry length after projecting to CA_NAD83Albers_ft.

    Returns:
        GeoDataFrame with one row per
        (portfolio_organization_name, route_id, service_date).
    """
    keep_cols = [
        "service_date",
        "geometry",
        "portfolio_organization_name",
        "recent_combined_name",
        "route_id",
        "schedule_gtfs_dataset_key",
    ]
    route_keys = ["portfolio_organization_name", "route_id"]

    routes_path = (
        f"{RT_SCHED_GCS}{GTFS_DATA_DICT.digest_tables.operator_routes_map}.parquet"
    )
    all_routes = gpd.read_parquet(
        routes_path,
        storage_options={"token": credentials.token},
    )[keep_cols]

    # Latest service_date per operator-route
    # (assumes the helper returns the key columns plus service_date,
    # since those are the merge keys below -- confirm in publish_utils).
    latest_dates = publish_utils.filter_to_recent_date(
        df=all_routes,
        group_cols=[*route_keys],
    )

    latest_routes = pd.merge(
        all_routes,
        latest_dates,
        on=[*route_keys, "service_date"],
        how="inner",
    )

    # Measure each route in the projected CRS rather than the stored one.
    projected_length = latest_routes.geometry.to_crs(
        geography_utils.CA_NAD83Albers_ft
    ).length
    latest_routes = latest_routes.assign(route_length_feet=projected_length)

    # The same route can appear more than once for a date; keep the first row.
    return latest_routes.drop_duplicates(
        subset=[*route_keys, "service_date"]
    )

In [None]:
# Most recent geometry for every operator-route, with route_length_feet added.
transit_routes = process_transit_routes()

In [None]:
# Preview three random rows; geometry is dropped to keep the output readable.
transit_routes.drop(columns=["geometry"]).sample(3)

## Load in the SHN
* Reuse Tiffany's [code](https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/segment_speed_utils/parallel_corridors.py)

In [None]:
def dissolve_shn() -> gpd.GeoDataFrame:
    """
    Dissolve the State Highway Network so there is only one row
    for each route name and route type.

    Reads the SHN file listed in the shared data catalog, projects it to
    CA_NAD83Albers_ft, dissolves on Route / RouteType, renames those two
    columns to `shn_route` / `route_type`, and adds `highway_feet`, the
    length of each dissolved geometry in the projected CRS.

    The result is written to `shn_dissolved.parquet` in the
    state_highway_network GCS folder and is also returned, so callers can
    use it without re-reading the file.

    Returns:
        The dissolved GeoDataFrame, with `shn_route` cast to int.
    """
    # calitp-analytics-data/data-analyses/state_highway_network
    SHN_FILE = catalog_utils.get_catalog(
        "shared_data_catalog"
    ).state_highway_network.urlpath

    shn = gpd.read_parquet(
        SHN_FILE,
        storage_options={"token": credentials.token},
    ).to_crs(geography_utils.CA_NAD83Albers_ft)

    # Dissolve; County / District / Direction no longer describe a single
    # value once segments are merged, so they are dropped.
    shn_dissolved = (
        shn.dissolve(by=["Route", "RouteType"])
        .reset_index()
        .drop(columns=["County", "District", "Direction"])
    )

    shn_dissolved = shn_dissolved.rename(
        columns={"Route": "shn_route", "RouteType": "route_type"}
    )

    shn_dissolved = shn_dissolved.assign(
        highway_feet=shn_dissolved.geometry.length,
        shn_route=shn_dissolved.shn_route.astype(int),
    )

    shn_dissolved.to_parquet(
        "gs://calitp-analytics-data/data-analyses/state_highway_network/shn_dissolved.parquet"
    )

    # Previously the function fell off the end and returned None, which broke
    # `shn_dissolved = dissolve_shn()` followed by `shn_dissolved.explore()`.
    return shn_dissolved

In [None]:
# Path of the State Highway Network file, looked up in the shared data catalog.
# dissolve_shn() performs the same lookup internally.
SHN_FILE = catalog_utils.get_catalog(
    "shared_data_catalog"
).state_highway_network.urlpath

In [None]:
# shn = gpd.read_parquet(
#   SHN_FILE,
#   storage_options={"token": credentials.token},
# ).to_crs(geography_utils.CA_NAD83Albers_ft)

In [None]:
# Dissolve
# shn_dissolved = (
#   shn.dissolve(by=["Route", "RouteType"])
#   .reset_index()
#   .drop(columns=["County", "District", "Direction"])
# )

In [None]:
# Run the dissolve step, which also writes shn_dissolved.parquet to GCS.
# The name shn_dissolved is used again for the base map layer further down.
shn_dissolved = dissolve_shn()

### When I plot them using `explore` the routes are exactly the same
* Where to save dissolved file? 

In [None]:
# shn_dissolved.loc[shn_dissolved.shn_route == 210].drop(columns=["geometry"])

In [None]:
# shn_dissolved.loc[shn_dissolved.shn_route == 110].drop(columns=["geometry"])

In [None]:
# shn_dissolved.loc[shn_dissolved.Route == 210].explore()

In [None]:
# shn_dissolved.loc[shn_dissolved.Route == 110].explore()

In [None]:
# len(shn_dissolved)

In [None]:
# Buffer distances, in feet, applied around the highway geometry.
# SHN_HWY_BUFFER_FEET: narrow buffer used below to find routes running on a highway.
# PARALLEL_HWY_BUFFER_FEET: half a mile; not used by the cells in this section.
SHN_HWY_BUFFER_FEET = 50
PARALLEL_HWY_BUFFER_FEET = geography_utils.FEET_PER_MI * 0.5

In [None]:
def buffer_shn(buffer_amount: int) -> gpd.GeoDataFrame:
    """
    Buffer the dissolved SHN and cache the result in GCS.

    Reads `shn_dissolved.parquet` from the state_highway_network folder,
    replaces each geometry with its buffer of `buffer_amount` (in the
    units of the file's CRS), and writes the result to
    `shn_buffered_{buffer_amount}_ft.parquet` in the same folder so the
    buffer does not have to be recomputed.

    Args:
        buffer_amount: buffer distance; it also becomes part of the
            output file name.

    Returns:
        The buffered GeoDataFrame.
    """
    shn_folder = "gs://calitp-analytics-data/data-analyses/state_highway_network/"
    dissolved_path = f"{shn_folder}shn_dissolved.parquet"
    buffered_path = f"{shn_folder}shn_buffered_{buffer_amount}_ft.parquet"

    dissolved = gpd.read_parquet(dissolved_path)

    # Only the geometry changes; every attribute column is carried over.
    buffered_geometry = dissolved.geometry.buffer(buffer_amount)
    buffered = dissolved.assign(geometry=buffered_geometry)

    buffered.to_parquet(buffered_path)

    return buffered

In [None]:
# Buffer the dissolved SHN by 50 ft; this also writes shn_buffered_50_ft.parquet.
intersecting_buffer = buffer_shn(SHN_HWY_BUFFER_FEET)

## Overlay

In [None]:
def routes_shn_intersection(
    buffer_amount: int, pct_route_intersection: float
) -> gpd.GeoDataFrame:
    """
    Find transit routes that overlap the buffered State Highway Network.

    The buffered SHN file for `buffer_amount` is read from GCS when it
    exists and is created with `buffer_shn` otherwise. Transit routes are
    projected to the SHN's CRS and overlaid (intersection) with the
    buffered highways. For each overlay row, `pct_route_on_hwy` is the
    length of the overlapping piece divided by the full route length,
    rounded to 3 decimals.

    Args:
        buffer_amount: highway buffer distance, used to pick or build the
            buffered SHN file.
        pct_route_intersection: minimum `pct_route_on_hwy` (as a fraction,
            e.g. 0.2) a row needs to be kept.

    Returns:
        GeoDataFrame of the rows meeting the threshold, carrying the
        highway attributes and the full transit route geometry (not the
        clipped piece).
    """
    shn_folder = "gs://calitp-analytics-data/data-analyses/state_highway_network/"
    buffered_path = f"{shn_folder}shn_buffered_{buffer_amount}_ft.parquet"

    # Reuse the saved buffer when present; otherwise build and save it.
    buffered_shn = (
        gpd.read_parquet(buffered_path)
        if fs.exists(buffered_path)
        else buffer_shn(buffer_amount)
    )

    routes = process_transit_routes().to_crs(buffered_shn.crs)

    overlaid = gpd.overlay(
        routes, buffered_shn, how="intersection", keep_geom_type=True
    )

    # Share of each route that falls inside the highway buffer.
    share_on_hwy = (overlaid.geometry.length / overlaid.route_length_feet).round(3)
    overlaid = overlaid.assign(
        pct_route_on_hwy=share_on_hwy,
        shn_route=overlaid.shn_route.astype(int),
    )

    # The clipped geometry is not needed once the share is computed.
    overlap_attrs = overlaid.drop(columns=["geometry"])
    qualifying = overlap_attrs.loc[
        overlap_attrs.pct_route_on_hwy >= pct_route_intersection
    ]

    # Join back to the routes to recover the complete route geometry.
    route_cols = [
        "service_date",
        "portfolio_organization_name",
        "recent_combined_name",
        "route_id",
        "schedule_gtfs_dataset_key",
        "route_length_feet",
    ]
    return pd.merge(routes, qualifying, on=route_cols, how="inner")

In [None]:
# Keep route/highway matches where at least 20% of the route's length
# falls within the 50 ft highway buffer.
intersecting = routes_shn_intersection(SHN_HWY_BUFFER_FEET, 0.2)

In [None]:
# Number of (rows, columns) in the filtered result.
intersecting.shape

In [None]:
# Column names carried through the overlay and the join back to the routes.
intersecting.columns

In [None]:
# Preview ten random rows without the geometry column.
intersecting.drop(columns = ["geometry"]).sample(10)

In [None]:
# Spot-check one route from the intersection result on a map.
# service_date is dropped before plotting -- presumably because explore()
# cannot serialize that column; confirm.
intersecting.loc[intersecting.recent_combined_name == "5 Monterey - Carmel Rancho"].drop(columns = ["service_date"]).explore()

In [None]:
# Same route taken from the unfiltered transit routes, for visual comparison.
transit_routes.loc[transit_routes.recent_combined_name == "5 Monterey - Carmel Rancho"].drop(columns=["service_date"]).explore()

In [None]:
# Base layer: the dissolved highway network.
m = shn_dissolved.explore(name="shn")
# Overlay one operator's routes in red on the same map.
# NOTE(review): the layer is named "hqta" although it holds the Presidio Trust
# transit routes -- confirm whether the label is intentional.
m = (
    transit_routes.loc[transit_routes.portfolio_organization_name == "Presidio Trust"]
    .drop(columns=["service_date"])
    .explore(m=m, color="red", style_kwds={"weight": 5}, name="hqta")
)

In [None]:
# Display the combined map.
m