# Add SHN Info to Transit Routes in the middle of the Pipeline

In [None]:
import datetime

import geopandas as gpd
import google.auth
import numpy as np
import pandas as pd
import yaml
from calitp_data_analysis import geography_utils, utils
from calitp_data_analysis.geography_utils import WGS84
from segment_speed_utils import helpers
from shared_utils import (
    catalog_utils,
    portfolio_utils,
    publish_utils,
    rt_dates,
    rt_utils,
    schedule_rt_utils,
)
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS

# Default Google Cloud credentials; `credentials.token` is passed to the
# parquet reads further down via `storage_options`.
credentials, project = google.auth.default()

import gcsfs

# GCS filesystem handle; `routes_shn_intersection` uses it to check whether
# a buffered SHN file already exists.
fs = gcsfs.GCSFileSystem()


# import sys
# sys.path.append("../open_data")
# import create_routes_data

In [None]:
# Notebook display settings: show up to 100 columns, format floats with two
# decimals, and never truncate rows or cell contents.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [None]:
# Date looked up under the "jun2025" key of the shared `rt_dates.DATES` mapping.
# NOTE(review): `analysis_date` is not referenced again in this notebook; the
# parquet paths below hard-code 2025-06-11 instead -- confirm the two agree.
analysis_date = rt_dates.DATES["jun2025"]

## Take result from `open_data/create_routes_data`

In [None]:
# Buffer width (feet) used to pick the buffered-SHN files read in this notebook.
SHN_HWY_BUFFER_FEET = 50

In [None]:
# NOTE(review): this rebinds the `GTFS_DATA_DICT` name already imported from
# `update_vars` at the top of the notebook.
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")
# Cell output only: shows the test folder path; the value is not stored.
f"{GTFS_DATA_DICT.gcs_paths.GCS}AH_TEST_traffic_ops/"

In [None]:
# Load the 2025-06-11 transit routes export from the AH_TEST_traffic_ops folder.
gdf = gpd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/AH_TEST_traffic_ops/ca_transit_routes_2025-06-11.parquet",
    storage_options={"token": credentials.token},
)

In [None]:
gdf.columns

### Drop duplicates: the dataframe goes down by 1000 rows.

In [None]:
# Row count once fully identical rows are removed (compare with `len(gdf)`).
len(gdf.drop_duplicates())

In [None]:
# Keep the de-duplicated frame for the rest of the notebook.
gdf = gdf.drop_duplicates()

In [None]:
# Spot-check three random routes whose `pct_route_on_hwy_across_districts`
# is above 20, transposed so each route reads as a column.
# `sample(3)` raises ValueError if fewer than three rows pass the filter.
gdf.loc[gdf.pct_route_on_hwy_across_districts > 20].sample(3).drop(
    columns=["geometry", "base64_url", "schedule_gtfs_dataset_key"]
).T

In [None]:
# Distribution of the existing `on_shs` flag in the loaded file.
gdf.on_shs.value_counts()

In [None]:
gdf.pct_route_on_hwy_across_districts.describe()

In [None]:
gdf.columns

In [None]:
# Number of distinct route names, for comparison with the overlay result later.
gdf.route_name_used.nunique()

In [None]:
shn_district_df = gpd.read_parquet(
    f"gs://calitp-analytics-data/data-analyses/state_highway_network/shn_buffered_50_ft_ct_district_route.parquet",
    storage_options={"token": credentials.token},
)

In [None]:
shn_district_df.columns

In [None]:
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/state_highway_network/"
shn_gdf = gpd.read_parquet(
    f"{GCS_FILE_PATH}shn_buffered_{SHN_HWY_BUFFER_FEET}_gtfs_digest.parquet",
    storage_options={"token": credentials.token},
)

In [None]:
len(shn_gdf)

In [None]:
# Base map: the buffered SHN drawn half-transparent on CartoDB positron tiles.
m = shn_gdf.explore(
    name="district",
    tiles="CartoDB positron",
    style_kwds={"color": "#9DA4A6", "opacity": 0.5},
    height=500,
    width=1000,
    legend=False,
)

In [None]:
# Select one shape of the route named "RTS" to inspect against the SHN.
one_route = gdf.loc[(gdf.route_name_used == "RTS") & (gdf.shape_id == "p_1439962")]

In [None]:
one_route.drop(columns=["geometry"])

In [None]:
# Draw the selected route on top of the SHN base map `m`.
one_route.explore(
    m=m,
    cmap="Spectral",
    categorical=True,
    legend=False,
    legend_kwds={"width": 200},
)

## Overlay the geodataframe with the buffered state highway routes.

In [None]:
def routes_shn_intersection(
    routes_gdf: gpd.GeoDataFrame, buffer_amount: int, file_name: str
) -> gpd.GeoDataFrame:
    """
    Intersect transit routes with a buffered State Highway Network (SHN).

    routes_gdf: transit routes; must carry a `route_length_feet` column.
    buffer_amount: buffer width that appears in the SHN file name.
    file_name: trailing part of the SHN file name, e.g. "ct_district_route".

    Returns the overlay result with `pct_route_on_hwy_across_districts`
    added and the SHN `district` column renamed to `shn_districts`.
    The geometry returned is the overlay geometry, not the original route shape.
    """
    shn_folder = "gs://calitp-analytics-data/data-analyses/state_highway_network/"
    shn_path = f"{shn_folder}shn_buffered_{buffer_amount}_ft_{file_name}.parquet"

    # Use the saved buffered SHN when it exists, otherwise buffer it again.
    # NOTE(review): `buffer_shn` is not defined or imported in the visible
    # notebook -- confirm where it comes from before relying on this branch.
    if not fs.exists(shn_path):
        shn_routes_gdf = buffer_shn(buffer_amount)
    else:
        shn_routes_gdf = gpd.read_parquet(
            shn_path, storage_options={"token": credentials.token}
        )

    # Put the routes in the SHN's CRS so both layers share coordinates
    # before they are overlaid.
    routes_in_shn_crs = routes_gdf.to_crs(shn_routes_gdf.crs)

    overlaid = gpd.overlay(
        routes_in_shn_crs, shn_routes_gdf, how="intersection", keep_geom_type=True
    )

    # Share of each route's length covered by the intersected piece, rounded
    # to 3 decimals and then expressed as a percentage.
    # NOTE(review): assumes the SHN CRS measures length in feet, matching
    # `route_length_feet` -- confirm.
    share_on_hwy = overlaid.geometry.length / overlaid.route_length_feet
    overlaid = overlaid.assign(pct_route_on_hwy=share_on_hwy.round(3) * 100)

    return overlaid.rename(
        columns={
            "pct_route_on_hwy": "pct_route_on_hwy_across_districts",
            "district": "shn_districts",
        }
    )

In [None]:
# Overlay the de-duplicated routes with the district/route buffered SHN file.
intersecting = routes_shn_intersection(gdf, SHN_HWY_BUFFER_FEET, "ct_district_route")

In [None]:
# Distinct route names that intersect the SHN (compare with the count on `gdf`).
intersecting.route_name_used.nunique()

In [None]:
intersecting.columns

In [None]:
def group_route_district(df: pd.DataFrame, pct_route_on_hwy_agg: str) -> pd.DataFrame:
    """
    Collapse the route/SHN overlay to a single row per transit route.

    For every route, the SHN routes and districts it touches are combined into
    comma-separated strings (unique values, sorted as text) and
    `pct_route_on_hwy_across_districts` is aggregated.

    df: the dataframe you want to aggregate; it must contain the grouping
        columns listed in `route_cols` plus `shn_route`, `shn_districts` and
        `pct_route_on_hwy_across_districts`.
    pct_route_on_hwy_agg: pandas aggregation to apply to
        `pct_route_on_hwy_across_districts` ("max", "min", "sum", etc).

    Returns a dataframe with the grouping columns and the three aggregated
    columns; the percentage is cast to float and rounded to 2 decimals.
    """
    route_cols = [
        "n_trips",
        "schedule_gtfs_dataset_key",
        "route_id",
        "route_type",
        "shape_id",
        "route_name_used",
        "name",
        "base64_url",
        "organization_source_record_id",
        "organization_name",
        "caltrans_district",
    ]
    pct_col = "pct_route_on_hwy_across_districts"

    def _join_unique(values: pd.Series) -> str:
        # Sort the unique values: joining a bare set yields an arbitrary order
        # that can change from one run to the next, making outputs unstable.
        return ", ".join(sorted(set(values.astype(str))))

    # NOTE(review): groupby drops rows whose grouping keys contain NaN by
    # default, so routes missing e.g. `caltrans_district` will not appear here.
    agg1 = (
        df.groupby(route_cols, as_index=False)[["shn_route", "shn_districts", pct_col]]
        .agg(
            {
                "shn_route": _join_unique,
                "shn_districts": _join_unique,
                pct_col: pct_route_on_hwy_agg,
            }
        )
        .reset_index(drop=True)
    )

    # Clean up
    agg1[pct_col] = agg1[pct_col].astype(float).round(2)

    return agg1

In [None]:
# One row per route, summing the percentage over every SHN piece it touches.
agg1 = group_route_district(intersecting, "sum")

In [None]:
# Sanity check: routes whose summed percentage exceeds 100.
agg1.loc[agg1.pct_route_on_hwy_across_districts > 100]

In [None]:
agg1.pct_route_on_hwy_across_districts.describe()

In [None]:
# Compare the aggregated row/route counts with the original routes frame.
len(agg1)

In [None]:
gdf.route_id.nunique()

In [None]:
len(gdf)

In [None]:
agg1.route_id.nunique()

In [None]:
agg1.sample()

## Merge agg1 back with the original dataframe

In [None]:
m1 = pd.merge(gdf, agg1, how="outer", indicator=True)

In [None]:
m1.loc[m1.]

In [None]:
m1._merge.value_counts()

In [None]:
m1.info()

In [None]:
m1.shape

In [None]:
len(gdf)

In [None]:
type(m1)

In [None]:
def create_on_shs_column(df):
    """
    Add a "Y"/"N" `on_shs` column to `df`.

    A row is "Y" when `pct_route_on_hwy_across_districts` is present and
    non-zero, and "N" when it is missing or equal to 0. The column is written
    onto `df` itself and that same dataframe is returned.
    """
    pct_on_hwy = df["pct_route_on_hwy_across_districts"]
    touches_shs = pct_on_hwy.notna() & (pct_on_hwy != 0)
    df["on_shs"] = np.where(touches_shs, "Y", "N")
    return df

In [None]:
# Flag each merged row as on / off the State Highway System.
m1 = create_on_shs_column(m1)

In [None]:
# Blank out the SHN route/district text on rows flagged on_shs == "N",
# except rows whose `shn_districts` is the string "0".
m1.loc[
    (m1["on_shs"] == "N") & (m1["shn_districts"] != "0"),
    ["shn_districts", "shn_route"],
] = np.nan

In [None]:
m1.on_shs.value_counts()

In [None]:
m1.drop(columns=["geometry", "base64_url"]).sample(3)

### Map

In [None]:
# Same read as in the earlier map section of this notebook; it repeats so this
# section can be run on its own.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/state_highway_network/"
shn_gdf = gpd.read_parquet(
    f"{GCS_FILE_PATH}shn_buffered_{SHN_HWY_BUFFER_FEET}_gtfs_digest.parquet",
    storage_options={"token": credentials.token},
)

In [None]:
# Build a fresh base map; the earlier `m` already had a route drawn on it.
m = shn_gdf.explore(
    name="district",
    tiles="CartoDB positron",
    style_kwds={"color": "#9DA4A6", "opacity": 0.5},
    height=500,
    width=1000,
    legend=False,
)

In [None]:
# Same route/shape as before, this time taken from the overlay result.
one_route = intersecting.loc[
    (intersecting.route_name_used == "RTS") & (intersecting.shape_id == "p_1439962")
]

In [None]:
# The explore call below is wrapped in a string literal, so it does not run.
""" one_route.explore(
    m=m,
    cmap="Spectral",
    categorical=True,
    legend=False,
    legend_kwds={"width": 200},
)"""

## Function: wrap the steps above into one call

In [None]:
def add_shn_information(gdf: gpd.GeoDataFrame, buffer_amt: int) -> pd.DataFrame:
    """
    Attach State Highway Network (SHN) columns to the transit routes
    dataframe that is published on the Open Data Portal.

    gdf: transit routes to enrich.
    buffer_amt: SHN buffer width, forwarded to `routes_shn_intersection`.

    Returns the de-duplicated routes left-merged with the per-route SHN
    summary, plus a "Y"/"N" `on_shs` column.
    """
    unique_routes = gdf.drop_duplicates()

    # Intersect the routes with the buffered SHN, then collapse the result
    # so each route has one row with its summed percentage.
    overlay = routes_shn_intersection(unique_routes, buffer_amt, "ct_district_route")
    per_route = group_route_district(overlay, "sum")

    # A left merge keeps every original route (and its geometry), including
    # routes that never intersect the SHN.
    routes_with_shn = pd.merge(unique_routes, per_route, how="left")
    routes_with_shn = create_on_shs_column(routes_with_shn)

    # Rows flagged "N" should not carry SHN route/district text; rows whose
    # `shn_districts` is the string "0" are left untouched.
    stale_shn_values = (routes_with_shn["on_shs"] == "N") & (
        routes_with_shn["shn_districts"] != "0"
    )
    routes_with_shn.loc[stale_shn_values, ["shn_districts", "shn_route"]] = np.nan
    return routes_with_shn

In [None]:
# Reload the untouched 2025-06-11 routes file to run the function end to end.
og_gdf = gpd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/AH_TEST_traffic_ops/ca_transit_routes_2025-06-11.parquet",
    storage_options={"token": credentials.token},
)

In [None]:
test = add_shn_information(og_gdf, SHN_HWY_BUFFER_FEET)

In [None]:
test.shape

In [None]:
test.columns

In [None]:
# Confirm the geometries are still valid after the merge.
test.geometry.is_valid.value_counts()

In [None]:
# Undated routes file in the same test folder, loaded for a shape comparison.
gdf_from_function = gpd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/AH_TEST_traffic_ops/ca_transit_routes.parquet",
    storage_options={"token": credentials.token},
)

In [None]:
# Dated (2025-06-11) routes file, same path as `og_gdf` above.
gdf_from_function2 = gpd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/AH_TEST_traffic_ops/ca_transit_routes_2025-06-11.parquet",
    storage_options={"token": credentials.token},
)

In [None]:
gdf_from_function2.shape

In [None]:
gdf_from_function.shape