# Add SHN Info to Transit Routes in the middle of the Pipeline

In [1]:
import datetime

import geopandas as gpd
import google.auth
import numpy as np
import pandas as pd
import yaml
from calitp_data_analysis import geography_utils, utils
from calitp_data_analysis.geography_utils import WGS84
from segment_speed_utils import helpers
from shared_utils import (
    catalog_utils,
    portfolio_utils,
    publish_utils,
    rt_dates,
    rt_utils,
    schedule_rt_utils,
    shared_data,
)
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS

credentials, project = google.auth.default()

import gcsfs

fs = gcsfs.GCSFileSystem()

In [2]:
# Notebook display settings: show up to 100 columns, two-decimal floats,
# and never truncate rows or cell contents.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [3]:
analysis_date_list = rt_dates.y2025_dates

In [4]:
analysis_date_list[0]

'2025-01-15'

In [5]:
date = analysis_date_list[0]

In [6]:
ah_testing_shn = "gs://calitp-analytics-data/data-analyses/ah_testing/ca_transit_routes.parquet"

In [7]:
ah_testing_shn_gdf = gpd.read_parquet(ah_testing_shn, storage_options={"token": credentials.token})

In [8]:
ah_testing_shn_gdf.shape

(11413, 13)

In [9]:
ah_testing_shn_gdf.columns

Index(['org_id', 'agency', 'route_id', 'route_type', 'route_name', 'shape_id',
       'n_trips', 'base64_url', 'shn_route', 'on_shs', 'shn_districts',
       'pct_route_on_hwy_across_districts', 'geometry'],
      dtype='object')

In [10]:
og_url = "gs://calitp-analytics-data/data-analyses/traffic_ops/ca_transit_routes.parquet"

In [11]:
og_gdf = gpd.read_parquet(og_url, storage_options={"token": credentials.token})

In [12]:
og_gdf.shape

(11633, 9)

In [13]:
og_gdf.columns

Index(['org_id', 'agency', 'route_id', 'route_type', 'route_name', 'shape_id',
       'n_trips', 'base64_url', 'geometry'],
      dtype='object')

In [14]:
# Columns present in the ah_testing file but missing from the traffic_ops file.
set(ah_testing_shn_gdf.columns) - set(og_gdf.columns)

{'on_shs', 'pct_route_on_hwy_across_districts', 'shn_districts', 'shn_route'}

In [15]:
ah_testing = "gs://calitp-analytics-data/data-analyses/ah_testing/ca_transit_routes_2025-07-16.parquet"

In [16]:
ah_gdf = gpd.read_parquet(ah_testing, storage_options={"token": credentials.token})

In [17]:
ah_gdf.shape

(10267, 13)

In [18]:
SHN_HWY_BUFFER_FEET = 50

In [19]:
ah_gdf.drop(columns=["geometry"]).sample()

Unnamed: 0,n_trips,schedule_gtfs_dataset_key,route_id,route_type,shape_id,route_name_used,name,base64_url,organization_source_record_id,organization_name,caltrans_district,route_length_feet
2515,86,3364ec074ca85001da3abd78be2ae521,215,3,S2_215_2_25,215,San Diego Schedule,aHR0cHM6Ly93d3cuc2RtdHMuY29tL2dvb2dsZV90cmFuc2l0X2ZpbGVzL2dvb2dsZV90cmFuc2l0LnppcA==,rech5YtfjpQvVIBAF,Flagship Cruises and Events Inc.,11 - San Diego,52384.73


## `open_data/create_routes_data/add_shn_information`

In [None]:
def routes_shn_intersection(
    routes_gdf: gpd.GeoDataFrame, buffer_amount: int
) -> gpd.GeoDataFrame:
    """
    Overlay transit routes with a buffered version of the State Highway
    Network (SHN) and measure how much of each route falls inside it.

    routes_gdf: transit routes. Must contain `route_length_feet` and the
    route identifier columns selected below.
    buffer_amount: buffer size that appears in the name of the cached,
    pre-buffered SHN parquet ("shn_buffered_<buffer_amount>_ft_...").

    Returns one row per intersecting transit route / SHN feature pair with
    `shn_route`, `shn_districts`, `highway_feet` and
    `pct_route_on_hwy_across_districts`. The geometry column is not part
    of the returned columns.
    """
    GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/shared_data/"

    # Read in the buffered SHN if it is already cached, otherwise buffer it again.
    HWY_FILE = f"{GCS_FILE_PATH}shn_buffered_{buffer_amount}_ft_shn_dissolved_by_ct_district_route.parquet"

    if fs.exists(HWY_FILE):
        shn_routes_gdf = gpd.read_parquet(
            HWY_FILE, storage_options={"token": credentials.token}
        )
    else:
        # NOTE(review): the signature of shared_data.buffer_shn is not visible
        # here - confirm it accepts the buffer amount as its only argument.
        shn_routes_gdf = shared_data.buffer_shn(buffer_amount)

    # Put the transit routes in the same CRS as the SHN so the overlay
    # compares like with like.
    routes_gdf = routes_gdf.to_crs(shn_routes_gdf.crs)

    # Keep only the parts of each transit route that fall inside the
    # buffered SHN.
    gdf = gpd.overlay(
        routes_gdf, shn_routes_gdf, how="intersection", keep_geom_type=True
    )

    # Share of the route's length that lies inside the buffer: the ratio is
    # rounded to 3 decimal places and then expressed as a percent.
    # Presumably both lengths are in feet (the SHN CRS decides the unit of
    # geometry.length) - TODO confirm.
    gdf = gdf.assign(
        pct_route_on_hwy=(gdf.geometry.length / gdf.route_length_feet).round(3) * 100,
    )

    # Keep attribute columns only. Leaving "geometry" out drops the
    # intersection shapes; the original route shapes are joined back later.
    keep_cols = [
        "district",
        "highway_feet",
        "shn_route",
        "pct_route_on_hwy",
        "n_trips",
        "schedule_gtfs_dataset_key",
        "route_id",
        "route_type",
        "shape_id",
        "route_name_used",
    ]

    # Clean up. `assign` builds a new frame, so the column is not written
    # into a slice of `gdf` (which raises SettingWithCopyWarning).
    gdf2 = gdf[keep_cols].assign(
        district=lambda df: df["district"].fillna(0).astype(int)
    )

    gdf2 = gdf2.rename(
        columns={
            "pct_route_on_hwy": "pct_route_on_hwy_across_districts",
            "district": "shn_districts",
        }
    )
    return gdf2

In [None]:
intersecting = routes_shn_intersection(ah_gdf, SHN_HWY_BUFFER_FEET)

In [None]:
intersecting.columns

In [None]:
# The `routes_shn_intersection` defined above does not return a geometry
# column, so only drop it when it is actually there.
intersecting.sample().drop(columns=["geometry"], errors="ignore").T

In [None]:
# NOTE(review): the `routes_shn_intersection` defined earlier in this notebook
# already applies this same rename before returning, so when `intersecting`
# comes from that version there is nothing left for this cell to rename.
intersecting = intersecting.rename(
    columns={
        "pct_route_on_hwy": "pct_route_on_hwy_across_districts",
        "district": "shn_districts",
    }
)

In [None]:
def _join_unique(values: pd.Series) -> str:
    """
    Collapse a column into one comma-separated string of its distinct
    values in a stable order: whole numbers ascending first, then any
    other text alphabetically.
    """
    distinct = set(values.astype(str))
    return ", ".join(
        sorted(
            distinct,
            key=lambda v: (not v.isdecimal(), int(v) if v.isdecimal() else 0, v),
        )
    )


def group_route_district(df: pd.DataFrame, pct_route_on_hwy_agg: str) -> pd.DataFrame:
    """
    Aggregate so each transit route has a single row: all of its SHN
    routes and districts are listed in one comma-separated string each,
    and the percent of the route on the SHN is combined across rows.

    df: the dataframe you want to aggregate. Needs the route identifier
    columns used as group keys plus "shn_route", "shn_districts" and
    "pct_route_on_hwy_across_districts".
    pct_route_on_hwy_agg: name of the aggregation ("max", "min", "sum",
    etc.) applied to "pct_route_on_hwy_across_districts".

    Returns one row per group with the percent rounded to 2 decimals.
    """
    route_cols = [
        "n_trips",
        "schedule_gtfs_dataset_key",
        "route_id",
        "route_type",
        "shape_id",
        "route_name_used",
    ]

    # Joining a bare set gives an arbitrary order for strings, so the same
    # route could come out as "5, 101" on one run and "101, 5" on the next.
    # _join_unique sorts the values to keep the output reproducible.
    agg1 = df.groupby(route_cols, as_index=False)[
        ["shn_route", "shn_districts", "pct_route_on_hwy_across_districts"]
    ].agg(
        {
            "shn_route": _join_unique,
            "shn_districts": _join_unique,
            "pct_route_on_hwy_across_districts": pct_route_on_hwy_agg,
        }
    )

    # Clean up
    agg1.pct_route_on_hwy_across_districts = (
        agg1.pct_route_on_hwy_across_districts.astype(float).round(2)
    )
    return agg1

In [None]:
agg1 = group_route_district(intersecting, "sum")

In [None]:
agg1["on_shs"] = np.where(agg1["pct_route_on_hwy_across_districts"] == 0, "N", "Y")

In [None]:
agg1.loc[
    (agg1["on_shs"] == "N") & (agg1["shn_districts"] != "0"),
    ["shn_districts", "shn_route"],
] = np.nan

In [None]:
agg1.columns

In [None]:
def add_shn_information(gdf: gpd.GeoDataFrame, buffer_amt: int) -> pd.DataFrame:
    """
    Prepare the gdf to join with the existing transit_routes
    dataframe that is published on the Open Data Portal.

    gdf: transit routes, with the route identifier columns used as merge
    keys below.
    buffer_amt: buffer size passed through to `routes_shn_intersection`.

    Returns every row of `gdf` with `shn_route`, `shn_districts`,
    `pct_route_on_hwy_across_districts` and `on_shs` ("Y"/"N") added.
    """
    route_cols = [
        "n_trips",
        "schedule_gtfs_dataset_key",
        "route_id",
        "route_type",
        "shape_id",
        "route_name_used",
    ]

    # Overlay
    intersecting = routes_shn_intersection(gdf, buffer_amt)

    # Group the dataframe so that one route only has one
    # row instead of multiple rows after finding its
    # intersection with any SHN routes.
    agg1 = group_route_district(intersecting, "sum")

    # Add yes/no column to signify if a transit route intersects
    # with a SHN route
    agg1["on_shs"] = np.where(agg1["pct_route_on_hwy_across_districts"] == 0, "N", "Y")

    # Blank out the SHN route/district labels on rows whose percent came out
    # as 0 (tagged "N") but that still list districts.
    agg1.loc[
        (agg1["on_shs"] == "N") & (agg1["shn_districts"] != "0"),
        ["shn_districts", "shn_route"],
    ] = np.nan

    # Join back the dataframe above with the original transit route dataframes
    # so we can have the original transit route geographies.
    m1 = pd.merge(
        gdf,
        agg1,
        on=route_cols,
        how="left",
    )

    # Routes that had no row in agg1 come out of the left merge with NaN.
    # Tag them explicitly so `on_shs` is always "Y" or "N" and the percent
    # is 0 rather than missing.
    m1["on_shs"] = m1["on_shs"].fillna("N")
    m1["pct_route_on_hwy_across_districts"] = m1[
        "pct_route_on_hwy_across_districts"
    ].fillna(0)
    return m1

In [None]:
published_routes = add_shn_information(ah_gdf, 50)

In [None]:
published_routes.columns

In [None]:
published_routes.head()

## Move the bulk of the SHN work from `open_data/create_routes_data` to `_shared_utils/shared_data`

In [None]:
shn = shared_data.make_clean_state_highway_network()

In [None]:
SHN_FILE = catalog_utils.get_catalog(
    "shared_data_catalog"
).state_highway_network.urlpath

shn = gpd.read_parquet(
    SHN_FILE,
    storage_options={"token": credentials.token},
).to_crs(geography_utils.CA_NAD83Albers_ft)

In [None]:
shn.columns

In [None]:
shn.drop(columns=["geometry"]).sample(5)

In [None]:
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/shared_data/"

In [None]:
def dissolve_shn_district() -> gpd.GeoDataFrame:
    """
    Collapse the State Highway Network (SHN) to one row per highway route
    and district, add each row's length, and save the result to GCS.

    Returns the dissolved GeoDataFrame with the columns `shn_route`,
    `district`, `geometry` and `highway_feet`.
    """
    # Look up the SHN file in the shared data catalog and load it.
    catalog = catalog_utils.get_catalog("shared_data_catalog")
    raw_shn = gpd.read_parquet(
        catalog.state_highway_network.urlpath,
        storage_options={"token": credentials.token},
    )

    # Reproject to CA_NAD83Albers_ft; presumably a feet-based CRS, which is
    # what makes the lengths below come out in feet - confirm.
    projected_shn = raw_shn.to_crs(geography_utils.CA_NAD83Albers_ft)

    # Merge all segments sharing a route and district, then keep only the
    # identifying columns and the geometry.
    by_route_district = projected_shn.dissolve(by=["Route", "District"]).reset_index()
    by_route_district = by_route_district[["Route", "District", "geometry"]]

    # "Route" becomes "shn_route" so it is not mistaken for a transit route;
    # the remaining column names are lower-cased.
    by_route_district = by_route_district.rename(columns={"Route": "shn_route"})
    by_route_district.columns = by_route_district.columns.str.lower()

    # Length of each dissolved highway, and the route number as a string
    # (cast through int first).
    highway_lengths = by_route_district.geometry.length
    route_labels = by_route_district.shn_route.astype(int).astype(str)
    by_route_district = by_route_district.assign(
        highway_feet=highway_lengths,
        shn_route=route_labels,
    )

    # Cache the result so the dissolve does not have to be repeated.
    output_path = f"{GCS_FILE_PATH}shn_dissolved_by_ct_district_route.parquet"
    by_route_district.to_parquet(output_path, filesystem=fs)
    return by_route_district

In [None]:
dissolved_route = dissolve_shn_district()

In [None]:
def buffer_shn(buffer_amount: int, file_name: str) -> gpd.GeoDataFrame:
    """
    Buffer the dissolved SHN so it can be overlaid with transit routes.

    buffer_amount: distance passed to `geometry.buffer`, in the units of
    the stored file's CRS.
    file_name: name (without ".parquet") of the dissolved SHN file under
    the module-level GCS_FILE_PATH.

    Returns the buffered GeoDataFrame, which is also written back to GCS
    as "shn_buffered_<buffer_amount>_ft_<file_name>.parquet".
    """
    # Load the dissolved SHN.
    source_path = f"{GCS_FILE_PATH}{file_name}.parquet"
    dissolved_shn = gpd.read_parquet(
        source_path,
        storage_options={"token": credentials.token},
    )

    # Swap each highway line for its buffered shape.
    buffered_geometry = dissolved_shn.geometry.buffer(buffer_amount)
    buffered_shn = dissolved_shn.assign(geometry=buffered_geometry)

    # Cache the buffered version so later runs can read it directly.
    cache_path = f"{GCS_FILE_PATH}shn_buffered_{buffer_amount}_ft_{file_name}.parquet"
    buffered_shn.to_parquet(cache_path, filesystem=fs)

    return buffered_shn

In [None]:
SHN_HWY_BUFFER_FEET = 50
PARALLEL_HWY_BUFFER_FEET = geography_utils.FEET_PER_MI * 0.5

In [None]:
buffered_shn = buffer_shn(SHN_HWY_BUFFER_FEET, "shn_dissolved_by_ct_district_route")

## Test function in `create_routes_data`

In [None]:
trips = helpers.import_scheduled_trips(
    date,
    columns=[
        "gtfs_dataset_key",
        "route_id",
        "route_type",
        "shape_id",
        "shape_array_key",
        "route_long_name",
        "route_short_name",
        "route_desc",
    ],
    get_pandas=True,
).dropna(subset="shape_array_key")

In [None]:
trips.sample()

In [None]:
shapes = helpers.import_scheduled_shapes(
    date, columns=["shape_array_key", "n_trips", "geometry"], get_pandas=True, crs=WGS84
).dropna(subset="shape_array_key")

In [None]:
shapes.sample().drop(columns=["geometry"])

In [None]:
df = (
    pd.merge(shapes, trips, on="shape_array_key", how="inner")
    .drop_duplicates(subset="shape_array_key")
    .drop(columns="shape_array_key")
)

In [None]:
df.shape

In [None]:
drop_cols = ["route_short_name", "route_long_name", "route_desc"]
route_shape_cols = ["schedule_gtfs_dataset_key", "route_id", "shape_id"]

In [None]:
def remove_erroneous_shapes(
    shapes_with_route_info: gpd.GeoDataFrame,
) -> gpd.GeoDataFrame:
    """
    Drop Amtrak shapes whose line is not simple (it crosses itself).

    Rows whose `name` is "Amtrak Schedule" are kept only when their
    geometry is simple; every other row is kept as-is. The result has a
    fresh index, with the non-Amtrak rows first.

    Background: in Jun 2023 some Amtrak shapes looked wrong, while prior
    months were fine. Checking length instead is fairly time-consuming.
    """
    amtrak = "Amtrak Schedule"

    is_amtrak = shapes_with_route_info.name == amtrak
    amtrak_shapes = shapes_with_route_info[is_amtrak]
    other_shapes = shapes_with_route_info[~is_amtrak]

    # Flag each Amtrak line, keep the simple ones, then remove the flag.
    flagged = amtrak_shapes.assign(simple=amtrak_shapes.geometry.is_simple)
    simple_amtrak = flagged.loc[flagged["simple"]].drop(columns="simple")

    return pd.concat([other_shapes, simple_amtrak], axis=0).reset_index(drop=True)

### Didn't reach the `routes_assembled2` step because the different imports were causing issues.

In [None]:
routes_assembled = (
    portfolio_utils.add_route_name(df)
    .drop(columns=drop_cols)
    .sort_values(route_shape_cols)
    .drop_duplicates(subset=route_shape_cols)
    .reset_index(drop=True)
)

In [None]:
routes_assembled.shape

In [None]:
routes_assembled.columns

### Add length to the transit routes.

In [None]:
routes_assembled = routes_assembled.assign(
    route_length_feet=routes_assembled.geometry.to_crs(
        geography_utils.CA_NAD83Albers_ft
    ).length
)

## Overlay the transit routes with the SHN 

In [None]:
def routes_shn_intersection(
    routes_gdf: gpd.GeoDataFrame, buffer_amount: int
) -> gpd.GeoDataFrame:
    """
    Overlay transit routes with the buffered SHN and attach the overlay
    attributes back onto the route shapes.

    routes_gdf: transit routes, including `route_length_feet` and the
    identifier columns used as merge keys below.
    buffer_amount: buffer size that appears in the cached SHN file name.

    Returns the routes (reprojected to the SHN's CRS) left-joined with the
    overlay attributes, including `pct_route_on_hwy`. A route gets one row
    per SHN feature it intersects; routes with no intersection keep a
    single row with `district` set to 0.
    """
    GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/shared_data/"
    buffered_shn_path = f"{GCS_FILE_PATH}shn_buffered_{buffer_amount}_ft_shn_dissolved_by_ct_district_route.parquet"

    # Use the cached buffered SHN when it exists; otherwise rebuild it.
    if not fs.exists(buffered_shn_path):
        buffered_shn = shared_data.buffer_shn(buffer_amount)
    else:
        buffered_shn = gpd.read_parquet(
            buffered_shn_path, storage_options={"token": credentials.token}
        )

    # Both layers must share a CRS for the overlay to line up.
    routes_gdf = routes_gdf.to_crs(buffered_shn.crs)

    overlaid = gpd.overlay(
        routes_gdf, buffered_shn, how="intersection", keep_geom_type=True
    )

    # Portion of each route inside the buffer, as a percent. The overlay
    # geometry is dropped because the route shapes are joined back below.
    share_on_hwy = (overlaid.geometry.length / overlaid.route_length_feet).round(3) * 100
    overlay_attrs = overlaid.assign(pct_route_on_hwy=share_on_hwy).drop(
        columns=["geometry"]
    )

    # Left join so every route keeps its shape, whether or not it
    # intersects the SHN.
    merge_keys = [
        "n_trips",
        "schedule_gtfs_dataset_key",
        "route_id",
        "route_type",
        "shape_id",
        "route_name_used",
        "route_length_feet",
    ]
    merged = pd.merge(routes_gdf, overlay_attrs, on=merge_keys, how="left")

    # Routes without any intersection have no district; mark them as 0.
    merged["district"] = merged["district"].fillna(0).astype(int)
    return merged

In [None]:
intersecting = routes_shn_intersection(routes_assembled, 50)

In [None]:
len(intersecting)

In [None]:
intersecting.pct_route_on_hwy.describe()

In [None]:
intersecting.columns

### Find routes that cross multiple districts

In [None]:
# Find routes that cross multiple districts
# Count how many distinct districts each route appears in.
multi_district_routes = (
    intersecting.groupby(["schedule_gtfs_dataset_key", "route_name_used", "route_id"])[
        "district"
    ]
    .nunique()
    .reset_index()
)

In [None]:
multi_district_routes.district.describe()

In [None]:
multi_district_routes.sort_values(by=["district"], ascending=False).head(10)

## Routes that overlap with multiple SHN routes now have more than one row. Change it so each route has only one row.

In [None]:
intersecting.columns

In [None]:
def _join_unique(values: pd.Series) -> str:
    """
    Collapse a column into one comma-separated string of its distinct
    values in a stable order: whole numbers ascending first, then any
    other text alphabetically.
    """
    distinct = set(values.astype(str))
    return ", ".join(
        sorted(
            distinct,
            key=lambda v: (not v.isdecimal(), int(v) if v.isdecimal() else 0, v),
        )
    )


def group_route_district(df: pd.DataFrame, pct_route_on_hwy_agg: str) -> pd.DataFrame:
    """
    Aggregate so each transit route has a single row: all of its SHN
    routes and districts are listed in one comma-separated string each,
    and the percent of the route on the SHN is combined across rows.

    df: the dataframe you want to aggregate. Needs the route identifier
    columns used as group keys plus "shn_route", "shn_districts" and
    "pct_route_on_hwy_across_districts".
    pct_route_on_hwy_agg: name of the aggregation ("max", "min", "sum",
    etc.) applied to "pct_route_on_hwy_across_districts".

    Returns one row per group with the percent rounded to 2 decimals.
    """
    route_cols = [
        "schedule_gtfs_dataset_key",
        "route_type",
        "shape_id",
        "route_id",
        "route_name_used",
    ]

    # Joining a bare set gives an arbitrary order for strings, so the same
    # route could come out as "5, 101" on one run and "101, 5" on the next.
    # _join_unique sorts the values to keep the output reproducible.
    agg1 = df.groupby(route_cols, as_index=False)[
        ["shn_route", "shn_districts", "pct_route_on_hwy_across_districts"]
    ].agg(
        {
            "shn_route": _join_unique,
            "shn_districts": _join_unique,
            "pct_route_on_hwy_across_districts": pct_route_on_hwy_agg,
        }
    )

    # Clean up
    agg1.pct_route_on_hwy_across_districts = (
        agg1.pct_route_on_hwy_across_districts.astype(float).round(2)
    )
    return agg1

In [None]:
def add_shn_information(gdf: gpd.GeoDataFrame, buffer_amt: int) -> pd.DataFrame:
    """
    Prepare the gdf to join with the existing transit_routes
    dataframe that is published on the Open Data Portal.

    gdf: transit routes. May be either the raw routes or a frame that has
    already been through `routes_shn_intersection` (recognised by its
    "shn_route" column).
    buffer_amt: buffer size passed to `routes_shn_intersection` when the
    overlay still has to be run.

    Returns one row per route with `shn_route`, `shn_districts`,
    `pct_route_on_hwy_across_districts` and `on_shs` ("Y"/"N").
    """
    # Overlay. Previously the overlay result was computed and then ignored
    # (the rename and grouping ran on the input instead), so the function
    # only worked when handed an already-overlaid frame. Use the overlay
    # result, and skip the expensive overlay when the input already has it.
    if "shn_route" in gdf.columns:
        intersecting = gdf
    else:
        intersecting = routes_shn_intersection(gdf, buffer_amt)

    # Rename column
    intersecting = intersecting.rename(
        columns={
            "pct_route_on_hwy": "pct_route_on_hwy_across_districts",
            "district": "shn_districts",
        }
    )

    # Group the dataframe so that one route only has one
    # row instead of multiple rows after finding its
    # intersection with any SHN routes.
    agg1 = group_route_district(intersecting, "sum")

    # Add yes/no column to signify if a transit route intersects
    # with a SHN route
    agg1["on_shs"] = np.where(agg1["pct_route_on_hwy_across_districts"] == 0, "N", "Y")

    # Clean up rows that are tagged as "on_shs==N" but still have values
    # that appear.
    agg1.loc[
        (agg1["on_shs"] == "N") & (agg1["shn_districts"] != "0"),
        ["shn_districts", "shn_route"],
    ] = "NA"
    return agg1

In [None]:
open_data_portal_df = add_shn_information(intersecting, SHN_HWY_BUFFER_FEET)

In [None]:
len(open_data_portal_df)

In [None]:
open_data_portal_df.columns

In [None]:
open_data_portal_df.info()

In [None]:
open_data_portal_df.pct_route_on_hwy_across_districts.describe()

In [None]:
open_data_portal_df.on_shs.value_counts()

In [None]:
open_data_portal_df.columns

## Missing Routes for D2 [Issue #1582](https://github.com/cal-itp/data-analyses/issues/1582)

In [None]:
open_data_portal_df.loc[open_data_portal_df.shn_districts.str.contains("2")]

### Map

In [None]:
# Base map layer that other layers can be drawn onto via `m=m`.
# NOTE(review): `shn_district_df` is not defined anywhere in the visible part
# of this notebook - confirm it is created or loaded before running this cell.
m = shn_district_df.explore(
    name="district",
    tiles="CartoDB positron",
    style_kwds={"color": "#9DA4A6", "opacity": 0.5},
    height=500,
    width=1000,
    legend=False,
)

In [None]:
southwest_chief = intersecting.loc[(intersecting.route_name_used == "Southwest Chief")]

In [None]:
"""southwest_chief.explore(
    m=m,
    cmap="Spectral",
    categorical=True,
    legend=False,
    legend_kwds={"width": 200},
)"""