# Transit On the SHN 
* [GH Issue](https://github.com/cal-itp/data-analyses/issues/1477)


In [27]:
import geopandas as gpd
import google.auth
import numpy as np
import pandas as pd

credentials, project = google.auth.default()

import gcsfs
fs = gcsfs.GCSFileSystem()

In [28]:
from calitp_data_analysis import geography_utils, utils
from segment_speed_utils import gtfs_schedule_wrangling, helpers
from shared_utils import (
    catalog_utils,
    dask_utils,
    gtfs_utils_v2,
    portfolio_utils,
    publish_utils,
    rt_dates,
    rt_utils,
)
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS

In [29]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [30]:
analysis_date_list = rt_dates.y2024_dates + rt_dates.y2023_dates + rt_dates.y2025_dates

In [31]:
analysis_date = "2025-03-12"

## Load in Operator Routes

In [32]:
def process_transit_routes() -> gpd.GeoDataFrame:
    """
    Select the most recent transit route.
    Also count how many routes there are for each operator.
    """
    OPERATOR_ROUTE = GTFS_DATA_DICT.digest_tables.operator_routes_map

    subset = [
        "service_date",
        "geometry",
        "portfolio_organization_name",
        "recent_combined_name",
        "route_id",
        "schedule_gtfs_dataset_key",
    ]
    op_geography_df = gpd.read_parquet(
        f"{RT_SCHED_GCS}{OPERATOR_ROUTE}.parquet",
        storage_options={"token": credentials.token},
    )[subset]
    
    most_recent_dates = publish_utils.filter_to_recent_date(
        df=op_geography_df, group_cols=["portfolio_organization_name", "route_id", ]
    )
    most_recent_routes = pd.merge(
        op_geography_df,
        most_recent_dates,
        on=["portfolio_organization_name", "route_id","service_date"],
        how="inner",
    )
    # Get this to same CRS as highways
    most_recent_routes = (
        most_recent_routes.assign(
            route_length_feet=most_recent_routes.geometry.to_crs(
                geography_utils.CA_NAD83Albers_ft
            ).length
        )
    )

    return most_recent_routes

In [33]:
transit_routes = process_transit_routes()

## Load in SHS 
* Reuse Tiffany's [code](https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/segment_speed_utils/parallel_corridors.py)

In [34]:
hwy_group_cols = ["Route", "County", "District", "RouteType"]

In [35]:
def process_highways(group_cols: list, buffer_feet: int) -> gpd.GeoDataFrame:
    """
    Put in a list of group_cols, and aggregate highway segments with
    the direction info up to the group_col level.

    For each highway, store what directions it runs in
    as dummy variables. This will allow us to dissolve
    the geometry and get fewer rows for highways
    without losing direction info.
    """
    SHN_FILE = catalog_utils.get_catalog(
        "shared_data_catalog"
    ).state_highway_network.urlpath

    direction_cols = ["NB", "SB", "EB", "WB"]

    df = gpd.read_parquet(
        SHN_FILE,
        storage_options={"token": credentials.token},
    ).to_crs(geography_utils.CA_NAD83Albers_ft)

    # Get dummies for direction
    # Can make data wide instead of long
    direction_dummies = pd.get_dummies(df.Direction, dtype=int)
    df = pd.concat([df.drop(columns="Direction"), direction_dummies], axis=1)

    # For each highway, allow multiple dummies to be 1 (as long as highway had that direction,
    # we'll allow dummy to be 1. A highway can be tagged as WB and SB, and we want to keep info for both).
    for c in direction_cols:
        df[c] = df.groupby(group_cols)[c].transform("max").astype(int)

    # Buffer first, then dissolve
    # If dissolve first, then buffer, kernel times out
    df = df.assign(
        highway_feet=df.geometry.length,
        geometry=df.geometry.buffer(buffer_feet),
        Route=df.Route.astype(int),
    )

    df2 = df.dissolve(by=group_cols + direction_cols).reset_index()

    df2[direction_cols] = df2[direction_cols].astype(int)

    return df2

In [36]:
processed_hwy = process_highways(group_cols=hwy_group_cols, buffer_feet=50)

In [37]:
processed_hwy.drop(columns=["geometry"]).sample()

Unnamed: 0,Route,County,District,RouteType,NB,SB,EB,WB,highway_feet
182,66,LA,7,State,0,0,1,1,17004.11


In [38]:
def overlay_transit_to_highways(
    analysis_date: str,
    hwy_buffer_feet: int,
    pct_route_threshold: float,
) -> gpd.GeoDataFrame:
    """
    Function to find areas of intersection between
    highways (default of 1 mile buffer) and transit routes.
    
    Returns: geopandas.GeoDataFrame, with geometry column reflecting
    the areas of intersection.
    """    
    # Can pass a different buffer zone to determine parallel corridors
    HWY_FILE = f"{BUS_SERVICE_GCS}highways_buffer{hwy_buffer_feet}.parquet"
    
    if fs.exists(HWY_FILE):
        highways = gpd.read_parquet(HWY_FILE,
        storage_options={"token": credentials.token},)
    else:
        highways = process_highways(
            group_cols = hwy_group_cols, 
            buffer_feet = hwy_buffer_feet
        )
    transit_routes = process_transit_routes().to_crs(highways.crs)
    
    # Overlay
    # Note: an overlay based on intersection changes the geometry column
    # The new geometry column will reflect that area of intersection
    gdf = gpd.overlay(
        transit_routes, 
        highways, 
        how = "intersection", 
        keep_geom_type = True
    )  
    
    # Using new geometry column, calculate what % that intersection 
    # is of the route and hwy
    gdf = gdf.assign(
        pct_route = (gdf.geometry.length / gdf.route_length_feet).round(3),
        Route = gdf.Route.astype(int),
    )
    
    gdf2 = gdf.loc[gdf.pct_route >= pct_route_threshold]
    
    return gdf2


In [39]:
SHN_HWY_BUFFER_FEET = 50 
SHN_PCT_ROUTE = 0.2 # we'll use the same for both categories
PARALLEL_HWY_BUFFER_FEET = geography_utils.FEET_PER_MI * 0.5

In [40]:
BUS_SERVICE_GCS = "gs://calitp-analytics-data/data-analyses/bus_service_increase/"

### Function not working because of the 1:m merge indicator [here](https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/segment_speed_utils/parallel_corridors.py#L138-L147) but maybe we dont need this since we just want to know which routes intersect/are parallel

In [41]:
"""highways = process_highways(
            group_cols = hwy_group_cols, 
            buffer_feet = SHN_HWY_BUFFER_FEET
        )"""

'highways = process_highways(\n            group_cols = hwy_group_cols, \n            buffer_feet = SHN_HWY_BUFFER_FEET\n        )'

In [42]:
"""on_shn_test = gpd.overlay(
        transit_routes, 
        highways, 
        how = "intersection", 
        keep_geom_type = True
    )  """

'on_shn_test = gpd.overlay(\n        transit_routes, \n        highways, \n        how = "intersection", \n        keep_geom_type = True\n    )  '

In [43]:
# is of the route and hwy
"""on_shn_test = on_shn_test.assign(
        pct_route = (on_shn_test.geometry.length / on_shn_test.route_length_feet).round(3),
        Route = on_shn_test.Route.astype(int),
    )"""

'on_shn_test = on_shn_test.assign(\n        pct_route = (on_shn_test.geometry.length / on_shn_test.route_length_feet).round(3),\n        Route = on_shn_test.Route.astype(int),\n    )'

In [44]:
"""gdf2 = pd.merge(
        transit_routes[["portfolio_organization_name","route_id", "geometry"]],
        on_shn_test.drop(columns = "geometry"),
        on = ["portfolio_organization_name","route_id"],
        how = "inner",
        # Allow 1:m merge because the same transit route 
        # can overlap with various highways
        validate = "1:m",
    )"""

'gdf2 = pd.merge(\n        transit_routes[["portfolio_organization_name","route_id", "geometry"]],\n        on_shn_test.drop(columns = "geometry"),\n        on = ["portfolio_organization_name","route_id"],\n        how = "inner",\n        # Allow 1:m merge because the same transit route \n        # can overlap with various highways\n        validate = "1:m",\n    )'

In [45]:
final_cols = ['portfolio_organization_name', 'District',"Route",'route_id', 'pct_route']

In [46]:
on_shn = overlay_transit_to_highways(
        analysis_date,
        hwy_buffer_feet = SHN_HWY_BUFFER_FEET,
        pct_route_threshold = SHN_PCT_ROUTE,
    )[final_cols].drop_duplicates().rename(columns = {"pct_route":"route_pct_on_shn"})
    
parallel_or_intersecting = overlay_transit_to_highways(
        analysis_date,
        hwy_buffer_feet = PARALLEL_HWY_BUFFER_FEET, 
        pct_route_threshold = SHN_PCT_ROUTE,
    )[final_cols].drop_duplicates().rename(columns = {"pct_route":"route_pct_parallel_intersecting_shn"})

In [47]:
on_shn.head(2)

Unnamed: 0,portfolio_organization_name,District,Route,route_id,route_pct_on_shn
3,Tulare County Regional Transit Agency,6,201,c6726149-9979-4ebb-85f6-0be90402266c,0.23
15,Tulare County Regional Transit Agency,6,190,P6,0.26


In [48]:
parallel_or_intersecting.head(2)

Unnamed: 0,portfolio_organization_name,District,Route,route_id,route_pct_parallel_intersecting_shn
1,Tulare County Regional Transit Agency,6,99,0e85fd4c-5258-4256-9852-4a96554aadb7,0.65
2,Tulare County Regional Transit Agency,6,99,T1,0.2


In [49]:
transit_routes.drop(columns = ["geometry"]).head(2)

Unnamed: 0,service_date,portfolio_organization_name,recent_combined_name,route_id,schedule_gtfs_dataset_key,route_length_feet
0,2024-03-13,Tulare County Regional Transit Agency,C50 Loop 50,50 Loop,0139b1253130b33adcd4b3a4490530d2,154036.29
1,2024-06-12,Tulare County Regional Transit Agency,P1 Porterville Route 1,P1,0139b1253130b33adcd4b3a4490530d2,48113.65


In [69]:
df = pd.merge(
        transit_routes.drop(columns = ["geometry"]).assign(other=1),
        on_shn.assign(on_shn=3),
        on = ["portfolio_organization_name","route_id",],
        how = "left"
    )
    

In [70]:
df.sample(3)

Unnamed: 0,service_date,portfolio_organization_name,recent_combined_name,route_id,schedule_gtfs_dataset_key,route_length_feet,other,District,Route,route_pct_on_shn,on_shn
2689,2025-04-16,Los Angeles World Airports,VN to LAX FlyAway - Van Nuys to LAX,1,73e3164043eb7312454dd03413a6b310,233714.99,1,7.0,405.0,0.78,3.0
1160,2025-04-16,"Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)",Santa Paula School Tripper (AM),4250,1770249a5a2e770ca90628434d4934b1,68830.14,1,,,,
5272,2025-04-16,City of Los Angeles,CE437A,858,cc53a0dbf5df90e3009b9cb5d89d80ba,104254.75,1,7.0,10.0,0.29,3.0


In [71]:
df.columns

Index(['service_date', 'portfolio_organization_name', 'recent_combined_name',
       'route_id', 'schedule_gtfs_dataset_key', 'route_length_feet', 'other',
       'District', 'Route', 'route_pct_on_shn', 'on_shn'],
      dtype='object')

In [72]:
df2 = pd.merge(
        df,
        parallel_or_intersecting.assign(parallel=2),
        on = ["District", "Route","portfolio_organization_name","route_id",],
        how = "left"
    )
    

In [73]:
category_cols = ["on_shn", "parallel", "other"]

    
df2 = df2.assign(
        category = df2[category_cols].idxmax(axis=1)
    )

    

In [74]:
df2.category.value_counts()

other     4666
on_shn    1768
Name: category, dtype: int64

In [75]:
df2.route_pct_on_shn = df2.route_pct_on_shn.fillna(0)

In [86]:
district_df = df[["portfolio_organization_name", "District"]].dropna(subset = ["District"]).drop_duplicates()

## Need to find a way to get districts while loading in routes + figure out if it's really true that there aren't routes that are parallel to SHN.

In [77]:
df2.sample(3)

Unnamed: 0,service_date,portfolio_organization_name,recent_combined_name,route_id,schedule_gtfs_dataset_key,route_length_feet,other,District,Route,route_pct_on_shn,on_shn,route_pct_parallel_intersecting_shn,parallel,category
6035,2023-05-17,Butte County Association of Governments,20 Chico - Oroville,1b8c181e-5565-4495-b085-3042ee9483b8,f1cc580313b37ae0f853b2e469b27228,143812.09,1,3.0,99.0,0.33,3.0,0.49,2.0,on_shn
1757,2023-11-15,Los Angeles County Metropolitan Transportation Authority,108 Metro Local Line,108-13168,3f3f36b4c41cc6b5df3eb7f5d8ea6e3c,135406.71,1,,,0.0,,,,other
294,2024-12-11,Los Angeles County Metropolitan Transportation Authority,51 Metro Local Line,51-13183,0666caf3ec1ecc96b74f4477ee4bc939,91612.83,1,,,0.0,,,,other


In [78]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6434 entries, 0 to 6433
Data columns (total 14 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   service_date                         6434 non-null   datetime64[ns]
 1   portfolio_organization_name          6434 non-null   object        
 2   recent_combined_name                 6434 non-null   object        
 3   route_id                             6434 non-null   object        
 4   schedule_gtfs_dataset_key            6434 non-null   object        
 5   route_length_feet                    6250 non-null   float64       
 6   other                                6434 non-null   int64         
 7   District                             1768 non-null   float64       
 8   Route                                1768 non-null   float64       
 9   route_pct_on_shn                     6434 non-null   float64       
 10  on_shn      