# SHN and Route Typology
* https://docs.google.com/spreadsheets/d/1gmRmVC4phwA3EunOhI4-aJ7uF5R2nZhIhPV2h3FM25w/edit?gid=0#gid=0

## Questions
* Is `route_typology` refreshed January of each year?
* Do I need to go back to 2023 and add back the route typologies? Or can I just add route typologies from August onward?
* Best way to troubleshoot why a dataframe increases in rows after a merge?
* What's the difference between `shape_id` in `open_data/create_routes` vs `common_shape_id` in `route_typology_df?`

In [1]:
import geopandas as gpd
import google.auth
import numpy as np
import pandas as pd

credentials, project = google.auth.default()

import gcsfs

fs = gcsfs.GCSFileSystem()

In [2]:
from calitp_data_analysis import geography_utils, utils
from segment_speed_utils import gtfs_schedule_wrangling, helpers
from shared_utils import (
    catalog_utils,
    dask_utils,
    gtfs_utils_v2,
    portfolio_utils,
    publish_utils,
    rt_dates,
    rt_utils,
)
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS

In [3]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

## Bring in final result from `open_data/create_routes` -> published each month for the latest date.

In [4]:
open_data_url = "gs://calitp-analytics-data/data-analyses/traffic_ops/ca_transit_routes_2025-07-16.parquet"

In [5]:
open_data = gpd.read_parquet(
   open_data_url,
  storage_options={"token": credentials.token},
)

In [6]:
# open_data.columns

In [7]:
#open_data = gpd.read_parquet(
 #   "gs://calitp-analytics-data/data-analyses/traffic_ops/ca_transit_routes.parquet",
#    storage_options={"token": credentials.token},
#)

In [8]:
#open_data.columns

In [9]:
#open_data.drop(columns=["geometry"]).sample(3)

## Bring in results from `route_typology` [here](https://github.com/cal-itp/data-analyses/blob/087474dd0be3ea1ec3195f72ee8435cc08845b30/gtfs_funnel/route_typologies.py#L256) -> on a year grain 

In [10]:
SCHED_GCS

'gs://calitp-analytics-data/data-analyses/gtfs_schedule/'

In [11]:
GTFS_DATA_DICT.schedule_tables.route_typologies

'nacto_typologies/route_typologies'

In [12]:
route_typology_2025 = "gs://calitp-analytics-data/data-analyses/gtfs_schedule/nacto_typologies/route_typologies_2025.parquet"

In [13]:
route_typology_df = pd.read_parquet(route_typology_2025).drop(
    columns=[
        "route_long_name",
        "route_short_name",
        "combined_name",
    ]
)

In [14]:
route_typology_df.columns

Index(['schedule_gtfs_dataset_key', 'name', 'route_id', 'route_type',
       'is_express', 'is_rapid', 'is_rail', 'is_ferry', 'is_local',
       'common_shape_id', 'is_coverage', 'is_downtown_local'],
      dtype='object')

In [15]:
route_typology_df.sample()

Unnamed: 0,schedule_gtfs_dataset_key,name,route_id,route_type,is_express,is_rapid,is_rail,is_ferry,is_local,common_shape_id,is_coverage,is_downtown_local
858,4c105bd9f414afe82dba2c3687cc1d88,GET Schedule,81,3,0,1,0,0,0,51,1,0


In [16]:
open_data.route_type.value_counts()

3    9486
2     391
0     242
4      88
5      32
1      28
Name: route_type, dtype: int64

In [17]:
route_typology_df.route_type.value_counts()

3    2968
2      70
0      25
1      14
4      14
5       3
Name: route_type, dtype: int64

In [18]:
route_typology_df.route_id.nunique(), len(route_typology_df)

(1949, 3094)

## Merge them.

### Few matches when `shape_id` is added -> what's the difference between `shape_id` and `common_shape_id`

In [19]:
open_data.columns

Index(['n_trips', 'geometry', 'schedule_gtfs_dataset_key', 'route_id',
       'route_type', 'shape_id', 'route_name_used', 'name', 'base64_url',
       'organization_source_record_id', 'organization_name',
       'caltrans_district', 'route_length_feet'],
      dtype='object')

In [20]:
m1 = pd.merge(
    open_data,
    route_typology_df,
    left_on=["schedule_gtfs_dataset_key", "name", "route_id", "shape_id"],
    right_on=["schedule_gtfs_dataset_key", "name", "route_id", "common_shape_id"],
    how="outer",
    indicator=True,
)

In [21]:
m1._merge.value_counts()

left_only     8728
right_only    2039
both          1567
Name: _merge, dtype: int64

In [22]:
open_data_shape_id = open_data[["shape_id"]]

In [23]:
route_typology_df_id = route_typology_df[["common_shape_id"]].rename(
    columns={"common_shape_id": "shape_id"}
)

In [24]:
shape_id_m1 = pd.merge(
    open_data_shape_id, route_typology_df_id, how="outer", indicator=True
)

In [25]:
shape_id_m1._merge.value_counts()

left_only     8295
both          2695
right_only    1823
Name: _merge, dtype: int64

### More matches when only merging on a couple of columns.

In [26]:
m2 = pd.merge(
    open_data,
    route_typology_df,
    left_on=["schedule_gtfs_dataset_key", "name", "route_id"],
    right_on=["schedule_gtfs_dataset_key", "name", "route_id"],
    how="outer",
    indicator=True,
)

In [27]:
m2._merge.value_counts()

both          7415
left_only     3062
right_only    1364
Name: _merge, dtype: int64

In [28]:
m3 = pd.merge(
    open_data,
    route_typology_df,
    left_on=["schedule_gtfs_dataset_key", "name", "route_id", "route_type"],
    right_on=["schedule_gtfs_dataset_key", "name", "route_id", "route_type"],
    how="outer",
    indicator=True,
)

In [29]:
m3._merge.value_counts()

both          7415
left_only     3062
right_only    1364
Name: _merge, dtype: int64

In [30]:
m3.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 11841 entries, 0 to 11840
Data columns (total 22 columns):
 #   Column                         Non-Null Count  Dtype   
---  ------                         --------------  -----   
 0   n_trips                        10477 non-null  float64 
 1   geometry                       10477 non-null  geometry
 2   schedule_gtfs_dataset_key      11841 non-null  object  
 3   route_id                       11841 non-null  object  
 4   route_type                     11841 non-null  object  
 5   shape_id                       10477 non-null  object  
 6   route_name_used                10477 non-null  object  
 7   name                           11841 non-null  object  
 8   base64_url                     10477 non-null  object  
 9   organization_source_record_id  10477 non-null  object  
 10  organization_name              10477 non-null  object  
 11  caltrans_district              10477 non-null  object  
 12  route_length_feet       

### How are a bunch of rows added? 

In [31]:
m3 = pd.merge(
    open_data,
    route_typology_df,
    on=["schedule_gtfs_dataset_key", "name", "route_id", "route_type"],
    how="left",
)

In [32]:
type(m3), len(m3), len(open_data)

(geopandas.geodataframe.GeoDataFrame, 10477, 10267)

## Check?

In [33]:
m3.sample(3).drop(columns=["geometry", "base64_url"]).T

Unnamed: 0,7526,8004,8911
n_trips,1,1,27
schedule_gtfs_dataset_key,a37760dde6b9fdcb76b82e57afab7274,ac2951bfaa7ecf6b80ba9e50aef1ae86,d3ec92741001094ed14a27847c72e9d0
route_id,US0600,14216,45
route_type,3,3,3
shape_id,4a37db9f9d21e197a03589ff62c6569d,p_1438985,74
route_name_used,Greyhound US0600,1,45
name,Flixbus Schedule,Nevada County Schedule,GET Schedule
organization_source_record_id,recLW3EuIHTqTjdds,reczUcQgqgtMpkpKC,recIh3vq8jwuuJlvL
organization_name,Greyhound,Nevada County,Golden Empire Transit District
caltrans_district,07 - Los Angeles / Ventura,03 - Marysville / Sacramento,06 - Fresno / Bakersfield


In [34]:
m3.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 10477 entries, 0 to 10476
Data columns (total 21 columns):
 #   Column                         Non-Null Count  Dtype   
---  ------                         --------------  -----   
 0   n_trips                        10477 non-null  int64   
 1   geometry                       10477 non-null  geometry
 2   schedule_gtfs_dataset_key      10477 non-null  object  
 3   route_id                       10477 non-null  object  
 4   route_type                     10477 non-null  object  
 5   shape_id                       10477 non-null  object  
 6   route_name_used                10477 non-null  object  
 7   name                           10477 non-null  object  
 8   base64_url                     10477 non-null  object  
 9   organization_source_record_id  10477 non-null  object  
 10  organization_name              10477 non-null  object  
 11  caltrans_district              10477 non-null  object  
 12  route_length_feet       

## Attach SHS

In [35]:
def routes_shn_intersection(
    routes_gdf: gpd.GeoDataFrame, buffer_amount: int
) -> gpd.GeoDataFrame:
    """
    Overlay the most recent transit routes with a buffered version
    of the SHN
    """
    GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/shared_data/"

    # Read in buffered shn here or re buffer if we don't have it available.
    HWY_FILE = f"{GCS_FILE_PATH}shn_buffered_{buffer_amount}_ft_shn_dissolved_by_ct_district_route.parquet"

    if fs.exists(HWY_FILE):
        shn_routes_gdf = gpd.read_parquet(
            HWY_FILE, storage_options={"token": credentials.token}
        )
    else:
        shn_routes_gdf = shared_data.buffer_shn(buffer_amount)

    # Process the most recent transit route geographies and ensure the
    # CRS matches the SHN routes' GDF so the overlay doesn't go wonky.
    routes_gdf = routes_gdf.to_crs(shn_routes_gdf.crs)

    # Overlay transit routes with the SHN geographies.
    gdf = gpd.overlay(
        routes_gdf, shn_routes_gdf, how="intersection", keep_geom_type=True
    )

    # Calcuate the percent of the transit route that runs on a highway, round it up and
    # multiply it by 100. Drop the geometry because we want the original transit route
    # shapes.
    gdf = gdf.assign(
        pct_route_on_hwy=(gdf.geometry.length / gdf.route_length_feet).round(3) * 100,
    )
    # Subset
    gdf2 = gdf[
        [
            "district",
            "shn_route",
            "pct_route_on_hwy",
            "n_trips",
            "schedule_gtfs_dataset_key",
            "route_id",
            "route_type",
            "shape_id",
            "route_name_used",
            "name",
            "organization_name",
        ]
    ]

    # Clean up
    gdf2.district = gdf2.district.fillna(0).astype(int)

    gdf2 = gdf2.rename(
        columns={
            "pct_route_on_hwy": "pct_route_on_hwy_across_districts",
            "district": "shn_districts",
        }
    )
    return gdf2

In [36]:
def group_route_district(df: pd.DataFrame, pct_route_on_hwy_agg: str) -> pd.DataFrame:
    """
    Aggregate by adding all the districts and SHN to a single row, rather than
    multiple and sum up the total % of SHN a transit route intersects with.

    df: the dataframe you want to aggregate
    pct_route_on_hwy_agg: whether you want to find the max, min, sum, etc on the column
    "pct_route_on_hwy_across_districts"
    """

    agg1 = (
        df.groupby(
            [
                "n_trips",
                "schedule_gtfs_dataset_key",
                "route_id",
                "route_type",
                "shape_id",
                "route_name_used",
                "name",
                "organization_name",
            ],
            as_index=False,
        )[["shn_route", "shn_districts", "pct_route_on_hwy_across_districts"]]
        .agg(
            {
                "shn_route": lambda x: ", ".join(set(x.astype(str))),
                "shn_districts": lambda x: ", ".join(set(x.astype(str))),
                "pct_route_on_hwy_across_districts": pct_route_on_hwy_agg,
            }
        )
        .reset_index(drop=True)
    )

    # Clean up
    agg1.pct_route_on_hwy_across_districts = (
        agg1.pct_route_on_hwy_across_districts.astype(float).round(2)
    )

    return agg1

In [37]:
def add_shn_information(gdf: gpd.GeoDataFrame, buffer_amt: int) -> pd.DataFrame:
    """
    Prepare the gdf to join with the existing transit_routes
    dataframe that is published on the Open Data Portal
    """
    # Drop duplicates
    gdf = gdf.drop_duplicates()
    # Overlay
    intersecting = routes_shn_intersection(gdf, buffer_amt)
    # Group the dataframe so that one route only has one
    # row instead of multiple rows after finding its
    # intersection with any SHN routes.
    print(intersecting.columns)
    agg1 = group_route_district(intersecting, "sum")

    # Merge the dataframe with all the SHS info with the original
    # gdf so we can get the original transit route geometries &
    # any routes that don't intersect with the state highway routes.
    m1 = pd.merge(gdf, agg1, how="left")

    # Add yes/no column to signify if a transit route intersects
    # with a SHN route
    agg1["on_shs"] = np.where(agg1["pct_route_on_hwy_across_districts"] == 0, "N", "Y")

    # Clean up rows that are tagged as "on_shs==N" but still have values
    # that appear.
    agg1.loc[
        (agg1["on_shs"] == "N") & (agg1["shn_districts"] != "0"),
        ["shn_districts", "shn_route"],
    ] = np.nan
    # Join back the dataframe above with the original transit route dataframes
    # so we can have the original transit route geographies.
    m1 = pd.merge(
        gdf,
        agg1,
        on=[
            "n_trips",
            "schedule_gtfs_dataset_key",
            "route_id",
            "route_type",
            "shape_id",
            "route_name_used",
            "name",
            "organization_name"
        ],
        how="left",
    )
    # Clean up rows that are tagged as "on_shs==N" but still have values
    # that appear.
    m1.loc[
        (m1["on_shs"] == "N") & (m1["shn_districts"] != "0"),
        ["shn_districts", "shn_route"],
    ] = np.nan
    return m1

In [38]:
shn_typology = add_shn_information(m3, 50)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gdf2.district = gdf2.district.fillna(0).astype(int)


Index(['shn_districts', 'shn_route', 'pct_route_on_hwy_across_districts',
       'n_trips', 'schedule_gtfs_dataset_key', 'route_id', 'route_type',
       'shape_id', 'route_name_used', 'name', 'organization_name'],
      dtype='object')


In [39]:
shn_typology.columns

Index(['n_trips', 'geometry', 'schedule_gtfs_dataset_key', 'route_id',
       'route_type', 'shape_id', 'route_name_used', 'name', 'base64_url',
       'organization_source_record_id', 'organization_name',
       'caltrans_district', 'route_length_feet', 'is_express', 'is_rapid',
       'is_rail', 'is_ferry', 'is_local', 'common_shape_id', 'is_coverage',
       'is_downtown_local', 'shn_route', 'shn_districts',
       'pct_route_on_hwy_across_districts', 'on_shs'],
      dtype='object')

In [41]:
shn_typology.drop(columns = ["geometry","base64_url"]).sample(3).T

Unnamed: 0,3166,1140,3518
n_trips,2,21,30
schedule_gtfs_dataset_key,3364ec074ca85001da3abd78be2ae521,131be0ff0e953b7b292a1548fed19c6d,5456c80d420043e15c8eb7368a8a4d89
route_id,985,7,141
route_type,3,3,3
shape_id,S2_985_9_15,6567,1410355
route_name_used,985,7,141
name,San Diego Schedule,SunLine Avail Schedule,Bay Area 511 SamTrans Schedule
organization_source_record_id,recZALk4vysuoTVjF,recAsbHMwQWB7cri8,recw3mRsmKDTNnVlT
organization_name,San Diego Metropolitan Transit System,SunLine Transit Agency,San Mateo County Transit District
caltrans_district,11 - San Diego,08 - San Bernardino / Riverside,04 - Bay Area / Oakland
