# Add SHN Info to Transit Routes in the middle of the Pipeline

In [57]:
import datetime

import geopandas as gpd
import google.auth
import numpy as np
import pandas as pd
import yaml
from calitp_data_analysis import geography_utils, utils
from calitp_data_analysis.geography_utils import WGS84
from segment_speed_utils import helpers
from shared_utils import (
    catalog_utils,
    portfolio_utils,
    publish_utils,
    rt_dates,
    rt_utils,
    schedule_rt_utils,
)
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS

credentials, project = google.auth.default()

import gcsfs

fs = gcsfs.GCSFileSystem()

In [12]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [8]:
analysis_date_list = rt_dates.y2025_dates

In [10]:
analysis_date_list[0]

'2025-01-15'

In [13]:
date = analysis_date_list[0]

## Load in Routes from `open_data_portal`

In [15]:
trips = helpers.import_scheduled_trips(
    date,
    columns=[
        "gtfs_dataset_key",
        "route_id",
        "route_type",
        "shape_id",
        "shape_array_key",
        "route_long_name",
        "route_short_name",
        "route_desc",
    ],
    get_pandas=True,
).dropna(subset="shape_array_key")

In [17]:
trips.sample()

Unnamed: 0,schedule_gtfs_dataset_key,route_id,route_type,shape_id,shape_array_key,route_long_name,route_short_name,route_desc
7397,68aa06a25a32c83eb38c20c43977feff,14,3,4ac40e7a-0de6-42d1-84e2-f8a378d27183,8fdc285be5ebfc3f0fb0d68950a2770d,Park/Forest/MLK,14,CTC to MLK via Forest clockwise loop


In [16]:
shapes = helpers.import_scheduled_shapes(
    date, columns=["shape_array_key", "n_trips", "geometry"], get_pandas=True, crs=WGS84
).dropna(subset="shape_array_key")

In [31]:
shapes.sample().drop(columns=["geometry"])

Unnamed: 0,shape_array_key,n_trips
7293,655d72b494b9e721d06e68458d640a05,3


In [19]:
df = (
    pd.merge(shapes, trips, on="shape_array_key", how="inner")
    .drop_duplicates(subset="shape_array_key")
    .drop(columns="shape_array_key")
)

In [20]:
df.shape

(7416, 9)

In [21]:
drop_cols = ["route_short_name", "route_long_name", "route_desc"]
route_shape_cols = ["schedule_gtfs_dataset_key", "route_id", "shape_id"]

In [32]:
def remove_erroneous_shapes(
    shapes_with_route_info: gpd.GeoDataFrame,
) -> gpd.GeoDataFrame:
    """
    Check if line is simple for Amtrak. If it is, keep.
    If it's not simple (line crosses itself), drop.

    In Jun 2023, some Amtrak shapes appeared to be funky,
    but in prior months, it's been ok.
    Checking for length is fairly time-consuming.
    """
    amtrak = "Amtrak Schedule"

    possible_error = shapes_with_route_info[shapes_with_route_info.name == amtrak]
    ok = shapes_with_route_info[shapes_with_route_info.name != amtrak]

    # Check if the line crosses itself
    ok_amtrak = (
        possible_error.assign(simple=possible_error.geometry.is_simple)
        .query("simple == True")
        .drop(columns="simple")
    )

    ok_shapes = pd.concat([ok, ok_amtrak], axis=0).reset_index(drop=True)

    return ok_shapes

### Didn't reach the step of `routes_assmebled2` because of all the different imports causing issues.

In [22]:
routes_assembled = (
    portfolio_utils.add_route_name(df)
    .drop(columns=drop_cols)
    .sort_values(route_shape_cols)
    .drop_duplicates(subset=route_shape_cols)
    .reset_index(drop=True)
)

In [23]:
routes_assembled.shape

(7416, 7)

In [50]:
routes_assembled.columns

Index(['n_trips', 'geometry', 'schedule_gtfs_dataset_key', 'route_id',
       'route_type', 'shape_id', 'route_name_used'],
      dtype='object')

### Add length to the transit routes.

In [58]:
routes_assembled = routes_assembled.assign(
    route_length_feet=routes_assembled.geometry.to_crs(
        geography_utils.CA_NAD83Albers_ft
    ).length
)

## Load in SHS


In [60]:
def dissolve_shn(columns_to_dissolve: list, file_name: str) -> gpd.GeoDataFrame:
    """
    Dissolve State Highway Network so there will only be one row for each
    route name and route type
    """
    # Read in the dataset and change the CRS to one to feet.
    SHN_FILE = catalog_utils.get_catalog(
        "shared_data_catalog"
    ).state_highway_network.urlpath

    shn = gpd.read_parquet(
        SHN_FILE,
        storage_options={"token": credentials.token},
    ).to_crs(geography_utils.CA_NAD83Albers_ft)

    # Dissolve by route which represents the the route's name and drop the other columns
    # because they are no longer relevant.
    shn_dissolved = (shn.dissolve(by=columns_to_dissolve).reset_index())[
        columns_to_dissolve + ["geometry"]
    ]

    # Rename because I don't want any confusion between SHN route and
    # transit route.
    shn_dissolved = shn_dissolved.rename(columns={"Route": "shn_route"})

    # Find the length of each highway.
    shn_dissolved = shn_dissolved.assign(
        highway_feet=shn_dissolved.geometry.length,
        shn_route=shn_dissolved.shn_route.astype(int).astype(str),
    )

    # Save this out so I don't have to dissolve it each time.
    shn_dissolved.to_parquet(
        f"gs://calitp-analytics-data/data-analyses/state_highway_network/shn_dissolved_by_{file_name}.parquet",
        filesystem=fs,
    )
    return shn_dissolved

In [61]:
# dissolved_route = dissolve_shn(["Route", "District"], "ct_district_route")

In [62]:
dissolved_url = "gs://calitp-analytics-data/data-analyses/state_highway_network/shn_dissolved_by_ct_district_route.parquet"

In [63]:
dissolved_df = gpd.read_parquet(
    dissolved_url,
    storage_options={"token": credentials.token},
)

In [64]:
def buffer_shn(buffer_amount: int, file_name: str) -> gpd.GeoDataFrame:
    """
    Add a buffer to the SHN before overlaying it with
    transit routes.
    """
    GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/state_highway_network/"

    # Read in the dissolved SHN file
    shn_df = gpd.read_parquet(
        f"{GCS_FILE_PATH}shn_dissolved_by_{file_name}.parquet",
        storage_options={"token": credentials.token},
    )

    # Buffer the state highway.
    shn_df_buffered = shn_df.assign(
        geometry=shn_df.geometry.buffer(buffer_amount),
    )

    # Save it out so we won't have to buffer over again and
    # can just read it in.
    shn_df_buffered.to_parquet(
        f"{GCS_FILE_PATH}shn_buffered_{buffer_amount}_ft_{file_name}.parquet",
        filesystem=fs,
    )

    return shn_df_buffered

In [65]:
SHN_HWY_BUFFER_FEET = 50
PARALLEL_HWY_BUFFER_FEET = geography_utils.FEET_PER_MI * 0.5

In [66]:
shn_district_df = gpd.read_parquet(
    f"gs://calitp-analytics-data/data-analyses/state_highway_network/shn_buffered_50_ft_ct_district_route.parquet",
    storage_options={"token": credentials.token},
)

In [78]:
len(shn_district_df)

344

## Overlay the transit routes with the SHN 

In [74]:
def routes_shn_intersection(
    routes_gdf: gpd.GeoDataFrame, buffer_amount: int, file_name: str
) -> gpd.GeoDataFrame:
    """
    Overlay the most recent transit routes with a buffered version
    of the SHN
    """
    GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/state_highway_network/"

    # Read in buffered shn here or re buffer if we don't have it available.
    HWY_FILE = f"{GCS_FILE_PATH}shn_buffered_{buffer_amount}_ft_{file_name}.parquet"

    if fs.exists(HWY_FILE):
        shn_routes_gdf = gpd.read_parquet(
            HWY_FILE, storage_options={"token": credentials.token}
        )
    else:
        shn_routes_gdf = buffer_shn(buffer_amount)

    # Process the most recent transit route geographies and ensure the
    # CRS matches the SHN routes' GDF so the overlay doesn't go wonky.
    routes_gdf = routes_gdf.to_crs(shn_routes_gdf.crs)

    # Overlay transit routes with the SHN geographies.
    gdf = gpd.overlay(
        routes_gdf, shn_routes_gdf, how="intersection", keep_geom_type=True
    )

    # Calcuate the percent of the transit route that runs on a highway, round it up and
    # multiply it by 100. Drop the geometry because we want the original transit route
    # shapes.
    gdf = gdf.assign(
        pct_route_on_hwy=(gdf.geometry.length / gdf.route_length_feet).round(3) * 100,
    ).drop(
        columns=[
            "geometry",
        ]
    )

    # Join back the dataframe above with the original transit route dataframes
    # so we can have the original transit route geographies.
    gdf2 = pd.merge(
        routes_gdf,
        gdf,
        on=[
            "n_trips",
            "schedule_gtfs_dataset_key",
            "route_id",
            "route_type",
            "shape_id",
            "route_name_used",
            "route_length_feet",
        ],
        how="left",
    )

    # Clean up
    gdf2.District = gdf2.District.fillna(0).astype(int)
    return gdf2

In [71]:
routes_assembled.columns

Index(['n_trips', 'geometry', 'schedule_gtfs_dataset_key', 'route_id',
       'route_type', 'shape_id', 'route_name_used', 'route_length_feet'],
      dtype='object')

In [75]:
intersecting = routes_shn_intersection(routes_assembled, 50, "ct_district_route")

In [77]:
len(intersecting)

20476

In [76]:
intersecting.pct_route_on_hwy.describe()

count   18951.00
mean        6.65
std        15.80
min         0.00
25%         0.10
50%         0.40
75%         2.70
max        99.20
Name: pct_route_on_hwy, dtype: float64

In [80]:
intersecting.columns

Index(['n_trips', 'geometry', 'schedule_gtfs_dataset_key', 'route_id',
       'route_type', 'shape_id', 'route_name_used', 'route_length_feet',
       'shn_route', 'District', 'highway_feet', 'pct_route_on_hwy'],
      dtype='object')

In [81]:
# Find routes that cross multiple districts
multi_district_routes = (
    intersecting.groupby(["schedule_gtfs_dataset_key", "route_name_used", "route_id"])
    .agg({"District": "nunique"})
    .reset_index()
)

In [83]:
multi_district_routes.District.describe()

count   2683.00
mean       1.09
std        0.38
min        1.00
25%        1.00
50%        1.00
75%        1.00
max        7.00
Name: District, dtype: float64

In [164]:
multi_district_routes.sort_values(by=["District"], ascending=False).head(30)

Unnamed: 0,schedule_gtfs_dataset_key,route_name_used,route_id,District
1620,a37760dde6b9fdcb76b82e57afab7274,Greyhound US0831,US0831,7
1513,a37760dde6b9fdcb76b82e57afab7274,FlixBus N2003,N2003,6
555,48e137bc977da88970393f629c18432c,Coast Starlight,36924,5
549,48e137bc977da88970393f629c18432c,California Zephyr,96,4
1615,a37760dde6b9fdcb76b82e57afab7274,Greyhound US0800,US0800,4
1616,a37760dde6b9fdcb76b82e57afab7274,Greyhound US0802,US0802,4
1617,a37760dde6b9fdcb76b82e57afab7274,Greyhound US0810,US0810,4
582,48e137bc977da88970393f629c18432c,Texas Eagle,87,4
1619,a37760dde6b9fdcb76b82e57afab7274,Greyhound US0830,US0830,4
1618,a37760dde6b9fdcb76b82e57afab7274,Greyhound US0811,US0811,4


## Routes that overlap with multiple SHN now have 1+ row. Change it so one route will only have one row.

In [84]:
intersecting.columns

Index(['n_trips', 'geometry', 'schedule_gtfs_dataset_key', 'route_id',
       'route_type', 'shape_id', 'route_name_used', 'route_length_feet',
       'shn_route', 'District', 'highway_feet', 'pct_route_on_hwy'],
      dtype='object')

In [118]:
def group_route_district(df: pd.DataFrame, pct_route_on_hwy_agg: str) -> pd.DataFrame:
    """
    Aggregate by adding all the districts and SHN to a single row, rather than
    multiple and sum up the total % of SHN a transit route intersects with.

    df: the dataframe you want to aggregate
    pct_route_on_hwy_agg: whether you want to find the max, min, sum, etc on the column
    "pct_route_on_hwy_across_districts"
    """

    agg1 = (
        df.groupby(
            ["schedule_gtfs_dataset_key", "route_type", "shape_id", "route_id", "route_name_used"],
            as_index=False,
        )[["shn_route", "District", "pct_route_on_hwy_across_districts"]]
        .agg(
            {
                "shn_route": lambda x: ", ".join(set(x.astype(str))),
                "District": lambda x: ", ".join(set(x.astype(str))),
                "pct_route_on_hwy_across_districts": pct_route_on_hwy_agg,
            }
        )
        .reset_index(drop=True)
    )

    # Clean up
    agg1.pct_route_on_hwy_across_districts = (
        agg1.pct_route_on_hwy_across_districts.astype(float).round(2)
    )
    return agg1

In [87]:
def create_on_shs_column(df):
    df["on_shs"] = np.where(df["pct_route_on_hwy_across_districts"] == 0, "N", "Y")
    return df

In [119]:
def prep_open_data_portal(gdf: gpd.GeoDataFrame) -> pd.DataFrame:
    """
    Prepare the gdf to join with the existing transit_routes
    dataframe that is published on the Open Data Portal
    """
    # Rename column
    gdf = gdf.rename(columns={"pct_route_on_hwy": "pct_route_on_hwy_across_districts"})
    # Group the dataframe so that one route only has one
    # row instead of multiple rows after finding its
    # intersection with any SHN routes.
    agg1 = group_route_district(gdf, "sum")

    # Add yes/no column to signify if a transit route intersects
    # with a SHN route
    agg1 = create_on_shs_column(agg1)

    return agg1

In [120]:
open_data_portal_df = prep_open_data_portal(intersecting)

## Why are the rows not the same?? -> Need `shape_id`

In [121]:
len(open_data_portal_df)

7416

In [122]:
open_data_portal_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7416 entries, 0 to 7415
Data columns (total 9 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   schedule_gtfs_dataset_key          7416 non-null   object 
 1   route_type                         7416 non-null   object 
 2   shape_id                           7416 non-null   object 
 3   route_id                           7416 non-null   object 
 4   route_name_used                    7416 non-null   object 
 5   shn_route                          7416 non-null   object 
 6   District                           7416 non-null   object 
 7   pct_route_on_hwy_across_districts  7416 non-null   float64
 8   on_shs                             7416 non-null   object 
dtypes: float64(1), object(8)
memory usage: 521.6+ KB


In [123]:
open_data_portal_df.pct_route_on_hwy_across_districts.describe()

count   7416.00
mean      16.98
std       27.33
min        0.00
25%        0.40
50%        1.40
75%       21.42
max      100.00
Name: pct_route_on_hwy_across_districts, dtype: float64

In [183]:
open_data_portal_df.on_shs.value_counts()

Y    5882
N    1534
Name: on_shs, dtype: int64

In [160]:
open_data_portal_df.columns

Index(['schedule_gtfs_dataset_key', 'route_type', 'shape_id', 'route_id',
       'route_name_used', 'shn_route', 'District',
       'pct_route_on_hwy_across_districts', 'on_shs'],
      dtype='object')

## How is possible to have `on_shs==N` but there are populated values in `shn_route` and `District`

In [165]:
open_data_portal_df.loc[open_data_portal_df.route_name_used == "Southwest Chief"]

Unnamed: 0,schedule_gtfs_dataset_key,route_type,shape_id,route_id,route_name_used,shn_route,District,pct_route_on_hwy_across_districts,on_shs
1739,48e137bc977da88970393f629c18432c,2,118,51,Southwest Chief,"10, 215, 66, 101, 5, 90, 605, 210, 39, 138, 58, 95, 91, 18, 40, 710, 15, 57, 60","8, 12, 7",0.0,N
1794,48e137bc977da88970393f629c18432c,2,270,51,Southwest Chief,"10, 215, 66, 101, 5, 90, 605, 210, 39, 138, 58, 95, 91, 18, 40, 710, 15, 57, 60","8, 12, 7",0.0,N


In [166]:
open_data_portal_df.loc[open_data_portal_df.route_name_used == "Pacific Surfliner"]

Unnamed: 0,schedule_gtfs_dataset_key,route_type,shape_id,route_id,route_name_used,shn_route,District,pct_route_on_hwy_across_districts,on_shs
1727,48e137bc977da88970393f629c18432c,2,101,78,Pacific Surfliner,"5, 710, 101, 8, 605, 73, 39, 10, 805, 55, 22, 57, 1, 133, 52, 91, 60","11, 12, 7",0.6,Y
1728,48e137bc977da88970393f629c18432c,2,102,78,Pacific Surfliner,"5, 710, 101, 8, 605, 73, 39, 10, 805, 55, 22, 57, 1, 133, 52, 91, 60","11, 12, 7",0.6,Y
1748,48e137bc977da88970393f629c18432c,2,146,78,Pacific Surfliner,"8, 217, 10, 55, 1, 52, 405, 101, 170, 227, 5, 605, 73, 39, 23, 805, 27, 22, 133, 134, 110, 91, 166, 710, 118, 34, 57, 2, 60","11, 5, 12, 7",0.7,Y
1749,48e137bc977da88970393f629c18432c,2,148,78,Pacific Surfliner,"8, 217, 10, 55, 1, 52, 405, 101, 170, 227, 5, 605, 73, 39, 23, 805, 27, 22, 133, 134, 110, 91, 166, 710, 118, 34, 57, 2, 60","11, 5, 12, 7",0.7,Y
1750,48e137bc977da88970393f629c18432c,2,149,78,Pacific Surfliner,"8, 217, 10, 55, 1, 52, 405, 101, 170, 5, 605, 73, 39, 23, 805, 27, 22, 133, 134, 110, 91, 710, 118, 34, 57, 2, 60","11, 5, 12, 7",1.1,Y
1751,48e137bc977da88970393f629c18432c,2,151,78,Pacific Surfliner,"8, 217, 10, 55, 1, 52, 405, 101, 170, 5, 605, 73, 39, 23, 805, 27, 22, 133, 134, 110, 91, 710, 118, 34, 57, 2, 60","11, 5, 12, 7",1.1,Y


### Map

In [167]:
m = shn_district_df.explore(
    name="District",
    tiles="CartoDB positron",
    style_kwds={"color": "#9DA4A6", "opacity": 0.5},
    height=500,
    width=1000,
    legend = False
)

In [171]:
southwest_chief = intersecting.loc[ (intersecting.route_name_used ==  "Southwest Chief") 
]

In [173]:
""" southwest_chief.explore(
    m=m,
    cmap="Spectral",
    categorical=True,
    legend=False,
    legend_kwds={"width": 200},
)"""

' southwest_chief.explore(\n    m=m,\n    cmap="Spectral",\n    categorical=True,\n    legend=False,\n    legend_kwds={"width": 200},\n)'

In [157]:
intersecting.loc[
    (intersecting.schedule_gtfs_dataset_key == "a253a8d7acd57657bb98050f37dd6b0f")
    & (intersecting.route_name_used == "RTS") & (intersecting.shape_id == "p_1435829")
].drop(columns = ["geometry"])

Unnamed: 0,n_trips,schedule_gtfs_dataset_key,route_id,route_type,shape_id,route_name_used,route_length_feet,shn_route,District,highway_feet,pct_route_on_hwy
10742,1,a253a8d7acd57657bb98050f37dd6b0f,76743,3,p_1435829,RTS,327002.95,101,1,2552547.02,41.9
10743,1,a253a8d7acd57657bb98050f37dd6b0f,76743,3,p_1435829,RTS,327002.95,299,1,260976.3,1.0
10744,1,a253a8d7acd57657bb98050f37dd6b0f,76743,3,p_1435829,RTS,327002.95,255,1,57340.79,12.1
10745,1,a253a8d7acd57657bb98050f37dd6b0f,76743,3,p_1435829,RTS,327002.95,36,1,240201.81,0.0
10746,1,a253a8d7acd57657bb98050f37dd6b0f,76743,3,p_1435829,RTS,327002.95,200,1,13996.49,0.5
10747,1,a253a8d7acd57657bb98050f37dd6b0f,76743,3,p_1435829,RTS,327002.95,211,1,29101.29,0.0
10748,1,a253a8d7acd57657bb98050f37dd6b0f,76743,3,p_1435829,RTS,327002.95,283,1,1989.75,0.5


In [142]:
open_data_portal_df.route_name_used.nunique()

1434

In [143]:
open_data_portal_df.route_id.nunique()

1755

In [144]:
len(routes_assembled)

7416

In [145]:
routes_assembled.route_id.nunique()

1755

In [146]:
routes_assembled.route_name_used.nunique()

1434

In [147]:
routes_assembled.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 7416 entries, 0 to 7415
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   n_trips                    7416 non-null   int64   
 1   geometry                   7416 non-null   geometry
 2   schedule_gtfs_dataset_key  7416 non-null   object  
 3   route_id                   7416 non-null   object  
 4   route_type                 7416 non-null   object  
 5   shape_id                   7416 non-null   object  
 6   route_name_used            7416 non-null   object  
 7   route_length_feet          7416 non-null   float64 
dtypes: float64(1), geometry(1), int64(1), object(5)
memory usage: 463.6+ KB


In [148]:
routes_assembled.columns

Index(['n_trips', 'geometry', 'schedule_gtfs_dataset_key', 'route_id',
       'route_type', 'shape_id', 'route_name_used', 'route_length_feet'],
      dtype='object')

In [159]:
len(routes_assembled)

7416

In [158]:
len(open_data_portal_df)

7416

In [149]:
open_data = (
    routes_assembled.groupby(["schedule_gtfs_dataset_key"])
    .agg({"route_name_used": "nunique", "route_id": "nunique"})
    .reset_index()
)

In [150]:
open_data_portal_df.columns

Index(['schedule_gtfs_dataset_key', 'route_type', 'shape_id', 'route_id',
       'route_name_used', 'shn_route', 'District',
       'pct_route_on_hwy_across_districts', 'on_shs'],
      dtype='object')

In [151]:
shn_test = (
    open_data_portal_df.groupby(["schedule_gtfs_dataset_key"])
    .agg({"route_name_used": "nunique", "route_id": "nunique"})
    .reset_index()
)

In [152]:
shn_test.sample(3)

Unnamed: 0,schedule_gtfs_dataset_key,route_name_used,route_id
48,4c6b107352b318297bb39173c796f357,18,18
149,d6d11b790b4c9c68760d46c4a7ee8a0c,3,3
163,f0207a9a6faf0275de6a06b49f787e06,5,5


In [153]:
shn_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181 entries, 0 to 180
Data columns (total 3 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   schedule_gtfs_dataset_key  181 non-null    object
 1   route_name_used            181 non-null    int64 
 2   route_id                   181 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 4.4+ KB
