# Transit On the SHN 
* [GH Issue](https://github.com/cal-itp/data-analyses/issues/1477)


In [None]:
import geopandas as gpd
import google.auth
import pandas as pd

credentials, project = google.auth.default()

import gcsfs

fs = gcsfs.GCSFileSystem()

In [None]:


from calitp_data_analysis import geography_utils, utils
from segment_speed_utils import gtfs_schedule_wrangling, helpers
from shared_utils import (
    catalog_utils,
    dask_utils,
    gtfs_utils_v2,
    portfolio_utils,
    publish_utils,
    rt_dates,
    rt_utils,
)
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS



In [None]:
# Notebook display settings: show up to 100 columns, print floats with
# two decimals, and lift the row-count and column-width limits.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [None]:
def process_transit_routes() -> gpd.GeoDataFrame:
    """
    Build the transit route shapes that are later compared against
    the state highway network.

    Reads the operator route map from GCS, narrows it with
    `publish_utils.filter_to_recent_date` grouped by organization,
    adds a `route_length_feet` column, and drops repeated
    organization / route name / service date rows.
    """
    route_map_table = GTFS_DATA_DICT.digest_tables.operator_routes_map
    keep_cols = [
        "service_date",
        "geometry",
        "portfolio_organization_name",
        "recent_combined_name",
        "schedule_gtfs_dataset_key",
    ]

    # Load the route shapes and keep only the columns needed downstream.
    all_routes = gpd.read_parquet(
        f"{RT_SCHED_GCS}{route_map_table}.parquet",
        storage_options={"token": credentials.token},
    )
    all_routes = all_routes[keep_cols]

    # The recency filter is grouped by organization only (not by route name).
    # NOTE(review): presumably this keeps the rows on each organization's
    # latest service_date -- confirm against publish_utils.
    latest_routes = publish_utils.filter_to_recent_date(
        df=all_routes,
        group_cols=["portfolio_organization_name"],
    )

    # Measure each route after projecting to CA_NAD83Albers_ft so the
    # length is expressed in that CRS's units (feet, going by its name).
    projected_shapes = latest_routes.geometry.to_crs(
        geography_utils.CA_NAD83Albers_ft
    )
    latest_routes = latest_routes.assign(
        route_length_feet=projected_shapes.length
    )

    # Keep a single row per organization / route name / service date.
    # This will probably be taken out once the 1:m recent_combined_name
    # to route_id issue is resolved.
    unique_key = [
        "portfolio_organization_name",
        "recent_combined_name",
        "service_date",
    ]
    return latest_routes.drop_duplicates(subset=unique_key)

In [None]:
transit_routes = process_transit_routes()

In [None]:
transit_routes.drop(columns=["geometry"]).sample(3)

In [None]:
len(transit_routes)

In [None]:
transit_routes.recent_combined_name.value_counts().head()

In [None]:
transit_routes.portfolio_organization_name.value_counts().head()

## Load in the State Highway Network (SHN)


In [None]:
def dissolve_shn() -> gpd.GeoDataFrame:
    """
    Collapse the State Highway Network to one row per highway
    (the `Route` column), measure each highway's length, and save
    the result to GCS so the dissolve does not have to be repeated.

    The returned GeoDataFrame is in the CA_NAD83Albers_ft CRS, names
    the highway column `shn_route` (cast to int), and carries a
    `highway_feet` length column.
    """
    # Look up the SHN file in the shared catalog.
    catalog = catalog_utils.get_catalog("shared_data_catalog")
    shn_path = catalog.state_highway_network.urlpath

    # Read it and reproject to CA_NAD83Albers_ft so lengths are
    # measured in that CRS's units.
    shn_raw = gpd.read_parquet(
        shn_path,
        storage_options={"token": credentials.token},
    )
    shn_projected = shn_raw.to_crs(geography_utils.CA_NAD83Albers_ft)

    # Dissolve on the highway identifier alone. The remaining attribute
    # columns no longer describe a single row after dissolving, so drop
    # them, and rename Route to avoid confusion with transit routes.
    by_highway = shn_projected.dissolve(by=["Route"]).reset_index()
    by_highway = by_highway.drop(
        columns=["County", "District", "Direction", "RouteType"]
    ).rename(columns={"Route": "shn_route"})

    # Length of each dissolved highway, plus an integer highway number.
    by_highway = by_highway.assign(
        highway_feet=by_highway.geometry.length,
        shn_route=by_highway.shn_route.astype(int),
    )

    # Save the dissolved network for reuse.
    by_highway.to_parquet(
        "gs://calitp-analytics-data/data-analyses/state_highway_network/shn_dissolved.parquet",
        filesystem=fs,
    )
    return by_highway

In [None]:
# dissolved = dissolve_shn()

In [None]:
# dissolved.loc[dissolved.shn_route == 210].drop(columns=["geometry"])

In [None]:
# dissolved.loc[dissolved.shn_route == 110].drop(columns=["geometry"])

In [None]:
# dissolved.loc[dissolved.shn_route == 210].explore()

In [None]:
# dissolved.loc[dissolved.shn_route == 110].explore()

In [None]:
# len(dissolved)

In [None]:
def buffer_shn(buffer_amount: int) -> gpd.GeoDataFrame:
    """
    Widen every dissolved SHN geometry by `buffer_amount` and save
    the result to GCS as `shn_buffered_{buffer_amount}_ft.parquet`.

    The distance is interpreted in the units of the dissolved file's
    CRS (feet when that file was written by `dissolve_shn`). Returns
    the buffered GeoDataFrame, ready to overlay with transit routes.
    """
    shn_folder = "gs://calitp-analytics-data/data-analyses/state_highway_network/"
    dissolved_path = f"{shn_folder}shn_dissolved.parquet"
    buffered_path = f"{shn_folder}shn_buffered_{buffer_amount}_ft.parquet"

    # Start from the already-dissolved highway network.
    dissolved_shn = gpd.read_parquet(
        dissolved_path,
        storage_options={"token": credentials.token},
    )

    # Swap each highway geometry for its buffered version.
    widened = dissolved_shn.geometry.buffer(buffer_amount)
    buffered_shn = dissolved_shn.assign(geometry=widened)

    # Cache the result so later runs can read it instead of
    # buffering again.
    buffered_shn.to_parquet(buffered_path, filesystem=fs)

    return buffered_shn

In [None]:
# Buffer distance passed to routes_shn_intersection() when overlaying
# transit routes on the SHN.
SHN_HWY_BUFFER_FEET = 50
# Half of FEET_PER_MI, i.e. half a mile expressed in feet.
# NOTE(review): not referenced elsewhere in the visible notebook --
# presumably intended for a parallel-route analysis; confirm.
PARALLEL_HWY_BUFFER_FEET = geography_utils.FEET_PER_MI * 0.5

In [None]:
# intersecting_buffer.shape

In [None]:
# buffered_df = buffer_shn(SHN_HWY_BUFFER_FEET)

In [None]:
# buffered_df.columns

## Overlay

In [None]:
def routes_shn_intersection(
    buffer_amount: int,
) -> gpd.GeoDataFrame:
    """
    Intersect the most recent transit routes with the SHN buffered by
    `buffer_amount` and report how much of each route lies inside it.

    The result has one row per transit route / highway pair that
    intersects. Each row keeps the original transit route geometry
    and adds the highway's columns plus `pct_route_on_hwy`, the share
    of the route's length inside that highway's buffer (0-100).
    """
    shn_folder = "gs://calitp-analytics-data/data-analyses/state_highway_network/"
    buffered_path = f"{shn_folder}shn_buffered_{buffer_amount}_ft.parquet"

    # Reuse the saved buffered SHN when it exists; otherwise build it.
    if not fs.exists(buffered_path):
        highways = buffer_shn(buffer_amount)
    else:
        highways = gpd.read_parquet(
            buffered_path, storage_options={"token": credentials.token}
        )

    # Put the transit routes in the same CRS as the highways so the
    # overlay compares like with like.
    routes = process_transit_routes().to_crs(highways.crs)

    # Pieces of each transit route that fall inside a highway buffer.
    overlaid = gpd.overlay(
        routes, highways, how="intersection", keep_geom_type=True
    )

    # Share of the full route length covered by each piece, rounded to
    # three decimals and then scaled to a percentage. The clipped
    # geometry is dropped because the original route shapes are wanted.
    share_on_hwy = overlaid.geometry.length / overlaid.route_length_feet
    overlaid = overlaid.assign(pct_route_on_hwy=share_on_hwy.round(3) * 100)
    overlaid = overlaid.drop(columns=["geometry"])

    # Attach the results back onto the original transit route shapes.
    join_keys = [
        "service_date",
        "portfolio_organization_name",
        "recent_combined_name",
        "schedule_gtfs_dataset_key",
        "route_length_feet",
    ]
    return pd.merge(routes, overlaid, on=join_keys, how="inner")

In [None]:
intersecting = routes_shn_intersection(SHN_HWY_BUFFER_FEET)

In [None]:
intersecting.pct_route_on_hwy.describe()

In [None]:
# transit_routes_gdf = process_transit_routes()

In [None]:
# len(intersecting)

In [None]:
# intersecting.loc[intersecting.pct_route_on_hwy != 0].recent_combined_name.nunique()

## Create final dataframes

In [None]:
def final_transit_route_shs_outputs(gdf: gpd.GeoDataFrame, pct_route_intersection: int):
    """
    Turn the transit route / state highway overlay into the two
    dataframes used for display.

    Parameters
    ----------
    gdf:
        Output of the overlay: one row per transit route / highway
        pair, with `pct_route_on_hwy`, `shn_route`, `geometry`,
        `schedule_gtfs_dataset_key`, and the organization and route
        name columns.
    pct_route_intersection:
        Threshold; only routes whose summed `pct_route_on_hwy` is
        strictly greater than this value are kept.

    Returns
    -------
    (map_gdf2, text_df)
        `map_gdf2` has one row per kept route geometry with its total
        `pct_route_on_hwy`, for mapping. `text_df` has one row per
        kept route with `shn_route` holding the highways it
        intersects as a comma-separated string in ascending order.
    """
    route_cols = ["portfolio_organization_name", "recent_combined_name"]

    # A route can intersect several highways, so sum the percentages
    # to get a single total per transit route.
    map_gdf = gdf.groupby(route_cols).agg({"pct_route_on_hwy": "sum"}).reset_index()

    # Drop routes whose total share on the highway is too low.
    map_gdf = map_gdf.loc[map_gdf.pct_route_on_hwy > pct_route_intersection]

    # Bring back the original transit route geometries for the map.
    map_gdf2 = pd.merge(
        gdf[route_cols + ["geometry"]].drop_duplicates(),
        map_gdf,
        on=route_cols,
    )

    # For the text table, swap the per-highway percentage for the
    # route total. The join keys are spelled out so the merge does not
    # depend on whichever columns the two frames happen to share.
    text_df = pd.merge(
        gdf.drop(columns=["pct_route_on_hwy"]),
        map_gdf2.drop(columns=["geometry"]),
        on=route_cols,
        how="inner",
    )

    # Collapse to one row per route, listing its highways. The unique
    # values are sorted before joining so the string is identical from
    # run to run (plain set iteration order is not stable for strings).
    text_df = (
        text_df.groupby(
            route_cols + ["schedule_gtfs_dataset_key", "pct_route_on_hwy"],
            as_index=False,
        )["shn_route"]
        .agg(lambda routes: ", ".join(str(route) for route in sorted(set(routes))))
        .reset_index(drop=True)
    )

    return map_gdf2, text_df

In [None]:
map_gdf, text_df = final_transit_route_shs_outputs(intersecting, 20)

## Sample Map
* The final version will be displayed separately for each Caltrans District; this sample instead shows all the routes across the state that touch a state highway.

In [None]:
len(map_gdf)

* Read in the buffered SHN file and dissolve it again so it's only one row, since we don't care about each individual state route.

In [None]:
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/state_highway_network/"

# Path of the SHN file buffered by SHN_HWY_BUFFER_FEET; it is read in the next cell.
HWY_FILE = f"{GCS_FILE_PATH}shn_buffered_{SHN_HWY_BUFFER_FEET}_ft.parquet"

In [None]:
shn_gdf = gpd.read_parquet(HWY_FILE, storage_options={"token": credentials.token})

In [None]:
len(shn_gdf)

In [None]:
shn_gdf2 = shn_gdf.dissolve()

In [None]:
len(shn_gdf2)

In [None]:
m = shn_gdf2.explore(height=250, width=500, name="shs")

In [None]:
m = map_gdf.explore("recent_combined_name", m=m, name="transit_routes")

In [None]:
 m

### Sample Text Table

In [None]:
text_df.sort_values(by = ["pct_route_on_hwy"], ascending = False)