# Transit On the SHN 
* [GH Issue](https://github.com/cal-itp/data-analyses/issues/1477)


In [2]:
import geopandas as gpd
import google.auth
import numpy as np
import pandas as pd

credentials, project = google.auth.default()

import gcsfs

fs = gcsfs.GCSFileSystem()

In [3]:
from calitp_data_analysis import geography_utils, utils
from segment_speed_utils import gtfs_schedule_wrangling, helpers
from shared_utils import (
    catalog_utils,
    dask_utils,
    gtfs_utils_v2,
    portfolio_utils,
    publish_utils,
    rt_dates,
    rt_utils,
)
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS

In [4]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [5]:
def process_transit_routes() -> gpd.GeoDataFrame:
    """
    Select the most recent transit route to
    figure out how much of it intersects with
    the state highway network.
    """
    # Load in the route shapes.
    OPERATOR_ROUTE = GTFS_DATA_DICT.digest_tables.operator_routes_map

    subset = [
        "service_date",
        "geometry",
        "portfolio_organization_name",
        "recent_combined_name",
        # "route_id",
    ]
    op_geography_df = gpd.read_parquet(
        f"{RT_SCHED_GCS}{OPERATOR_ROUTE}.parquet",
        storage_options={"token": credentials.token},
    )[subset]

    # Keep the row for each portfolio_organization_name/recent_combined_name
    # that is the most recent.
    most_recent_routes = publish_utils.filter_to_recent_date(
        df=op_geography_df,
        group_cols=[
            "portfolio_organization_name",
        ],
    )

    # Calculate the length of route, ensuring that it is in feet.
    most_recent_routes = most_recent_routes.assign(
        route_length_feet=most_recent_routes.geometry.to_crs(
            geography_utils.CA_NAD83Albers_ft
        ).length
    )

    # Drop any duplicates.
    # This will probably be taken out once the 1:m recent_combined_name
    # to route_id issue is resolved.
    most_recent_routes = most_recent_routes.drop_duplicates(
        subset=["portfolio_organization_name", "recent_combined_name", "service_date"]
    )
    return most_recent_routes

In [6]:
# transit_routes = process_transit_routes()

In [7]:
# transit_routes.drop(columns=["geometry"]).sample(3)

In [8]:
# len(transit_routes)

In [9]:
# transit_routes.recent_combined_name.value_counts().head()

In [10]:
# transit_routes.portfolio_organization_name.value_counts().head()

## Load in SHS 


In [11]:
def dissolve_shn(columns_to_dissolve: list, file_name: str) -> gpd.GeoDataFrame:
    """
    Dissolve State Highway Network so there will only be one row for each
    route name and route type
    """
    # Read in the dataset and change the CRS to one to feet.
    SHN_FILE = catalog_utils.get_catalog(
        "shared_data_catalog"
    ).state_highway_network.urlpath

    shn = gpd.read_parquet(
        SHN_FILE,
        storage_options={"token": credentials.token},
    ).to_crs(geography_utils.CA_NAD83Albers_ft)

    # Dissolve by route which represents the the route's name and drop the other columns
    # because they are no longer relevant.
    shn_dissolved = (shn.dissolve(by=columns_to_dissolve).reset_index())[
        columns_to_dissolve + ["geometry"]
    ]

    # Rename because I don't want any confusion between SHN route and
    # transit route.
    shn_dissolved = shn_dissolved.rename(columns={"Route": "shn_route"})

    # Find the length of each highway.
    shn_dissolved = shn_dissolved.assign(
        highway_feet=shn_dissolved.geometry.length,
        shn_route=shn_dissolved.shn_route.astype(int).astype(str),
    )

    # Save this out so I don't have to dissolve it each time.
    shn_dissolved.to_parquet(
        f"gs://calitp-analytics-data/data-analyses/state_highway_network/shn_dissolved_by_{file_name}.parquet",
        filesystem=fs,
    )
    return shn_dissolved

In [12]:
dissolved_route = dissolve_shn(["Route", "District"], "ct_district_route")

In [13]:
dissolved_route.shn_route.unique()

array(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14', '15', '16', '17', '18', '20', '22', '23', '24', '25',
       '26', '27', '28', '29', '32', '33', '34', '35', '36', '37', '38',
       '39', '40', '41', '43', '44', '45', '46', '47', '49', '50', '51',
       '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62',
       '63', '65', '66', '67', '68', '70', '71', '72', '73', '74', '75',
       '76', '77', '78', '79', '80', '82', '83', '84', '85', '86', '87',
       '88', '89', '90', '91', '92', '94', '95', '96', '97', '98', '99',
       '101', '103', '104', '105', '107', '108', '109', '110', '111',
       '112', '113', '114', '115', '116', '118', '119', '120', '121',
       '123', '124', '125', '126', '127', '128', '129', '130', '131',
       '132', '133', '134', '135', '136', '137', '138', '139', '140',
       '142', '144', '145', '146', '147', '149', '150', '151', '152',
       '153', '154', '155', '156', '158', '160', '161', '162', '163',

In [14]:
# shn_dissolved.loc[shn_dissolved.shn_route == 210].drop(columns=["geometry"])

In [15]:
# shn_dissolved.loc[shn_dissolved.shn_route == 110].drop(columns=["geometry"])

In [16]:
# shn_dissolved.loc[shn_dissolved.Route == 210].explore()

In [17]:
# shn_dissolved.loc[shn_dissolved.Route == 110].explore()

In [18]:
# len(dissolved)

In [19]:
def buffer_shn(buffer_amount: int, file_name: str) -> gpd.GeoDataFrame:
    """
    Add a buffer to the SHN before overlaying it with
    transit routes.
    """
    GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/state_highway_network/"

    # Read in the dissolved SHN file
    shn_df = gpd.read_parquet(
        f"{GCS_FILE_PATH}shn_dissolved_by_{file_name}.parquet",
        storage_options={"token": credentials.token},
    )

    # Buffer the state highway.
    shn_df_buffered = shn_df.assign(
        geometry=shn_df.geometry.buffer(buffer_amount),
    )

    # Save it out so we won't have to buffer over again and
    # can just read it in.
    shn_df_buffered.to_parquet(
        f"{GCS_FILE_PATH}shn_buffered_{buffer_amount}_ft_{file_name}.parquet",
        filesystem=fs,
    )

    return shn_df_buffered

In [20]:
SHN_HWY_BUFFER_FEET = 50
PARALLEL_HWY_BUFFER_FEET = geography_utils.FEET_PER_MI * 0.5

In [21]:
# intersecting_buffer.shape

In [22]:
# buffered_df = buffer_shn(SHN_HWY_BUFFER_FEET, "ct_district_route")

In [23]:
# buffered_df.loc[buffered_df.shn_route == 5].explore()

In [24]:
shn_district_df = gpd.read_parquet(
    f"gs://calitp-analytics-data/data-analyses/state_highway_network/shn_buffered_50_ft_ct_district_route.parquet",
    storage_options={"token": credentials.token},
)

In [25]:
shn_district_df.shape

(344, 4)

In [26]:
shn_district_df.drop(columns=["geometry"]).sample(3)

Unnamed: 0,shn_route,District,highway_feet
199,138,8,201549.35
262,199,1,235912.88
211,151,2,43383.23


In [27]:
shn_district_df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   shn_route     344 non-null    object  
 1   District      344 non-null    int64   
 2   geometry      344 non-null    geometry
 3   highway_feet  344 non-null    float64 
dtypes: float64(1), geometry(1), int64(1), object(1)
memory usage: 10.9+ KB


In [28]:
# buffered_df.columns

## Overlay

In [29]:
def routes_shn_intersection(buffer_amount: int, file_name: str) -> gpd.GeoDataFrame:
    """
    Overlay the most recent transit routes with a buffered version
    of the SHN
    """
    GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/state_highway_network/"

    # Read in buffered shn here or re buffer if we don't have it available.
    HWY_FILE = f"{GCS_FILE_PATH}shn_buffered_{buffer_amount}_ft_{file_name}.parquet"

    if fs.exists(HWY_FILE):
        shn_routes_gdf = gpd.read_parquet(
            HWY_FILE, storage_options={"token": credentials.token}
        )
    else:
        shn_routes_gdf = buffer_shn(buffer_amount)

    # Process the most recent transit route geographies and ensure the
    # CRS matches the SHN routes' GDF so the overlay doesn't go wonky.
    transit_routes_gdf = process_transit_routes().to_crs(shn_routes_gdf.crs)

    # Overlay transit routes with the SHN geographies.
    gdf = gpd.overlay(
        transit_routes_gdf, shn_routes_gdf, how="intersection", keep_geom_type=True
    )

    # Calcuate the percent of the transit route that runs on a highway, round it up and
    # multiply it by 100. Drop the geometry because we want the original transit route
    # shapes.
    gdf = gdf.assign(
        pct_route_on_hwy=(gdf.geometry.length / gdf.route_length_feet).round(3) * 100,
    ).drop(
        columns=[
            "geometry",
        ]
    )

    # Join back the dataframe above with the original transit route dataframes
    # so we can have the original transit route geographies.
    gdf2 = pd.merge(
        transit_routes_gdf,
        gdf,
        on=[
            "service_date",
            "portfolio_organization_name",
            "recent_combined_name",
            "route_length_feet",
        ],
        how="left",
    )

    # Clean up
    gdf2.District = gdf2.District.fillna(0).astype(int)
    return gdf2

In [30]:
intersecting = routes_shn_intersection(SHN_HWY_BUFFER_FEET, "ct_district_route")

In [31]:
intersecting.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 6788 entries, 0 to 6787
Data columns (total 9 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   service_date                 6788 non-null   datetime64[ns]
 1   geometry                     6775 non-null   geometry      
 2   portfolio_organization_name  6788 non-null   object        
 3   recent_combined_name         6788 non-null   object        
 4   route_length_feet            6775 non-null   float64       
 5   shn_route                    6191 non-null   object        
 6   District                     6788 non-null   int64         
 7   highway_feet                 6191 non-null   float64       
 8   pct_route_on_hwy             6191 non-null   float64       
dtypes: datetime64[ns](1), float64(3), geometry(1), int64(1), object(3)
memory usage: 530.3+ KB


In [32]:
intersecting.drop(columns=["geometry", "route_length_feet", "highway_feet"]).sample(10)

Unnamed: 0,service_date,portfolio_organization_name,recent_combined_name,shn_route,District,pct_route_on_hwy
4458,2025-04-16,FlixBus and Greyhound,FlixBus N2002 SF - LA - Palm Springs,80,4,2.1
4774,2025-04-16,FlixBus and Greyhound,Greyhound US0802 San Francisco - Fresno - Los Angeles,110,7,0.0
4273,2025-04-16,FlixBus and Greyhound,FlixBus 2015 San Diego - San Bernardino - Los Angeles,52,11,0.0
1567,2025-05-14,Los Angeles World Airports,LAX to US FlyAway - LAX to Union Station,1,7,3.5
3282,2025-05-14,City and County of San Francisco,28 19TH AVENUE,280,4,0.6
1372,2025-05-14,"University of California, Los Angeles",U4,10,7,0.5
1796,2025-05-14,"San Diego Metropolitan Transit System, Airport, Flagship Cruises",290 Rancho Bernardo Station - Downtown,805,11,0.2
3707,2025-05-14,OmniTrans,3 San Bdno - Baseline - Highland,210,8,0.6
4811,2025-04-16,FlixBus and Greyhound,Greyhound US0802 San Francisco - Fresno - Los Angeles,205,4,0.2
3094,2025-05-14,Yolo County Transportation District,ROUTE 40,5,3,0.6


In [33]:
len(intersecting)

6788

In [34]:
len(
    intersecting.loc[
        (intersecting.pct_route_on_hwy <= 1) & (intersecting.shn_route.notna())
    ]
)

4008

In [35]:
intersecting.loc[
    (intersecting.pct_route_on_hwy <= 1) & (intersecting.shn_route.notna())
].drop(columns=["geometry", "route_length_feet", "highway_feet"]).sample(10)

Unnamed: 0,service_date,portfolio_organization_name,recent_combined_name,shn_route,District,pct_route_on_hwy
1924,2025-05-14,"San Diego Metropolitan Transit System, Airport, Flagship Cruises",929 Downtown - Iris Transit Center,75,11,0.2
1079,2025-05-14,Alameda-Contra Costa Transit District,NX Grand Lake - Laurel Transbay,24,4,0.2
6690,2025-05-14,Santa Clara Valley Transportation Authority,Express 104 Milpitas BART - Stanford Research Park,85,4,0.1
1736,2025-05-14,"San Diego Metropolitan Transit System, Airport, Flagship Cruises",14 Grantville Trolley - Baltimore & Lake Murray,8,11,0.7
4997,2025-04-16,FlixBus and Greyhound,Greyhound US0900 Los Angeles - Las Vegas,60,7,0.0
4665,2025-04-16,FlixBus and Greyhound,Greyhound US0500 Seattle - Sacramento,80,3,0.0
4537,2025-04-16,FlixBus and Greyhound,FlixBus N2003 San Francisco - Los Angeles - San Diego,57,12,0.2
4256,2025-04-16,FlixBus and Greyhound,FlixBus 2007 LA - Sacramento,12,10,0.0
4009,2025-05-14,Sacramento Regional Transit District,Gold Downtown - Folsom,51,3,0.2
1517,2025-05-14,City of Montebello,10 Whittier Blvd,605,7,0.2


In [36]:
intersecting.pct_route_on_hwy.describe()

count   6191.00
mean       6.54
std       15.02
min        0.00
25%        0.10
50%        0.40
75%        3.70
max       95.20
Name: pct_route_on_hwy, dtype: float64

In [37]:
intersecting.recent_combined_name.value_counts().describe()

count   2546.00
mean       2.67
std        4.15
min        1.00
25%        1.00
50%        2.00
75%        3.00
max       72.00
Name: recent_combined_name, dtype: float64

In [38]:
# Find routes that cross multiple districts
multi_district_routes = (
    intersecting.groupby(["portfolio_organization_name", "recent_combined_name"])
    .agg({"District": "nunique"})
    .reset_index()
)

In [68]:
multi_district_routes.loc[
    ~multi_district_routes.portfolio_organization_name.isin(["FlixBus and Greyhound", "Amtrak"])
].sort_values(by=["District"], ascending=False).head(20)

Unnamed: 0,portfolio_organization_name,recent_combined_name,District
1646,Orange County Transportation Authority,1 Long Beach - San Clemente,3
1215,Kern County,100 Bakersfield - Lancaster,3
2309,Stanislaus Regional Transit Authority,BART - Modesto,2
248,Capitol Corridor Joint Powers Authority,CC Capitol Corridor,2
1763,Riverside Transit Agency,200 San Bernardino - Riverside - Anaheim/Disneyland Express,2
2368,Tehama County,GTC Glenn-Tehama Connect,2
1687,Orange County Transportation Authority,60 Long Beach - Tustin,2
2582,Yolo County Transportation District,ROUTE 230,2
1789,Sacramento County,Highway 99 Express,2
132,Amador Regional Transit System,1 Sacramento,2


In [40]:
bakersfield_route = intersecting.loc[
    intersecting.recent_combined_name == "100 Bakersfield - Lancaster"
]

In [41]:
bakersfield_route.District.unique()

array([7, 6, 9])

In [42]:
# bakersfield_route.drop(columns = ["service_date"]).explore()

In [43]:
intersecting.loc[
    intersecting.recent_combined_name == "100 Bakersfield - Lancaster"
].pct_route_on_hwy.sum()

75.4

In [66]:
intersecting.loc[intersecting.recent_combined_name == "Route 785"].drop(
    columns=["geometry"]
)

Unnamed: 0,service_date,portfolio_organization_name,recent_combined_name,route_length_feet,shn_route,District,highway_feet,pct_route_on_hwy
3507,2025-05-14,Antelope Valley Transit Authority,Route 785,384526.24,101,7,876278.76,14.4
3508,2025-05-14,Antelope Valley Transit Authority,Route 785,384526.24,110,7,327921.87,0.0
3509,2025-05-14,Antelope Valley Transit Authority,Route 785,384526.24,405,7,512007.45,0.1
3510,2025-05-14,Antelope Valley Transit Authority,Route 785,384526.24,5,7,967173.52,11.7
3511,2025-05-14,Antelope Valley Transit Authority,Route 785,384526.24,170,7,64634.09,8.5
3512,2025-05-14,Antelope Valley Transit Authority,Route 785,384526.24,134,7,141091.55,0.0
3513,2025-05-14,Antelope Valley Transit Authority,Route 785,384526.24,2,7,451415.95,0.0
3514,2025-05-14,Antelope Valley Transit Authority,Route 785,384526.24,118,7,416699.9,0.1
3515,2025-05-14,Antelope Valley Transit Authority,Route 785,384526.24,210,7,553415.12,0.0
3516,2025-05-14,Antelope Valley Transit Authority,Route 785,384526.24,14,7,583368.45,47.9


## Create final dataframe to add to Transit Routes dataset on the Open Data Portal [here](https://data.ca.gov/dataset/california-transit-routes/resource/3ea7eb11-4fc6-45ed-88f5-b599e38c6b19)

In [44]:
def group_route_district(df: pd.DataFrame, pct_route_on_hwy_agg: str) -> pd.DataFrame:

    # Aggregate by adding all the districts and SHN to a single row, rather than
    # multiple and sum up the total % of SHN a transit route intersects with
    agg1 = (
        df.groupby(
            [
                "portfolio_organization_name",
                "recent_combined_name",
            ],
            as_index=False,
        )[["shn_route", "District", "pct_route_on_hwy_across_districts"]]
        .agg(
            {
                "shn_route": lambda x: ", ".join(set(x.astype(str))),
                "District": lambda x: ", ".join(set(x.astype(str))),
                "pct_route_on_hwy_across_districts": pct_route_on_hwy_agg,
            }
        )
        .reset_index(drop=True)
    )

    # Clean up
    agg1.pct_route_on_hwy_across_districts = (
        agg1.pct_route_on_hwy_across_districts.astype(float).round(2)
    )
    return agg1

In [45]:
# group_route_district()

In [46]:
def create_on_shs_column(df):
    df["on_shs"] = np.where(df["pct_route_on_hwy_across_districts"] == 0, "N", "Y")
    return df

In [47]:
def prep_open_data_portal(gdf: gpd.GeoDataFrame) -> pd.DataFrame:
    """
    Prepare the gdf to join with the existing transit_routes
    dataframe that is published on the Open Data Portal
    """
    # Rename column
    gdf = gdf.rename(columns={"pct_route_on_hwy": "pct_route_on_hwy_across_districts"})
    # Group the dataframe so that one route only has one
    # row instead of multiple rows after finding its
    # intersection with any SHN routes.
    agg1 = group_route_district(gdf, "sum")

    # Add yes/no column to signify if a transit route intersects
    # with a SHN route
    agg1 = create_on_shs_column(agg1)

    return agg1

In [48]:
intersecting.columns

Index(['service_date', 'geometry', 'portfolio_organization_name',
       'recent_combined_name', 'route_length_feet', 'shn_route', 'District',
       'highway_feet', 'pct_route_on_hwy'],
      dtype='object')

In [49]:
open_data_portal_df = prep_open_data_portal(intersecting)

In [50]:
open_data_portal_df.District.unique()

array(['4', '0', '3, 10', '10', '9, 4, 3, 10', '4, 3', '4, 3, 2, 7, 5',
       '7, 5', '8, 7, 12', '8, 7, 11', '12', '7', '8', '3', '6', '5',
       '7, 12', '1', '9', '7, 9', '7, 4, 6, 10', '7, 6, 11, 12',
       '7, 6, 10, 3', '7, 11, 12', '8, 7', '6, 4, 10, 11, 12, 7', '3, 2',
       '6, 3, 10, 11, 12, 7', '11', '7, 9, 6', '6, 9', '4, 1', '2',
       '5, 4', '8, 12', '4, 10', '2, 1'], dtype=object)

In [51]:
open_data_portal_df.on_shs.value_counts()

Y    2021
N     600
Name: on_shs, dtype: int64

In [52]:
open_data_portal_df.loc[
    open_data_portal_df.on_shs == "Y"
].pct_route_on_hwy_across_districts.describe()

count   2021.00
mean      20.02
std       27.36
min        0.10
25%        1.00
50%        4.40
75%       31.60
max       97.80
Name: pct_route_on_hwy_across_districts, dtype: float64

In [53]:
open_data_portal_df.loc[
    open_data_portal_df.on_shs == "N"
].pct_route_on_hwy_across_districts.describe()

count   600.00
mean      0.00
std       0.00
min       0.00
25%       0.00
50%       0.00
75%       0.00
max       0.00
Name: pct_route_on_hwy_across_districts, dtype: float64

### Check a couple of routes 

In [69]:
intersecting.portfolio_organization_name.unique()

array(['Tulare County Regional Transit Agency',
       'Marin County Transit District', 'City of Auburn', 'City of Bell',
       'Santa Barbara Metropolitan Transit District',
       'Los Angeles County Metropolitan Transportation Authority',
       'Tahoe Transportation District',
       'Sonoma-Marin Area Rail Transit District',
       'Sonoma County Transit Schedule', 'Lake Transit Authority',
       'Yuba-Sutter Transit Authority', 'City of Glendale', 'Yurok Tribe',
       'City of Bell Gardens', 'POINT', 'Modoc Transportation Agency',
       'City of Fairfield', 'Alameda-Contra Costa Transit District',
       'City of Cudahy', 'Emeryville Transportation Management Agency',
       'City of Redondo Beach', 'City of West Covina',
       'Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)',
       'Yuma County Intergovernmental Public Transportation Authority',
       'City of Avalon', 'Curry Public Transit', 'City of La Puente',
       

In [72]:
intersecting.loc[intersecting.portfolio_organization_name == 'Golden Gate Bridge, Highway and Transportation District',].drop(
    columns=["geometry"]
).sort_values(by = ["recent_combined_name"])

Unnamed: 0,service_date,portfolio_organization_name,recent_combined_name,route_length_feet,shn_route,District,highway_feet,pct_route_on_hwy
5198,2025-05-14,"Golden Gate Bridge, Highway and Transportation District",101 Santa Rosa - San Francisco,330666.26,1.0,4,944979.07,0.1
5199,2025-05-14,"Golden Gate Bridge, Highway and Transportation District",101 Santa Rosa - San Francisco,330666.26,101.0,4,1824371.76,76.0
5200,2025-05-14,"Golden Gate Bridge, Highway and Transportation District",101 Santa Rosa - San Francisco,330666.26,131.0,4,35125.0,0.0
5201,2025-05-14,"Golden Gate Bridge, Highway and Transportation District",101 Santa Rosa - San Francisco,330666.26,580.0,4,621177.64,0.0
5202,2025-05-14,"Golden Gate Bridge, Highway and Transportation District",101 Santa Rosa - San Francisco,330666.26,37.0,4,225675.01,0.0
5203,2025-05-14,"Golden Gate Bridge, Highway and Transportation District",101 Santa Rosa - San Francisco,330666.26,12.0,4,478781.0,0.1
5204,2025-05-14,"Golden Gate Bridge, Highway and Transportation District",114 Mill Valley - San Francisco,97788.44,1.0,4,944979.07,3.4
5205,2025-05-14,"Golden Gate Bridge, Highway and Transportation District",114 Mill Valley - San Francisco,97788.44,101.0,4,1824371.76,34.0
5206,2025-05-14,"Golden Gate Bridge, Highway and Transportation District",114 Mill Valley - San Francisco,97788.44,80.0,4,748779.57,0.1
5207,2025-05-14,"Golden Gate Bridge, Highway and Transportation District",130 San Rafael - San Francisco,124935.82,1.0,4,944979.07,0.1


In [78]:
intersecting.loc[intersecting.recent_combined_name == '154 Novato - San Francisco',].pct_route_on_hwy.sum()

63.1

In [79]:
open_data_portal_df.loc[open_data_portal_df.recent_combined_name == '154 Novato - San Francisco']

Unnamed: 0,portfolio_organization_name,recent_combined_name,shn_route,District,pct_route_on_hwy_across_districts,on_shs
1179,"Golden Gate Bridge, Highway and Transportation District",154 Novato - San Francisco,"101, 131, 580, 80, 1, 37",4,63.1,Y


In [73]:
intersecting.loc[intersecting.recent_combined_name == '101 Santa Rosa - San Francisco',].pct_route_on_hwy.sum()

76.19999999999999

In [74]:
open_data_portal_df.loc[open_data_portal_df.recent_combined_name == '101 Santa Rosa - San Francisco']

Unnamed: 0,portfolio_organization_name,recent_combined_name,shn_route,District,pct_route_on_hwy_across_districts,on_shs
1174,"Golden Gate Bridge, Highway and Transportation District",101 Santa Rosa - San Francisco,"101, 131, 580, 1, 37, 12",4,76.2,Y


In [75]:
intersecting.loc[intersecting.recent_combined_name == '132 San Anselmo - San Francisco',].pct_route_on_hwy.sum()

56.600000000000016

In [77]:
intersecting.loc[intersecting.recent_combined_name == '132 San Anselmo - San Francisco',].shn_route.nunique()

5

In [76]:
open_data_portal_df.loc[open_data_portal_df.recent_combined_name == '132 San Anselmo - San Francisco']

Unnamed: 0,portfolio_organization_name,recent_combined_name,shn_route,District,pct_route_on_hwy_across_districts,on_shs
1177,"Golden Gate Bridge, Highway and Transportation District",132 San Anselmo - San Francisco,"101, 131, 580, 80, 1",4,56.6,Y


## Create final dataframes for portfolio

In [55]:
def categorize_percentiles(df):
    bins = [20, 41, 61, 81, np.inf]
    labels = ["20-40", "41-60", "61-80", "81-100"]
    df["percentile_route"] = pd.cut(
        df["pct_route_on_hwy"], bins=bins, labels=labels, right=False
    ).astype(str)
    return df

In [56]:
def final_transit_route_shs_outputs(
    intersecting_gdf: gpd.GeoDataFrame,
    open_data_df: pd.DataFrame,
    pct_route_intersection: int,
    district: str,
):
    """
    Take the dataframes from prep_open_data_portal and routes_shn_intersection.
    Prepare them for display on the GTFS Caltrans District Digest.

    intersecting_gdf: geodataframe created by
    open_data_df: dataframe created by
    pct_route_intersection: cutoff of the % of the transit route intersecting with the SHN
    district: the Caltrans district we are interested in.
    """
    # Filter out for any pct_route_on_hwy that we deem too low & for the relevant district.
    open_data_df = open_data_df.loc[
        (open_data_df.pct_route_on_hwy_across_districts > pct_route_intersection)
        & (open_data_df.District.str.contains(district))
    ]
    # intersecting_gdf.District = intersecting_gdf.District
    intersecting_gdf = intersecting_gdf.loc[
        intersecting_gdf.District.astype(str).str.contains(district)
    ]

    # Join back to get the original transit route geometries and the names of the
    # state highways these routes intersect with. This gdf will be used to
    # display a map.
    map_gdf = pd.merge(
        intersecting_gdf[
            ["portfolio_organization_name", "recent_combined_name", "geometry"]
        ].drop_duplicates(),
        open_data_df,
        on=["portfolio_organization_name", "recent_combined_name"],
    )

    # Add column for color scale when mapping
    # map_gdf = categorize_percentiles(map_gdf)

    # We want a text table to display.
    # Have to rejoin and to find only the SHN routes that are in the district
    # we are interested in.
    text_table_df = pd.merge(
        intersecting_gdf[
            [
                "portfolio_organization_name",
                "recent_combined_name",
                "shn_route",
                "District",
            ]
        ],
        open_data_df[
            [
                "portfolio_organization_name",
                "recent_combined_name",
                "pct_route_on_hwy_across_districts",
            ]
        ],
        on=["portfolio_organization_name", "recent_combined_name"],
    )

    # Now we have to aggregate again so each route will only have one row with the
    # district and SHN route info delinated by commas if there are multiple values.
    text_table = group_route_district(text_table_df, "max")

    # Rename for clarity
    text_table = text_table.rename(
        columns={
            "shn_route": f"shn_routes_in_d_{district}",
        }
    )

    return map_gdf, text_table

### Example using 100 Bakersfield - Lancaster which crosses D6, D7, D9

In [57]:
map_gdf_d7, text_df_d7 = final_transit_route_shs_outputs(
    intersecting, open_data_portal_df, 20, "7"
)

In [58]:
map_gdf_d9, text_df_d9 = final_transit_route_shs_outputs(
    intersecting, open_data_portal_df, 20, "9"
)

In [59]:
map_gdf_d6, text_df_d6 = final_transit_route_shs_outputs(
    intersecting, open_data_portal_df, 20, "6"
)

In [60]:
text_df_d7.loc[text_df_d7.recent_combined_name == "100 Bakersfield - Lancaster"]

Unnamed: 0,portfolio_organization_name,recent_combined_name,shn_routes_in_d_7,District,pct_route_on_hwy_across_districts
75,Kern County,100 Bakersfield - Lancaster,"14, 138",7,75.4


In [61]:
text_df_d7.loc[text_df_d7.recent_combined_name == "Route 785"]

Unnamed: 0,portfolio_organization_name,recent_combined_name,shn_routes_in_d_7,District,pct_route_on_hwy_across_districts
3,Antelope Valley Transit Authority,Route 785,"101, 14, 118, 210, 405, 134, 2, 5, 170, 110",7,82.7


In [62]:
text_df_d6.loc[text_df_d6.recent_combined_name == "100 Bakersfield - Lancaster"]

Unnamed: 0,portfolio_organization_name,recent_combined_name,shn_routes_in_d_6,District,pct_route_on_hwy_across_districts
21,Kern County,100 Bakersfield - Lancaster,"184, 204, 178, 223, 58",6,75.4


In [63]:
text_df_d9.loc[text_df_d9.recent_combined_name == "100 Bakersfield - Lancaster"]

Unnamed: 0,portfolio_organization_name,recent_combined_name,shn_routes_in_d_9,District,pct_route_on_hwy_across_districts
8,Kern County,100 Bakersfield - Lancaster,"202, 14, 58",9,75.4


## Sample Map
* This will be displayed by each Caltrans District as opposed to this sample which shows all the routes across the state that touch a state highway.

* Read in the buffered SHN file and dissolve it again so it's only one row, since we don't care about each individual state route.
* Amanda, note to self: should save this out to GCS because there's no point in dissolving over & over again
* Bug in geopandas [documented here](https://github.com/geopandas/geopandas/issues/3194), you can't specify a column and color.

In [None]:
def dissolve_buffered_for_map(buffer_amount: str) -> gpd.GeoDataFrame:
    GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/state_highway_network/"
    # Read in buffered shn here
    HWY_FILE = (
        f"{GCS_FILE_PATH}shn_buffered_{buffer_amount}_ft_ct_district_route.parquet"
    )
    gdf = gpd.read_parquet(HWY_FILE, storage_options={"token": credentials.token})

    # Dissolve by district
    gdf2 = gdf.dissolve("District").reset_index()[["geometry", "District", "shn_route"]]

    # Save
    gdf2.to_parquet(
        f"{GCS_FILE_PATH}shn_buffered_{buffer_amount}_gtfs_digest.parquet",
        filesystem=fs,
    )

In [None]:
gtfs_digest_shn = dissolve_buffered_for_map(SHN_HWY_BUFFER_FEET)

In [None]:
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/state_highway_network/"

In [None]:
shn_gdf = gpd.read_parquet(
    f"{GCS_FILE_PATH}shn_buffered_{SHN_HWY_BUFFER_FEET}_gtfs_digest.parquet",
    storage_options={"token": credentials.token},
)

In [None]:
shn_gdf.columns

In [None]:
shn_gdf.District.unique()

In [None]:
""" m = shn_gdf.loc[shn_gdf.District == 7].explore(
    height=500,
    width=1000,
    style_kwds={"color": "#9DA4A6", "weight": 6, "opacity": 0.5},
    tiles="CartoDB positron",
    name="shs",
)"""

In [None]:
m

In [None]:
cmap_colors = [
    "#93c3db",
    "#144c87",
    "#8c1024",
    "#e4846c",
]

In [None]:
cmap_colors_continuous = [
    "#93c3db",
    "#82b4d1",
    "#71a5c7",
    "#6096bd",
    "#4f87b3",
    "#3e78a9",
    "#2d699f",
    "#1c5a95",
    "#0b4b8b",
    "#003c81",
]

In [None]:
map_gdf_d7.columns

In [None]:
# map_gdf_d7.explore(
#    "percentile_route", m=m, categorical=True, cmap=cmap_colors, legend=True
# )

In [None]:
# map_gdf_d7.explore("pct_route_on_hwy_across_districts", m=m, cmap= 'Blues', legend = True)

### Sample Text Table

In [None]:
text_df_d7.sort_values(by=["pct_route_on_hwy_across_districts"], ascending=False)