# Bus Stops on SHN in CA

Uriel's request:

Can you help me get a copy of the dataset (Excel or .csv) for bus stops that are on the SHS?

I am looking for an Excel or .csv dataset that shows only the bus stops on the SHS.  A filtered, tabulated data of them based on a snapshot of the most recent dataset is fine.  Gillian wants to move this sub-dataset into Entur for protoyping purposes.

In [1]:
import os
os.environ["CALITP_BQ_MAX_BYTES"] = str(200_000_000_000)

import geopandas as gpd
import pandas as pd
from siuba import *

from segment_speed_utils import helpers
from shared_utils import catalog_utils, rt_dates, gtfs_utils_v2, rt_utils

SCHED_GCS = "gs://calitp-analytics-data/data-analyses/gtfs_schedule/"

In [2]:
pd.set_option('display.max_rows', None)

In [None]:
shared_data_catalog = catalog_utils.get_catalog("shared_data_catalog")

In [None]:
shn = shared_data_catalog.state_highway_network.read()

# 100 ft buffer around SHN
shn = shn.assign(
    geometry_buffered = (shn.geometry.to_crs("EPSG:2229")
                         .buffer(100)
                         .to_crs("EPSG:4326")
                        )
)

In [None]:
shn.plot()

In [None]:
ca_stops = shared_data_catalog.ca_transit_stops.read()

In [None]:
ca_stops['date'] = rt_dates.DATES['aug2024'] #  change to match most recent open data upload...

In [None]:
# wasn't finding `route_type` during sjoin_shs, changed to `routetypes`

unique_stop_cols = ["agency", "stop_id", "stop_name",
                        "RouteType", "date"]

In [None]:
def sjoin_shs(stops: gpd.GeoDataFrame):
    
    stops_on_shn = gpd.sjoin(
        stops, 
        shn[
            ["Route", "RouteType", "geometry_buffered"]
        ].set_geometry("geometry_buffered"),
        how = "inner",
        predicate = "intersects"
    ).drop(columns = "index_right")

    print(stops_on_shn[unique_stop_cols].drop_duplicates().shape)
    return stops_on_shn

In [None]:
display(
    ca_stops.columns,
    ca_stops.routetypes.unique()
)

In [None]:
display(
    shn.columns,
    shn.RouteType.unique()
)

In [None]:
def process_for_export(stops: gpd.GeoDataFrame):

    # Export / rename columns for clarity / get it as csv so back out lat/lon
    stops_for_export = stops[
        unique_stop_cols + ["Route", "RouteType", "geometry"]
    ].rename(columns = {
        "Route": "shn_route",
        "RouteType": "shn_route_type"}
    ).drop_duplicates().reset_index(drop=True)

    stops_for_export = stops_for_export.assign(
        x = stops_for_export.geometry.x,
        y = stops_for_export.geometry.y,
    ).drop(columns = "geometry")

    return stops_for_export

In [None]:
aug_shs_joined = sjoin_shs(ca_stops)

In [None]:
aug_shs_joined.columns

In [None]:
# aug_shs_joined.explore()

# adding SBMTD

## where'd it go?

In [None]:
analysis_date = rt_dates.DATES['aug2024'] #  start with same date as open data run (Aug 14)

In [None]:
sb = gtfs_utils_v2.schedule_daily_feed_to_gtfs_dataset_name(selected_date=analysis_date)

In [None]:
sb = sb >> filter(_.name.str.contains('SB'))
sb

In [None]:
sb_stops = gtfs_utils_v2.get_stops(selected_date=analysis_date, operator_feeds=sb.feed_key)

In [None]:
sb_stops >> head(3) #  empty

### classic upload with coverage gap (uploaded ~8/14, service in feed starts ~8/19)

https://github.com/cal-itp/data-infra/issues/1300

## here it is

In [None]:
analysis_date = '2024-08-21'

In [None]:
sb = gtfs_utils_v2.schedule_daily_feed_to_gtfs_dataset_name(selected_date=analysis_date)
sb = sb >> filter(_.name.str.contains('SB'))
sb

In [None]:
sb_stops = gtfs_utils_v2.get_stops(selected_date=analysis_date, operator_feeds=sb.feed_key)

In [None]:
sb_trips = gtfs_utils_v2.get_trips(selected_date=analysis_date, operator_feeds=sb.feed_key)

In [None]:
sb_trips.empty

In [None]:
sb_stops >> head(2)

In [None]:
#  lifted from https://github.com/cal-itp/data-analyses/blob/main/open_data/create_stops_data.py
export_stops_path = 'gs://calitp-analytics-data/data-analyses/traffic_ops/export/ca_transit_stops_'

In [None]:
analysis_date = rt_dates.DATES['jul2024']

In [None]:
jul_stops = gpd.read_parquet(f'{export_stops_path}{analysis_date}.parquet', filters=[('agency', '==', 'Santa Barbara Metropolitan Transit District')])

In [None]:
jul_stops['date'] = analysis_date

## now B-Line

In [None]:
analysis_date = rt_dates.DATES['mar2024']

In [None]:
mar_stops = gpd.read_parquet(f'{export_stops_path}{analysis_date}.parquet'
                             , filters=[('agency', '==', 'Butte County Association of Governments')]
                            )
mar_stops['date'] = analysis_date

## now LBT

In [None]:
analysis_date = rt_dates.DATES['may2024']

In [None]:
may_stops = gpd.read_parquet(f'{export_stops_path}{analysis_date}.parquet'
                             , filters=[('agency', '==', 'Long Beach Transit')]
                            )
may_stops['date'] = analysis_date

## concat and process

In [None]:
stops_to_add = pd.concat([jul_stops, mar_stops, may_stops])

In [None]:
# stops_to_add.explore()

In [None]:
additional_shs_joined = sjoin_shs(stops_to_add)

In [None]:
# additional_shs_joined.explore()

# new combined export

In [None]:
all_spatial = pd.concat([aug_shs_joined, additional_shs_joined])

In [None]:
stops_for_export = process_for_export(all_spatial)

In [None]:
stops_for_export.to_csv("ca_stops_revised.csv", index=False)

## a map?

using webapp

In [None]:
shn = gpd.read_parquet(rt_utils.SHN_PATH)[['Route', 'County', 'District',
                                           'RouteType', 'geometry']]

In [None]:
to_map = all_spatial.drop(columns=['base64_url'])
# to_map['color'] = (10, 29, 245)

In [None]:
import calitp_data_analysis

In [None]:
len(to_map.agency.unique())

In [None]:
#calitp_data_analysis.calitp_color_palette  #  doesn't work?

In [None]:
CALITP_CATEGORY_BOLD_COLORS = [
    "#136C97",  # darker blue
    "#E16B26",  # orange
    "#F6BF16",  # yellow
    "#00896B",  # green
    "#7790A3",  # lighter blue
    "#5B559C",  # purple
]

In [None]:
full_categories = CALITP_CATEGORY_BOLD_COLORS * 20

In [None]:
color_dict = dict(zip(to_map.agency.unique(), full_categories))

In [None]:
#  https://www.30secondsofcode.org/python/s/hex-to-rgb/
def hex_to_rgb(hex):
    return tuple(int(hex[i:i+2], 16) for i in (0, 2, 4))

In [None]:
to_map['color'] = to_map.agency.apply(lambda x: hex_to_rgb(color_dict[x][1:]))

In [None]:
export_result = rt_utils.set_state_export(shn, subfolder = 'shs_stops/', filename = 'shs',
                                map_type = 'state_highway_network')
spa_map_state = export_result['state_dict']

combined_state = rt_utils.set_state_export(
                    to_map, subfolder = 'shs_stops/', filename=f'stops4',
                    existing_state=spa_map_state, map_title=f'SHS with Stops Sep 2024')

In [None]:
combined_state

# Identify 5311 agencies from list


In [3]:
stops_for_export = pd.read_csv("ca_stops_revised.csv")

In [None]:
display(
    stops_for_export.info(),
    stops_for_export["agency"].nunique()
)

In [4]:
from calitp_data_analysis.tables import tbls

# read in suggested bridge table 
#bridge_org = (tbls.mart_transit_database.bridge_organizations_x_funding_programs()
#        >> filter(_._is_current == True) # when `True`, get 21 rows?
#        >> collect()
#)

In [None]:
# what about dim_annual_funding_sources?
# data from 2022 NTD

#fund_source = (tbls.mart_ntd.dim_annual_funding_sources()
#        >> filter(
#           # _._is_current == True,
#            _.fta_rural_progam_5311 > 0,
#            _.uza_name.str.contains(", CA")
#        ) # when `True`, get  rows?
#        >> collect()
#)

In [None]:
# dim_funding_programs

#fund_program = (tbls.mart_transit_database.dim_funding_programs()
#        >> filter(_._is_current == True) # when `True`, get  rows?
#        >> collect()
#)

## table that list services with the funding programs (5311)

In [5]:
bridge_service = (tbls.mart_transit_database.bridge_services_x_funding_programs()
        >> filter(
            _._is_current == True,
            _.funding_program_name == "5311"
        ) 
        >> collect()
)

In [6]:
service_keep_cols =[
    "service_key",
    "service_name",
    "funding_program_key",
    "funding_program_name"
]

bridge_service = bridge_service[service_keep_cols]

bridge_service.shape

(139, 4)

In [None]:
display(
    bridge_service.info(),
    bridge_service.sample(3)
)

In [7]:
bridge_service_keys = list(bridge_service["service_key"].unique())

In [8]:
len(bridge_service_keys)

139

## table to get agency name from services

In [9]:

gtfs_provider = (tbls.mart_transit_database.dim_provider_gtfs_data()
        >> filter(
            _._is_current == True,
            _.service_key.isin(bridge_service_keys)
        )
        >> collect()
)

In [None]:
display(
    gtfs_provider.info()
)

In [10]:
provider_keep_col =[
    "organization_key",
    "organization_name",
    "service_key",
    "service_name",

]

gtfs_provider = gtfs_provider[provider_keep_col]
gtfs_provider.shape

(165, 4)

In [None]:
gtfs_provider.head()

## merge services with 5311 funding list (`bridge_service`) to services-to-agency list (`gtfs_provider`) = agencies that receive 5311

In [12]:
orgs_5311 = gtfs_provider.merge(bridge_service, on=["service_key","service_name"], how="inner")

orgs_5311.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 165 entries, 0 to 164
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   organization_key      165 non-null    object
 1   organization_name     165 non-null    object
 2   service_key           165 non-null    object
 3   service_name          165 non-null    object
 4   funding_program_key   165 non-null    object
 5   funding_program_name  165 non-null    object
dtypes: object(6)
memory usage: 9.0+ KB


In [None]:
orgs_5311.sort_values(by="organization_name").head()

## merge agencies-with-stops-on-shn (`stops_for_export`) to 5311-agencies (`orgs_5311`).

In [15]:
# inspect
display(
    stops_for_export.info(), # +7,000 stops on the SHN
    orgs_5311.info() # 165 servies that get 5311 funds
)

#merge on `agency` and `organization_name`

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7216 entries, 0 to 7215
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   agency            7216 non-null   object 
 1   stop_id           7216 non-null   object 
 2   stop_name         7216 non-null   object 
 3   shn_route_type    7216 non-null   object 
 4   date              7216 non-null   object 
 5   shn_route         7216 non-null   int64  
 6   shn_route_type.1  7216 non-null   object 
 7   x                 7216 non-null   float64
 8   y                 7216 non-null   float64
dtypes: float64(2), int64(1), object(6)
memory usage: 507.5+ KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 165 entries, 0 to 164
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   organization_key      165 non-null    object
 1   organization_name     165 non-null    object
 2   serv

None

None

In [13]:
org_on_shn_stops_5311 = stops_for_export.merge(
    orgs_5311, 
    left_on="agency", 
    right_on="organization_name", 
    how="inner",
    indicator=True
)

org_on_shn_stops_5311.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8394 entries, 0 to 8393
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   agency                8394 non-null   object  
 1   stop_id               8394 non-null   object  
 2   stop_name             8394 non-null   object  
 3   shn_route_type        8394 non-null   object  
 4   date                  8394 non-null   object  
 5   shn_route             8394 non-null   int64   
 6   shn_route_type.1      8394 non-null   object  
 7   x                     8394 non-null   float64 
 8   y                     8394 non-null   float64 
 9   organization_key      8394 non-null   object  
 10  organization_name     8394 non-null   object  
 11  service_key           8394 non-null   object  
 12  service_name          8394 non-null   object  
 13  funding_program_key   8394 non-null   object  
 14  funding_program_name  8394 non-null   object  
 15  _mer

In [None]:
org_on_shn_stops_5311["_merge"].value_counts()

In [None]:
org_on_shn_stops_5311.head()

In [None]:
display(
    stops_for_export["agency"].nunique(),
    orgs_5311["organization_name"].nunique(),
    org_on_shn_stops_5311["agency"].nunique()
)

In [None]:
org_on_shn_stops_5311.to_csv("ca_stops_revised_5311.csv", index=False)

## lots of rows, lets try getting the unique agencies from `stops_for_export` and `orgs_5311` first, then join

In [45]:
# can i group by agencies and get a count of stops?
stops_agg = stops_for_export.groupby("agency").agg(
    stop_counts=("stop_id","count")
)

# get list of just unique agency names from each list 
agency_stops = stops_for_export["agency"].unique()
agency_5311 = orgs_5311["organization_name"].unique()

# convert array to DF
df_stops = pd.DataFrame(agency_array, columns=["agency"])
df_5311 = pd.DataFrame(agency_5311, columns=["agency"])

# inspect
display(
    df_stops.info(), # 150 unique agencies with stops on SHN
    df_5311.info() # 90 unique agencies that get 5311 funds
)
    

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   agency  150 non-null    object
dtypes: object(1)
memory usage: 1.3+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90 entries, 0 to 89
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   agency  90 non-null     object
dtypes: object(1)
memory usage: 848.0+ bytes


None

None

In [59]:
# merge aggreated stops to 5311 agencies
test_merge_2 = stops_agg.merge(
    df_5311, 
    on="agency",
    how="inner",
    indicator=True
)

# inspect
test_merge_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 69 entries, 0 to 68
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   agency       69 non-null     object  
 1   stop_counts  69 non-null     int64   
 2   _merge       69 non-null     category
dtypes: category(1), int64(1), object(1)
memory usage: 1.8+ KB


In [60]:
# spot check
test_merge_2

Unnamed: 0,agency,stop_counts,_merge
0,Amador Regional Transit System,22,both
1,Antelope Valley Transit Authority,53,both
2,Basin Transit,79,both
3,Butte County Association of Governments,106,both
4,Calaveras Transit Agency,12,both
5,Central Contra Costa Transit Authority,12,both
6,City of Arcata,37,both
7,City of Arvin,6,both
8,City of Escalon,9,both
9,City of Eureka,37,both


In [61]:
test_merge_2.to_csv("ca_stops_revised_5311_2.csv", index=False)