In [1]:
# Install shared_utils into the environment; the import cell pulls gtfs_utils_v2 from it.
# NOTE(review): the version is unpinned and this relies on IPython automagic —
# consider `%pip install shared_utils==<version>` once the required release is confirmed.
pip install shared_utils

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
os.environ["CALITP_BQ_MAX_BYTES"] = str(800_000_000_000)

import branca
import folium
from shared_utils import gtfs_utils_v2

from siuba import *
import pandas as pd
import geopandas as gpd 

import datetime as dt
import time

import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
import sys
sys.path.append('../rt_segment_speeds/segment_speed_utils')

from helpers import *

In [4]:
# Service date handed to the scheduled trips and scheduled stops loaders.
analysis_date = "2022-03-15"
# Text searched for in the trips table's `name` column to find the agency's feed.
agency_name = "SBMTD"

In [5]:
# Scheduled trips for the analysis date as a pandas DataFrame.
# `name` and `feed_key` are the columns the feed lookup reads.
get_trips = import_scheduled_trips(
    analysis_date=analysis_date,
    columns=[
        "gtfs_dataset_key",
        "feed_key",
        "name",
        "trip_id",
        "shape_id",
        "shape_array_key",
        "route_id",
        "route_key",
        "direction_id",
    ],
    get_pandas=True,
)

In [6]:
def compute_feed_key(agency_name, trips=None):
    """Return the feed_key of the first trip row whose name contains the agency text.

    Parameters
    ----------
    agency_name : str
        Text to look for in the ``name`` column. It is matched literally
        (case-sensitive substring), not interpreted as a regular expression.
    trips : pandas.DataFrame, optional
        Table with ``name`` and ``feed_key`` columns. When omitted, the
        notebook-level ``get_trips`` frame is used.

    Returns
    -------
    The ``feed_key`` value of the first matching row, or ``None`` when no
    row matches.
    """
    if trips is None:
        trips = get_trips

    # regex=False: characters such as "(" or "+" in an agency name must match
    # literally instead of being parsed as a pattern.
    # na=False: rows with a missing name never count as a match.
    is_match = trips['name'].str.contains(agency_name, regex=False, na=False)
    matching_keys = trips.loc[is_match, 'feed_key']

    if matching_keys.empty:
        return None
    return matching_keys.iloc[0]

In [7]:
# None here means no scheduled trip name contained agency_name.
feed_key = compute_feed_key(agency_name)

In [8]:
# Scheduled stops for the analysis date, de-duplicated and re-indexed from 0.
stops_data = (
    import_scheduled_stops(analysis_date)
    .drop_duplicates()
    .reset_index(drop=True)
)

# Keep only the agency's feed; when no feed_key was found the stops stay unfiltered.
if feed_key is not None:
    stops_data = stops_data[stops_data['feed_key'] == feed_key]

In [9]:
# GCS folder that holds the stop-usage workbook.
GCS_FILE_PATH = 'gs://calitp-analytics-data/data-analyses/ahsc_grant/'
# sheet_name=None loads every sheet; the result is used below as a mapping of
# sheet name -> DataFrame.
yr_sbmtd_raw = pd.read_excel(f'{GCS_FILE_PATH}SYSTEM WIDE STOP USAGE FY21-22.xlsx', sheet_name=None)

In [10]:
# Stamp each sheet's rows with its sheet name so the day type is kept
# once the sheets are stacked together.
for key, value in yr_sbmtd_raw.items():
    value['daytype'] = str(key)

# Spot-check the new column on the first sheet.
list(yr_sbmtd_raw.values())[0]['daytype']

0       WKDY
1       WKDY
2       WKDY
3       WKDY
4       WKDY
        ... 
2210    WKDY
2211    WKDY
2212    WKDY
2213    WKDY
2214    WKDY
Name: daytype, Length: 2215, dtype: object

In [11]:
# Stack all sheets into one frame with a fresh 0..n-1 index; the sheet of
# origin is carried by the `daytype` column.
yr_sbmtd_all = pd.concat(list(yr_sbmtd_raw.values()), ignore_index=True)

In [12]:
# Standardize stop ids and day-type labels.
# day_cols maps each sheet-derived day type to the label stored in DAY_TYPE.
day_cols = {'WKDY': 'weekday_ons', 'SAT': 'sat_ons', 'SUN': 'sun_ons'}

yr_sbmtd_all = (yr_sbmtd_all
                 # Rows whose STOP_ID is 99999 take STOP_ID_NEAR instead.
                 # NOTE(review): 99999 looks like a placeholder id — confirm against the workbook.
                 >> mutate(STOP_ID_clean = if_else(_.STOP_ID!=99999,_.STOP_ID,_.STOP_ID_NEAR),
                           # Plain dict lookup: a sheet name missing from day_cols raises KeyError.
                           DAY_TYPE = _.daytype.apply(lambda x: day_cols[x]))
               )


In [13]:
# Lookup table of cleaned stop id -> stop name, built from the distinct
# (STOP_ID_clean, STOP_NAME) pairs.
# NOTE(review): nothing here limits an id to one name; if an id carries several
# names, a join on STOP_ID_clean will repeat that id's rows — verify against the data.
name_id_dict = (yr_sbmtd_all 
                >> distinct(_.STOP_ID_clean,_.STOP_NAME)
                >> arrange(_.STOP_ID_clean,_.STOP_NAME)
                >> filter(_.STOP_NAME.notna()) # drop pairs whose name is missing
                >> rename(STOP_NAME_clean = "STOP_NAME")
               )

In [14]:
# Attach the cleaned stop name, then total XBOARDINGS per stop and day type.
yr_sbmtd_grouped = (yr_sbmtd_all
                # Join key stated explicitly instead of being inferred from
                # whatever column names the two frames happen to share.
                >> left_join(_, name_id_dict, on = ['STOP_ID_clean'])
                >> group_by(_.STOP_ID_clean,_.STOP_NAME_clean, _.DAY_TYPE)
                >> summarize(stop_total_ons = _.XBOARDINGS.sum())
                )

yr_sbmtd_grouped >> head(5)

Unnamed: 0,STOP_ID_clean,STOP_NAME_clean,DAY_TYPE,stop_total_ons
0,-51,Pacific Oaks/Phelps,sat_ons,0
1,-51,Pacific Oaks/Phelps,sun_ons,0
2,-51,Pacific Oaks/Phelps,weekday_ons,2
3,-49,Pueblo/Castillo Out,sat_ons,116
4,-49,Pueblo/Castillo Out,sun_ons,109


In [15]:
# Reshape to one row per stop with a boardings column per day type, and tag
# the rows with the agency's feed_key.
# This cell rebinds yr_sbmtd_grouped: re-run the aggregation cell first if this
# one needs to be run again, because STOP_ID_clean is renamed away here.
yr_sbmtd_grouped = (
    yr_sbmtd_grouped
    >> mutate(STOP_ID_clean=_.STOP_ID_clean.astype(str))
    >> rename(stop_id="STOP_ID_clean")
    >> spread("DAY_TYPE", "stop_total_ons")
    >> mutate(feed_key=feed_key)
)

yr_sbmtd_grouped >> head(5)

Unnamed: 0,stop_id,STOP_NAME_clean,sat_ons,sun_ons,weekday_ons,feed_key
0,-1,Hollister/Sumida,319.0,381.0,3926.0,52201caab047b98ae19b7547c0d7c2ad
1,-10,Hollister/Robin Hill,29.0,13.0,173.0,52201caab047b98ae19b7547c0d7c2ad
2,-11,Hollister/Willow Springs,78.0,80.0,658.0,52201caab047b98ae19b7547c0d7c2ad
3,-12,Hollister/Los Carneros Way,40.0,22.0,396.0,52201caab047b98ae19b7547c0d7c2ad
4,-13,Hollister/Cremona,49.0,43.0,659.0,52201caab047b98ae19b7547c0d7c2ad


In [16]:
# Trim the scheduled stops to the join keys (feed_key, stop_id) plus the
# stop name and geometry.
stops_to_join = (stops_data
                 >> select(_.feed_key, _.stop_id, _.stop_name, _.geometry)
                )

In [17]:
# Stops found in both the schedule and the ridership table; the schedule side
# is on the left, so each matched row carries its geometry.
yr_sbmtd_geo_code = stops_to_join >> inner_join(_, yr_sbmtd_grouped, on = ['feed_key', 'stop_id'])

In [18]:
# Inspect row count, dtypes and per-column null counts of the joined frame.
yr_sbmtd_geo_code.info() 

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 336 entries, 0 to 335
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   feed_key         336 non-null    object  
 1   stop_id          336 non-null    object  
 2   stop_name        336 non-null    object  
 3   geometry         336 non-null    geometry
 4   STOP_NAME_clean  336 non-null    object  
 5   sat_ons          282 non-null    float64 
 6   sun_ons          275 non-null    float64 
 7   weekday_ons      336 non-null    float64 
dtypes: float64(3), geometry(1), object(4)
memory usage: 23.6+ KB


In [19]:
# Scheduled stops that have no matching row in the ridership table.
stops_remainder = stops_to_join >> anti_join(_, yr_sbmtd_grouped, on=['feed_key', 'stop_id'])

len(stops_remainder)

271

In [21]:
# Ridership rows whose stop_id has no matching scheduled stop.
yr_sbmtd_remainders = yr_sbmtd_grouped >> anti_join(_, stops_to_join, on=['feed_key', 'stop_id'])

len(yr_sbmtd_remainders)

419