In [1]:
pip install shared_utils

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
# Set CALITP_BQ_MAX_BYTES to 800_000_000_000, stored as a string because environment
# values must be strings.
# NOTE(review): presumably a BigQuery byte cap read by the packages imported below, which
# would be why it is set before those imports — confirm.
os.environ["CALITP_BQ_MAX_BYTES"] = str(800_000_000_000)

import branca
import folium
from shared_utils import gtfs_utils_v2

from siuba import *
import pandas as pd
import geopandas as gpd 

import datetime as dt
import time

import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
import sys
# Put the segment_speed_utils folder on the import path so the bare `helpers` module
# resolves. The path is relative, so it depends on the notebook's working directory.
sys.path.append('../rt_segment_speeds/segment_speed_utils')

# NOTE(review): the star import hides where names come from; `import_scheduled_trips` and
# `import_scheduled_stops` used below are presumably provided by it — confirm.
from helpers import *

In [4]:
# Analysis inputs:
#   agency_name   - text searched for in the scheduled trips' `name` column
#   analysis_date - service date passed to the schedule import helpers
agency_name = "Salinas"
analysis_date = "2022-03-15"

In [5]:
# Columns requested from the scheduled-trips table.
trip_columns = [
    "gtfs_dataset_key",
    "feed_key",
    "name",
    "trip_id",
    "shape_id",
    "shape_array_key",
    "route_id",
    "route_key",
    "direction_id",
]

# Scheduled trips for the analysis date; `name` and `feed_key` are used further down to
# find the agency's feed.
get_trips = import_scheduled_trips(
    analysis_date=analysis_date,
    columns=trip_columns,
    get_pandas=True,
)

In [6]:
def compute_feed_key(agency_name, trips=None):
    """Return the ``feed_key`` of the first trip row whose ``name`` contains ``agency_name``.

    Parameters
    ----------
    agency_name : str
        Text to look for inside the ``name`` column. It is matched literally
        (not as a regular expression) and case-sensitively, so names containing
        characters such as ``(``, ``.`` or ``+`` are handled correctly.
    trips : pandas.DataFrame, optional
        Table with ``name`` and ``feed_key`` columns. When omitted, the
        notebook-level ``get_trips`` table is used.

    Returns
    -------
    The ``feed_key`` value of the first matching row, or ``None`` when no row matches.
    Rows whose ``name`` is missing never match.
    """
    if trips is None:
        # Resolved at call time so the function keeps working with the notebook global.
        trips = get_trips

    name_matches = trips["name"].str.contains(agency_name, na=False, regex=False)
    matching_keys = trips.loc[name_matches, "feed_key"]

    if matching_keys.empty:
        return None
    return matching_keys.iloc[0]

In [7]:
# feed_key of the selected agency; None when no trip name contains the agency text.
feed_key = compute_feed_key(agency_name=agency_name)

In [8]:
# Scheduled stops for the analysis date, de-duplicated and re-indexed.
stops_data = (
    import_scheduled_stops(analysis_date)
    .drop_duplicates()
    .reset_index(drop=True)
)

# Narrow to the agency's feed. When no feed_key was found the table is left unfiltered.
if feed_key is not None:
    stops_data = stops_data.loc[stops_data["feed_key"].eq(feed_key)]

In [9]:
# Bucket folder for the AHSC grant files. Defined without a trailing slash so it agrees
# with the definition used when the results are written at the end of the notebook.
GCS_FILE_PATH = 'gs://calitp-analytics-data/data-analyses/ahsc_grant'

# Stop-level boardings workbook; one row per stop and schedule type.
yr_mst_raw = pd.read_excel(f'{GCS_FILE_PATH}/MST Stop Level Data 9.1.21-8.31.22.xlsx')

In [10]:
# Preview the raw stop-level boardings table.
yr_mst_raw

Unnamed: 0,Stop_ID,Stop_Name,Latitude,Longitude,Yearly_Boardings,Schedule
0,2,DELMONTECENTER/GATE1,36.584641,-121.897424,9287,Weekday
1,2,DELMONTECENTER/GATE1,36.584644,-121.897400,2240,Saturday
2,2,DELMONTECENTER/GATE1,36.584670,-121.897378,1740,Sunday
3,3,DELMONTECENTER/GATE2,36.584786,-121.897339,1757,Weekday
4,3,DELMONTECENTER/GATE2,36.584781,-121.897337,616,Saturday
...,...,...,...,...,...,...
2747,9304,MARINATRANSITEXCHANGEGATE4,36.683744,-121.794475,1288,Saturday
2748,9304,MARINATRANSITEXCHANGEGATE4,36.683754,-121.794503,1044,Sunday
2749,9305,MARINATRANSITEXCHANGEGATE5,36.683734,-121.794164,251,Weekday
2750,9305,MARINATRANSITEXCHANGEGATE5,36.683892,-121.794205,616,Saturday


In [11]:
# Column dtypes and non-null counts for the raw table.
yr_mst_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2752 entries, 0 to 2751
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Stop_ID           2752 non-null   int64  
 1   Stop_Name         2752 non-null   object 
 2   Latitude          2752 non-null   float64
 3   Longitude         2752 non-null   float64
 4   Yearly_Boardings  2752 non-null   int64  
 5   Schedule          2752 non-null   object 
dtypes: float64(2), int64(2), object(2)
memory usage: 129.1+ KB


In [12]:
# Stop ID -> stop name crosswalk, kept around for labelling leftovers later.
# Known issue: one ID can appear with more than one spelling of its name, so an ID
# may occupy several rows here.
mst_stopnames = (
    yr_mst_raw
    >> distinct(_.Stop_ID, _.Stop_Name)
    # IDs are turned into strings so they can be compared with string stop_id values
    >> mutate(Stop_ID=_.Stop_ID.astype("int64").astype(str))
    >> rename(stop_id=_.Stop_ID)
)

In [13]:
# Schedule label in the workbook -> name of the ridership column it becomes.
day_cols = {
    "Weekday": "weekday_ons",
    "Saturday": "sat_ons",
    "Sunday": "sun_ons",
}

# Yearly boardings summed per stop and day type (long format).
yr_mst_grouped = (
    yr_mst_raw
    >> mutate(
        Stop_ID=_.Stop_ID.astype("int64").astype(str),
        # plain dict indexing: a Schedule label missing from day_cols raises KeyError
        DAY_TYPE=_.Schedule.apply(lambda label: day_cols[label]),
    )
    >> group_by(_.Stop_ID, _.DAY_TYPE)
    >> summarize(stop_total_ons=_.Yearly_Boardings.sum())
)

In [14]:
# Peek at the long-format totals (one row per stop and day type).
yr_mst_grouped.head(3)

Unnamed: 0,Stop_ID,DAY_TYPE,stop_total_ons
0,1001,sat_ons,112
1,1001,sun_ons,58
2,1001,weekday_ons,753


In [15]:
# Pivot to one row per stop with a column per day type, then tag each row with the
# agency's feed_key and a fixed schedule name so it can be joined to the stops table.
# Note: this overwrites yr_mst_grouped and consumes its DAY_TYPE column, so the cell
# cannot be re-run on its own without rebuilding the long-format table first.
yr_mst_grouped = (
    yr_mst_grouped
    >> spread("DAY_TYPE", "stop_total_ons")
    >> rename(stop_id=_.Stop_ID)
    >> mutate(
        feed_key=feed_key,
        name='Monterey Salinas Schedule',
    )
)

In [16]:
# Peek at the wide-format table (one row per stop, one column per day type).
yr_mst_grouped.head(3)

Unnamed: 0,stop_id,sat_ons,sun_ons,weekday_ons,feed_key,name
0,1001,112.0,58.0,753.0,efb3d4ea58f58d541e2c452e253bec5d,Monterey Salinas Schedule
1,1004,672.0,754.0,8534.0,efb3d4ea58f58d541e2c452e253bec5d,Monterey Salinas Schedule
2,1007,112.0,58.0,753.0,efb3d4ea58f58d541e2c452e253bec5d,Monterey Salinas Schedule


In [17]:
# Number of distinct stops in the ridership table.
len(yr_mst_grouped)

974

In [18]:
# Keep only the join keys, the stop name and the geometry from the scheduled stops.
stops_to_join = stops_data >> select(
    _.feed_key,
    _.stop_id,
    _.stop_name,
    _.geometry,
)

In [19]:
# First pass: attach ridership to scheduled stops on feed_key + stop_id.
# Only stops present in both tables are kept.
yr_mst_joined = (
    stops_to_join
    >> inner_join(_, yr_mst_grouped, on=["feed_key", "stop_id"])
)

In [20]:
# Peek at the first-pass join result.
yr_mst_joined.head(3)

Unnamed: 0,feed_key,stop_id,stop_name,geometry,sat_ons,sun_ons,weekday_ons,name
0,efb3d4ea58f58d541e2c452e253bec5d,1001,Fremont / Hilby Avenue,POINT (-165123.570 -155506.782),112.0,58.0,753.0,Monterey Salinas Schedule
1,efb3d4ea58f58d541e2c452e253bec5d,1004,Fremont / Trinity Avenue,POINT (-165043.149 -155309.483),672.0,754.0,8534.0,Monterey Salinas Schedule
2,efb3d4ea58f58d541e2c452e253bec5d,1007,Fremont / Elm Avenue,POINT (-164902.027 -154954.295),112.0,58.0,753.0,Monterey Salinas Schedule


In [21]:
# Scheduled stops that found no ridership row in the first pass.
stops_remainder = (
    stops_to_join
    >> anti_join(_, yr_mst_grouped, on=["feed_key", "stop_id"])
)

len(stops_remainder)

185

In [22]:
# Ridership rows that found no scheduled stop in the first pass.
yr_mst_remainder = (
    yr_mst_grouped
    >> anti_join(_, stops_to_join, on=["feed_key", "stop_id"])
)

len(yr_mst_remainder)

209

In [23]:
# Left-pad the unmatched ridership IDs with zeros to four characters (e.g. "2" -> "0002");
# IDs that already have four or more characters are unchanged.
yr_mst_remainder = yr_mst_remainder >> mutate(
    stop_id=_.stop_id.apply(lambda raw_id: raw_id.zfill(4))
)

In [24]:
# Second pass: join the still-unmatched stops against the zero-padded ridership IDs.
yr_mst_joined_zero = (
    stops_remainder
    >> inner_join(_, yr_mst_remainder, on=["feed_key", "stop_id"])
)
yr_mst_joined_zero.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 163 entries, 0 to 162
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   feed_key     163 non-null    object  
 1   stop_id      163 non-null    object  
 2   stop_name    163 non-null    object  
 3   geometry     163 non-null    geometry
 4   sat_ons      136 non-null    float64 
 5   sun_ons      136 non-null    float64 
 6   weekday_ons  163 non-null    float64 
 7   name         163 non-null    object  
dtypes: float64(3), geometry(1), object(4)
memory usage: 11.5+ KB


In [25]:
# Scheduled stops that matched ridership in neither pass.
stops_leftovers = (
    stops_remainder
    >> anti_join(_, yr_mst_remainder, on=["feed_key", "stop_id"])
)
stops_leftovers

Unnamed: 0,feed_key,stop_id,stop_name,geometry
16722,efb3d4ea58f58d541e2c452e253bec5d,1200,Ardennes / Aachen Rd,POINT (-161587.746 -152086.439)
16723,efb3d4ea58f58d541e2c452e253bec5d,1201,Ardennes / 214 Ardennes Circle,POINT (-161338.819 -152109.640)
16724,efb3d4ea58f58d541e2c452e253bec5d,1202,Ardennes / Remagen (Fitch),POINT (-161134.083 -152220.921)
16725,efb3d4ea58f58d541e2c452e253bec5d,1204,Ardennes / Tunisia Rd,POINT (-160847.306 -152466.372)
16726,efb3d4ea58f58d541e2c452e253bec5d,1206,Ardennes / 2775 Ardennes,POINT (-161070.900 -152688.444)
16727,efb3d4ea58f58d541e2c452e253bec5d,1208,Ardennes / Metz,POINT (-161520.892 -152566.152)
16728,efb3d4ea58f58d541e2c452e253bec5d,1210,Ardennes / Hatten Rd,POINT (-161755.732 -152529.654)
17003,efb3d4ea58f58d541e2c452e253bec5d,2933,San Miguel Canyon Rd / # 1469,POINT (-150143.829 -129411.686)
17004,efb3d4ea58f58d541e2c452e253bec5d,2935,San Miguel Canyon Rd / Woodland Hills,POINT (-150465.732 -128846.788)
17026,efb3d4ea58f58d541e2c452e253bec5d,2977,San Miguel Canyon Rd / #1335,POINT (-150474.871 -128839.843)


In [26]:
# Ridership rows that matched no scheduled stop in either pass.
riders_leftovers = (
    yr_mst_remainder
    >> anti_join(_, stops_remainder, on=["feed_key", "stop_id"])
)

In [27]:
# Label the unmatched ridership rows with their workbook stop names.
# The key is spelled out so the join cannot silently change if the two tables ever share
# another column. An ID listed under several names in the crosswalk yields several rows.
riders_leftovers >> left_join(_, mst_stopnames, on="stop_id")

Unnamed: 0,stop_id,sat_ons,sun_ons,weekday_ons,feed_key,name,Stop_Name
0,1101,0.0,0.0,251.0,efb3d4ea58f58d541e2c452e253bec5d,Monterey Salinas Schedule,INTERGARRISON/OTTERSPORTSCT
1,1101,0.0,0.0,251.0,efb3d4ea58f58d541e2c452e253bec5d,Monterey Salinas Schedule,INTERGARRISON/OTTERSPORTSCTR
2,1394,0.0,,0.0,efb3d4ea58f58d541e2c452e253bec5d,Monterey Salinas Schedule,SCHILLING/COUNTYGOVERNMENTC
3,1601,8680.0,8062.0,23343.0,efb3d4ea58f58d541e2c452e253bec5d,Monterey Salinas Schedule,DELMONTE/CONFERENCECENTER
4,1603,22624.0,22562.0,74296.0,efb3d4ea58f58d541e2c452e253bec5d,Monterey Salinas Schedule,CANNERYROW/PRESCOTT
5,1604,19936.0,17922.0,33634.0,efb3d4ea58f58d541e2c452e253bec5d,Monterey Salinas Schedule,OLIVIER/FISHERMAN'SWHARF
6,1611,1288.0,1856.0,5020.0,efb3d4ea58f58d541e2c452e253bec5d,Monterey Salinas Schedule,FRANKLIN/ALVARADO
7,1612,168.0,58.0,502.0,efb3d4ea58f58d541e2c452e253bec5d,Monterey Salinas Schedule,ALVARADO/FRANKLIN
8,1614,4480.0,4640.0,20582.0,efb3d4ea58f58d541e2c452e253bec5d,Monterey Salinas Schedule,CANNERYROW/DRAKE
9,1616,28952.0,31204.0,74045.0,efb3d4ea58f58d541e2c452e253bec5d,Monterey Salinas Schedule,CANNERYROW/AQUARIUM


In [28]:
# Stack both join passes into one table: zero-padded matches first, then direct matches.
# ignore_index=True gives the combined table a fresh 0..n-1 index.
gdfs = [
    yr_mst_joined_zero,
    yr_mst_joined,
]
yr_mst_geo_all = pd.concat(gdfs, ignore_index=True)

yr_mst_geo_all

Unnamed: 0,feed_key,stop_id,stop_name,geometry,sat_ons,sun_ons,weekday_ons,name
0,efb3d4ea58f58d541e2c452e253bec5d,0002,Del Monte Center / Gate 1,POINT (-169532.243 -157455.258),2240.0,1740.0,9287.0,Monterey Salinas Schedule
1,efb3d4ea58f58d541e2c452e253bec5d,0003,Del Monte Center / Gate 2,POINT (-169524.827 -157441.956),616.0,348.0,1757.0,Monterey Salinas Schedule
2,efb3d4ea58f58d541e2c452e253bec5d,0004,Del Monte Center / Gate 3,POINT (-169673.036 -157470.002),56.0,58.0,502.0,Monterey Salinas Schedule
3,efb3d4ea58f58d541e2c452e253bec5d,0006,6th / Mission Street,POINT (-171687.391 -160632.627),4312.0,4234.0,21837.0,Monterey Salinas Schedule
4,efb3d4ea58f58d541e2c452e253bec5d,0011,Northridge Mall,POINT (-147895.267 -143297.134),9128.0,13224.0,40662.0,Monterey Salinas Schedule
...,...,...,...,...,...,...,...,...
923,efb3d4ea58f58d541e2c452e253bec5d,9209,Salinas Transit Center / Gate 9,POINT (-147849.267 -147674.435),1960.0,1914.0,13554.0,Monterey Salinas Schedule
924,efb3d4ea58f58d541e2c452e253bec5d,9301,Marina Transit Exchange Gate 1,POINT (-160095.188 -146568.934),952.0,754.0,7781.0,Monterey Salinas Schedule
925,efb3d4ea58f58d541e2c452e253bec5d,9302,Marina Transit Exchange Gate 2,POINT (-160106.261 -146588.067),896.0,812.0,7279.0,Monterey Salinas Schedule
926,efb3d4ea58f58d541e2c452e253bec5d,9303,Marina Transit Exchange Gate 3,POINT (-160117.309 -146605.866),2184.0,1798.0,7530.0,Monterey Salinas Schedule


In [29]:
# Some stops have no value for a day type (NaN after the reshape/join); count those as 0.
# Only the three ridership columns are filled.
values = dict.fromkeys(["sat_ons", "sun_ons", "weekday_ons"], 0)
yr_mst_geo_all = yr_mst_geo_all.fillna(value=values)

In [30]:
# Sanity check: pull the stops with zero weekday boardings and show them on a map.
mst_0check = yr_mst_geo_all >> filter(_.weekday_ons == 0)
mst_0check.explore("weekday_ons", legend=True)

In [31]:
# Export the combined table. Write yr_mst_geo_all rather than yr_mst_joined: the latter
# holds only the first-pass matches and lacks the stops matched after zero-padding their
# IDs, as well as the NaN -> 0 fill applied to the ridership columns.
GCS_FILE_PATH  = 'gs://calitp-analytics-data/data-analyses/ahsc_grant'
yr_mst_geo_all.to_parquet(f"{GCS_FILE_PATH}/ridership_mst_08_26_2024.parquet")