In [1]:
pip install shared_utils

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install fuzzywuzzy

Note: you may need to restart the kernel to use updated packages.


In [3]:
from fuzzywuzzy import process, fuzz



In [4]:
import os
os.environ["CALITP_BQ_MAX_BYTES"] = str(800_000_000_000)

import branca
import folium
from shared_utils import gtfs_utils_v2

from siuba import *
import pandas as pd
import geopandas as gpd 

import datetime as dt
import time

import seaborn as sns
import matplotlib.pyplot as plt

In [5]:
import sys
sys.path.append('../rt_segment_speeds/segment_speed_utils')

from helpers import *

In [6]:
# Service date passed to the scheduled trips/stops loaders below.
analysis_date = "2022-03-15"
# Substring searched for in the trips table's `name` column to locate the agency's feed.
agency_name = "SBMTD"

In [7]:
# Columns requested from the scheduled-trips table.
trip_columns = [
    "gtfs_dataset_key", "feed_key", "name", "trip_id",
    "shape_id", "shape_array_key",
    "route_id", "route_key", "direction_id",
]

# Scheduled trips for the analysis date.
# NOTE(review): get_pandas=True presumably returns an in-memory pandas frame -- confirm in helpers.
get_trips = import_scheduled_trips(
    analysis_date=analysis_date,
    columns=trip_columns,
    get_pandas=True,
)

In [8]:
def compute_feed_key(agency_name, trips=None):
    """Return the feed_key of the first trip whose `name` contains `agency_name`.

    Parameters
    ----------
    agency_name : str
        Literal substring to look for in the `name` column. It is matched
        as plain text, not as a regular expression, so names containing
        characters such as "(" or "+" are handled safely.
    trips : pandas.DataFrame, optional
        Table with `name` and `feed_key` columns. Defaults to the
        notebook-level `get_trips` frame when omitted.

    Returns
    -------
    The `feed_key` value of the first matching row, or None when no row
    matches (rows with a missing `name` never match).
    """
    if trips is None:
        # Fall back to the notebook-level trips table loaded earlier.
        trips = get_trips

    name_matches = trips['name'].str.contains(agency_name, na=False, regex=False)
    matching_trips = trips[name_matches]
    if matching_trips.empty:
        return None
    return matching_trips.iloc[0]['feed_key']

In [9]:
feed_key = compute_feed_key(agency_name)
# Fail fast: a missing feed_key would otherwise leave the stops unfiltered and
# be stamped onto every ridership row, so the later join on feed_key could
# not match anything.
if feed_key is None:
    raise ValueError(
        f"No scheduled trips found for agency {agency_name!r} on {analysis_date}"
    )

In [10]:
# Scheduled stops for the analysis date, de-duplicated with a fresh index.
stops_data = (
    import_scheduled_stops(analysis_date)
    .drop_duplicates()
    .reset_index(drop=True)
)

# Keep only the selected agency's stops when its feed was found.
if feed_key is not None:
    agency_mask = stops_data['feed_key'].isin([feed_key])
    stops_data = stops_data[agency_mask]

In [11]:
GCS_FILE_PATH = 'gs://calitp-analytics-data/data-analyses/ahsc_grant/'

# sheet_name=None loads every worksheet: the result is a dict of
# DataFrames keyed by sheet name.
ridership_workbook = f'{GCS_FILE_PATH}SYSTEM WIDE STOP USAGE FY21-22.xlsx'
yr_sbmtd_raw = pd.read_excel(ridership_workbook, sheet_name=None)

In [12]:
# Tag every sheet's rows with the name of the sheet they came from.
for sheet_name, sheet_df in yr_sbmtd_raw.items():
    sheet_df['daytype'] = str(sheet_name)

# Spot-check the tag on the first sheet.
next(iter(yr_sbmtd_raw.values()))['daytype']

0       WKDY
1       WKDY
2       WKDY
3       WKDY
4       WKDY
        ... 
2210    WKDY
2211    WKDY
2212    WKDY
2213    WKDY
2214    WKDY
Name: daytype, Length: 2215, dtype: object

In [13]:
# Stack all sheets into one long table with a fresh 0..n-1 index.
yr_sbmtd_all = pd.concat(
    yr_sbmtd_raw,
    ignore_index=True,
)

In [14]:
# standardize stop ids
# Sheet name -> name of the ridership column it becomes after reshaping.
day_cols = {'WKDY': 'weekday_ons', 'SAT': 'sat_ons', 'SUN': 'sun_ons'}

# STOP_ID 99999 is replaced by STOP_ID_NEAR.
# NOTE(review): 99999 looks like an "unknown stop" placeholder -- confirm with the data owner.
yr_sbmtd_all = yr_sbmtd_all >> mutate(
    STOP_ID_clean=if_else(_.STOP_ID != 99999, _.STOP_ID, _.STOP_ID_NEAR),
    DAY_TYPE=_.daytype.apply(lambda sheet: day_cols[sheet]),
)


In [15]:
# Lookup table (a DataFrame, despite the name) of distinct
# (STOP_ID_clean, STOP_NAME) pairs, sorted, with missing names removed.
name_id_dict = (
    yr_sbmtd_all
    >> distinct(_.STOP_ID_clean, _.STOP_NAME)
    >> arrange(_.STOP_ID_clean, _.STOP_NAME)
    >> filter(_.STOP_NAME.notna())  # drop rows without a stop name
    >> rename(STOP_NAME_clean=_.STOP_NAME)
)

In [16]:
# Attach the cleaned stop name to every boarding record, then total
# boardings per stop and day type.
# The join key is spelled out so it cannot silently change if the two
# tables ever come to share another column name.
# NOTE(review): if one STOP_ID_clean carries several names in name_id_dict,
# this join repeats its boarding rows once per name -- verify uniqueness.
yr_sbmtd_grouped = (
    yr_sbmtd_all
    >> left_join(_, name_id_dict, on=["STOP_ID_clean"])
    >> group_by(_.STOP_ID_clean, _.STOP_NAME_clean, _.DAY_TYPE)
    >> summarize(stop_total_ons=_.XBOARDINGS.sum())
)

yr_sbmtd_grouped >> head(5)

Unnamed: 0,STOP_ID_clean,STOP_NAME_clean,DAY_TYPE,stop_total_ons
0,-51,Pacific Oaks/Phelps,sat_ons,0
1,-51,Pacific Oaks/Phelps,sun_ons,0
2,-51,Pacific Oaks/Phelps,weekday_ons,2
3,-49,Pueblo/Castillo Out,sat_ons,116
4,-49,Pueblo/Castillo Out,sun_ons,109


In [17]:
# Reshape to one row per stop with one ridership column per day type,
# using string stop ids, then tag rows with the feed key and feed name.
yr_sbmtd_grouped = (
    yr_sbmtd_grouped
    >> mutate(STOP_ID_clean=_.STOP_ID_clean.astype(str))
    >> rename(stop_id=_.STOP_ID_clean, stop_name=_.STOP_NAME_clean)
    >> spread("DAY_TYPE", "stop_total_ons")
    >> mutate(feed_key=feed_key, name='SBMTD Schedule')
)

yr_sbmtd_grouped >> head(5)

Unnamed: 0,stop_id,stop_name,sat_ons,sun_ons,weekday_ons,feed_key,name
0,-1,Hollister/Sumida,319.0,381.0,3926.0,52201caab047b98ae19b7547c0d7c2ad,SBMTD Schedule
1,-10,Hollister/Robin Hill,29.0,13.0,173.0,52201caab047b98ae19b7547c0d7c2ad,SBMTD Schedule
2,-11,Hollister/Willow Springs,78.0,80.0,658.0,52201caab047b98ae19b7547c0d7c2ad,SBMTD Schedule
3,-12,Hollister/Los Carneros Way,40.0,22.0,396.0,52201caab047b98ae19b7547c0d7c2ad,SBMTD Schedule
4,-13,Hollister/Cremona,49.0,43.0,659.0,52201caab047b98ae19b7547c0d7c2ad,SBMTD Schedule


In [18]:
# Upper-case the scheduled stop-name column so it stays distinct from the
# ridership table's `stop_name` after the two tables are merged.
stops_data = stops_data >> rename(STOP_NAME=_.stop_name)


In [19]:
# Keep only the stop columns needed for matching and output.
stops_to_join = stops_data >> select(
    _.feed_key, _.stop_id, _.STOP_NAME, _.geometry
)

In [20]:
# Scheduled stop name -> stop_id; when a name repeats, the last row wins.
stop_name_to_id = dict(zip(stops_to_join['STOP_NAME'], stops_to_join['stop_id']))

In [21]:
def get_best_match(name, choices, scorer=fuzz.ratio, threshold=90):
    """Return the entry of `choices` most similar to `name`, if similar enough.

    Parameters
    ----------
    name : str
        Query string. Non-string values (e.g. None or NaN from a missing
        stop name) yield None instead of raising.
    choices : iterable or mapping of str
        Candidate strings handed to `process.extractOne`.
    scorer : callable, optional
        Similarity function passed through to `process.extractOne`.
    threshold : int or float, optional
        Minimum score (inclusive) required to accept the best candidate.

    Returns
    -------
    The best-scoring candidate, or None when there is no candidate or its
    score is below `threshold`.
    """
    if not isinstance(name, str):
        return None

    result = process.extractOne(name, choices, scorer=scorer)
    if result is None:
        # extractOne yields nothing when there are no candidates to score.
        return None

    # Index rather than unpack: mapping choices produce (match, score, key).
    best_match, score = result[0], result[1]
    return best_match if score >= threshold else None

In [22]:
# The candidate ridership names do not change between rows, so compute them
# once instead of calling .unique() inside the per-row lambda.
ridership_stop_names = yr_sbmtd_grouped['stop_name'].unique()

# Best fuzzy match (or None) among the ridership names for each scheduled stop name.
stops_to_join['matched_stop_name'] = stops_to_join['STOP_NAME'].apply(
    lambda gtfs_name: get_best_match(gtfs_name, ridership_stop_names)
)

# NOTE(review): stop_name_to_id is keyed by the *scheduled* STOP_NAME, so this
# lookup only resolves when the matched ridership name is spelled identically
# to a scheduled name; other rows get NaN. Confirm this column is intended.
stops_to_join['matched_stop_id'] = stops_to_join['matched_stop_name'].map(stop_name_to_id)

In [23]:
# Scheduled stops for which a ridership name cleared the match threshold.
has_match = stops_to_join['matched_stop_name'].notna()
fuzzy_matches = stops_to_join[has_match]

In [24]:
# Matched ridership stop name -> scheduled stop_id; when several scheduled
# stops matched the same ridership name, the last one wins.
name_to_stop_id_mapping = dict(
    zip(fuzzy_matches['matched_stop_name'], fuzzy_matches['stop_id'])
)

In [25]:
# Swap in the scheduled stop_id wherever the ridership stop name was
# fuzzy-matched; unmatched rows keep their original stop_id.
remapped_stop_ids = yr_sbmtd_grouped['stop_name'].map(name_to_stop_id_mapping)

yr_sbmtd_grouped_updated = yr_sbmtd_grouped.copy()
yr_sbmtd_grouped_updated['stop_id'] = remapped_stop_ids.fillna(
    yr_sbmtd_grouped['stop_id']
)

In [26]:
# Attach ridership to the scheduled stops that share a feed_key and stop_id.
# NOTE(review): several ridership rows can end up with the same stop_id after
# the name-based remap, so a stop may appear more than once here with
# different counts -- decide whether those rows should be aggregated.
join_keys = ['feed_key', 'stop_id']
final_join = pd.merge(
    left=stops_to_join,
    right=yr_sbmtd_grouped_updated,
    on=join_keys,
    how='inner',
)

In [27]:
# Check row count, dtypes and non-null counts of the joined table.
final_join.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 602 entries, 0 to 601
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   feed_key           602 non-null    object  
 1   stop_id            602 non-null    object  
 2   STOP_NAME          602 non-null    object  
 3   geometry           602 non-null    geometry
 4   matched_stop_name  602 non-null    object  
 5   matched_stop_id    26 non-null     object  
 6   stop_name          602 non-null    object  
 7   sat_ons            500 non-null    float64 
 8   sun_ons            466 non-null    float64 
 9   weekday_ons        602 non-null    float64 
 10  name               602 non-null    object  
dtypes: float64(3), geometry(1), object(7)
memory usage: 56.4+ KB


In [28]:
# Preview the first joined rows to eyeball the name matches.
final_join.head(10)

Unnamed: 0,feed_key,stop_id,STOP_NAME,geometry,matched_stop_name,matched_stop_id,stop_name,sat_ons,sun_ons,weekday_ons,name
0,52201caab047b98ae19b7547c0d7c2ad,1,Modoc & Portesuello,POINT (25170.737 -398993.625),Modoc/Portesuello,,Modoc/Portesuello,1573.0,1287.0,13964.0,SBMTD Schedule
1,52201caab047b98ae19b7547c0d7c2ad,2,Milpas & Montecito,POINT (29196.552 -399052.308),Milpas/Montecito,,Milpas/Montecito,3901.0,3139.0,30225.0,SBMTD Schedule
2,52201caab047b98ae19b7547c0d7c2ad,4,Cathedral Oaks & Camino Del Rio,POINT (20247.563 -395908.292),Cathedral Oaks/Camino Del Rio,,Cathedral Oaks/Camino Del Rio,,,1.0,SBMTD Schedule
3,52201caab047b98ae19b7547c0d7c2ad,5,Via Real & Sandpiper MHP,POINT (42143.060 -400995.911),Via Real/Sandpiper MHP,,Via Real/Sandpiper MHP,217.0,109.0,1485.0,SBMTD Schedule
4,52201caab047b98ae19b7547c0d7c2ad,6,UCSB Elings Hall Outbound,POINT (14738.802 -400125.683),UCSB Elings Hall Outbound,6.0,UCSB Elings Hall Outbound,1374.0,1199.0,9777.0,SBMTD Schedule
5,52201caab047b98ae19b7547c0d7c2ad,10,Anapamu & Santa Barbara,POINT (27354.741 -398937.880),Anapamu/Santa Barbara,,Anapamu/Santa Barbara,238.0,164.0,1705.0,SBMTD Schedule
6,52201caab047b98ae19b7547c0d7c2ad,16,Seville & Embarcadero Del Mar,POINT (13110.869 -400474.402),Seville/Embarcadero Del Mar,,Seville/Embarcadero Del Mar,197.0,242.0,1706.0,SBMTD Schedule
7,52201caab047b98ae19b7547c0d7c2ad,16,Seville & Embarcadero Del Mar,POINT (13110.869 -400474.402),Seville/Embarcadero Del Mar,,Seville/Embarcadero Del Mar,53.0,73.0,1191.0,SBMTD Schedule
8,52201caab047b98ae19b7547c0d7c2ad,18,Embarcadero & Sabado Tarde,POINT (13206.817 -400596.547),Embarcadero/Sabado Tarde,,Embarcadero/Sabado Tarde,254.0,326.0,2763.0,SBMTD Schedule
9,52201caab047b98ae19b7547c0d7c2ad,18,Embarcadero & Sabado Tarde,POINT (13206.817 -400596.547),Embarcadero/Sabado Tarde,,Embarcadero/Sabado Tarde,99.0,99.0,1279.0,SBMTD Schedule


In [29]:
# Ridership stop names to exclude from the joined output.
strings_to_drop = [
    'Fairview Ave/Encina Road',
    'Encina/Fairview 164',
]

In [30]:
# Drop rows whose ridership stop name contains any of the excluded strings.
# The strings are joined with "|" and evaluated as one regex alternation.
drop_pattern = '|'.join(strings_to_drop)
is_excluded = final_join['stop_name'].str.contains(drop_pattern)
final_join = final_join[~is_excluded]

In [31]:
# Re-check row and non-null counts after removing the excluded stop names.
final_join.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 600 entries, 0 to 601
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   feed_key           600 non-null    object  
 1   stop_id            600 non-null    object  
 2   STOP_NAME          600 non-null    object  
 3   geometry           600 non-null    geometry
 4   matched_stop_name  600 non-null    object  
 5   matched_stop_id    26 non-null     object  
 6   stop_name          600 non-null    object  
 7   sat_ons            498 non-null    float64 
 8   sun_ons            464 non-null    float64 
 9   weekday_ons        600 non-null    float64 
 10  name               600 non-null    object  
dtypes: float64(3), geometry(1), object(7)
memory usage: 56.2+ KB


In [32]:
# Scheduled stops with no ridership row sharing their feed_key and stop_id.
stops_remainder = stops_to_join >> anti_join(
    _, yr_sbmtd_grouped_updated, on=['feed_key', 'stop_id']
)

len(stops_remainder)

149

In [33]:
# Ridership rows with no scheduled stop sharing their feed_key and stop_id.
yr_sbmtd_remainders = yr_sbmtd_grouped_updated >> anti_join(
    _, stops_to_join, on=['feed_key', 'stop_id']
)

len(yr_sbmtd_remainders)

153

In [34]:
# Output columns: the matching helper columns and the ridership-side
# stop name are left out.
columns_to_keep = [
    'feed_key',
    'stop_id',
    'STOP_NAME',
    'geometry',
    'sat_ons',
    'sun_ons',
    'weekday_ons',
    'name',
]
final_join = final_join[columns_to_keep]

In [35]:
# Preview the trimmed table.
final_join.head(4)

Unnamed: 0,feed_key,stop_id,STOP_NAME,geometry,sat_ons,sun_ons,weekday_ons,name
0,52201caab047b98ae19b7547c0d7c2ad,1,Modoc & Portesuello,POINT (25170.737 -398993.625),1573.0,1287.0,13964.0,SBMTD Schedule
1,52201caab047b98ae19b7547c0d7c2ad,2,Milpas & Montecito,POINT (29196.552 -399052.308),3901.0,3139.0,30225.0,SBMTD Schedule
2,52201caab047b98ae19b7547c0d7c2ad,4,Cathedral Oaks & Camino Del Rio,POINT (20247.563 -395908.292),,,1.0,SBMTD Schedule
3,52201caab047b98ae19b7547c0d7c2ad,5,Via Real & Sandpiper MHP,POINT (42143.060 -400995.911),217.0,109.0,1485.0,SBMTD Schedule


In [36]:
# Restore the lower-case stop-name column for the exported table.
output_column_names = {'STOP_NAME': 'stop_name'}
final_join = final_join.rename(columns=output_column_names)

In [37]:
# Note: this re-assignment has no trailing slash, unlike the earlier
# definition used to read the workbook.
GCS_FILE_PATH = 'gs://calitp-analytics-data/data-analyses/ahsc_grant'

# Write the joined stop-level ridership table to GCS as parquet.
output_path = f"{GCS_FILE_PATH}/ridership_sbmtd_08_26_2024.parquet"
final_join.to_parquet(output_path)