In [1]:
pip install shared_utils

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install fuzzywuzzy

Note: you may need to restart the kernel to use updated packages.


In [3]:
from fuzzywuzzy import process, fuzz



In [4]:
import os
os.environ["CALITP_BQ_MAX_BYTES"] = str(800_000_000_000)

import branca
import folium
from shared_utils import gtfs_utils_v2

import pandas as pd
import geopandas as gpd 

import datetime as dt
import time
import re
import google.auth
import os
import gcsfs
credentials, project = google.auth.default()
fs = gcsfs.GCSFileSystem()


In [5]:
import sys
sys.path.append('../rt_segment_speeds/segment_speed_utils')

from helpers import *

In [6]:
#Selecting analysis date and agency
analysis_date = "2022-06-01"
agency_name = "SBMTD"

In [7]:
#Getting GTFS trips data 
get_trips = import_scheduled_trips(analysis_date=analysis_date, columns =[
        "gtfs_dataset_key", "feed_key", "name", "trip_id", 
        "shape_id", "shape_array_key", 
        "route_id", "route_key", "direction_id"
    ], get_pandas=True)

In [8]:
#Function to find feed key for the selected agency 
def compute_feed_key(agency_name):
    filtered = get_trips[get_trips['name'].str.contains(agency_name, na=False)]
    if not filtered.empty:
        return filtered.iloc[0]['feed_key']
    else:
        return None

In [9]:
feed_key = compute_feed_key(agency_name)

In [10]:
#Getting stops data for the selected feed key
stops_data = import_scheduled_stops(analysis_date).drop_duplicates().reset_index(drop=True)
if feed_key is not None:
    stops_data = stops_data[stops_data['feed_key'].isin([feed_key])]

In [11]:
# Ridership information for selected agencies 
GCS_FILE_PATH = 'gs://calitp-analytics-data/data-analyses/ahsc_grant/'
yr_sbmtd_raw = pd.read_excel(f'{GCS_FILE_PATH}SYSTEM WIDE STOP USAGE FY21-22.xlsx', sheet_name=None)

In [12]:
# Assigning the key of each item in yr_sbmtd_raw to the 'daytype' field of its corresponding value
for key, value in yr_sbmtd_raw.items():
    value['daytype'] = f'{key}'

list(yr_sbmtd_raw.values()) [0]['daytype']

0       WKDY
1       WKDY
2       WKDY
3       WKDY
4       WKDY
        ... 
2210    WKDY
2211    WKDY
2212    WKDY
2213    WKDY
2214    WKDY
Name: daytype, Length: 2215, dtype: object

In [13]:
# Concatenating all DataFrames in yr_sbmtd_raw into a single DataFrame, resetting the index.
yr_sbmtd_all = pd.concat(yr_sbmtd_raw, ignore_index=True)

In [14]:
# Standardizing stop ids
day_cols = {'WKDY': 'weekday_ons', 'SAT': 'sat_ons', 'SUN': 'sun_ons'}

yr_sbmtd_all['STOP_ID_clean'] = yr_sbmtd_all.apply(
    lambda row: row['STOP_ID'] if ['STOP_ID'] != 99999 else ['STOP_ID_NEAR'],
    axis=1
)
yr_sbmtd_all['DAY_TYPE'] = yr_sbmtd_all['daytype'].map(day_cols)


In [15]:
yr_sbmtd_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4425 entries, 0 to 4424
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   ROUTE_NUMBER     4425 non-null   int64 
 1   ROUTE_NAME       4425 non-null   object
 2   DIRECTION_NAME   4425 non-null   object
 3   ROUTE_DIRECTION  4425 non-null   object
 4   STOP_NAME        3640 non-null   object
 5   STOP_ID          4425 non-null   int64 
 6   STOP_ID_NEAR     4425 non-null   int64 
 7   XBOARDINGS       4425 non-null   int64 
 8   XALIGHTINGS      4425 non-null   int64 
 9   XWHEELCHAIRS     4425 non-null   int64 
 10  XBICYCLES        4425 non-null   int64 
 11  XTRIPS           4425 non-null   int64 
 12  daytype          4425 non-null   object
 13  STOP_ID_clean    4425 non-null   int64 
 14  DAY_TYPE         4425 non-null   object
dtypes: int64(9), object(6)
memory usage: 518.7+ KB


In [16]:
name_id_dict = yr_sbmtd_all[['STOP_ID_clean', 'STOP_NAME']].drop_duplicates()
name_id_dict = name_id_dict[name_id_dict['STOP_NAME'].notna()]
name_id_dict = name_id_dict.sort_values(by=['STOP_ID_clean', 'STOP_NAME'])
name_id_dict = name_id_dict.rename(columns={'STOP_NAME': 'STOP_NAME_clean'})

In [17]:
yr_sbmtd_grouped = pd.merge(yr_sbmtd_all,
                            name_id_dict,
                            how = 'left',
                            on = 'STOP_ID_clean'
                           )


In [18]:
yr_sbmtd_grouped = yr_sbmtd_grouped.groupby(
    ['STOP_ID_clean', 'STOP_NAME_clean', 'DAY_TYPE'],
    as_index=False
)['XBOARDINGS'].sum()

In [19]:
yr_sbmtd_grouped = yr_sbmtd_grouped.rename(columns={'XBOARDINGS': 'stop_total_ons', 'STOP_ID_clean': 'stop_id', 'STOP_NAME_clean': 'stop_name' })

In [20]:
yr_sbmtd_grouped.head(5)

Unnamed: 0,stop_id,stop_name,DAY_TYPE,stop_total_ons
0,-51,Pacific Oaks/Phelps,sat_ons,0
1,-51,Pacific Oaks/Phelps,sun_ons,0
2,-51,Pacific Oaks/Phelps,weekday_ons,2
3,-49,Pueblo/Castillo Out,sat_ons,116
4,-49,Pueblo/Castillo Out,sun_ons,109


In [21]:
yr_sbmtd_grouped['stop_id'] = yr_sbmtd_grouped['stop_id'].astype(str)

In [22]:
#Pivoting day type values to seperate columns 
yr_sbmtd_grouped= yr_sbmtd_grouped.pivot_table(
    index=['stop_id', 'stop_name'],
    columns='DAY_TYPE',
    values = 'stop_total_ons'
).reset_index()
    

In [23]:
yr_sbmtd_grouped['feed_key'] = feed_key

In [24]:
yr_sbmtd_grouped['name'] = 'SBMTD Schedule'

In [25]:
yr_sbmtd_grouped.head(5)

DAY_TYPE,stop_id,stop_name,sat_ons,sun_ons,weekday_ons,feed_key,name
0,-1,Hollister/Sumida,319.0,381.0,3926.0,3551cafd288e0f647ff54627e26d0479,SBMTD Schedule
1,-10,Hollister/Robin Hill,29.0,13.0,173.0,3551cafd288e0f647ff54627e26d0479,SBMTD Schedule
2,-11,Hollister/Willow Springs,78.0,80.0,658.0,3551cafd288e0f647ff54627e26d0479,SBMTD Schedule
3,-12,Hollister/Los Carneros Way,40.0,22.0,396.0,3551cafd288e0f647ff54627e26d0479,SBMTD Schedule
4,-13,Hollister/Cremona,49.0,41.0,632.0,3551cafd288e0f647ff54627e26d0479,SBMTD Schedule


In [26]:
stops_data.columns

Index(['feed_key', 'service_date', 'feed_timezone',
       'first_stop_arrival_datetime_pacific',
       'last_stop_departure_datetime_pacific', 'stop_id', 'stop_key',
       'stop_name', 'stop_event_count', 'route_type_0', 'route_type_1',
       'route_type_2', 'route_type_3', 'route_type_4', 'route_type_5',
       'route_type_6', 'route_type_7', 'route_type_11', 'route_type_12',
       'missing_route_type', 'geometry'],
      dtype='object')

In [27]:
stops_data = stops_data.rename(columns = {'stop_name': 'STOP_NAME'})

In [28]:
stops_to_join = stops_data[['feed_key', 'stop_id', 'STOP_NAME', 'geometry']]

In [29]:
# Creating a dictionary mapping STOP_NAME to stop_id from the stops_to_join DataFrame.
stop_name_to_id = stops_to_join.set_index('STOP_NAME')['stop_id'].to_dict()

In [30]:
# Function to fuzzy match 
def get_best_match(name, choices, scorer=fuzz.ratio, threshold=90):
    best_match, score = process.extractOne(name, choices, scorer=scorer)
    if score >= threshold:
        return best_match
    else:
        return None

In [31]:
# Matching STOP_NAME in stops_to_join with unique stop names from yr_sbmtd_grouped and mapping the corresponding stop IDs
stops_to_join['matched_stop_name'] = stops_to_join['STOP_NAME'].apply(lambda x: get_best_match(x, yr_sbmtd_grouped['stop_name'].unique()))
stops_to_join['matched_stop_id'] = stops_to_join['matched_stop_name'].map(stop_name_to_id)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [32]:
fuzzy_matches = stops_to_join.dropna(subset=['matched_stop_name'])

In [33]:
# Creating a dictionary mapping matched_stop_name to stop_id from fuzzy matches 
name_to_stop_id_mapping = fuzzy_matches.set_index('matched_stop_name')['stop_id'].to_dict()

In [34]:
#Updating the stop_id in yr_sbmtd_grouped by mapping stop names to IDs and filling missing values with the original stop_id
yr_sbmtd_grouped_updated = yr_sbmtd_grouped.copy()
yr_sbmtd_grouped_updated['stop_id'] = yr_sbmtd_grouped_updated['stop_name'].map(name_to_stop_id_mapping).fillna(yr_sbmtd_grouped_updated['stop_id'])

In [35]:
final_join = pd.merge(
    stops_to_join,
    yr_sbmtd_grouped_updated,
    on=['feed_key', 'stop_id'],
    how='inner'
)

In [36]:
final_join.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 602 entries, 0 to 601
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   feed_key           602 non-null    object  
 1   stop_id            602 non-null    object  
 2   STOP_NAME          602 non-null    object  
 3   geometry           602 non-null    geometry
 4   matched_stop_name  602 non-null    object  
 5   matched_stop_id    25 non-null     object  
 6   stop_name          602 non-null    object  
 7   sat_ons            500 non-null    float64 
 8   sun_ons            466 non-null    float64 
 9   weekday_ons        602 non-null    float64 
 10  name               602 non-null    object  
dtypes: float64(3), geometry(1), object(7)
memory usage: 56.4+ KB


In [37]:
final_join.head(10)

Unnamed: 0,feed_key,stop_id,STOP_NAME,geometry,matched_stop_name,matched_stop_id,stop_name,sat_ons,sun_ons,weekday_ons,name
0,3551cafd288e0f647ff54627e26d0479,46,Cathedral Oaks & Alpha Resource,POINT (20567.039 -396065.461),Cathedral Oaks/Alpha Resource,,Cathedral Oaks/Alpha Resource,,,0.0,SBMTD Schedule
1,3551cafd288e0f647ff54627e26d0479,51,Cota & Olive,POINT (28360.098 -399196.672),Cota/Olive,,Cota/Olive,,,1.0,SBMTD Schedule
2,3551cafd288e0f647ff54627e26d0479,66,Cota & State,POINT (27887.765 -399736.855),Cota/State,,Cota/State,,,4.0,SBMTD Schedule
3,3551cafd288e0f647ff54627e26d0479,242,Cathedral Oaks & Fairview,POINT (15508.924 -396051.856),Cathedral Oaks/Fairview,,Cathedral Oaks/Fairview,,,178.0,SBMTD Schedule
4,3551cafd288e0f647ff54627e26d0479,253,Arrellaga & De La Vina,POINT (26458.625 -398989.032),Arrellaga/De La Vina,,Arrellaga/De La Vina,,,0.0,SBMTD Schedule
5,3551cafd288e0f647ff54627e26d0479,344,Evergreen & Redwood,POINT (9729.281 -397332.330),Evergreen/Redwood,,Evergreen/Redwood,,,2.0,SBMTD Schedule
6,3551cafd288e0f647ff54627e26d0479,345,Brandon & Durham,POINT (9939.578 -397982.951),Brandon/Durham,,Brandon/Durham,,,603.0,SBMTD Schedule
7,3551cafd288e0f647ff54627e26d0479,346,Brandon & Calle Real,POINT (9996.620 -398131.590),Brandon/Calle Real,,Brandon/Calle Real,,,0.0,SBMTD Schedule
8,3551cafd288e0f647ff54627e26d0479,347,Placer & Del Norte,POINT (10725.350 -397719.124),Placer/Del Norte,,Placer/Del Norte,,,739.0,SBMTD Schedule
9,3551cafd288e0f647ff54627e26d0479,393,Cathedral Oaks & The Country Club,POINT (16063.041 -396072.603),Cathedral Oaks/The Country Club,,Cathedral Oaks/The Country Club,,,148.0,SBMTD Schedule


In [38]:
strings_to_drop = ['Fairview Ave/Encina Road', 'Encina/Fairview 164']

In [39]:
final_join = final_join[~final_join['stop_name'].str.contains('|'.join(strings_to_drop))]

In [40]:
final_join.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 600 entries, 0 to 601
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   feed_key           600 non-null    object  
 1   stop_id            600 non-null    object  
 2   STOP_NAME          600 non-null    object  
 3   geometry           600 non-null    geometry
 4   matched_stop_name  600 non-null    object  
 5   matched_stop_id    25 non-null     object  
 6   stop_name          600 non-null    object  
 7   sat_ons            498 non-null    float64 
 8   sun_ons            464 non-null    float64 
 9   weekday_ons        600 non-null    float64 
 10  name               600 non-null    object  
dtypes: float64(3), geometry(1), object(7)
memory usage: 56.2+ KB


In [41]:
stops_merged = pd.merge(stops_to_join,
                        yr_sbmtd_grouped_updated,
                        on = ['feed_key', 'stop_id'],
                        how='left',
                        indicator=True
                       )

In [42]:
stops_merged.columns

Index(['feed_key', 'stop_id', 'STOP_NAME', 'geometry', 'matched_stop_name',
       'matched_stop_id', 'stop_name', 'sat_ons', 'sun_ons', 'weekday_ons',
       'name', '_merge'],
      dtype='object')

In [43]:
stops_remainder = stops_merged[stops_merged['_merge'] == 'left_only'].drop(columns=['_merge'])

In [44]:
len(stops_remainder)

149

In [45]:
yr_sbmtd_grouped_merged = pd.merge(stops_to_join,
                        yr_sbmtd_grouped_updated,
                        on = ['feed_key', 'stop_id'],
                        how='right',
                        indicator=True
                       )

In [46]:
yr_sbmtd_remainder = yr_sbmtd_grouped_merged[yr_sbmtd_grouped_merged['_merge'] == 'right_only'].drop(columns=['_merge'])

In [47]:
len(yr_sbmtd_remainder)

153

In [48]:
yr_sbmtd_remainder.head(10)

Unnamed: 0,feed_key,stop_id,STOP_NAME,geometry,matched_stop_name,matched_stop_id,stop_name,sat_ons,sun_ons,weekday_ons,name
35,3551cafd288e0f647ff54627e26d0479,-49,,,,,Pueblo/Castillo Out,116.0,109.0,2734.0,SBMTD Schedule
45,3551cafd288e0f647ff54627e26d0479,100003,,,,,AbreCDSO,,,0.0,SBMTD Schedule
46,3551cafd288e0f647ff54627e26d0479,100004,,,,,AlamBasO,,,0.0,SBMTD Schedule
47,3551cafd288e0f647ff54627e26d0479,100005,,,,,AlamPadN,0.0,0.0,0.0,SBMTD Schedule
48,3551cafd288e0f647ff54627e26d0479,100006,,,,,AlamPadO,0.0,0.0,0.0,SBMTD Schedule
49,3551cafd288e0f647ff54627e26d0479,100008,,,,,ArBuBePa,0.0,0.0,0.0,SBMTD Schedule
50,3551cafd288e0f647ff54627e26d0479,100011,,,,,BranEver,,,0.0,SBMTD Schedule
51,3551cafd288e0f647ff54627e26d0479,100096,,,,,CaOaAlph,,,0.0,SBMTD Schedule
52,3551cafd288e0f647ff54627e26d0479,100098,,,,,CaOaFaiO,,,0.0,SBMTD Schedule
54,3551cafd288e0f647ff54627e26d0479,100100,,,,,CaOaTurn,,,0.0,SBMTD Schedule


In [49]:
columns_to_keep = ['feed_key', 'stop_id', 'STOP_NAME', 'geometry', 'sat_ons', 'sun_ons', 'weekday_ons', 'name']
final_join = final_join[columns_to_keep]

In [50]:
final_join.head(4)

Unnamed: 0,feed_key,stop_id,STOP_NAME,geometry,sat_ons,sun_ons,weekday_ons,name
0,3551cafd288e0f647ff54627e26d0479,46,Cathedral Oaks & Alpha Resource,POINT (20567.039 -396065.461),,,0.0,SBMTD Schedule
1,3551cafd288e0f647ff54627e26d0479,51,Cota & Olive,POINT (28360.098 -399196.672),,,1.0,SBMTD Schedule
2,3551cafd288e0f647ff54627e26d0479,66,Cota & State,POINT (27887.765 -399736.855),,,4.0,SBMTD Schedule
3,3551cafd288e0f647ff54627e26d0479,242,Cathedral Oaks & Fairview,POINT (15508.924 -396051.856),,,178.0,SBMTD Schedule


In [51]:
final_join = final_join.rename(columns={'STOP_NAME': 'stop_name'})

In [52]:
def export_gdf(gdf, filename: str):
    
    gdf.to_parquet(f"{filename}.parquet")
    
    fs.put(
        f"{filename}.parquet",
        f"{GCS_FILE_PATH}/{filename}.parquet",
        token = credentials.token
    )
    
    os.remove(f"{filename}.parquet")
    print(f"saved {GCS_FILE_PATH}/{filename}.parquet")
    
    return

In [53]:
GCS_FILE_PATH  = 'gs://calitp-analytics-data/data-analyses/ahsc_grant'
export_gdf(final_join, "ridership_sbmtd_08_26_2024.parquet")

saved gs://calitp-analytics-data/data-analyses/ahsc_grant/ridership_sbmtd_08_26_2024.parquet.parquet
