In [1]:
import os
os.environ["CALITP_BQ_MAX_BYTES"] = str(1_000_000_000_000) ## 1TB?

from calitp.tables import tbl
from calitp import query_sql

import pandas as pd
import geopandas as gpd
from siuba import *

import shared_utils



# Read In / Transform SB MTD Ridership Data

In [2]:
GCS_FILE_PATH = 'gs://calitp-analytics-data/data-analyses/ahsc_grant/'

In [3]:
# reads in multiple sheets as a dictionary
yr_sbmtd_raw = pd.read_excel(f'{GCS_FILE_PATH}SYSTEM WIDE STOP USAGE FY21-22.xlsx', sheet_name=None)

In [4]:
# add sheet names (keys) as column names, then concatenate

for key, value in yr_sbmtd_raw.items():
    value['daytype'] = f'{key}'

    
list(yr_sbmtd_raw.values()) [0]['daytype']

0       WKDY
1       WKDY
2       WKDY
3       WKDY
4       WKDY
        ... 
2210    WKDY
2211    WKDY
2212    WKDY
2213    WKDY
2214    WKDY
Name: daytype, Length: 2215, dtype: object

In [14]:
yr_sbmtd_all = pd.concat(yr_sbmtd_raw, ignore_index=True)

yr_sbmtd_all

Unnamed: 0,ROUTE_NUMBER,ROUTE_NAME,DIRECTION_NAME,ROUTE_DIRECTION,STOP_NAME,STOP_ID,STOP_ID_NEAR,XBOARDINGS,XALIGHTINGS,XWHEELCHAIRS,XBICYCLES,XTRIPS,daytype
0,2,2: Eastside,EAST SB,2E,,99999,4,41,1,-5,0,18,WKDY
1,2,2: Eastside,EAST SB,2E,,99999,7,94,15,-43,0,86,WKDY
2,3,3: Oak Park,DOWNTOWN SB,3D,Treasure/Calle Real,406,406,445,434,-1,0,4349,WKDY
3,5,5: Mesa/La Cumbre,DOWNTOWN SB,5D,Transit Center,4,4,9,8833,-1412,0,2956,WKDY
4,5,5: Mesa/La Cumbre,DOWNTOWN SB,5D,La Cumbre Plaza/Plaza Ave,364,364,1453,64,-14,0,1999,WKDY
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4420,24,24X: UCSB Express,CAMINO REAL MKT,24C,TC,100222,100222,0,0,0,0,900,SUN
4421,25,25: Ellwood,CAMINO REAL MKT,25C,Hollister/Storke,-15,-15,0,497,-13,0,409,SUN
4422,25,25: Ellwood,WINCHESTER CYN,25W,,99999,143,17,19,-3,0,19,SUN
4423,28,28: UCSB Shuttle,UCSB,28U,UCSB North Hall Outbound,42,42,4,2646,-61,0,569,SUN


In [15]:
# standardize stop ids
day_cols = {'WKDY': 'weekday_ons', 'SAT': 'sat_ons', 'SUN': 'sun_ons'}

yr_sbmtd_all = (yr_sbmtd_all
                 >> mutate(STOP_ID_clean = if_else(_.STOP_ID!=99999,_.STOP_ID,_.STOP_ID_NEAR),
                           DAY_TYPE = _.daytype.apply(lambda x: day_cols[x]))
               )


In [50]:
yr_sbmtd_all >> filter(_.STOP_NAME=="State/Highway 154")

Unnamed: 0,ROUTE_NUMBER,ROUTE_NAME,DIRECTION_NAME,ROUTE_DIRECTION,STOP_NAME,STOP_ID,STOP_ID_NEAR,XBOARDINGS,XALIGHTINGS,XWHEELCHAIRS,XBICYCLES,XTRIPS,daytype,STOP_ID_clean,DAY_TYPE
44,2440,2440,HOLLISTER/UNIV,2440H,State/Highway 154,257,257,0,0,0,0,2,WKDY,257,weekday_ons
215,6,6: Goleta,DOWNTOWN SB,6D,State/Highway 154,-33,-33,3354,1542,-65,0,8012,WKDY,-33,weekday_ons
708,2430,2430: La Colina Jr. High,HOLLISTER AVE,2430H,State/Highway 154,257,257,0,0,0,0,72,WKDY,257,weekday_ons
1072,11,11: UCSB,UCSB/STORKE RD,11U,State/Highway 154,257,257,5829,6128,-256,0,7762,WKDY,257,weekday_ons
1132,6,6: Goleta,CAMINO REAL MKT,6C,State/Highway 154,257,257,4793,5509,-263,0,5662,WKDY,257,weekday_ons
1760,11,11: UCSB,DOWNTOWN SB,11D,State/Highway 154,-33,-33,3033,1744,-66,0,7708,WKDY,-33,weekday_ons
2430,6,6: Goleta,DOWNTOWN SB,6D,State/Highway 154,-33,-33,349,138,-9,0,1057,SAT,-33,sat_ons
2687,11,11: UCSB,UCSB/STORKE RD,11U,State/Highway 154,257,257,739,863,-44,0,1384,SAT,257,sat_ons
2824,6,6: Goleta,CAMINO REAL MKT,6C,State/Highway 154,257,257,474,579,-29,0,938,SAT,257,sat_ons
2953,11,11: UCSB,DOWNTOWN SB,11D,State/Highway 154,-33,-33,446,197,-12,0,1143,SAT,-33,sat_ons


In [18]:
# want to keep stop name as recommended by sbmtd staff
name_id_dict = (yr_sbmtd_all 
                >> distinct(_.STOP_ID_clean,_.STOP_NAME)
                >> arrange(_.STOP_ID_clean,_.STOP_NAME)
                >> filter(_.STOP_NAME.notna()) #this expression drops NAs
                >> rename(STOP_NAME_clean = "STOP_NAME")
               )

name_id_dict.STOP_ID_clean.value_counts()

-51        1
 840       1
 816       1
 819       1
 823       1
          ..
 351       1
 353       1
 354       1
 355       1
 100314    1
Name: STOP_ID_clean, Length: 755, dtype: int64

In [35]:
yr_sbmtd_grouped = (yr_sbmtd_all    
                >> left_join(_,name_id_dict)    
                >> group_by(_.STOP_ID_clean,_.STOP_NAME_clean, _.DAY_TYPE)
                >> summarize(stop_total_ons = _.XBOARDINGS.sum())
                )

yr_sbmtd_grouped >> head(5)

Unnamed: 0,STOP_ID_clean,STOP_NAME_clean,DAY_TYPE,stop_total_ons
0,-51,Pacific Oaks/Phelps,sat_ons,0
1,-51,Pacific Oaks/Phelps,sun_ons,0
2,-51,Pacific Oaks/Phelps,weekday_ons,2
3,-49,Pueblo/Castillo Out,sat_ons,116
4,-49,Pueblo/Castillo Out,sun_ons,109


In [36]:
yr_sbmtd_grouped = (yr_sbmtd_grouped
                     >> spread("DAY_TYPE", "stop_total_ons")
                     #>> rename(stop_id = _.STOP_ID)
                     >> mutate(calitp_itp_id = 293,
                              stop_code=_.STOP_ID_clean.apply(str))
                    )

yr_sbmtd_grouped >> head (5)

Unnamed: 0,STOP_ID_clean,STOP_NAME_clean,sat_ons,sun_ons,weekday_ons,calitp_itp_id,stop_code
0,-51,Pacific Oaks/Phelps,0.0,0.0,2.0,293,-51
1,-49,Pueblo/Castillo Out,116.0,109.0,2734.0,293,-49
2,-45,Cathedral Oaks and Brandon,61.0,42.0,411.0,293,-45
3,-39,Hollister/Los Carneros Road,95.0,64.0,879.0,293,-39
4,-38,Hollister/Los Carneros Way,77.0,96.0,680.0,293,-38


In [30]:
yr_sbmtd_grouped >> filter(_.STOP_NAME_clean.str.contains("Via Real"))

Unnamed: 0,STOP_ID_clean,STOP_NAME_clean,sat_ons,sun_ons,weekday_ons,calitp_itp_id
110,98,Via Real/Mark,626.0,411.0,5019.0,293
112,100,Via Real/Santa Ynez,782.0,596.0,6276.0,293
113,102,Via Real/Sandpiper MHP,217.0,109.0,1485.0,293
319,435,Via Real/West Padaro,33.0,5.0,309.0,293
320,436,Via Real/Toro Canyon,31.0,10.0,429.0,293
321,437,Via Real/Sentar,91.0,65.0,898.0,293
322,438,Via Real/Nidever,48.0,47.0,325.0,293
323,439,Via Real/Gallup + Stribling,39.0,24.0,398.0,293
324,440,Via Real/East Padaro,110.0,89.0,644.0,293
325,441,Via Real/Via Real Flowers,20.0,5.0,222.0,293


In [37]:
len(yr_sbmtd_grouped)

755

In [22]:
# use same date as LA Metro for now
sbmtd_stops = shared_utils.gtfs_utils.get_stops('2022-03-15', [293])
len(sbmtd_stops)

607

In [32]:
sbmtd_stops >> filter(_.stop_name.str.contains("Via Real")) >> arrange(_.stop_code)

Unnamed: 0,feed_key,stop_key,date,calitp_itp_id,calitp_hash,zone_id,parent_station,stop_name,calitp_url_number,calitp_deleted_at,...,level_id,stop_url,wheelchair_boarding,stop_code,stop_timezone,calitp_extracted_at,stop_id,stop_desc,tts_stop_name,geometry
235,1180277299546152894,1149916131022778597,2022-03-15,293,fdmzDWezp7mt0q442ZpnDA==,,,Via Real & Santa Ynez,0,2022-04-04,...,,,2,100,,2021-04-15,3,,,POINT (-119.52695 34.40457)
539,1180277299546152894,-2138777912413780574,2022-03-15,293,r+B5rsGq40eyUgiXb9JqfQ==,,,Via Real & Sandpiper MHP,0,2022-04-04,...,,,2,102,,2021-04-15,5,,,POINT (-119.54147 34.40622)
439,1180277299546152894,9084726472797872778,2022-03-15,293,aijyCCXyFuCPrNp0H95qcA==,,,Via Real & Lomita Lane,0,2022-04-04,...,,,2,1089,,2021-04-15,899,,,POINT (-119.48940 34.38601)
213,1180277299546152894,7631661558827589319,2022-03-15,293,1ubg3WtDa3FUH2+kGWKNuQ==,,,Via Real & West Padaro,0,2022-04-04,...,,,2,435,,2021-04-15,299,,,POINT (-119.58276 34.41768)
451,1180277299546152894,-1466857557900525618,2022-03-15,293,seVdg0GyWC/EWdeK61ZPsg==,,,Via Real & Toro Canyon,0,2022-04-04,...,,,2,436,,2021-04-15,300,,,POINT (-119.57533 34.41609)
316,1180277299546152894,-5317311687603958615,2022-03-15,293,FX3M2W0EMySgyC2lUvxkeA==,,,Via Real & Sentar,0,2022-04-04,...,,,2,437,,2021-04-15,301,,,POINT (-119.56916 34.41676)
92,1180277299546152894,-5031227876227791962,2022-03-15,293,52g+6FCBgckaWsZbVzN24Q==,,,Via Real & Nidever,0,2022-04-04,...,,,2,438,,2021-04-15,302,,,POINT (-119.56036 34.41646)
94,1180277299546152894,-7197668384083967006,2022-03-15,293,H8rGG/R4wRfjLwozXXK3hg==,,,Via Real & Gallup & Stribling,0,2022-04-04,...,,,2,439,,2021-04-15,303,,,POINT (-119.55746 34.41472)
346,1180277299546152894,-7295506662825558284,2022-03-15,293,8Y6JWxiMjbMLgKLdj/Mtew==,,,Via Real & East Padaro,0,2022-04-04,...,,,2,440,,2021-04-15,305,,,POINT (-119.55215 34.41136)
548,1180277299546152894,-1697107597783680901,2022-03-15,293,8R5BARBgPcq+vjcBHbvCQw==,,,Via Real & Via Real Flowers,0,2022-04-04,...,,,2,441,,2021-04-15,306,,,POINT (-119.54742 34.40727)


In [39]:
stops_to_join = (sbmtd_stops 
                 >> select(_.calitp_itp_id, _.stop_id, _.stop_code,_.stop_name, _.geometry)
                )


In [46]:
stops_to_join >> filter(_.stop_name=="State & Highway 154")

Unnamed: 0,calitp_itp_id,stop_id,stop_code,stop_name,geometry
12,293,136,257,State & Highway 154,POINT (-119.76050 34.44069)
563,293,811,824,State & Highway 154,POINT (-119.75885 34.44032)


In [49]:
yr_sbmtd_grouped >> filter(_.stop_code.isin(["257","824"]))

Unnamed: 0,STOP_ID_clean,STOP_NAME_clean,sat_ons,sun_ons,weekday_ons,calitp_itp_id,stop_code
197,257,State/Highway 154,1213.0,883.0,10653.0,293,257


In [51]:
yr_sbmtd_grouped_geo = stops_to_join >> inner_join(_, yr_sbmtd_grouped, on = ['calitp_itp_id', 'stop_code'])

In [52]:
yr_sbmtd_grouped_geo.info() 

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 571 entries, 0 to 570
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   calitp_itp_id    571 non-null    int64   
 1   stop_id          571 non-null    object  
 2   stop_code        571 non-null    object  
 3   stop_name        571 non-null    object  
 4   geometry         571 non-null    geometry
 5   STOP_ID_clean    571 non-null    int64   
 6   STOP_NAME_clean  571 non-null    object  
 7   sat_ons          470 non-null    float64 
 8   sun_ons          436 non-null    float64 
 9   weekday_ons      571 non-null    float64 
dtypes: float64(3), geometry(1), int64(2), object(4)
memory usage: 49.1+ KB


Issue: negative stop ids are not matching stop codes in warehouse data

Approach:
- inner join on stop code
- anti-join both datasets to get remainders
- join remainders on name (levenshtein edit distance or just change "/" to " & ")
- concatenate

In [45]:
shared_utils.rt_utils.show_full_df(yr_sbmtd_grouped_geo >> filter(_.geometry.isna()))

Unnamed: 0,calitp_itp_id,stop_id,stop_code,stop_name,geometry,STOP_ID_clean,STOP_NAME_clean,sat_ons,sun_ons,weekday_ons
607,293,,-51,,,-51.0,Pacific Oaks/Phelps,0.0,0.0,2.0
608,293,,-49,,,-49.0,Pueblo/Castillo Out,116.0,109.0,2734.0
609,293,,-45,,,-45.0,Cathedral Oaks and Brandon,61.0,42.0,411.0
610,293,,-39,,,-39.0,Hollister/Los Carneros Road,95.0,64.0,879.0
611,293,,-38,,,-38.0,Hollister/Los Carneros Way,77.0,96.0,680.0
612,293,,-37,,,-37.0,Hollister/Ward,288.0,169.0,2699.0
613,293,,-36,,,-36.0,State/Valerio,323.0,252.0,1798.0
614,293,,-35,,,-35.0,Hollister/Arboleda,645.0,551.0,7015.0
615,293,,-34,,,-34.0,State/Pueblo,295.0,187.0,2139.0
616,293,,-33,,,-33.0,State/Highway 154,795.0,649.0,6398.0


In [None]:
# fill nan with 0
values = {"sat_ons": 0, "sun_ons": 0, "weekday_ons": 0}
mar_metro_joined = mar_metro_joined.fillna(value=values)

In [None]:
# check 0s
mar_metro_0check = (mar_metro_joined >>
                    filter(_.weekday_ons==0)
                   )
mar_metro_0check.explore("weekday_ons", legend=True)

In [None]:
shared_utils.utils.geoparquet_gcs_export?

In [None]:
shared_utils.utils.geoparquet_gcs_export(mar_metro_joined, GCS_FILE_PATH, 'rider_cleaned_182_2022_03.parquet')

## Exploratory Data Analysis

In [None]:
# weekday ons map
mar_metro_joined.explore("weekday_ons", legend=True, tiles="CartoDB positron")

In [None]:
import matplotlib.pyplot as plt

mar_metro_joined.weekday_ons.plot.hist(grid=True, bins=100, rwidth=0.9, log=True,
                                          title='Weekday Total Bus Ridership - March 2022')
plt.xlabel('Riders')
plt.ylabel('Number of Stops')

In [None]:
mar_metro_joined >> filter(_.weekday_ons>30000)