In [1]:
import os
# Sets CALITP_BQ_MAX_BYTES to 10**12, i.e. 1 TB in decimal units.
# NOTE(review): presumably this is the byte cap calitp applies to BigQuery queries and
# must be set before `calitp` is imported below — confirm.
os.environ["CALITP_BQ_MAX_BYTES"] = str(1_000_000_000_000) # 1 TB (decimal)

from calitp.tables import tbl
from calitp import query_sql

import pandas as pd
import geopandas as gpd
from siuba import *

import shared_utils



# Read In / Transform LA Metro Ridership Data

In [2]:
# Root GCS folder for this analysis. File names are appended to it directly
# (no separator is inserted), so the trailing slash is required.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/ahsc_grant/"

In [3]:
# Load Metro's March 2022 ridership export and trim it in the same step.
# The last two rows are leftovers from Metro's query (not actual data).
# Trimming here, instead of re-assigning `mar_metro_raw` from itself in a
# separate cell, keeps the step idempotent: re-running it always starts from
# the file and can never strip additional, real rows.
mar_metro_raw = (
    pd.read_excel(f'{GCS_FILE_PATH}rider_182_2022_03.xlsx')
    .iloc[:-2, :]
)

In [5]:
# Peek at the first three rows of the raw ridership table.
mar_metro_raw.head(3)

Unnamed: 0,YEAR_MONTH,LINE,DIRECTION,DAY_TYPE,STOP_ID,ORDER_NUM,STOP_NAME,STOP_LAT,STOP_LONG,Total_Ons,Total_Offs
0,2022-03,2.0,East,DX,11426.0,1.0,LE CONTE / BROXTON,34.063594,-118.446732,3041.0,130.0
1,2022-03,2.0,East,DX,2939.0,2.0,LE CONTE / WESTWOOD,34.063623,-118.445005,4974.0,160.0
2,2022-03,2.0,East,DX,2938.0,3.0,LE CONTE / TIVERTON,34.063664,-118.442602,1062.0,31.0


In [6]:
# Translate Metro's DAY_TYPE codes into the column names wanted after pivoting.
day_cols = {
    'DX': 'weekday_ons',
    'SA': 'sat_ons',
    'SU': 'sun_ons',
}

# Total boardings per stop and day type. STOP_ID goes float -> int64 -> str so it
# becomes a plain integer string (e.g. '11426' rather than '11426.0').
mar_metro_grouped = (
    mar_metro_raw
    >> mutate(
        STOP_ID = _.STOP_ID.astype('int64').astype(str),
        # Direct dict lookup: an unexpected DAY_TYPE code raises KeyError.
        DAY_TYPE = _.DAY_TYPE.apply(lambda code: day_cols[code]),
    )
    >> group_by(_.STOP_ID, _.DAY_TYPE)
    >> summarize(stop_total_ons = _.Total_Ons.sum())
)

In [7]:
# Peek at the long-format totals (one row per stop and day type).
mar_metro_grouped.head(3)

Unnamed: 0,STOP_ID,DAY_TYPE,stop_total_ons
0,1,sat_ons,16.0
1,1,sun_ons,24.0
2,1,weekday_ons,214.0


In [8]:
# Pivot to one row per stop with one column per day type, then rename/add the
# key columns (stop_id, calitp_itp_id) that the later join uses.
# NOTE: this overwrites `mar_metro_grouped`; the spread consumes DAY_TYPE, so
# re-run the aggregation cell first if this cell needs to be executed again.
mar_metro_grouped = (
    mar_metro_grouped
    >> spread("DAY_TYPE", "stop_total_ons")
    >> rename(stop_id = _.STOP_ID)
    >> mutate(calitp_itp_id = 182)
)

In [9]:
# Peek at the wide-format totals (one row per stop).
mar_metro_grouped.head(3)

Unnamed: 0,stop_id,sat_ons,sun_ons,weekday_ons,calitp_itp_id
0,1,16.0,24.0,214.0,182
1,10000002,20.0,7.0,102.0,182
2,10012,,,14.0,182


In [12]:
# Number of distinct stops with ridership.
mar_metro_grouped.shape[0]

12159

In [10]:
# Fetch stops via get_stops for '2022-03-15' and ITP ID 182 (the same ID the
# ridership table is tagged with), then report how many rows came back.
metro_stops = shared_utils.gtfs_utils.get_stops("2022-03-15", [182])
metro_stops.shape[0]

12546

In [11]:
# Keep only the join keys and the stop geometry; row count should be unchanged.
stops_to_join = (
    metro_stops
    >> select(_.calitp_itp_id, _.stop_id, _.geometry)
)
stops_to_join.shape[0]

12546

In [13]:
# Full join: keeps stops that have no ridership rows as well as ridership rows
# whose stop_id has no match in the stops table.
mar_metro_joined = (
    stops_to_join
    >> full_join(
        _,
        mar_metro_grouped,
        on = ['calitp_itp_id', 'stop_id'],
    )
)

In [14]:
# Sanity-check the join result. In the recorded run there are 12552 rows but only
# 12546 non-null geometries: 6 ridership stop_ids found no match in the stops
# table and therefore carry no geometry into the export.
mar_metro_joined.info() # mostly good; note the rows without geometry

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 12552 entries, 0 to 12551
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   calitp_itp_id  12552 non-null  int64   
 1   stop_id        12552 non-null  object  
 2   geometry       12546 non-null  geometry
 3   sat_ons        11749 non-null  float64 
 4   sun_ons        11749 non-null  float64 
 5   weekday_ons    12157 non-null  float64 
dtypes: float64(3), geometry(1), int64(1), object(1)
memory usage: 686.4+ KB


In [18]:
# About 400 stops in the 3/15 stops data aren't associated with March ridership;
# treat missing boardings as 0.
# DataFrame.fillna returns a new frame rather than modifying in place, so the
# result must be assigned back — otherwise the zeros are only displayed and the
# table exported later still contains the NaNs. Filling is idempotent, so this
# cell is safe to re-run.
values = {"sat_ons": 0, "sun_ons": 0, "weekday_ons": 0}
mar_metro_joined = mar_metro_joined.fillna(value=values)
mar_metro_joined

Unnamed: 0,calitp_itp_id,stop_id,geometry,sat_ons,sun_ons,weekday_ons
0,182,12118,POINT (-118.28437 34.08156),9.0,13.0,51.0
1,182,5636,POINT (-118.29036 33.85010),10.0,10.0,173.0
2,182,12933,POINT (-118.53593 34.22100),129.0,113.0,1875.0
3,182,13458,POINT (-118.28280 33.94752),9.0,7.0,73.0
4,182,2351,POINT (-117.99940 33.88811),11.0,3.0,65.0
...,...,...,...,...,...,...
12547,182,2507,,0.0,1.0,1.0
12548,182,2741,,4.0,3.0,20.0
12549,182,3034,,0.0,0.0,2.0
12550,182,3258,,5.0,0.0,22.0


In [19]:
# Development aid: show the export helper's signature and docstring
# (IPython `?` introspection; safe to drop from the finished notebook).
shared_utils.utils.geoparquet_gcs_export?

[0;31mSignature:[0m
[0mshared_utils[0m[0;34m.[0m[0mutils[0m[0;34m.[0m[0mgeoparquet_gcs_export[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mgdf[0m[0;34m:[0m [0mgeopandas[0m[0;34m.[0m[0mgeodataframe[0m[0;34m.[0m[0mGeoDataFrame[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mGCS_FILE_PATH[0m[0;34m:[0m [0mstr[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mFILE_NAME[0m[0;34m:[0m [0mstr[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Save geodataframe as parquet locally,
then move to GCS bucket and delete local file.

gdf: geopandas.GeoDataFrame
GCS_FILE_PATH: str. Ex: gs://calitp-analytics-data/data-analyses/my-folder/
FILE_NAME: str. Filename.
[0;31mFile:[0m      ~/data-analyses/_shared_utils/shared_utils/utils.py
[0;31mType:[0m      function


In [20]:
# Write the joined stop-level ridership table to the analysis folder on GCS.
# NOTE(review): FILE_NAME already ends in '.parquet' — confirm the helper does
# not append the extension a second time.
shared_utils.utils.geoparquet_gcs_export(
    gdf = mar_metro_joined,
    GCS_FILE_PATH = GCS_FILE_PATH,
    FILE_NAME = 'rider_cleaned_182_2022_03.parquet',
)