In [1]:
import os
os.environ["CALITP_BQ_MAX_BYTES"] = str(800_000_000_000) ## 800GB?

import shared_utils

from siuba import *
import pandas as pd
import geopandas as gpd

pd.set_option('display.max_columns', None) 

from calitp import get_engine

engine = get_engine()
connection = engine.connect()



# Join Trips per Stop, Ridership, and ACS data

In [2]:
GCS_FILE_PATH = 'gs://calitp-analytics-data/data-analyses/ahsc_grant/'

In [3]:
# start with trips per stop and ridership
stoptrips = gpd.read_parquet(f"{GCS_FILE_PATH}tbl1_trips_perstop.parquet")
stoptrips >> head (5)

Unnamed: 0,stop_desc,location_type,route_type,stop_name,stop_id,stop_code,calitp_itp_id,geometry,n_trips_weekday,n_trips_sat,n_trips_sun,n_routes_weekday,n_routes_sat,n_routes_sun
0,,,3,State & Alamar,423,584,293,POINT (-119.72521 34.43897),60.0,56.0,55.0,3.0,2.0,2.0
1,,,3,Encina & Fairview,55,164,293,POINT (-119.82988 34.44293),25.0,11.0,11.0,1.0,1.0,1.0
2,,,3,Veronica Springs & Torino,239,367,293,POINT (-119.74372 34.42502),12.0,12.0,8.0,1.0,1.0,1.0
3,,,3,Haley & Garden,63,182,293,POINT (-119.69240 34.41963),25.0,21.0,14.0,1.0,1.0,1.0
4,,,3,East Valley & San Ysidro,509,716,293,POINT (-119.63232 34.43696),12.0,10.0,,1.0,1.0,


In [4]:
ridership_metro = gpd.read_parquet(f"{GCS_FILE_PATH}rider_cleaned_182_2022_FYEstimate.parquet")
ridership_metro >> head (5)

Unnamed: 0,calitp_itp_id,stop_id,stop_name,geometry,STOP_NAME,sat_ons,sun_ons,weekday_ons
0,182,9552,Del Mar / Madre,POINT (-118.08128 34.14212),DEL MAR / MADRE,95.297717,35.736644,726.645092
1,182,3465,Fair Oaks / Holly,POINT (-118.15038 34.14794),FAIR OAKS / HOLLY,869.591668,964.889385,4907.832428
2,182,6091,Saticoy / Laurel Canyon,POINT (-118.39667 34.20838),SATICOY / LAUREL CANYON,119.122146,95.297717,1977.427629
3,182,20148,Magnolia / Clybourn,POINT (-118.35489 34.16509),MAGNOLIA / CLYBOURN,131.034361,71.473288,1179.309248
4,182,15285,Venice / Maplewood,POINT (-118.44436 33.99850),VENICE / MAPLEWOOD,35.736644,23.824429,3561.752174


In [5]:
ridership_sbmtd = gpd.read_parquet(f"{GCS_FILE_PATH}rider_cleaned_293_2021_07_to_2022_06.parquet")
ridership_sbmtd >> head (5) 

Unnamed: 0,calitp_itp_id,stop_id,stop_code,stop_name,geometry,STOP_ID_clean,STOP_NAME_clean,sat_ons,sun_ons,weekday_ons
0,293,799,289,Hollister & Pine,POINT (-119.82623 34.43571),-21,Hollister/Pine,4187.0,3546.0,30178.0
1,293,802,294,Hollister & El Mercado,POINT (-119.76226 34.44044),-24,Hollister/El Mercado,354.0,240.0,3689.0
2,293,815,930,Hollister & Ward,POINT (-119.81637 34.43514),-37,Hollister/Ward,288.0,169.0,2699.0
3,293,811,824,State & Highway 154,POINT (-119.75885 34.44032),-33,State/Highway 154,795.0,649.0,6398.0
4,293,780,126,Anacapa & Anapamu,POINT (-119.70295 34.42408),-2,Anacapa/Anapamu,788.0,503.0,5576.0


In [6]:
ridership_mst = gpd.read_parquet(f"{GCS_FILE_PATH}rider_cleaned_208_2021_09_to_2022_08.parquet")
ridership_mst >> head (5) 

Unnamed: 0,calitp_itp_id,stop_id,stop_name,geometry,sat_ons,sun_ons,weekday_ons
0,208,633,Hilby / Mescal,POINT (-121.82737 36.60219),0.0,0.0,251.0
1,208,446,Carmel Valley Rd / Boronda,POINT (-121.74623 36.49383),56.0,0.0,251.0
2,208,231,Ocean View / Moss,POINT (-121.92068 36.62763),0.0,0.0,251.0
3,208,4,Del Monte Center / Gate 3,POINT (-121.89893 36.58445),56.0,58.0,502.0
4,208,465,Carmel Valley Rd / Loma Del Rey,POINT (-121.81701 36.52727),0.0,0.0,0.0


In [7]:
# contatenate gdfs, keeping common columns
ridership_all = pd.concat([ridership_metro,ridership_sbmtd,ridership_mst], join='inner', ignore_index="True")
ridership_all.sample(5)

Unnamed: 0,calitp_itp_id,stop_id,stop_name,geometry,sat_ons,sun_ons,weekday_ons
11569,182,2759,Martin Luther King Jr / Crenshaw,POINT (-118.33630 34.01167),3692.786535,3359.244526,32353.574935
13223,208,2845,Salinas / Fruitland,POINT (-121.75223 36.87802),0.0,0.0,0.0
6974,182,17364,Rosecrans / Dwight,POINT (-118.24274 33.90343),119.122146,273.980936,1763.007765
4933,182,6155,Sepulveda / Rayen,POINT (-118.46745 34.23219),1357.992468,1107.835961,15521.615663
8730,182,16326,Sunset / Doheny Drive,POINT (-118.38958 34.09075),202.507649,47.648859,1894.042126


In [8]:
# join together, keep buses, create total trips per weekday
trips_ridership_joined = (stoptrips
                          >> full_join(_,ridership_all)
                          )

trips_ridership_joined >> head (5)

Unnamed: 0,stop_desc,location_type,route_type,stop_name,stop_id,stop_code,calitp_itp_id,geometry,n_trips_weekday,n_trips_sat,n_trips_sun,n_routes_weekday,n_routes_sat,n_routes_sun,sat_ons,sun_ons,weekday_ons
0,,,3,State & Alamar,423,584,293,POINT (-119.72521 34.43897),60.0,56.0,55.0,3.0,2.0,2.0,567.0,566.0,4556.0
1,,,3,Encina & Fairview,55,164,293,POINT (-119.82988 34.44293),25.0,11.0,11.0,1.0,1.0,1.0,840.0,806.0,10303.0
2,,,3,Veronica Springs & Torino,239,367,293,POINT (-119.74372 34.42502),12.0,12.0,8.0,1.0,1.0,1.0,28.0,10.0,175.0
3,,,3,Haley & Garden,63,182,293,POINT (-119.69240 34.41963),25.0,21.0,14.0,1.0,1.0,1.0,148.0,85.0,978.0
4,,,3,East Valley & San Ysidro,509,716,293,POINT (-119.63232 34.43696),12.0,10.0,,1.0,1.0,,27.0,0.0,176.0


In [9]:
len(trips_ridership_joined)

13746

In [10]:
# project
trips_ridership_joined = trips_ridership_joined.to_crs(shared_utils.geography_utils.CA_NAD83Albers)

# keep point geometry in intermediate table
stop_geom = (trips_ridership_joined
             >> select(_.calitp_itp_id,_.stop_id,_.geometry)
            )

stop_geom.head(5)

Unnamed: 0,calitp_itp_id,stop_id,geometry
0,293,423,POINT (25245.404 -397427.355)
1,293,55,POINT (15628.312 -397010.130)
2,293,239,POINT (23548.788 -398980.627)
3,293,63,POINT (28266.459 -399563.850)
4,293,509,POINT (33779.852 -397621.940)


In [11]:
len(stop_geom)

13746

In [12]:
# add .25mi (10min walk) buffers to stops
# this replaces our point geometry with polygons
trips_ridership_joined.geometry = trips_ridership_joined.buffer(402.336)

In [13]:
# ACS data
acs_ca = gpd.read_parquet(f"{GCS_FILE_PATH}acs_tbl_ca.parquet")
acs_ca >> head (5)

Unnamed: 0,ALAND,geometry,geo_id,total_pop,households,not_us_citizen_pop,black_pop,hispanic_pop,inc_extremelylow,inc_verylow,inc_low,pop_determined_poverty_status,poverty,no_car,no_cars,male_youth,female_youth,male_seniors,female_seniors,youth_pop,seniors_pop
0,3837562,"POLYGON ((-118.58119 34.14318, -118.58099 34.1...",6037137504,2073.0,694.0,23.0,19.0,64.0,30.0,29.0,102.0,2073.0,90.0,12.0,10.0,266.0,244.0,266.0,297.0,510.0,563.0
1,4472196,"POLYGON ((-118.60573 34.14585, -118.60561 34.1...",6037138000,4673.0,1784.0,198.0,325.0,393.0,270.0,124.0,196.0,4673.0,386.0,0.0,19.0,598.0,425.0,549.0,487.0,1023.0,1036.0
2,1152031,"POLYGON ((-118.53082 34.18024, -118.52952 34.1...",6037139200,5840.0,2172.0,815.0,153.0,1330.0,242.0,315.0,548.0,5840.0,602.0,66.0,108.0,777.0,634.0,309.0,662.0,1411.0,971.0
3,1213095,"POLYGON ((-121.50218 38.55643, -121.50184 38.5...",6067002300,3342.0,1629.0,79.0,55.0,666.0,150.0,67.0,187.0,3342.0,127.0,13.0,31.0,362.0,410.0,242.0,291.0,772.0,533.0
4,3224718,"POLYGON ((-121.50970 38.54070, -121.50960 38.5...",6067002400,4685.0,2011.0,43.0,135.0,440.0,109.0,132.0,256.0,4679.0,159.0,17.0,147.0,302.0,823.0,451.0,682.0,1125.0,1133.0


In [14]:
len(acs_ca)

8057

In [15]:
# join to job data
jobdata=(pd.read_parquet("gs://calitp-analytics-data/data-analyses/ahsc_grant/job_density")
         >> select(_.geo_id,_.jobs_total)
        )

acs_ca = (acs_ca
          >> inner_join(_,jobdata)
         )

In [16]:
len(acs_ca)

8027

In [17]:
# project
acs_ca = acs_ca.to_crs(shared_utils.geography_utils.CA_NAD83Albers)

In [18]:
# join to bus buffers - output is stop-buffer level
stops_acs_joined = trips_ridership_joined.sjoin(acs_ca, how='left', predicate='intersects')

# a tract can be associated with multiple stops
stops_acs_joined.geo_id.value_counts()

06037207710    149
06037207900    126
06037207301    126
06037532400    115
06037226002    108
              ... 
06037701801      1
06037430400      1
06037264103      1
06037301203      1
06037267902      1
Name: geo_id, Length: 1916, dtype: int64

In [19]:
# each stop can be associated with multiple tracts 
stops_acs_joined >> count(_.calitp_itp_id,_.stop_id)

Unnamed: 0,calitp_itp_id,stop_id,n
0,182,1,1
1,182,10000002,4
2,182,10012,2
3,182,10033,3
4,182,10034,4
...,...,...,...
13741,293,96,4
13742,293,97,3
13743,293,98,4
13744,293,983,2


In [21]:
# roll back up to stop level - sum the counts of people/households in tracts touching the buffer
stops_acs_rollup = (stops_acs_joined
                    >> group_by(_.calitp_itp_id,_.stop_id, _.stop_name, _.n_trips_weekday,_.n_trips_sat,_.n_trips_sun,
                                _.n_routes_weekday,_.n_routes_sat,_.n_routes_sun,
                                _.sat_ons,_.sun_ons,_.weekday_ons)
                    >> summarize(sum_tracts = _.geo_id.nunique(),
                                 sum_total_pop = _.total_pop.sum(),
                                 sum_households = _.households.sum(),
                                 sum_not_us_citizen_pop = _.not_us_citizen_pop.sum(),
                                 sum_black_pop = _.black_pop.sum(),
                                 sum_hispanic_pop = _.hispanic_pop.sum(),
                                 sum_youth_pop = _.youth_pop.sum(),
                                 sum_seniors_pop = _.seniors_pop.sum(),
                                 sum_inc_extremelylow = _.inc_extremelylow.sum(),
                                 sum_inc_verylow = _.inc_verylow.sum(),
                                 sum_inc_low = _.inc_low.sum(),
                                 sum_pop_determined_poverty_status = _.pop_determined_poverty_status.sum(), #denominator for poverty rate
                                 sum_poverty = _.poverty.sum(),
                                 sum_no_car = _.no_car.sum(), #workers without access to car
                                 sum_no_cars = _.no_cars.sum(), #households without car
                                 sum_land_area = _.ALAND.sum(),
                                 sum_jobs=_.jobs_total.sum()
                                )
                    >> ungroup()
                   )

stops_acs_rollup >> head (5)                    

Unnamed: 0,calitp_itp_id,stop_id,stop_name,n_trips_weekday,n_trips_sat,n_trips_sun,n_routes_weekday,n_routes_sat,n_routes_sun,sat_ons,sun_ons,weekday_ons,sum_tracts,sum_total_pop,sum_households,sum_not_us_citizen_pop,sum_black_pop,sum_hispanic_pop,sum_youth_pop,sum_seniors_pop,sum_inc_extremelylow,sum_inc_verylow,sum_inc_low,sum_pop_determined_poverty_status,sum_poverty,sum_no_car,sum_no_cars,sum_land_area,sum_jobs
0,182,1,Paramount / Slauson,61.0,55.0,55.0,2.0,2.0,2.0,190.595434,285.893151,2549.213931,1,4031.0,1093.0,696.0,15.0,3766.0,1172.0,656.0,222.0,303.0,197.0,3866.0,338.0,42.0,60.0,3707188,7122.0
1,182,10000002,Artesia / Downey,24.0,15.0,15.0,1.0,1.0,1.0,238.244293,83.385502,1215.045892,4,16203.0,4603.0,3061.0,2968.0,9900.0,6442.0,1482.0,975.0,1090.0,1146.0,16001.0,2844.0,266.0,364.0,3243477,4193.0
2,182,10033,Hyde Park / Gay,50.0,30.0,30.0,1.0,1.0,1.0,262.068722,285.893151,2442.003999,3,14102.0,4613.0,2455.0,5559.0,7016.0,4291.0,1841.0,1357.0,911.0,817.0,13952.0,2738.0,128.0,524.0,2714150,1011.0
3,182,10034,Hyde Park / Long,50.0,30.0,30.0,1.0,1.0,1.0,83.385502,59.561073,1572.412331,4,18631.0,6567.0,2914.0,8455.0,8467.0,5378.0,2549.0,2147.0,1300.0,1201.0,18481.0,3814.0,290.0,945.0,3565703,1809.0
4,182,10035,Hyde Park / Park,50.0,30.0,30.0,1.0,1.0,1.0,452.664156,405.015297,4419.431628,3,15654.0,5108.0,2848.0,4657.0,9669.0,5232.0,1532.0,1324.0,1213.0,1232.0,15469.0,3570.0,207.0,443.0,2568500,1777.0


In [22]:
# derived variables
stops_acs_rollup = (stops_acs_rollup
                     >> mutate(land_area_sqkm=_.sum_land_area/1000000,
                               pop_density = _.sum_total_pop/_.land_area_sqkm,
                               job_density = _.sum_jobs/_.land_area_sqkm,
                               pct_not_us_citizen_pop = _.sum_not_us_citizen_pop/_.sum_total_pop,
                               pct_black_pop = _.sum_black_pop/_.sum_total_pop,
                               pct_hispanic_pop = _.sum_hispanic_pop/_.sum_total_pop,
                               pct_youth_pop = _.sum_youth_pop/_.sum_total_pop,
                               pct_seniors_pop = _.sum_seniors_pop/_.sum_total_pop,
                               pct_inc_extremelylow = _.sum_inc_extremelylow/_.sum_households,
                               pct_inc_verylow = _.sum_inc_verylow/_.sum_households,
                               pct_inc_low = _.sum_inc_low/_.sum_households,
                               pct_poverty = _.sum_poverty/_.sum_pop_determined_poverty_status,
                               pct_pop_workers_no_car = _.sum_no_car/_.sum_total_pop,
                               pct_hh_no_cars = _.sum_no_cars/_.sum_households
                              ) 
                    )   

len(stops_acs_rollup)

13011

In [23]:
# join back point geometry and save out
stops_acs_rollup_geom = (stop_geom
                         >> left_join(_,stops_acs_rollup)
                        )

stops_acs_rollup_geom.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 13746 entries, 0 to 13745
Data columns (total 44 columns):
 #   Column                             Non-Null Count  Dtype   
---  ------                             --------------  -----   
 0   calitp_itp_id                      13746 non-null  int64   
 1   stop_id                            13746 non-null  object  
 2   geometry                           13746 non-null  geometry
 3   stop_name                          13011 non-null  object  
 4   n_trips_weekday                    13011 non-null  float64 
 5   n_trips_sat                        13011 non-null  float64 
 6   n_trips_sun                        13011 non-null  float64 
 7   n_routes_weekday                   13011 non-null  float64 
 8   n_routes_sat                       13011 non-null  float64 
 9   n_routes_sun                       13011 non-null  float64 
 10  sat_ons                            13011 non-null  float64 
 11  sun_ons                          

In [24]:
shared_utils.utils.geoparquet_gcs_export(stops_acs_rollup_geom, GCS_FILE_PATH, 'analytical_tbl.parquet')