In [2]:
# Notebook setup: imports, display options, warehouse connection and the GCS
# folder used by every cell below.
# NOTE(review): the saved output of this cell is
# `ModuleNotFoundError: No module named 'shared_utils'` -- the project's
# shared_utils package has to be installed in the kernel before running.
import os
# 800_000_000_000 bytes = 800 GB (decimal). Set before the calitp imports below.
# NOTE(review): presumably the per-query byte cap used by calitp's BigQuery
# engine -- confirm against the calitp documentation.
os.environ["CALITP_BQ_MAX_BYTES"] = str(800_000_000_000)

import branca
import folium
import shared_utils

# The star import supplies `_` and the verbs used below (filter, select,
# mutate, group_by, ...). Note that it shadows the builtin `filter`.
from siuba import *
import pandas as pd
import geopandas as gpd
import dask.dataframe as dd

# show every column when a frame is displayed
pd.set_option('display.max_columns', None) 

import datetime as dt
import time

from calitp import get_engine
from calitp.tables import tbls

# NOTE(review): `connection` is opened here and never closed in the cells
# visible in this notebook.
engine = get_engine()
connection = engine.connect()

# Base folder for this analysis; ends with "/" so file names are appended directly.
GCS_FILE_PATH = 'gs://calitp-analytics-data/data-analyses/ahsc_grant/'

ModuleNotFoundError: No module named 'shared_utils'

# Creating trips per am peak / midday / pm / weekend by stop
This notebook assembles the by-stop analytical file. This version loops over all operators with GTFS Schedule data for recent analysis days, then joins ACS characteristics.

## File Assembly

In [2]:
# Analysis dates: one weekday, one Saturday and one Sunday of the same week.
analysis_wkd = dt.date(year=2022, month=11, day=2)  # Wednesday
analysis_sat = dt.date(year=2022, month=11, day=5)  # Saturday
analysis_sun = dt.date(year=2022, month=11, day=6)  # Sunday

In [70]:
# Locations of the two inputs inside the analysis folder on GCS.
acs_path = f"{GCS_FILE_PATH}acs_tbl_ca.parquet"
jobs_path = f"{GCS_FILE_PATH}job_density"

# Job data: keep only the geography id and the total job count.
jobdata = pd.read_parquet(jobs_path) >> select(_.geo_id, _.jobs_total)

# Read the ACS table, attach the job counts on the shared columns (rows
# without a match on either side are dropped), then project to the CRS used
# for the rest of the analysis.
acs_ca = (
    gpd.read_parquet(acs_path) >> inner_join(_, jobdata)
).to_crs(shared_utils.geography_utils.CA_NAD83Albers)

In [10]:
# Operators flagged as having at least one GTFS feed, with the three columns
# needed downstream. The query stays lazy until it is collected below.
operators_with_gtfs = (
    tbls.airtable.california_transit_organizations()
    >> filter(_.at_least_one_gtfs_feed_for_any_service == 1)
    >> select(_.itp_id, _.name, _.caltrans_district)
)

# Pull the rows into pandas and drop any operator with a missing value in
# one of the selected columns.
airtable_itp_ids = (operators_with_gtfs >> collect()).dropna()

In [12]:
# Cast the operator id to an integer and order the table by Caltrans district.
itp_id_as_int = mutate(itp_id=_.itp_id.astype('int64'))
by_district = arrange(_.caltrans_district)

airtable_itp_ids = airtable_itp_ids >> itp_id_as_int >> by_district

airtable_itp_ids

Unnamed: 0,itp_id,name,caltrans_district
4,42,Blue Lake Rancheria,01 - Eureka
13,18,City of Arcata,01 - Eureka
20,108,City of Eureka,01 - Eureka
98,261,Redwood Coast Transit Authority,01 - Eureka
100,135,Humboldt Transit Authority,01 - Eureka
...,...,...,...
22,154,City of Laguna Beach,12 - Irvine
57,14,Anaheim Transportation Network,12 - Irvine
111,235,Orange County Transportation Authority,12 - Irvine
150,394,City of San Juan Capistrano,12 - Irvine


In [73]:
# per-stop bus service joined to ACS / job characteristics for one service day
def daily_trips(day_type, analysis_dt, itp_ids=None, buffer_meters=402.336):
    """
    Build the per-stop table of bus service and surrounding ACS / job
    characteristics for one service day.

    Parameters
    ----------
    day_type : str
        Label written to the `daytype` column (e.g. "Weekday").
    analysis_dt : datetime.date
        Service date passed to the shared_utils.gtfs_utils queries and
        written to the `analysis_date` column.
    itp_ids : list, optional
        Operator ids to query. When omitted, the notebook-level variable
        `itp_id` is used (the original behaviour), so existing two-argument
        calls made inside a `for itp_id in ...` loop keep working.
    buffer_meters : float, optional
        Radius of the buffer drawn around each stop before the spatial join.
        The default, 402.336, is a quarter mile expressed in metres.

    Returns
    -------
    DataFrame with one row per operator / stop served by at least one bus
    trip on `analysis_dt`: trip and route counts, sums of the ACS and job
    columns over every tract touching the stop buffer, derived densities and
    percentages, and the stop's point geometry as returned by get_stops.

    Notes
    -----
    Reads the notebook-level `acs_ca` table.
    """
    if itp_ids is None:
        # fall back to the loop variable the calling cell defines
        itp_ids = [itp_id]

    trips_df = shared_utils.gtfs_utils.get_trips(
        selected_date = analysis_dt,
        itp_id_list = itp_ids,
        trip_cols = ["calitp_itp_id","trip_id","route_id"]
    )

    #routes contains route type - filter to bus
    routes_df = shared_utils.gtfs_utils.get_route_info(
        selected_date = analysis_dt,
        itp_id_list = itp_ids,
        route_cols = ["calitp_itp_id","route_type","route_id"]
    )

    bus_trips = (trips_df
                 >> left_join(_,routes_df)
                 >> filter(_.route_type=="3")
                )

    # stop times contains how many trips go by a stop.
    # inner_join (not left_join) so that stop_times rows belonging to non-bus
    # trips are dropped; with a left join those rows kept their trip_id and
    # were still counted in n_trips even though the bus filter was applied.
    stoptimes_df = (shared_utils.gtfs_utils.get_stop_times(
        selected_date = analysis_dt,
        itp_id_list = itp_ids,
        get_df = True
    )
        >> inner_join(_,bus_trips)
        >> group_by(_.calitp_itp_id,_.stop_id)
        >> summarize(n_trips = _.trip_id.nunique(),
                     n_routes = _.route_id.nunique()
                    )
        >> ungroup()
        >> mutate(daytype = day_type,
                  analysis_date = analysis_dt
                 )
                   )

    #stop geometry
    stops_geo = shared_utils.gtfs_utils.get_stops(
                     selected_date = analysis_dt,
                     itp_id_list = itp_ids
    )

    # attach the counts to the stop points and move to the analysis CRS
    # NOTE(review): the buffer below assumes CA_NAD83Albers is a metre-based
    # projection -- confirm in shared_utils.geography_utils.
    stoptimes_geo = (stops_geo.merge(stoptimes_df, on = ["calitp_itp_id","stop_id"])
                     >> select (_.calitp_itp_id,_.stop_desc,_.stop_name,_.stop_id,
                                _.geometry,_.n_trips,_.n_routes,_.daytype,_.analysis_date
                               )
                    ).to_crs(shared_utils.geography_utils.CA_NAD83Albers)

    #replace geometry with a buffer around each stop (.25mi by default)
    stoptimes_geo.geometry = stoptimes_geo.buffer(buffer_meters)

    #join w/ acs data: one row per (stop, intersecting tract)
    stops_acs_joined = stoptimes_geo.sjoin(acs_ca, how='left', predicate='intersects')

    #roll back up to stop level, put back point geometry
    # Tract values are summed whole for every tract the buffer touches; they
    # are not apportioned to the share of the tract inside the buffer.
    stops_acs_rollup = (stops_acs_joined
                    >> group_by(_.calitp_itp_id,_.stop_id, _.stop_name,
                                _.n_trips,_.n_routes,_.daytype,_.analysis_date)
                    >> summarize(sum_tracts = _.geo_id.nunique(),
                                 sum_total_pop = _.total_pop.sum(),
                                 sum_households = _.households.sum(),
                                 sum_not_us_citizen_pop = _.not_us_citizen_pop.sum(),
                                 sum_youth_pop = _.youth_pop.sum(),
                                 sum_seniors_pop = _.seniors_pop.sum(),
                                 sum_pop_determined_poverty_status = _.pop_determined_poverty_status.sum(), #denominator for poverty rate
                                 sum_poverty = _.poverty.sum(),
                                 sum_no_car = _.no_car.sum(), #workers without access to car
                                 sum_no_cars = _.no_cars.sum(), #households without car
                                 sum_land_area = _.ALAND.sum(),
                                 sum_jobs=_.jobs_total.sum()
                                )
                    >> ungroup()
                    >> mutate(land_area_sqkm=_.sum_land_area/1000000,
                           pop_density = _.sum_total_pop/_.land_area_sqkm,
                           job_density = _.sum_jobs/_.land_area_sqkm,
                           pct_not_us_citizen_pop = (_.sum_not_us_citizen_pop/_.sum_total_pop)*100,
                           pct_youth_pop = (_.sum_youth_pop/_.sum_total_pop)*100,
                           pct_seniors_pop = (_.sum_seniors_pop/_.sum_total_pop)*100,
                           pct_poverty = (_.sum_poverty/_.sum_pop_determined_poverty_status)*100,
                           pct_pop_workers_no_car = (_.sum_no_car/_.sum_total_pop)*100,
                           pct_hh_no_cars = (_.sum_no_cars/_.sum_households)*100
                          )
                    # swap the buffer for the original stop point
                    >> left_join(_,(stops_geo
                                    >> select(_.calitp_itp_id,_.stop_name,_.stop_id, _.geometry)
                                   )
                                )
                   )

    return stops_acs_rollup

Testing with 1 operator

In [71]:
# Build the weekday / Saturday / Sunday tables for each operator, then write
# the combined table to GCS once.
# daily_trips reads the loop variable `itp_id`, so the name must stay as is.
operator_frames = []
for itp_id in [300]:
    print(f"processing agency {itp_id}...")
    stops_weekday = daily_trips("Weekday",analysis_wkd)
    stops_saturday = daily_trips("Saturday",analysis_sat)
    stops_sunday = daily_trips("Sunday",analysis_sun)
    operator_frames.extend([stops_weekday, stops_saturday, stops_sunday])

# Concatenate and export after the loop: exporting inside the loop under a
# fixed file name would leave only the last operator in the file as soon as
# more than one operator is processed.
stoptimes_all = pd.concat(operator_frames, ignore_index=True)

# GCS_FILE_PATH already ends with "/", so no extra slash before the folder
# (the original produced ".../ahsc_grant//tool_data").
# NOTE(review): the trailing "/" assumes geoparquet_gcs_export concatenates
# folder and file name directly -- confirm against shared_utils.utils.
shared_utils.utils.geoparquet_gcs_export(stoptimes_all, f"{GCS_FILE_PATH}tool_data/", "tbl1_trips_perstop")

In [72]:
# Preview the first ten rows of the weekday table.
# NOTE(review): the saved output below shows the pct_* columns as fractions
# (e.g. 0.125432 = 2829 / 22554), while daily_trips multiplies them by 100,
# so the output predates the current function -- re-run after a kernel restart.
stops_weekday.head(10)

Unnamed: 0,calitp_itp_id,stop_id,stop_name,n_trips,n_routes,daytype,analysis_date,sum_tracts,sum_total_pop,sum_households,sum_not_us_citizen_pop,sum_youth_pop,sum_seniors_pop,sum_pop_determined_poverty_status,sum_poverty,sum_no_car,sum_no_cars,sum_land_area,sum_jobs,land_area_sqkm,pop_density,job_density,pct_not_us_citizen_pop,pct_youth_pop,pct_seniors_pop,pct_poverty,pct_pop_workers_no_car,pct_hh_no_cars,geometry
0,300,1001,20TH SB & DELAWARE FS,26,1,Weekday,2022-11-02,4,22554.0,9447.0,2829.0,6660.0,2987.0,22391.0,3244.0,491.0,912.0,6952011,29902.0,6.952011,3244.241127,4301.20148,0.125432,0.295291,0.132438,0.14488,0.02177,0.096539,POINT (-118.47195 34.02168)
1,300,1002,20TH SB & PEARL NS,26,1,Weekday,2022-11-02,4,22554.0,9447.0,2829.0,6660.0,2987.0,22391.0,3244.0,491.0,912.0,6952011,29902.0,6.952011,3244.241127,4301.20148,0.125432,0.295291,0.132438,0.14488,0.02177,0.096539,POINT (-118.46741 34.01687)
2,300,1017,ROBERTSON NB & CASHIO NS,1,1,Weekday,2022-11-02,3,11551.0,5050.0,1419.0,3416.0,1818.0,11533.0,1730.0,457.0,804.0,1729846,3390.0,1.729846,6677.473024,1959.712021,0.122847,0.295732,0.157389,0.150004,0.039564,0.159208,POINT (-118.38411 34.05184)
3,300,1021,NATIONAL WB & BAGLEY FS,44,1,Weekday,2022-11-02,4,16341.0,7915.0,1528.0,3578.0,1947.0,16298.0,1635.0,188.0,591.0,4230729,6780.0,4.230729,3862.45491,1602.560693,0.093507,0.218958,0.119148,0.100319,0.011505,0.074668,POINT (-118.39628 34.03140)
4,300,1023,NATIONAL WB & CASTLE HEIGHTS NS,44,1,Weekday,2022-11-02,4,15550.0,7694.0,1614.0,3188.0,1705.0,15507.0,1692.0,200.0,500.0,3609918,4622.0,3.609918,4307.57707,1280.361493,0.103794,0.205016,0.109646,0.109112,0.012862,0.064986,POINT (-118.39976 34.03143)
5,300,1027,OCEAN PARK WB & BEVERLEY NS,35,1,Weekday,2022-11-02,4,20777.0,10914.0,1933.0,4204.0,3024.0,20747.0,1955.0,679.0,1197.0,4163446,9043.0,4.163446,4990.337331,2171.998868,0.093036,0.202339,0.145546,0.09423,0.03268,0.109676,POINT (-118.47870 34.00564)
6,300,1028,OLIVE NB & 9TH FS,3,1,Weekday,2022-11-02,3,15289.0,9872.0,2748.0,2666.0,1575.0,15255.0,3445.0,1515.0,2717.0,2434388,96808.0,2.434388,6280.428592,39766.873645,0.179737,0.174374,0.103015,0.225828,0.099091,0.275223,POINT (-118.25797 34.04393)
7,300,1030,PALMS WB & JASMINE FS,44,1,Weekday,2022-11-02,7,26287.0,13148.0,3713.0,5280.0,2378.0,26014.0,2623.0,500.0,876.0,4439091,6769.0,4.439091,5921.707845,1524.861734,0.141249,0.20086,0.090463,0.10083,0.019021,0.066626,POINT (-118.40668 34.02741)
8,300,1031,PALMS WB & MOTOR FS,44,1,Weekday,2022-11-02,6,25059.0,12308.0,4436.0,4703.0,2244.0,24732.0,2261.0,684.0,860.0,2680431,5436.0,2.680431,9348.869641,2028.032059,0.177022,0.187677,0.089549,0.09142,0.027296,0.069873,POINT (-118.40891 34.02624)
9,300,104,SANTA MONICA EB & 3RD FS,48,3,Weekday,2022-11-02,1,4598.0,3022.0,512.0,648.0,949.0,4598.0,811.0,258.0,831.0,1789786,23956.0,1.789786,2569.022218,13384.840422,0.111353,0.140931,0.206394,0.176381,0.056111,0.274983,POINT (-118.49565 34.01588)
