In [1]:
import os
# 800_000_000_000 bytes = 800 GB (decimal). Set before calitp is imported below;
# presumably this raises calitp's BigQuery bytes-scanned limit — TODO confirm against calitp docs.
os.environ["CALITP_BQ_MAX_BYTES"] = str(800_000_000_000)

import branca
import folium
import shared_utils

# NOTE: the star import supplies `_` and the verbs used throughout this notebook
# (select, filter, mutate, arrange, group_by, summarize, the *_join verbs, collect, ...).
# It also shadows the builtin `filter` with siuba's verb.
from siuba import *
import pandas as pd
import geopandas as gpd
import dask.dataframe as dd

# show every column when a DataFrame is displayed
pd.set_option('display.max_columns', None) 

import datetime as dt
import time

from calitp import get_engine
from calitp.tables import tbls

# warehouse engine / connection obtained from calitp
engine = get_engine()
connection = engine.connect()

# GCS prefix that every input read and output export in this notebook is built from
GCS_FILE_PATH = 'gs://calitp-analytics-data/data-analyses/ahsc_grant/'



# Creating trips per am peak / midday / pm / weekend by stop
This notebook assembles the by-stop analytical file. This version loops over all operators with GTFS Schedule data for recent analysis days, and then joins ACS characteristics.

## File Assembly

In [2]:
# Service dates analysed: one weekday, one Saturday and one Sunday of the same week.
analysis_wkd = dt.date.fromisoformat("2022-11-02")  # Wednesday
analysis_sat = dt.date.fromisoformat("2022-11-05")  # Saturday
analysis_sun = dt.date.fromisoformat("2022-11-06")  # Sunday

In [3]:
# Job totals per geo_id; only the two columns needed for the join are kept.
jobdata = (
    pd.read_parquet(f"{GCS_FILE_PATH}job_density")
    >> select(_.geo_id, _.jobs_total)
)

# ACS table: read, keep only rows that also have job data (inner join on the
# shared columns), then project to the CA_NAD83Albers CRS.
acs_ca = (
    gpd.read_parquet(f"{GCS_FILE_PATH}acs_tbl_ca.parquet")
    >> inner_join(_, jobdata)
).to_crs(shared_utils.geography_utils.CA_NAD83Albers)

In [4]:
# Operators listed in Airtable that have at least one GTFS feed.
operators_query = (
    tbls.airtable.california_transit_organizations()
    >> filter(_.at_least_one_gtfs_feed_for_any_service == 1)
    >> select(_.itp_id, _.name, _.caltrans_district)
)

# Pull the query result into memory, then drop rows missing any of the three fields.
airtable_itp_ids = (operators_query >> collect()).dropna()

In [5]:
# Cast the operator ids to int64 ...
operators_int_ids = airtable_itp_ids >> mutate(itp_id=_.itp_id.astype('int64'))

# ... and order the operators by Caltrans district.
airtable_itp_ids = operators_int_ids >> arrange(_.caltrans_district)

airtable_itp_ids

Unnamed: 0,itp_id,name,caltrans_district
4,42,Blue Lake Rancheria,01 - Eureka
13,18,City of Arcata,01 - Eureka
20,108,City of Eureka,01 - Eureka
98,261,Redwood Coast Transit Authority,01 - Eureka
100,135,Humboldt Transit Authority,01 - Eureka
...,...,...,...
22,154,City of Laguna Beach,12 - Irvine
57,14,Anaheim Transportation Network,12 - Irvine
111,235,Orange County Transportation Authority,12 - Irvine
150,394,City of San Juan Capistrano,12 - Irvine


In [26]:
#get table of trip keys for different days
def daily_trips(day_type, analysis_dt, itp_id=None):
    """
    Build the per-stop bus service table for one operator on one service day,
    with ACS / job characteristics summed over a quarter-mile buffer around each stop.

    Parameters
    ----------
    day_type : str
        Label written to the `daytype` column (e.g. "Weekday", "Saturday", "Sunday").
    analysis_dt : datetime.date
        Service date passed to the shared_utils.gtfs_utils queries and written
        to the `analysis_date` column.
    itp_id : int, optional
        Operator ITP id to query. When omitted, the module-level `itp_id`
        (the variable set by the calling loop) is used, so existing two-argument
        calls keep working.

    Returns
    -------
    One row per stop with `n_trips` / `n_routes` for bus trips (route_type "3"),
    the summed ACS and job totals of every `acs_ca` row intersecting the stop's
    buffer, the derived density / percentage columns, and the stop's point
    geometry from `get_stops`. An empty `pd.DataFrame` is returned when the
    operator has no trips, no bus trips, or no stop times for its bus trips
    on that date.

    Raises
    ------
    NameError
        If `itp_id` is not passed and no module-level `itp_id` is defined.
    """
    if itp_id is None:
        # Backward compatibility: fall back to the notebook-level loop variable.
        itp_id = globals().get("itp_id")
        if itp_id is None:
            raise NameError("name 'itp_id' is not defined; pass itp_id= to daily_trips()")

    # 0.25 mi expressed in metres (0.25 * 1609.344); assumes the CA_NAD83Albers
    # projection used below is metre-based — TODO confirm.
    quarter_mile_m = 402.336

    trips_df = shared_utils.gtfs_utils.get_trips(
        selected_date = analysis_dt,
        itp_id_list = [itp_id],
        trip_cols = ["calitp_itp_id","trip_id","route_id"]
    )

    if trips_df.empty:
        return pd.DataFrame()

    #routes contains route type - filter to bus
    routes_df = shared_utils.gtfs_utils.get_route_info(
        selected_date = analysis_dt,
        itp_id_list = [itp_id],
        route_cols = ["calitp_itp_id","route_type","route_id"]
    )

    trips_df = (trips_df
                >> left_join(_,routes_df)
                >> filter(_.route_type=="3")
               )

    # Operator runs service that day, but none of it is bus: nothing to report.
    if trips_df.empty:
        return pd.DataFrame()

    # stop times contains how many trips go by a stop
    # inner_join (not left_join): a left join keeps the stop_times rows of
    # non-bus trips, and their trip_id would still be counted in n_trips.
    stoptimes_df = (shared_utils.gtfs_utils.get_stop_times(
        selected_date = analysis_dt,
        itp_id_list = [itp_id],
        get_df = True
    )
        >> inner_join(_,trips_df)
        >> group_by(_.calitp_itp_id,_.stop_id)
        >> summarize(n_trips = _.trip_id.nunique(),
                     n_routes = _.route_id.nunique()
                    )
        >> ungroup()
        >> mutate(daytype = day_type,
                  analysis_date = analysis_dt
                 )
                   )

    if stoptimes_df.empty:
        return pd.DataFrame()

    #stop geometry
    stops_geo = shared_utils.gtfs_utils.get_stops(
        selected_date = analysis_dt,
        itp_id_list = [itp_id]
    )

    stoptimes_geo = (stops_geo.merge(stoptimes_df, on = ["calitp_itp_id","stop_id"])
                     >> select (_.calitp_itp_id,_.stop_desc,_.stop_name,_.stop_id,
                                _.geometry,_.n_trips,_.n_routes,_.daytype,_.analysis_date
                               )
                    ).to_crs(shared_utils.geography_utils.CA_NAD83Albers)

    #replace geometry with a .25mi buffer
    stoptimes_geo.geometry = stoptimes_geo.buffer(quarter_mile_m)

    #join w/ acs data: one row per (stop, intersecting acs_ca row)
    stops_acs_joined = stoptimes_geo.sjoin(acs_ca, how='left', predicate='intersects')

    #roll back up to stop level
    # NOTE(review): stop_name is a grouping key; if the underlying groupby drops
    # null keys, stops without a name would fall out here — confirm.
    stops_acs_rollup = (stops_acs_joined
                    >> group_by(_.calitp_itp_id,_.stop_id, _.stop_name,
                                _.n_trips,_.n_routes,_.daytype,_.analysis_date)
                    >> summarize(sum_tracts = _.geo_id.nunique(),
                                 sum_total_pop = _.total_pop.sum(),
                                 sum_households = _.households.sum(),
                                 sum_not_us_citizen_pop = _.not_us_citizen_pop.sum(),
                                 sum_youth_pop = _.youth_pop.sum(),
                                 sum_seniors_pop = _.seniors_pop.sum(),
                                 sum_pop_determined_poverty_status = _.pop_determined_poverty_status.sum(), #denominator for poverty rate
                                 sum_poverty = _.poverty.sum(),
                                 sum_no_car = _.no_car.sum(), #workers without access to car
                                 sum_no_cars = _.no_cars.sum(), #households without car
                                 sum_land_area = _.ALAND.sum(),
                                 sum_jobs=_.jobs_total.sum()
                                )
                    >> ungroup()
                    >> mutate(land_area_sqkm=_.sum_land_area/1000000,
                           pop_density = _.sum_total_pop/_.land_area_sqkm,
                           job_density = _.sum_jobs/_.land_area_sqkm,
                           pct_not_us_citizen_pop = (_.sum_not_us_citizen_pop/_.sum_total_pop)*100,
                           pct_youth_pop = (_.sum_youth_pop/_.sum_total_pop)*100,
                           pct_seniors_pop = (_.sum_seniors_pop/_.sum_total_pop)*100,
                           pct_poverty = (_.sum_poverty/_.sum_pop_determined_poverty_status)*100,
                           pct_pop_workers_no_car = (_.sum_no_car/_.sum_total_pop)*100,
                           pct_hh_no_cars = (_.sum_no_cars/_.sum_households)*100
                          )
                       )

    #put back point geometry (the rollup dropped the buffer geometry)
    stops_acs_rollup_gpd = (stops_geo
                            >> select(_.calitp_itp_id,_.stop_name,_.stop_id, _.geometry)
                            >> right_join(_,stops_acs_rollup)
                           )

    return stops_acs_rollup_gpd

In [None]:
# Build and export the weekday / Saturday / Sunday per-stop table for every operator.
# NOTE: the loop variable must stay named `itp_id` — daily_trips() is called without
# an operator argument and picks the operator up from this module-level name.
for itp_id in airtable_itp_ids['itp_id'].tolist():
    print(f"processing agency {itp_id}...")
    stops_weekday = daily_trips("Weekday",analysis_wkd)
    stops_saturday = daily_trips("Saturday",analysis_sat)
    stops_sunday = daily_trips("Sunday",analysis_sun)

    # daily_trips() returns an empty pd.DataFrame for days without service.
    # Drop those so the concatenated result is built only from the frames that
    # carry stop geometry.
    daily_frames = [
        day_df
        for day_df in (stops_weekday, stops_saturday, stops_sunday)
        if not day_df.empty
    ]

    # Nothing to write for this operator: skip instead of sending an empty,
    # geometry-less frame to the geoparquet export.
    if not daily_frames:
        print(f"no trips found for agency {itp_id}, skipping export")
        continue

    stoptimes_all = pd.concat(daily_frames, ignore_index=True)
    shared_utils.utils.geoparquet_gcs_export(stoptimes_all, f"{GCS_FILE_PATH}tool_data/", f"trips_perstop_{itp_id}")

processing agency 42...
processing agency 18...
processing agency 108...
processing agency 261...
processing agency 135...
processing agency 198...
processing agency 159...
processing agency 344...
processing agency 329...
processing agency 254...
processing agency 259...
processing agency 83...
processing agency 334...
processing agency 162...
processing agency 331...
processing agency 271...
processing agency 351...
processing agency 23...
processing agency 221...
processing agency 47...
processing agency 372...
processing agency 251...
processing agency 101...
processing agency 376...
processing agency 273...
processing agency 81...
processing agency 13...
processing agency 194...
processing agency 315...
processing agency 246...
processing agency 356...
processing agency 61...
processing agency 279...
processing agency 280...
processing agency 110...
processing agency 247...
processing agency 350...
processing agency 167...
processing agency 310...
processing agency 314...
processi