In [1]:
# Notebook setup: environment, imports, warehouse connection, and storage path.
import os
# Set before the calitp imports below; presumably calitp reads this as its
# BigQuery bytes cap when the engine is created -- TODO confirm.
os.environ["CALITP_BQ_MAX_BYTES"] = str(800_000_000_000) # 800_000_000_000 bytes = 800 GB (decimal)

import branca
import folium
import shared_utils

# NOTE: the star import supplies the siuba verbs and `_` used throughout this
# notebook (select, filter, mutate, group_by, summarize, joins, collect, ...).
from siuba import *
import pandas as pd
import geopandas as gpd
import dask.dataframe as dd

# show every column when a DataFrame is displayed
pd.set_option('display.max_columns', None) 

import datetime as dt
import time

from calitp import get_engine
from calitp.tables import tbls

# warehouse engine plus an open connection
# NOTE(review): `connection` is not referenced in the cells visible here -- confirm it is needed.
engine = get_engine()
connection = engine.connect()

# GCS prefix used for every parquet read/write in this notebook
GCS_FILE_PATH = 'gs://calitp-analytics-data/data-analyses/ahsc_grant/'



# Creating trips per am peak / midday / pm / weekend by stop
This notebook assembles the by-stop analytical file. This version loops over all operators with GTFS Schedule data for recent analysis days, and then joins ACS characteristics.

## File Assembly

In [2]:
# set date parameters: one representative service date per day type
analysis_wkd = dt.date(year=2022, month=11, day=2)  # a Wednesday
analysis_sat = dt.date(year=2022, month=11, day=5)  # a Saturday
analysis_sun = dt.date(year=2022, month=11, day=6)  # a Sunday

In [3]:
# job totals per geography; only the two columns needed downstream are kept
jobdata = (
    pd.read_parquet(f"{GCS_FILE_PATH}job_density")
    >> select(_.geo_id, _.jobs_total)
)

# ACS table, inner-joined to the job data on their shared column(s)
# and projected to the California Albers CRS used for the spatial steps
acs_ca = (
    gpd.read_parquet(f"{GCS_FILE_PATH}acs_tbl_ca.parquet")
    >> inner_join(_, jobdata)
).to_crs(shared_utils.geography_utils.CA_NAD83Albers)

In [4]:
# get all operators: organizations flagged as having at least one GTFS feed
gtfs_orgs_query = (
    tbls.airtable.california_transit_organizations()
    >> filter(_.at_least_one_gtfs_feed_for_any_service == 1)
    >> select(_.itp_id, _.name, _.caltrans_district)
)

# pull the query result into pandas and drop rows with any missing value
airtable_itp_ids = (gtfs_orgs_query >> collect()).dropna()

In [5]:
# cast the ITP ID to an integer, then order operators by Caltrans district
airtable_itp_ids = arrange(
    mutate(airtable_itp_ids, itp_id=_.itp_id.astype('int64')),
    _.caltrans_district,
)

airtable_itp_ids

Unnamed: 0,itp_id,name,caltrans_district
4,42,Blue Lake Rancheria,01 - Eureka
13,18,City of Arcata,01 - Eureka
20,108,City of Eureka,01 - Eureka
98,261,Redwood Coast Transit Authority,01 - Eureka
100,135,Humboldt Transit Authority,01 - Eureka
...,...,...,...
22,154,City of Laguna Beach,12 - Irvine
57,14,Anaheim Transportation Network,12 - Irvine
111,235,Orange County Transportation Authority,12 - Irvine
150,394,City of San Juan Capistrano,12 - Irvine


In [6]:
#get table of trip keys for different days
def daily_trips(day_type, analysis_dt, agency_id=None):
    """
    Build the stop-level bus service table for one operator on one service
    date, with ACS / jobs characteristics of the area around each stop.

    Parameters
    ----------
    day_type : str
        Label written to the `daytype` column (callers in this notebook pass
        "Weekday", "Saturday" or "Sunday").
    analysis_dt : datetime.date
        Service date; passed as `selected_date` to every gtfs_utils query and
        written to the `analysis_date` column.
    agency_id : int, optional
        ITP ID of the operator to query. When omitted, the module-level
        `itp_id` (the loop variable of the calling cell) is used, so existing
        two-argument calls keep working.

    Returns
    -------
    pandas.DataFrame or geopandas.GeoDataFrame
        An empty `pd.DataFrame()` when the operator has no trips, or no bus
        trips, on that date. Otherwise stop-level rows with `n_trips`,
        `n_routes`, the summed ACS / jobs columns, derived densities and
        percentages, and the stop geometry as returned by `get_stops`
        (that geometry is not reprojected here).
    """
    if agency_id is None:
        # backward-compatible fallback to the loop variable of the calling cell
        agency_id = itp_id

    # 0.25 mi expressed in meters (0.25 * 1609.344); assumes CA_NAD83Albers
    # uses meter units -- TODO confirm
    quarter_mile_m = 402.336

    trips_df = shared_utils.gtfs_utils.get_trips(
        selected_date = analysis_dt,
        itp_id_list = [agency_id],
        trip_cols = ["calitp_itp_id","trip_id","route_id"]
    )

    if trips_df.empty:
        return pd.DataFrame()

    #routes contains route type - filter to bus
    routes_df = shared_utils.gtfs_utils.get_route_info(
        selected_date = analysis_dt,
        itp_id_list = [agency_id],
        route_cols = ["calitp_itp_id","route_type","route_id"]
    )

    trips_df = (trips_df
                     >> left_join(_,routes_df)
                     >> filter(_.route_type=="3")  # GTFS route_type 3 = bus
               )

    # operator runs service on this date, but none of it is bus
    if trips_df.empty:
        return pd.DataFrame()

    # stop times contains how many trips go by a stop
    # inner_join (not left_join) so that stop_times rows belonging to trips
    # removed by the bus filter are dropped instead of still being counted
    # in n_trips
    stoptimes_df = (shared_utils.gtfs_utils.get_stop_times(
        selected_date = analysis_dt,
        itp_id_list = [agency_id],
        get_df = True
    )
        >> inner_join(_,trips_df)
        >> group_by(_.calitp_itp_id,_.stop_id)
        >> summarize(n_trips = _.trip_id.nunique(),
                     n_routes = _.route_id.nunique()
                    )
        >> ungroup()
        >> mutate(daytype = day_type,
                  analysis_date = analysis_dt
                 )
                       )
    #stop geometry
    stops_geo = shared_utils.gtfs_utils.get_stops(
                     selected_date = analysis_dt,
                     itp_id_list = [agency_id]
    )

    # attach the trip counts to the stop points and project for buffering
    stoptimes_geo = (stops_geo.merge(stoptimes_df, on = ["calitp_itp_id","stop_id"])
                     >> select (_.calitp_itp_id,_.stop_desc,_.stop_name,_.stop_id,
                                _.geometry,_.n_trips,_.n_routes,_.daytype,_.analysis_date
                               )
                    ).to_crs(shared_utils.geography_utils.CA_NAD83Albers)

    #replace geometry with a .25mi buffer
    stoptimes_geo.geometry = stoptimes_geo.buffer(quarter_mile_m)

    #join w/ acs data: every ACS geography that intersects the buffer
    stops_acs_joined = stoptimes_geo.sjoin(acs_ca, how='left', predicate='intersects')

    #roll back up to stop level
    stops_acs_rollup = (stops_acs_joined
                    >> group_by(_.calitp_itp_id,_.stop_id, _.stop_name,
                                _.n_trips,_.n_routes,_.daytype,_.analysis_date)
                    >> summarize(sum_tracts = _.geo_id.nunique(),
                                 sum_total_pop = _.total_pop.sum(),
                                 sum_households = _.households.sum(),
                                 sum_not_us_citizen_pop = _.not_us_citizen_pop.sum(),
                                 sum_youth_pop = _.youth_pop.sum(),
                                 sum_seniors_pop = _.seniors_pop.sum(),
                                 sum_pop_determined_poverty_status = _.pop_determined_poverty_status.sum(), #denominator for poverty rate
                                 sum_poverty = _.poverty.sum(),
                                 sum_no_car = _.no_car.sum(), #workers without access to car
                                 sum_no_cars = _.no_cars.sum(), #households without car
                                 sum_land_area = _.ALAND.sum(),
                                 sum_jobs=_.jobs_total.sum()
                                )
                    >> ungroup()
                    # presumably ALAND is in square meters, hence /1e6 for sq km -- confirm
                    >> mutate(land_area_sqkm=_.sum_land_area/1000000,
                           pop_density = _.sum_total_pop/_.land_area_sqkm,
                           job_density = _.sum_jobs/_.land_area_sqkm,
                           pct_not_us_citizen_pop = (_.sum_not_us_citizen_pop/_.sum_total_pop)*100,
                           pct_youth_pop = (_.sum_youth_pop/_.sum_total_pop)*100,
                           pct_seniors_pop = (_.sum_seniors_pop/_.sum_total_pop)*100,
                           pct_poverty = (_.sum_poverty/_.sum_pop_determined_poverty_status)*100,
                           pct_pop_workers_no_car = (_.sum_no_car/_.sum_total_pop)*100,
                           pct_hh_no_cars = (_.sum_no_cars/_.sum_households)*100
                          )
                       )

    #put back point geometry (the rollup above dropped the buffer geometry)
    stops_acs_rollup_gpd = (stops_geo
                            >> select(_.calitp_itp_id,_.stop_name,_.stop_id, _.geometry)
                            >> right_join(_,stops_acs_rollup)
                           )

    return stops_acs_rollup_gpd

In [7]:
# One export per operator. daily_trips() reads the current operator from the
# module-level `itp_id` set by this loop.
for itp_id in airtable_itp_ids['itp_id'].tolist():
    print(f"processing agency {itp_id} starting {dt.datetime.now()}...")

    # same pipeline for each day type, evaluated in weekday / Saturday / Sunday order
    stops_weekday, stops_saturday, stops_sunday = (
        daily_trips(day_label, service_date)
        for day_label, service_date in (
            ("Weekday", analysis_wkd),
            ("Saturday", analysis_sat),
            ("Sunday", analysis_sun),
        )
    )

    # stack the three day types and write one geoparquet per operator
    stoptimes_all = pd.concat(
        [stops_weekday, stops_saturday, stops_sunday],
        ignore_index=True,
    )
    shared_utils.utils.geoparquet_gcs_export(
        stoptimes_all,
        f"{GCS_FILE_PATH}tool_data/",
        f"trips_perstop_{itp_id}",
    )

processing agency 42 starting 2022-11-21 18:01:20.224278...
processing agency 18 starting 2022-11-21 18:01:51.858258...
processing agency 108 starting 2022-11-21 18:02:22.836837...
processing agency 261 starting 2022-11-21 18:02:55.015861...
processing agency 135 starting 2022-11-21 18:03:23.382525...
processing agency 198 starting 2022-11-21 18:03:54.164330...
processing agency 159 starting 2022-11-21 18:04:33.654365...
processing agency 344 starting 2022-11-21 18:05:03.870159...
processing agency 329 starting 2022-11-21 18:05:20.318013...
processing agency 254 starting 2022-11-21 18:05:46.836048...
processing agency 259 starting 2022-11-21 18:05:51.434541...
processing agency 83 starting 2022-11-21 18:06:24.503583...
processing agency 334 starting 2022-11-21 18:06:41.576114...
processing agency 162 starting 2022-11-21 18:07:08.164352...
processing agency 331 starting 2022-11-21 18:07:34.082020...
processing agency 271 starting 2022-11-21 18:08:13.708577...
processing agency 351 start

DatabaseError: (google.cloud.bigquery.dbapi.exceptions.DatabaseError) 403 Custom quota exceeded: Your usage exceeded the custom quota for QueryUsagePerDay, which is set by your administrator. For more information, see https://cloud.google.com/bigquery/cost-controls

Location: us-west2
Job ID: 0ee2d333-1292-49dd-854c-350b3a7a45c4

[SQL: SELECT DISTINCT `anon_1`.`feed_key`, `anon_1`.`stop_key`, `anon_1`.`date`, `anon_1`.`calitp_itp_id`, `anon_1`.`stop_id`, `anon_1`.`zone_id`, `anon_1`.`tts_stop_name`, `anon_1`.`stop_name`, `anon_1`.`wheelchair_boarding`, `anon_1`.`location_type`, `anon_1`.`calitp_extracted_at`, `anon_1`.`calitp_deleted_at`, `anon_1`.`stop_desc`, `anon_1`.`level_id`, `anon_1`.`stop_lat`, `anon_1`.`stop_timezone`, `anon_1`.`calitp_url_number`, `anon_1`.`stop_lon`, `anon_1`.`platform_code`, `anon_1`.`calitp_hash`, `anon_1`.`stop_code`, `anon_1`.`stop_url`, `anon_1`.`parent_station` 
FROM (SELECT `anon_2`.`feed_key` AS `feed_key`, `anon_2`.`stop_key` AS `stop_key`, `anon_2`.`date` AS `date`, `anon_2`.`calitp_itp_id` AS `calitp_itp_id`, `anon_2`.`stop_id` AS `stop_id`, `anon_2`.`zone_id` AS `zone_id`, `anon_2`.`tts_stop_name` AS `tts_stop_name`, `anon_2`.`stop_name` AS `stop_name`, `anon_2`.`wheelchair_boarding` AS `wheelchair_boarding`, `anon_2`.`location_type` AS `location_type`, `anon_2`.`calitp_extracted_at` AS `calitp_extracted_at`, `anon_2`.`calitp_deleted_at` AS `calitp_deleted_at`, `anon_2`.`stop_desc` AS `stop_desc`, `anon_2`.`level_id` AS `level_id`, `anon_2`.`stop_lat` AS `stop_lat`, `anon_2`.`stop_timezone` AS `stop_timezone`, `anon_2`.`calitp_url_number` AS `calitp_url_number`, `anon_2`.`stop_lon` AS `stop_lon`, `anon_2`.`platform_code` AS `platform_code`, `anon_2`.`calitp_hash` AS `calitp_hash`, `anon_2`.`stop_code` AS `stop_code`, `anon_2`.`stop_url` AS `stop_url`, `anon_2`.`parent_station` AS `parent_station` 
FROM (SELECT `anon_3`.`feed_key` AS `feed_key`, `anon_3`.`stop_key` AS `stop_key`, `anon_3`.`date` AS `date`, `anon_4`.`calitp_itp_id` AS `calitp_itp_id`, `anon_4`.`stop_id` AS `stop_id`, `anon_4`.`zone_id` AS `zone_id`, `anon_4`.`tts_stop_name` AS `tts_stop_name`, `anon_4`.`stop_name` AS `stop_name`, `anon_4`.`wheelchair_boarding` AS `wheelchair_boarding`, `anon_4`.`location_type` AS `location_type`, `anon_4`.`calitp_extracted_at` AS `calitp_extracted_at`, `anon_4`.`calitp_deleted_at` AS `calitp_deleted_at`, `anon_4`.`stop_desc` AS `stop_desc`, `anon_4`.`level_id` AS `level_id`, `anon_4`.`stop_lat` AS `stop_lat`, `anon_4`.`stop_timezone` AS `stop_timezone`, `anon_4`.`calitp_url_number` AS `calitp_url_number`, `anon_4`.`stop_lon` AS `stop_lon`, `anon_4`.`platform_code` AS `platform_code`, `anon_4`.`calitp_hash` AS `calitp_hash`, `anon_4`.`stop_code` AS `stop_code`, `anon_4`.`stop_url` AS `stop_url`, `anon_4`.`parent_station` AS `parent_station` 
FROM (SELECT `views.gtfs_schedule_fact_daily_feed_stops_1`.`feed_key` AS `feed_key`, `views.gtfs_schedule_fact_daily_feed_stops_1`.`stop_key` AS `stop_key`, `views.gtfs_schedule_fact_daily_feed_stops_1`.`date` AS `date` 
FROM `views.gtfs_schedule_fact_daily_feed_stops` AS `views.gtfs_schedule_fact_daily_feed_stops_1` 
WHERE `views.gtfs_schedule_fact_daily_feed_stops_1`.`date` = %(date_1:DATE)s AND `views.gtfs_schedule_fact_daily_feed_stops_1`.`calitp_extracted_at` <= %(calitp_extracted_at_1:DATE)s AND `views.gtfs_schedule_fact_daily_feed_stops_1`.`calitp_deleted_at` >= %(calitp_deleted_at_1:DATE)s) AS `anon_3` JOIN (SELECT DISTINCT `views.gtfs_schedule_dim_stops_1`.`calitp_itp_id` AS `calitp_itp_id`, `views.gtfs_schedule_dim_stops_1`.`calitp_url_number` AS `calitp_url_number`, `views.gtfs_schedule_dim_stops_1`.`stop_id` AS `stop_id`, `views.gtfs_schedule_dim_stops_1`.`tts_stop_name` AS `tts_stop_name`, `views.gtfs_schedule_dim_stops_1`.`stop_lat` AS `stop_lat`, `views.gtfs_schedule_dim_stops_1`.`stop_lon` AS `stop_lon`, `views.gtfs_schedule_dim_stops_1`.`zone_id` AS `zone_id`, `views.gtfs_schedule_dim_stops_1`.`parent_station` AS `parent_station`, `views.gtfs_schedule_dim_stops_1`.`stop_code` AS `stop_code`, `views.gtfs_schedule_dim_stops_1`.`stop_name` AS `stop_name`, `views.gtfs_schedule_dim_stops_1`.`stop_desc` AS `stop_desc`, `views.gtfs_schedule_dim_stops_1`.`stop_url` AS `stop_url`, `views.gtfs_schedule_dim_stops_1`.`location_type` AS `location_type`, `views.gtfs_schedule_dim_stops_1`.`stop_timezone` AS `stop_timezone`, `views.gtfs_schedule_dim_stops_1`.`wheelchair_boarding` AS `wheelchair_boarding`, `views.gtfs_schedule_dim_stops_1`.`level_id` AS `level_id`, `views.gtfs_schedule_dim_stops_1`.`platform_code` AS `platform_code`, `views.gtfs_schedule_dim_stops_1`.`calitp_extracted_at` AS `calitp_extracted_at`, `views.gtfs_schedule_dim_stops_1`.`calitp_hash` AS `calitp_hash`, `views.gtfs_schedule_dim_stops_1`.`stop_key` AS `stop_key`, `views.gtfs_schedule_dim_stops_1`.`calitp_deleted_at` AS `calitp_deleted_at` 
FROM `views.gtfs_schedule_dim_stops` AS `views.gtfs_schedule_dim_stops_1` 
WHERE `views.gtfs_schedule_dim_stops_1`.`calitp_itp_id` IN UNNEST(%(calitp_itp_id_1:INT64)s)) AS `anon_4` ON `anon_3`.`stop_key` = `anon_4`.`stop_key`) AS `anon_2`) AS `anon_1`]
[parameters: {'date_1': datetime.date(2022, 11, 2), 'calitp_extracted_at_1': datetime.date(2022, 11, 2), 'calitp_deleted_at_1': datetime.date(2022, 11, 2), 'calitp_itp_id_1': [372]}]
(Background on this error at: https://sqlalche.me/e/14/4xp6)