In [1]:
import os
# Must be set before any warehouse query runs. 800_000_000_000 bytes = 800 GB (decimal).
# NOTE(review): presumably the per-query BigQuery byte cap honoured by calitp -- confirm.
os.environ["CALITP_BQ_MAX_BYTES"] = str(800_000_000_000)

import branca
import folium
import shared_utils

# The star import supplies `_` and the pipe verbs used throughout this notebook
# (select, filter, mutate, arrange, group_by, summarize, the join verbs, collect).
# Note that `filter` in later cells is therefore the siuba verb, not the builtin.
from siuba import *
import pandas as pd
import geopandas as gpd
import dask.dataframe as dd

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None) 

import datetime as dt
import time

from calitp import get_engine
from calitp.tables import tbls

# Warehouse engine and an open connection, created once for the session.
engine = get_engine()
connection = engine.connect()

# Bucket prefix for every input and output of this analysis (ends with "/").
GCS_FILE_PATH = 'gs://calitp-analytics-data/data-analyses/ahsc_grant/'

import gcsfs
# Filesystem handle used to list the outputs already written to the bucket.
fs = gcsfs.GCSFileSystem()



# Creating trips per am peak / midday / pm / weekend by stop
This notebook assembles the by-stop analytical file. This version loops over all operators with GTFS Schedule data for the recent analysis days (one weekday, one Saturday, and one Sunday), then joins ACS characteristics.

## File Assembly

In [2]:
# Service days analysed: a Wednesday, then the Saturday and Sunday of the same week.
analysis_wkd = dt.date(year=2022, month=11, day=2)
analysis_sat = dt.date(year=2022, month=11, day=5)
analysis_sun = dt.date(year=2022, month=11, day=6)

In [3]:
# Job totals per geography; only the id and the job count are carried forward.
jobdata = (
    pd.read_parquet(f"{GCS_FILE_PATH}job_density")
    >> select(_.geo_id, _.jobs_total)
)

# ACS table inner-joined to the job totals, then projected to the California
# Albers CRS used for the stop buffers. No join key is passed, so the join
# relies on the column names the two tables share.
acs_ca = (
    gpd.read_parquet(f"{GCS_FILE_PATH}acs_tbl_ca.parquet")
    >> inner_join(_, jobdata)
).to_crs(shared_utils.geography_utils.CA_NAD83Albers)

In [4]:
# Get all operators with GTFS Schedule data: organizations whose
# gtfs_static_status is "Static OK", reduced to four identifying columns.
# collect() runs the query (presumably returning a pandas DataFrame, since
# .dropna() is called on the result).
# NOTE(review): dropna() has no subset, so an operator missing only its
# caltrans_district or ntd_agency_info_key is dropped too -- confirm intended.
organizations_itp_ids = (
    tbls.mart_transit_database.dim_organizations()
    >> filter(_.gtfs_static_status=="Static OK")
    >> select(_.itp_id, _.name, _.caltrans_district, _.ntd_agency_info_key)
    >> collect()
).dropna()

In [5]:
# Cast itp_id to int64 and order the operators by Caltrans district.
# NOTE(review): the cast presumably lets itp_id match calitp_itp_id in the join
# inside daily_trips() and the integer ids parsed from file names -- confirm.
# This cell overwrites its own input; re-running it yields the same table.
organizations_itp_ids = (organizations_itp_ids 
                    >> mutate(itp_id = _.itp_id.astype('int64'))
                    >> arrange(_.caltrans_district)
                   )

# Display the operator table.
organizations_itp_ids

Unnamed: 0,itp_id,name,caltrans_district,ntd_agency_info_key
144,377,Yurok Tribe,01 - Eureka,rec8r7tzQX5E4o2Hz
146,42,Blue Lake Rancheria,01 - Eureka,recWel1B9dqxuQBqK
517,108,City of Eureka,01 - Eureka,rechClEBSVslID4Tx
581,18,City of Arcata,01 - Eureka,recckkVw6GfLZtS1V
659,198,Mendocino Transit Authority,01 - Eureka,recutczOUiAPT9K1v
...,...,...,...,...
663,386,Yuma County Intergovernmental Public Transport...,11 - San Diego,recMDd8k7WUcRZ35Z
682,226,North County Transit District,11 - San Diego,recUgXoPRXXLMK6Rh
127,14,Anaheim Transportation Network,12 - Irvine,rec6g6iKaKu4n4r89
588,154,City of Laguna Beach,12 - Irvine,reckaNJPUeFDQGMuy


In [6]:
#get table of trip keys for different days
def daily_trips(day_type, analysis_dt, operator_itp_id=None, buffer_meters=402.336):
    """Build one row per bus stop for one operator and service day, with ACS context.

    Pulls trips, routes, stop_times and stops for the operator and date through
    ``shared_utils.gtfs_utils``, keeps trips on routes whose ``route_type`` is
    ``"3"``, counts distinct trips and routes per stop, buffers every stop,
    spatially joins the buffers to ``acs_ca`` and sums the ACS fields of all
    intersecting geographies back to the stop. Organization attributes are then
    attached from ``organizations_itp_ids``.

    Parameters
    ----------
    day_type : str
        Label written to the ``daytype`` column (the notebook passes
        "Weekday", "Saturday" and "Sunday").
    analysis_dt : datetime.date
        Service date, passed as ``selected_date`` to every ``gtfs_utils`` call
        and written to the ``analysis_date`` column.
    operator_itp_id : int, optional
        ITP id of the operator to process. When omitted, the module-level
        ``itp_id`` (the notebook's operator-loop variable) is used, which is
        how this function was originally driven.
    buffer_meters : float, optional
        Buffer radius around each stop. The default, 402.336, is
        0.25 mile x 1609.344 m/mile. The buffer is applied after reprojecting
        to ``CA_NAD83Albers``.
        NOTE(review): assumes that CRS is in metres -- confirm.

    Returns
    -------
    pandas.DataFrame
        One row per stop (and per matching organization row) holding the stop
        identifiers, ``n_trips``, ``n_routes``, ``daytype``, ``analysis_date``,
        ``route_list_string``, the ``sum_*`` ACS totals, the derived densities
        and ``pct_*`` shares, plus the columns joined from
        ``organizations_itp_ids``. An empty ``pd.DataFrame()`` is returned when
        the operator has no trips, no bus trips, or no stop_times for its bus
        trips on that date.

    Raises
    ------
    NameError
        If ``operator_itp_id`` is omitted and no module-level ``itp_id`` exists.

    Notes
    -----
    Reads the module-level tables ``acs_ca`` and ``organizations_itp_ids``.
    """
    if operator_itp_id is None:
        # Backward-compatible fallback to the loop variable of the calling cell.
        try:
            operator_itp_id = itp_id
        except NameError:
            raise NameError(
                "daily_trips needs operator_itp_id, or a module-level itp_id to fall back on"
            ) from None

    trips_df = shared_utils.gtfs_utils.get_trips(
        selected_date = analysis_dt,
        itp_id_list = [operator_itp_id],
        trip_cols = ["calitp_itp_id","trip_id","route_id","shape_id","direction_id"]
    )

    # operator has no service on this date
    if trips_df.empty:
        return pd.DataFrame()

    # routes carries route_type, which is needed to filter to bus
    routes_df = shared_utils.gtfs_utils.get_route_info(
        selected_date = analysis_dt,
        itp_id_list = [operator_itp_id],
        route_cols = ["calitp_itp_id","route_type","route_id"]
    )

    # No explicit key: the join uses whichever columns both tables share.
    trips_df = (trips_df
                >> inner_join(_,routes_df)
                >> filter(_.route_type=="3")
               )

    # bail out for operators with no bus routes (e.g. rail-only agencies)
    if trips_df.empty:
        return pd.DataFrame()

    # stop_times rows that belong to the bus trips kept above
    bus_stop_times = (shared_utils.gtfs_utils.get_stop_times(
        selected_date = analysis_dt,
        itp_id_list = [operator_itp_id],
        get_df = True
    )
        >> inner_join(_,trips_df)
    )

    # Same bail-out as above: nothing to aggregate, so skip the stop and ACS work.
    if bus_stop_times.empty:
        return pd.DataFrame()

    # how many distinct trips and routes serve each stop
    # NOTE(review): route_list holds one entry per joined stop_time row, so route
    # ids repeat in route_list_string while n_routes counts distinct ids. Left
    # unchanged because outputs already written to the bucket share this format.
    stoptimes_df = (bus_stop_times
        >> group_by(_.calitp_itp_id,_.route_type,_.stop_id)
        >> summarize(n_trips = _.trip_id.nunique(),
                     n_routes = _.route_id.nunique(),
                     route_list = lambda _: [_.route_id.tolist()]
                    )
        >> ungroup()
        >> mutate(daytype = day_type,
                  analysis_date = analysis_dt
                 )
                   )
    stoptimes_df['route_list_string'] = stoptimes_df['route_list'].apply(",".join)

    # stop geometry
    stops_geo = shared_utils.gtfs_utils.get_stops(
                     selected_date = analysis_dt,
                     itp_id_list = [operator_itp_id]
    )

    # NOTE(review): point_geometry is not read again in this function.
    stoptimes_geo = (stops_geo.merge(stoptimes_df, on = ["calitp_itp_id","stop_id"])
                     >> select (_.calitp_itp_id,_.stop_desc,_.stop_name,_.stop_id,
                                _.geometry,_.n_trips,_.n_routes,_.daytype,_.analysis_date,_.route_list_string
                               )
                     >> mutate (point_geometry = _.geometry)
                    ).to_crs(shared_utils.geography_utils.CA_NAD83Albers)

    # replace the point geometry with a buffer (default .25mi)
    stoptimes_geo.geometry = stoptimes_geo.buffer(buffer_meters)

    # join w/ acs data: every ACS geography that intersects the buffer
    stops_acs_joined = stoptimes_geo.sjoin(acs_ca, how='left', predicate='intersects')

    # roll back up to stop level
    stops_acs_rollup = (stops_acs_joined
                    >> group_by(_.calitp_itp_id,_.stop_id, _.stop_name,
                                _.n_trips,_.n_routes,_.daytype,_.analysis_date,_.route_list_string)
                    >> summarize(sum_tracts = _.geo_id.nunique(),
                                 sum_total_pop = _.total_pop.sum(),
                                 sum_households = _.households.sum(),
                                 sum_not_us_citizen_pop = _.not_us_citizen_pop.sum(),
                                 sum_youth_pop = _.youth_pop.sum(),
                                 sum_seniors_pop = _.seniors_pop.sum(),
                                 sum_pop_determined_poverty_status = _.pop_determined_poverty_status.sum(), #denominator for poverty rate
                                 sum_poverty = _.poverty.sum(),
                                 sum_no_car = _.no_car.sum(), #workers without access to car
                                 sum_no_cars = _.no_cars.sum(), #households without car
                                 sum_land_area = _.ALAND.sum(),
                                 sum_jobs=_.jobs_total.sum()
                                )
                    >> ungroup()
                    >> mutate(land_area_sqkm=_.sum_land_area/1000000,
                           pop_density = _.sum_total_pop/_.land_area_sqkm,
                           job_density = _.sum_jobs/_.land_area_sqkm,
                           pct_not_us_citizen_pop = (_.sum_not_us_citizen_pop/_.sum_total_pop)*100,
                           pct_youth_pop = (_.sum_youth_pop/_.sum_total_pop)*100,
                           pct_seniors_pop = (_.sum_seniors_pop/_.sum_total_pop)*100,
                           pct_poverty = (_.sum_poverty/_.sum_pop_determined_poverty_status)*100,
                           pct_pop_workers_no_car = (_.sum_no_car/_.sum_total_pop)*100,
                           pct_hh_no_cars = (_.sum_no_cars/_.sum_households)*100
                          )
                       )

    # attach organization name, district and NTD key by ITP id
    stops_acs_rollup = (stops_acs_rollup
                       >> left_join(_,organizations_itp_ids, {"calitp_itp_id":"itp_id"})
                       ).drop_duplicates()

    return stops_acs_rollup

In [7]:
# List what is already stored under tool_data/ so operators that have been
# processed in an earlier run can be detected and skipped.
fs_list = fs.ls(f"{GCS_FILE_PATH}tool_data/")

In [8]:
def _operator_id_from_path(path, prefix="trips_perstop_", suffix=".parquet"):
    """Return the operator id encoded in a ``trips_perstop_<id>.parquet`` path.

    Returns None for anything else found in the folder (directory placeholders,
    other parquet files, non-numeric ids), so stray files are skipped instead of
    raising IndexError or ValueError or being mistaken for an operator output.
    """
    filename = path.rsplit("/", 1)[-1]
    if not (filename.startswith(prefix) and filename.endswith(suffix)):
        return None
    id_text = filename[len(prefix):len(filename) - len(suffix)]
    return int(id_text) if id_text.isdecimal() else None


# ITP ids of operators whose per-stop output already exists in tool_data/.
ran_operators = [
    operator_id
    for operator_id in map(_operator_id_from_path, fs_list)
    if operator_id is not None
]

In [9]:
# The loop variable must stay named ``itp_id``: daily_trips() reads it as a global.
for itp_id in organizations_itp_ids['itp_id'].tolist():
    # Operators with an existing output file are skipped, so an interrupted run can resume.
    if itp_id in ran_operators:
        print(f"already ran: {itp_id}")
        continue

    print(f"processing agency {itp_id} starting {dt.datetime.now()}...")

    # One per-stop table for each day type.
    stops_weekday = daily_trips("Weekday", analysis_wkd)
    stops_saturday = daily_trips("Saturday", analysis_sat)
    stops_sunday = daily_trips("Sunday", analysis_sun)

    # Stack the three day types and write one parquet per operator.
    stoptimes_all = pd.concat(
        [stops_weekday, stops_saturday, stops_sunday],
        ignore_index=True,
    )
    stoptimes_all.to_parquet(f"{GCS_FILE_PATH}tool_data/trips_perstop_{itp_id}.parquet")

already ran: 377
already ran: 42
already ran: 108
already ran: 18
already ran: 198
already ran: 135
already ran: 261
already ran: 159
already ran: 329
already ran: 83
already ran: 344
already ran: 334
already ran: 162
already ran: 259
already ran: 251
already ran: 221
already ran: 122
already ran: 23
already ran: 271
already ran: 340
already ran: 105
already ran: 273
already ran: 331
already ran: 101
already ran: 376
already ran: 372
already ran: 48
already ran: 314
already ran: 350
already ran: 110
already ran: 282
already ran: 301
already ran: 94
already ran: 247
already ran: 264
already ran: 356
already ran: 294
already ran: 336
already ran: 218
already ran: 61
processing agency 279 starting 2022-12-06 10:15:25.424870...
already ran: 368
already ran: 4
already ran: 310
processing agency 315 starting 2022-12-06 10:15:37.983929...
processing agency 288 starting 2022-12-06 10:15:48.247489...
processing agency 312 starting 2022-12-06 10:15:53.062980...
processing agency 22 starting 2022