# Bus Service Opportunities by Census Tract

### Count the number of times a bus arrives at each bus stop daily, aggregated to census tract

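* `gtfs_schedule_dim_stop_times`: arrival time of each trip at each stop (which also shows how long each trip takes)
* `gtfs_schedule_fact_daily_trips`: filter trips to the selected service date
* `gtfs_schedule_dim_stops`: stop latitude/longitude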

Merging stop lat/lon with stop times can show how many times a bus passes through a given stop throughout the day.

* [Calculate freq from GTFS](https://groups.google.com/g/transit-developers/c/wQ4IAj59za0)

In [1]:
import geopandas as gpd
import pandas as pd
import os

import utils
import tract_utils
import prep_data

# Must run before the calitp imports below.
# NOTE(review): presumably this is the BigQuery bytes-scanned cap that calitp
# reads when it builds its engine (100 GB here) — confirm.
os.environ.update({"CALITP_BQ_MAX_BYTES": f"{100_000_000_000}"})
# Show at most 20 rows when a DataFrame is displayed.
pd.options.display.max_rows = 20

from calitp.tables import tbl
from calitp import query_sql
from siuba import *

# Service date (YYYY-MM-DD) the schedule queries are pinned to. It is referenced
# by the BigQuery extraction cells below, which are currently disabled.
SELECTED_DATE = "2021-10-07"

In [None]:
# Disabled cell: the query is wrapped in a string literal so it does not run.
# When enabled it builds a lazy stop_times table restricted to rows whose
# calitp_extracted_at / calitp_deleted_at window covers SELECTED_DATE.
'''
tbl_stop_times = (
    tbl.views.gtfs_schedule_dim_stop_times()
    >> filter(_.calitp_extracted_at <= SELECTED_DATE, 
              _.calitp_deleted_at > SELECTED_DATE, 
             )
)
'''

In [None]:
# Disabled cell: the extraction is wrapped in a string literal so it does not run.
# When enabled it keeps in-service trips on SELECTED_DATE, left-joins them to
# tbl_stop_times (defined in the previous disabled cell), keeps arrivals between
# 05:00:00 and 20:00:00, counts rows per trip/stop, and writes the result to
# ./daily_stop_times.parquet, which the next cell reads back.
# NOTE(review): arrival_time is compared against string literals, so this looks
# like it assumes zero-padded HH:MM:SS values — confirm.
'''
daily_stop_times = (
    tbl.views.gtfs_schedule_fact_daily_trips()
 >> filter(_.service_date == SELECTED_DATE, 
          _.is_in_service == True)
 >> left_join(_, tbl_stop_times,
              # also added url number to the join keys ----
             ["calitp_itp_id", "calitp_url_number", "trip_id"])
 >> select(_.itp_id==_.calitp_itp_id, _.calitp_url_number,
           _.trip_key, _.trip_id, 
           _.service_date,
           _.stop_id, _.stop_sequence, _.arrival_time)
 >> filter(_.arrival_time >= "05:00:00", 
          _.arrival_time <= "20:00:00")
 >> group_by(_.itp_id, _.calitp_url_number, 
             _.trip_id, _.trip_key,
             _.service_date, 
             _.stop_id, _.stop_sequence)
 >> count(_.arrival_time)
 >> collect()
)

daily_stop_times.to_parquet("./daily_stop_times.parquet")
'''

In [2]:
# Load the cached stop-time rows written by the (disabled) extraction cell above.
daily_stop_times = pd.read_parquet("./daily_stop_times.parquet")

In [3]:
# NOTE(review): presumably resolves operators that publish more than one feed
# (calitp_url_number), forcing operator 182 in and operator 200 out — confirm
# against utils.include_exclude_multiple_feeds.
daily_stop_times = utils.include_exclude_multiple_feeds(
    daily_stop_times,
    id_col="itp_id",
    include_ids=[182],
    exclude_ids=[200],
)

# obs in original df: 3772291
# obs in new df: 2951851
These operators have multiple calitp_url_number values: [106, 110, 167, 182, 280, 290, 310, 350]


In [4]:
# Arrivals per stop per operator: count the non-null arrival_time rows for
# every (itp_id, stop_id) pair and expose the count as `num_arrivals`.
aggregated_stops_per_day = (
    daily_stop_times
    .groupby(["itp_id", "stop_id"])["arrival_time"]
    .count()
    .rename("num_arrivals")
    .reset_index()
)

In [5]:
# Pull stop coordinates from the warehouse, then attach the per-stop arrival
# counts from aggregated_stops_per_day.
# NOTE(review): dim_stops is not filtered to SELECTED_DATE or to a feed URL
# here, so one stop can come back as several rows; duplicates are dropped later
# in add_stop_geometry_merge_census_tracts — confirm this is intended.
aggregated_stops_with_geom = (
    tbl.views.gtfs_schedule_dim_stops()
    >> select(
        _.itp_id == _.calitp_itp_id,  # siuba rename: calitp_itp_id -> itp_id
        _.stop_id,
        _.stop_lat,
        _.stop_lon,
        _.stop_name,
    )
    >> arrange(_.itp_id, _.stop_id, _.stop_lat, _.stop_lon)
    >> collect()  # run the query and bring the rows into pandas
    # Inner join: keep only stops that appear in aggregated_stops_per_day.
    >> inner_join(_, aggregated_stops_per_day, ["itp_id", "stop_id"])
)

#aggregated_stops_with_geom.to_parquet("./aggregated_stops_with_geom.parquet")

In [6]:
def add_stop_geometry_merge_census_tracts(df, census_tract_df):
    """Turn stop rows into points and attach the census tract each one intersects.

    Parameters
    ----------
    df : pandas.DataFrame
        Stop-level rows; must contain `itp_id`, `stop_id`, `stop_lon` and
        `stop_lat` columns.
    census_tract_df : geopandas.GeoDataFrame
        Tract polygons to join the stops against.

    Returns
    -------
    geopandas.GeoDataFrame
        Result of an inner spatial join (predicate "intersects") between the
        de-duplicated stop points and the tracts, without the `stop_lon`,
        `stop_lat` and `index_right` columns.

    Notes
    -----
    Prints the row count before and after de-duplication.
    """
    stop_keys = ["itp_id", "stop_id"]

    # The same stop can show up with several lon/lat values; after sorting,
    # keep only the first row for each (itp_id, stop_id).
    unique_stops = (
        df.sort_values(stop_keys + ["stop_lon", "stop_lat"])
        .drop_duplicates(subset=stop_keys, keep="first")
        .reset_index(drop=True)
    )
    print(f"# obs in joined df: {len(df)}")
    print(f"# obs in joined df, no dups: {len(unique_stops)}")

    # Build point geometry from lon/lat, then drop the raw coordinate columns.
    stop_points = gpd.points_from_xy(
        unique_stops["stop_lon"], unique_stops["stop_lat"]
    )
    stops_gdf = gpd.GeoDataFrame(
        unique_stops, geometry=stop_points, crs=utils.WGS84
    )
    stops_gdf = stops_gdf.drop(columns=["stop_lon", "stop_lat"])

    # Join stops (points) to census tracts (polygons), both in utils.WGS84.
    # Inner rather than left join, so stops without a tract do not produce
    # rows with NaN tract attributes.
    stops_in_tracts = gpd.sjoin(
        stops_gdf.to_crs(utils.WGS84),
        census_tract_df.to_crs(utils.WGS84),
        how="inner",
        predicate="intersects",
    )

    return stops_in_tracts.drop(columns="index_right")

In [7]:
# NOTE(review): presumably tract polygons with CalEnviroScreen and LEHD
# attributes, judging by the helper name — confirm in prep_data.
census_tracts = prep_data.generate_calenviroscreen_lehd_data(
    prep_data.datasets
)

# Stops as points, each matched to the census tract it intersects.
gdf = add_stop_geometry_merge_census_tracts(
    df=aggregated_stops_with_geom,
    census_tract_df=census_tracts,
)

# obs in joined df: 196207
# obs in joined df, no dups: 84567


In [8]:
# Column roles handed to tract_utils.aggregate_by_tract.
tract_group_cols = ["Tract"]      # grouping key
sum_cols = ["num_arrivals"]       # passed as sum_cols
count_cols = ["stop_id"]          # passed as count_cols
nunique_cols = ["itp_id"]         # passed as nunique_cols

gdf2 = tract_utils.aggregate_by_tract(
    gdf,
    tract_group_cols,
    sum_cols=sum_cols,
    count_cols=count_cols,
    nunique_cols=nunique_cols,
)

# NOTE(review): join="left" presumably keeps every tract in census_tracts, even
# those with no stops — confirm in tract_utils.attach_tract_geometry.
final_df = tract_utils.attach_tract_geometry(
    gdf2,
    census_tracts,
    merge_col=["Tract"],
    join="left",
)

In [9]:
# Persist the tract-level result to parquet, then preview the first rows.
final_df.to_parquet("./bus_stop_times_by_tract.parquet")
final_df.head()


This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.

  final_df.to_parquet("./bus_stop_times_by_tract.parquet")


Unnamed: 0,Tract,ZIP,Population,sq_mi,pop_sq_mi,overall_ptile,pollution_ptile,popchar_ptile,equity_group,pollution_group,popchar_group,County,City,geometry,num_jobs,jobs_sq_mi,index,num_arrivals,stop_id,itp_id
0,6001400100,94704,3120,2.655917,1174.735658,2.79879,26.621033,1.525466,1,1,1,Alameda,Oakland,"POLYGON ((-122.24408 37.88322, -122.24198 37.8...",936,352.420697,2335.0,155.0,9.0,2.0
1,6001400200,94618,2007,0.229901,8729.842746,2.874433,24.181705,1.651538,1,1,1,Alameda,Oakland,"POLYGON ((-122.24191 37.85181, -122.24202 37.8...",1357,5902.539415,1428.0,698.0,14.0,1.0
2,6001400300,94618,5051,0.427356,11819.185813,15.935451,33.366521,12.266768,1,2,1,Alameda,Oakland,"POLYGON ((-122.24590 37.84500, -122.25241 37.8...",1978,4628.459619,1090.0,1516.0,27.0,2.0
3,6001400400,94609,4007,0.271558,14755.587549,18.973777,26.235221,18.431669,1,1,1,Alameda,Oakland,"POLYGON ((-122.25295 37.85117, -122.25305 37.8...",983,3619.850901,1276.0,397.0,13.0,1.0
4,6001400500,94609,4124,0.227012,18166.435207,29.740292,31.400124,30.156329,1,1,1,Alameda,Oakland,"POLYGON ((-122.26023 37.85274, -122.26130 37.8...",362,1594.628891,1283.0,441.0,13.0,1.0
