# Bus Service Opportunities by Census Tract

### Count the number of times a bus arrives at each bus stop daily, aggregated to census tract

* `gtfs_schedule_dim_stop_times`: arrival time at each stop for every trip (also shows how long each trip takes)
* `gtfs_schedule_fact_daily_trips`: filter to trips in service on the selected date
* `gtfs_schedule_dim_stops`: stop lat/lon

Merging stop lat/lon with stop times can show how many times a bus passes through a given stop throughout the day.

* [Calculate freq from GTFS](https://groups.google.com/g/transit-developers/c/wQ4IAj59za0)

In [None]:
import geopandas as gpd
import pandas as pd
import os

import utils
import tract_utils
import prep_data

# Set before the calitp imports below.
# NOTE(review): presumably caps the bytes a single BigQuery query may scan
# (100 GB here) -- confirm against the calitp documentation.
os.environ["CALITP_BQ_MAX_BYTES"] = str(100_000_000_000)
# Truncate DataFrame displays that are longer than 20 rows
pd.set_option("display.max_rows", 20)

from calitp.tables import tbl
from calitp import query_sql
# Star import supplies `_` and the pipe verbs (filter, select, arrange,
# collect, inner_join, ...) used by the queries in later cells
from siuba import *

# To export to GCS
from calitp.storage import get_fs
fs = get_fs()

# Date referenced by the (currently disabled) warehouse queries in the next cells
SELECTED_DATE = "2021-10-07"

In [None]:
# Disabled query, wrapped in a bare string so the cell does not hit the warehouse.
# It keeps stop_times rows extracted on or before SELECTED_DATE and deleted after it.
'''
tbl_stop_times = (
    tbl.views.gtfs_schedule_dim_stop_times()
    >> filter(_.calitp_extracted_at <= SELECTED_DATE, 
              _.calitp_deleted_at > SELECTED_DATE, 
             )
)
'''

In [None]:
# Disabled query that wrote ./daily_stop_times.parquet (read back in the next cell).
# In-service trips on SELECTED_DATE are joined to tbl_stop_times, restricted to
# arrivals between 05:00:00 and 20:00:00, and counted per trip/stop/arrival_time.
# NOTE(review): the time bounds are string literals; if arrival_time is stored as
# text, values that are not zero-padded HH:MM:SS would compare incorrectly -- confirm.
'''
daily_stop_times = (
    tbl.views.gtfs_schedule_fact_daily_trips()
 >> filter(_.service_date == SELECTED_DATE, 
          _.is_in_service == True)
 >> left_join(_, tbl_stop_times,
              # also added url number to the join keys ----
             ["calitp_itp_id", "calitp_url_number", "trip_id"])
 >> select(_.itp_id==_.calitp_itp_id, _.calitp_url_number,
           _.trip_key, _.trip_id, 
           _.service_date,
           _.stop_id, _.stop_sequence, _.arrival_time)
 >> filter(_.arrival_time >= "05:00:00", 
          _.arrival_time <= "20:00:00")
 >> group_by(_.itp_id, _.calitp_url_number, 
             _.trip_id, _.trip_key,
             _.service_date, 
             _.stop_id, _.stop_sequence)
 >> count(_.arrival_time)
 >> collect()
)

daily_stop_times.to_parquet("./daily_stop_times.parquet")
'''

In [None]:
# Load the cached stop-times extract from disk instead of re-querying the warehouse
daily_stop_times = pd.read_parquet(path="./daily_stop_times.parquet")

In [None]:
# NOTE(review): presumably resolves operators that publish several feeds by
# force-including ITP ID 182 and dropping ITP ID 200 -- confirm in utils.
daily_stop_times = utils.include_exclude_multiple_feeds(
    daily_stop_times,
    id_col="itp_id",
    include_ids=[182],
    exclude_ids=[200],
)

In [None]:
# Tally arrivals per operator + stop; "count" skips rows with a null arrival_time
aggregated_stops_per_day = (
    daily_stop_times
    .groupby(["itp_id", "stop_id"])
    .agg(num_arrivals=("arrival_time", "count"))
    .reset_index()
)

In [None]:
# Fetch stop coordinates and names, then keep only the stops that have an
# arrival count in aggregated_stops_per_day.
aggregated_stops_with_geom = (
    tbl.views.gtfs_schedule_dim_stops()
    # `_.new == _.old` inside select() renames calitp_itp_id to itp_id
    >> select(
        _.itp_id == _.calitp_itp_id,
        _.stop_id,
        _.stop_lat,
        _.stop_lon,
        _.stop_name,
    )
    >> arrange(_.itp_id, _.stop_id, _.stop_lat, _.stop_lon)
    # Pull the query result into memory before joining with the pandas frame
    >> collect()
    >> inner_join(_, aggregated_stops_per_day, ["itp_id", "stop_id"])
    # NOTE(review): presumably a no-op on an in-memory frame -- confirm before removing
    >> collect()
)

#aggregated_stops_with_geom.to_parquet("./aggregated_stops_with_geom.parquet")

In [None]:
def add_stop_geometry_merge_census_tracts(df, census_tract_df):
    """Convert stops to point geometries and attach the census tract each falls in.

    Parameters
    ----------
    df : pandas.DataFrame
        Stop-level table; must contain `itp_id`, `stop_id`, `stop_lon`
        and `stop_lat`.
    census_tract_df : geopandas.GeoDataFrame
        Tract polygons to join against; reprojected to `utils.WGS84` here.

    Returns
    -------
    geopandas.GeoDataFrame
        One row per (stop, intersecting tract) pair. Stops that intersect no
        tract are dropped. `stop_lon`, `stop_lat` and `index_right` are removed.
    """
    stop_key = ["itp_id", "stop_id"]

    # The same stop can show up with several lat/lon values: order the rows,
    # then keep only the first occurrence of each (itp_id, stop_id).
    ordered = df.sort_values(stop_key + ["stop_lon", "stop_lat"])
    unique_stops = ordered.drop_duplicates(subset=stop_key).reset_index(drop=True)

    print(f"# obs in joined df: {len(df)}")
    print(f"# obs in joined df, no dups: {len(unique_stops)}")

    # Build point geometry from the coordinates, then drop the raw columns
    stop_points = gpd.points_from_xy(unique_stops.stop_lon, unique_stops.stop_lat)
    stops_gdf = gpd.GeoDataFrame(unique_stops, geometry=stop_points, crs=utils.WGS84)
    stops_gdf = stops_gdf.drop(columns=["stop_lon", "stop_lat"])

    # Points-in-polygons join; "inner" avoids the NaN tracts a left join would leave
    joined = gpd.sjoin(
        stops_gdf.to_crs(utils.WGS84),
        census_tract_df.to_crs(utils.WGS84),
        how="inner",
        predicate="intersects",
    )
    return joined.drop(columns="index_right")

In [None]:
# NOTE(review): presumably tract polygons with CalEnviroScreen / LEHD
# attributes, going by the helper's name -- confirm in prep_data.
census_tracts = prep_data.generate_calenviroscreen_lehd_data(prep_data.datasets)

# Locate every stop (with its arrival count) inside a census tract
gdf = add_stop_geometry_merge_census_tracts(
    df=aggregated_stops_with_geom,
    census_tract_df=census_tracts,
)

In [None]:
# Columns handed to the tract-level aggregation helper.
# NOTE(review): presumably num_arrivals is summed, stop_id counted and itp_id
# counted distinct per tract -- confirm in tract_utils.aggregate_by_tract.
tract_group_cols = ["Tract"]
sum_cols = ["num_arrivals"]
count_cols = ["stop_id"]
nunique_cols = ["itp_id"]

gdf2 = tract_utils.aggregate_by_tract(
    gdf,
    tract_group_cols,
    sum_cols=sum_cols,
    count_cols=count_cols,
    nunique_cols=nunique_cols,
)

# Re-attach tract geometry to the aggregated table, joining on "Tract"
final_df = tract_utils.attach_tract_geometry(
    gdf2,
    census_tracts,
    merge_col=["Tract"],
    join="left",
)

In [None]:
# Export to GCS (but save locally first)
FILE_NAME = "bus_stop_times_by_tract.parquet"
# Write the parquet next to the notebook; this local copy is what gets uploaded
final_df.to_parquet(f"./{FILE_NAME}")

# Upload the local file to utils.GCS_FILE_PATH + FILE_NAME using the handle from get_fs()
fs.put(f"./{FILE_NAME}", f"{utils.GCS_FILE_PATH}{FILE_NAME}")

# Last expression in the cell: shows the first rows as a quick sanity check
final_df.head()