# Transit service and speeds in CalEnviroScreen tracts

* Use this as exploratory analysis to check aggregate transit service against RT segment speeds.
* Note: aggregated monthly transit service hours are for Dec 2022, and RT segment speeds are for Jan 2023.


#### Changes to `ad_hoc`
* `service_hours` is not aggregated for the month
* there is no indication of how many service days went into each `shape_id`

In [None]:
import altair as alt
import dask.dataframe as dd
import dask_geopandas as dg
import geopandas as gpd
import pandas as pd

from calitp_data_analysis.tables import tbls
from calitp_data_analysis.sql import query_sql
from calitp_data_analysis.gcs_geopandas import GCSGeoPandas
gcsgp = GCSGeoPandas()

from shared_utils import portfolio_utils, rt_dates, v1_rt_dates, time_helpers, schedule_rt_utils
from calitp_data_analysis import calitp_color_palette as cp
from calitp_data_analysis import geography_utils, styleguide

# Root of the analytics bucket; the two folders below are built from it.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/"
# Folder holding the CalEnviroScreen / LEHD tract file read in this notebook.
BUS_SERVICE_GCS = f"{GCS_FILE_PATH}bus_service_increase/"
# Folder holding the RT segment speeds and the route segment geometry.
SEGMENT_GCS = f"{GCS_FILE_PATH}rt_segment_speeds/"

# Value stored under the "jan2023" key; it is interpolated into the segment
# speed / segment geometry paths read later in this notebook.
analysis_date = v1_rt_dates.v1_dates["jan2023"]

## Import data

* CalEnviroScreen (equity) and LEHD (jobs) by census tract
* Monthly GTFS scheduled transit service (`mart_ad_hoc.fct_scheduled_service_by_daypart`)
* Speeds by 1 km segments 

### CalEnviroScreen tracts

In [None]:
# Read only the tract id, the `overall_ptile` column and the geometry,
# then reproject to WGS84 (the CRS the speed segments are also converted to
# before the spatial join).
calenviroscreen_tracts = gcsgp.read_parquet(
    f"{BUS_SERVICE_GCS}calenviroscreen_lehd_by_tract.parquet",
    columns=["Tract", "overall_ptile", "geometry"],
)
calenviroscreen_tracts = calenviroscreen_tracts.to_crs(geography_utils.WGS84)

In [None]:
def set_equity_groups(row):
    """
    Bucket a row into one of three equity groups by `overall_ptile`.

    Returns 1 when the percentile is below 34, 2 when it is at least 34
    and below 67, and 3 otherwise. A NaN percentile fails both comparisons
    and therefore also lands in group 3.
    """
    ptile = row.overall_ptile

    if ptile < 34:
        return 1
    if ptile < 67:
        return 2
    return 3

# Add the 1 / 2 / 3 equity group for every tract, computed row by row.
calenviroscreen_tracts = calenviroscreen_tracts.assign(
    equity_group=lambda tracts: tracts.apply(set_equity_groups, axis=1)
)

### Monthly GTFS scheduled transit service

Comes with `shape_id`, but we would still need to pick a day in that month to get the shape geometry.

Instead, let's just aggregate up to route.

In [None]:
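# One-time pull of the Dec 2022 scheduled service, cached locally as
# ./data/dec_service.parquet. Uncomment to refresh the cached file.
# query = '''
#     SELECT * FROM cal-itp-data-infra.mart_ad_hoc.fct_scheduled_service_by_daypart
#     WHERE year = 2022 AND month = 12
# '''

# monthly_transit_service = query_sql(query, as_df=True)
# monthly_transit_service.to_parquet("./data/dec_service.parquet")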

In [None]:
# Load the locally cached Dec 2022 service extract (the file the
# commented-out query cell above writes) instead of re-running the query.
monthly_transit_service = pd.read_parquet("./data/dec_service.parquet")

In [None]:
def categorize_peak_off_peak(row):
    """
    Collapse a time-of-day label into "Peak" or "Off Peak".

    Despite the parameter name, `row` is a single label value:
    "AM Peak" and "PM Peak" map to "Peak"; anything else (including
    missing values) maps to "Off Peak".
    """
    return "Peak" if row in ("AM Peak", "PM Peak") else "Off Peak"

def wrangle_monthly_service(df: pd.DataFrame):
    """
    Roll monthly scheduled service up to route and peak / off-peak.

    Each row's `time_of_day` is first collapsed to "Peak" / "Off Peak",
    then `n_trips` is summed per operator name, source record, route and
    peak category.

    Returns the aggregated dataframe with `n_trips` as integers
    (missing sums are treated as 0).
    """
    route_peak_cols = [
        "name", "source_record_id",
        "route_id", "route_short_name",
        "peak",
    ]

    service_with_peak = df.assign(
        peak=lambda svc: svc.time_of_day.apply(categorize_peak_off_peak)
    )

    route_totals = portfolio_utils.aggregate_by_geography(
        service_with_peak,
        group_cols=route_peak_cols,
        sum_cols=["n_trips"],
    )

    # Fill gaps before the cast so astype(int) does not fail on NaN.
    return route_totals.assign(
        n_trips=lambda agg: agg.n_trips.fillna(0).astype(int)
    )

In [None]:
# Route-level scheduled trip counts split by peak / off-peak.
transit_service = wrangle_monthly_service(monthly_transit_service)

### Speeds

* First, use the RT timestamps to assign each trip a time-of-day category. In the future, use scheduled data instead.
* Then get speeds by segment, and average them separately for peak and off-peak on each segment.

In [None]:
def find_rt_trip_start_time(analysis_date: str):
    """
    Label every RT trip as "Peak" or "Off Peak" from its first timestamp.

    Reads the segment speeds for `analysis_date`, keeps the smallest
    `min_time` per (gtfs_dataset_key, trip_id), localizes it with
    `schedule_rt_utils.localize_vp_timestamp`, and categorizes the
    resulting `min_time_local` value.

    Returns a dataframe with columns gtfs_dataset_key, trip_id, peak.
    """
    trip_keys = ["gtfs_dataset_key", "trip_id"]

    segment_times = dd.read_parquet(
        f"{SEGMENT_GCS}speeds_route_segments_{analysis_date}/",
        columns=[*trip_keys, "min_time"],
    ).drop_duplicates()

    # Earliest timestamp seen for each trip, materialized from dask.
    first_seen = (
        segment_times.groupby(trip_keys)["min_time"]
        .min()
        .reset_index()
        .compute()
    )

    localized = schedule_rt_utils.localize_vp_timestamp(
        first_seen, ["min_time"]
    )

    # Timestamp -> time-of-day label -> peak / off-peak.
    time_of_day = localized.min_time_local.apply(
        time_helpers.categorize_time_of_day
    )
    labelled = localized.assign(
        peak=time_of_day.apply(categorize_peak_off_peak)
    )

    return labelled[[*trip_keys, "peak"]]

In [None]:
# One row per (gtfs_dataset_key, trip_id) with its peak / off-peak label.
rt_trip_start = find_rt_trip_start_time(analysis_date)

In [None]:
# Segment speeds per trip for the analysis date, read lazily with dask.
speeds = dd.read_parquet(
    f"{SEGMENT_GCS}speeds_route_segments_{analysis_date}/",
    columns=[
        "gtfs_dataset_key", "_gtfs_dataset_name",
        "trip_id",
        "route_dir_identifier", "segment_sequence",
        "speed_mph",
    ],
)
# NOTE(review): presumably cast so the later merge on segment_sequence
# matches the dtype in the route segments file -- confirm.
speeds = speeds.astype({"segment_sequence": int})

In [None]:
# Attach each trip's peak / off-peak label to its segment speeds.
# Inner join: speed rows whose trip has no label are dropped.
speeds_with_daytype = speeds.merge(
    rt_trip_start,
    how="inner",
    on=["gtfs_dataset_key", "trip_id"],
)

In [None]:
# Find average speed for peak vs off peak for each segment, and count the
# distinct trips behind each average.
# .compute() materializes the merged dask frame before the helper runs.
# NOTE(review): later cells read `speed_mph_mean` and `trip_id_nunique`, so
# rename_cols=True presumably suffixes each column with its aggregation --
# confirm against portfolio_utils.aggregate_by_geography.
avg_speeds_by_peak = portfolio_utils.aggregate_by_geography(
    speeds_with_daytype.compute(),
    group_cols = ["gtfs_dataset_key", "_gtfs_dataset_name",
                  "route_dir_identifier", "segment_sequence", 
                  "peak"],
    mean_cols = ["speed_mph"],
    nunique_cols = ["trip_id"],
    rename_cols = True
)

In [None]:
# Segment line geometries for the analysis date, minus two columns that are
# not used downstream.
route_segments = gpd.read_parquet(
    f"{SEGMENT_GCS}longest_shape_segments_{analysis_date}.parquet"
)
route_segments = route_segments.drop(columns=["feed_key", "longest_shape_id"])

# Put the averaged speeds on the segment geometry. Inner join: segments
# without speeds (and speeds without a segment) are dropped. The result is
# reprojected to WGS84 to match the tract layer.
speeds_by_segment = pd.merge(
    route_segments,
    avg_speeds_by_peak,
    how="inner",
    on=["gtfs_dataset_key", "route_dir_identifier", "segment_sequence"],
)
speeds_by_segment = speeds_by_segment.to_crs(geography_utils.WGS84)

In [None]:
# Disabled: merge of the route-level scheduled trip counts
# (`transit_service`) onto the segment speeds. It is kept as a bare string
# literal so it does not run.
# NOTE(review): the join keys `name` and `route_id` are not among the columns
# this notebook visibly adds to `speeds_by_segment`, and that frame has one
# row per segment rather than per route -- confirm the keys exist and revisit
# validate="1:1" before re-enabling.
'''
speeds_with_service_by_segment = pd.merge(
    speeds_by_segment,
    transit_service,
    on = ["name", "route_id", "peak"],
    how = "inner",
    validate = "1:1"
)
'''

## Spatial Join segments to tracts

In [None]:
# Pair every speed segment with each tract it intersects; a segment that
# touches several tracts appears once per tract.
segments_with_tract = gpd.sjoin(
    speeds_by_segment,
    calenviroscreen_tracts,
    how="inner",
    predicate="intersects",
)
# The right-hand index added by the join is not needed downstream.
segments_with_tract = segments_with_tract.drop(columns="index_right")

## Aggregate speeds and trips by equity group

In [None]:
# Summarize by equity group and peak / off-peak, then round the speed to one
# decimal for display.
# NOTE(review): `trip_id_nunique` is summed over segment-tract rows, so a trip
# is presumably counted once per segment (and per tract) it appears in, and
# `speed_mph_mean` is an unweighted mean of per-segment means -- confirm this
# is the intended reading.
# NOTE(review): `route_id` is not created in this notebook; presumably it
# comes from the route segments file -- confirm.
avg_speeds_by_equity = portfolio_utils.aggregate_by_geography(
    segments_with_tract,
    group_cols=["equity_group", "peak"],
    sum_cols=["trip_id_nunique"],
    mean_cols=["speed_mph_mean"],
    nunique_cols=["route_id"],
).assign(
    speed_mph_mean=lambda agg: agg.speed_mph_mean.round(1)
)

## Visualizations

In [None]:
# Bar chart of the rounded average speed per equity group, with one column
# of bars per peak category. Bars are colored by equity group using a
# palette from calitp_color_palette.
chart = (alt.Chart(avg_speeds_by_equity)
         .mark_bar()
         .encode(
             x = alt.X("equity_group:O", title="Equity Group"),
             y = alt.Y("speed_mph_mean:Q", title="Avg Speed"),
             column = alt.Column("peak:N"),
             color = alt.Color("equity_group:O", 
                               scale = alt.Scale(
                                   range=cp.CALITP_CATEGORY_BOLD_COLORS)
                              ),
             tooltip = ["equity_group", "speed_mph_mean", "peak"]
         ).properties(
             title="Avg Speeds by Equity Group", width=150
         ).interactive()
)

# Bare expression so the notebook renders the chart as the cell output.
chart

In [None]:
# Bar chart of `trip_id_nunique` per equity group, with one column of bars
# per peak category. Bars are colored by equity group using a palette from
# calitp_color_palette.
# NOTE(review): `trip_id_nunique` was summed across segment-tract rows in the
# aggregation above, so the bars presumably count trip-segment pairs rather
# than distinct trips -- confirm before reading this as "# Trips".
chart2 = (alt.Chart(avg_speeds_by_equity)
         .mark_bar()
         .encode(
             x = alt.X("equity_group:O", title="Equity Group"),
             y = alt.Y("trip_id_nunique:Q", title="# Trips"),
             column = alt.Column("peak:N"),
             color = alt.Color("equity_group:O", 
                               scale = alt.Scale(
                                   range=cp.CALITP_CATEGORY_BRIGHT_COLORS)
                              ),
             tooltip = ["equity_group", "trip_id_nunique", "peak"]
         ).properties(
             title="Trips by Equity Group", width=150
         ).interactive()
)

# Bare expression so the notebook renders the chart as the cell output.
chart2