In [16]:
import os
os.environ["CALITP_BQ_MAX_BYTES"] = str(1_000_000_000_000) ## 1TB?

from siuba import *
import pandas as pd
import geopandas as gpd
import datetime as dt

from rt_analysis import rt_filter_map_plot
from rt_analysis import rt_parser

from shared_utils import rt_utils, rt_dates
from calitp_data_analysis.tables import tbls
from calitp_data_analysis.sql import query_sql




## Checking if intermediate data is available

   * First, decide on a date of interest. Let's use May 17, 2023
   * The rt_utils.get_operators function takes the analysis date as a datetime.date object, which we'll construct below
   * It also takes a list of operators (currently itp_ids) of interest
   * Let's start with Big Blue Bus (300)

In [9]:
analysis_date = dt.date(2023, 5, 17)

In [13]:
def get_speedmaps_ix_df(analysis_date: dt.date, itp_id: int | None = None) -> pd.DataFrame:
    """
    Collect relevant keys for finding all schedule and rt data for a reports-assessed organization.
    Note that organizations may have multiple sets of feeds, or share feeds with other orgs.

    Used with specific itp_id in rt_analysis.rt_parser.OperatorDayAnalysis, or without specifying itp_id
    to get an overall table of which datasets were processed and how to deduplicate if needed

    Args:
        analysis_date: service date to look up.
        itp_id: if given, keep only rows whose organization_itp_id equals this value.
            None (the default) keeps every organization.

    Returns:
        Collected table with columns feed_key, schedule_gtfs_dataset_key,
        vehicle_positions_gtfs_dataset_key, organization_itp_id, organization_name
        and service_date.
    """
    # The _valid_from/_valid_to columns are compared against a datetime at midnight of the analysis date.
    analysis_dt = dt.datetime.combine(analysis_date, dt.time(0))

    # Rename gtfs_dataset_key -> schedule_gtfs_dataset_key so it matches the join column below.
    daily_service = tbls.mart_gtfs.fct_daily_feed_scheduled_service_summary() >> select(
        _.schedule_gtfs_dataset_key == _.gtfs_dataset_key, _.feed_key, _.service_date
    )

    # NOTE: a join to mart_transit_database.dim_organizations (to bring in caltrans_district)
    # was sketched here previously but is currently disabled.
    org_feeds_datasets = (
        tbls.mart_transit_database.dim_provider_gtfs_data()
        # keep only the rows whose validity window contains the analysis date
        >> filter(_._valid_from <= analysis_dt, _._valid_to >= analysis_dt)
        # keep rows flagged public_customer_facing_or_regional_subfeed_fixed_route that have a vehicle positions dataset
        >> filter(
            _.public_customer_facing_or_regional_subfeed_fixed_route, _.vehicle_positions_gtfs_dataset_key != None
        )
        >> inner_join(_, daily_service, by="schedule_gtfs_dataset_key")
        >> filter(_.service_date == analysis_date)
        >> select(
            _.feed_key,
            _.schedule_gtfs_dataset_key,
            _.vehicle_positions_gtfs_dataset_key,
            _.organization_itp_id,
            _.organization_name,
            _.service_date,
        )
    )

    # Compare against None explicitly so that a falsy id such as 0 is still applied as a filter.
    if itp_id is not None:
        org_feeds_datasets = org_feeds_datasets >> filter(_.organization_itp_id == itp_id)

    return org_feeds_datasets >> collect()

In [43]:
df = get_speedmaps_ix_df(analysis_date)

In [21]:
# Raw-SQL version of the lookup, run step by step.
# 1) Scheduled-service rows for analysis_date, with gtfs_dataset_key exposed
#    as schedule_gtfs_dataset_key so it can be used as the merge key below.
daily_service = query_sql(
f'''
SELECT gtfs_dataset_key AS schedule_gtfs_dataset_key, feed_key, service_date
FROM cal-itp-data-infra.mart_gtfs.fct_daily_feed_scheduled_service_summary
WHERE service_date = '{analysis_date}'
'''
)

# 2) Provider/dataset rows whose validity window contains analysis_date and
#    that have a vehicle positions dataset key.
org_feeds_datasets = query_sql(
f'''
SELECT schedule_gtfs_dataset_key, vehicle_positions_gtfs_dataset_key, organization_itp_id,
organization_name
FROM cal-itp-data-infra.mart_transit_database.dim_provider_gtfs_data
WHERE _valid_from <= '{analysis_date}'
AND _valid_to >= '{analysis_date}'
AND public_customer_facing_or_regional_subfeed_fixed_route
AND NOT vehicle_positions_gtfs_dataset_key IS NULL
'''
)

# 3) Join the two results on the schedule dataset key; the merged frame is the
#    cell's displayed output and is not assigned to a name.
daily_service.merge(org_feeds_datasets, on='schedule_gtfs_dataset_key')

In [50]:
def get_legacy_speedmaps_ix_df(analysis_date: dt.date) -> pd.DataFrame:
    """
    v1/legacy data only
    
    Collect relevant keys for finding all schedule and rt data for a reports-assessed organization.
    Note that organizations may have multiple sets of feeds, or share feeds with other orgs.
    """
    
    daily_service = query_sql(
    f'''
    SELECT gtfs_dataset_key AS schedule_gtfs_dataset_key, feed_key, service_date
    FROM cal-itp-data-infra.mart_gtfs.fct_daily_feed_scheduled_service_summary
    WHERE service_date = '{analysis_date}'
    '''
    )

    org_feeds_datasets = query_sql(
    f'''
    SELECT schedule_gtfs_dataset_key, vehicle_positions_gtfs_dataset_key, organization_itp_id,
    organization_name
    FROM cal-itp-data-infra.mart_transit_database.dim_provider_gtfs_data
    WHERE _valid_from <= '{analysis_date}'
    AND _valid_to >= '{analysis_date}'
    AND public_customer_facing_or_regional_subfeed_fixed_route
    AND NOT vehicle_positions_gtfs_dataset_key IS NULL
    '''
    )

    daily_service = daily_service.merge(org_feeds_datasets, on='schedule_gtfs_dataset_key')

    return daily_service

In [51]:
# Look up Big Blue Bus (itp_id 300) with the legacy helper defined above.
# The previous name, get_speedmaps_ix_df_2, is not defined in this notebook.
get_legacy_speedmaps_ix_df(analysis_date) >> filter(_.organization_itp_id == 300)

Unnamed: 0,schedule_gtfs_dataset_key,feed_key,service_date,vehicle_positions_gtfs_dataset_key,organization_itp_id,organization_name
91,dbbe8ee4864a2715a40749605395d584,dd0281dab930de28114c82dddba7a33a,2023-05-17,6c2d7daaf979779fa2089c6395baf98b,300,City of Santa Monica


In [52]:
# All organizations for the analysis date, via the legacy (raw SQL) helper defined above.
df2 = get_legacy_speedmaps_ix_df(analysis_date)

In [53]:
df2

Unnamed: 0,schedule_gtfs_dataset_key,feed_key,service_date,vehicle_positions_gtfs_dataset_key,organization_itp_id,organization_name
0,0139b1253130b33adcd4b3a4490530d2,ef2a9772d88be1e335eb3a4d6c8bc148,2023-05-17,62b02efa3b66277d3d6e7a7f74da37dc,474,Tulare County Regional Transit Agency
1,015d67d5b75b5cf2b710bbadadfb75f5,3d079c8f6317b3022761676eebddcc39,2023-05-17,ef98f81941594bebcbc3afa66856def3,194,Marin County Transit District
2,07d3b79f14cec8099119e1eb649f065b,abafd2d70a711fd3ce9a149c4e7f349e,2023-05-17,d8572f10524b0cd0171d98d50b2a7e1b,331,Tahoe Transportation District
3,0881af3822466784992a49f1cc57d38f,77ad395c1411bd121238fc3de6732d8e,2023-05-17,13c6e9f58e53c12416b7754352fa1acd,315,Sonoma-Marin Area Rail Transit District
4,09e16227fc42c4fe90204a9d11581034,9c93eedde54ab5f9a25ba51b977358b8,2023-05-17,00accf770009aafd5dc103ff2eeddb37,70,Cloverdale Transit
...,...,...,...,...,...,...
103,f74424acf8c41e4c1e9fd42838c4875c,2b1009326f4e095ffdb9fd3009d480ad,2023-05-17,7342e9ab4e268b54f9cce8cc25f11345,97,City of Duarte
104,f74424acf8c41e4c1e9fd42838c4875c,2b1009326f4e095ffdb9fd3009d480ad,2023-05-17,7342e9ab4e268b54f9cce8cc25f11345,112,Foothill Transit
105,fa84a7225bba743cc0b7afc21c319a7a,051ef8e3e453f1b8153694811e47d6a7,2023-05-17,55f407b2387b7d084c49010586098dd1,243,City of Pasadena
106,fb467982dcc77a7f9199bebe709bb700,d6e0324a31a3908244bf886131947d68,2023-05-17,a2710c46937c4d17f3b41568a2d67f05,294,Santa Clara Valley Transportation Authority


In [55]:
df.equals(df2[df.columns])

True

`rt_utils.get_operators` returns a dictionary with the status of each itp_id passed (in addition to printing it out).

# Generate a speedmap with RtFilterMapper

   * First, we'll construct an `RtFilterMapper` object from intermediate data using `rt_filter_map_plot.from_gcs()`
   * Be sure to check first with `rt_utils.get_operators` that data is available. Otherwise, you'll get an error

In [4]:
rt_filter_map_plot.from_gcs?

[0;31mSignature:[0m [0mrt_filter_map_plot[0m[0;34m.[0m[0mfrom_gcs[0m[0;34m([0m[0mitp_id[0m[0;34m,[0m [0manalysis_date[0m[0;34m,[0m [0mpbar[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Generates RtFilterMapper from cached artifacts in GCS. Generate using rt_parser.OperatorDayAnalysis.export_views_gcs()
    
[0;31mFile:[0m      ~/data-analyses/rt_delay/rt_analysis/rt_filter_map_plot.py
[0;31mType:[0m      function

In [None]:
rt_day = rt_filter_map_plot.from_gcs(300, analysis_date)

In [None]:
no_data = rt_filter_map_plot.from_gcs(300, dt.date(2022, 12, 20)) # this date has not been run, so this will give an error

## Key Attributes: information and dataframes

`RtFilterMapper` makes various information available as attributes, including the agency name and itp_id. The rt_trips and stop_delay_view (geo)dataframes loaded from GCS are also present. In addition, `RtFilterMapper` calculates further views such as endpoint_delay_view and endpoint_delay_summary.

These core views are _not_ changed by any filter you set, but they can be filtered manually if desired.

In [None]:
rt_day.organization_name

In [None]:
rt_day.calitp_itp_id

In [None]:
rt_day.rt_trips >> head(3)

In [None]:
rt_day.stop_delay_view >> head(3)

In [None]:
rt_day.endpoint_delay_summary >> head(3)

In [None]:
rt_day.endpoint_delay_view >> head(3)

## Setting filter and generating a map

To set a filter, provide at least one argument to `set_filter`. Once set, most maps, charts, and descriptions produced by RtFilterMapper will automatically apply this filter.

In [None]:
rt_day.set_filter?

In [None]:
rt_day.reset_filter?

Here, let's filter to the morning peak period (6-9AM) and routes 2 and R12

In [None]:
rt_day.set_filter(start_time='06:00', end_time='09:00', route_names=['2', 'R12'])

In [None]:
m = rt_day.segment_speed_map()

In [None]:
m

## Variance map (new!)

Aggregated speeds are important, but it's also helpful to understand the variation in speeds in each segment. Even if the aggregate speeds are acceptable, if some trips get through an area much faster than others, it's challenging to set a schedule that will provide a consistently good rider experience. Identifying areas of high variance can help target improvements.

This is now quantified as the ratio between 80th percentile and 20th percentile speeds, and used on the CA Transit Speed Maps tool.

In [None]:
rt_day.map_variance?

In [None]:
rt_day.map_variance()

## More performant maps via embedded app

Rather than render the maps directly in the notebook via geopandas and folium, we now have the capability to save them as compressed GeoJSON to GCS and render them in an IFrame using a [minimal web app](https://github.com/cal-itp/data-infra/tree/main/apps/maps).

This method is much more efficient, and we rely on it to maintain the quantity and quality of maps on the [CA Transit Speed Maps](https://analysis.calitp.org/rt/README.html) site.

`display_spa_map` will always show the most recent map generated with either `segment_speed_map` or `map_variance`, then saved via `map_gz_export`.

In [None]:
rt_day.map_gz_export?

In [None]:
%%capture
rt_day.map_gz_export()

In [None]:
rt_day.spa_map_state

In [None]:
rt_day.display_spa_map?

In [None]:
rt_day.display_spa_map()

In [None]:
rt_day.map_variance(no_render=True)

In [None]:
%%capture
rt_day.map_gz_export(map_type = 'variance')

In [None]:
rt_day.display_spa_map()

## Speed variability chart

Let's say we're interested in a closer look at speed variability along route 2. `RtFilterMapper` supports a detailed speed variability chart, but we must first filter to a single shape_id.

In [None]:
# Get shape_ids associated with routes in our current filter
rt_day._filter(rt_day.rt_trips) >> distinct(_.route_short_name, _.shape_id)

In [None]:
# Set a new filter using shape_id
rt_day.set_filter(shape_ids=['25975'])

In [None]:
rt_day.chart_variability?

In [None]:
rt_day.chart_variability(num_segments=10)

## Corridor Metrics (for SCCP, LPP, 100 Buses, etc) 

Showing speeds on a map is great, but often it would be nice to have a reasonable estimate of how much time is lost to low speeds in a certain area -- for any combination of routes and times of day. `RtFilterMapper` includes corridor analysis methods designed to facilitate these estimates.

First, it's necessary to define the corridor. Generally, draw the corridor with a small buffer (~10 meters) around the segment you wish to measure. Note that `RtFilterMapper` currently only handles one contiguous corridor at a time. To draw the corridor, use any platform that outputs a supported GIS file format. [geojson.io](https://geojson.io) is an easy one, but you could also use ESRI or QGIS.

If using geojson.io:
* Navigate the map to your area of interest
* Click the pentagon-shaped "polygon" tool in the top center right of the map
* Construct your polygon as a series of points
* When back at the start, double-click on the starting point to finish drawing (a complete polygon will look like the screenshot below)
* Save file as geojson

![finished polygon on geojson.io](img/geojson_io.png)

### Attaching Corridor to an RtFilterMapper instance

* first, construct an `RtFilterMapper` on the date of interest as above
* use geopandas to load your corridor file as a geodataframe
* pass that geodataframe to `RtFilterMapper.add_corridor()`

In [None]:
corridor = gpd.read_file('./corridors_100buses/example_santamonica.geojson')

In [None]:
corridor

In [None]:
rt_day.add_corridor(corridor)

`add_corridor` finds all shapes with at least one stop within the designated corridor, and tracks all stops for those shapes starting with the stop just before entering the corridor and ending with the stop just after leaving the corridor. You can use `RtFilterMapper.quick_map_corridor()` to confirm corridor and stop locations.

In [None]:
rt_day.quick_map_corridor()

### Using Corridor for Maps and Metrics

In [None]:
# reset to remove filter from earlier
rt_day.reset_filter()

With a corridor attached, you can use the `corridor` argument of `RtFilterMapper.segment_speed_map()` to generate a speed map filtered to just the corridor.

In [None]:
rt_day.segment_speed_map(corridor=True)

#### About the Metrics

The schedule-based metric is a daily average of the sum of median trip stop delays along the corridor. To further explain, we take each corridor trip that we have data for and look at the delay in comparison to the schedule at each stop, after subtracting off any delay present as the trip entered the corridor. For each trip we then take the median delay of all stops along the corridor, and sum these medians to create the metric.

The speed-based metric is a daily average of the sum of delays for each trip traversing the corridor as compared to a reference speed of 16 miles per hour. To further explain, we take each corridor trip that we have data for and calculate the hypothetical time it would take for that trip to traverse the corridor at a speed of 16 mph. The difference between the actual time it took for the trip to traverse the corridor and that hypothetical time is the speed-based delay for that trip, and we sum those delays to create the metric. This metric is intended to provide a more consistent basis for comparison independent of scheduling practices.

In other words, if we expect a hypothetical bus lane/signal priority/payment system etc to increase corridor speeds to 16mph, this is how much time we could save per day.

With corridor attached, generate both metrics using `RtFilterMapper.corridor_metrics()`

In [None]:
rt_day.corridor_metrics()

`RtFilterMapper.corridor_metrics()` now supports `RtFilterMapper`'s filtering system. If the filter set excludes all corridor trips, it will raise an error.

_don't set a filter if running SCCP/LPP metrics_

In [None]:
rt_day.set_filter(shape_ids = ['26025']) # a single shape through corridor

In [None]:
rt_day.corridor_metrics()

In [None]:
rt_day.set_filter(route_names = ['R12']) # route not near corridor

In [None]:
rt_day.corridor_metrics()

If running metrics for the SCCP or LPP programs, `rt_analysis` now includes a convenience function to generate the required nine-day average.

In [None]:
from rt_analysis import sccp_tools
import numpy as np

2022-2023 cycle standard SCCP/LPP date range:

In [None]:
date_range = np.arange('2022-04-30', '2022-05-09', dtype='datetime64[D]')

The next cell generates an `OperatorDayAnalysis` for each date in the range that hasn't been run already, and saves intermediate data to GCS.

In [None]:
%%capture --no-stdout
# Make sure intermediate data exists in GCS for every date in date_range.
# Loop-local names (run_date, operator_day) are used so the notebook-level
# `analysis_date` and the `rt_day` mapper built earlier are not overwritten.
for date in date_range:
    # date_range holds numpy datetime64[D] values; convert to datetime.date
    run_date = date.astype(dt.date)
    check_ran = rt_utils.get_operators(run_date, [300])
    if check_ran[300] == 'already_ran':
        continue
    # No progress bar object is defined in this notebook, so None is passed.
    # NOTE(review): confirm OperatorDayAnalysis accepts None for its third argument
    # (from_gcs shows pbar=None as its default).
    operator_day = rt_parser.OperatorDayAnalysis(300, run_date, None)
    # don't worry about the CRS not set UserWarning
    operator_day.export_views_gcs()

In [None]:
sccp_tools.sccp_average_metrics?

In [None]:
bbb_corridor_metrics = sccp_tools.sccp_average_metrics(itp_id=300, date_range=date_range, corridor=corridor)

In [None]:
bbb_corridor_metrics

If needed, it's easy to construct a dataframe with the metrics and the date array used, to support charts or tables

In [None]:
pd.DataFrame({'dates': date_range, 'schedule_metric': bbb_corridor_metrics['all_schedule'],
'speed_metric': bbb_corridor_metrics['all_speed']})