## Agency Grain Metrics
* Start from the trip-level table (`vp_trips`, loaded here via the `vp_trip_metrics` catalog entry) this time.
* Add the agency-grain aggregation to the pipeline in `rt_scheduled_v_ran/scripts/rt_v_scheduled_agency.py`.
* Environment setup: `cd data-analyses/rt_segment_speeds && pip install -r requirements.txt && cd ../_shared_utils && make setup_env && cd ../gtfs_digest`

In [1]:
import geopandas as gpd
import numpy as np
import pandas as pd
from segment_speed_utils import (
    gtfs_schedule_wrangling,
    helpers,
    metrics,
    time_series_utils,
)
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)
from shared_utils import catalog_utils, rt_dates, rt_utils

In [2]:
# Notebook display settings: wide frames, two-decimal floats, and no truncation
# of rows or cell contents.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
# RT-vs-schedule section of the GTFS data catalog; the table names and trip
# thresholds it holds are displayed in the next cell.
dict_inputs = GTFS_DATA_DICT.rt_vs_schedule_tables

In [4]:
# Inspect the catalog entries: relative table paths plus the early/late trip
# thresholds (`early_trip_minutes`, `late_trip_minutes`).
dict_inputs

{'dir': '${gcs_paths.RT_SCHED_GCS}', 'stop_times_direction': 'stop_times_direction', 'sched_trip_metrics': 'schedule_trip/schedule_trip_metrics', 'sched_route_direction_metrics': 'schedule_route_dir/schedule_route_direction_metrics', 'vp_trip_metrics': 'vp_trip/trip_metrics', 'vp_route_direction_metrics': 'vp_route_dir/route_direction_metrics', 'vp_operator_metrics': 'vp_operator/operator_metrics', 'schedule_rt_stop_times': 'schedule_rt_stop_times', 'early_trip_minutes': -5, 'late_trip_minutes': 5}

In [5]:
# List the 2024 dates defined in `rt_dates`; the last entry matches the
# `analysis_date` chosen below.
rt_dates.y2024_dates

['2024-01-17',
 '2024-02-14',
 '2024-03-13',
 '2024-04-17',
 '2024-05-22',
 '2024-06-12',
 '2024-07-17',
 '2024-08-14',
 '2024-09-18']

In [6]:
# Catalog entry for the trip-level vehicle position metrics
# ('vp_trip/trip_metrics' in the catalog output above).
TRIP_EXPORT = dict_inputs.vp_trip_metrics

In [7]:
# Operator identifier columns to pull from the crosswalk when merging onto the
# trip table.
crosswalk_cols = [
    "schedule_gtfs_dataset_key",
    "name",
    "organization_name",
    "caltrans_district",
]

In [8]:
# Single service date analyzed in this notebook; it is the last entry of
# `rt_dates.y2024_dates` shown above.
analysis_date = "2024-09-18"

In [9]:
# Load the trip-level vehicle position metrics exported for `analysis_date`.
trip_metrics_path = f"{RT_SCHED_GCS}{TRIP_EXPORT}_{analysis_date}.parquet"
df = pd.read_parquet(trip_metrics_path)

In [10]:
# Attach the operator identifier columns listed in `crosswalk_cols` to every
# trip row for the analysis date.
df2 = gtfs_schedule_wrangling.merge_operator_identifiers(
    df, [analysis_date], columns=crosswalk_cols
)

### What time of day do I use?
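`df.loc[df["time_period"] == "all_day"]` is not available: the trip-level table has no `time_period` column, only `time_of_day` and `peak_offpeak` (checked below), so the agency metrics aggregate across all trips of the day.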

In [11]:
# Distinct time-of-day labels present at the trip grain.
df2["time_of_day"].unique()

array(['AM Peak', 'Evening', 'PM Peak', 'Early AM', 'Midday', 'Owl'],
      dtype=object)

In [12]:
# Distinct peak / offpeak labels present at the trip grain.
df2["peak_offpeak"].unique()

array(['peak', 'offpeak'], dtype=object)

In [13]:
# Peek at one merged trip row to see which columns are available.
df2.head(n=1)

Unnamed: 0,schedule_gtfs_dataset_key,trip_instance_key,route_id,direction_id,scheduled_service_minutes,total_vp,rt_service_minutes,minutes_atleast1_vp,minutes_atleast2_vp,vp_in_shape,sched_rt_category,time_of_day,peak_offpeak,vp_per_minute,pct_in_shape,pct_rt_journey_atleast1_vp,pct_rt_journey_atleast2_vp,pct_sched_journey_atleast1_vp,pct_sched_journey_atleast2_vp,rt_sched_journey_difference,is_early,is_ontime,is_late,name,organization_name,caltrans_district
0,7cc0cb1871dfd558f11a2885c145d144,000213c9d5753f9565b679d8ab84929f,2,1.0,29.0,200,66.25,68,66,195,schedule_and_vp,AM Peak,peak,3.02,0.97,1.0,1.0,1.0,1.0,37.25,0,0,1,Bay Area 511 Muni Schedule,City and County of San Francisco,04 - Oakland


In [14]:
# Keys that define the agency grain for the aggregation below.
# NOTE(review): the aggregated output shows one `schedule_gtfs_dataset_key`
# repeated under several `organization_name` values with identical totals —
# confirm that one row per (organization, feed) is the intended grain.
groupby_cols = [
    "caltrans_district",
    "organization_name",
    "schedule_gtfs_dataset_key",
]

In [15]:
# Roll the trip-level counts up to the agency grain by summing the raw
# numerators and denominators; ratios are derived from these totals afterwards.
summed_cols = ["total_vp", "vp_in_shape", "rt_service_minutes"]
agg1 = df2.groupby(groupby_cols, as_index=False)[summed_cols].sum()

In [16]:
# pandas returns inf for x / 0, so an agency whose summed RT service minutes
# is 0 would get an infinite vp-per-minute value. Treat a zero denominator as
# missing so the metric comes out as NaN instead.
rt_minutes_nonzero = agg1.rt_service_minutes.replace(0, np.nan)

# Vehicle positions per minute of RT service, from the agency-level totals.
agg1["vp_per_min_agency"] = (agg1.total_vp / rt_minutes_nonzero).round(2)

# Percentage (0-100) of the agency's vehicle positions counted in `vp_in_shape`.
# No guard is needed here: `vp_in_shape` is presumably a subset of `total_vp`
# (TODO confirm), so a zero denominator gives 0 / 0, which is already NaN.
agg1["spatial_accuracy_agency"] = ((agg1.vp_in_shape / agg1.total_vp) * 100).round(2)

In [17]:
# Preview the first five agency-grain rows.
agg1.head(n=5)

Unnamed: 0,caltrans_district,organization_name,schedule_gtfs_dataset_key,total_vp,vp_in_shape,rt_service_minutes,vp_per_min_agency,spatial_accuracy_agency
0,01 - Eureka,City of Arcata,a253a8d7acd57657bb98050f37dd6b0f,38142,17935,13367.5,2.85,47.02
1,01 - Eureka,City of Eureka,a253a8d7acd57657bb98050f37dd6b0f,38142,17935,13367.5,2.85,47.02
2,01 - Eureka,Humboldt Transit Authority,a253a8d7acd57657bb98050f37dd6b0f,38142,17935,13367.5,2.85,47.02
3,01 - Eureka,Lake Transit Authority,0a3c0b21c85fb09f8db91599e14dd7f7,11572,11223,5015.75,2.31,96.98
4,01 - Eureka,Mendocino Transit Authority,770072d7a8d356b529ef34fe01715bcb,16196,13702,6699.27,2.42,84.6


In [18]:
# Spot-check one operator, transposed so each metric reads as a row.
is_marin = agg1["organization_name"] == "Marin County Transit District"
agg1.loc[is_marin].T

Unnamed: 0,28
caltrans_district,04 - Oakland
organization_name,Marin County Transit District
schedule_gtfs_dataset_key,015d67d5b75b5cf2b710bbadadfb75f5
total_vp,97768
vp_in_shape,63718
rt_service_minutes,36831.13
vp_per_min_agency,2.65
spatial_accuracy_agency,65.17
