## Agency Grain Metrics
* Add the agency-grain metrics to the pipeline in `rt_scheduled_v_ran/scripts/rt_v_scheduled_agency.py`
* `cd data-analyses/rt_segment_speeds && pip install -r requirements.txt && cd ../_shared_utils && make setup_env && cd ../gtfs_digest`

In [1]:
import geopandas as gpd
import numpy as np
import pandas as pd
from segment_speed_utils import (
    gtfs_schedule_wrangling,
    helpers,
    metrics,
    time_series_utils,
)
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)
from shared_utils import catalog_utils, rt_dates, rt_utils

In [2]:
# Notebook display settings: show up to 100 columns, never truncate rows or
# cell text, and render floats with two decimal places.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)
pd.set_option("display.float_format", "{:.2f}".format)

### Exploring
* Filter to rows where `sched_rt_category == "schedule_and_vp"` to reproduce the values shown on analysis.calitp.org.

In [3]:
# Section of the GTFS data catalog for the RT-vs-schedule tables; the
# vp_route_direction_metrics entry is read from it below.
dict_inputs = GTFS_DATA_DICT.rt_vs_schedule_tables

In [4]:
# Display the 2024 dates defined in rt_dates.
rt_dates.y2024_dates

['2024-01-17',
 '2024-02-14',
 '2024-03-13',
 '2024-04-17',
 '2024-05-22',
 '2024-06-12',
 '2024-07-17',
 '2024-08-14',
 '2024-09-18']

In [5]:
# Display the GCS folder prefix that the route-direction parquet is read from.
RT_SCHED_GCS

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/'

In [6]:
# File stem (relative to RT_SCHED_GCS) of the route-direction metrics export.
ROUTE_EXPORT = dict_inputs.vp_route_direction_metrics

In [7]:
# Display the file stem to confirm what the catalog returned.
ROUTE_EXPORT

'vp_route_dir/route_direction_metrics'

In [8]:
# Service date whose route-direction file is loaded; this is the last entry
# of rt_dates.y2024_dates as displayed above.
analysis_date = "2024-09-18"

In [9]:
# Load the route-direction metrics parquet for analysis_date from GCS.
df = pd.read_parquet(f"{RT_SCHED_GCS}{ROUTE_EXPORT}_{analysis_date}.parquet")

In [10]:
# Keep only the all-day rows and renumber the index from zero.
df = df[df["time_period"].eq("all_day")].reset_index(drop=True)

In [11]:
# Peek at one row, transposed so every column is visible as a row.
# random_state pins the draw so the same row is shown on every
# Restart & Run All instead of a different one each time.
df.sample(n=1, random_state=0).T

Unnamed: 0,2275
schedule_gtfs_dataset_key,cc53a0dbf5df90e3009b9cb5d89d80ba
route_id,4867
direction_id,0.00
time_period,all_day
minutes_atleast1_vp,1629
minutes_atleast2_vp,1578
total_rt_service_minutes,1627.50
total_scheduled_service_minutes,1221.00
total_vp,4613
vp_in_shape,3491


In [12]:
# Keys that define the agency grain for the roll-up of df below.
# NOTE(review): the sampled row displayed above does not list
# caltrans_district or organization_name — confirm the parquet carries these
# columns (or that a crosswalk merge is not missing) on a fresh kernel.
groupby_cols = [
    "caltrans_district",
    "organization_name",
    "schedule_gtfs_dataset_key",
]

In [13]:
# Roll the route-direction rows up to one row per groupby_cols combination,
# summing the raw counts that the agency-level ratios are built from.
agg1 = (
    df.groupby(groupby_cols)[["total_vp", "vp_in_shape", "total_rt_service_minutes"]]
    .sum()
    .reset_index()
)

In [14]:
# Vehicle positions per RT service minute, rounded to 2 decimals.
agg1["vp_per_min_agency"] = (
    agg1["total_vp"].div(agg1["total_rt_service_minutes"]).round(2)
)
# Percent of vehicle positions that fall within the shape, rounded to 2 decimals.
agg1["spatial_accuracy_agency"] = (
    agg1["vp_in_shape"].div(agg1["total_vp"]).mul(100).round(2)
)

In [15]:
# Hand check of vp_in_shape / total_vp using the Marin County Transit District
# values (63718 and 97768) from the agg1 row displayed in the next cell.
63718 / 97768

0.6517265362899927

In [16]:
# Show the aggregated row(s) for Marin County Transit District.
agg1[agg1["organization_name"].eq("Marin County Transit District")]

Unnamed: 0,caltrans_district,organization_name,schedule_gtfs_dataset_key,total_vp,vp_in_shape,total_rt_service_minutes,vp_per_min_agency,spatial_accuracy_agency
23,04 - Oakland,Marin County Transit District,015d67d5b75b5cf2b710bbadadfb75f5,97768,63718,36831.13,2.65,65.17


### Original

In [17]:
# Agency used for the spot checks in the cells below.
organization_name = "Marin County Transit District"

In [18]:
# Path to the digest route-level schedule + vehicle-position table.
schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"

# Read the full table. No rows are dropped here; the restriction to rows found
# in both schedule and real-time data is applied later via sched_rt_category.
vp_sched_df = pd.read_parquet(schd_vp_url)

In [19]:
# Keep only the all-day rows and renumber the index from zero.
vp_sched_df = vp_sched_df[vp_sched_df["time_period"].eq("all_day")].reset_index(drop=True)

In [20]:
# Subset to rows whose sched_rt_category is "schedule_and_vp".
schedule_and_vp_only = vp_sched_df[
    vp_sched_df["sched_rt_category"].eq("schedule_and_vp")
]

In [21]:
# Unfiltered roll-up (every sched_rt_category) to one row per
# district / organization / feed / service date, summing the raw counts.
vp_sched_df2 = (
    vp_sched_df.groupby(
        ["caltrans_district", "organization_name", "schedule_gtfs_dataset_key", "service_date"]
    )
    .agg(
        total_vp=("total_vp", "sum"),
        total_rt_service_minutes=("total_rt_service_minutes", "sum"),
        vp_in_shape=("vp_in_shape", "sum"),
    )
    .reset_index()
)

In [22]:
# Unfiltered totals for the spot-check agency on the analysis date.
# Compare against analysis_date rather than repeating the literal date so this
# lookup cannot drift from the date used to load the route-direction file.
vp_sched_df2.loc[
    (vp_sched_df2.organization_name == organization_name)
    & (vp_sched_df2.service_date == analysis_date)
]

Unnamed: 0,caltrans_district,organization_name,schedule_gtfs_dataset_key,service_date,total_vp,total_rt_service_minutes,vp_in_shape
956,04 - Oakland,Marin County Transit District,015d67d5b75b5cf2b710bbadadfb75f5,2024-09-18,97768,36831.13,63718


In [23]:
# Roll-up of the schedule_and_vp rows only, to one row per
# district / organization / service date. Unlike vp_sched_df2, this grouping
# does not include schedule_gtfs_dataset_key.
vp_sched_df3 = (
    schedule_and_vp_only.groupby(
        ["caltrans_district", "organization_name", "service_date"]
    )[["total_vp", "total_rt_service_minutes", "vp_in_shape"]]
    .sum()
    .reset_index()
)

In [24]:
# Vehicle positions per RT service minute, rounded to 2 decimals.
vp_sched_df3["vp_per_min_agency"] = (
    vp_sched_df3["total_vp"].div(vp_sched_df3["total_rt_service_minutes"]).round(2)
)
# Percent of vehicle positions that fall within the shape, rounded to 2 decimals.
vp_sched_df3["spatial_accuracy_agency"] = (
    vp_sched_df3["vp_in_shape"].div(vp_sched_df3["total_vp"]).mul(100).round(2)
)

In [25]:
# Filtered (schedule_and_vp only) metrics for the spot-check agency on the
# analysis date, transposed for readability.
# Compare against analysis_date rather than repeating the literal date so this
# lookup cannot drift from the date used to load the route-direction file.
vp_sched_df3.loc[
    (vp_sched_df3.organization_name == organization_name)
    & (vp_sched_df3.service_date == analysis_date)
].T

Unnamed: 0,477
caltrans_district,04 - Oakland
organization_name,Marin County Transit District
service_date,2024-09-18 00:00:00
total_vp,67420
total_rt_service_minutes,25282.30
vp_in_shape,61736
vp_per_min_agency,2.67
spatial_accuracy_agency,91.57


In [26]:
# Unfiltered vp_in_shape / total_vp for Marin (63718 / 97768, from the
# vp_sched_df2 row above), for contrast with the 91.57 spatial accuracy that
# the schedule_and_vp-only rows give.
63718 / 97768

0.6517265362899927