## Request for Aggregated Data
* Would there be any way to present aggregated data at the agency level? I'd like to see (and, ideally, compare) spatial accuracy and vehicle positions (VP) per minute.

In [16]:
import geopandas as gpd
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from shared_utils import catalog_utils, rt_dates, rt_utils
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS

# https://github.com/cal-itp/data-analyses/blob/main/_shared_utils/shared_utils/gtfs_analytics_data.yml
# Load the "gtfs_analytics_data" catalog through shared_utils.catalog_utils.
# NOTE(review): GTFS_DATA_DICT is also imported from update_vars above and from
# segment_speed_utils.project_vars below; the later import rebinds this name, so
# this assignment is overwritten on a top-to-bottom run — confirm which source
# is intended.
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)

In [2]:
# Notebook-wide display settings: show up to 100 columns, never truncate rows
# or cell text, and render floats with two decimal places.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [12]:
# Look up the `route_schedule_vp` entry under `digest_tables` in the catalog;
# it is used further down to build the digest parquet path.
GTFS_DATA_DICT.digest_tables.route_schedule_vp

'digest/schedule_vp_metrics'

In [7]:
# Look up the `vp_route_direction_metrics` entry under `rt_vs_schedule_tables`
# in the catalog (displayed for reference only; not assigned).
GTFS_DATA_DICT.rt_vs_schedule_tables.vp_route_direction_metrics

'vp_route_dir/route_direction_metrics'

In [8]:
# Load the schedule route-direction metrics file for 2024-08-14 from GCS.
# NOTE(review): the bucket path and date are hardcoded here rather than built
# from GTFS_DATA_DICT / an analysis date — confirm whether that is intended.
df1 = pd.read_parquet(
    path="gs://calitp-analytics-data/data-analyses/rt_vs_schedule/schedule_route_dir/schedule_route_direction_metrics_2024-08-14.parquet",
)

In [11]:
# Preview the first two rows, leaving out the geometry column.
df1.iloc[:2].drop(columns="geometry")

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,common_shape_id,route_name,avg_scheduled_service_minutes,avg_stop_miles,n_trips,time_period,frequency,is_coverage,is_downtown_local,is_local,is_rapid,is_express,is_rail,route_primary_direction
0,1dc8ca4d6e9e915c60172b9ff57baefc,11X,0.0,p_1438047,Napa-Vallejo Express,51.2,4.14,5,all_day,0.21,1.0,0.0,0.0,0.0,1.0,0.0,Northbound
1,1dc8ca4d6e9e915c60172b9ff57baefc,11X,0.0,p_1438047,Napa-Vallejo Express,51.2,4.14,5,peak,0.62,1.0,0.0,0.0,0.0,1.0,0.0,Northbound


In [13]:
# Look up the `trips` entry under `schedule_downloads` in the catalog
# (displayed for reference only; not assigned).
GTFS_DATA_DICT.schedule_downloads.trips

'trips'

In [19]:
# time_series_utils.concatenate_datasets_across_dates??

In [17]:
# Dates to pull trips for: the `y2024_dates` collection from rt_dates
# (presumably the 2024 analysis dates — confirm against shared_utils.rt_dates).
analysis_date_list = rt_dates.y2024_dates

In [21]:
import merge_operator_service

In [22]:
# Build a single trips table for the dates in analysis_date_list using
# merge_operator_service.concatenate_trips.
# NOTE(review): presumably one row per trip with its service hours, judging by
# the preview that follows — confirm against the helper's implementation.
trips = merge_operator_service.concatenate_trips(analysis_date_list)

In [23]:
# Preview the first two rows of the concatenated trips table.
trips.head(2)

Unnamed: 0,name,service_date,route_long_name,trip_first_departure_datetime_pacific,service_hours
0,VCTC GMV Schedule,2024-01-17,Route 18,2024-01-17 15:45:00,0.58
1,LA Metro Bus Schedule,2024-01-17,Metro Local Line,2024-01-17 12:47:00,0.78


In [24]:
# Full path to the digest parquet: the digest directory followed by the
# route_schedule_vp file stem and the ".parquet" extension.
schd_vp_url = "{}{}.parquet".format(
    GTFS_DATA_DICT.digest_tables.dir,
    GTFS_DATA_DICT.digest_tables.route_schedule_vp,
)

In [25]:
# Read the digest parquet located at schd_vp_url into a DataFrame.
sched_vp_df = pd.read_parquet(schd_vp_url)

In [34]:
# List the available columns before choosing the subset to keep.
sched_vp_df.columns

Index(['schedule_gtfs_dataset_key', 'direction_id', 'time_period',
       'avg_scheduled_service_minutes', 'avg_stop_miles', 'n_scheduled_trips',
       'frequency', 'is_express', 'is_rapid', 'is_rail', 'is_coverage',
       'is_downtown_local', 'is_local', 'service_date', 'typology',
       'minutes_atleast1_vp', 'minutes_atleast2_vp',
       'total_rt_service_minutes', 'total_scheduled_service_minutes',
       'total_vp', 'vp_in_shape', 'is_early', 'is_ontime', 'is_late',
       'n_vp_trips', 'vp_per_minute', 'pct_in_shape',
       'pct_rt_journey_atleast1_vp', 'pct_rt_journey_atleast2_vp',
       'pct_sched_journey_atleast1_vp', 'pct_sched_journey_atleast2_vp',
       'rt_sched_journey_ratio', 'avg_rt_service_minutes',
       'schedule_source_record_id_x', 'sched_rt_category', 'speed_mph', 'name',
       'route_long_name', 'route_short_name', 'route_combined_name',
       'route_id', 'schedule_source_record_id_y', 'base64_url',
       'organization_source_record_id', 'organization_n

In [52]:
# Keep only the columns needed for the agency-level aggregation.
sched_vp_df2 = sched_vp_df[
    # Date, operator and data-availability category.
    ["service_date", "organization_name", "caltrans_district", "sched_rt_category"]
    # Route descriptors and time-of-day bucket.
    + ["route_long_name", "route_combined_name", "route_primary_direction", "time_period"]
    # Counts that feed the spatial-accuracy and VP-per-minute ratios.
    + ["n_scheduled_trips", "total_vp", "total_rt_service_minutes", "vp_in_shape"]
]

In [54]:
# Drop rows categorised as "schedule_only" and renumber the index from zero.
sched_vp_df3 = sched_vp_df2.loc[
    sched_vp_df2["sched_rt_category"].ne("schedule_only")
].reset_index(drop=True)

In [56]:
# Prototype the aggregation on a single operator (Marin County Transit
# District) before generalising to every agency.
marin_df = sched_vp_df3.loc[
    sched_vp_df3["organization_name"].eq("Marin County Transit District")
]

In [62]:
# Sum the raw counts per service date; the ratios are derived from these sums
# so that each date's metric is weighted by volume rather than averaged
# across rows.
# NOTE(review): the rows carry a time_period column; if overlapping periods
# (e.g. "all_day" alongside "peak") are both present, these sums count the same
# vehicle positions more than once — confirm, and filter to one period if so.
marin_df_agg1 = marin_df.groupby(["service_date"], as_index=False).agg(
    total_vp=("total_vp", "sum"),
    vp_in_shape=("vp_in_shape", "sum"),
    total_rt_service_minutes=("total_rt_service_minutes", "sum"),
)

In [67]:
# Spatial accuracy: vp_in_shape as a fraction of total_vp.
marin_df_agg1["spatial_accuracy"] = marin_df_agg1["vp_in_shape"].div(
    marin_df_agg1["total_vp"]
)

In [70]:
# VP per minute: total_vp divided by total_rt_service_minutes.
marin_df_agg1["vp_per_min"] = marin_df_agg1["total_vp"].div(
    marin_df_agg1["total_rt_service_minutes"]
)

In [71]:
# Display the per-date aggregate for Marin.
# NOTE(review): the saved output lists vp_per_min before spatial_accuracy even
# though spatial_accuracy is added first in the cells above, which suggests
# leftover kernel state — restart the kernel and run all cells to confirm.
marin_df_agg1

Unnamed: 0,service_date,total_vp,vp_in_shape,total_rt_service_minutes,vp_per_min,spatial_accuracy
0,2023-03-15,233730,176470,99062.02,2.36,0.76
1,2023-04-12,211354,161482,97942.04,2.16,0.76
2,2023-05-17,189012,172524,74813.9,2.53,0.91
3,2023-06-14,185750,174704,67864.64,2.74,0.94
4,2023-07-12,184550,168758,68946.2,2.68,0.91
5,2023-08-15,170768,159188,64528.56,2.65,0.93
6,2023-09-13,192160,173710,72335.14,2.66,0.9
7,2023-10-11,182182,167856,67881.96,2.68,0.92
8,2023-11-15,193766,179308,73111.08,2.65,0.93
9,2023-12-13,191694,176090,72716.1,2.64,0.92
