## Agency Grain Metrics
* Add it to the pipeline in `rt_scheduled_v_ran/scripts/rt_v_scheduled_agency.py`
* `cd data-analyses/rt_segment_speeds && pip install -r requirements.txt && cd ../_shared_utils && make setup_env && cd ../gtfs_digest`

In [1]:
import geopandas as gpd
import numpy as np
import pandas as pd
from segment_speed_utils import (
    gtfs_schedule_wrangling,
    helpers,
    metrics,
    time_series_utils,
)
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)
from shared_utils import catalog_utils, rt_dates, rt_utils

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

### Exploring

In [3]:
RT_SCHED_GCS

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/'

In [4]:
dict_inputs = GTFS_DATA_DICT.rt_vs_schedule_tables

In [5]:
ROUTE_EXPORT = dict_inputs.vp_route_direction_metrics

In [6]:
ROUTE_EXPORT

'vp_route_dir/route_direction_metrics'

In [7]:
dict_inputs.vp_agency_metrics

'vp_agency/agency_metrics'

In [8]:
analysis_date = rt_dates.DATES["apr2024"]

In [28]:
{k: v for k, v in  rt_dates.DATES.items() if (k[:3], k[3:]) >= ('mar', '2023')}

{'may2022': '2022-05-04',
 'sep2022': '2022-09-14',
 'sep2022a': '2022-09-21',
 'oct2022': '2022-10-12',
 'nov2022a': '2022-11-07',
 'nov2022b': '2022-11-08',
 'nov2022c': '2022-11-09',
 'nov2022d': '2022-11-10',
 'nov2022': '2022-11-16',
 'mar2023': '2023-03-15',
 'may2023': '2023-05-17',
 'sep2023': '2023-09-13',
 'oct2023a': '2023-10-09',
 'oct2023b': '2023-10-10',
 'oct2023': '2023-10-11',
 'oct2023c': '2023-10-12',
 'oct2023d': '2023-10-13',
 'oct2023e': '2023-10-14',
 'oct2023f': '2023-10-15',
 'nov2023': '2023-11-15',
 'mar2024': '2024-03-13',
 'may2024': '2024-05-22',
 'sep2024': '2024-09-18'}

In [9]:
df = pd.read_parquet(f"{RT_SCHED_GCS}{ROUTE_EXPORT}_{analysis_date}.parquet")

In [10]:
df = df.loc[df["time_period"] == "all_day"].reset_index(drop=True)

In [11]:
groupby_cols = ["caltrans_district", "organization_name", "schedule_gtfs_dataset_key"]

In [12]:
agg1 = (
    df.groupby(groupby_cols)
    .agg(
        {
            "total_vp": "sum",
            "vp_in_shape": "sum",
            "total_rt_service_minutes": "sum",
        }
    )
    .reset_index()
)

In [13]:
agg1["vp_per_min_agency"] = ((agg1.total_vp / agg1.total_rt_service_minutes)).round(2)
agg1["spatial_accuracy_agency"] = ((agg1.vp_in_shape / agg1.total_vp) * 100).round(2)

In [14]:
agg1.sample().T

Unnamed: 0,32
caltrans_district,04 - Oakland
organization_name,Santa Clara Valley Transportation Authority
schedule_gtfs_dataset_key,fb467982dcc77a7f9199bebe709bb700
total_vp,778638
vp_in_shape,739031
total_rt_service_minutes,303399.17
vp_per_min_agency,2.57
spatial_accuracy_agency,94.91


In [15]:
agg1.head(2)

Unnamed: 0,caltrans_district,organization_name,schedule_gtfs_dataset_key,total_vp,vp_in_shape,total_rt_service_minutes,vp_per_min_agency,spatial_accuracy_agency
0,01 - Eureka,City of Eureka,a253a8d7acd57657bb98050f37dd6b0f,37981,18000,13102.61,2.9,47.39
1,01 - Eureka,Lake Transit Authority,0a3c0b21c85fb09f8db91599e14dd7f7,13320,12772,5433.32,2.45,95.89


### Functions 

In [16]:
def agency_metrics(analysis_date: str, dict_inputs: dict) -> pd.DataFrame:
    # start = datetime.datetime.now()

    ROUTE_EXPORT = dict_inputs.vp_route_direction_metrics
    AGENCY_EXPORT = dict_inputs.vp_agency_metrics

    # Read in dataframe.
    df = pd.read_parquet(f"{RT_SCHED_GCS}{ROUTE_EXPORT}_{analysis_date}.parquet")

    # Keep only all_day.
    df = df.loc[df["time_period"] == "all_day"].reset_index(drop=True)

    # Aggregate
    groupby_cols = [
        "caltrans_district",
        "organization_name",
        "schedule_gtfs_dataset_key",
    ]

    sum_cols = ["total_vp", "vp_in_shape", "total_rt_service_minutes"]
    agg1 = df.groupby(groupby_cols).agg({**{e: "sum" for e in sum_cols}}).reset_index()

    agg1["vp_per_min_agency"] = ((agg1.total_vp / agg1.total_rt_service_minutes)).round(
        2
    )
    agg1["spatial_accuracy_agency"] = ((agg1.vp_in_shape / agg1.total_vp) * 100).round(
        2
    )

    agg1 = agg1.drop(columns=sum_cols)
    # Save
    agg1.to_parquet(f"{RT_SCHED_GCS}{AGENCY_EXPORT}_TEST_{analysis_date}.parquet")

    # end = datetime.datetime.now()
    # logger.info(f"agency aggregation {analysis_date}: {end - start}")

    return agg1

In [17]:
analysis_date2 = rt_dates.DATES["apr2024"]

In [18]:
dict_inputs = GTFS_DATA_DICT.rt_vs_schedule_tables

In [19]:
apr_df = agency_metrics(
    analysis_date2,
    dict_inputs,
)

In [20]:
703396 / 862711

0.8153321332404478

In [21]:
apr_df.loc[apr_df.organization_name == "Alameda-Contra Costa Transit District"].T

Unnamed: 0,12
caltrans_district,04 - Oakland
organization_name,Alameda-Contra Costa Transit District
schedule_gtfs_dataset_key,c499f905e33929a641f083dad55c521e
vp_per_min_agency,2.02
spatial_accuracy_agency,81.53


In [22]:
apr_df.sample().T

Unnamed: 0,34
caltrans_district,04 - Oakland
organization_name,Sonoma-Marin Area Rail Transit District
schedule_gtfs_dataset_key,0881af3822466784992a49f1cc57d38f
vp_per_min_agency,3.01
spatial_accuracy_agency,99.61


### Look at the files