## Request for Aggregated Data by Agency
* <i>Would there be any way to present aggregated data at the agency level? I'd like to see (and, ideally, compare) spatial accuracy and VP per minute.</i>

In [2]:
import _operators_prep
import geopandas as gpd
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from shared_utils import catalog_utils, rt_dates, rt_utils
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS

# https://github.com/cal-itp/data-analyses/blob/main/_shared_utils/shared_utils/gtfs_analytics_data.yml
# Load the `gtfs_analytics_data` catalog.
# NOTE(review): the `from segment_speed_utils.project_vars import ...` statement that
# follows also imports `GTFS_DATA_DICT` and therefore rebinds this name, so the value
# assigned here does not survive the cell — confirm which source is intended.
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)

In [21]:
import altair as alt
import yaml

# Chart titles and subtitles, looked up by chart name when rendering headers.
# The encoding is given explicitly so parsing does not depend on the
# platform's locale default.
with open("readable.yml", encoding="utf-8") as f:
    readable_dict = yaml.safe_load(f)

# Color Palette: domain/range lists for each metric's color scale.
with open("color_palettes.yml", encoding="utf-8") as f:
    color_dict = yaml.safe_load(f)

import _report_utils
from IPython.display import HTML, Markdown, display, display_html

In [3]:
# Notebook-wide display settings: show up to 100 columns, two-decimal floats,
# and do not truncate rows or cell contents.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [6]:
# Columns retained from the schedule/VP digest table: the identifying fields
# (date, organization, district, route, direction, time period), the
# `sched_rt_category` flag used for filtering, and the count columns that
# the agency-level metrics are computed from.
cols_to_keep = [
    "service_date",
    "organization_name",
    "caltrans_district",
    "sched_rt_category",
    "route_long_name",
    "route_combined_name",
    "route_primary_direction",
    "time_period",
    "n_scheduled_trips",
    "total_vp",
    "total_rt_service_minutes",
    "vp_in_shape",
]

In [4]:
# Path to the route-level schedule/VP digest parquet, assembled from the catalog entry.
digest_tables = GTFS_DATA_DICT.digest_tables
schd_vp_url = f"{digest_tables.dir}{digest_tables.route_schedule_vp}.parquet"

In [5]:
# Read the parquet file at `schd_vp_url` into a DataFrame.
sched_vp_df = pd.read_parquet(schd_vp_url)

In [7]:
# Narrow the digest table to the columns listed in `cols_to_keep`.
sched_vp_df2 = sched_vp_df.loc[:, cols_to_keep]

In [8]:
# Mirror the portfolio: retain only rows categorized as "schedule_and_vp".
has_schedule_and_vp = sched_vp_df2["sched_rt_category"] == "schedule_and_vp"
sched_vp_df3 = sched_vp_df2[has_schedule_and_vp].reset_index(drop=True)

In [10]:
# Roll route-level rows up to one row per district / organization / service date,
# summing the three count columns.
# NOTE(review): the sum runs over every `time_period` value present; if the table
# also carries an all-day rollup alongside peak/off-peak rows, totals would be
# double counted — confirm.
agg1 = (
    sched_vp_df3.groupby(["caltrans_district", "organization_name", "service_date"])
    .agg(
        total_vp=("total_vp", "sum"),
        vp_in_shape=("vp_in_shape", "sum"),
        total_rt_service_minutes=("total_rt_service_minutes", "sum"),
    )
    .reset_index()
)

In [11]:
# Derive the two agency-level metrics from the summed counts:
#   spatial_accuracy = share of vehicle positions inside the shape, as a percentage
#   vp_per_min       = vehicle positions per RT service minute
agg1["spatial_accuracy"] = agg1["vp_in_shape"].div(agg1["total_vp"]).mul(100)
agg1["vp_per_min"] = agg1["total_vp"].div(agg1["total_rt_service_minutes"])

In [13]:
# Order rows by district, then organization, then date, and renumber the index.
sort_keys = ["caltrans_district", "organization_name", "service_date"]
agg1 = agg1.sort_values(sort_keys).reset_index(drop=True)

In [14]:
# Preview the first rows of the agency-level metrics.
agg1.head()

Unnamed: 0,caltrans_district,organization_name,service_date,total_vp,vp_in_shape,total_rt_service_minutes,spatial_accuracy,vp_per_min
0,01 - Eureka,Blue Lake Rancheria,2023-04-12,21250,20718,7756.46,97.5,2.74
1,01 - Eureka,Blue Lake Rancheria,2023-05-17,21190,20718,7779.04,97.77,2.72
2,01 - Eureka,Blue Lake Rancheria,2023-06-14,22502,21562,8198.7,95.82,2.74
3,01 - Eureka,Blue Lake Rancheria,2023-07-12,22006,21104,7957.42,95.9,2.77
4,01 - Eureka,Blue Lake Rancheria,2023-08-15,23260,22264,8588.9,95.72,2.71


### Match `organization_name` with what's on the portfolio

In [15]:
# Key columns shared by `agg1` and the operator table from `_operators_prep`;
# used both to subset that table and as the merge keys.
to_keep = [
    "organization_name",
    "caltrans_district",
]

In [16]:
# Operator table from `_operators_prep`, reduced to the merge keys
# (presumably the operators shown in the portfolio — confirm).
# Deduplicate so each (organization, district) pair appears exactly once: this
# frame only acts as a lookup in the merges with `agg1`, and a repeated key pair
# would multiply the matching `agg1` rows and inflate the merge counts.
sched_only_and_vp = (
    _operators_prep.operators_schd_vp_rt()[to_keep]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [17]:
# Diagnostic outer join: the `_merge` indicator shows which key pairs exist on
# each side.
m1 = agg1.merge(sched_only_and_vp, on=to_keep, how="outer", indicator=True)

In [18]:
# Tally rows by merge outcome (both / left_only / right_only).
m1["_merge"].value_counts()

both          1193
right_only      83
left_only       58
Name: _merge, dtype: int64

In [19]:
# Inner join: keep just the aggregated rows whose organization/district pair
# also appears in the operator table.
m2 = agg1.merge(sched_only_and_vp, on=to_keep, how="inner")

### Charts

In [28]:
def heatmap(
    df: pd.DataFrame,
    column: str,
    domain_color: list,
    range_color: list,
    max_y_axis: int,
) -> alt.Chart:
    """
    Build a date-by-organization heatmap of one metric.

    Args:
        df: rows to plot. The chart reads `service_date`, `organization_name`,
            `caltrans_district` and `column` from it. The title uses the
            `caltrans_district` of the first row, so callers are expected to
            pass rows for a single district.
        column: name of the quantitative column mapped to cell color.
        domain_color: values passed as the color scale's `domain`.
        range_color: colors passed as the color scale's `range`.
        max_y_axis: accepted so existing calls keep working; this function
            does not use it.

    Returns:
        A 600x300 `alt.Chart` of rectangles titled "District <district>".
    """
    # An empty frame has no first row to read the district from; fall back to a
    # placeholder title instead of raising IndexError.
    if len(df) > 0:
        district = df.caltrans_district.iloc[0]
    else:
        district = "(no data)"

    # Create color scale
    color_scale = alt.Scale(domain=domain_color, range=range_color)

    # One column of cells per service date (treated as ordinal, labelled "Mon YYYY").
    x_encoding = alt.X(
        "yearmonthdate(service_date):O",
        title="Date",
        axis=alt.Axis(labelAngle=-45, format="%b %Y"),
    )
    # One row of cells per organization.
    y_encoding = alt.Y(
        "organization_name:N",
        title="Organization Name",
        axis=alt.Axis(labelFontSize=9),
    )
    color_encoding = alt.Color(
        f"{column}:Q",
        title=_report_utils.labeling(column),
        scale=color_scale,
    )

    chart = (
        alt.Chart(df)
        .mark_rect()
        .encode(
            x=x_encoding,
            y=y_encoding,
            color=color_encoding,
            tooltip=["organization_name", "caltrans_district", column, "service_date"],
        )
        .properties(width=600, height=300, title=f"District {district}")
    )
    return chart

In [29]:
# Distinct districts present after the merge, in order of first appearance.
unique_districts = m2["caltrans_district"].unique().tolist()

In [36]:
# One vehicle-positions-per-minute heatmap per district.
charts = [
    heatmap(
        m2.loc[m2.caltrans_district == district],
        "vp_per_min",
        color_dict["vp_domain"],
        color_dict["vp_range"],
        3,  # positional `max_y_axis` argument of heatmap()
    )
    for district in unique_districts
]

# Section title and subtitle are taken from `readable_dict`.
vp_header = Markdown(
    f"""<h1 style="border-bottom: 3px solid #000;"><b>{readable_dict["vp_per_min_graph"]["title"]}</b></h1><br>
        {readable_dict["vp_per_min_graph"]["subtitle"]}
            """
)
display(vp_header)

# Stack the district charts vertically.
display(alt.vconcat(*charts))

<h1 style="border-bottom: 3px solid #000;"><b>Vehicle Positions per Minute</b></h1><br>
        Trips should have 2+ VPs per minute. This metric reflects the accuracy of the temporal data collected.
            

In [37]:
# One spatial-accuracy heatmap per district.
charts = [
    heatmap(
        m2.loc[m2.caltrans_district == district],
        "spatial_accuracy",
        color_dict["spatial_accuracy_domain"],
        color_dict["spatial_accuracy_range"],
        3,  # positional `max_y_axis` argument of heatmap()
    )
    for district in unique_districts
]

# Section title and subtitle are taken from `readable_dict`.
spatial_header = Markdown(
    f"""<h1 style="border-bottom: 3px solid #000;"><b>{readable_dict["spatial_accuracy_graph"]["title"]}</b></h1><br>
        {readable_dict["spatial_accuracy_graph"]["subtitle"]}
            """
)
display(spatial_header)

# Stack the district charts vertically.
display(alt.vconcat(*charts))

<h1 style="border-bottom: 3px solid #000;"><b>Spatial Accuracy</b></h1><br>
        The percentage of vehicle positions that fall within 35 meters of a route's scheduled shape (path) reflects the accuracy of the collected spatial data.
            