## Request for Aggregated Data by Agency
* <i>would there be any way to present aggregated data at the agency level?  I'd like to see (and, ideally compare) spatial accuracy and VP per minute.</i>

In [1]:
import _aggregate_agency
import _operators_prep
import geopandas as gpd
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from shared_utils import catalog_utils, rt_dates, rt_utils
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS

# https://github.com/cal-itp/data-analyses/blob/main/_shared_utils/shared_utils/gtfs_analytics_data.yml
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)

In [2]:
import altair as alt
import yaml

with open("readable.yml") as f:
    readable_dict = yaml.safe_load(f)

# Color Palette
with open("color_palettes.yml") as f:
    color_dict = yaml.safe_load(f)

import _report_utils
from IPython.display import HTML, Markdown, display, display_html

In [3]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [14]:
og = _aggregate_agency.load_data()

In [15]:
agg1 = _aggregate_agency.aggregate_by_agency(og)

In [26]:
og.loc[og.organization_name.str.contains("Yuma")].head()

ValueError: Cannot mask with non-boolean array containing NA / NaN values

In [6]:
def harmonize_org_names() -> pd.DataFrame:

    # Find relevant operators
    to_keep = [
        "organization_name",
        "caltrans_district",
    ]

    sched_only_and_vp = _operators_prep.operators_schd_vp_rt()[to_keep]

    return sched_only_and_vp

In [7]:
op_names = harmonize_org_names()

In [17]:
schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"

schd_vp_df = pd.read_parquet(
    schd_vp_url,
    filters=[[("sched_rt_category", "in", ["schedule_and_vp", "schedule_only"])]],
    columns=[
        "schedule_gtfs_dataset_key",
        "caltrans_district",
        "organization_name",
        "name",
        "sched_rt_category",
        "service_date",
    ],
)

In [25]:
schd_vp_df.loc[schd_vp_df.caltrans_district == "11 - San Diego"][["caltrans_district","organization_name","name","service_date"]].drop_duplicates()

Unnamed: 0,caltrans_district,organization_name,name,service_date
13819,11 - San Diego,Yuma County Intergovernmental Public Transportation Authority,Yuma Schedule,2023-03-15
13820,11 - San Diego,Yuma County Intergovernmental Public Transportation Authority,Yuma Schedule,2023-04-12
13821,11 - San Diego,Yuma County Intergovernmental Public Transportation Authority,Yuma Schedule,2023-05-17
13822,11 - San Diego,Yuma County Intergovernmental Public Transportation Authority,Yuma Schedule,2023-06-14
13823,11 - San Diego,Yuma County Intergovernmental Public Transportation Authority,Yuma Schedule,2023-07-12
13824,11 - San Diego,Yuma County Intergovernmental Public Transportation Authority,Yuma Schedule,2023-08-15
13825,11 - San Diego,Yuma County Intergovernmental Public Transportation Authority,Yuma Schedule,2023-09-13
13826,11 - San Diego,Yuma County Intergovernmental Public Transportation Authority,Yuma Schedule,2023-10-11
13827,11 - San Diego,Yuma County Intergovernmental Public Transportation Authority,Yuma Schedule,2023-11-15
13828,11 - San Diego,Yuma County Intergovernmental Public Transportation Authority,Yuma Schedule,2023-12-13


In [23]:
schd_vp_df.loc[schd_vp_df.organization_name == "Flagship Cruises and Events Inc."][
    ["caltrans_district","organization_name","name","service_date"]
].drop_duplicates()

Unnamed: 0,caltrans_district,organization_name,name,service_date
14457,11 - San Diego,Flagship Cruises and Events Inc.,San Diego Schedule,2023-04-12
14459,11 - San Diego,Flagship Cruises and Events Inc.,San Diego Schedule,2023-06-14
108279,11 - San Diego,Flagship Cruises and Events Inc.,San Diego Schedule,2023-09-13
108280,11 - San Diego,Flagship Cruises and Events Inc.,San Diego Schedule,2023-10-11
108281,11 - San Diego,Flagship Cruises and Events Inc.,San Diego Schedule,2023-11-15
108282,11 - San Diego,Flagship Cruises and Events Inc.,San Diego Schedule,2023-12-13
108283,11 - San Diego,Flagship Cruises and Events Inc.,San Diego Schedule,2024-01-17
108284,11 - San Diego,Flagship Cruises and Events Inc.,San Diego Schedule,2024-02-14
108285,11 - San Diego,Flagship Cruises and Events Inc.,San Diego Schedule,2024-03-13
108286,11 - San Diego,Flagship Cruises and Events Inc.,San Diego Schedule,2024-04-17


In [22]:
schd_vp_df2 = (
    schd_vp_df.dropna(subset="caltrans_district")
    .sort_values(
        by=[
            "caltrans_district",
            "organization_name",
            "service_date",
        ],
        ascending=[True, True, False],
    )
    .drop_duplicates(
        subset=[
            "organization_name",
            "caltrans_district",
        ]
    )
    .reset_index(drop=True)
    )

In [9]:
op_names

Unnamed: 0,organization_name,caltrans_district
0,City of Eureka,01 - Eureka
1,Curry Public Transit,01 - Eureka
2,Lake Transit Authority,01 - Eureka
3,Mendocino Transit Authority,01 - Eureka
4,POINT,01 - Eureka
5,Redwood Coast Transit Authority,01 - Eureka
6,Lassen Transit Service Agency,02 - Redding
7,Modoc Transportation Agency,02 - Redding
8,Plumas Transit Systems,02 - Redding
9,Shasta County,02 - Redding


In [11]:
to_keep = [
    "organization_name",
    "caltrans_district",
]

In [12]:
# Merge for only rows found in both
m1 = pd.merge(agg1, op_names, on=to_keep, how="inner")

In [13]:
m1.loc[m1.organization_name == "Flagship Cruises and Events Inc."]

Unnamed: 0,caltrans_district,organization_name,service_date,total_vp,vp_in_shape,total_rt_service_minutes,spatial_accuracy,vp_per_min


### Match `organization_name` with what's on the portfolio

In [None]:
m2.columns = m2.columns.map(_report_utils.replace_column_names)

### Double Check Operators

In [None]:
m2.loc[
    (m2.Organization == "City and County of San Francisco") & (m2.Date == "2024-04-17")
]

In [None]:
sf_check = sched_vp_df.loc[
    (sched_vp_df.organization_name == "City and County of San Francisco")
    & (sched_vp_df.service_date == "2024-04-17")
]

In [None]:
sf_check.n_scheduled_trips.sum()

In [None]:
sf_check.total_vp.sum()

In [None]:
sf_check.vp_in_shape.sum()

In [None]:
sf_check.vp_in_shape.sum() / sf_check.total_vp.sum()

In [None]:
sf_check.total_vp.sum() / sf_check.total_rt_service_minutes.sum()

In [None]:
m2.loc[(m2.Organization == "City of Elk Grove") & (m2.Date == "2023-05-17")]

In [None]:
elkgrove_check = sched_vp_df.loc[
    (sched_vp_df.organization_name == "City of Elk Grove")
    & (sched_vp_df.service_date == "2023-05-17")
]

In [None]:
elkgrove_check.n_scheduled_trips.sum()

In [None]:
elkgrove_check.total_vp.sum()

In [None]:
elkgrove_check.vp_in_shape.sum()

In [None]:
elkgrove_check.total_vp.sum() / elkgrove_check.total_rt_service_minutes.sum()

In [None]:
elkgrove_check.vp_in_shape.sum() / elkgrove_check.total_vp.sum()

In [None]:
m2.loc[(m2.Organization == "City of Torrance") & (m2.Date == "2023-06-14")]

In [None]:
torrance_check = sched_vp_df.loc[
    (sched_vp_df.organization_name == "City of Torrance")
    & (sched_vp_df.service_date == "2023-06-14")
]

In [None]:
torrance_check.total_vp.sum()

In [None]:
torrance_check.vp_in_shape.sum()

In [None]:
torrance_check.total_vp.sum() / torrance_check.total_rt_service_minutes.sum()

In [None]:
torrance_check.vp_in_shape.sum() / torrance_check.total_vp.sum()

### Charts

In [None]:
unique_districts = list(m2.District.unique())

In [None]:
charts = []
for district in unique_districts:
    filtered_df = m2.loc[m2.District == district]
    chart = heatmap(
        filtered_df,
        "vp_per_min",
        color_dict["vp_domain"],
        color_dict["vp_range"],
        3,
    )
    charts.append(chart)
display(
    Markdown(
        f"""<h1 style="border-bottom: 3px solid #000;"><b>{readable_dict["vp_per_min_graph"]["title"]}</b></h1><br>
        {readable_dict["vp_per_min_graph"]["subtitle"]}
            """
    )
)
display(alt.vconcat(*charts))

In [None]:
charts = []
for district in unique_districts:
    filtered_df = m2.loc[m2.District == district]
    chart = heatmap(
        filtered_df,
        "spatial_accuracy",
        color_dict["spatial_accuracy_domain"],
        color_dict["spatial_accuracy_range"],
        3,
    )
    charts.append(chart)
display(
    Markdown(
        f"""<h1 style="border-bottom: 3px solid #000;"><b>{readable_dict["spatial_accuracy_graph"]["title"]}</b></h1><br>
        {readable_dict["spatial_accuracy_graph"]["subtitle"]}
            """
    )
)
display(alt.vconcat(*charts))