## Request for Aggregated Data.
* <i>Would there be any way to present aggregated data at the agency level? I'd like to see (and, ideally, compare) spatial accuracy and VP per minute.</i>

In [1]:
import _operators_prep
import geopandas as gpd
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from shared_utils import catalog_utils, rt_dates, rt_utils
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS

# https://github.com/cal-itp/data-analyses/blob/main/_shared_utils/shared_utils/gtfs_analytics_data.yml
# NOTE(review): GTFS_DATA_DICT is already imported from update_vars above and is
# imported again from segment_speed_utils.project_vars below, so the value assigned
# here is overwritten by that later import. Confirm which source is intended and
# keep only one binding.
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)

In [2]:
# Notebook-wide pandas display settings: show up to 100 columns, two-decimal
# floats, and do not truncate rows or cell contents.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [3]:
# Path to the route-level schedule + vehicle-position digest parquet, assembled
# from the `digest_tables` entries (`dir` and `route_schedule_vp`) of the catalog.
schd_vp_url = "{}{}.parquet".format(
    GTFS_DATA_DICT.digest_tables.dir,
    GTFS_DATA_DICT.digest_tables.route_schedule_vp,
)

In [4]:
# Load the full digest table; columns are narrowed in a later cell.
sched_vp_df = pd.read_parquet(path=schd_vp_url)

In [6]:
# Columns retained from the digest table, listed in the original order.
cols_to_keep = (
    # Keys the agency-level aggregation groups by.
    ["service_date", "organization_name", "caltrans_district"]
    # Column used to filter rows to "schedule_and_vp".
    + ["sched_rt_category"]
    # Route / time-period descriptors (kept, but not used by the aggregation).
    + [
        "route_long_name",
        "route_combined_name",
        "route_primary_direction",
        "time_period",
        "n_scheduled_trips",
    ]
    # Counts that are summed and then turned into ratios.
    + ["total_vp", "total_rt_service_minutes", "vp_in_shape"]
)

In [7]:
# Narrow the digest table to the columns listed in cols_to_keep.
sched_vp_df2 = sched_vp_df.loc[:, cols_to_keep]

In [8]:
# Keep only rows whose sched_rt_category is "schedule_and_vp", then renumber the index.
sched_vp_df3 = sched_vp_df2[
    sched_vp_df2["sched_rt_category"].eq("schedule_and_vp")
].reset_index(drop=True)

In [9]:
# Sanity check: list the columns that survived the subset and filter steps.
sched_vp_df3.columns

Index(['service_date', 'organization_name', 'caltrans_district',
       'sched_rt_category', 'route_long_name', 'route_combined_name',
       'route_primary_direction', 'time_period', 'n_scheduled_trips',
       'total_vp', 'total_rt_service_minutes', 'vp_in_shape'],
      dtype='object')

In [10]:
# Roll route-level rows up to one row per district / organization / service date.
# The raw counts are summed here; the ratios are derived from these sums afterwards.
agg1 = (
    sched_vp_df3.groupby(["caltrans_district", "organization_name", "service_date"])
    .agg(
        total_vp=("total_vp", "sum"),
        vp_in_shape=("vp_in_shape", "sum"),
        total_rt_service_minutes=("total_rt_service_minutes", "sum"),
    )
    .reset_index()
)

In [11]:
# spatial_accuracy = summed vp_in_shape divided by summed total_vp.
agg1["spatial_accuracy"] = agg1["vp_in_shape"].div(agg1["total_vp"])

In [12]:
# vp_per_min = summed total_vp divided by summed total_rt_service_minutes.
agg1["vp_per_min"] = agg1["total_vp"].div(agg1["total_rt_service_minutes"])

In [13]:
# Order rows by district, then organization, then service date, with a fresh 0..n-1 index.
agg1 = agg1.sort_values(
    ["caltrans_district", "organization_name", "service_date"],
    ignore_index=True,
)

### Need to harmonize names since they change over time.

In [14]:
# Columns used both to subset the operators table and as the merge keys.
to_keep = ["organization_name", "caltrans_district"]

In [15]:
# Reference list of (organization_name, caltrans_district) pairs from the operators prep module.
# NOTE(review): presumably one row per operator; the helper's output is not shown here —
# confirm it has no duplicate pairs, since it is used as the right side of a merge.
sched_only_and_vp = _operators_prep.operators_schd_vp_rt()[to_keep]

In [18]:
# Outer-join the aggregated metrics to the operators list on organization + district;
# the `_merge` indicator column records which side each row came from.
m1 = agg1.merge(
    sched_only_and_vp,
    on=to_keep,
    how="outer",
    indicator=True,
)

In [19]:
# Count rows matched on both sides vs. present on only one side of the merge.
m1["_merge"].value_counts()

both          1193
right_only      83
left_only       58
Name: _merge, dtype: int64

### SF is truly missing some values (e.g., the output below has no September 2023 row).

In [61]:
# Show every aggregated row for the City and County of San Francisco.
agg1[agg1["organization_name"].eq("City and County of San Francisco")]

Unnamed: 0,caltrans_district,organization_name,service_date,total_vp,vp_in_shape,total_rt_service_minutes,spatial_accuracy,vp_per_min
217,04 - Oakland,City and County of San Francisco,2023-03-15,2951218,2786820,1339078.6,0.94,2.2
218,04 - Oakland,City and County of San Francisco,2023-04-12,2940622,2766094,1334229.24,0.94,2.2
219,04 - Oakland,City and County of San Francisco,2023-05-17,2947296,2775222,1363559.04,0.94,2.16
220,04 - Oakland,City and County of San Francisco,2023-06-14,3009324,2817828,1330438.52,0.94,2.26
221,04 - Oakland,City and County of San Francisco,2023-07-12,3019158,2832900,1352234.12,0.94,2.23
222,04 - Oakland,City and County of San Francisco,2023-08-15,2995550,2797332,1349798.64,0.93,2.22
223,04 - Oakland,City and County of San Francisco,2023-10-11,3037422,2837534,1376785.96,0.93,2.21
224,04 - Oakland,City and County of San Francisco,2023-11-15,2776652,2515408,1313324.96,0.91,2.11
225,04 - Oakland,City and County of San Francisco,2023-12-13,3049096,2839148,1359915.4,0.93,2.24
226,04 - Oakland,City and County of San Francisco,2024-01-17,3005168,2803508,1338155.54,0.93,2.25


#### Charts

In [26]:
import altair as alt

In [62]:
def heatmap(df: pd.DataFrame, column: str) -> "alt.Chart":
    """
    Draw an organization-by-service-date heatmap colored by one metric.

    Args:
        df: rows to plot. The function reads `caltrans_district`,
            `organization_name`, `service_date` and `column`. The chart title
            uses the district of the first row, so callers are expected to
            pass rows for a single district.
        column: name of the quantitative column mapped to color.

    Returns:
        A 400x300 Altair rect chart titled "<column> for <district>".
        For an empty `df` the district in the title is "unknown district".
    """
    # An empty frame has no first row, so `.iloc[0]` would raise IndexError.
    # Callers can produce one by filtering on a NaN district, because NaN
    # never compares equal to itself.
    district = df.caltrans_district.iloc[0] if len(df) > 0 else "unknown district"

    chart = (
        alt.Chart(df)
        .mark_rect()
        .encode(
            y="organization_name:N",
            x="yearmonthdate(service_date):O",
            color=f"{column}:Q",
            tooltip=["organization_name", "caltrans_district", column, "service_date"],
        )
        .properties(width=400, height=300, title=f"{column} for {district}")
    )
    return chart

In [56]:
# NOTE(review): `m2` is not assigned in any cell visible in this notebook (the
# execution counts skip from 19 to 56+), so this fails on Restart & Run All.
# Confirm how `m2` is derived and restore that cell.
unique_districts = list(m2["caltrans_district"].unique())

In [63]:
# One vp_per_min heatmap per Caltrans district.
for district in unique_districts:
    filtered_df = m2[m2["caltrans_district"].eq(district)]
    display(heatmap(filtered_df, "vp_per_min"))

In [64]:
# One spatial_accuracy heatmap per Caltrans district.
for district in unique_districts:
    filtered_df = m2[m2["caltrans_district"].eq(district)]
    display(heatmap(filtered_df, "spatial_accuracy"))