# For certain metrics, aggregate them to quarterly in `merge_data`

* Tiffany's Teams message:
  > You can do it within `merge_data` for now; it'll be the quickest way to get your quarterly averages.
  > `merge_data` is simply concatenating schedule, summary speeds, and rt vs schedule together, by route-direction-peak/offpeak for a single day. You can take that and aggregate several days into quarterly.
  >
  > Here's the script for `average_segment_speeds` that mocks up aggregating across single days using weighted averages. You'll see how some of the functions are put together, specifically tagging the quarter-year / weekday or weekend, and using weighted averages (with `n_trips`) to weight the metric column: https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/scripts/average_segment_speeds.py
* Metrics to Roll Up
    * Avg scheduled minute
    * VP per minute
    * Spatial Accuracy

In [1]:
import _report_utils
import _section1_utils as section1
import _section2_utils as section2
import geopandas as gpd
import merge_data
import numpy as np
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers, segment_calcs
from segment_speed_utils.project_vars import COMPILED_CACHED_VIEWS, PROJECT_CRS
from shared_utils import (
    catalog_utils,
    portfolio_utils,
    rt_dates,
    rt_utils,
    time_helpers,
)
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS

# Data Dictionary
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
analysis_date_list = [rt_dates.DATES["feb2025"]]

In [4]:
analysis_date = rt_dates.DATES["feb2025"]

In [5]:
schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"

In [6]:
sdi = "San Diego Metropolitan Transit System"

In [7]:
marin = "Marin County Transit District"

In [8]:
df = pd.read_parquet(schd_vp_url)

In [9]:
df["quarter"] = pd.PeriodIndex(df.service_date, freq="Q").astype("str")

## Edit function because this will be published publicly & all the metrics/columns should be available.

In [10]:
sd_df = df.loc[df.organization_name == sdi]

In [11]:
sd_df.shape

(13025, 47)

In [54]:
sd_df[["rt_sched_journey_ratio"]].head()

Unnamed: 0,rt_sched_journey_ratio
61970,1.5
61973,0.94
61975,0.86
61979,1.47
61982,1.47


In [55]:
sd_df.columns

Index(['schedule_gtfs_dataset_key', 'direction_id', 'time_period',
       'avg_scheduled_service_minutes', 'avg_stop_miles', 'n_scheduled_trips',
       'frequency', 'is_express', 'is_rapid', 'is_rail', 'is_coverage',
       'is_downtown_local', 'is_local', 'service_date', 'typology',
       'minutes_atleast1_vp', 'minutes_atleast2_vp',
       'total_rt_service_minutes', 'total_scheduled_service_minutes',
       'total_vp', 'vp_in_shape', 'is_early', 'is_ontime', 'is_late',
       'n_vp_trips', 'vp_per_minute', 'pct_in_shape',
       'pct_rt_journey_atleast1_vp', 'pct_rt_journey_atleast2_vp',
       'pct_sched_journey_atleast1_vp', 'pct_sched_journey_atleast2_vp',
       'rt_sched_journey_ratio', 'avg_rt_service_minutes', 'sched_rt_category',
       'speed_mph', 'route_long_name', 'route_short_name',
       'route_combined_name', 'route_id', 'base64_url',
       'organization_source_record_id', 'organization_name',
       'caltrans_district', 'route_primary_direction', 'name',
       '

In [12]:
schd_metric_cols = [
    "avg_scheduled_service_minutes",
    "avg_stop_miles",
    "frequency",
    "total_scheduled_service_minutes",
    "rt_sched_journey_ratio",
]

In [13]:
rt_metric_cols = [
    "minutes_atleast1_vp",
    "minutes_atleast2_vp",
    "total_rt_service_minutes",
    "total_vp",
    "vp_in_shape",
    "vp_per_minute",
    "avg_rt_service_minutes",
    "speed_mph",
]

In [31]:
rt_metric_timeliness = [
    "is_early",
    "is_ontime",
    "is_late",
]

In [14]:
cols_not_grouped = [
    "base64_url",
    "organization_source_record_id",
    "organization_name",
    "caltrans_district",
    "route_primary_direction",
    "name",
    "schedule_source_record_id",
    "route_long_name",
    "route_short_name",
    "route_combined_name",
    "typology",
    "is_express",
    "is_rapid",
    "is_rail",
    "is_coverage",
    "is_downtown_local",
    "is_local",
    "sched_rt_category",
]

In [15]:
group_cols = [
    "quarter",
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
    "time_period",
]

In [16]:
sd_df.sched_rt_category.unique()

['schedule_and_vp', 'schedule_only']
Categories (3, object): ['schedule_only', 'vp_only', 'schedule_and_vp']

In [18]:
sd_df.drop(columns=cols_not_grouped).head()

Unnamed: 0,schedule_gtfs_dataset_key,direction_id,time_period,avg_scheduled_service_minutes,avg_stop_miles,n_scheduled_trips,frequency,service_date,minutes_atleast1_vp,minutes_atleast2_vp,total_rt_service_minutes,total_scheduled_service_minutes,total_vp,vp_in_shape,is_early,is_ontime,is_late,n_vp_trips,vp_per_minute,pct_in_shape,pct_rt_journey_atleast1_vp,pct_rt_journey_atleast2_vp,pct_sched_journey_atleast1_vp,pct_sched_journey_atleast2_vp,rt_sched_journey_ratio,avg_rt_service_minutes,speed_mph,route_id,quarter
61970,1adf7a7bde86b42ed014f6de74c7132e,0.0,all_day,62.04,0.17,50,2.08,2023-04-12,3292,2373,4651.7,3102.0,5907,5818,2,31,17,50,1.27,0.98,0.71,0.51,1.0,0.77,1.5,93.03,11.1,1,2023Q2
61973,1adf7a7bde86b42ed014f6de74c7132e,0.0,all_day,62.04,0.17,50,2.08,2023-05-17,388,207,404.32,429.0,603,573,1,6,0,7,1.49,0.95,0.96,0.51,0.9,0.48,0.94,57.76,10.84,1,2023Q2
61975,1adf7a7bde86b42ed014f6de74c7132e,0.0,all_day,62.04,0.17,50,2.08,2023-06-14,500,256,527.61,614.0,765,758,5,3,2,10,1.45,0.99,0.95,0.48,0.81,0.42,0.86,52.76,12.59,1,2023Q2
61979,1adf7a7bde86b42ed014f6de74c7132e,0.0,all_day,62.04,0.17,50,2.08,2023-07-12,3175,1607,4545.07,3102.0,4822,4711,4,38,8,50,1.06,0.98,0.7,0.35,1.0,0.52,1.47,90.9,11.23,1,2023Q3
61982,1adf7a7bde86b42ed014f6de74c7132e,0.0,all_day,62.04,0.17,50,2.08,2023-08-15,3207,1662,4574.6,3102.0,4902,4755,4,34,12,50,1.07,0.97,0.7,0.36,1.0,0.54,1.47,91.49,10.93,1,2023Q3


### Need to re-calculate the % columns and `rt_sched_journey_ratio` after aggregating
### Need to simply group and sum `is_early`, `is_ontime`, `is_late`

In [35]:
segment_calcs.calculate_weighted_averages??

[0;31mSignature:[0m
[0msegment_calcs[0m[0;34m.[0m[0mcalculate_weighted_averages[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mdf[0m[0;34m:[0m [0mpandas[0m[0;34m.[0m[0mcore[0m[0;34m.[0m[0mframe[0m[0;34m.[0m[0mDataFrame[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mgroup_cols[0m[0;34m:[0m [0mlist[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmetric_cols[0m[0;34m:[0m [0mlist[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mweight_col[0m[0;34m:[0m [0mstr[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
[0;32mdef[0m [0mcalculate_weighted_averages[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mdf[0m[0;34m:[0m [0mpd[0m[0;34m.[0m[0mDataFrame[0m[0;34m,[0m [0;34m[0m
[0;34m[0m    [0mgroup_cols[0m[0;34m:[0m [0mlist[0m[0;34m,[0m [0;34m[0m
[0;34m[0m    [0mmetric_cols[0m[0;34m:[0m [0mlist[0m[0;34m,[0m [0;34m[0m
[0;34m[0m    [0mweight_col[0m[0;34m:[0m [0mstr[0m[0;34m[0m
[

In [20]:
# Quarterly weighted averages of the RT metrics, weighted by trips with VP.
# Pass a copy: calculate_weighted_averages assigns `metric * weight` back onto
# the frame it receives, and sd_df is read again by later cells.
sd_rt_metrics = segment_calcs.calculate_weighted_averages(
    df=sd_df.copy(),
    group_cols=group_cols,
    metric_cols=rt_metric_cols,
    weight_col="n_vp_trips",
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[c] = df[c] * df[weight_col]


In [23]:
sd_rt_metrics.loc[
    (sd_rt_metrics.quarter == "2023Q2")
    & (sd_rt_metrics.route_id == "1")
    & (sd_rt_metrics.direction_id == 0)
]

Unnamed: 0,quarter,schedule_gtfs_dataset_key,route_id,direction_id,time_period,minutes_atleast1_vp,minutes_atleast2_vp,total_rt_service_minutes,total_vp,vp_in_shape,is_early,is_ontime,is_late,vp_per_minute,avg_rt_service_minutes,speed_mph,n_vp_trips
547,2023Q2,1adf7a7bde86b42ed014f6de74c7132e,1,0.0,all_day,2571.88,1830.73,3592.41,4585.39,4514.79,2.34,24.21,12.99,1.32,83.33,11.3,67
548,2023Q2,1adf7a7bde86b42ed014f6de74c7132e,1,0.0,offpeak,1338.95,943.5,2319.24,2378.61,2311.47,2.45,8.84,9.61,1.14,97.4,11.6,38
549,2023Q2,1adf7a7bde86b42ed014f6de74c7132e,1,0.0,peak,1244.83,895.55,1235.17,2227.34,2227.03,0.0,15.69,3.17,1.74,64.91,10.95,29


In [27]:
# Check minutes_atleast1_vp by hand for route 1, direction 0, peak, 2023Q2:
# each day's value weighted by that day's n_vp_trips (23, 3, 3).
(1523 * 23 + 188 * 3 + 169 * 3) / (23 + 3 + 3)

1244.8275862068965

In [29]:
# Check vp_in_shape by hand, weighted by n_vp_trips (23, 3, 3).
# NOTE(review): the third term uses 296, but the 2023-06-14 peak row shows
# vp_in_shape = 256; with 256 this comes to ~2227.03, which is what the
# rollup reports.
((2736 * 23) + (296 * 3) + (296 * 3)) / (23 + 3 + 3)

2231.1724137931033

In [52]:
# total_rt_service_minutes: the rollup reports 1235.17
# NOTE(review): the expression below repeats the vp_in_shape check rather than
# using the peak rows' total_rt_service_minutes (1508.65, 191.41, 182.28);
# weighting those by n_vp_trips (23, 3, 3) gives ~1235.17.
((2736 * 23) + (296 * 3) + (296 * 3)) / (23 + 3 + 3)

2231.1724137931033

### 

In [32]:
# VP per min, weighted by n_vp_trips for the three peak rows.
# NOTE(review): the denominator ends in 6, but the weights used in the
# numerator are 23, 3 and 3; with (23 + 3 + 3) this comes to ~1.74, which is
# what the rollup reports.
(1.81 * 23 + 1.55 * 3 + 1.42 * 3) / (23 + 3 + 6)

1.579375

In [36]:
one_route_one_qtr = sd_df.loc[
    (sd_df.quarter == "2023Q2")
    & (sd_df.route_id == "1")
    & (sd_df.direction_id == 0)
    & (sd_df.time_period == "peak")
]

In [46]:
one_route_one_qtr

Unnamed: 0,schedule_gtfs_dataset_key,direction_id,time_period,avg_scheduled_service_minutes,avg_stop_miles,n_scheduled_trips,frequency,is_express,is_rapid,is_rail,is_coverage,is_downtown_local,is_local,service_date,typology,minutes_atleast1_vp,minutes_atleast2_vp,total_rt_service_minutes,total_scheduled_service_minutes,total_vp,vp_in_shape,is_early,is_ontime,is_late,n_vp_trips,vp_per_minute,pct_in_shape,pct_rt_journey_atleast1_vp,pct_rt_journey_atleast2_vp,pct_sched_journey_atleast1_vp,pct_sched_journey_atleast2_vp,rt_sched_journey_ratio,avg_rt_service_minutes,sched_rt_category,speed_mph,route_long_name,route_short_name,route_combined_name,route_id,base64_url,organization_source_record_id,organization_name,caltrans_district,route_primary_direction,name,schedule_source_record_id,quarter
62000,1adf7a7bde86b42ed014f6de74c7132e,0.0,peak,62.04,0.17,23,0.96,0.0,1.0,0.0,0.0,1.0,0.0,2023-04-12,rapid,1523,1104,1508.65,1487.0,2736,2736,0,19,4,23,1.81,1.0,1.0,0.73,1.0,0.74,1.01,65.59,schedule_and_vp,10.75,Fashion Valley - La Mesa,1,1 Fashion Valley - La Mesa,1,aHR0cHM6Ly93d3cuc2RtdHMuY29tL2dvb2dsZV90cmFuc2l0X2ZpbGVzL2dvb2dsZV90cmFuc2l0LnppcA==,recZALk4vysuoTVjF,San Diego Metropolitan Transit System,11 - San Diego,Eastbound,San Diego Schedule,recfZ9iWkptccoONX,2023Q2
62003,1adf7a7bde86b42ed014f6de74c7132e,0.0,peak,62.04,0.17,23,0.96,0.0,1.0,0.0,0.0,1.0,0.0,2023-05-17,rapid,188,107,191.41,195.0,296,296,0,3,0,3,1.55,1.0,0.98,0.56,0.96,0.55,0.98,63.8,schedule_and_vp,10.65,Fashion Valley - La Mesa,1,1 Fashion Valley - La Mesa,1,aHR0cHM6Ly93d3cuc2RtdHMuY29tL2dvb2dsZV90cmFuc2l0X2ZpbGVzL2dvb2dsZV90cmFuc2l0LnppcA==,recZALk4vysuoTVjF,San Diego Metropolitan Transit System,11 - San Diego,Eastbound,San Diego Schedule,recfZ9iWkptccoONX,2023Q2
62005,1adf7a7bde86b42ed014f6de74c7132e,0.0,peak,62.04,0.17,23,0.96,0.0,1.0,0.0,0.0,1.0,0.0,2023-06-14,rapid,169,86,182.28,179.0,259,256,0,3,0,3,1.42,0.99,0.93,0.47,0.94,0.48,1.02,60.76,schedule_and_vp,12.75,Fashion Valley - La Mesa,1,1 Fashion Valley - La Mesa,1,aHR0cHM6Ly93d3cuc2RtdHMuY29tL2dvb2dsZV90cmFuc2l0X2ZpbGVzL2dvb2dsZV90cmFuc2l0LnppcA==,recZALk4vysuoTVjF,San Diego Metropolitan Transit System,11 - San Diego,Eastbound,San Diego Schedule,recfZ9iWkptccoONX,2023Q2


In [49]:
# Manual cross-check on the single route / quarter / peak slice:
# plain per-group sums of two metrics and of the trip count.
# Nothing is multiplied by n_vp_trips before summing.
agg1 = (
    one_route_one_qtr.groupby(group_cols)
    .agg({c: "sum" for c in ["vp_per_minute", "vp_in_shape"] + ["n_vp_trips"]})
    .reset_index()
)

In [50]:
# Divide each summed metric by the summed trip count.
# NOTE(review): the inputs were summed without being weighted first, so this
# is (sum of daily values) / (total trips), not a weighted average -- it will
# not line up with calculate_weighted_averages.
for c in ["vp_per_minute", "vp_in_shape"]:
        agg1[c] = agg1[c].divide(agg1["n_vp_trips"]).round(2)

In [51]:
agg1

Unnamed: 0,quarter,schedule_gtfs_dataset_key,route_id,direction_id,time_period,vp_per_minute,vp_in_shape,n_vp_trips
0,2023Q2,1adf7a7bde86b42ed014f6de74c7132e,1,0.0,peak,0.16,113.38,29


In [56]:
# Quarterly weighted averages of the schedule metrics, weighted by the
# number of scheduled trips.
# Pass a copy: calculate_weighted_averages assigns `metric * weight` back onto
# the frame it receives, and sd_df has already been through one rollup above.
sd_schd_metrics = segment_calcs.calculate_weighted_averages(
    df=sd_df.copy(),
    group_cols=group_cols,
    metric_cols=schd_metric_cols,
    weight_col="n_scheduled_trips",
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[c] = df[c] * df[weight_col]


In [57]:
sd_schd_metrics.head(3)

Unnamed: 0,quarter,schedule_gtfs_dataset_key,route_id,direction_id,time_period,avg_scheduled_service_minutes,avg_stop_miles,n_scheduled_trips,frequency,total_scheduled_service_minutes,pct_sched_journey_atleast1_vp,pct_sched_journey_atleast2_vp,rt_sched_journey_ratio
0,2023Q1,9707f17da050a601ceda7143d58fa9a2,1,0.0,all_day,1.24,0.0,1.0,5200.0,7615000.0,2500.0,1922.5,3775.0
1,2023Q1,9707f17da050a601ceda7143d58fa9a2,1,0.0,offpeak,2.3,0.01,1.0,816.48,1177335.0,729.0,605.07,1428.84
2,2023Q1,9707f17da050a601ceda7143d58fa9a2,1,0.0,peak,2.7,0.01,1.0,507.84,756999.0,498.85,370.83,529.0


In [58]:
one_route_one_qtr

Unnamed: 0,schedule_gtfs_dataset_key,direction_id,time_period,avg_scheduled_service_minutes,avg_stop_miles,n_scheduled_trips,frequency,is_express,is_rapid,is_rail,is_coverage,is_downtown_local,is_local,service_date,typology,minutes_atleast1_vp,minutes_atleast2_vp,total_rt_service_minutes,total_scheduled_service_minutes,total_vp,vp_in_shape,is_early,is_ontime,is_late,n_vp_trips,vp_per_minute,pct_in_shape,pct_rt_journey_atleast1_vp,pct_rt_journey_atleast2_vp,pct_sched_journey_atleast1_vp,pct_sched_journey_atleast2_vp,rt_sched_journey_ratio,avg_rt_service_minutes,sched_rt_category,speed_mph,route_long_name,route_short_name,route_combined_name,route_id,base64_url,organization_source_record_id,organization_name,caltrans_district,route_primary_direction,name,schedule_source_record_id,quarter
62000,1adf7a7bde86b42ed014f6de74c7132e,0.0,peak,62.04,0.17,23,0.96,0.0,1.0,0.0,0.0,1.0,0.0,2023-04-12,rapid,1523,1104,1508.65,1487.0,2736,2736,0,19,4,23,1.81,1.0,1.0,0.73,1.0,0.74,1.01,65.59,schedule_and_vp,10.75,Fashion Valley - La Mesa,1,1 Fashion Valley - La Mesa,1,aHR0cHM6Ly93d3cuc2RtdHMuY29tL2dvb2dsZV90cmFuc2l0X2ZpbGVzL2dvb2dsZV90cmFuc2l0LnppcA==,recZALk4vysuoTVjF,San Diego Metropolitan Transit System,11 - San Diego,Eastbound,San Diego Schedule,recfZ9iWkptccoONX,2023Q2
62003,1adf7a7bde86b42ed014f6de74c7132e,0.0,peak,62.04,0.17,23,0.96,0.0,1.0,0.0,0.0,1.0,0.0,2023-05-17,rapid,188,107,191.41,195.0,296,296,0,3,0,3,1.55,1.0,0.98,0.56,0.96,0.55,0.98,63.8,schedule_and_vp,10.65,Fashion Valley - La Mesa,1,1 Fashion Valley - La Mesa,1,aHR0cHM6Ly93d3cuc2RtdHMuY29tL2dvb2dsZV90cmFuc2l0X2ZpbGVzL2dvb2dsZV90cmFuc2l0LnppcA==,recZALk4vysuoTVjF,San Diego Metropolitan Transit System,11 - San Diego,Eastbound,San Diego Schedule,recfZ9iWkptccoONX,2023Q2
62005,1adf7a7bde86b42ed014f6de74c7132e,0.0,peak,62.04,0.17,23,0.96,0.0,1.0,0.0,0.0,1.0,0.0,2023-06-14,rapid,169,86,182.28,179.0,259,256,0,3,0,3,1.42,0.99,0.93,0.47,0.94,0.48,1.02,60.76,schedule_and_vp,12.75,Fashion Valley - La Mesa,1,1 Fashion Valley - La Mesa,1,aHR0cHM6Ly93d3cuc2RtdHMuY29tL2dvb2dsZV90cmFuc2l0X2ZpbGVzL2dvb2dsZV90cmFuc2l0LnppcA==,recZALk4vysuoTVjF,San Diego Metropolitan Transit System,11 - San Diego,Eastbound,San Diego Schedule,recfZ9iWkptccoONX,2023Q2


## All Charts

In [None]:
# display(section2.filtered_route(df))

## Subset df

In [None]:
# Filter out to all day
all_day = df.loc[df.Period == "all_day"].reset_index(drop=True)

In [None]:
all_day.sample()

### The results are really off for `Average VP per Minute`

In [None]:
def quarterly_rollup(all_day: pd.DataFrame) -> pd.DataFrame:
    """
    Using the dataframe that houses only all_day values,
    roll up months to each quarter for certain metrics.

    Each metric becomes an average weighted by "# Trips with VP",
    grouped by quarter / Organization / Route / dir_0_1 / Direction.

    Side effect kept on purpose: a "quarter" column is written onto
    `all_day` (other cells group on it). The metric columns of `all_day`
    are NOT modified.

    Returns one row per group with the weighted metrics and the summed
    "# Trips with VP".
    """
    # Turn date to quarters
    all_day["quarter"] = pd.PeriodIndex(all_day.Date, freq="Q").astype("str")

    group_cols = [
        "quarter",
        "Organization",
        "Route",
        "dir_0_1",
        "Direction",
    ]
    # NOTE(review): the scheduled-service metric is weighted by trips with VP
    # here, not by scheduled trips -- confirm that is intended.
    metric_cols = [
        "Average VP per Minute",
        "% VP within Scheduled Shape",
        "Average Scheduled Service (trip minutes)",
        "ruler_100_pct",
        "ruler_for_vp_per_min",
    ]

    # Hand over a copy: the helper overwrites every metric column with
    # `metric * weight` on the frame it is given, which would otherwise leave
    # `all_day` holding inflated values for every later chart or rollup.
    quarterly_metrics = segment_calcs.calculate_weighted_averages(
        df=all_day.copy(),
        group_cols=group_cols,
        metric_cols=metric_cols,
        weight_col="# Trips with VP",
    )
    return quarterly_metrics

In [None]:
all_metrics_test = quarterly_rollup(all_day)

## Test charts

In [None]:
import yaml

with open("readable.yml") as f:
    readable_dict = yaml.safe_load(f)

In [None]:
# Color Palette
with open("color_palettes.yml") as f:
    color_dict = yaml.safe_load(f)

In [None]:
all_metrics_test.sample(3)

In [None]:
all_metrics_test.info()

In [None]:
# Charts
import altair as alt

alt.data_transformers.enable("default", max_rows=None)

In [None]:
def grouped_bar_chart(
    df: pd.DataFrame,
    color_col: str,
    y_col: str,
    offset_col: str,
    title: str,
    subtitle: str,
    range_color: list,
    quarter: bool = False,
) -> alt.Chart:
    """
    Grouped bar chart of `y_col`, with bars offset by `offset_col`
    and colored by `color_col`.

    Args:
        df: data to plot. The tooltip reads "Route", "Direction",
            `color_col` and `y_col`, plus "quarter" when quarter=True
            or "Period" and "Date" when quarter=False.
        color_col: nominal column that drives bar color.
        y_col: quantitative column for bar height.
        offset_col: nominal column used to offset bars within one x position.
        title: chart title.
        subtitle: chart subtitle.
        range_color: colors for the color scale.
        quarter: True plots against the "quarter" column with `df` as given;
            False plots against "Date" after passing `df` through
            section2.clean_data_charts.

    Returns:
        A 400 x 250 alt.Chart with title and subtitle set.
    """
    tooltip_cols = [
        "Route",
        "Direction",
        color_col,
        y_col,
    ]

    # Only the x-axis, the tooltip and the cleaning step differ by mode.
    if quarter:
        tooltip_cols.append("quarter")
        x_encoding = alt.X(
            "quarter",
            title=["Quarter"],
            axis=alt.Axis(labelAngle=-45),
        )
    else:
        tooltip_cols.append("Period")
        tooltip_cols.append("Date")
        # Clean dataframe. The helper lives in section2; a bare
        # `clean_data_charts` is not defined in this notebook.
        df = section2.clean_data_charts(df, y_col)
        x_encoding = alt.X(
            "yearmonthdate(Date):O",
            title=["Date"],
            axis=alt.Axis(labelAngle=-45, format="%b %Y"),
        )

    chart = (
        alt.Chart(df)
        .mark_bar(size=5)
        .encode(
            x=x_encoding,
            y=alt.Y(f"{y_col}:Q", title=_report_utils.labeling(y_col)),
            # NOTE(review): alt.XOffset is the channel class named after this
            # encoding -- confirm whether alt.X is intentional here.
            xOffset=alt.X(
                f"{offset_col}:N", title=_report_utils.labeling(offset_col)
            ),
            color=alt.Color(
                f"{color_col}:N",
                title=_report_utils.labeling(color_col),
                scale=alt.Scale(range=range_color),
            ),
            tooltip=tooltip_cols,
        )
    )

    chart = chart.properties(
        title={
            "text": title,
            "subtitle": [subtitle],
        },
        width=400,
        height=250,
    )

    return chart

In [None]:
all_metrics_test.columns

In [None]:
# Create dropdown
routes_list = all_metrics_test["Route"].unique().tolist()

route_dropdown = alt.binding_select(
    options=routes_list,
    name="Routes: ",
)
# Column that controls the bar charts
xcol_param = alt.selection_point(
    fields=["Route"], value=routes_list[0], bind=route_dropdown
)

### Average Scheduled Service (trip minutes)

In [None]:
(
    grouped_bar_chart(
        df=all_metrics_test,
        color_col="Direction",
        y_col="Average Scheduled Service (trip minutes)",
        offset_col="Direction",
        title=readable_dict["avg_scheduled_min_graph"]["title"],
        subtitle=readable_dict["avg_scheduled_min_graph"]["subtitle"],
        range_color=color_dict["four_colors"],
        quarter=True,
    )
).add_params(xcol_param).transform_filter(xcol_param)

### % VP within Scheduled Shape

In [None]:
def base_facet_with_ruler_chart(
    df: pd.DataFrame,
    y_col: str,
    ruler_col: str,
    title: str,
    subtitle: str,
    domain_color: list,
    range_color: list,
    quarter: bool = False,
) -> alt.Chart:
    """
    Bar chart of `y_col` with a dashed red reference line, faceted into
    one column per "Direction".

    Args:
        df: data to plot. The tooltip reads "Route", "Direction" and
            `y_col`, plus "quarter" when quarter=True or "Period" and
            "Date" when quarter=False.
        y_col: quantitative column for bar height; also drives bar color.
        ruler_col: column whose mean is drawn as the horizontal rule.
        title: chart title.
        subtitle: chart subtitle.
        domain_color: domain of the color scale.
        range_color: colors of the color scale.
        quarter: True plots against the "quarter" column; False plots
            against "Date".

    Returns:
        The faceted alt.Chart with title and subtitle set.
    """
    tooltip_cols = [
        "Route",
        "Direction",
        y_col,
    ]

    # Set y-axis
    max_y = section2.set_y_axis(df, y_col)

    # Clean dataframe
    df = section2.clean_data_charts(df, y_col)

    # Create color scale
    color_scale = alt.Scale(domain=domain_color, range=range_color)

    # Create ruler
    ruler = (
        alt.Chart(df)
        .mark_rule(color="red", strokeDash=[10, 7])
        .encode(y=f"mean({ruler_col}):Q")
    )

    # Only the x-axis and the tooltip differ between the two modes.
    if quarter:
        tooltip_cols.append("quarter")
        x_encoding = alt.X(
            "quarter",
            title="Quarter",
            axis=alt.Axis(labelAngle=-45),
        )
    else:
        tooltip_cols.append("Period")
        tooltip_cols.append("Date")
        x_encoding = alt.X(
            "yearmonthdate(Date):O",
            # This function has no `x_col`; label the axis "Date" like the
            # other date-based charts in this notebook.
            title=["Date"],
            axis=alt.Axis(labelAngle=-45, format="%b %Y"),
        )

    chart = (
        alt.Chart(df)
        .mark_bar(size=7, clip=True)
        .encode(
            x=x_encoding,
            y=alt.Y(
                f"{y_col}:Q",
                title=_report_utils.labeling(y_col),
                scale=alt.Scale(domain=[0, max_y]),
            ),
            color=alt.Color(
                f"{y_col}:Q",
                title=_report_utils.labeling(y_col),
                scale=color_scale,
            ),
            # Selecting the columns first raises a KeyError if one is missing.
            tooltip=df[tooltip_cols].columns.tolist(),
        )
    )

    # All charts: layer the rule over the bars, then facet by direction.
    chart = (chart + ruler).properties(width=200, height=250)
    chart = chart.facet(
        column=alt.Column(
            "Direction:N",
        )
    ).properties(
        title={
            "text": title,
            "subtitle": [subtitle],
        }
    )

    return chart

In [None]:
color_dict["spatial_accuracy_range"]

In [None]:
base_facet_with_ruler_chart(
    df=all_metrics_test,
    y_col="% VP within Scheduled Shape",
    ruler_col="ruler_100_pct",
    title=readable_dict["spatial_accuracy_graph"]["title"],
    subtitle=readable_dict["spatial_accuracy_graph"]["subtitle"],
    domain_color=color_dict["spatial_accuracy_domain"],
    range_color=color_dict["spatial_accuracy_range"],
    quarter=True,
).add_params(xcol_param).transform_filter(xcol_param)

### Average VP per Minute

In [None]:
(
    (
        base_facet_with_ruler_chart(
            all_metrics_test,
            "Average VP per Minute",
            "ruler_for_vp_per_min",
            readable_dict["vp_per_min_graph"]["title"],
            readable_dict["vp_per_min_graph"]["subtitle"],
            color_dict["vp_domain"],
            color_dict["vp_range"],
            quarter=True,
        )
    )
    .add_params(xcol_param)
    .transform_filter(xcol_param)
)

In [None]:
all_metrics_test.loc[
    (all_metrics_test.Organization == "San Diego Metropolitan Transit System")
    & (all_metrics_test["Route"] == "1 Fashion Valley - La Mesa")
    & (all_metrics_test.quarter == "2023Q3")
]

### % of Scheduled Trip Journey chart is messed up due to the rulers

In [None]:
df.head(1)

In [None]:
sched_journey_vp = section2.pct_vp_journey(
    all_day,
    "% Scheduled Trip w/ 1+ VP/Minute",
    "% Scheduled Trip w/ 2+ VP/Minute",
)

In [None]:
sched_journey_vp.shape

In [None]:
sched_journey_vp.ruler_100_pct.describe()

In [None]:
sched_journey_vp.head(1)

In [None]:
(
    section2.base_facet_circle(
        sched_journey_vp,
        "% of Actual Trip Minutes",
        "Category",
        "ruler_100_pct",
        readable_dict["sched_vp_per_min_graph"]["title"],
        readable_dict["sched_vp_per_min_graph"]["subtitle"],
        color_dict["tri_color"],
    )
    .add_params(xcol_param)
    .transform_filter(xcol_param)
)

### Change Total Scheduled Trips -> the graph is cutting off in a strange fashion.

In [None]:
peak_offpeak_df = df.loc[df["Period"] != "all_day"].reset_index(drop=True)

In [None]:
def rollup_schd_qtr(peak_offpeak_df: pd.DataFrame) -> pd.DataFrame:
    """
    Roll up # Scheduled Trips to be on a quarterly basis
    since this metric doesn't change very often.

    Rows are grouped by quarter / Period / Organization / Route /
    dir_0_1 / Direction. Within each group "# scheduled trips" is summed
    and then divided by the number of distinct values of "Date", giving
    the average per service date.

    Returns the grouped frame; "Date" holds the count of distinct dates
    in the group and "# scheduled trips" the per-date average.
    """
    group_cols = ["quarter", "Period", "Organization", "Route", "dir_0_1", "Direction"]

    # Aggregate
    agg1 = (
        peak_offpeak_df.groupby(group_cols)
        .agg({"Date": "nunique", "# scheduled trips": "sum"})
        .reset_index()
    )

    # Divide by however many dates the group actually has, rather than only
    # handling exactly 2 or exactly 3: any other count used to be left as a
    # raw sum. Clipping to 1 leaves the sum untouched when no date was counted.
    n_dates = agg1["Date"].clip(lower=1)
    agg1["# scheduled trips"] = agg1["# scheduled trips"] / n_dates

    return agg1

In [None]:
total_scheduled_trips = rollup_schd_qtr(peak_offpeak_df)

In [None]:
(
    section2.grouped_bar_chart(
        total_scheduled_trips.loc[(total_scheduled_trips.dir_0_1 == 0)],
        color_col="Period",
        y_col="# scheduled trips",
        offset_col="Period",
        title=readable_dict["trips_per_day_graph"]["title"],
        subtitle="",
        range_color=color_dict["spatial_accuracy_range"],
        quarter=True,
    )
    .add_params(xcol_param)
    .transform_filter(xcol_param)
)

In [None]:
total_scheduled_trips.head(1)

In [None]:
def stacked_bar_chart(
    df: pd.DataFrame,
    y_col: str,
    color_col: str,
    title: str,
    subtitle: str,
    range_color: list,
    quarter: bool = False,
) -> alt.Chart:
    """
    Bar chart of `y_col` colored by `color_col`; bars that share an
    x position are drawn on the same x slot.

    Args:
        df: data to plot. The tooltip reads "Route", "Direction" and
            `y_col`, plus "quarter" when quarter=True or "Period" and
            "Date" when quarter=False.
        y_col: quantitative column for bar height.
        color_col: nominal column that drives bar color.
        title: chart title.
        subtitle: chart subtitle.
        range_color: colors for the color scale.
        quarter: True plots against the "quarter" column; False plots
            against "Date".

    Returns:
        A 200 x 250 alt.Chart with title and subtitle set.
    """
    tooltip_cols = [
        "Route",
        "Direction",
        y_col,
    ]

    # Set y-axis
    max_y = section2.set_y_axis(df, y_col)

    # Clean dataframe
    df = section2.clean_data_charts(df, y_col)

    # Only the x-axis and the tooltip differ between the two modes.
    if quarter:
        tooltip_cols.append("quarter")
        x_encoding = alt.X(
            "quarter",
            title=["Quarter"],
            axis=alt.Axis(labelAngle=-45),
        )
    else:
        tooltip_cols.append("Period")
        tooltip_cols.append("Date")
        x_encoding = alt.X(
            "yearmonthdate(Date):O",
            title=["Date"],
            axis=alt.Axis(labelAngle=-45, format="%b %Y"),
        )

    chart = (
        alt.Chart(df)
        .mark_bar(size=7, clip=True)
        .encode(
            x=x_encoding,
            y=alt.Y(
                f"{y_col}:Q",
                title=_report_utils.labeling(y_col),
                scale=alt.Scale(domain=[0, max_y]),
            ),
            color=alt.Color(
                f"{color_col}:N",
                title=_report_utils.labeling(color_col),
                # Use the palette the caller passed in instead of always
                # reading the notebook-level color_dict["four_colors"].
                scale=alt.Scale(range=range_color),
            ),
            tooltip=tooltip_cols,
        )
    )

    chart = chart.properties(width=200, height=250)

    # Add the title (this chart is not faceted)
    chart = chart.properties(
        title={
            "text": title,
            "subtitle": subtitle,
        }
    )

    return chart

In [None]:
stacked_bar_chart(
    df=total_scheduled_trips.loc[total_scheduled_trips.dir_0_1 == 0],
    y_col="# scheduled trips",
    color_col="Period",
    title="",
    subtitle="",
    range_color=color_dict["four_colors"],
    quarter=True,
).add_params(xcol_param).transform_filter(xcol_param)

In [None]:
(
    alt.Chart(total_scheduled_trips.loc[total_scheduled_trips.dir_0_1 == 0])
    .mark_bar(size=7, clip=True)
    .encode(
        x=alt.X(
            "quarter",
            title=["quarter"],
            axis=alt.Axis(labelAngle=-45),
        ),
        y=alt.Y(
            "# scheduled trips:Q",
            title=_report_utils.labeling(
                "# scheduled trips",
            ),
            scale=alt.Scale(domain=[0, 100]),
        ),
        color=alt.Color(
            "Period:N",
            title=_report_utils.labeling("Period"),
            scale=alt.Scale(range=color_dict["four_colors"]),
        ),
        tooltip=list(total_scheduled_trips.columns),
    )
).add_params(xcol_param).transform_filter(xcol_param)

In [None]:
(
    (
        section2.base_facet_chart(
            df=total_scheduled_trips,
            direction_to_filter=0,
            y_col="# scheduled trips",
            color_col="Period",
            facet_col="Period",
            title=readable_dict["trips_per_day_graph"]["title"],
            subtitle="",
            range_color=color_dict["four_colors"],
        )
    )
    .add_params(xcol_param)
    .transform_filter(xcol_param)
)

In [None]:
# Peak East bound
23 + 23 + 23

## Turn off group keys for the `segment_calcs.calculate_weighted_averages` function
* This didn't work at all. 

In [None]:
def calculate_weighted_averages(
    df: pd.DataFrame, group_cols: list, metric_cols: list, weight_col: str
) -> pd.DataFrame:
    """
    For certain aggregations, we need to calculate a weighted average,
    weighted by the number of trips.

    If we want peak/offpeak weighted calculations,
    we can take time-of-day (AM peak, PM peak) and
    get a peak speed calculation, after weighting by the number
    of trips present in each time-of-day bin.

    Ex: metric_cols = ['p20_mph', 'p50_mph', 'p80_mph']
    weight_cols = 'n_trips'

    Each metric is computed per group as
    sum(metric * weight) / sum(weight), rounded to 2 decimals.

    `df` itself is not modified, so the same frame can be rolled up
    several times (different metrics, different weights) without the
    weights compounding.

    Returns one row per combination of `group_cols`, holding the weighted
    `metric_cols` and the summed `weight_col`.
    """
    # Multiply on a private copy. Writing the products back onto `df` would
    # change the caller's data and trigger SettingWithCopyWarning on slices.
    weighted = df.copy()
    for c in metric_cols:
        weighted[c] = weighted[c] * weighted[weight_col]

    df2 = (
        weighted.groupby(group_cols)
        .agg({c: "sum" for c in metric_cols + [weight_col]})
        .reset_index()
    )

    for c in metric_cols:
        df2[c] = df2[c].divide(df2[weight_col]).round(2)

    return df2

In [None]:
all_metrics_test2 = calculate_weighted_averages(
    df=all_day,
    group_cols=[
        "quarter",
        "Organization",
        "schedule_gtfs_dataset_key",
        "Route ID",
        "dir_0_1",
        "Direction",
    ],
    metric_cols=[
        "Average VP per Minute",
        "% VP within Scheduled Shape",
        "Average Scheduled Service (trip minutes)",
    ],
    weight_col="# Trips with VP",
)

In [None]:
all_metrics_test2.loc[
    (all_metrics_test2.Organization == "San Diego Metropolitan Transit System")
    & (all_metrics_test2["Route ID"] == "1")
    & (all_metrics_test2.quarter == "2023Q2")
    & (all_metrics_test2.Direction == "Eastbound")
]