# For certain metrics, aggregate them to quarterly in `merge_data`

* Tiffany's Teams message: 
 > you can do it within merge_data for now, it'll be the quickest way to get your quarterly averages. 
merge_data is simply concatenating schedule, summary speeds, rt vs schedule by route-direction-peak/offpeak-single day together. You can take that and aggregate several days into quarterly.
Here's the script for average_segment_speeds that mocks up aggregating across single days / using weighted averages, and you'll see how some of the functions are put together, specifically tagging the quarter-year / weekday or weekend / weighted averages (using n_trips) to weight the metric column. https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/scripts/average_segment_speeds.py
* Metrics to Roll Up
    * Avg scheduled minute
    * VP per minute
    * Spatial Accuracy

In [1]:
import _report_operator_section1_utils as section1
import _report_operator_section2_utils as section2
import _report_utils
import geopandas as gpd
import merge_data
import numpy as np
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers, metrics, segment_calcs
from segment_speed_utils.project_vars import COMPILED_CACHED_VIEWS, PROJECT_CRS
from shared_utils import (
    catalog_utils,
    portfolio_utils,
    rt_dates,
    rt_utils,
    time_helpers,
)
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS

# Data Dictionary
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
analysis_date_list = [rt_dates.DATES["feb2025"]]

In [4]:
analysis_date = rt_dates.DATES["feb2025"]

In [5]:
schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"

In [6]:
sdi = "San Diego Metropolitan Transit System"

In [7]:
marin = "Marin County Transit District"

In [8]:
df = pd.read_parquet(schd_vp_url)

In [9]:
df.columns

Index(['schedule_gtfs_dataset_key', 'direction_id', 'time_period',
       'avg_scheduled_service_minutes', 'avg_stop_miles',
       'route_primary_direction', 'n_scheduled_trips', 'frequency',
       'is_express', 'is_rapid', 'is_rail', 'is_coverage', 'is_downtown_local',
       'is_local', 'service_date', 'typology', 'name', 'route_long_name',
       'route_short_name', 'route_combined_name', 'route_id',
       'minutes_atleast1_vp', 'minutes_atleast2_vp',
       'total_rt_service_minutes', 'total_scheduled_service_minutes',
       'total_vp', 'vp_in_shape', 'is_early', 'is_ontime', 'is_late',
       'n_vp_trips', 'vp_per_minute', 'pct_in_shape',
       'pct_rt_journey_atleast1_vp', 'pct_rt_journey_atleast2_vp',
       'pct_sched_journey_atleast1_vp', 'pct_sched_journey_atleast2_vp',
       'rt_sched_journey_ratio', 'avg_rt_service_minutes', 'sched_rt_category',
       'speed_mph', 'schedule_source_record_id', 'base64_url',
       'organization_source_record_id', 'organization_name',


## 3/27/2025: Incorporate Tiffany's Comments from `structure.md`.

In [10]:
sd_df = df.loc[df.organization_name == sdi]

In [11]:
sd_df.shape

(37761, 46)

In [12]:
sd_df.columns

Index(['schedule_gtfs_dataset_key', 'direction_id', 'time_period',
       'avg_scheduled_service_minutes', 'avg_stop_miles',
       'route_primary_direction', 'n_scheduled_trips', 'frequency',
       'is_express', 'is_rapid', 'is_rail', 'is_coverage', 'is_downtown_local',
       'is_local', 'service_date', 'typology', 'name', 'route_long_name',
       'route_short_name', 'route_combined_name', 'route_id',
       'minutes_atleast1_vp', 'minutes_atleast2_vp',
       'total_rt_service_minutes', 'total_scheduled_service_minutes',
       'total_vp', 'vp_in_shape', 'is_early', 'is_ontime', 'is_late',
       'n_vp_trips', 'vp_per_minute', 'pct_in_shape',
       'pct_rt_journey_atleast1_vp', 'pct_rt_journey_atleast2_vp',
       'pct_sched_journey_atleast1_vp', 'pct_sched_journey_atleast2_vp',
       'rt_sched_journey_ratio', 'avg_rt_service_minutes', 'sched_rt_category',
       'speed_mph', 'schedule_source_record_id', 'base64_url',
       'organization_source_record_id', 'organization_name',


### Use `add_quarter` in `shared_utils.time_helpers`

In [13]:
from shared_utils import time_helpers

In [14]:
sd_df.columns

Index(['schedule_gtfs_dataset_key', 'direction_id', 'time_period',
       'avg_scheduled_service_minutes', 'avg_stop_miles',
       'route_primary_direction', 'n_scheduled_trips', 'frequency',
       'is_express', 'is_rapid', 'is_rail', 'is_coverage', 'is_downtown_local',
       'is_local', 'service_date', 'typology', 'name', 'route_long_name',
       'route_short_name', 'route_combined_name', 'route_id',
       'minutes_atleast1_vp', 'minutes_atleast2_vp',
       'total_rt_service_minutes', 'total_scheduled_service_minutes',
       'total_vp', 'vp_in_shape', 'is_early', 'is_ontime', 'is_late',
       'n_vp_trips', 'vp_per_minute', 'pct_in_shape',
       'pct_rt_journey_atleast1_vp', 'pct_rt_journey_atleast2_vp',
       'pct_sched_journey_atleast1_vp', 'pct_sched_journey_atleast2_vp',
       'rt_sched_journey_ratio', 'avg_rt_service_minutes', 'sched_rt_category',
       'speed_mph', 'schedule_source_record_id', 'base64_url',
       'organization_source_record_id', 'organization_name',


In [15]:
sd_df = time_helpers.add_quarter(sd_df, "service_date")

In [16]:
sd_df.columns

Index(['schedule_gtfs_dataset_key', 'direction_id', 'time_period',
       'avg_scheduled_service_minutes', 'avg_stop_miles',
       'route_primary_direction', 'n_scheduled_trips', 'frequency',
       'is_express', 'is_rapid', 'is_rail', 'is_coverage', 'is_downtown_local',
       'is_local', 'service_date', 'typology', 'name', 'route_long_name',
       'route_short_name', 'route_combined_name', 'route_id',
       'minutes_atleast1_vp', 'minutes_atleast2_vp',
       'total_rt_service_minutes', 'total_scheduled_service_minutes',
       'total_vp', 'vp_in_shape', 'is_early', 'is_ontime', 'is_late',
       'n_vp_trips', 'vp_per_minute', 'pct_in_shape',
       'pct_rt_journey_atleast1_vp', 'pct_rt_journey_atleast2_vp',
       'pct_sched_journey_atleast1_vp', 'pct_sched_journey_atleast2_vp',
       'rt_sched_journey_ratio', 'avg_rt_service_minutes', 'sched_rt_category',
       'speed_mph', 'schedule_source_record_id', 'base64_url',
       'organization_source_record_id', 'organization_name',


In [17]:
sd_df.year_quarter.value_counts()

2024_Q1    4887
2024_Q2    4814
2024_Q4    4792
2024_Q3    4740
2023_Q3    4723
2023_Q4    4689
2023_Q2    4330
2025_Q1    3214
2023_Q1    1572
Name: year_quarter, dtype: int64

In [18]:
sd_df.quarter.value_counts()

1    9673
4    9481
3    9463
2    9144
Name: quarter, dtype: int64

### I see some similarities in the getting the metrics you want as in https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/segment_speed_utils/metrics.py, specifically the derive_rt_vs_schedule_metrics. Is there a way to bridge these 2 instances so that the function itself is adapted to handle the generic df without over-handling some other stuff?
* This function is used `rt_scheduled_v_ran/scripts/rt_v_scheduled_trip.py` and `rt_scheduled_v_ran/scripts/rt_v_scheduled_routes.py`
* We don't necessarily need all the rounding, separate out the division part out? 

In [19]:
schd_metric_cols = [
    "avg_scheduled_service_minutes",
    "avg_stop_miles",
    "frequency",
    "total_scheduled_service_minutes",
]

groupby_cols = [
    "schedule_gtfs_dataset_key",
    "year_quarter",
    "direction_id",
    "time_period",
    "route_id",
]
rt_metric_cols = [
    "minutes_atleast1_vp",
    "minutes_atleast2_vp",
    "total_rt_service_minutes",
    "total_vp",
    "vp_in_shape",
    "avg_rt_service_minutes",
    "speed_mph",
]
rt_metric_no_weighted_avg = [
    "is_early",
    "is_ontime",
    "is_late",
]
crosswalk_cols = [
    "base64_url",
    "organization_source_record_id",
    "organization_name",
    "caltrans_district",
    "route_primary_direction",
    "name",
    "schedule_source_record_id",
    "is_express",
    "is_rapid",
    "is_rail",
    "is_coverage",
    "is_downtown_local",
    "is_local",
    "service_date",
    "typology",
    "sched_rt_category",
    "route_long_name",
    "route_short_name",
    "route_combined_name",
]
group_cols = [
    "year_quarter",
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
    "time_period",
]

In [31]:
# sd_df.loc[(sd_df.route_id == "1") & (sd_df.direction_id == 0) & (sd_df.time_period == "peak")]

In [21]:
rt_df = sd_df.copy()
schd_df = sd_df.copy()
timeliness_df = sd_df.copy()

In [22]:
rt_metrics = segment_calcs.calculate_weighted_averages(
    df=sd_df[groupby_cols+rt_metric_cols+["n_vp_trips"]],
    group_cols=groupby_cols,
    metric_cols=rt_metric_cols,
    weight_col="n_vp_trips",
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[c] = df[c] * df[weight_col]


In [23]:
rt_metrics.head(1)

Unnamed: 0,schedule_gtfs_dataset_key,year_quarter,direction_id,time_period,route_id,minutes_atleast1_vp,minutes_atleast2_vp,total_rt_service_minutes,total_vp,vp_in_shape,avg_rt_service_minutes,speed_mph,n_vp_trips
0,1adf7a7bde86b42ed014f6de74c7132e,2023_Q2,0.0,all_day,1,2571.88,1830.73,3592.41,4585.39,4514.79,83.33,11.3,201


In [24]:
sd_df.head(1)

Unnamed: 0,schedule_gtfs_dataset_key,direction_id,time_period,avg_scheduled_service_minutes,avg_stop_miles,route_primary_direction,n_scheduled_trips,frequency,is_express,is_rapid,is_rail,is_coverage,is_downtown_local,is_local,service_date,typology,name,route_long_name,route_short_name,route_combined_name,route_id,minutes_atleast1_vp,minutes_atleast2_vp,total_rt_service_minutes,total_scheduled_service_minutes,total_vp,vp_in_shape,is_early,is_ontime,is_late,n_vp_trips,vp_per_minute,pct_in_shape,pct_rt_journey_atleast1_vp,pct_rt_journey_atleast2_vp,pct_sched_journey_atleast1_vp,pct_sched_journey_atleast2_vp,rt_sched_journey_ratio,avg_rt_service_minutes,sched_rt_category,speed_mph,schedule_source_record_id,base64_url,organization_source_record_id,organization_name,caltrans_district,year,quarter,year_quarter
255719,1adf7a7bde86b42ed014f6de74c7132e,0.0,all_day,62.04,0.17,Eastbound,50,2.08,0.0,1.0,0.0,0.0,1.0,0.0,2023-04-12,rapid,San Diego Schedule,Fashion Valley - La Mesa,1,1 Fashion Valley - La Mesa,1,3292,2373,4651.7,3102.0,5907,5818,2,31,17,50,1.27,0.98,0.71,0.51,1.0,0.77,1.5,93.03,schedule_and_vp,11.1,recfZ9iWkptccoONX,aHR0cHM6Ly93d3cuc2RtdHMuY29tL2dvb2dsZV90cmFuc2l0X2ZpbGVzL2dvb2dsZV90cmFuc2l0LnppcA==,recZALk4vysuoTVjF,San Diego Metropolitan Transit System,11 - San Diego,2023,2,2023_Q2


In [25]:
# Calculate Scheduled Metrics that need to have a weighted average
schd_metrics = segment_calcs.calculate_weighted_averages(
    df=sd_df[groupby_cols + schd_metric_cols + ["n_scheduled_trips"]],
    group_cols=groupby_cols,
    metric_cols=schd_metric_cols,
    weight_col="n_scheduled_trips",
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[c] = df[c] * df[weight_col]


In [27]:
schd_metrics.head(1).T

Unnamed: 0,0
schedule_gtfs_dataset_key,1adf7a7bde86b42ed014f6de74c7132e
year_quarter,2023_Q2
direction_id,0.00
time_period,all_day
route_id,1
avg_scheduled_service_minutes,62.04
avg_stop_miles,0.17
frequency,2.08
total_scheduled_service_minutes,1381.67
n_scheduled_trips,450


In [28]:
# Calculate Scheduled Metrics that need to have a weighted average
schd_metrics2 = segment_calcs.calculate_weighted_averages(
    df=schd_df,
    group_cols=groupby_cols,
    metric_cols=schd_metric_cols,
    weight_col="n_scheduled_trips",
)

In [29]:
schd_metrics2.head(1).T

Unnamed: 0,0
schedule_gtfs_dataset_key,1adf7a7bde86b42ed014f6de74c7132e
year_quarter,2023_Q2
direction_id,0.00
time_period,all_day
route_id,1
avg_scheduled_service_minutes,62.04
avg_stop_miles,0.17
frequency,2.08
total_scheduled_service_minutes,1381.67
n_scheduled_trips,450


In [None]:
# Calculate trips by timeliness which doesn't need weighted average
timeliness_df = timeliness_df[groupby_cols + rt_metric_no_weighted_avg]
timeliness_df2 = (
    timeliness_df.groupby(groupby_cols)
    .agg({"is_early": "sum", "is_ontime": "sum", "is_late": "sum"})
    .reset_index()
)

In [None]:
# Create a crosswalk with string descriptives such as
# organization_name, route_long_name, etc that were excluded from the groupby_cols
crosswalk = sd_df[groupby_cols + crosswalk_cols]

# Merge all the dataframes
m1 = (
    pd.merge(rt_metrics, schd_metrics, on=groupby_cols)
    .merge(timeliness_df2, on=groupby_cols)
    .merge(crosswalk, on=groupby_cols)
)

In [None]:
m1.columns

In [None]:
# Have to temporarily rm total to some of the columns
m1 = m1.rename(
    columns={
        "total_rt_service_minutes": "rt_service_minutes",
        "total_scheduled_service_minutes": "scheduled_service_minutes",
    }
)

In [None]:
m1 = metrics.calculate_rt_vs_schedule_metrics(m1)

In [None]:
# Rename back 
m1 = m1.rename(
    columns={
        "rt_service_minutes": "total_rt_service_minutes",
        "scheduled_service_minutes": "total_scheduled_service_minutes"
    }
)

In [None]:
m1["rt_sched_journey_ratio"] = (
        m1.total_rt_service_minutes / m1.total_scheduled_service_minutes
    )

In [None]:
col_proper_order = list(df.columns) + ['year_quarter']

In [None]:
# Rearrange columns to match original df
m1 = m1[col_proper_order]

# Drop service_date & duplicates
m1 = (m1
          .drop(columns=["service_date"])
          .drop_duplicates(subset = group_cols)
          .reset_index(drop=True))

In [None]:
m1.head(1)

## Pre 3/27/2025 Work: Organizing columns 
* **Groupby Columns** 
['schedule_gtfs_dataset_key', 
'quarter' 
'direction_id', 
'time_period',
'route_id', 
* **Columns to crosswalk back once grouping by quarters is done**:
'base64_url',
       'organization_source_record_id', 'organization_name',
       'caltrans_district', 'route_primary_direction', 'name',
       'schedule_source_record_id',
'is_express', 'is_rapid', 'is_rail', 'is_coverage',
 'is_downtown_local', 'is_local', 'service_date', 'typology',
  'sched_rt_category', 'route_long_name', 'route_short_name',
       'route_combined_name',        
* **Scheduled Route Info**: 'n_scheduled_trips' is the weighted column
'avg_scheduled_service_minutes', 
'avg_stop_miles', 
,
'frequency', 'total_scheduled_service_minutes',

* **RT Route Info**: 'n_vp_trips' is the weighted column.
'minutes_atleast1_vp', 'minutes_atleast2_vp',
'total_rt_service_minutes', 
  'total_vp', 'vp_in_shape', 
       ,avg_rt_service_minutes', 
'speed_mph',

* **RT Route Info: no need for weighted avg**: 'is_early', 'is_ontime', 'is_late',
* **ReCalculate Columns**
'pct_in_shape',
'pct_rt_journey_atleast1_vp', 
'pct_rt_journey_atleast2_vp',
'pct_sched_journey_atleast1_vp', 
'pct_sched_journey_atleast2_vp',
'rt_sched_journey_ratio', "vp_per_minute",]

In [None]:
schd_metric_cols = [
    "avg_scheduled_service_minutes",
    "avg_stop_miles",
    "frequency",
    "total_scheduled_service_minutes",
]

In [None]:
groupby_cols = [
    "schedule_gtfs_dataset_key",
    "quarter",
    "direction_id",
    "time_period",
    "route_id",
]

In [None]:
rt_metric_cols = [
    "minutes_atleast1_vp",
    "minutes_atleast2_vp",
    "total_rt_service_minutes",
    "total_vp",
    "vp_in_shape",
    "avg_rt_service_minutes",
    "speed_mph",
]

In [None]:
rt_metric_no_weighted_avg = [
    "is_early",
    "is_ontime",
    "is_late",
]

In [None]:
crosswalk_cols = [
    "base64_url",
    "organization_source_record_id",
    "organization_name",
    "caltrans_district",
    "route_primary_direction",
    "name",
    "schedule_source_record_id",
    "is_express",
    "is_rapid",
    "is_rail",
    "is_coverage",
    "is_downtown_local",
    "is_local",
    "service_date",
    "typology",
    "sched_rt_category",
    "route_long_name",
    "route_short_name",
    "route_combined_name",
]

In [None]:
group_cols = [
    "quarter",
    "schedule_gtfs_dataset_key",
    "route_id",
    "direction_id",
    "time_period",
]

In [None]:
sd_df.sched_rt_category.unique()

## Turn this to a function

In [None]:
def quarterly_metrics(df: pd.DataFrame) -> pd.DataFrame:
    """
    GTFS Digest is presented on a monthly candece.
    Aggregate this dataframe to be on a quarterly grain
    instead.
    """
    # Create copies of the original df before aggregating because I noticed applying
    #  segment_calcs.calculate_weighted_averages impacts the original df
    rt_df = df.copy()
    schd_df = df.copy()
    timeliness_df = df.copy()

    # Calculate RT Metrics that need to have a weighted average
    rt_metrics = segment_calcs.calculate_weighted_averages(
        df=rt_df,
        group_cols=groupby_cols,
        metric_cols=rt_metric_cols,
        weight_col="n_vp_trips",
    )

    # Calculate Scheduled Metrics that need to have a weighted average
    schd_metrics = segment_calcs.calculate_weighted_averages(
        df=schd_df,
        group_cols=groupby_cols,
        metric_cols=schd_metric_cols,
        weight_col="n_scheduled_trips",
    )

    # Calculate trips by timeliness which doesn't need weighted average
    timeliness_df = timeliness_df[groupby_cols + rt_metric_no_weighted_avg]
    timeliness_df2 = (
        timeliness_df.groupby(groupby_cols)
        .agg({"is_early": "sum", "is_ontime": "sum", "is_late": "sum"})
        .reset_index()
    )

    # Create a crosswalk with string descriptives such as
    # organization_name, route_long_name, etc that were excluded from the groupby_cols
    crosswalk = df[groupby_cols + crosswalk_cols]

    # Merge all the dataframes
    m1 = (
        pd.merge(rt_metrics, schd_metrics, on=groupby_cols)
        .merge(timeliness_df2, on=groupby_cols)
        .merge(crosswalk, on=groupby_cols)
    )

    # Re-calculate certain columns
    m1["pct_in_shape"] = m1.vp_in_shape / m1.total_vp
    m1["pct_rt_journey_atleast1_vp"] = (
        m1.minutes_atleast1_vp / m1.total_rt_service_minutes
    )
    m1["pct_rt_journey_atleast2_vp"] = (
        m1.minutes_atleast2_vp / m1.total_rt_service_minutes
    )
    m1["pct_sched_journey_atleast1_vp"] = (
        m1.minutes_atleast1_vp / m1.total_scheduled_service_minutes
    )
    m1["pct_sched_journey_atleast2_vp"] = (
        m1.minutes_atleast2_vp / m1.total_scheduled_service_minutes
    )
    m1["vp_per_minute"] = m1.total_vp / m1.total_rt_service_minutes
    m1["rt_sched_journey_ratio"] = (
        m1.total_rt_service_minutes / m1.total_scheduled_service_minutes
    )

    # Rearrange columns to match original df
    m1 = m1[list(df.columns)]

    # Drop service_date
    m1 = m1.drop(columns=["service_date"]).drop_duplicates().reset_index(drop=True)
    return m1

In [None]:
m1_all_orgs = quarterly_metrics(df)

In [None]:
df.loc[
    (df.quarter == "2024Q3")
    & (df.route_id == "30")
    & (df.schedule_gtfs_dataset_key == "7cc0cb1871dfd558f11a2885c145d144")
    & (df.route_primary_direction == "Westbound")
    & (df.time_period == "peak")
][["service_date", "n_scheduled_trips"]]

In [None]:
df.loc[
    (df.quarter == "2024Q3")
    & (df.route_id == "30")
    & (df.schedule_gtfs_dataset_key == "7cc0cb1871dfd558f11a2885c145d144")
    & (df.route_primary_direction == "Westbound")
    & (df.time_period == "peak")
].n_scheduled_trips.sum()

In [None]:
df.loc[
    (df.quarter == "2024Q3")
    & (df.route_id == "30")
    & (df.schedule_gtfs_dataset_key == "7cc0cb1871dfd558f11a2885c145d144")
    & (df.route_primary_direction == "Westbound")
    & (df.time_period == "peak")
].n_scheduled_trips.sum()

In [None]:
m1_all_orgs.loc[
    (m1_all_orgs.quarter == "2024Q3")
    & (m1_all_orgs.route_id == "30")
    & (m1_all_orgs.schedule_gtfs_dataset_key == "7cc0cb1871dfd558f11a2885c145d144")
    & (m1_all_orgs.route_primary_direction == "Westbound")
][["n_scheduled_trips"]]

## Calculate RT Route Info

In [None]:
rt_copy = sd_df.copy()

In [None]:
sd_rt_metrics = segment_calcs.calculate_weighted_averages(
    df=rt_copy,
    group_cols=groupby_cols,
    metric_cols=rt_metric_cols,
    weight_col="n_vp_trips",
)

In [None]:
sd_rt_metrics.loc[
    (sd_rt_metrics.quarter == "2024Q2")
    & (sd_rt_metrics.route_id == "1")
    & (sd_rt_metrics.direction_id == 0)
    & (sd_rt_metrics.time_period == "offpeak")
]

In [None]:
sd_df.loc[
    (sd_df.quarter == "2024Q2")
    & (sd_df.route_id == "1")
    & (sd_df.direction_id == 0)
    & (sd_df.time_period == "offpeak")
][rt_metric_cols]

In [None]:
sd_df.loc[
    (sd_df.quarter == "2024Q2")
    & (sd_df.route_id == "1")
    & (sd_df.direction_id == 0)
    & (sd_df.time_period == "offpeak")
][["n_vp_trips"]]

In [None]:
# VP in Shape
((3821 * 27) + (3493 * 27) + (5252 * 33)) / 87

In [None]:
# Speed MPH
((11.34 * 27) + (11.42 * 27) + (11.1 * 33)) / 87

In [None]:
# Minutes at least 2 vp
((1558 * 27) + (1393 * 27) + (2147 * 33)) / 87

## Calculate Scheduled Route Info

In [None]:
schd_copy = sd_df.copy()

In [None]:
sd_schd_metrics = segment_calcs.calculate_weighted_averages(
    df=schd_copy,
    group_cols=groupby_cols,
    metric_cols=schd_metric_cols,
    weight_col="n_scheduled_trips",
)

In [None]:
sd_schd_metrics.loc[
    (sd_schd_metrics.quarter == "2024Q2")
    & (sd_schd_metrics.route_id == "1")
    & (sd_schd_metrics.direction_id == 0)
    & (sd_schd_metrics.time_period == "offpeak")
]

In [None]:
sd_df.loc[
    (sd_df.quarter == "2024Q2")
    & (sd_df.route_id == "1")
    & (sd_df.direction_id == 0)
    & (sd_df.time_period == "offpeak")
][["n_vp_trips"]]

In [None]:
sd_df.loc[
    (sd_df.quarter == "2024Q2")
    & (sd_df.route_id == "1")
    & (sd_df.direction_id == 0)
    & (sd_df.time_period == "offpeak")
][schd_metric_cols]

In [None]:
((1615 * 27) + (1615 * 27) + (2001 * 33)) / (27 + 27 + 33)

## Timeliness Trips

In [None]:
timeliness = sd_df.copy()

In [None]:
timeliness = timeliness[groupby_cols + rt_metric_no_weighted_avg]

In [None]:
timeliness.head(2)

In [None]:
timeliness2 = (
    timeliness.groupby(groupby_cols)
    .agg({"is_early": "sum", "is_ontime": "sum", "is_late": "sum"})
    .reset_index()
)

In [None]:
timeliness2.loc[
    (timeliness2.quarter == "2024Q2")
    & (timeliness2.route_id == "1")
    & (timeliness2.direction_id == 0)
    & (timeliness2.time_period == "offpeak")
]

In [None]:
sd_df.loc[
    (sd_df.quarter == "2024Q2")
    & (sd_df.route_id == "1")
    & (sd_df.direction_id == 0)
    & (sd_df.time_period == "offpeak")
][rt_metric_no_weighted_avg]

## Join all the tables back together

In [None]:
crosswalk = sd_df[groupby_cols + crosswalk_cols]

In [None]:
len(crosswalk.drop_duplicates())

In [None]:
len(crosswalk)

In [None]:
m1 = (
    pd.merge(sd_rt_metrics, sd_schd_metrics, on=groupby_cols)
    .merge(timeliness2, on=groupby_cols)
    .merge(crosswalk, on=groupby_cols)
)

In [None]:
m1.sample()

## Re-Calculate Metrics 

In [None]:
m1["pct_in_shape"] = m1.vp_in_shape / m1.total_vp

In [None]:
m1["pct_rt_journey_atleast1_vp"] = m1.minutes_atleast1_vp / m1.total_rt_service_minutes

In [None]:
m1["pct_rt_journey_atleast2_vp"] = m1.minutes_atleast2_vp / m1.total_rt_service_minutes

In [None]:
m1["pct_sched_journey_atleast1_vp"] = (
    m1.minutes_atleast1_vp / m1.total_scheduled_service_minutes
)

In [None]:
m1["pct_sched_journey_atleast2_vp"] = (
    m1.minutes_atleast2_vp / m1.total_scheduled_service_minutes
)

In [None]:
m1["vp_per_minute"] = m1.total_vp / m1.total_rt_service_minutes

In [None]:
m1["rt_sched_journey_ratio"] = (
    m1.total_rt_service_minutes / m1.total_scheduled_service_minutes
)

## Rearrange columns to match

In [None]:
m1 = m1[list(sd_df.columns)]

In [None]:
m1.columns == sd_df.columns

## All Charts

In [None]:
# display(section2.filtered_route(df))

## Subset df

In [None]:
# Filter out to all day
all_day = df.loc[df.Period == "all_day"].reset_index(drop=True)

In [None]:
all_day.sample()

### The results are really off for `Average VP per Minute`

In [None]:
def quarterly_rollup(all_day: pd.DataFrame) -> pd.DataFrame:
    """
    Using the dataframe that houses only all_day values,
    roll up months to each quarter for certain metrics.
    """
    # Turn date to quarters
    all_day["quarter"] = pd.PeriodIndex(all_day.Date, freq="Q").astype("str")

    quarterly_metrics = segment_calcs.calculate_weighted_averages(
        df=all_day,
        group_cols=[
            "quarter",
            "Organization",
            "Route",
            "dir_0_1",
            "Direction",
        ],
        metric_cols=[
            "Average VP per Minute",
            "% VP within Scheduled Shape",
            "Average Scheduled Service (trip minutes)",
            "ruler_100_pct",
            "ruler_for_vp_per_min",
        ],
        weight_col="# Trips with VP",
    )
    return quarterly_metrics

In [None]:
all_metrics_test = quarterly_rollup(all_day)

## Test charts

In [None]:
import yaml

with open("readable.yml") as f:
    readable_dict = yaml.safe_load(f)

In [None]:
# Color Palette
with open("color_palettes.yml") as f:
    color_dict = yaml.safe_load(f)

In [None]:
all_metrics_test.sample(3)

In [None]:
all_metrics_test.info()

In [None]:
# Charts
import altair as alt

alt.data_transformers.enable("default", max_rows=None)

In [None]:
def grouped_bar_chart(
    df: pd.DataFrame,
    color_col: str,
    y_col: str,
    offset_col: str,
    title: str,
    subtitle: str,
    range_color: list,
    quarter: bool = False,
) -> alt.Chart:

    tooltip_cols = [
        "Route",
        "Direction",
        color_col,
        y_col,
    ]

    if quarter == False:
        # Clean dataframe
        tooltip_cols.append("Period")
        tooltip_cols.append("Date")
        df = clean_data_charts(df, y_col)

        chart = (
            alt.Chart(df)
            .mark_bar(size=5)
            .encode(
                x=alt.X(
                    "yearmonthdate(Date):O",
                    title=["Date"],
                    axis=alt.Axis(labelAngle=-45, format="%b %Y"),
                ),
                y=alt.Y(f"{y_col}:Q", title=_report_utils.labeling(y_col)),
                xOffset=alt.X(
                    f"{offset_col}:N", title=_report_utils.labeling(offset_col)
                ),
                color=alt.Color(
                    f"{color_col}:N",
                    title=_report_utils.labeling(color_col),
                    scale=alt.Scale(range=range_color),
                ),
                tooltip=tooltip_cols,
            )
        )
    else:
        tooltip_cols.append("quarter")
        chart = (
            alt.Chart(df)
            .mark_bar(size=5)
            .encode(
                x=alt.X(
                    "quarter",
                    title=["Quarter"],
                    axis=alt.Axis(labelAngle=-45),
                ),
                y=alt.Y(f"{y_col}:Q", title=_report_utils.labeling(y_col)),
                xOffset=alt.X(
                    f"{offset_col}:N", title=_report_utils.labeling(offset_col)
                ),
                color=alt.Color(
                    f"{color_col}:N",
                    title=_report_utils.labeling(color_col),
                    scale=alt.Scale(range=range_color),
                ),
                tooltip=tooltip_cols,
            )
        )

    chart = (chart).properties(
        title={
            "text": title,
            "subtitle": [subtitle],
        },
        width=400,
        height=250,
    )

    return chart

In [None]:
all_metrics_test.columns

In [None]:
# Create dropdown
routes_list = all_metrics_test["Route"].unique().tolist()

route_dropdown = alt.binding_select(
    options=routes_list,
    name="Routes: ",
)
# Column that controls the bar charts
xcol_param = alt.selection_point(
    fields=["Route"], value=routes_list[0], bind=route_dropdown
)

### Average Scheduled Service (trip minutes)

In [None]:
(
    grouped_bar_chart(
        df=all_metrics_test,
        color_col="Direction",
        y_col="Average Scheduled Service (trip minutes)",
        offset_col="Direction",
        title=readable_dict["avg_scheduled_min_graph"]["title"],
        subtitle=readable_dict["avg_scheduled_min_graph"]["subtitle"],
        range_color=color_dict["four_colors"],
        quarter=True,
    )
).add_params(xcol_param).transform_filter(xcol_param)

### % VP within Scheduled Shape

In [None]:
def base_facet_with_ruler_chart(
    df: pd.DataFrame,
    y_col: str,
    ruler_col: str,
    title: str,
    subtitle: str,
    domain_color: list,
    range_color: list,
    quarter: bool = False,
) -> alt.Chart:

    tooltip_cols = [
        "Route",
        "Direction",
        y_col,
    ]

    # Set y-axis
    max_y = section2.set_y_axis(df, y_col)

    # Clean dataframe
    df = section2.clean_data_charts(df, y_col)

    # Create color scale
    color_scale = alt.Scale(domain=domain_color, range=range_color)

    # Create ruler
    ruler = (
        alt.Chart(df)
        .mark_rule(color="red", strokeDash=[10, 7])
        .encode(y=f"mean({ruler_col}):Q")
    )
    if quarter == False:
        tooltip_cols.append("Period")
        tooltip_cols.append("Date")
        chart = (
            alt.Chart(df)
            .mark_bar(size=7, clip=True)
            .encode(
                x=alt.X(
                    "yearmonthdate(Date):O",
                    title=[x_col],
                    axis=alt.Axis(labelAngle=-45, format="%b %Y"),
                ),
                y=alt.Y(
                    f"{y_col}:Q",
                    title=_report_utils.labeling(y_col),
                    scale=alt.Scale(domain=[0, max_y]),
                ),
                color=alt.Color(
                    f"{y_col}:Q",
                    title=_report_utils.labeling(y_col),
                    scale=color_scale,
                ),
                tooltip=df[tooltip_cols].columns.tolist(),
            )
        )
    else:
        tooltip_cols.append("quarter")
        chart = (
            alt.Chart(df)
            .mark_bar(size=7, clip=True)
            .encode(
                x=alt.X(
                    "quarter",
                    title="Quarter",
                    axis=alt.Axis(labelAngle=-45),
                ),
                y=alt.Y(
                    f"{y_col}:Q",
                    title=_report_utils.labeling(y_col),
                    scale=alt.Scale(domain=[0, max_y]),
                ),
                color=alt.Color(
                    f"{y_col}:Q",
                    title=_report_utils.labeling(y_col),
                    scale=color_scale,
                ),
                tooltip=df[tooltip_cols].columns.tolist(),
            )
        )
    # All charts
    chart = (chart + ruler).properties(width=200, height=250)
    chart = chart.facet(
        column=alt.Column(
            "Direction:N",
        )
    ).properties(
        title={
            "text": title,
            "subtitle": [subtitle],
        }
    )

    return chart

In [None]:
color_dict["spatial_accuracy_range"]

In [None]:
base_facet_with_ruler_chart(
    df=all_metrics_test,
    y_col="% VP within Scheduled Shape",
    ruler_col="ruler_100_pct",
    title=readable_dict["spatial_accuracy_graph"]["title"],
    subtitle=readable_dict["spatial_accuracy_graph"]["subtitle"],
    domain_color=color_dict["spatial_accuracy_domain"],
    range_color=color_dict["spatial_accuracy_range"],
    quarter=True,
).add_params(xcol_param).transform_filter(xcol_param)

### Average VP per Minute

In [None]:
(
    (
        base_facet_with_ruler_chart(
            all_metrics_test,
            "Average VP per Minute",
            "ruler_for_vp_per_min",
            readable_dict["vp_per_min_graph"]["title"],
            readable_dict["vp_per_min_graph"]["subtitle"],
            color_dict["vp_domain"],
            color_dict["vp_range"],
            quarter=True,
        )
    )
    .add_params(xcol_param)
    .transform_filter(xcol_param)
)

In [None]:
all_metrics_test.loc[
    (all_metrics_test.Organization == "San Diego Metropolitan Transit System")
    & (all_metrics_test["Route"] == "1 Fashion Valley - La Mesa")
    & (all_metrics_test.quarter == "2023Q3")
]

### % of Scheduled Trip Journey chart is messed up due to the rulers

In [None]:
df.head(1)

In [None]:
sched_journey_vp = section2.pct_vp_journey(
    all_day,
    "% Scheduled Trip w/ 1+ VP/Minute",
    "% Scheduled Trip w/ 2+ VP/Minute",
)

In [None]:
sched_journey_vp.shape

In [None]:
sched_journey_vp.ruler_100_pct.describe()

In [None]:
sched_journey_vp.head(1)

In [None]:
(
    section2.base_facet_circle(
        sched_journey_vp,
        "% of Actual Trip Minutes",
        "Category",
        "ruler_100_pct",
        readable_dict["sched_vp_per_min_graph"]["title"],
        readable_dict["sched_vp_per_min_graph"]["subtitle"],
        color_dict["tri_color"],
    )
    .add_params(xcol_param)
    .transform_filter(xcol_param)
)

### Change Total Scheduled Trips -> the graph is cutting off in a strange fashion.

In [None]:
peak_offpeak_df = df.loc[df["Period"] != "all_day"].reset_index(drop=True)

In [None]:
def rollup_schd_qtr(peak_offpeak_df: pd.DataFrame) -> pd.DataFrame:
    """
    Roll up # Scheduled Trips to be on a quarterly basis
    since this metric doesn't change very often.
    """
    # Aggregate
    agg1 = (
        peak_offpeak_df.groupby(
            ["quarter", "Period", "Organization", "Route", "dir_0_1", "Direction"]
        )
        .agg({"Date": "nunique", "# scheduled trips": "sum"})
        .reset_index()
    )

    # If a quarter is complete with all 3 months, divide by 3
    agg1.loc[agg1["Date"] == 3, "# scheduled trips"] = (
        agg1.loc[agg1["Date"] == 3, "# scheduled trips"] / 3
    )

    # If a quarter is incomplete with only 2 months, divide by 2
    agg1.loc[agg1["Date"] == 2, "# scheduled trips"] = (
        agg1.loc[agg1["Date"] == 2, "# scheduled trips"] / 2
    )
    return agg1

In [None]:
total_scheduled_trips = rollup_schd_qtr(peak_offpeak_df)

In [None]:
(
    section2.grouped_bar_chart(
        total_scheduled_trips.loc[(total_scheduled_trips.dir_0_1 == 0)],
        color_col="Period",
        y_col="# scheduled trips",
        offset_col="Period",
        title=readable_dict["trips_per_day_graph"]["title"],
        subtitle="",
        range_color=color_dict["spatial_accuracy_range"],
        quarter=True,
    )
    .add_params(xcol_param)
    .transform_filter(xcol_param)
)

In [None]:
total_scheduled_trips.head(1)

In [None]:
def stacked_bar_chart(
    df: pd.DataFrame,
    y_col: str,
    color_col: str,
    title: str,
    subtitle: str,
    range_color: list,
    quarter: bool = False,
) -> alt.Chart:
    tooltip_cols = [
        "Route",
        "Direction",
        y_col,
    ]

    # Set y-axis
    max_y = section2.set_y_axis(df, y_col)

    # Clean dataframe
    df = section2.clean_data_charts(df, y_col)

    if quarter == False:
        tooltip_cols.append("Period")
        tooltip_cols.append("Date")
        chart = (
            alt.Chart(df)
            .mark_bar(size=7, clip=True)
            .encode(
                x=alt.X(
                    "yearmonthdate(Date):O",
                    title=["Date"],
                    axis=alt.Axis(labelAngle=-45, format="%b %Y"),
                ),
                y=alt.Y(
                    f"{y_col}:Q",
                    title=_report_utils.labeling(y_col),
                    scale=alt.Scale(domain=[0, max_y]),
                ),
                color=alt.Color(
                    f"{color_col}:N",
                    title=_report_utils.labeling(color_col),
                    scale=alt.Scale(range=color_dict["four_colors"]),
                ),
                tooltip=tooltip_cols,
            )
        )

    else:
        tooltip_cols.append("quarter")
        chart = (
            alt.Chart(df)
            .mark_bar(size=7, clip=True)
            .encode(
                x=alt.X(
                    "quarter",
                    title=["Quarter"],
                    axis=alt.Axis(labelAngle=-45),
                ),
                y=alt.Y(
                    f"{y_col}:Q",
                    title=_report_utils.labeling(y_col),
                    scale=alt.Scale(domain=[0, max_y]),
                ),
                color=alt.Color(
                    f"{color_col}:N",
                    title=_report_utils.labeling(color_col),
                    scale=alt.Scale(range=color_dict["four_colors"]),
                ),
                tooltip=tooltip_cols,
            )
        )

    chart = chart.properties(width=200, height=250)

    # Facet the chart
    chart = chart.properties(
        title={
            "text": title,
            "subtitle": subtitle,
        }
    )

    return chart

In [None]:
stacked_bar_chart(
    df=total_scheduled_trips.loc[total_scheduled_trips.dir_0_1 == 0],
    y_col="# scheduled trips",
    color_col="Period",
    title="",
    subtitle="",
    range_color=color_dict["four_colors"],
    quarter=True,
).add_params(xcol_param).transform_filter(xcol_param)

In [None]:
(
    alt.Chart(total_scheduled_trips.loc[total_scheduled_trips.dir_0_1 == 0])
    .mark_bar(size=7, clip=True)
    .encode(
        x=alt.X(
            "quarter",
            title=["quarter"],
            axis=alt.Axis(labelAngle=-45),
        ),
        y=alt.Y(
            "# scheduled trips:Q",
            title=_report_utils.labeling(
                "# scheduled trips",
            ),
            scale=alt.Scale(domain=[0, 100]),
        ),
        color=alt.Color(
            "Period:N",
            title=_report_utils.labeling("Period"),
            scale=alt.Scale(range=color_dict["four_colors"]),
        ),
        tooltip=list(total_scheduled_trips.columns),
    )
).add_params(xcol_param).transform_filter(xcol_param)

In [None]:
(
    (
        section2.base_facet_chart(
            df=total_scheduled_trips,
            direction_to_filter=0,
            y_col="# scheduled trips",
            color_col="Period",
            facet_col="Period",
            title=readable_dict["trips_per_day_graph"]["title"],
            subtitle="",
            range_color=color_dict["four_colors"],
        )
    )
    .add_params(xcol_param)
    .transform_filter(xcol_param)
)

In [None]:
# Peak East bound
23 + 23 + 23

## Turn off group keys for the `segment_calcs.calculate_weighted_averages` function
* This didn't work at all. 

In [None]:
def calculate_weighted_averages(
    df: pd.DataFrame, group_cols: list, metric_cols: list, weight_col: str
):
    """
    For certain aggregations, we need to calculate a weighted average,
    weighted by the number of trips.

    If we want peak/offpeak weighted calculations,
    we can take time-of-day (AM peak, PM peak) and
    get a peak speed calculation, after weighting by the number
    of trips present in each time-of-day bin.

    Ex: metric_cols = ['p20_mph', 'p50_mph', 'p80_mph']
    weight_cols = 'n_trips'

    """
    for c in metric_cols:
        df[c] = df[c] * df[weight_col]

    df2 = (
        df.groupby(group_cols)
        .agg({c: "sum" for c in metric_cols + [weight_col]})
        .reset_index()
    )

    for c in metric_cols:
        df2[c] = df2[c].divide(df2[weight_col]).round(2)

    return df2

In [None]:
all_metrics_test2 = calculate_weighted_averages(
    df=all_day,
    group_cols=[
        "quarter",
        "Organization",
        "schedule_gtfs_dataset_key",
        "Route ID",
        "dir_0_1",
        "Direction",
    ],
    metric_cols=[
        "Average VP per Minute",
        "% VP within Scheduled Shape",
        "Average Scheduled Service (trip minutes)",
    ],
    weight_col="# Trips with VP",
)

In [None]:
all_metrics_test2.loc[
    (all_metrics_test2.Organization == "San Diego Metropolitan Transit System")
    & (all_metrics_test2["Route ID"] == "1")
    & (all_metrics_test2.quarter == "2023Q2")
    & (all_metrics_test2.Direction == "Eastbound")
]