# Quarterly Roll Up 
* 6/9: Something is wrong with it, there are multiple rows repeated for the same quarter when there should only be one row for one quarter for one route. 
[Issue](https://github.com/cal-itp/data-analyses/issues/1416)

In [1]:
import _report_route_dir_visuals
import _route_dir_data_prep
import altair as alt
import calitp_data_analysis.magics
import deploy_portfolio_yaml
import pandas as pd
import quarterly_rollup
import yaml
from IPython.display import HTML, Image, Markdown, display, display_html
from segment_speed_utils import (
    gtfs_schedule_wrangling,
    metrics,
    project_vars,
    segment_calcs,
    time_series_utils,
)
from shared_utils import (
    catalog_utils,
    gtfs_utils_v2,
    portfolio_utils,
    publish_utils,
    rt_dates,
    rt_utils,
    time_helpers,
)
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
from omegaconf import OmegaConf

readable_dict = OmegaConf.load("readable2.yml")

In [4]:
analysis_date_list = rt_dates.y2025_dates

In [5]:
#  portfolio_name = "City and County of San Francisco"
portfolio_name = "Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)"

In [6]:
# Read in og df
DIGEST_RT_SCHED_MONTH = GTFS_DATA_DICT.digest_tables.monthly_route_schedule_vp

In [7]:
monthly_df = pd.read_parquet(f"{RT_SCHED_GCS}{DIGEST_RT_SCHED_MONTH}.parquet")

In [8]:
one_route = monthly_df.loc[
    (monthly_df.portfolio_organization_name == portfolio_name)
    & (monthly_df.recent_combined_name == "80-89 Coastal Express")
]

In [9]:
one_route[
    ["recent_route_id", "recent_combined_name", "combined_name"]
].drop_duplicates()

Unnamed: 0,recent_route_id,recent_combined_name,combined_name
222269,4134,80-89 Coastal Express,80__80-89 Coastal Express
223550,4136,80-89 Coastal Express,80X__80-89 Coastal Express
224432,4137,80-89 Coastal Express,81__80-89 Coastal Express
225314,4138,80-89 Coastal Express,81B__80-89 Coastal Express
226196,4141,80-89 Coastal Express,84__80-89 Coastal Express
226994,4142,80-89 Coastal Express,84U__80-89 Coastal Express
227876,4143,80-89 Coastal Express,85__80-89 Coastal Express
228674,4144,80-89 Coastal Express,85C__80-89 Coastal Express
229556,4145,80-89 Coastal Express,86__80-89 Coastal Express
230438,4146,80-89 Coastal Express,87__80-89 Coastal Express


In [10]:
one_route = time_helpers.add_quarter(one_route, "service_date")

In [11]:
# Remove underscore
one_route.year_quarter = one_route.year_quarter.str.replace("_", " ")

In [12]:
rt_metrics = segment_calcs.calculate_weighted_averages(
    df=one_route[
        quarterly_rollup.groupby_cols + quarterly_rollup.rt_metric_cols + ["n_vp_trips"]
    ],
    group_cols=quarterly_rollup.groupby_cols,
    metric_cols=quarterly_rollup.rt_metric_cols,
    weight_col="n_vp_trips",
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[c] = df[c] * df[weight_col]


In [13]:
rt_metrics.head(12)

Unnamed: 0,schedule_gtfs_dataset_key,year_quarter,direction_id,time_period,recent_combined_name,minutes_atleast1_vp,minutes_atleast2_vp,total_rt_service_minutes,total_vp,vp_in_shape,avg_rt_service_minutes,speed_mph,n_vp_trips
0,1770249a5a2e770ca90628434d4934b1,2024 Q3,0,all_day,80-89 Coastal Express,338.12,330.47,478.76,974.82,0.0,163.72,30.58,833
1,1770249a5a2e770ca90628434d4934b1,2024 Q3,0,offpeak,80-89 Coastal Express,182.77,180.08,192.34,527.31,0.0,134.2,31.48,637
2,1770249a5a2e770ca90628434d4934b1,2024 Q3,0,peak,80-89 Coastal Express,584.0,569.0,1038.62,1685.0,0.0,259.65,27.65,196
3,1770249a5a2e770ca90628434d4934b1,2024 Q3,1,all_day,80-89 Coastal Express,514.63,503.79,767.22,1482.95,0.0,245.64,25.15,931
4,1770249a5a2e770ca90628434d4934b1,2024 Q3,1,offpeak,80-89 Coastal Express,120.0,115.5,122.2,341.0,0.0,122.2,18.78,98
5,1770249a5a2e770ca90628434d4934b1,2024 Q3,1,peak,80-89 Coastal Express,509.76,499.47,786.86,1470.24,0.0,260.16,25.01,833
6,1770249a5a2e770ca90628434d4934b1,2024 Q4,0,all_day,80-89 Coastal Express,289.39,285.89,524.42,855.56,0.0,202.83,28.47,2646
7,1770249a5a2e770ca90628434d4934b1,2024 Q4,0,offpeak,80-89 Coastal Express,167.48,165.21,265.81,494.79,0.0,199.92,28.68,2058
8,1770249a5a2e770ca90628434d4934b1,2024 Q4,0,peak,80-89 Coastal Express,472.33,466.33,851.99,1397.0,0.0,213.0,27.88,588
9,1770249a5a2e770ca90628434d4934b1,2024 Q4,1,all_day,80-89 Coastal Express,436.29,430.95,737.83,1287.44,0.0,260.01,25.91,2695


In [14]:
schd_metrics = segment_calcs.calculate_weighted_averages(
    df=one_route[
        quarterly_rollup.groupby_cols
        + quarterly_rollup.schd_metric_cols
        + ["n_scheduled_trips"]
    ],
    group_cols=quarterly_rollup.groupby_cols,
    metric_cols=quarterly_rollup.schd_metric_cols,
    weight_col="n_scheduled_trips",
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[c] = df[c] * df[weight_col]


In [15]:
schd_metrics.head(12)

Unnamed: 0,schedule_gtfs_dataset_key,year_quarter,direction_id,time_period,recent_combined_name,avg_scheduled_service_minutes,avg_stop_miles,frequency,total_scheduled_service_minutes,n_scheduled_trips
0,1770249a5a2e770ca90628434d4934b1,2024 Q3,0,all_day,80-89 Coastal Express,105.71,1.82,0.1,249.26,840
1,1770249a5a2e770ca90628434d4934b1,2024 Q3,0,offpeak,80-89 Coastal Express,109.58,2.09,0.09,155.14,644
2,1770249a5a2e770ca90628434d4934b1,2024 Q3,0,peak,80-89 Coastal Express,93.0,0.94,0.5,374.0,196
3,1770249a5a2e770ca90628434d4934b1,2024 Q3,1,all_day,80-89 Coastal Express,109.9,2.47,0.12,318.58,931
4,1770249a5a2e770ca90628434d4934b1,2024 Q3,1,offpeak,80-89 Coastal Express,89.0,8.14,0.06,86.0,98
5,1770249a5a2e770ca90628434d4934b1,2024 Q3,1,peak,80-89 Coastal Express,112.35,1.81,0.36,310.29,833
6,1770249a5a2e770ca90628434d4934b1,2024 Q4,0,all_day,80-89 Coastal Express,106.72,12.92,0.1,244.44,2646
7,1770249a5a2e770ca90628434d4934b1,2024 Q4,0,offpeak,80-89 Coastal Express,110.64,13.67,0.09,154.71,2058
8,1770249a5a2e770ca90628434d4934b1,2024 Q4,0,peak,80-89 Coastal Express,93.0,10.31,0.5,374.0,588
9,1770249a5a2e770ca90628434d4934b1,2024 Q4,1,all_day,80-89 Coastal Express,109.9,6.51,0.12,301.0,2793


In [16]:
# Calculate trips by timeliness which doesn't need weighted average
timeliness_df = one_route[
    quarterly_rollup.groupby_cols + quarterly_rollup.rt_metric_no_weighted_avg
]
timeliness_df2 = (
    timeliness_df.groupby(quarterly_rollup.groupby_cols)
    .agg({"is_early": "sum", "is_ontime": "sum", "is_late": "sum"})
    .reset_index()
)

In [17]:
timeliness_df2.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,year_quarter,direction_id,time_period,recent_combined_name,is_early,is_ontime,is_late
0,1770249a5a2e770ca90628434d4934b1,2024 Q3,0,all_day,80-89 Coastal Express,196,49,588
1,1770249a5a2e770ca90628434d4934b1,2024 Q3,0,offpeak,80-89 Coastal Express,196,49,392


In [18]:
crosswalk = one_route[quarterly_rollup.groupby_cols + quarterly_rollup.crosswalk_cols]

In [19]:
m1 = (
    pd.merge(rt_metrics, schd_metrics, on=quarterly_rollup.groupby_cols)
    .merge(timeliness_df2, on=quarterly_rollup.groupby_cols)
    .merge(crosswalk, on=quarterly_rollup.groupby_cols, how="inner")
)

In [20]:
len(rt_metrics), len(schd_metrics), len(timeliness_df2), len(crosswalk)

(24, 24, 24, 17661)

In [21]:
m1.shape

(17661, 44)

In [22]:
DIGEST_RT_SCHED_QTR = "digest/quarterly_schedule_vp_metrics"

In [23]:
qtr_df_script = pd.read_parquet(f"{RT_SCHED_GCS}{DIGEST_RT_SCHED_QTR}.parquet")

In [24]:
len(qtr_df_script)

123619

In [25]:
one_route = qtr_df_script.loc[
    (qtr_df_script.portfolio_organization_name == portfolio_name)
    & (qtr_df_script.recent_combined_name == "80-89 Coastal Express")
]

In [26]:
quarterly_rollup.group_cols

['year_quarter',
 'schedule_gtfs_dataset_key',
 'recent_route_id',
 'direction_id',
 'time_period']

In [27]:
one_route.shape

(164, 48)

In [37]:
len(one_route.drop_duplicates(subset=quarterly_rollup.groupby_cols))

24

In [29]:
len(one_route.drop_duplicates(subset=quarterly_rollup.group_cols))

164

In [30]:
DIGEST_RT_SCHED_QTR = "digest/quarterly_schedule_vp_metrics"

In [38]:
qtr_df = pd.read_parquet(f"{RT_SCHED_GCS}{DIGEST_RT_SCHED_QTR}.parquet")

In [41]:
qtr_df.columns

Index(['schedule_gtfs_dataset_key', 'route_id', 'direction_id', 'time_period',
       'avg_scheduled_service_minutes', 'avg_stop_miles', 'n_scheduled_trips',
       'frequency', 'is_express', 'is_ferry', 'is_rail', 'is_coverage',
       'is_local', 'is_downtown_local', 'is_rapid', 'typology', 'name',
       'combined_name', 'recent_combined_name', 'recent_route_id',
       'route_primary_direction', 'minutes_atleast1_vp', 'minutes_atleast2_vp',
       'total_rt_service_minutes', 'total_scheduled_service_minutes',
       'total_vp', 'vp_in_shape', 'is_early', 'is_ontime', 'is_late',
       'n_vp_trips', 'vp_per_minute', 'pct_in_shape',
       'pct_rt_journey_atleast1_vp', 'pct_rt_journey_atleast2_vp',
       'pct_sched_journey_atleast1_vp', 'pct_sched_journey_atleast2_vp',
       'rt_sched_journey_ratio', 'avg_rt_service_minutes', 'sched_rt_category',
       'speed_mph', 'schedule_source_record_id', 'base64_url',
       'organization_source_record_id', 'organization_name',
       'caltr

In [42]:
qtr_df = qtr_df.loc[
    (qtr_df["portfolio_organization_name"] == portfolio_name)
    & (qtr_df["recent_combined_name"] == "80-89 Coastal Express")
]

In [43]:
qtr_df.year_quarter.value_counts()

2024 Q3    6
2024 Q4    6
2025 Q1    6
2025 Q2    6
Name: year_quarter, dtype: int64

In [35]:
len(qtr_df.drop_duplicates())

24

In [44]:
ROUTE_DIR_QTR_EXPORT = "digest/quarterly_schedule_vp_metrics_report"

In [45]:
qtr_report = pd.read_parquet(f"{RT_SCHED_GCS}{ROUTE_DIR_QTR_EXPORT}.parquet")

In [46]:
qtr_report.columns

Index(['Direction (0/1)', 'Period', 'Average Scheduled Service (trip minutes)',
       '# Scheduled Trips', '# Realtime Trips', 'Route', 'Direction',
       '# Minutes with 1+ VP per Minute', '# Minutes with 2+ VP per Minute',
       '# Early Arrival Trips', '# On-Time Trips', '# Late Trips',
       'Average VP per Minute', '% VP within Scheduled Shape',
       '% Scheduled Trip w/ 1+ VP/Minute', '% Scheduled Trip w/ 2+ VP/Minute',
       'Realtime versus Scheduled Service Ratio', 'Speed (MPH)',
       'Portfolio Organization Name', 'Headway (Minutes)', 'GTFS Availability',
       'Average Stop Distance (Miles)', 'Quarter'],
      dtype='object')

In [47]:
qtr_report = qtr_report.loc[
    (qtr_report["Portfolio Organization Name"] == portfolio_name)
    & (qtr_report["Route"] == "80-89 Coastal Express")
]

In [49]:
qtr_report.Quarter.value_counts()

2024 Q3    6
2024 Q4    6
2025 Q1    6
2025 Q2    6
Name: Quarter, dtype: int64