# Quarterly Roll Up 
* 6/9: Something is wrong with it, there are multiple rows repeated for the same quarter when there should only be one row for one quarter for one route. 
[Issue](https://github.com/cal-itp/data-analyses/issues/1416)

In [1]:
import _report_route_dir_visuals
import _route_dir_data_prep
import altair as alt
import calitp_data_analysis.magics
import deploy_portfolio_yaml
import pandas as pd
import quarterly_rollup
import yaml
from IPython.display import HTML, Image, Markdown, display, display_html
from segment_speed_utils import (
    gtfs_schedule_wrangling,
    metrics,
    project_vars,
    segment_calcs,
    time_series_utils,
)
from shared_utils import (
    catalog_utils,
    gtfs_utils_v2,
    portfolio_utils,
    publish_utils,
    rt_dates,
    rt_utils,
    time_helpers,
)
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
from omegaconf import OmegaConf

readable_dict = OmegaConf.load("readable2.yml")

In [4]:
analysis_date_list = rt_dates.y2025_dates

In [5]:
#  portfolio_name = "City and County of San Francisco"
portfolio_name = "Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)"

In [6]:
# Read in og df
DIGEST_RT_SCHED_MONTH = GTFS_DATA_DICT.digest_tables.monthly_route_schedule_vp

In [7]:
monthly_df = pd.read_parquet(f"{RT_SCHED_GCS}{DIGEST_RT_SCHED_MONTH}.parquet")

In [8]:
monthly_df.loc[monthly_df.recent_combined_name == "86 80-89 Coastal Express"].shape

(672, 46)

In [9]:
coastal_86 = monthly_df.loc[
    monthly_df.recent_combined_name == "86 80-89 Coastal Express"
]

In [10]:
len(coastal_86), len(coastal_86.drop_duplicates())

(672, 100)

In [11]:
coastal_86_dedup = monthly_df.loc[
    monthly_df.recent_combined_name == "86 80-89 Coastal Express"
]

In [13]:
df = time_helpers.add_quarter(coastal_86_dedup, 'service_date')

In [14]:
df.portfolio_organization_name.unique()

array(['Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)'],
      dtype=object)

In [15]:
rt_metrics = segment_calcs.calculate_weighted_averages(
    df=df[
        quarterly_rollup.groupby_cols + quarterly_rollup.rt_metric_cols + ["n_vp_trips"]
    ],
    group_cols=quarterly_rollup.groupby_cols,
    metric_cols=quarterly_rollup.rt_metric_cols,
    weight_col="n_vp_trips",
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[c] = df[c] * df[weight_col]


In [16]:
rt_metrics.shape

(36, 13)

In [17]:
rt_metrics.columns

Index(['portfolio_organization_name', 'year_quarter', 'direction_id',
       'time_period', 'recent_combined_name', 'minutes_atleast1_vp',
       'minutes_atleast2_vp', 'total_rt_service_minutes', 'total_vp',
       'vp_in_shape', 'avg_rt_service_minutes', 'speed_mph', 'n_vp_trips'],
      dtype='object')

### Check the rolling up

In [82]:
coastal_86_dedup.service_date.unique()

array(['2023-04-12T00:00:00.000000000', '2023-06-14T00:00:00.000000000',
       '2023-07-12T00:00:00.000000000', '2023-08-15T00:00:00.000000000',
       '2023-09-13T00:00:00.000000000', '2023-10-11T00:00:00.000000000',
       '2023-11-15T00:00:00.000000000', '2023-12-13T00:00:00.000000000',
       '2024-01-17T00:00:00.000000000', '2024-02-14T00:00:00.000000000',
       '2024-03-13T00:00:00.000000000', '2024-04-17T00:00:00.000000000',
       '2024-05-22T00:00:00.000000000', '2024-06-12T00:00:00.000000000',
       '2024-07-17T00:00:00.000000000', '2024-08-14T00:00:00.000000000',
       '2024-09-18T00:00:00.000000000', '2024-10-16T00:00:00.000000000',
       '2024-11-13T00:00:00.000000000', '2024-12-11T00:00:00.000000000',
       '2025-01-15T00:00:00.000000000', '2025-02-12T00:00:00.000000000',
       '2025-03-12T00:00:00.000000000', '2025-04-16T00:00:00.000000000',
       '2025-05-14T00:00:00.000000000'], dtype='datetime64[ns]')

In [85]:
coastal_86_first_quarter = coastal_86_dedup.loc[coastal_86_dedup.service_date.isin(['2025-01-15T00:00:00.000000000', '2025-02-12T00:00:00.000000000',
       '2025-03-12T00:00:00.000000000'])]

In [90]:
coastal_86_first_quarter.loc[coastal_86_first_quarter.direction_id == 0][["total_scheduled_service_minutes", "n_vp_trips"]].drop_duplicates()

Unnamed: 0,total_scheduled_service_minutes,n_vp_trips
57378,225.0,2


In [94]:
len(coastal_86_first_quarter), len(coastal_86_first_quarter.drop_duplicates())

(84, 12)

In [95]:
coastal_86_first_quarter.loc[coastal_86_first_quarter.direction_id == 0].drop_duplicates().T

Unnamed: 0,57378,57385,57392,57549,57556,57563
schedule_gtfs_dataset_key,1770249a5a2e770ca90628434d4934b1,1770249a5a2e770ca90628434d4934b1,1770249a5a2e770ca90628434d4934b1,1770249a5a2e770ca90628434d4934b1,1770249a5a2e770ca90628434d4934b1,1770249a5a2e770ca90628434d4934b1
route_id,4145,4145,4145,4145,4145,4145
direction_id,0,0,0,0,0,0
time_period,all_day,all_day,all_day,offpeak,offpeak,offpeak
avg_scheduled_service_minutes,112.50,112.50,112.50,112.50,112.50,112.50
avg_stop_miles,3.48,56.86,56.86,3.48,56.86,56.86
n_scheduled_trips,2,2,2,2,2,2
frequency,0.08,0.08,0.08,0.12,0.12,0.12
service_date,2025-01-15 00:00:00,2025-02-12 00:00:00,2025-03-12 00:00:00,2025-01-15 00:00:00,2025-02-12 00:00:00,2025-03-12 00:00:00
is_express,1.00,1.00,1.00,1.00,1.00,1.00


In [18]:
rt_metrics.loc[rt_metrics.year_quarter == "2025_Q1"]

Unnamed: 0,portfolio_organization_name,year_quarter,direction_id,time_period,recent_combined_name,minutes_atleast1_vp,minutes_atleast2_vp,total_rt_service_minutes,total_vp,vp_in_shape,avg_rt_service_minutes,speed_mph,n_vp_trips
28,"Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)",2025_Q1,0,all_day,86 80-89 Coastal Express,174.33,171.0,516.93,512.33,90.0,258.47,39.72,42
29,"Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)",2025_Q1,0,offpeak,86 80-89 Coastal Express,174.33,171.0,516.93,512.33,90.0,258.47,39.72,42
30,"Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)",2025_Q1,1,all_day,86 80-89 Coastal Express,356.67,350.67,907.89,1046.33,279.0,453.94,11.08,42
31,"Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)",2025_Q1,1,peak,86 80-89 Coastal Express,356.67,350.67,907.89,1046.33,279.0,453.94,11.08,42


In [19]:
schd_metrics = segment_calcs.calculate_weighted_averages(
    df=df[
        quarterly_rollup.groupby_cols
        + quarterly_rollup.schd_metric_cols
        + ["n_scheduled_trips"]
    ],
    group_cols=quarterly_rollup.groupby_cols,
    metric_cols=quarterly_rollup.schd_metric_cols,
    weight_col="n_scheduled_trips",
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[c] = df[c] * df[weight_col]


In [20]:
schd_metrics.loc[schd_metrics.year_quarter == "2025_Q1"]

Unnamed: 0,portfolio_organization_name,year_quarter,direction_id,time_period,recent_combined_name,avg_scheduled_service_minutes,avg_stop_miles,frequency,total_scheduled_service_minutes,n_scheduled_trips
28,"Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)",2025_Q1,0,all_day,86 80-89 Coastal Express,112.5,39.07,0.08,225.0,42
29,"Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)",2025_Q1,0,offpeak,86 80-89 Coastal Express,112.5,39.07,0.12,225.0,42
30,"Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)",2025_Q1,1,all_day,86 80-89 Coastal Express,106.0,2.88,0.08,212.0,42
31,"Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)",2025_Q1,1,peak,86 80-89 Coastal Express,106.0,2.88,0.25,212.0,42


In [21]:
timeliness_df = df[
    quarterly_rollup.groupby_cols + quarterly_rollup.rt_metric_no_weighted_avg
]
timeliness_df2 = (
    timeliness_df.groupby(quarterly_rollup.groupby_cols)
    .agg({"is_early": "sum", "is_ontime": "sum", "is_late": "sum"})
    .reset_index()
)

In [22]:
timeliness_df2.loc[timeliness_df2.year_quarter == "2025_Q1"]

Unnamed: 0,portfolio_organization_name,year_quarter,direction_id,time_period,recent_combined_name,is_early,is_ontime,is_late
28,"Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)",2025_Q1,0,all_day,86 80-89 Coastal Express,21,0,21
29,"Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)",2025_Q1,0,offpeak,86 80-89 Coastal Express,21,0,21
30,"Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)",2025_Q1,1,all_day,86 80-89 Coastal Express,0,0,42
31,"Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)",2025_Q1,1,peak,86 80-89 Coastal Express,0,0,42


In [26]:
# Merge all the dataframes
m1 = pd.merge(rt_metrics, schd_metrics, on=quarterly_rollup.groupby_cols).merge(
    timeliness_df2, on=quarterly_rollup.groupby_cols
)

In [27]:
len(m1.loc[m1.year_quarter == "2025_Q1"])

4

In [28]:
len(m1.loc[m1.year_quarter == "2025_Q1"].drop_duplicates())

4

In [29]:
m1.loc[m1.year_quarter == "2025_Q1"].T

Unnamed: 0,28,29,30,31
portfolio_organization_name,"Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)","Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)","Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)","Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)"
year_quarter,2025_Q1,2025_Q1,2025_Q1,2025_Q1
direction_id,0,0,1,1
time_period,all_day,offpeak,all_day,peak
recent_combined_name,86 80-89 Coastal Express,86 80-89 Coastal Express,86 80-89 Coastal Express,86 80-89 Coastal Express
minutes_atleast1_vp,174.33,174.33,356.67,356.67
minutes_atleast2_vp,171.00,171.00,350.67,350.67
total_rt_service_minutes,516.93,516.93,907.89,907.89
total_vp,512.33,512.33,1046.33,1046.33
vp_in_shape,90.00,90.00,279.00,279.00


In [30]:
set(monthly_df.columns) - set(m1.columns)

{'base64_url',
 'caltrans_district',
 'combined_name',
 'is_coverage',
 'is_downtown_local',
 'is_express',
 'is_ferry',
 'is_local',
 'is_rail',
 'is_rapid',
 'name',
 'pct_in_shape',
 'pct_rt_journey_atleast1_vp',
 'pct_rt_journey_atleast2_vp',
 'pct_sched_journey_atleast1_vp',
 'pct_sched_journey_atleast2_vp',
 'recent_route_id',
 'route_id',
 'route_primary_direction',
 'rt_sched_journey_ratio',
 'sched_rt_category',
 'schedule_gtfs_dataset_key',
 'schedule_source_record_id',
 'service_date',
 'typology',
 'vp_per_minute'}

In [31]:
crosswalk_df = df[
    [
        "base64_url",
        "caltrans_district",
        "combined_name",
        "is_coverage",
        "is_downtown_local",
        "is_express",
        "is_ferry",
        "is_local",
        "is_rail",
        "is_rapid",
        "name",
        "recent_route_id",
        "route_id",
        "route_primary_direction",
        "sched_rt_category",
        "schedule_gtfs_dataset_key",
        "schedule_source_record_id",
        "typology",
        "portfolio_organization_name",
        "year_quarter",
        "direction_id",
        "time_period",
        "recent_combined_name"
    ]
].drop_duplicates()

In [32]:
quarterly_rollup.groupby_cols

['portfolio_organization_name',
 'year_quarter',
 'direction_id',
 'time_period',
 'recent_combined_name']

In [33]:
crosswalk_df.shape

(42, 23)

In [45]:
m2 = pd.merge(m1, crosswalk_df, on = quarterly_rollup.groupby_cols)

In [46]:
len(m1), len(m2)

(36, 42)

In [47]:
m2.loc[m2.year_quarter == "2025_Q1"].T

Unnamed: 0,34,35,36,37
portfolio_organization_name,"Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)","Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)","Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)","Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)"
year_quarter,2025_Q1,2025_Q1,2025_Q1,2025_Q1
direction_id,0,0,1,1
time_period,all_day,offpeak,all_day,peak
recent_combined_name,86 80-89 Coastal Express,86 80-89 Coastal Express,86 80-89 Coastal Express,86 80-89 Coastal Express
minutes_atleast1_vp,174.33,174.33,356.67,356.67
minutes_atleast2_vp,171.00,171.00,350.67,350.67
total_rt_service_minutes,516.93,516.93,907.89,907.89
total_vp,512.33,512.33,1046.33,1046.33
vp_in_shape,90.00,90.00,279.00,279.00


In [48]:
m2 = m2.rename(
    columns={
        "total_rt_service_minutes": "rt_service_minutes",
        "total_scheduled_service_minutes": "scheduled_service_minutes",
    }).pipe(
     metrics.calculate_rt_vs_schedule_metrics
    ).rename(
       columns={
        "rt_service_minutes": "total_rt_service_minutes",
        "scheduled_service_minutes": "total_scheduled_service_minutes"

    })

In [49]:
m2["rt_sched_journey_ratio"] = (
        m2.total_rt_service_minutes / m2.total_scheduled_service_minutes
    )

In [58]:
col_proper_order = list(coastal_86_dedup.columns)

In [60]:
col_proper_order.remove("service_date")

In [62]:
m2 = m2[col_proper_order + ["year_quarter"]]

In [63]:
m2.loc[m2.year_quarter == "2025_Q1"].T

Unnamed: 0,34,35,36,37
schedule_gtfs_dataset_key,1770249a5a2e770ca90628434d4934b1,1770249a5a2e770ca90628434d4934b1,1770249a5a2e770ca90628434d4934b1,1770249a5a2e770ca90628434d4934b1
route_id,4145,4145,4145,4145
direction_id,0,0,1,1
time_period,all_day,offpeak,all_day,peak
avg_scheduled_service_minutes,112.50,112.50,106.00,106.00
avg_stop_miles,39.07,39.07,2.88,2.88
n_scheduled_trips,42,42,42,42
frequency,0.08,0.12,0.08,0.25
is_express,1.00,1.00,1.00,1.00
is_ferry,0.00,0.00,0.00,0.00


In [69]:
DIGEST_RT_SCHED_QTR = GTFS_DATA_DICT.digest_tables.quarterly_route_schedule_vp

In [73]:
qtr_df = pd.read_parquet(
        f"{RT_SCHED_GCS}{DIGEST_RT_SCHED_QTR}.parquet"
    )

In [74]:
qtr_df.loc[qtr_df.portfolio_organization_name == portfolio_name].shape

(2998, 46)

In [75]:
qtr_df.loc[qtr_df.portfolio_organization_name == portfolio_name].recent_combined_name.unique()

array(['1 Route 1', '10 Route 10', '2 Route 2', '20 Route 20',
       '30 Route 30', '40 Route 40 Newbury Park', '41 Route 41 Midtown A',
       '42 Route 42 Midtown B', '43 Route 43 TOB Express',
       '44 Route 44 Crosstown', '50 50-Hwy101', '60 60-Hwy126',
       '62 62-Hwy126', '70 70-East County', '72 72-East County',
       '73 73-East County', '73X 73x-East County',
       '77 77-Cross County Limited', '80 80-Coastal Express',
       '80X 80x-Coastal Express', '81 81-Coastal Express',
       '81B 81B-Coastal Express', '84 84-Coastal Express',
       '84U 84u-Coastal Express', '85 85-Coastal Express',
       '85C 85c-Coastal Express', '86 86-Coastal Express',
       '87 87-Coastal Express', '88 88-Coastal Express',
       '90 90-C Street/CSUCI', '97 97-Cam Metrolink/CSUCI',
       'Fillmore Fillmore Loop', 'Fixed Route Fixed Route Service',
       'KS Kanan Shuttle', 'Piru Piru', 'Route 1 Route 1',
       'Route 10 Route 10', 'Route 11 Route 11', 'Route 15 Route 15',
       'Rou

In [77]:
qtr_df.year_quarter.unique()

array(['2023 Q1', '2023 Q2', '2023 Q3', '2023 Q4', '2024 Q1', '2024 Q2',
       '2024 Q3', '2024 Q4', '2025 Q1', '2025 Q2'], dtype=object)

In [80]:
qtr_df.loc[
    (qtr_df.recent_combined_name == "86 80-89 Coastal Express") & (qtr_df.year_quarter == "2025 Q1")
].shape

(4, 46)