# Quarterly Roll Up 
* 6/9: The quarterly roll-up is producing repeated rows: the same quarter appears multiple times for a route, when there should be only one row per route per quarter.
[Issue](https://github.com/cal-itp/data-analyses/issues/1416)

In [1]:
import _report_route_dir_visuals
import _route_dir_data_prep
import altair as alt
import calitp_data_analysis.magics
import deploy_portfolio_yaml
import pandas as pd
import quarterly_rollup
import yaml
from IPython.display import HTML, Image, Markdown, display, display_html
from segment_speed_utils import (
    gtfs_schedule_wrangling,
    metrics,
    project_vars,
    segment_calcs,
    time_series_utils,
)
from shared_utils import (
    catalog_utils,
    gtfs_utils_v2,
    portfolio_utils,
    publish_utils,
    rt_dates,
    rt_utils,
    time_helpers,
)
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS

In [2]:
# Notebook display settings: show up to 100 columns, never truncate rows or
# cell contents, and render floats with two decimal places.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [3]:
from omegaconf import OmegaConf

# Load the local YAML file "readable2.yml" with OmegaConf.
# NOTE(review): `readable_dict` is not referenced again in the visible cells;
# presumably a lookup of display-friendly names -- confirm before removing.
readable_dict = OmegaConf.load("readable2.yml")

In [4]:
# The `y2025_dates` collection from shared_utils.rt_dates.
# NOTE(review): not referenced again in the visible cells.
analysis_date_list = rt_dates.y2025_dates

In [5]:
# Portfolio organization under investigation; used further down to filter the
# quarterly table on `portfolio_organization_name`.
# Alternative portfolio that can be swapped in:
#  portfolio_name = "City and County of San Francisco"
portfolio_name = "Ventura County (VCTC, Gold Coast, Cities of Camarillo, Moorpark, Ojai, Simi Valley, Thousand Oaks)"

In [6]:
# Catalog entry for the monthly digest table that the quarterly roll-up starts from.
# NOTE: this table is not the true original data -- the original dataframes are
# the upstream ones (speed, crosswalk, etc.) that feed it.
# If a problem is still present at the end of this notebook, it may originate
# upstream of this table rather than in the roll-up itself.
DIGEST_RT_SCHED_MONTH = GTFS_DATA_DICT.digest_tables.monthly_route_schedule_vp

In [7]:
# Load the monthly digest table (parquet) from the RT_SCHED_GCS location.
monthly_df = pd.read_parquet(f"{RT_SCHED_GCS}{DIGEST_RT_SCHED_MONTH}.parquet")

In [9]:
# Catalog entry (`quarterly_route_schedule_vp`) used to build the parquet path
# of the already-published quarterly digest table.
DIGEST_RT_SCHED_QTR = GTFS_DATA_DICT.digest_tables.quarterly_route_schedule_vp

In [10]:
# Load the saved quarterly digest table so its rows can be inspected for repeats.
quarter_df = pd.read_parquet(f"{RT_SCHED_GCS}{DIGEST_RT_SCHED_QTR}.parquet")

In [14]:
# Narrow the quarterly table to a single route to use as the test case.
qtr_86_coastal = quarter_df[
    quarter_df["recent_combined_name"] == "86 80-89 Coastal Express"
]

In [15]:
# Compare the row count with the count after dropping fully identical rows;
# equal numbers mean the repeated quarters are not exact duplicate rows.
len(qtr_86_coastal), len(qtr_86_coastal.drop_duplicates())

(42, 42)

In [18]:
# Show the quarterly rows for the test route, ordered so that rows sharing a
# quarter / direction / time period sit next to each other.
qtr_86_coastal[
    ["direction_id", "time_period", "n_scheduled_trips", "year_quarter"]
].sort_values(by=["year_quarter", "direction_id", "time_period"])

Unnamed: 0,direction_id,time_period,n_scheduled_trips,year_quarter
126128,0,all_day,4,2023 Q2
126193,0,offpeak,4,2023 Q2
126303,1,all_day,4,2023 Q2
126376,1,peak,4,2023 Q2
126433,0,all_day,6,2023 Q3
126500,0,offpeak,6,2023 Q3
126613,1,all_day,6,2023 Q3
126687,1,peak,6,2023 Q3
126743,0,all_day,6,2023 Q4
126806,0,offpeak,6,2023 Q4


In [20]:
# Monthly scheduled-trip counts for the test route in direction 0, for
# comparison against the quarterly figures.
monthly_df.loc[
    (monthly_df["recent_combined_name"] == "86 80-89 Coastal Express")
    & (monthly_df["direction_id"] == 0),
    ["service_date", "n_scheduled_trips"],
]

Unnamed: 0,service_date,n_scheduled_trips
29616,2023-04-12,2
29617,2023-06-14,2
29618,2023-07-12,2
29619,2023-08-15,2
29620,2023-09-13,2
29621,2023-10-11,2
29622,2023-11-15,2
29623,2023-12-13,2
29624,2024-01-17,2
29625,2024-02-14,2


In [None]:
# Monthly rows for the test route only.
coastal_86 = monthly_df[
    monthly_df["recent_combined_name"] == "86 80-89 Coastal Express"
]

In [None]:
# Row count before vs. after dropping fully identical rows in the monthly data.
len(coastal_86), len(coastal_86.drop_duplicates())

In [None]:
# Peek at the first rows of the monthly data for the test route.
coastal_86.head()

In [None]:
# Input frame for the manual roll-up below.
# NOTE(review): despite the `_dedup` name, no drop_duplicates() is applied
# here -- this is the same filter used for `coastal_86`. Confirm whether
# de-duplication was intended.
coastal_86_dedup = monthly_df[
    monthly_df["recent_combined_name"] == "86 80-89 Coastal Express"
]

In [None]:
# Run the monthly rows through time_helpers.add_quarter, keyed on service_date.
# Presumably this adds the `year_quarter` column that the cells below group and
# filter on -- confirm against the helper.
df = coastal_86_dedup.pipe(time_helpers.add_quarter, "service_date")

In [None]:
# List the distinct portfolio organization names present for this route.
df.portfolio_organization_name.unique()

In [None]:
# Roll the real-time metric columns up to the quarterly grouping with
# segment_calcs.calculate_weighted_averages, using n_vp_trips as the weight.
_rt_cols = [
    *quarterly_rollup.groupby_cols,
    *quarterly_rollup.rt_metric_cols,
    "n_vp_trips",
]
rt_metrics = segment_calcs.calculate_weighted_averages(
    df=df[_rt_cols],
    group_cols=quarterly_rollup.groupby_cols,
    metric_cols=quarterly_rollup.rt_metric_cols,
    weight_col="n_vp_trips",
)

In [None]:
# Number of rows / columns produced by the real-time roll-up.
rt_metrics.shape

In [None]:
# Columns that survive the real-time roll-up.
rt_metrics.columns

### Check the quarterly roll-up

In [None]:
# List the service dates available for this route; the 2025 Q1 dates are
# hard-coded into the filter in the next cell.
coastal_86_dedup.service_date.unique()

In [None]:
# Keep only the three monthly snapshots that fall in 2025 Q1.
# The dates are cast to Timestamps explicitly: matching a datetime column
# against raw strings with `isin` relies on implicit string-to-datetime
# casting, which recent pandas versions deprecate (FutureWarning) and will
# eventually stop treating as a match.
# Assumes `service_date` is a datetime64 column -- TODO confirm.
coastal_86_first_quarter = coastal_86_dedup.loc[
    coastal_86_dedup.service_date.isin(
        pd.to_datetime(["2025-01-15", "2025-02-12", "2025-03-12"])
    )
]

In [None]:
# Distinct (scheduled service minutes, vp trips) pairs for direction 0 in the
# first quarter -- the raw inputs to check the rolled-up numbers against.
coastal_86_first_quarter.loc[
    coastal_86_first_quarter["direction_id"] == 0,
    ["total_scheduled_service_minutes", "n_vp_trips"],
].drop_duplicates()

In [None]:
# Row count before vs. after dropping fully identical rows in the Q1 subset.
len(coastal_86_first_quarter), len(coastal_86_first_quarter.drop_duplicates())

In [None]:
# Direction-0 rows of the Q1 subset, transposed so each row becomes a column
# and differing fields are easy to compare side by side.
coastal_86_first_quarter.loc[coastal_86_first_quarter.direction_id == 0].drop_duplicates().T

In [None]:
# Rolled-up real-time metrics for 2025 Q1.
# NOTE(review): the saved quarterly table shows `year_quarter` values such as
# "2023 Q2" (with a space), while this in-memory frame is filtered on
# "2025_Q1" (with an underscore). An empty result here would mean the label
# format differs -- confirm against time_helpers.add_quarter.
rt_metrics.loc[rt_metrics.year_quarter == "2025_Q1"]

In [None]:
# Roll the schedule metric columns up to the quarterly grouping with
# segment_calcs.calculate_weighted_averages, using n_scheduled_trips as the weight.
_schd_cols = [
    *quarterly_rollup.groupby_cols,
    *quarterly_rollup.schd_metric_cols,
    "n_scheduled_trips",
]
schd_metrics = segment_calcs.calculate_weighted_averages(
    df=df[_schd_cols],
    group_cols=quarterly_rollup.groupby_cols,
    metric_cols=quarterly_rollup.schd_metric_cols,
    weight_col="n_scheduled_trips",
)

In [None]:
# Rolled-up schedule metrics for 2025 Q1.
schd_metrics.loc[schd_metrics.year_quarter == "2025_Q1"]

In [None]:
# Timeliness counts are summed per quarterly group rather than
# weighted-averaged like the other metrics.
_timeliness_aggs = dict.fromkeys(["is_early", "is_ontime", "is_late"], "sum")
timeliness_df = df[
    [*quarterly_rollup.groupby_cols, *quarterly_rollup.rt_metric_no_weighted_avg]
]
_timeliness_grouped = timeliness_df.groupby(quarterly_rollup.groupby_cols)
timeliness_df2 = _timeliness_grouped.agg(_timeliness_aggs).reset_index()

In [None]:
# Summed early / on-time / late counts for 2025 Q1.
timeliness_df2.loc[timeliness_df2.year_quarter == "2025_Q1"]

In [None]:
# Combine the three quarterly roll-ups (real-time, schedule, timeliness) on
# the shared grouping columns; both merges use pandas' default join type.
m1 = rt_metrics.merge(schd_metrics, on=quarterly_rollup.groupby_cols)
m1 = m1.merge(timeliness_df2, on=quarterly_rollup.groupby_cols)

In [None]:
# Number of merged rows for 2025 Q1.
len(m1.loc[m1.year_quarter == "2025_Q1"])

In [None]:
# Same count after dropping fully identical rows; a smaller number than the
# plain count would indicate exact duplicates in the merged result.
len(m1.loc[m1.year_quarter == "2025_Q1"].drop_duplicates())

In [None]:
# Merged 2025 Q1 rows, transposed for side-by-side comparison.
m1.loc[m1.year_quarter == "2025_Q1"].T

In [None]:
# Columns present in the monthly table but absent from the merged roll-up,
# i.e. the descriptive columns that still have to be re-attached.
set(monthly_df.columns) - set(m1.columns)

In [None]:
# Descriptive (non-metric) columns to re-attach to the roll-up, together with
# the key columns needed to join them back on.
_crosswalk_cols = [
    "base64_url",
    "caltrans_district",
    "combined_name",
    "is_coverage",
    "is_downtown_local",
    "is_express",
    "is_ferry",
    "is_local",
    "is_rail",
    "is_rapid",
    "name",
    "recent_route_id",
    "route_id",
    "route_primary_direction",
    "sched_rt_category",
    "schedule_gtfs_dataset_key",
    "schedule_source_record_id",
    "typology",
    "portfolio_organization_name",
    "year_quarter",
    "direction_id",
    "time_period",
    "recent_combined_name",
]
# Rows are de-duplicated across ALL of these columns, not just the join keys.
crosswalk_df = df[_crosswalk_cols].drop_duplicates()

In [None]:
# Show the grouping columns that serve as the merge keys.
quarterly_rollup.groupby_cols

In [None]:
# Number of distinct crosswalk rows / columns.
crosswalk_df.shape

In [None]:
# Re-attach the descriptive columns to the rolled-up metrics.
# NOTE(review): crosswalk_df is de-duplicated on all of its columns, so a
# grouping key that maps to more than one crosswalk row (e.g. a descriptive
# value that changed between months of the same quarter) would multiply rows
# in this merge. Compare len(m1) with len(m2) to check.
m2 = m1.merge(crosswalk_df, on=quarterly_rollup.groupby_cols)

In [None]:
# Row counts before and after attaching the crosswalk; a larger second number
# means the crosswalk merge introduced extra rows.
len(m1), len(m2)

In [None]:
# 2025 Q1 rows after the crosswalk merge, transposed for comparison.
m2.loc[m2.year_quarter == "2025_Q1"].T

In [None]:
# The two service-minute columns are renamed to their shorter names before
# calling metrics.calculate_rt_vs_schedule_metrics, then restored to the
# `total_*` names afterwards.
_to_short_names = {
    "total_rt_service_minutes": "rt_service_minutes",
    "total_scheduled_service_minutes": "scheduled_service_minutes",
}
_to_total_names = {short: total for total, short in _to_short_names.items()}
m2 = (
    m2.rename(columns=_to_short_names)
    .pipe(metrics.calculate_rt_vs_schedule_metrics)
    .rename(columns=_to_total_names)
)

In [None]:
# Ratio of real-time service minutes to scheduled service minutes.
m2["rt_sched_journey_ratio"] = m2["total_rt_service_minutes"].div(
    m2["total_scheduled_service_minutes"]
)

In [None]:
# Use the monthly frame's column order as the template for the quarterly output.
col_proper_order = coastal_86_dedup.columns.tolist()

In [None]:
# Drop service_date from the template; the cell that reorders m2 appends
# year_quarter in its place.
# NOTE: list.remove raises ValueError if this cell is re-run without first
# rebuilding col_proper_order.
col_proper_order.remove("service_date")

In [None]:
# Reorder the roll-up to follow the monthly column layout, with year_quarter last.
m2 = m2[[*col_proper_order, "year_quarter"]]

In [None]:
# Final manually rolled-up 2025 Q1 rows, transposed for inspection.
m2.loc[m2.year_quarter == "2025_Q1"].T

In [None]:
# NOTE(review): repeats an earlier assignment of the same name to the same
# catalog entry.
DIGEST_RT_SCHED_QTR = GTFS_DATA_DICT.digest_tables.quarterly_route_schedule_vp

In [None]:
# Read the saved quarterly digest table again, for comparison with the manual
# roll-up above. This is the same parquet path used to build `quarter_df`.
qtr_df = pd.read_parquet(f"{RT_SCHED_GCS}{DIGEST_RT_SCHED_QTR}.parquet")

In [None]:
# Size of the saved quarterly table for the selected portfolio.
qtr_df.loc[qtr_df.portfolio_organization_name == portfolio_name].shape

In [None]:
# Routes present in the saved quarterly table for the selected portfolio.
qtr_df.loc[qtr_df.portfolio_organization_name == portfolio_name].recent_combined_name.unique()

In [None]:
# Quarter labels present in the saved table; also shows their exact format.
qtr_df.year_quarter.unique()

In [None]:
# Row count for the test route in 2025 Q1 in the saved table.
# The label here uses a space ("2025 Q1"), unlike the "2025_Q1" filters
# applied to the in-memory roll-up frames above.
qtr_df.loc[
    (qtr_df.recent_combined_name == "86 80-89 Coastal Express") & (qtr_df.year_quarter == "2025 Q1")
].shape