# Compare number of rows from single day sample to longer time average 

If we aggregate across time (single dates rolled into quarterly and annual averages), how many rows get reduced? Are we capturing the same number of groupings?

If we aggregate across space within operator (shapes-stop_seq -> route/dir/stop_pair), how many rows get reduced?

Scripts:
* [quarterly and annual averages](https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/scripts/quarter_year_averages.py)

Past notebooks:
* [segment stability notebook](https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/39_segment_stability.ipynb)
* [weekly average speeds](https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/26_weekly_avg_speeds.ipynb)
* [stop combinations notebook](https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/16_stop_combos_for_segments.ipynb)

In [1]:
import pandas as pd
import polars as pl
from great_tables import GT, nanoplot_options

from shared_utils import dask_utils, rt_dates
from segment_speed_utils.project_vars import GTFS_DATA_DICT, SEGMENT_GCS

In [2]:
# Dates compared: Mar-Dec 2023 dates + Jan-Oct 2024 dates.
# The last two entries of y2024_dates are dropped because
# shape speeds are not produced after Oct.
analysis_date_list = rt_dates.y2023_dates + rt_dates.y2024_dates[:-2]

# Route-direction segment file name, looked up in GTFS_DATA_DICT.
SPEED_FILE = GTFS_DATA_DICT.stop_segments.route_dir_single_segment
# Shape-level segment file name; hard-coded here rather than read from GTFS_DATA_DICT.
SPEED_SHAPE_FILE = "rollup_singleday/speeds_shape_stop_segments"

# Columns requested from each file to identify one segment row.
route_dir_cols = ["route_id", "direction_id", "stop_pair"]
shape_cols = ["shape_array_key", "stop_pair"]

# including caltrans_district will separate some dates for an operator
# since the caltrans_district value changed at some point
operator_cols = ["name"] 
# Extra metric column carried along and summed per operator-date.
other_metrics = ["n_trips"]

In [3]:
def concatenated_df(
    file_name: str,
    analysis_date_list: list,
    columns: list,
    time_period: str = "all_day",
) -> pd.DataFrame:
    """
    Import one file per analysis date and return the result of
    dask_utils.get_ddf (requested with get_pandas=True and add_date=True).

    file_name: file name appended to SEGMENT_GCS.
    analysis_date_list: dates to import.
    columns: subset of columns to read.
    time_period: only rows with this time_period value are read.
    """
    # The same path stem is repeated once per date; the date is not part
    # of the string built here.
    # NOTE(review): presumably dask_utils.get_ddf pairs each path with its
    # date from analysis_date_list -- confirm against shared_utils.
    path_stem = f"{SEGMENT_GCS}{file_name}"
    paths = [path_stem for _ in analysis_date_list]

    time_period_filter = [[("time_period", "==", time_period)]]

    return dask_utils.get_ddf(
        paths,
        analysis_date_list,
        data_type = "df",
        get_pandas = True,
        add_date = True,
        columns = columns,
        filters = time_period_filter
    )

In [4]:
def operator_groupings_by_date(
    df: pd.DataFrame,
    group_cols: list
) -> pd.DataFrame:
    """
    Summarize rows for each combination of group_cols
    (here: operator and date).

    Returns one row per combination with:
    * n_trips: sum of n_trips over the rows in the group.
      Since each row is a "segment", this adds up the trips
      covered by every segment.
    * n_groups: number of rows with a non-null stop_pair.

    Rows with missing values in group_cols are kept as their own group.
    """
    grouped = df.groupby(group_cols, group_keys=False, dropna=False)

    summary = grouped.agg(
        n_trips = ("n_trips", "sum"),
        n_groups = ("stop_pair", "count"),
    )

    return summary.reset_index()

In [5]:
# Load each file across all dates, then collapse to one row
# per operator per service_date.
route_dir_df = operator_groupings_by_date(
    concatenated_df(
        SPEED_FILE,
        analysis_date_list,
        columns = [*operator_cols, *route_dir_cols, *other_metrics],
    ),
    [*operator_cols, "service_date"],
)

shape_df = operator_groupings_by_date(
    concatenated_df(
        SPEED_SHAPE_FILE,
        analysis_date_list,
        columns = [*operator_cols, *shape_cols, *other_metrics],
    ),
    [*operator_cols, "service_date"],
)

In [6]:
# Keep only operator-dates present in both the route-direction
# and the shape summaries, with each side's metrics prefixed.
df = (
    route_dir_df
    .rename(columns = {
        "n_trips": "route_dir_n_trips",
        "n_groups": "route_dir_groups"
    })
    .merge(
        shape_df.rename(columns = {
            "n_trips": "shape_n_trips",
            "n_groups": "shape_groups"
        }),
        on = [*operator_cols, "service_date"],
        how = "inner",
    )
)

In [7]:
# Calculate differences
# Differences are route-direction minus shape, per operator-date
df = df.assign(
    n_groups_diff = lambda t: t["route_dir_groups"] - t["shape_groups"],
    n_trips_diff = lambda t: t["route_dir_n_trips"] - t["shape_n_trips"],
)

In [8]:
def transform_for_nanoplot(
    df: pd.DataFrame,
    group_cols: list = ["name"]
):
    """
    Collapse a long (one row per group per service_date) table
    into one row per group, where every column other than
    group_cols and service_date holds the list of its values
    ordered by service_date.

    service_date itself is dropped from the result.
    """
    key_cols = group_cols + ["service_date"]
    value_cols = [col for col in df.columns if col not in key_cols]

    # Sort first so each list comes out in date order within the group
    ordered = df.sort_values(key_cols)
    flattened = ordered.groupby(group_cols).agg(dict.fromkeys(value_cols, list))

    return flattened.reset_index()

In [9]:
# One row per operator, metrics as date-ordered lists
df_flat = df.pipe(transform_for_nanoplot)

In [10]:
# Convert the flattened pandas frame to polars; this is the frame
# handed to make_plot / GT below.
df_pl = pl.from_pandas(df_flat)

In [11]:
# Shared nanoplot styling: dodgerblue bars for positive values,
# coral bars for negative values.
nano_options = nanoplot_options(
    # positive bars
    data_bar_fill_color="dodgerblue",
    data_bar_stroke_color="dodgerblue",
    data_bar_stroke_width=2,
    # negative bars
    data_bar_negative_fill_color="coral",
    data_bar_negative_stroke_color="coral",
    data_bar_negative_stroke_width=1,
    # reference line / area and guides
    reference_line_color="darkgray",
    reference_area_fill_color="green",
    vertical_guide_stroke_color="dodgerblue",
)

In [12]:
def make_plot(df):
    """
    Build a great_tables table from df, with one row per operator.

    The two *_diff columns are drawn as bar nanoplots against a
    reference line at 0 (styled with nano_options); the underlying
    route_dir_* and shape_* columns are hidden.
    """
    diff_cols = ["n_groups_diff", "n_trips_diff"]
    hidden_cols = [
        "route_dir_n_trips", "route_dir_groups",
        "shape_n_trips", "shape_groups"
    ]

    table = GT(df)

    # Both diff columns get the same bar nanoplot treatment
    for diff_col in diff_cols:
        table = table.fmt_nanoplot(
            columns = diff_col,
            reference_line = 0,
            plot_type = "bar",
            options = nano_options
        )

    table = table.cols_label(
        name = "Operator",
        n_groups_diff = "# Groups (Route-Dir - Shape)",
        n_trips_diff = "# Trips (Route-Dir - Shape)",
    )
    table = table.cols_hide(hidden_cols)
    table = table.tab_header(
        title = "Aggregation Groups by Date",
        subtitle = "route-direction-stop_pair vs shape-stop_pair"
    )

    return table

In [13]:
# Render the table; each nanoplot bar is one service_date for that operator.
make_plot(df_pl)

Aggregation Groups by Date,Aggregation Groups by Date,Aggregation Groups by Date
route-direction-stop_pair vs shape-stop_pair,route-direction-stop_pair vs shape-stop_pair,route-direction-stop_pair vs shape-stop_pair
Operator,# Groups (Route-Dir - Shape),# Trips (Route-Dir - Shape)
Antelope Valley Transit Authority Schedule,054-77-32-23-20-33-340-43-65-77-32-20-44-73-43-51-5054-43-56-14,02.09K-95110110110101801-30-952.09K-45311
Avalon Schedule,05-500000000000,07000000000007
B-Line Schedule,00-205-204-191-203-177-173-175-205-190,0260000002600
Banning Pass Schedule,00-16-16-13-7-12,010-100-10100
Bay Area 511 AC Transit Schedule,00-2.82K-1.68K-1.61K-1.82K-1.01K-1.01K-1.75K-1.91K-1.96K-1.92K-1.89K-1.95K-1.98K-2.76K-2.82K-2.00K-1.20K-687-1.30K-1.90K-1.72K,033.7K-3330000000000001750-333-11533.7K-119-1020
Bay Area 511 ACE Schedule,01-10000-10-1010,08-8000080-1-800
Bay Area 511 Caltrain Schedule,01-146-66-58-76-38-48-1461,0100000001
Bay Area 511 Capitol Corridor Schedule,01-37-35-34-32-35-35-36-35-370-11,07-60000702-60-13
Bay Area 511 County Connection Schedule,00-244-179-161-179-85-88-186-192-193-155-196-186-113-183-189-204-87-104-244-125,02.57K-136000000000000220252.57K-136-140
Bay Area 511 Dumbarton Express Schedule,00-30-30-30-30-29-29-27-27-23-27-25-27-27-27-27-27-26-22-24-24,0128-110000000000000-11-8128270


### Quick sanity check: which operator-dates does the inner merge drop?

Only 1 operator (Bay Area 511 ACE Schedule) shows up in both the `left_only` and `right_only` lists below, and the dates involved had very sparse observations (1-2 trips).

In [14]:
# Same join as the inner merge, but keep unmatched operator-dates
# from either side and flag where each row came from (_merge).
merged = (
    route_dir_df
    .rename(columns = {
        "n_trips": "route_dir_n_trips",
        "n_groups": "route_dir_groups"
    })
    .merge(
        shape_df.rename(columns = {
            "n_trips": "shape_n_trips",
            "n_groups": "shape_groups"
        }),
        on = [*operator_cols, "service_date"],
        how = "outer",
        indicator = True
    )
)

In [15]:
# How many operator-dates matched on both sides vs. only one side
merged["_merge"].value_counts()

both          1475
right_only     118
left_only        4
Name: _merge, dtype: int64

In [16]:
# Operators with at least one date found only in the route-direction file
merged.loc[merged["_merge"] == "left_only", operator_cols].drop_duplicates()

Unnamed: 0,name
38,B-Line Schedule
73,Bay Area 511 ACE Schedule
909,Mendocino Schedule
1102,Redwood Coast Schedule


In [17]:
# Operators with at least one date found only in the shape file
merged.loc[merged["_merge"] == "right_only", operator_cols].drop_duplicates()

Unnamed: 0,name
1479,Anaheim Resort Schedule
1498,Anaheim Resort Schedule v2
1500,Arcadia Schedule
1510,Bay Area 511 ACE Schedule
1511,BruinBus Schedule
1515,Irvine CONNECT Schedule
1519,LAX Flyaway Bus Schedule
1521,LAX Shuttles Schedule
1523,MV Shuttle Schedule
1525,Montebello Schedule


In [18]:
# Route-direction-only dates for the operator that appears in both lists
merged.loc[
    (merged["_merge"] == "left_only")
    & (merged["name"] == "Bay Area 511 ACE Schedule")
]

Unnamed: 0,name,service_date,route_dir_n_trips,route_dir_groups,shape_n_trips,shape_groups,_merge
73,Bay Area 511 ACE Schedule,2024-09-18,2.0,2.0,,,left_only


In [19]:
# Shape-only dates for the operator that appears in both lists
merged.loc[
    (merged["_merge"] == "right_only")
    & (merged["name"] == "Bay Area 511 ACE Schedule")
]

Unnamed: 0,name,service_date,route_dir_n_trips,route_dir_groups,shape_n_trips,shape_groups,_merge
1510,Bay Area 511 ACE Schedule,2024-07-17,,,1.0,1.0,right_only
