# Monthly Trends 

Now that we're beginning to add monthly aggregations to schedule data, let's see how it all plays together. Use this notebook to work out the kinks of combining all our current data products.

* Start with a single day across multiple months.
* `scheduled_service_hours`
    * service hours, scheduled trips in peak vs offpeak
    * average stop spacing
* `speeds` - single-day aggregation to route-direction. Don't use the weekly average; we only produced it for 2 months.
    * segment speeds
    * route-direction avg speed with common shape?
* `rt_vs_schedule_metrics`

In [1]:
import altair as alt
import geopandas as gpd
import pandas as pd

from segment_speed_utils import helpers
from segment_speed_utils.project_vars import RT_SCHED_GCS, SCHED_GCS
from shared_utils.rt_dates import y2023_dates, y2024_dates

import merge_data

# Every analysis date used below: the 2024 dates come first, followed by the 2023 dates.
analysis_date_list = y2024_dates + y2023_dates

## Schedule + Average Speeds

In [2]:
# Load the schedule_vp_metrics table from the digest folder of RT_SCHED_GCS.
schedule_vp_metrics_path = f"{RT_SCHED_GCS}digest/schedule_vp_metrics.parquet"
df = pd.read_parquet(schedule_vp_metrics_path)

In [11]:
# Store direction_id as float so it compares cleanly against 0 / 1 in the chart helpers.
df = df.assign(direction_id = df.direction_id.astype("float"))

In [23]:
from calitp_data_analysis import calitp_color_palette as cp

def base_line_chart(df: pd.DataFrame, y_col: str) -> alt.Chart:
    """
    Draw an interactive line chart of `y_col` over `service_date`,
    with one colored line per `time_period`.

    Args:
        df: rows to plot. The encodings reference the columns
            `service_date`, `time_period`, `route_id`, `direction_id`
            and `y_col`.
        y_col: name of the quantitative column shown on the y-axis.

    Returns:
        A 350x250 interactive `alt.Chart`. An empty `df` yields an
        empty chart rather than raising.
    """
    df = df.reset_index(drop=True)

    # Skip the first two palette entries; the rest color the time_period lines.
    selected_colors = cp.CALITP_CATEGORY_BRIGHT_COLORS[2:]

    chart = (alt.Chart(df)
         .mark_line()
         .encode(
             x = alt.X("service_date:T", title = "Date",),
             y = alt.Y(f"{y_col}:Q"),
             color = alt.Color(
                 "time_period:N", title = "",
                 scale = alt.Scale(range = selected_colors)
             ),
             tooltip = ["route_id", "direction_id",
                        "time_period", y_col]
         ).properties(
             width=350, height=250,
        ).interactive()
    )

    return chart

In [32]:
# Operators that have at least one row with both schedule and vehicle position data.
operators_with_vp = df.loc[
    df.sched_rt_category == "schedule_and_vp",
    "schedule_gtfs_dataset_key"
].unique()

# Arbitrary pick (position 5) to use as the example operator.
one_operator = operators_with_vp[5]

# Keep every row for that operator (all sched_rt_category values),
# then count rows per service_date.
df2 = df.loc[df.schedule_gtfs_dataset_key == one_operator]
df2.service_date.value_counts()

2024-01-17    42
2023-12-13    42
2023-11-15    42
2023-10-11    42
2023-09-13    42
2023-07-12    42
2023-06-14    42
2023-05-17    42
2023-04-12    42
2023-03-15    42
2023-08-15    39
Name: service_date, dtype: int64

In [33]:
def dual_chart(
    df: pd.DataFrame,
    control_field: str,
    y_col: str
) -> alt.VConcatChart:
    """
    Stack one line chart per direction_id (0, then 1), both filtered
    by a single dropdown bound to `control_field`.

    https://stackoverflow.com/questions/58919888/multiple-selections-in-altair

    Args:
        df: rows to plot; must contain `direction_id`, `control_field`
            and the columns used by `base_line_chart`.
        control_field: column whose unique values populate the dropdown
            and on which both charts are filtered.
        y_col: quantitative column passed through to `base_line_chart`.

    Returns:
        Vertical concatenation of the charts for the directions that
        have rows. A direction with no rows is skipped.
    """
    input_dropdown = alt.binding_select(
        options=df[control_field].unique().tolist(),
        name='Routes '
    )

    # Point selection driven by the dropdown; controls every chart below.
    category_selector = alt.selection_point(
        fields=[control_field],
        bind=input_dropdown
    )

    charts = []

    for direction in (0, 1):
        direction_df = df[df.direction_id == direction]

        if len(direction_df) == 0:
            continue

        chart = base_line_chart(direction_df, y_col = y_col)

        # Register the selector exactly once, on the first chart that is
        # actually built, so the filter below always refers to a defined
        # parameter even when direction 0 has no rows.
        if not charts:
            chart = chart.add_params(category_selector)

        charts.append(chart.transform_filter(category_selector))

    chart = alt.vconcat(*charts)

    return chart

In [37]:
# Inspect the rows for a single route ("15") of the selected operator.
df2[df2.route_id=="15"]

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,avg_sched_service_min,avg_stop_meters,n_trips,frequency,service_date,speed_mph,sched_rt_category
1180,4b59b468244e0d5139d91fc698acc9d6,15,0.0,all_day,65.0,16375.48,3.0,0.12,2024-01-17,71.692005,schedule_and_vp
1181,4b59b468244e0d5139d91fc698acc9d6,15,0.0,offpeak,65.0,16375.48,1.0,0.06,2024-01-17,71.692005,schedule_and_vp
1182,4b59b468244e0d5139d91fc698acc9d6,15,0.0,peak,65.0,16375.48,2.0,0.25,2024-01-17,,schedule_only
10238,4b59b468244e0d5139d91fc698acc9d6,15,0.0,all_day,65.0,16375.48,3.0,0.12,2023-12-13,21.612993,schedule_and_vp
10239,4b59b468244e0d5139d91fc698acc9d6,15,0.0,offpeak,65.0,16375.48,1.0,0.06,2023-12-13,78.064774,schedule_and_vp
10240,4b59b468244e0d5139d91fc698acc9d6,15,0.0,peak,65.0,16375.48,2.0,0.25,2023-12-13,18.790404,schedule_and_vp
30161,4b59b468244e0d5139d91fc698acc9d6,15,0.0,all_day,65.0,16375.48,3.0,0.12,2023-11-15,5.859741,schedule_and_vp
30162,4b59b468244e0d5139d91fc698acc9d6,15,0.0,offpeak,65.0,16375.48,1.0,0.06,2023-11-15,5.204318,schedule_and_vp
30163,4b59b468244e0d5139d91fc698acc9d6,15,0.0,peak,65.0,16375.48,2.0,0.25,2023-11-15,6.25352,schedule_and_vp
40556,4b59b468244e0d5139d91fc698acc9d6,15,0.0,all_day,65.0,16375.48,3.0,0.12,2023-10-11,8.075666,schedule_and_vp


In [34]:
# Speed over time for the selected operator, one chart per direction, filtered by route dropdown.
dual_chart(df2, control_field = "route_id", y_col = "speed_mph")



### Segment Speeds

In [None]:
# Segment speeds for every analysis date, via the local merge_data module.
segment_speeds = merge_data.concatenate_segment_speeds_by_route_direction(
    analysis_date_list)

## RT vs Schedule

In [None]:
# Trip -> route/direction lookup across all analysis dates.
# `helpers` comes from segment_speed_utils (imported in the first code cell).
trip_cols = [
    "gtfs_dataset_key",
    "trip_instance_key", "route_id", "direction_id",
]

trips_by_date = [
    helpers.import_scheduled_trips(
        d,
        columns = trip_cols,
        get_pandas = True
    ).assign(
        # Tag each date's trips so they can be merged on service_date later.
        service_date = pd.to_datetime(d)
    ).astype(
        # Nullable integer so a missing direction_id stays missing.
        {"direction_id": "Int64"}
    )
    for d in analysis_date_list
]

trips_to_route = pd.concat(trips_by_date, axis=0, ignore_index=True)

In [None]:
# Stack the per-date trip-level metrics files, tagging each with its service_date.
# NOTE(review): `[:-1]` skips the LAST entry of analysis_date_list. That list is
# built as y2024_dates + y2023_dates, so the skipped entry is presumably a 2023
# date rather than a 2024 one -- confirm this still excludes the date whose
# metrics file has not been produced yet.
rt_sched_metrics = pd.concat([
    pd.read_parquet(
        f"{RT_SCHED_GCS}trip_level_metrics/{d}_metrics.parquet"
    ).assign(service_date = pd.to_datetime(d))
    for d in analysis_date_list[:-1]
    ], axis=0, ignore_index=True
)

Trip-level `rt_vs_sched` metrics are still needed for January 2024.

In [None]:
# Attach route_id / direction_id to each trip-level metrics row.
# A left merge is used on purpose: an outer merge produces a lot of
# right_only rows, which is expected for all the scheduled trips
# that do not have RT.
trip_merge_keys = ["trip_instance_key", "service_date"]

rt_sched_metrics2 = rt_sched_metrics.merge(
    trips_to_route,
    on = trip_merge_keys,
    how = "left",
    indicator = True
)

In [None]:
# How many metrics rows found a matching scheduled trip ("both") vs. not ("left_only").
rt_sched_metrics2._merge.value_counts()

In [None]:
# Interesting -- left_only means the trip is found in RT but not in schedule.
# Count those unmatched rows per service_date.
is_left_only = rt_sched_metrics2._merge == "left_only"
rt_sched_metrics2.loc[is_left_only, "service_date"].value_counts()

In [None]:
# Check column dtypes before aggregating.
rt_sched_metrics2.dtypes

In [None]:
# Average the trip-level metrics up to operator-route-direction-date.
route_dir_date_cols = [
    "schedule_gtfs_dataset_key",
    "route_id", "direction_id",
    "service_date",
]

trip_metric_cols = [
    "rt_service_min",
    "pings_per_min",
    "spatial_accuracy_pct",
    "rt_triptime_w_gtfs_pct",
]

rt_sched_metrics_route = (
    rt_sched_metrics2
    .groupby(route_dir_date_cols, observed=True, group_keys=False)
    .agg({metric: "mean" for metric in trip_metric_cols})
    .reset_index()
    .round(2)
)

In [None]:
# Preview the route-direction level metrics.
rt_sched_metrics_route.head()

In [None]:
# Count how many route-direction-date rows match between the two tables.
# NOTE(review): `df_sched_speeds` is not defined anywhere in the visible notebook;
# on a fresh kernel this raises NameError -- confirm which frame is meant.
# NOTE(review): the same outer merge is repeated in the next cell.
pd.merge(
    df_sched_speeds,
    rt_sched_metrics_route,
    on = ["schedule_gtfs_dataset_key",
          "route_id", "direction_id", "service_date"],
    how = "outer",
    indicator = "sched_speeds_to_rt"
).sched_speeds_to_rt.value_counts()

In [None]:
# Outer merge keeps route-direction-date rows present in either table;
# the `_merge` indicator column records which side each row came from.
# NOTE(review): `df_sched_speeds` is not defined anywhere in the visible notebook;
# on a fresh kernel this raises NameError -- confirm which frame is meant.
df_sched_speeds_metrics = pd.merge(
    df_sched_speeds,
    rt_sched_metrics_route,
    on = ["schedule_gtfs_dataset_key", 
          "route_id", "direction_id", "service_date"],
    how = "outer",
    indicator = True
)

In [None]:
# Row counts by merge result: both / left_only / right_only.
df_sched_speeds_metrics._merge.value_counts()

In [None]:
def merge_in_standardized_route_names(df: pd.DataFrame) -> pd.DataFrame:
    """
    Left-merge the standardized route id / name lookup onto `df` and
    tidy the result for charting.

    Args:
        df: table containing `schedule_gtfs_dataset_key`, `route_id`,
            `service_date`, `direction_id`, `avg_sched_service_min`
            and `avg_stop_meters`.

    Returns:
        `df` plus `name`, `route_short_name` and `route_long_name`.
        The original `route_id` is replaced by `recent_route_id2`,
        `recent_combined_name` is dropped, `direction_id` is cast to
        nullable `Int64`, and the two schedule averages are rounded
        to 1 decimal.
    """
    standardized_route_names = pd.read_parquet(
        f"{SCHED_GCS}standardized_route_ids.parquet",
        columns = ["schedule_gtfs_dataset_key", "name", 
                   "route_id", "service_date",
                   "recent_route_id2", "recent_combined_name"
                  ]
    )
    
    df = pd.merge(
        df,
        standardized_route_names,
        on = ["schedule_gtfs_dataset_key", "route_id", "service_date"],
        how = "left",
    )
    
    # Split "<short>__<long>" once. Indexing with .str[i] yields a missing
    # value (instead of raising) when a name has no "__" part.
    name_parts = df.recent_combined_name.str.split("__")
    
    # Clean up, round columns, get it as close to ready for charts
    df = df.assign(
        route_short_name = name_parts.str[0],
        route_long_name = name_parts.str[1],
        # Nullable integer: a missing direction_id stays missing
        # rather than making the cast fail.
        direction_id = df.direction_id.astype("Int64"),
        avg_sched_service_min = df.avg_sched_service_min.round(1),
        avg_stop_meters = df.avg_stop_meters.round(1),
    ).drop(
        columns = ["route_id", "recent_combined_name"]
    ).rename(
        columns = {"recent_route_id2": "route_id"}
    )
    
    return df

In [None]:
# List the columns available after the merge.
df_sched_speeds_metrics.columns

In [None]:
# Preview the chart-ready table with standardized route names attached.
merge_in_standardized_route_names(df_sched_speeds_metrics).head(10)