# Monthly Trends 

Now that we're beginning to add monthly aggregations to schedule data, let's see how it all plays together. Use this notebook to work out the kinks of combining all our current data products.

* Start with a single day across multiple months.
* `scheduled_service_hours`
    * service hours, scheduled trips in peak vs offpeak
    * average stop spacing
* `speeds` - single-day aggregation to route-direction. Don't use the weekly average; we only produced it for 2 months.
    * segment speeds
    * route-direction avg speed with common shape?
* `rt_vs_schedule_metrics`

In [1]:
import altair as alt
import geopandas as gpd
import pandas as pd

from shared_utils.rt_dates import y2023_dates, y2024_dates
from segment_speed_utils import helpers
from segment_speed_utils.project_vars import RT_SCHED_GCS, SCHED_GCS
import merge_data

# Dates to analyze: the 2024 dates followed by the 2023 dates.
analysis_date_list = y2024_dates + y2023_dates 

## Schedule + Average Speeds

In [2]:
# Table keyed by operator, route, direction, time_period and service_date
# that combines scheduled service metrics with average speeds.
df = pd.read_parquet(
    f"{RT_SCHED_GCS}digest/schedule_vp_metrics.parquet"
)

# The "RT vs Schedule" merges further down use the name `df_sched_speeds`,
# which no cell defines. Bind it here so Restart & Run All works.
# NOTE(review): assumes `df_sched_speeds` was this same table -- confirm.
df_sched_speeds = df

In [3]:
# Preview the loaded table (pandas truncates to the first and last rows).
df

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,time_period,avg_sched_service_min,avg_stop_meters,n_trips,frequency,service_date,speed_mph,sched_rt_category
0,749380f1a9f225d9123762d83ea2f50d,CCA/Adobe,0,all_day,14.56,179.41,16.0,0.67,2024-01-17,,schedule_only
1,749380f1a9f225d9123762d83ea2f50d,CCA/Adobe,0,peak,14.56,179.41,16.0,2.00,2024-01-17,,schedule_only
2,475bf19661c279a99f8c250eee77dd59,Delta,0,all_day,110.00,4265.21,4.0,0.17,2024-01-17,,schedule_only
3,475bf19661c279a99f8c250eee77dd59,Delta,0,offpeak,110.00,4265.21,2.0,0.12,2024-01-17,,schedule_only
4,475bf19661c279a99f8c250eee77dd59,Delta,0,peak,110.00,4265.21,2.0,0.25,2024-01-17,,schedule_only
...,...,...,...,...,...,...,...,...,...,...,...
113348,444700afe086ed24e3cb888cecd3037c,800,0,offpeak,,,,,2023-03-15,6.705920,vp_only
113349,444700afe086ed24e3cb888cecd3037c,800,0,all_day,,,,,2023-03-15,6.705920,vp_only
113350,444700afe086ed24e3cb888cecd3037c,800,1,offpeak,,,,,2023-03-15,28.200829,vp_only
113351,444700afe086ed24e3cb888cecd3037c,800,1,peak,,,,,2023-03-15,51.744120,vp_only


### Segment Speeds

In [None]:
# Segment speeds for all analysis dates
# (presumably one route-direction table per date, concatenated -- see merge_data).
segment_speeds = merge_data.concatenate_segment_speeds_by_route_direction(
    analysis_date_list
)

## RT vs Schedule

In [None]:
# Scheduled trips for every analysis date, each tagged with its service_date.
# This is the lookup from trip_instance_key to route_id / direction_id.
# direction_id is cast to nullable Int64 so missing values survive the cast.
trips_to_route = pd.concat(
    (
        helpers.import_scheduled_trips(
            d,
            columns=[
                "gtfs_dataset_key",
                "trip_instance_key",
                "route_id",
                "direction_id",
            ],
            get_pandas=True,
        )
        .assign(service_date=pd.to_datetime(d))
        .astype({"direction_id": "Int64"})
        for d in analysis_date_list
    ),
    axis=0,
    ignore_index=True,
)

In [None]:
# Trip-level RT vs schedule metrics: one parquet per date, each tagged with
# its service_date, stacked into a single table.
# NOTE(review): `[:-1]` skips the LAST entry of analysis_date_list, which is
# the final element of y2023_dates. If the date without metrics is
# January 2024, confirm this slice drops the intended date.
rt_sched_metrics = pd.concat(
    (
        pd.read_parquet(
            f"{RT_SCHED_GCS}trip_level_metrics/{d}_metrics.parquet"
        ).assign(service_date=pd.to_datetime(d))
        for d in analysis_date_list[:-1]
    ),
    axis=0,
    ignore_index=True,
)

The `rt_vs_sched` trip-level metrics still need to be produced for January 2024.

In [None]:
# Attach route_id / direction_id to each trip's RT metrics.
# The left merge keeps every trip that has RT metrics; `_merge` records
# whether a scheduled trip was found for it.
#
# An outer merge here produces many right_only rows. That is expected:
# they are the scheduled trips that have no RT data.
rt_sched_metrics2 = rt_sched_metrics.merge(
    trips_to_route,
    on=["trip_instance_key", "service_date"],
    how="left",
    indicator=True,
)

In [None]:
# How many RT trips matched a scheduled trip (both) vs. not (left_only)?
rt_sched_metrics2["_merge"].value_counts()

In [None]:
# Interesting: left_only means the trip is found in RT but not in schedule.
# Count those unmatched trips per service_date.
rt_sched_metrics2.loc[
    rt_sched_metrics2["_merge"] == "left_only", "service_date"
].value_counts()

In [None]:
# Check column dtypes before grouping and merging on these columns.
rt_sched_metrics2.dtypes

In [None]:
# Roll trip-level RT metrics up to operator-route-direction-date using an
# unweighted mean across trips, rounded to 2 decimals.
# groupby drops rows with a missing key by default, so trips that did not
# match a scheduled trip (no route_id) fall out of this aggregation.
rt_sched_metrics_route = (
    rt_sched_metrics2
    .groupby(
        [
            "schedule_gtfs_dataset_key",
            "route_id",
            "direction_id",
            "service_date",
        ],
        observed=True,
        group_keys=False,
    )
    .agg(
        {
            metric: "mean"
            for metric in (
                "rt_service_min",
                "pings_per_min",
                "spatial_accuracy_pct",
                "rt_triptime_w_gtfs_pct",
            )
        }
    )
    .reset_index()
    .round(2)
)

In [None]:
# Spot-check the first rows of the route-level aggregation.
rt_sched_metrics_route.head(n=5)

In [None]:
# Preview the merge: how many route-direction-date rows appear in the
# schedule + speeds table, the RT metrics table, or both?
df_sched_speeds.merge(
    rt_sched_metrics_route,
    on=[
        "schedule_gtfs_dataset_key",
        "route_id",
        "direction_id",
        "service_date",
    ],
    how="outer",
    indicator="sched_speeds_to_rt",
)["sched_speeds_to_rt"].value_counts()

In [None]:
# Combine schedule + speeds with the route-level RT metrics. The outer merge
# keeps rows found on only one side; `_merge` records which side.
df_sched_speeds_metrics = df_sched_speeds.merge(
    rt_sched_metrics_route,
    on=[
        "schedule_gtfs_dataset_key",
        "route_id",
        "direction_id",
        "service_date",
    ],
    how="outer",
    indicator=True,
)

In [None]:
# Row counts by merge outcome (both / left_only / right_only).
df_sched_speeds_metrics["_merge"].value_counts()

In [None]:
def merge_in_standardized_route_names(df):
    """
    Attach standardized route names to `df` and tidy it for charting.

    Reads `standardized_route_ids.parquet` from SCHED_GCS and left-merges it
    onto `df` by schedule_gtfs_dataset_key, route_id and service_date, so
    every input row is kept.

    Args:
        df: DataFrame with the columns schedule_gtfs_dataset_key, route_id,
            service_date, direction_id, avg_sched_service_min and
            avg_stop_meters.

    Returns:
        DataFrame with these changes:
        * `name`, `route_short_name` and `route_long_name` added; the last
          two are the pieces of `recent_combined_name` before and after
          the first "__".
        * `route_id` replaced by the value of `recent_route_id2`.
        * `direction_id` cast to int.
        * `avg_sched_service_min` and `avg_stop_meters` rounded to 1 decimal.

    Raises:
        Whatever `astype("int")` raises if direction_id has missing values.
    """
    standardized_route_names = pd.read_parquet(
        f"{SCHED_GCS}standardized_route_ids.parquet",
        columns = ["schedule_gtfs_dataset_key", "name", 
                   "route_id", "service_date",
                   "recent_route_id2", "recent_combined_name"
                  ]
    )
    
    df = pd.merge(
        df,
        standardized_route_names,
        on = ["schedule_gtfs_dataset_key", "route_id", "service_date"],
        how = "left",
    )
    
    # Split once rather than once per output column. The reindex guarantees
    # both pieces exist: when no name contains "__" (or nothing matched in
    # the merge) the split yields a single column, and selecting [1] would
    # raise KeyError. Missing pieces come back as NaN instead.
    name_parts = (
        df.recent_combined_name
        .str.split("__", expand=True)
        .reindex(columns=[0, 1])
    )
    
    # Clean up, round columns, get it as close to ready for charts
    df = df.assign(
        route_short_name = name_parts[0],
        route_long_name = name_parts[1],
        direction_id = df.direction_id.astype("int"),
        avg_sched_service_min = df.avg_sched_service_min.round(1),
        avg_stop_meters = df.avg_stop_meters.round(1),
    ).drop(
        columns = ["route_id", "recent_combined_name"]
    ).rename(
        columns = {"recent_route_id2": "route_id"}
    )
    
    return df

In [None]:
# List the merged table's columns before passing it to merge_in_standardized_route_names.
df_sched_speeds_metrics.columns

In [None]:
# Preview the chart-ready table: first 10 rows after adding standardized route names.
df_sched_speeds_metrics.pipe(merge_in_standardized_route_names).head(10)