# Monthly Trends 

Now that we're beginning to add monthly aggregations to schedule data, let's see how it all plays together. Use this notebook to work out the kinks of combining all our current data products.

* Start with a single day across multiple months.
* `scheduled_service_hours`
    * service hours, scheduled trips in peak vs offpeak
    * average stop spacing
* `speeds` - single-day aggregation to route-direction. Don't use the weekly average; we only produced it for 2 months.
    * segment speeds
    * route-direction avg speed with common shape?
* `rt_vs_schedule_metrics`

In [None]:
import altair as alt
import geopandas as gpd
import pandas as pd

from shared_utils import rt_dates
from segment_speed_utils import helpers
from segment_speed_utils.project_vars import (SEGMENT_GCS, SCHED_GCS, 
                                              RT_SCHED_GCS)

## Datasets
* Start with single day for multiple months
* scheduled service hours, avg stop spacing, parallel / intersecting
* vehicle positions speeds
* rt vs sched metrics

In [None]:
# Month abbreviations for March through December 2023; each one is
# combined with "2023" to form a key into `rt_dates.DATES`.
months = [
    "mar", "apr", "may", "jun", "jul",
    "aug", "sep", "oct", "nov", "dec",
]

# One entry per 2023 month above, followed by the January 2024 entry,
# so the January 2024 date is always the last element.
analysis_date_list = [rt_dates.DATES[f"{month}2023"] for month in months]
analysis_date_list.append(rt_dates.DATES["jan2024"])

## Schedule

In [None]:
def concatenate_schedule_by_route_direction(
    date_list: list
) -> pd.DataFrame:
    """
    Stack the schedule route-direction metrics for several dates.

    For every date in `date_list`, read
    `schedule_route_direction_metrics_{date}.parquet` under RT_SCHED_GCS
    (only the columns listed below), add a `service_date` column holding
    that date as a timestamp, and cast `direction_id` to nullable Int64.

    Returns the per-date dataframes concatenated row-wise with a fresh
    integer index.
    """
    keep_cols = [
        "schedule_gtfs_dataset_key",
        "route_id", "direction_id",
        "time_period",
        "avg_sched_service_min",
        "avg_stop_meters",
        "n_trips", "frequency",
    ]

    daily_frames = []

    for one_date in date_list:
        path = (
            f"{RT_SCHED_GCS}schedule_route_direction_metrics_{one_date}.parquet"
        )
        daily = pd.read_parquet(path, columns = keep_cols)
        daily = daily.assign(service_date = pd.to_datetime(one_date))
        # Nullable integer dtype keeps missing direction_id values as <NA>.
        daily_frames.append(daily.astype({"direction_id": "Int64"}))

    return pd.concat(daily_frames, axis=0, ignore_index=True)

In [None]:
# Schedule route-direction metrics for every date in analysis_date_list.
df_schedule = concatenate_schedule_by_route_direction(analysis_date_list)

In [None]:
# Preview the first three rows of the stacked schedule metrics.
df_schedule.head(3)

## Speeds
### Segment Speeds

In [None]:
def _read_segment_speeds(one_date):
    """
    Read one date's single-day segment speeds rollup with geopandas,
    add `service_date`, and cast `direction_id` to nullable Int64.
    """
    segment_path = (
        f"{SEGMENT_GCS}rollup_singleday/"
        f"speeds_route_dir_segments_{one_date}.parquet"
    )
    gdf = gpd.read_parquet(segment_path)
    gdf = gdf.assign(service_date = pd.to_datetime(one_date))

    return gdf.astype({"direction_id": "Int64"})


# Stack the segment speeds for every analysis date into one table.
df_speeds_segment = pd.concat(
    [_read_segment_speeds(d) for d in analysis_date_list],
    axis=0, ignore_index=True
)

In [None]:
# Preview the first three rows of the stacked segment speeds.
df_speeds_segment.head(3)

### Route Average Speeds

In [None]:
def _read_route_dir_speeds(one_date):
    """
    Read one date's single-day route-direction speeds with pandas,
    drop the geometry and elapsed meters / seconds columns,
    add `service_date`, and cast `direction_id` to nullable Int64.
    """
    speeds = pd.read_parquet(
        f"{SEGMENT_GCS}rollup_singleday/speeds_route_dir_{one_date}.parquet"
    )
    speeds = speeds.drop(
        columns = ["geometry", "meters_elapsed", "sec_elapsed"]
    )
    speeds = speeds.assign(service_date = pd.to_datetime(one_date))

    return speeds.astype({"direction_id": "Int64"})


# Stack the route-direction average speeds for every analysis date.
df_speeds_avg = pd.concat(
    [_read_route_dir_speeds(d) for d in analysis_date_list],
    axis=0, ignore_index=True
)

In [None]:
# Preview the first three rows of the stacked route-direction speeds.
df_speeds_avg.head(3)

In [None]:
# Outer-join schedule metrics to route-direction average speeds so that
# rows present on only one side are kept; the indicator column records
# which side(s) each row came from.
df_sched_speeds = df_schedule.merge(
    df_speeds_avg,
    how = "outer",
    on = [
        "schedule_gtfs_dataset_key",
        "service_date", "time_period",
        "route_id", "direction_id",
    ],
    indicator = "sched_speeds_avg_merge",
)

# Share of rows that are both / left_only / right_only.
df_sched_speeds["sched_speeds_avg_merge"].value_counts(normalize=True)

In [None]:
# Outer-join segment speeds to route-direction average speeds on the
# operator identifiers plus date / route / direction / time period.
# The indicator column records which side(s) each row came from.
df_speeds_combined = pd.merge(
    left = df_speeds_segment,
    right = df_speeds_avg,
    how = "outer",
    on = [
        "organization_source_record_id", "organization_name",
        "base64_url", "caltrans_district",
        "schedule_gtfs_dataset_key", "name",
        "service_date",
        "route_id", "direction_id", "time_period",
    ],
    indicator = "speeds_avg_seg",
)

# Row counts for both / left_only / right_only.
df_speeds_combined["speeds_avg_seg"].value_counts()

In [None]:
# Display the combined segment + route-direction speeds table.
df_speeds_combined

## RT vs Schedule

In [None]:
def _read_trips_to_route(one_date):
    """
    Import one date's scheduled trips (trip-to-route identifier columns
    only), add `service_date`, and cast `direction_id` to nullable Int64.
    """
    trips = helpers.import_scheduled_trips(
        one_date,
        columns = ["gtfs_dataset_key",
                   "trip_instance_key", "route_id", "direction_id"],
        get_pandas = True
    )
    trips = trips.assign(service_date = pd.to_datetime(one_date))

    return trips.astype({"direction_id": "Int64"})


# Trip-to-route lookup across every analysis date.
trips_to_route = pd.concat(
    [_read_trips_to_route(d) for d in analysis_date_list],
    axis=0, ignore_index=True
)

In [None]:
def _read_trip_metrics(one_date):
    """Read one date's trip-level metrics and add `service_date`."""
    trip_metrics = pd.read_parquet(
        f"{RT_SCHED_GCS}trip_level_metrics/{one_date}_metrics.parquet"
    )

    return trip_metrics.assign(service_date = pd.to_datetime(one_date))


# The last entry of analysis_date_list is skipped here.
rt_sched_metrics = pd.concat(
    [_read_trip_metrics(d) for d in analysis_date_list[:-1]],
    axis=0, ignore_index=True
)

`rt_vs_sched` trip-level metrics still need to be produced for January 2024, so that date is left out of the concatenation above.

In [None]:
# Attach route_id / direction_id to each trip-level metrics row.
# A left merge keeps every metrics row; `_merge` flags the rows with
# no matching scheduled trip as "left_only".
rt_sched_metrics2 = rt_sched_metrics.merge(
    trips_to_route,
    how = "left",
    on = ["trip_instance_key", "service_date"],
    indicator = True,
)

# An outer merge here would have a lot of right-only rows, which is
# expected for all the scheduled trips that do not have RT.

In [None]:
# Count rows by merge result (both / left_only / right_only).
rt_sched_metrics2._merge.value_counts()

In [None]:
# Interesting -- left_only means the trip is found in RT but not in
# schedule. Count those unmatched rows per service date.
rt_sched_metrics2.loc[
    rt_sched_metrics2["_merge"] == "left_only",
    "service_date"
].value_counts()

In [None]:
# Inspect column dtypes of the merged trip-level table.
rt_sched_metrics2.dtypes

In [None]:
# Roll trip-level metrics up to operator / route / direction / date by
# taking the mean of each metric, then round to 2 decimals.
# NOTE(review): groupby drops rows whose keys are missing by default,
# so trips without a direction_id would be excluded -- confirm intended.
rt_sched_metrics_route = (
    rt_sched_metrics2
    .groupby(
        ["schedule_gtfs_dataset_key",
         "route_id", "direction_id",
         "service_date"],
        observed=True, group_keys=False
    )
    .agg({
        metric: "mean"
        for metric in (
            "rt_service_min",
            "pings_per_min",
            "spatial_accuracy_pct",
            "rt_triptime_w_gtfs_pct",
        )
    })
    .reset_index()
    .round(2)
)

In [None]:
# Preview the route-direction level RT vs schedule metrics.
rt_sched_metrics_route.head()

In [None]:
# Check coverage: outer-join the schedule + speeds table to the
# route-level RT metrics and count rows by which side they came from.
df_sched_speeds.merge(
    rt_sched_metrics_route,
    how = "outer",
    on = ["schedule_gtfs_dataset_key",
          "route_id", "direction_id", "service_date"],
    indicator = "sched_speeds_to_rt",
)["sched_speeds_to_rt"].value_counts()

In [None]:
# Keep the outer join of schedule + speeds with route-level RT metrics;
# `_merge` records which side(s) each row came from.
df_sched_speeds_metrics = df_sched_speeds.merge(
    rt_sched_metrics_route,
    how = "outer",
    on = [
        "schedule_gtfs_dataset_key",
        "route_id", "direction_id", "service_date",
    ],
    indicator = True,
)

In [None]:
# Count rows by merge result (both / left_only / right_only).
df_sched_speeds_metrics._merge.value_counts()

In [None]:
def merge_in_standardized_route_names(df):
    """
    Attach standardized route ids / names to `df` and tidy it for charts.

    Reads `standardized_route_ids.parquet` from SCHED_GCS and left-merges
    it onto `df` by schedule_gtfs_dataset_key, route_id and service_date.
    Then:
      * splits `recent_combined_name` on "__" into `route_short_name`
        (first piece) and `route_long_name` (second piece)
      * casts `direction_id` to int
      * rounds `avg_sched_service_min` and `avg_stop_meters` to 1 decimal
      * drops the original `route_id` and `recent_combined_name`, and
        renames `recent_route_id2` to `route_id`

    NOTE(review): `astype("int")` raises if `direction_id` still holds
    missing values -- confirm these are filled or filtered upstream.
    NOTE(review): `name` is read from the parquet but is not a merge key;
    if `df` already has a `name` column, pandas will suffix the pair as
    `name_x` / `name_y` -- confirm that is intended.
    """
    crosswalk_cols = [
        "schedule_gtfs_dataset_key", "name",
        "route_id", "service_date",
        "recent_route_id2", "recent_combined_name",
    ]
    route_crosswalk = pd.read_parquet(
        f"{SCHED_GCS}standardized_route_ids.parquet",
        columns = crosswalk_cols
    )

    merged = pd.merge(
        df,
        route_crosswalk,
        on = ["schedule_gtfs_dataset_key", "route_id", "service_date"],
        how = "left",
    )

    # Split the combined name once; column 0 is the short name and
    # column 1 is the long name.
    name_parts = merged.recent_combined_name.str.split("__", expand=True)

    # Clean up, round columns, get it as close to ready for charts
    cleaned = merged.assign(
        route_short_name = name_parts[0],
        route_long_name = name_parts[1],
        direction_id = merged.direction_id.astype("int"),
        avg_sched_service_min = merged.avg_sched_service_min.round(1),
        avg_stop_meters = merged.avg_stop_meters.round(1),
    )

    # Swap the original route_id for the standardized one.
    cleaned = cleaned.drop(columns = ["route_id", "recent_combined_name"])

    return cleaned.rename(columns = {"recent_route_id2": "route_id"})

In [None]:
# List the columns available before standardized route names are merged in.
df_sched_speeds_metrics.columns

In [None]:
# Apply the route-name standardization and preview the first ten rows.
merge_in_standardized_route_names(df_sched_speeds_metrics).head(10)