# Monthly Trends 

Now that we're beginning to add monthly aggregations to schedule data, let's see how it all plays together. Use this notebook to work out the kinks of combining all our current data products.

* Start with a single day across multiple months.
* `scheduled_service_hours`
    * service hours, scheduled trips in peak vs offpeak
    * average stop spacing
* `speeds` - single-day aggregation to route-direction. Don't use the weekly average; we only produced it for 2 months.
    * segment speeds
    * route-direction avg speed with common shape?
* `rt_vs_schedule_metrics`

In [1]:
import altair as alt
import geopandas as gpd
import pandas as pd

from shared_utils import calitp_color_palette as cp
from shared_utils import portfolio_utils, rt_dates, styleguide
from segment_speed_utils import helpers
from segment_speed_utils.project_vars import (SEGMENT_GCS, SCHED_GCS, 
                                              RT_SCHED_GCS)

In [None]:
# Operator this notebook reports on; the value is echoed in the
# `%%capture_parameters` cell that follows.
name = "City of Santa Monica"

In [None]:
%%capture_parameters
name

## Datasets
* Start with single day for multiple months
* scheduled service hours, avg stop spacing, parallel / intersecting
* vehicle positions speeds
* rt vs sched metrics

In [2]:
# Month abbreviations (March through December) that are combined with
# "2023" to form the lookup keys into `rt_dates.DATES`.
months = [
    "mar", "apr", "may", "jun", "jul",
    "aug", "sep", "oct", "nov", "dec",
]

# One entry of `rt_dates.DATES` per month, in the same order as `months`.
analysis_date_list = [rt_dates.DATES[month + "2023"] for month in months]

In [None]:
def merge_in_organization_identifiers(scheduled_service: pd.DataFrame):
    """
    Swap the schedule-level identifier on scheduled service rows for
    organization-level identifiers.

    Queries `dim_provider_gtfs_data` for the distinct combinations of
    organization_source_record_id, organization_name and
    schedule_source_record_id (rows missing either record id are
    filtered out), then inner-merges that crosswalk onto
    `scheduled_service` using schedule_source_record_id.

    Returns the merged dataframe without schedule_source_record_id,
    with organization_source_record_id renamed to source_record_id
    and organization_name renamed to name.
    """
    # NOTE(review): `tbls`, `select`, `filter`, `distinct`, `collect` and
    # `_` are not imported in the cells shown here -- presumably siuba
    # plus the warehouse table helper; confirm they are in scope.
    provider_table = tbls.mart_transit_database.dim_provider_gtfs_data()

    # `!= None` is kept as written: these look like lazy column
    # expressions rather than Python values, so `is not None` would not
    # build a filter -- TODO confirm.
    crosswalk = (
        provider_table
        >> select(
            _.organization_source_record_id,
            _.organization_name,
            _.schedule_source_record_id,
        )
        >> filter(
            _.schedule_source_record_id != None,
            _.organization_source_record_id != None,
        )
        >> distinct()
        >> collect()
    )

    merged = pd.merge(
        scheduled_service,
        crosswalk,
        on="schedule_source_record_id",
        how="inner",
    )
    without_schedule_id = merged.drop(columns=["schedule_source_record_id"])

    return without_schedule_id.rename(
        columns={
            "organization_source_record_id": "source_record_id",
            "organization_name": "name",
        }
    )

In [33]:
def concatenate_route_direction_and_cleanup(
    date_list: list
) -> pd.DataFrame:
    """
    Stack the schedule route-direction metrics for several service dates
    and attach standardized route names.

    For each date in `date_list`, read
    `schedule_route_direction_metrics_{date}.parquet` from RT_SCHED_GCS,
    tag the rows with `service_date`, and concatenate them. The result is
    left-merged with `standardized_route_ids.parquet` (SCHED_GCS) on
    schedule_gtfs_dataset_key, route_id and service_date.

    Cleanup applied before returning:
    * `recent_combined_name` is split on "__" into `route_short_name`
      (first piece) and `route_long_name` (second piece).
    * `direction_id` is cast to int; `avg_sched_service_min` and
      `avg_stop_meters` are rounded to 1 decimal.
    * the original `route_id` is dropped and `recent_route_id2` is
      renamed to `route_id`, as is `recent_combined_name` dropped.
    """
    schedule_cols = [
        "schedule_gtfs_dataset_key",
        "route_id", "direction_id",
        "avg_sched_service_min",
        "avg_stop_meters",
        "offpeak_n_trips", "peak_n_trips",
        "offpeak_frequency", "peak_frequency",
    ]

    df_schedule = pd.concat([
        pd.read_parquet(
            f"{RT_SCHED_GCS}schedule_route_direction_metrics_{d}.parquet",
            columns = schedule_cols
        ).assign(service_date = pd.to_datetime(d))
        for d in date_list
    ], axis=0, ignore_index=True)

    standardized_route_names = pd.read_parquet(
        f"{SCHED_GCS}standardized_route_ids.parquet",
        columns = ["schedule_gtfs_dataset_key", "name",
                   "route_id", "service_date",
                   "recent_route_id2", "recent_combined_name"
                  ]
    )

    # Left merge keeps every schedule row, even without a standardized name
    df = pd.merge(
        df_schedule,
        standardized_route_names,
        on = ["schedule_gtfs_dataset_key", "route_id", "service_date"],
        how = "left",
    )

    # Split the combined name once and reuse both pieces. `reindex`
    # guarantees columns 0 and 1 exist, so a batch in which no name
    # contains "__" yields a null route_long_name instead of a KeyError
    # on the missing second column.
    name_parts = (df.recent_combined_name
                  .str.split("__", expand=True)
                  .reindex(columns=[0, 1])
                 )

    # Clean up, round columns, get it as close to ready for charts
    df = df.assign(
        route_short_name = name_parts[0],
        route_long_name = name_parts[1],
        direction_id = df.direction_id.astype("int"),
        avg_sched_service_min = df.avg_sched_service_min.round(1),
        avg_stop_meters = df.avg_stop_meters.round(1),
    ).drop(
        columns = ["route_id", "recent_combined_name"]
    ).rename(
        columns = {"recent_route_id2": "route_id"}
    )

    return df

In [18]:
# Build the cleaned route-direction schedule table for every date in
# analysis_date_list.
df_schedule2 = concatenate_route_direction_and_cleanup(analysis_date_list)

In [46]:
# List the columns available after cleanup.
df_schedule2.columns

Index(['schedule_gtfs_dataset_key', 'direction_id', 'avg_sched_service_min',
       'avg_stop_meters', 'offpeak_n_trips', 'peak_n_trips',
       'offpeak_frequency', 'peak_frequency', 'service_date', 'name',
       'route_id', 'route_short_name', 'route_long_name'],
      dtype='object')

In [47]:
from shared_utils import schedule_rt_utils
# The triple-quoted string below is disabled code kept for reference: it
# built the dataset-key-to-organization crosswalk from df_schedule2 and
# wrote it to "crosswalk_to_organization.parquet".
'''
crosswalk = schedule_rt_utils.sample_gtfs_dataset_key_to_organization_crosswalk(
    df_schedule2.rename(
        columns = {"schedule_gtfs_dataset_key": "gtfs_dataset_key"}
    ),
    analysis_date_list[-1],
    quartet_data = "schedule",
    dim_gtfs_dataset_cols = [
        "key",
    ],
    dim_organization_cols = ["source_record_id", "name", "caltrans_district"]
)

crosswalk.to_parquet("crosswalk_to_organization.parquet")
'''

In [51]:
# Load the organization crosswalk from the local parquet file (the file
# name matches the one written by the disabled code in the previous cell).
crosswalk = pd.read_parquet("crosswalk_to_organization.parquet")

In [57]:
# Count, per service_date, the schedule rows whose
# schedule_gtfs_dataset_key has no match in the crosswalk.
(
    df_schedule2
    .merge(
        crosswalk,
        on=["schedule_gtfs_dataset_key"],
        how="outer",
        indicator=True,
    )
    .query('_merge=="left_only"')
    .service_date
    .value_counts()
)

2023-03-15    1998
2023-04-12    1100
2023-08-15     935
2023-05-17     918
2023-07-12     851
2023-06-14     780
2023-09-13      97
2023-10-11      71
2023-11-15      20
Name: service_date, dtype: int64

In [58]:
# Count, per feed name, the schedule rows whose
# schedule_gtfs_dataset_key has no match in the crosswalk.
(
    df_schedule2
    .merge(
        crosswalk,
        on=["schedule_gtfs_dataset_key"],
        how="outer",
        indicator=True,
    )
    .query('_merge=="left_only"')
    .name
    .value_counts()
)

San Diego Schedule            1064
Sacramento Schedule            500
North County Schedule          454
Santa Clarita Schedule         417
StanRTA Schedule               408
                              ... 
Inglewood Schedule               3
Lawndale Beat GMV Schedule       2
Cudahy Schedule                  2
La Campana Schedule              2
Santa Maria Schedule             1
Name: name, Length: 71, dtype: int64

In [41]:
# Second-to-last analysis date (the "nov2023" entry, given the order of
# `months`).
d = analysis_date_list[-2]

In [45]:
# Preview the first rows of the route-level speeds file for date `d`.
gpd.read_parquet(f"{SEGMENT_GCS}trip_summary/route_speeds_{d}.parquet").head()

Unnamed: 0,org_id,agency,route_id,route_name,direction_id,common_shape_id,time_of_day,speed_mph,n_trips,avg_sched_trip_min,avg_rt_trip_min,base64_url,district_name,geometry
0,rec00qSzZL8KqiXAo,Long Beach Transit,51,LONG BEACH BLVD,0,510078,AM Peak,5.0,13,50.7,124.5,aHR0cHM6Ly9kcml2ZS5nb29nbGUuY29tL3VjP2V4cG9ydD...,07 - Los Angeles,"LINESTRING (-118.22298 33.87534, -118.22293 33..."
1,rec00qSzZL8KqiXAo,Long Beach Transit,51,LONG BEACH BLVD,0,510078,Early AM,5.8,8,43.8,109.5,aHR0cHM6Ly9kcml2ZS5nb29nbGUuY29tL3VjP2V4cG9ydD...,07 - Los Angeles,"LINESTRING (-118.22298 33.87534, -118.22293 33..."
2,rec00qSzZL8KqiXAo,Long Beach Transit,51,LONG BEACH BLVD,0,510078,Evening,6.0,4,42.0,105.3,aHR0cHM6Ly9kcml2ZS5nb29nbGUuY29tL3VjP2V4cG9ydD...,07 - Los Angeles,"LINESTRING (-118.22298 33.87534, -118.22293 33..."
3,rec00qSzZL8KqiXAo,Long Beach Transit,51,LONG BEACH BLVD,0,510078,Midday,4.6,20,55.8,135.0,aHR0cHM6Ly9kcml2ZS5nb29nbGUuY29tL3VjP2V4cG9ydD...,07 - Los Angeles,"LINESTRING (-118.22298 33.87534, -118.22293 33..."
4,rec00qSzZL8KqiXAo,Long Beach Transit,51,LONG BEACH BLVD,0,510078,PM Peak,4.9,16,54.0,129.7,aHR0cHM6Ly9kcml2ZS5nb29nbGUuY29tL3VjP2V4cG9ydD...,07 - Los Angeles,"LINESTRING (-118.22298 33.87534, -118.22293 33..."


## Prep `route-time_of_day-month` schedule and RT dfs

In [None]:
# The triple-quoted string below is disabled code kept for reference: it
# pulled the 2023 rows of `fct_scheduled_service_by_daypart` and wrote
# them to the parquet file that a later cell reads back.
'''
df = (tbls.mart_ad_hoc.fct_scheduled_service_by_daypart()
      >> filter(_.year == 2023)
      >> collect()
     )
     
df.to_parquet(f"{SCHED_GCS}service_daypart_2023_m01_m07.parquet")
'''

In [None]:
# Numeric day_type code -> day name; codes run from 1 (Sunday) to 7 (Saturday).
DAY_TYPE_DICT = dict(enumerate(
    [
        "Sunday", "Monday", "Tuesday", "Wednesday",
        "Thursday", "Friday", "Saturday",
    ],
    start=1,
))

# Service type -> the day_type codes that belong to it.
WEEKDAY_WEEKEND_DICT = dict(
    weekday=[2, 3, 4, 5, 6],
    weekend=[1, 7],
)

In [None]:
def aggregate_scheduled_service_by_time_of_day(
    df: pd.DataFrame,
    group_cols: list
) -> pd.DataFrame:
    """
    Sum scheduled service hours and trips for each `group_cols` combination
    and derive the average service hours per trip.

    A `service_type` column ("weekday" / "weekend") is added from
    `day_type` before grouping, so `group_cols` may include it.

    Returns one row per group with ttl_service_hours (rounded to 2
    decimals), n_trips, and avg_service_hours = ttl_service_hours / n_trips
    (computed from the unrounded total, then rounded to 2 decimals).
    """
    # Invert WEEKDAY_WEEKEND_DICT: {service_type: [day codes]} becomes
    # {day code: service_type} so it can be used with Series.map
    # https://stackoverflow.com/questions/61135954/pandas-map-column-from-dict-values-in-a-list
    day_type_to_service_type = {}
    for service_type, day_types in WEEKDAY_WEEKEND_DICT.items():
        for day_type in day_types:
            day_type_to_service_type[day_type] = service_type

    labeled = df.assign(
        service_type=df.day_type.map(day_type_to_service_type)
    )

    totals = (
        labeled
        .groupby(group_cols)
        [["ttl_service_hours", "n_trips"]]
        .sum()
        .reset_index()
    )

    # Average uses the unrounded total; rounding happens afterwards
    avg_hours = totals.ttl_service_hours.divide(totals.n_trips).round(2)

    return totals.assign(
        ttl_service_hours=totals.ttl_service_hours.round(2),
        avg_service_hours=avg_hours,
    )


In [None]:
# Scheduled service by daypart, read from the parquet export in SCHED_GCS
service_hours = pd.read_parquet(
    f"{SCHED_GCS}service_daypart_2023_m01_m07.parquet"
)

# Grain of the aggregation: route x year-month x time_of_day x
# service_type (service_type is added inside the aggregation function)
route_cols = [
    "source_record_id",
    "route_id",
    #"route_long_name", "route_short_name", 
    "year",
    "month",
    "time_of_day",
    "service_type",
]

service_time_of_day = aggregate_scheduled_service_by_time_of_day(
    service_hours, route_cols
)
# Rename so the id can serve as the merge key for the organization crosswalk
service_time_of_day = service_time_of_day.rename(
    columns={"source_record_id": "schedule_source_record_id"}
)

In [None]:
def merge_in_organization_identifiers(scheduled_service: pd.DataFrame):
    """
    Replace schedule_source_record_id on `scheduled_service` with the
    organization's source_record_id and name.

    The crosswalk is the distinct set of (organization_source_record_id,
    organization_name, schedule_source_record_id) from
    `dim_provider_gtfs_data`, excluding rows where either record id is
    missing. It is inner-merged on schedule_source_record_id, so
    scheduled service rows without a crosswalk match are dropped.

    NOTE(review): a function with this same name is also defined in an
    earlier cell; when cells run top to bottom this later definition is
    the one in effect.
    """
    # NOTE(review): `tbls` and the pipe verbs (`select`, `filter`,
    # `distinct`, `collect`, `_`) are not imported in the cells shown
    # here -- confirm they are in scope before running.
    source_table = tbls.mart_transit_database.dim_provider_gtfs_data()

    # The `!= None` comparisons are left as written; they appear to be
    # lazy column expressions, not Python identity checks -- TODO confirm.
    crosswalk = (
        source_table
        >> select(
            _.organization_source_record_id,
            _.organization_name,
            _.schedule_source_record_id,
        )
        >> filter(
            _.schedule_source_record_id != None,
            _.organization_source_record_id != None,
        )
        >> distinct()
        >> collect()
    )

    new_names = {
        "organization_source_record_id": "source_record_id",
        "organization_name": "name",
    }

    df_with_org = pd.merge(
        scheduled_service,
        crosswalk,
        on="schedule_source_record_id",
        how="inner",
    )
    df_with_org = df_with_org.drop(columns=["schedule_source_record_id"])
    df_with_org = df_with_org.rename(columns=new_names)

    return df_with_org

In [None]:
# Swap schedule_source_record_id for the organization's source_record_id
# and name.
service_df = merge_in_organization_identifiers(service_time_of_day)

In [None]:
# Use avg speed for a single day as stand-in for the month
# Each month is represented by the single date registered under its key
# in `rt_dates.DATES`; that one day's avg speed stands in for the month.
speed_month_dates = [
    "mar2023",
    "apr2023",
    "may2023",
    "jun2023",
]
speed_dates = [rt_dates.DATES[month_key] for month_key in speed_month_dates]
speed_dates

In [None]:
def import_route_speeds(analysis_date: str) -> gpd.GeoDataFrame:
    """
    Read the route-direction-time_of_day avg speeds file for one date
    and shape it for merging with scheduled service.

    Adds `year`, `month` and `analysis_date` columns derived from
    `analysis_date`, and renames org_id -> source_record_id,
    agency -> name, n_trips -> n_rt_trips.
    """
    speeds_path = (
        f"{SEGMENT_GCS}trip_summary/route_speeds_{analysis_date}.parquet"
    )
    route_speeds = gpd.read_parquet(speeds_path)

    # Parse the date once; year and month are taken from the same timestamp
    service_date = pd.to_datetime(analysis_date)

    with_dates = route_speeds.assign(
        year=service_date.year,
        month=service_date.month,
        analysis_date=service_date,
    )

    return with_dates.rename(
        columns={
            "org_id": "source_record_id",
            "agency": "name",
            "n_trips": "n_rt_trips",
        }
    )

In [None]:
# Stack the per-date speed tables row-wise, one table per entry in speed_dates
speed_df = pd.concat(
    [import_route_speeds(one_date) for one_date in speed_dates],
    axis=0,
)

In [None]:
# Outer-merge speeds with scheduled service so rows present on only one
# side are kept; `_merge` records which side each row came from.
# direction_id is not among the merge keys.
df = pd.merge(
    speed_df,
    service_df,
    on = ["source_record_id", "name", "route_id", 
          "time_of_day", "year", "month"],
    how = "outer",
    indicator=True
)

# Tally of rows found in both tables vs. only one of them
df._merge.value_counts()

## Check merge results

Speed data is not available for Jan or Feb, so we can drop those months.

Other than that, take a look at which routes appear in RT but not schedule and vice versa.

In [None]:
# Merge status for the 2023 rows in months 1-2
df[(df.month < 3) & (df.year == 2023)]._merge.value_counts()

In [None]:
# Merge status for the 2023 rows from month 3 onward
df[(df.month >= 3) & (df.year == 2023)]._merge.value_counts()

In [None]:
# Keep only the 2023 rows from month 3 onward and renumber the index
df2 = df[(df.month >= 3) & (df.year == 2023)].reset_index(drop=True)

## Sample Route Chart

### Issues to Resolve
* `route_id` may be unstable over many months. LA Metro, for example, appends a suffix after a hyphen.
* `route_name` is more stable, but may be less understandable in some cases (Metro's Line 720's standardized route_name (via `portfolio_utils`) is `SANTA MONICA-DOWNTOWN LA VIA WILSHIRE BLVD`)
* Speeds have `direction_id`, but scheduled service hours don't. Should we add it? This will actually double up the rows from speeds, and if we're not careful, we'll double count scheduled service hours.

In [None]:
# Rows whose operator name contains "Los Angeles County Metropolitan"
metro = df2[df2.name.str.contains("Los Angeles County Metropolitan")]
# Distinct route_id values containing "720", with their row counts
metro[metro.route_id.str.contains("720")].route_id.value_counts()

In [None]:
# route_name of the first Metro row whose (non-null) route_id contains "720"
one_route = metro[(metro.route_id.str.contains("720")) & 
                  (metro.route_id.notna())].route_name.iloc[0]
one_route

In [None]:
# Represent each year-month as the first day of that month so charts can
# use a temporal axis. The date is assembled from year / month / day
# components rather than parsed from a concatenated "YYYY M 01" string,
# which would leave the format to be inferred.
service_df = service_df.assign(
    date = pd.to_datetime(
        service_df[["year", "month"]].assign(day = 1)
    )
)

In [None]:
# Substrings used to pick out one operator and one route for the sample charts
test_operator = "Los Angeles County Metropolitan"
test_route = "720"

# Scheduled service rows for that operator/route; this rebinds `metro`,
# which an earlier cell assigned from df2
metro = service_df[
    (service_df.name.str.contains(test_operator)) & 
    (service_df.route_id.str.contains(test_route)) ] 

In [None]:
# Display order for the time-of-day bins; also used as the color domain.
# NOTE(review): confirm the "OWL" label matches the casing used in the data.
sort_time_of_day = [
    "Early AM", "AM Peak", "Midday",
    "PM Peak", "Evening", "OWL",
]

# Line chart of scheduled service hours by month, one line per
# time-of-day bin.
# NOTE(review): the y-axis sums `avg_service_hours` over every row in a
# month / time-of-day bin while being titled "Avg" -- confirm that summing
# averages is intended.
(
    alt.Chart(metro)
    .mark_line()
    .encode(
        x=alt.X("month(date):T"),
        y=alt.Y("sum(avg_service_hours):Q", title="Avg Service Hours"),
        color=alt.Color(
            "time_of_day:N",
            title="Time of Day",
            scale=alt.Scale(
                domain=sort_time_of_day,
                range=cp.CALITP_CATEGORY_BRIGHT_COLORS,
            ),
        ),
        tooltip=["time_of_day", "sum(avg_service_hours)"],
    )
    .properties(title=f"Route {metro.route_id.iloc[0]}")
    .interactive()
)

In [None]:
# Speed rows matching the same operator and route substrings
metro_speed = speed_df[(speed_df.name.str.contains(test_operator)) & 
        (speed_df.route_id.str.contains(test_route))]

In [None]:
# Bar chart of avg speed by month for direction 0, faceted into one
# column per time-of-day bin (bins ordered by `sort_time_of_day`).
(
    alt.Chart(metro_speed[metro_speed.direction_id == 0])
    .mark_bar()
    .encode(
        x=alt.X("month:O", title="Date"),
        y=alt.Y("speed_mph:Q", title="Avg Speed"),
        column=alt.Column(
            "time_of_day:N",
            title="Time of Day",
            sort=sort_time_of_day,
        ),
        color=alt.Color(
            "time_of_day:N",
            title="Time of Day",
            scale=alt.Scale(
                domain=sort_time_of_day,
                range=cp.CALITP_CATEGORY_BRIGHT_COLORS,
            ),
        ),
    )
    .properties(title=f"{metro_speed.route_id.iloc[0]} direction 0")
    .interactive()
)

In [None]:
# Bar chart of avg speed by month for direction 1, faceted into one
# column per time-of-day bin (bins ordered by `sort_time_of_day`).
(alt.Chart(metro_speed[metro_speed.direction_id==1])
 .mark_bar()
 .encode(
     x = alt.X("month:O", title = "Date"),
     y = alt.Y("speed_mph:Q", title="Avg Speed"),
     column = alt.Column("time_of_day:N", title = "Time of Day",
                         sort = sort_time_of_day),
     color = alt.Color(
         "time_of_day:N", title = "Time of Day",
         scale = alt.Scale(
             domain = sort_time_of_day, 
             range = cp.CALITP_CATEGORY_BRIGHT_COLORS)
     )
 # Title names the direction actually plotted (it previously said
 # "direction 0" while the data was filtered to direction_id == 1)
 ).properties(title = f"{metro_speed.route_id.iloc[0]} direction 1")
 .interactive()
)

In [None]:
# Does it make sense to take a 6 month avg of speeds by 
# route-direction-time_of_day?
# Mean speed per operator / route_name / direction / time-of-day across
# all dates present in metro_speed.
metro_speed_aggregated = (
    metro_speed
    .groupby([
        "source_record_id",
        "name",
        "route_name",
        "direction_id",
        "time_of_day",
    ])
    .agg({"speed_mph": "mean"})
    .reset_index()
)

In [None]:
# One row per operator / route_name. route_id and geometry are not part
# of the de-duplication subset, so the first occurrence's values are kept.
route_crosswalk = metro_speed[
    ["source_record_id", "name", "route_id", "route_name", "geometry"]
].drop_duplicates(subset=["source_record_id", "name", "route_name"])

In [None]:
# Attach route_id and geometry to the averaged speeds. The crosswalk has
# one row per route_name, so every direction / time-of-day row for a
# route receives the same geometry.
metro_speed_aggregated = pd.merge(
    route_crosswalk,
    metro_speed_aggregated, 
    on = ["source_record_id", "name", "route_name"]
)

In [None]:
# List the columns available after the merge.
metro_speed_aggregated.columns

In [None]:
# Stepped red-yellow-green color scale spanning 0 to 40, shared by all maps.
# NOTE(review): `branca` (and `display`) are not imported in the cells
# shown here -- confirm they are in scope before running this cell.
COLORSCALE = branca.colormap.step.RdYlGn_10.scale(vmin=0, vmax=40)
COLORSCALE.caption = "Speed (miles per hour)"

# One interactive map per time-of-day bin, in display order
for t in sort_time_of_day:
    subset_df = metro_speed_aggregated[metro_speed_aggregated.time_of_day == t]

    # Nothing to draw for bins without any rows
    if len(subset_df) == 0:
        continue

    m = subset_df.explore(
        "speed_mph",
        cmap=COLORSCALE,
        tiles="CartoDBPositron",
    )
    print(t)
    display(m)