# Monthly Trends 

Now that we're beginning to add monthly aggregations to schedule data, let's see how it all plays together. Use this notebook to work out the kinks of combining all our current data products.

In [1]:
# USE_PYGEOS has to be set before geopandas is imported for the first time
# (directly, or indirectly through the project modules below); setting it
# afterwards has no effect and the PyGEOS deprecation warning is still emitted.
import os
os.environ['USE_PYGEOS'] = '0'

import altair as alt
import branca
import geopandas
import geopandas as gpd
import pandas as pd

from calitp_data_analysis.tables import tbls
from siuba import *

from shared_utils import calitp_color_palette as cp
from shared_utils import portfolio_utils, rt_dates, styleguide
from segment_speed_utils.project_vars import SEGMENT_GCS, SCHED_GCS

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


## Prep `route-time_of_day-month` schedule and RT dfs

In [2]:
# One-off warehouse export, deliberately disabled by wrapping it in a string
# literal so it does not run with the rest of the notebook. The parquet path it
# would write is the same file that is read back into `service_hours` below.
'''
df = (tbls.mart_ad_hoc.fct_scheduled_service_by_daypart()
      >> filter(_.year == 2023)
      >> collect()
     )
     
df.to_parquet(f"{SCHED_GCS}service_daypart_2023_m01_m07.parquet")
'''

'\ndf = (tbls.mart_ad_hoc.fct_scheduled_service_by_daypart()\n      >> filter(_.year == 2023)\n      >> collect()\n     )\n     \ndf.to_parquet(f"{SCHED_GCS}service_daypart_2023_m01_m07.parquet")\n'

In [3]:
# Day-type code -> day name, with the week starting on Sunday (code 1).
DAY_TYPE_DICT = dict(enumerate(
    [
        "Sunday", "Monday", "Tuesday", "Wednesday",
        "Thursday", "Friday", "Saturday",
    ],
    start=1,
))

# Service type -> the day-type codes that belong to it.
WEEKDAY_WEEKEND_DICT = dict(
    weekday = [2, 3, 4, 5, 6],
    weekend = [1, 7],
)

In [4]:
def aggregate_scheduled_service_by_time_of_day(
    df: pd.DataFrame,
    group_cols: list
) -> pd.DataFrame:
    """
    Sum scheduled service hours and trips for each combination of group_cols,
    then derive the average service hours per trip.

    A `service_type` column (weekday / weekend) is derived from `day_type`
    before grouping, so it can be listed in group_cols.

    Returns one row per group with ttl_service_hours and n_trips summed and
    avg_service_hours = ttl_service_hours / n_trips; both hour columns are
    rounded to 2 decimals.
    """
    # Invert {service_type: [day_type, ...]} into {day_type: service_type}
    day_type_to_service_type = {}
    for service_type, day_types in WEEKDAY_WEEKEND_DICT.items():
        for day_type in day_types:
            day_type_to_service_type[day_type] = service_type

    with_service_type = df.assign(
        service_type = df["day_type"].map(day_type_to_service_type)
    )

    totals = (
        with_service_type
        .groupby(group_cols)[["ttl_service_hours", "n_trips"]]
        .agg("sum")
        .reset_index()
    )

    # The average is computed from the unrounded total; rounding comes last.
    hours_per_trip = totals["ttl_service_hours"].divide(totals["n_trips"])

    return totals.assign(
        avg_service_hours = hours_per_trip.round(2),
        ttl_service_hours = totals["ttl_service_hours"].round(2),
    )


In [5]:
# Cached 2023 scheduled service export
service_hours_path = f"{SCHED_GCS}service_daypart_2023_m01_m07.parquet"
service_hours = pd.read_parquet(service_hours_path)

# Grain of the aggregation: feed-route-month-time_of_day-service_type
# (route_long_name / route_short_name are left out for now)
route_cols = [
    "source_record_id",
    "route_id",
    "year",
    "month",
    "time_of_day",
    "service_type",
]

service_time_of_day = aggregate_scheduled_service_by_time_of_day(
    service_hours, route_cols
)
service_time_of_day = service_time_of_day.rename(
    columns = {"source_record_id": "schedule_source_record_id"}
)

In [6]:
def merge_in_organization_identifiers(scheduled_service: pd.DataFrame):
    """
    Attach organization identifiers to scheduled service rows.

    Pulls the distinct (organization_source_record_id, organization_name,
    schedule_source_record_id) combinations from dim_provider_gtfs_data,
    skipping rows where either id is null, and inner-joins them to
    scheduled_service on schedule_source_record_id.

    The schedule key is dropped after the join and the organization columns
    are renamed to `source_record_id` and `name`.
    """
    crosswalk_query = (
        tbls.mart_transit_database.dim_provider_gtfs_data()
        >> select(
            _.organization_source_record_id,
            _.organization_name,
            _.schedule_source_record_id,
        )
        >> filter(
            _.schedule_source_record_id != None,
            _.organization_source_record_id != None,
        )
        >> distinct()
    )
    crosswalk = crosswalk_query >> collect()

    joined = pd.merge(
        scheduled_service,
        crosswalk,
        on = "schedule_source_record_id",
        how = "inner"
    )
    joined = joined.drop(columns = ["schedule_source_record_id"])

    organization_renames = {
        "organization_source_record_id": "source_record_id",
        "organization_name": "name",
    }

    return joined.rename(columns = organization_renames)

In [7]:
# Swap the schedule feed key for organization-level `source_record_id` / `name`
# columns, the same column names the route speed tables are renamed to.
service_df = merge_in_organization_identifiers(service_time_of_day)

In [8]:
# A single day's average speed stands in for each month
speed_month_dates = [
    "mar2023",
    "apr2023",
    "may2023",
    "jun2023",
]
speed_dates = [rt_dates.DATES[month_key] for month_key in speed_month_dates]
speed_dates

['2023-03-15', '2023-04-12', '2023-05-17', '2023-06-14']

In [9]:
def import_route_speeds(analysis_date: str) -> gpd.GeoDataFrame:
    """
    Read the route speeds parquet for one analysis date and shape it
    for merging with scheduled service.

    Adds year, month and analysis_date (as a timestamp) columns derived from
    analysis_date, and renames org_id -> source_record_id, agency -> name,
    n_trips -> n_rt_trips.
    """
    speeds_path = (
        f"{SEGMENT_GCS}trip_summary/route_speeds_{analysis_date}.parquet"
    )
    route_speeds = gpd.read_parquet(speeds_path)

    # Parse once and reuse for all three date-derived columns
    date_ts = pd.to_datetime(analysis_date)

    route_speeds = route_speeds.assign(
        year = date_ts.year,
        month = date_ts.month,
        analysis_date = date_ts,
    )

    merge_ready_names = {
        "org_id": "source_record_id",
        "agency": "name",
        "n_trips": "n_rt_trips",
    }

    return route_speeds.rename(columns = merge_ready_names)

In [10]:
# Stack the per-date route speed tables into one long table
speed_df = pd.concat(
    [import_route_speeds(analysis_date) for analysis_date in speed_dates],
    axis=0,
)

In [11]:
# Outer merge keeps rows found on only one side; `_merge` records the origin
# (left_only = speeds only, right_only = schedule only, both = matched).
df = pd.merge(
    speed_df,
    service_df,
    on = [
        "source_record_id", "name",
        "route_id", "time_of_day",
        "year", "month",
    ],
    how = "outer",
    indicator = True,
)

df["_merge"].value_counts()

right_only    1193922
both            76637
left_only         979
Name: _merge, dtype: int64

## Check merge results

Speed data not available for Jan or Feb, so we can drop those.

Other than that, take a look at which routes appear in RT but not schedule and vice versa.

In [12]:
# Merge outcome for Jan-Feb 2023
df.loc[(df.month < 3) & (df.year == 2023), "_merge"].value_counts()

right_only    355861
left_only          0
both               0
Name: _merge, dtype: int64

In [13]:
# Merge outcome for March 2023 onward
df.loc[(df.month >= 3) & (df.year == 2023), "_merge"].value_counts()

right_only    838061
both           76637
left_only        979
Name: _merge, dtype: int64

In [14]:
# Keep only March 2023 onward and renumber the rows
df2 = (
    df.loc[(df.month >= 3) & (df.year == 2023)]
    .reset_index(drop=True)
)

## Sample Route Chart

### Issues to Resolve
* `route_id` may be unstable over many months. For LA Metro, they tack something after a hyphen.
* `route_name` is more stable, but may be less understandable in some cases (Metro's Line 720's standardized route_name (via `portfolio_utils`) is `SANTA MONICA-DOWNTOWN LA VIA WILSHIRE BLVD`)
* speeds have `direction_id`, and scheduled service hours don't. Should we add it? As it stands, the merge keys do not include direction, so each scheduled service row is matched to a speed row per direction, and if we're not careful, we'll double count scheduled service hours.

In [15]:
# Merged rows for LA Metro, then the distinct route_ids that contain "720"
metro = df2.loc[
    df2["name"].str.contains("Los Angeles County Metropolitan")
]
metro.loc[metro["route_id"].str.contains("720"), "route_id"].value_counts()

720-13167    96
720-13168    18
Name: route_id, dtype: int64

In [16]:
# route_name of the first Metro row whose route_id contains "720"
one_route = metro.loc[
    (metro.route_id.str.contains("720")) & (metro.route_id.notna()),
    "route_name"
].iloc[0]
one_route

'SANTA MONICA-DOWNTOWN LA VIA WILSHIRE BLVD'

In [17]:
# First-of-month timestamp for each row, used as the time axis when charting.
# Assemble it from numeric year / month / day components instead of parsing a
# concatenated string, so nothing depends on date-format inference.
service_df = service_df.assign(
    date = pd.to_datetime(
        service_df[["year", "month"]].assign(day = 1)
    )
)

In [18]:
# Operator / route used for the sample charts below
test_operator = "Los Angeles County Metropolitan"
test_route = "720"

# Scheduled service rows for that operator and route
metro = service_df.loc[
    service_df["name"].str.contains(test_operator)
    & service_df["route_id"].str.contains(test_route)
]

In [19]:
# Display order for the time-of-day categories; also used as the color domain
sort_time_of_day = [
    "Early AM", "AM Peak", "Midday",
    "PM Peak", "Evening", "OWL",
]

# Scheduled service hours by month, one line per time-of-day category
(
    alt.Chart(metro)
    .mark_line()
    .encode(
        x = alt.X("month(date):T"),
        y = alt.Y("sum(avg_service_hours):Q", title = "Avg Service Hours"),
        color = alt.Color(
            "time_of_day:N",
            title = "Time of Day",
            scale = alt.Scale(
                domain = sort_time_of_day,
                range = cp.CALITP_CATEGORY_BRIGHT_COLORS,
            ),
        ),
        tooltip = ["time_of_day", "sum(avg_service_hours)"],
    )
    .properties(title = f"Route {metro.route_id.iloc[0]}")
    .interactive()
)

In [20]:
# Route speed rows for the same test operator and route
metro_speed = speed_df.loc[
    speed_df["name"].str.contains(test_operator)
    & speed_df["route_id"].str.contains(test_route)
]

In [21]:
# Average speed by month for direction 0, one panel per time-of-day category
(
    alt.Chart(metro_speed.loc[metro_speed.direction_id == 0])
    .mark_bar()
    .encode(
        x = alt.X("month:O", title = "Date"),
        y = alt.Y("speed_mph:Q", title = "Avg Speed"),
        column = alt.Column(
            "time_of_day:N",
            title = "Time of Day",
            sort = sort_time_of_day,
        ),
        color = alt.Color(
            "time_of_day:N",
            title = "Time of Day",
            scale = alt.Scale(
                domain = sort_time_of_day,
                range = cp.CALITP_CATEGORY_BRIGHT_COLORS,
            ),
        ),
    )
    .properties(title = f"{metro_speed.route_id.iloc[0]} direction 0")
    .interactive()
)

In [22]:
# Average speed by month for direction 1, one panel per time-of-day category.
# The title now names direction 1 so it matches the direction_id filter
# (it previously said "direction 0", copied from the chart above).
(alt.Chart(metro_speed[metro_speed.direction_id==1])
 .mark_bar()
 .encode(
     x = alt.X("month:O", title = "Date"),
     y = alt.Y("speed_mph:Q", title="Avg Speed"),
     column = alt.Column("time_of_day:N", title = "Time of Day",
                         sort = sort_time_of_day),
     color = alt.Color(
         "time_of_day:N", title = "Time of Day",
         scale = alt.Scale(
             domain = sort_time_of_day, 
             range = cp.CALITP_CATEGORY_BRIGHT_COLORS)
     )
 ).properties(title = f"{metro_speed.route_id.iloc[0]} direction 1")
 .interactive()
)

In [23]:
# Open question: does a multi-month average of speeds by
# route-direction-time_of_day make sense?
# Grouping on route_name (not route_id) pools rows whose route_id differs.
metro_speed_aggregated = (
    metro_speed
    .groupby([
        "source_record_id", "name",
        "route_name", "direction_id",
        "time_of_day",
    ])["speed_mph"]
    .mean()
    .reset_index()
)

In [24]:
# One row (first occurrence) per operator + route_name, carrying a
# route_id and geometry to attach back onto the aggregated speeds
route_crosswalk = (
    metro_speed
    .drop_duplicates(subset=["source_record_id", "name", "route_name"])
    [["source_record_id", "name", "route_id", "route_name", "geometry"]]
)

In [25]:
# Attach route_id and geometry to the averaged speeds; the crosswalk is the
# left table of the merge
metro_speed_aggregated = pd.merge(
    route_crosswalk,
    metro_speed_aggregated,
    on = ["source_record_id", "name", "route_name"],
    how = "inner",
)

In [26]:
# Inspect the columns of the merged table before mapping it
metro_speed_aggregated.columns

Index(['source_record_id', 'name', 'route_id', 'route_name', 'geometry',
       'direction_id', 'time_of_day', 'speed_mph'],
      dtype='object')

In [27]:
# Shared stepped red-yellow-green scale over 0-40 so every map below uses
# the same color breaks
COLORSCALE = branca.colormap.step.RdYlGn_10.scale(vmin=0, vmax=40)
COLORSCALE.caption = "Speed (miles per hour)"

# One interactive map per time-of-day category; categories with no rows are skipped
for t in sort_time_of_day:
    subset_df = metro_speed_aggregated[
        metro_speed_aggregated.time_of_day == t
    ]

    if subset_df.empty:
        continue

    m = subset_df.explore(
        "speed_mph",
        cmap = COLORSCALE,
        tiles = 'CartoDBPositron',
    )
    print(t)
    display(m)

Early AM


AM Peak


Midday


PM Peak


Evening
