## New service_hour dataset

In [47]:
from datetime import datetime

import _section1_utils as section1
import geopandas as gpd
import numpy as np
import pandas as pd
from IPython.display import HTML, Image, Markdown, display, display_html
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    GTFS_DATA_DICT,
    PROJECT_CRS,
    RT_SCHED_GCS,
    SCHED_GCS,
    SEGMENT_GCS,
)
from shared_utils import catalog_utils, rt_dates, rt_utils
import _report_utils 

In [42]:
# Notebook-wide pandas display settings: show up to 100 columns, render
# floats with two decimals, and never truncate rows or cell contents.
for _option, _value in {
    "display.max_columns": 100,
    "display.float_format": "{:.2f}".format,
    "display.max_rows": None,
    "display.max_colwidth": None,
}.items():
    pd.set_option(_option, _value)

### Create new service_hour dataset? 
* The script is already written (see the functions below).
* **TODO:** Where should it be added within `gtfs_funnel`?
* **TODO:** How do I upload the output to GCS so it is publicly available?
* **TODO:** Do I need to add diagrams or notes?

In [43]:
def concatenate_trips(
    date_list: list,
) -> pd.DataFrame:
    """
    Stack the scheduled trips tables for every date in `date_list`
    into a single dataframe.

    Only the columns needed to total up an operator's scheduled
    service hours are read: the feed name, service date, route long
    name, the trip's first departure time (Pacific) and service hours.

    The result is sorted by service_date and given a fresh 0..n-1 index.
    """
    trips_file = GTFS_DATA_DICT.schedule_downloads.trips

    keep_cols = [
        "name",
        "service_date",
        "route_long_name",
        "trip_first_departure_datetime_pacific",
        "service_hours",
    ]

    all_trips = time_series_utils.concatenate_datasets_across_dates(
        COMPILED_CACHED_VIEWS,
        trips_file,
        date_list,
        data_type="df",
        columns=keep_cols,
    )

    # Order chronologically, then discard the per-date indices.
    all_trips = all_trips.sort_values(["service_date"])
    return all_trips.reset_index(drop=True)

In [55]:
# time_series_utils.concatenate_datasets_across_dates??

In [50]:
# Dates for the April 2023 sample week; exclude_wed=False presumably keeps
# Wednesday in the list so the full week is covered (confirm in rt_dates).
apr_23week = rt_dates.get_week(month="apr2023", exclude_wed=False)

In [51]:
# Smoke test: load trip-level schedule rows for every date of the April 2023 week.
apr_23_test = concatenate_trips(apr_23week)

In [52]:
apr_23_test.shape

(674114, 5)

In [53]:
apr_23_test.head()

Unnamed: 0,name,service_date,route_long_name,trip_first_departure_datetime_pacific,service_hours
0,VCTC GMV Schedule,2023-04-10,Route 11,2023-04-10 06:00:00,0.53
1,Bay Area 511 Marin Schedule,2023-04-10,Downtown San Rafael - Sausalito,2023-04-10 16:59:00,0.93
2,Bay Area 511 Marin Schedule,2023-04-10,Downtown San Rafael - Sausalito,2023-04-10 19:33:00,0.87
3,Bay Area 511 Marin Schedule,2023-04-10,Hamilton - San Marin HS,2023-04-10 14:47:00,0.28
4,Bay Area 511 Marin Schedule,2023-04-10,Hamilton - San Marin HS,2023-04-10 07:30:00,0.78


In [43]:
def get_day_type(date):
    """
    Return the English weekday name ("Monday" ... "Sunday") for `date`.

    `date` can be any object exposing a `weekday()` method that returns
    0 for Monday through 6 for Sunday (e.g. datetime, date, pd.Timestamp).
    """
    day_names = (
        "Monday",
        "Tuesday",
        "Wednesday",
        "Thursday",
        "Friday",
        "Saturday",
        "Sunday",
    )
    day_index = date.weekday()
    return day_names[day_index]

In [43]:
def weekday_or_weekend(row):
    """
    Classify a row by its `day_type`: Saturday and Sunday keep their
    own label, every other value is reported as "Weekday".
    """
    weekend_days = ("Sunday", "Saturday")
    return row.day_type if row.day_type in weekend_days else "Weekday"

In [43]:
def total_service_hours(date_list: list, name: str) -> pd.DataFrame:
    """
    Total up scheduled service hours by departure hour, month,
    and day type (Weekday / Saturday / Sunday) for one operator.

    Parameters
    ----------
    date_list:
        Service dates to load; passed straight to `concatenate_trips`.
    name:
        Value of the trips table's `name` column to keep.

    Returns
    -------
    pd.DataFrame
        One row per name / month / weekend_weekday / departure_hour with
        two value columns:
        * `weekend_service_hours`: the summed service hours for the group.
        * `weekday_service_hours`: that same sum divided by 5.

    NOTE(review): both value columns are filled for every row regardless
    of `weekend_weekday`, so the "weekend" figure on Weekday rows is the
    total across all weekdays, and the "weekday" figure on Saturday/Sunday
    rows is not meaningful. Consumers presumably pick the column matching
    the row's day type -- confirm before relying on either column blindly.
    """
    # Assumes date_list holds exactly one of each Monday-Friday.
    weekdays_per_week = 5

    # Combine all the days' data for a week
    trips = concatenate_trips(date_list)

    # Filter to the specific operator
    operator_trips = trips.loc[trips.name == name].reset_index(drop=True)

    # Day name (Monday, Tuesday, ...) computed in one vectorized pass
    # instead of a Python-level call per row.
    day_type = pd.to_datetime(operator_trips["service_date"]).dt.day_name()

    operator_trips = operator_trips.assign(
        day_type=day_type,
        # Saturday / Sunday keep their label, everything else is "Weekday"
        weekend_weekday=day_type.where(
            day_type.isin(["Saturday", "Sunday"]), "Weekday"
        ),
        # Hour of the trip's first departure
        departure_hour=operator_trips.trip_first_departure_datetime_pacific.dt.hour,
        # Drop the specific day, leave only year & month (YYYY-MM)
        month=operator_trips.service_date.astype(str).str.slice(stop=7),
    )

    # Aggregate total service hours by weekday, Saturday, or Sunday.
    hourly_totals = (
        operator_trips.groupby(["name", "month", "weekend_weekday", "departure_hour"])
        .agg(
            {
                "service_hours": "sum",
            }
        )
        .reset_index()
    )

    # For weekday, we need to divide by 5 to get a typical weekday
    hourly_totals["weekday_service_hours"] = (
        hourly_totals.service_hours / weekdays_per_week
    )

    return hourly_totals.rename(columns={"service_hours": "weekend_service_hours"})

In [43]:
def total_service_hours_all_months(
    name: str,
    months: tuple = ("apr2023", "oct2023", "apr2024"),
) -> pd.DataFrame:
    """
    Find service hours for the months we have a full week's
    worth of data downloaded for one operator.

    Parameters
    ----------
    name:
        Operator name forwarded to `total_service_hours`.
    months:
        Month keys understood by `rt_dates.get_week` (e.g. "apr2023").
        Defaults to the three sample weeks currently available.
        Must not be empty, since there would be nothing to concatenate.

    Returns
    -------
    pd.DataFrame
        The per-month results of `total_service_hours` stacked in the
        order of `months`, with column names passed through
        `_report_utils.replace_column_names`. The index is not reset, so
        index values repeat across months.
    """
    # Grab the date lists with a full week's worth of data.
    week_date_lists = [
        rt_dates.get_week(month=month, exclude_wed=False) for month in months
    ]

    # Sum up total service_hours for each sample week.
    monthly_frames = [
        total_service_hours(week_dates, name) for week_dates in week_date_lists
    ]

    # Combine everything
    all_df = pd.concat(monthly_frames)

    # Rename the columns
    all_df.columns = all_df.columns.map(_report_utils.replace_column_names)
    return all_df

In [44]:
# Organization to run the service-hours summary for.
organization_name = "Marin County Transit District"

In [45]:
# Translate the organization name into the `name` value the trips table is
# filtered on (presumably the schedule feed name -- confirm in _section1_utils).
name = section1.organization_name_crosswalk(organization_name)

In [48]:
# Hourly scheduled service hours for this operator across all sample weeks.
scheduled_service = total_service_hours_all_months(name)

In [49]:
scheduled_service.shape

(198, 6)