In [1]:
%%capture
import warnings
warnings.filterwarnings('ignore')

import altair as alt
import calitp_data_analysis.magics
import geopandas as gpd
import great_tables as gt
import pandas as pd

from IPython.display import display, HTML, Markdown
from great_tables import md

from segment_speed_utils.project_vars import RT_SCHED_GCS
from calitp_data_analysis import calitp_color_palette as cp
from shared_utils import rt_utils, rt_dates

alt.renderers.enable("html")
alt.data_transformers.enable('default', max_rows=None)

In [2]:
name = "SBMTD Schedule"

In [3]:
# %%capture_parameters
# name

In [4]:
df = pd.read_parquet(
    f"{RT_SCHED_GCS}digest/schedule_vp_metrics.parquet",
    filters = [[("name", "==", name)]]
)

most_recent_date = df.service_date.max()

In [5]:
RENAME_COLS = {
    "avg_sched_service_min": "Average Scheduled Service (min)",
    "speed_mph": "Average Route Speed",
    "n_scheduled_trips": "Daily Scheduled Trips",
    "direction_id": "Direction",
    "service_date": "Date",
    "route_combined_name": "Route",
    "pct_in_shape": "% vp in scheduled shape",
    "vp_per_minute": "vp per minute",
    "pct_rt_journey_vp": "% RT journey 1+ vp per minute",
    "pct_rt_journey_atleast2_vp": "% RT journey 2+ vp per minute",
    "pct_sched_journey_atleast1_vp": "% scheduled journey 1+ vp per minute",
    "pct_sched_journey_atleast2_vp": "% scheduled journey at 2+ vp per minute",
}

def labeling(word: str, rename_dict: dict = RENAME_COLS) -> str:
    if word in rename_dict.keys():
        return rename_dict[word]
    else:
        return word.replace('_', ' ').title()

# {name}

## Monthly Trends 

In [6]:
by_date_category = pd.crosstab(
    df.service_date, df.sched_rt_category,
    values = df.n_scheduled_trips,
    aggfunc="sum",
).reset_index().fillna(0)

(gt.GT(by_date_category, rowname_col = "service_date")
.tab_spanner(
        label="Daily Trips by GTFS Availability",
        columns=["schedule_only", "vp_only", "schedule_and_vp"]
    )
 .fmt_integer(["schedule_only", "vp_only", "schedule_and_vp"])
 .tab_options(container_width = "70%")
 .tab_options(table_font_size="12px")
)

0,1,2,3
2023-03-15,4,0,1672
2023-04-12,3,0,1673
2023-05-17,18,0,1658
2023-06-14,2,0,1604
2023-07-12,0,0,1598
2023-08-15,0,0,1612
2023-09-13,14,0,1684
2023-10-11,12,0,1716
2023-12-13,18,0,1710
2024-01-17,14,0,1702


In [7]:
#pd.crosstab(test.service_date, test.sched_rt_category,
#            values=test.n_scheduled_trips,
#            aggfunc="sum",
#            normalize="index"
#            )

In [8]:
route_categories = df[
    df.time_period=="all_day"
].groupby("sched_rt_category").agg(
    {"route_combined_name": "nunique"}
).reset_index()

(gt.GT(data=route_categories.dropna())
 .fmt_integer(columns = ["route_combined_name"], compact=True)
 .cols_label(
     route_combined_name = "# Routes",
     sched_rt_category = "Category"
 ).tab_options(container_width = "50%")
 .tab_header(
     title=md("Routes with GTFS Availability"),
 ).tab_options(
    table_font_size="12px"
 )
)

Routes with GTFS Availability,Routes with GTFS Availability.1
schedule_only,16
vp_only,3
schedule_and_vp,35
Category,# Routes


In [10]:
df.columns

Index(['schedule_gtfs_dataset_key', 'direction_id', 'time_period',
       'avg_scheduled_service_minutes', 'avg_stop_miles', 'n_scheduled_trips',
       'frequency', 'road_freq_category', 'road_typology', 'pct_typology',
       'service_date', 'minutes_atleast1_vp', 'minutes_atleast2_vp',
       'rt_service_minutes', 'scheduled_service_minutes', 'total_vp',
       'vp_in_shape', 'n_trips', 'vp_per_minute', 'pct_in_shape',
       'pct_rt_journey_vp', 'pct_rt_journey_atleast2_vp',
       'pct_sched_journey_atleast1_vp', 'pct_sched_journey_atleast2_vp',
       'n_vp_trips', 'pct_rt_journey_atleast1_vp', 'sched_rt_category',
       'speed_mph', 'name', 'route_id', 'route_combined_name', 'base64_url',
       'organization_source_record_id', 'organization_name',
       'caltrans_district'],
      dtype='object')

In [9]:
route_merge_cols = ["route_combined_name", "direction_id"]

all_day_stats = df[
    (df.service_date == most_recent_date) & 
    (df.time_period=="all_day")
][route_merge_cols + [
   "avg_sched_service_min", "avg_stop_meters", "n_scheduled_trips",
   "sched_rt_category"
]]

peak_stats = df[
    (df.service_date == most_recent_date) & 
    (df.time_period=="peak")
][route_merge_cols + [
   "speed_mph", "n_scheduled_trips", "frequency"]
 ].rename(columns = {
    "speed_mph": "peak_avg_speed",
    "n_scheduled_trips": "peak_scheduled_trips",
    "frequency": "peak_hourly_freq"
})

offpeak_stats = df[
    (df.service_date == most_recent_date) & 
    (df.time_period=="offpeak")
][route_merge_cols + [
   "speed_mph", "n_scheduled_trips", "frequency"]
 ].rename(columns = {
    "speed_mph": "offpeak_avg_speed",
    "n_scheduled_trips": "offpeak_scheduled_trips",
    "frequency": "offpeak_hourly_freq"
})

KeyError: "['avg_sched_service_min', 'avg_stop_meters'] not in index"

In [None]:
table_df = pd.merge(
    all_day_stats,
    peak_stats,
    on = route_merge_cols,
).merge(
    offpeak_stats,
    on = route_merge_cols
).sort_values(
    ["route_combined_name", "direction_id"]
).reset_index(drop=True)


numeric_cols = table_df.select_dtypes(include='number').columns
table_df[numeric_cols] = table_df[numeric_cols].fillna(0)

integrify = ["direction_id"]
table_df[integrify] = table_df[integrify].astype(int)

round_me = ["peak_avg_speed", "offpeak_avg_speed", 
            "avg_stop_meters", "avg_sched_service_min"]
table_df[round_me] = table_df[round_me].round(1)


In [None]:
(gt.GT(data=table_df)
 .fmt_integer(
     columns = ["peak_scheduled_trips", "offpeak_scheduled_trips", 
                "n_scheduled_trips"], 
     compact=True
 ).fmt_number(
     columns = ["avg_stop_meters", "avg_sched_service_min"],
     compact=True,
     sep_mark=","
 ).cols_hide(["sched_rt_category"])
 .cols_label(
     route_combined_name = "Route",
     #sched_rt_category = "GTFS Category",
     avg_sched_service_min = "Avg Scheduled Service (min)",
     avg_stop_meters = "Avg Stop Distance (meters)",
     n_scheduled_trips = "Daily Scheduled Trips",
     peak_scheduled_trips = "Scheduled Peak Trips",
     offpeak_scheduled_trips = "Scheduled Offpeak Trips",
     direction_id = "Direction",
     peak_avg_speed = "Peak Avg Speed (mph)",
     offpeak_avg_speed = "Offpeak Avg Speed (mph)",
     peak_hourly_freq = "Peak Hourly Freq",
     offpeak_hourly_freq = "Offpeak Hourly Freq",
 ).tab_options(container_width = "100%")
 .tab_header(
     title=md("Route Stats"),
 ).tab_options(
    table_font_size="12px"
 )
)

In [14]:
# avg_scheduled_service_min
df.columns

Index(['schedule_gtfs_dataset_key', 'direction_id', 'time_period',
       'avg_scheduled_service_minutes', 'avg_stop_miles', 'n_scheduled_trips',
       'frequency', 'road_freq_category', 'road_typology', 'pct_typology',
       'service_date', 'minutes_atleast1_vp', 'minutes_atleast2_vp',
       'rt_service_minutes', 'scheduled_service_minutes', 'total_vp',
       'vp_in_shape', 'n_trips', 'vp_per_minute', 'pct_in_shape',
       'pct_rt_journey_vp', 'pct_rt_journey_atleast2_vp',
       'pct_sched_journey_atleast1_vp', 'pct_sched_journey_atleast2_vp',
       'n_vp_trips', 'pct_rt_journey_atleast1_vp', 'sched_rt_category',
       'speed_mph', 'name', 'route_id', 'route_combined_name', 'base64_url',
       'organization_source_record_id', 'organization_name',
       'caltrans_district'],
      dtype='object')

In [11]:
def base_route_chart(df: pd.DataFrame, y_col: str) -> alt.Chart:
    """
    """
    df = df.assign(
        time_period = df.time_period.str.replace('_', ' ').str.title()
    ).reset_index(drop=True)
    
    selected_colors = [
        cp.CALITP_CATEGORY_BRIGHT_COLORS[0], # blue
        cp.CALITP_CATEGORY_BRIGHT_COLORS[3], # green
        cp.CALITP_CATEGORY_BOLD_COLORS[1], # orange,
    ]
    
    #https://stackoverflow.com/questions/26454649/python-round-up-to-the-nearest-ten
    
    if "pct" in y_col:
        max_y = 1.2
    elif "per_minute" in y_col:
        max_y = round(df[y_col].max())
    else:
        max_y = round(df[y_col].max(), -1)
    
    chart = (
        alt.Chart(df)
        .mark_line()
        .encode(
             x = alt.X("yearmonthdate(service_date):O", title = "Date",
                       axis = alt.Axis(format = '%b %Y')
                      ),
             y = alt.Y(f"{y_col}:Q", title = labeling(y_col),
                       scale = alt.Scale(domain=[0, max_y])
                      ),
             color = alt.Color(
                 "time_period:N", title = labeling("time_period"),
                 scale = alt.Scale(range=cp.CALITP_SEQUENTIAL_COLORS)
             ),
             tooltip = ["route_combined_name", "route_id", "direction_id", 
                        "time_period", y_col]
         ).facet(
             column = alt.Column("direction_id:N", 
                                 title=labeling("direction_id")),
         ).interactive()
    ).properties(title = labeling(y_col))
    
    return chart

In [17]:
# https://stackoverflow.com/questions/62103632/altair-change-the-position-of-a-slider
display(
    HTML(
        """
        <style>
        form.vega-bindings {
            position: absolute;
            right: 0px;
            top: 0px;
            }
        </style>
        """
    )
)

def filtered_route_charts(
    df: pd.DataFrame,
    control_field: str,
) -> alt.Chart:
    """
    https://stackoverflow.com/questions/58919888/multiple-selections-in-altair
    """
    route_dropdown = alt.binding_select(
        options=sorted(df[control_field].unique().tolist()), 
        name='Routes ', 
    )
        
    # Column that controls the bar charts
    route_selector = alt.selection_point(
        fields=[control_field], 
        bind=route_dropdown,
    )
    
    sched_df = df[df.sched_rt_category != "vp_only"]
    vp_df = df[df.sched_rt_category != "schedule_only"]
    
    sched_service_chart = base_route_chart(
        sched_df[sched_df.time_period=="all_day"], "avg_scheduled_service_minutes"
    ).add_params(route_selector).transform_filter(route_selector)
   
    sched_trips_chart = base_route_chart(
        sched_df, "n_scheduled_trips"
    ).add_params(route_selector).transform_filter(route_selector)

    sched_freq_chart = base_route_chart(
        sched_df, "frequency"
    ).add_params(route_selector).transform_filter(route_selector)

    speeds_chart = base_route_chart(
        vp_df, "speed_mph"
    ).add_params(route_selector).transform_filter(route_selector)
    
    ping_density_chart = base_route_chart(
        vp_df, "vp_per_minute"
    ).add_params(route_selector).transform_filter(route_selector)
    
    spatial_accuracy_chart = base_route_chart(
        vp_df, "pct_in_shape"
    ).add_params(route_selector).transform_filter(route_selector)

    atleast1vp_chart = base_route_chart(
        vp_df, "pct_rt_journey_vp"
    ).add_params(route_selector).transform_filter(route_selector)
    
    atleast2vp_chart = base_route_chart(
        vp_df, "pct_rt_journey_atleast2_vp"
    ).add_params(route_selector).transform_filter(route_selector)    
    
    chart_list = [
        sched_service_chart, sched_trips_chart,
        sched_freq_chart, speeds_chart,
        ping_density_chart, spatial_accuracy_chart,
        atleast1vp_chart, atleast2vp_chart
    ]
    
    chart = alt.vconcat(*chart_list).resolve_scale(y="independent")
    
    return chart

In [18]:
filtered_route_charts(df, "route_combined_name")

### Segment Speeds

In [19]:

selected_date = pd.to_datetime(rt_dates.DATES["nov2023"])

speeds = gpd.read_parquet(
    f"{RT_SCHED_GCS}digest/segment_speeds.parquet",
    filters = [[("name", "==", name), 
                ("service_date", "==", selected_date)]]
)

if len(speeds) > 0:
    speeds = speeds.drop(columns = "service_date").to_crs("EPSG:3310")
    speeds = speeds.assign(
        geometry_arrowized = speeds.apply(
            lambda x: rt_utils.arrowize_segment(x.geometry), axis=1
        ).set_crs("EPSG:3310")
    )
else:
    pass


In [20]:
def make_map(gdf: gpd.GeoDataFrame, time_period: str):
    subset = gdf[gdf.time_period==time_period]
             
    if len(subset) > 0:
        subset = (subset.set_geometry("geometry_arrowized")
                  .drop(columns = "geometry")
                  .dropna()
                 )
        m = subset.explore(
            "p50_mph", tiles = "CartoDB Positron", 
            cmap = rt_utils.ZERO_THIRTY_COLORSCALE
        )
        display(m)
    else:
        display(Markdown("No speeds data to display"))

In [21]:
make_map(speeds, "offpeak")

No speeds data to display

In [22]:
make_map(speeds, "peak")

No speeds data to display