# Quarterly Mass Transit Performance Objective

**01 - Increase total amount of service on the SHN and reliability of that service by 2024**

Metrics: total service hours, average service hours, and number of transit routes (n, %).

## Routes on the State Highway Network (SHN)

Transit routes along the SHN can be categorized into 3 groups:
1. **On SHN** - where at least 20% of the transit route runs along the SHN (within 50 ft)
2. **Intersects SHN** - where at least 35% of the transit route runs within 0.5 mile of the SHN.
3. **Other** - all other transit routes.

In [1]:
import altair as alt
import branca
import geopandas as gpd
import intake
import pandas as pd

from IPython.display import HTML, Markdown

import B1_report_metrics as report_metrics
from update_vars import ANALYSIS_DATE, BUS_SERVICE_GCS
from shared_utils import geography_utils, styleguide
from shared_utils import calitp_color_palette as cp
from bus_service_utils import chart_utils, report_utils

# Catalog from the sibling high_quality_transit_areas directory;
# prep_data_for_viz reads the CA boundary (`ca_boundary`) from it.
hq_catalog = intake.open_catalog("../high_quality_transit_areas/*.yml")
# Catalog built from the YAML files in the current directory;
# the routes dataset for this notebook is read from it.
catalog = intake.open_catalog("*.yml")



In [2]:
# Load the routes_categorized_with_delay dataset through the local catalog.
df = catalog.routes_categorized_with_delay.read()

# Alternative (disabled): read the dated parquet file straight from GCS.
#df = gpd.read_parquet(
#    f"{BUS_SERVICE_GCS}routes_categorized_with_delay_{ANALYSIS_DATE}.parquet")

In [3]:
def route_type_names(row):
    """
    Return a mode label for a row based on its `route_type` attribute.

    String codes '0', '1', '2' map to "Rail", '3' to "Bus", '4' to "Ferry";
    anything else (including non-string values) maps to "Unknown".
    Presumably these are GTFS route_type codes -- confirm against the data.
    """
    code = row.route_type

    # Checked in order; the first group containing the code wins.
    labels_by_codes = (
        (("0", "1", "2"), "Rail"),
        (("3",), "Bus"),
        (("4",), "Ferry"),
    )

    for codes, label in labels_by_codes:
        if code in codes:
            return label

    return "Unknown"

#df["route_type_name"] = df.apply(lambda x: route_type_names(x), axis=1)

In [None]:
#df.route_type_name.value_counts()
#df[(df.route_type_name=="Unknown")].calitp_itp_id.value_counts()
#df[df.category=="on_shn"].route_type_name.value_counts()
#df[df.category=="intersects_shn"].route_type_name.value_counts()

In [4]:
# Keep only rows where _merge == "both", i.e. a matching itp_id-route_id
# was found. The data is aggregated to route_id level because shape_id
# mismatches more easily.
# This subset is what the rest of the notebook works from.
plot_df = df.loc[df["_merge"] == "both"]

## Statewide Stats  

* How many service hours are scheduled for a typical weekday for (1)?

In [5]:
# Service hours summary table, built from the matched-routes subset.
summary = report_metrics.get_service_hours_summary_table(plot_df)  

In [6]:
# Relabel every row as category "All" so the aggregation collapses the
# summary into statewide totals.
all_hours = geography_utils.aggregate_by_geography(
    summary.assign(category="All"),
    sum_cols=["unique_route", "total_service_hours"],
    group_cols=["category"],
)

In [7]:
# First row of the "All" aggregation holds the statewide total.
STATEWIDE_HOURS = all_hours["total_service_hours"].iloc[0]
# Thousands separator for display.
FORMATTED_HOURS = format(STATEWIDE_HOURS, ",")

# NOTE(review): the quarter label is hard-coded while the date comes from
# ANALYSIS_DATE -- keep the two in sync when the analysis date changes.
display(Markdown(
    f"### Q2 2022 ({ANALYSIS_DATE}): "
    f"{FORMATTED_HOURS} total service hours statewide"
))

### Q2 2022 (2022-08-17): 98,352 total service hours statewide

In [8]:
# Raw column name -> display name for the service hours table.
service_cols_dict = {
    "category": "Category",
    "total_service_hours": "Total Service Hours",
    "pct_total_service_hours": "% Service Hours",
    "unique_route": "# Routes",
    "pct_unique_route": "% Routes",
    "service_hrs_per_route": "Service Hours per Route",
}

# Formatting arguments refer to the display (renamed) column names.
summary_styled = report_utils.style_table(
    summary,
    rename_cols=service_cols_dict,
    display_table=True,
    left_align_cols="first",
    center_align_cols="all",
    integer_cols=["Total Service Hours", "# Routes"],
    one_decimal_cols=["Service Hours per Route"],
    custom_format_cols={'{:.1%}': ["% Service Hours", "% Routes"]},
)

Category,Total Service Hours,# Routes,% Service Hours,% Routes,Service Hours per Route
On SHN,15636,657,15.9%,24.2%,23.8
Intersects SHN,53232,1503,54.1%,55.4%,35.42
Other,29484,551,30.0%,20.3%,53.51


## Reliability (Delay)

Be careful here: delay is not merged onto every route, so the delay figures only cover routes that have delay data.

Comparisons across quarters need to be apples to apples (made on a comparable set of routes).

In [None]:
# Restrict to routes where the delay merge matched (merge_delay == "both").
delay_df = plot_df.loc[plot_df["merge_delay"] == "both"]
delay_summary = report_metrics.get_delay_summary_table(delay_df)

In [None]:
# Raw column name -> display name for the delay table.
delay_cols_dict = {
    "category": "Category",
    "delay_hours": "Total Delay Hours",
    "pct_delay_hours": "% Delay Hours",
    "unique_route": "# Routes",
    "pct_unique_route": "% Routes",
    "delay_hours_per_route": "Delay Hours per Route",
}

# Formatting arguments refer to the display (renamed) column names.
delay_summary_styled = report_utils.style_table(
    delay_summary,
    rename_cols=delay_cols_dict,
    display_table=True,
    left_align_cols="first",
    center_align_cols="all",
    integer_cols=["Total Delay Hours", "# Routes"],
    two_decimal_cols=["Delay Hours per Route"],
    custom_format_cols={'{:.1%}': ["% Delay Hours", "% Routes"]},
)

In [None]:
# Where district is missing, it's not parallel routes
# So let's ignore those sections and focus on just on_shn and do breakdown
#plot_df[(plot_df.District.isna())].category.value_counts()

In [None]:
def by_district_on_shn_breakdown(df: pd.DataFrame, sum_cols: list) -> pd.DataFrame:
    """
    Aggregate `sum_cols` by District for routes in the "on_shn" category.

    Args:
        df: route-level table with `category`, `District` and the columns
            named in `sum_cols`.
        sum_cols: columns to sum within each district.

    Returns:
        One row per district, sorted by District, holding the summed
        columns plus a `pct_<col>` column for each (rounded to 3 decimals).
    """
    on_shn_routes = df[df.category == "on_shn"]

    # NOTE(review): the int cast applies to the whole aggregated frame, so
    # fractional sums (e.g. delay hours) are truncated -- confirm intended.
    district_totals = geography_utils.aggregate_by_geography(
        on_shn_routes,
        group_cols=["District"],
        sum_cols=sum_cols,
    ).astype(int)

    with_pct = report_metrics.add_percent(district_totals, sum_cols)
    with_pct = with_pct.sort_values("District")

    # Round each percent column that corresponds to a summed column.
    rounded_pct = {
        f"pct_{col}": with_pct[f"pct_{col}"].round(3) for col in sum_cols
    }

    return with_pct.assign(**rounded_pct)

In [None]:
# Convert delay from seconds to hours (60 ** 2 seconds per hour),
# rounded to 2 decimals.
plot_df = plot_df.assign(
    delay_hours=plot_df["delay_seconds"].div(60 ** 2).round(2)
)

In [None]:
%%html
<style>
@import url('https://fonts.googleapis.com/css?family=Raleway');
@import url('https://fonts.googleapis.com/css?family=Nunito+Sans');
@import url('https://fonts.googleapis.com/css?family=Bitter');
</style>

In [None]:
def base_bar(df: pd.DataFrame) -> alt.Chart:
    """Return a bar chart over `df` with District (nominal) on the x-axis."""
    district_axis = alt.X("District:N", title="District")
    return alt.Chart(df).mark_bar().encode(x=district_axis)


def make_bar(df: pd.DataFrame, y_col: str) -> None:
    """
    Display a bar chart of `y_col` by district, with a value label
    above each bar.

    Used for both the service-hours and the delay-hours metrics.
    The chart is shown with `display`; nothing is returned.

    Args:
        df: table with a `District` column and the `y_col` column.
        y_col: name of the numeric column to plot. Its title-cased form
            (underscores replaced by spaces) is used for the chart title
            and the tooltip.
    """
    y_title = y_col.replace("_", " ").title()

    # Per-metric label number format, and headroom added above the
    # tallest bar so the text label is not clipped.
    if y_col == "total_service_hours":
        value_format = ",.0f"
        y_buffer = 1_400
    elif y_col == "avg_delay_hours":
        value_format = ",.1f"
        y_buffer = 1
    else:
        value_format = ",.1f"
        y_buffer = 5

    y_max = df[y_col].max() + y_buffer

    bar = base_bar(df).encode(
        # The y-axis is hidden; values are shown as text labels instead.
        y=alt.Y(
            f"{y_col}:Q",
            title=y_title,
            scale=alt.Scale(domain=[0, y_max]),
            axis=None,
        ),
        color=alt.Color(
            "District:N",
            scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
            legend=None,
        ),
    )

    # https://stackoverflow.com/questions/54015250/altair-setting-constant-label-color-for-bar-chart
    text = bar.mark_text(
        align="center", baseline="bottom", color="black", dy=-5
    ).encode(
        text=alt.Text(y_col, format=value_format),
        # Set color here, because the mark_text color gets superseded by
        # the alt.Color encoding inherited from the bars.
        color=alt.value("black"),
        tooltip=[
            "District:N",
            # Reuse y_title: the previous expression applied .title() to
            # the replacement string " " instead of the column name, so
            # the tooltip title was never title-cased.
            alt.Tooltip(f"{y_col}:Q", format=",", title=y_title),
        ],
    )

    chart = bar + text

    chart = (
        styleguide.preset_chart_config(chart)
        .properties(
            title={
                "text": f"{y_title} by District",
                "subtitle": "Routes on SHN",
            }
        )
        .configure_axis(grid=False)
        .configure_view(strokeWidth=0)
        .interactive()
    )

    display(chart)

In [None]:
# District-level service hours for on-SHN routes, plus the average
# service hours per route (1 decimal).
by_district_service = by_district_on_shn_breakdown(
    plot_df, ["total_service_hours", "unique_route"]
).assign(
    avg_service_hours=lambda d: (
        d.total_service_hours / d.unique_route
    ).round(1)
)

# One chart per metric.
metrics = ["total_service_hours", "avg_service_hours"]

for m in metrics:
    make_bar(by_district_service, m)

In [None]:
# District-level delay hours for on-SHN routes, plus the average
# delay hours per route (2 decimals).
by_district_delay = by_district_on_shn_breakdown(
    plot_df, ["delay_hours", "unique_route"]
).assign(
    avg_delay_hours=lambda d: (
        d.delay_hours / d.unique_route
    ).round(2)
)

# One chart per metric.
metrics = ["delay_hours", "avg_delay_hours"]

for m in metrics:
    make_bar(by_district_delay, m)


## Map of Routes on or Intersecting SHN

In [None]:
def prep_data_for_viz(df: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """
    Prepare route geometries for the interactive map.

    Steps:
    1. Replace category codes with display labels (codes missing from the
       mapping become NaN).
    2. Keep only rows whose geometry falls within the CA boundary.
    3. Buffer and simplify the geometry for styling, then project to WGS84.
    4. Drop the merge-indicator and State columns so they stay out of
       the tooltip.

    Args:
        df: routes with `category`, `_merge`, `merge_delay` and a
            line geometry.

    Returns:
        The filtered, restyled GeoDataFrame in WGS84.
    """
    category_labeling = {
        "on_shn": "On SHN",
        "intersects_shn": "Intersects SHN",
        "other": "Other",
    }

    gdf = df.assign(
        category=df.category.map(category_labeling),
    )

    # line must fall within CA
    # Reproject the boundary using the routes' CRS object directly; this
    # avoids building an "EPSG: <code>" string, which breaks when the CRS
    # has no EPSG code.
    ca = hq_catalog.ca_boundary.read().to_crs(gdf.crs)

    gdf = gpd.sjoin(
        gdf,
        ca,
        how="inner",
        predicate="within",
    ).drop(columns=["index_right"])

    # Buffer to style the line, project to WGS84 for folium.
    # Buffer / tolerance values are in CA_StatePlane units.
    gdf = gdf.assign(
        geometry=(
            gdf.geometry.to_crs(geography_utils.CA_StatePlane)
            .buffer(250)
            .simplify(tolerance=100)
            .to_crs(geography_utils.WGS84)
        )
    )

    # Drop columns that shouldn't get displayed in tooltip
    # (State presumably comes from the boundary join -- confirm).
    drop_cols = ["_merge", "merge_delay", "State"]

    return gdf.drop(columns=drop_cols)


# NOTE(review): this passes the full `df`, not the `plot_df` subset
# (_merge == "both") used for the tables and charts -- confirm intended.
gdf = prep_data_for_viz(df)

In [None]:
# Interactive map colored by category, using the first three colors of
# the bright categorical palette.
route_map = gdf.explore(
    "category",
    categorical=True,
    cmap=[cp.CALITP_CATEGORY_BRIGHT_COLORS[i] for i in range(3)],
    tiles="Carto DB Positron",
)

route_map