## Round 1 
* https://github.com/cal-itp/data-analyses/issues/1059
* cd rt_segment_speeds && pip install altair_transform && pip install -r requirements.txt && cd ../_shared_utils && make setup_env
* https://docs.google.com/document/d/1I1WiqlmU06W6iLCi7cZQrOCLILkrEfABEkcU0Jys7f0/edit
* https://route-speeds--cal-itp-data-analyses.netlify.app/name_bay-area-511-muni-schedule/0__report__name_bay-area-511-muni-schedule
* https://posit-dev.github.io/great-tables/get-started/nanoplots.html
* https://docs.pola.rs/py-polars/html/reference/api/polars.from_pandas.html
* https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/_rt_scheduled_utils.py
* https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/_threshold_utils.py

In [1]:
%%capture
# import warnings
# warnings.filterwarnings('ignore')

import altair as alt
import calitp_data_analysis.magics
import geopandas as gpd
import great_tables as gt
import pandas as pd
from calitp_data_analysis import calitp_color_palette as cp
from great_tables import md
from IPython.display import HTML, Markdown, display
from segment_speed_utils.project_vars import RT_SCHED_GCS
from shared_utils import rt_dates, rt_utils

alt.renderers.enable("html")
alt.data_transformers.enable("default", max_rows=None)
from typing import List, Union

from altair_transform.extract import extract_transform
from altair_transform.transform import visit
from altair_transform.utils import to_dataframe

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
name = "SBMTD Schedule"

In [4]:
# %%capture_parameters
# name

### General Functions

In [5]:
def labeling(word: str) -> str:
    return (
        word.replace("_", " ")
        .title()
        .replace("N", "Total")
        .replace("Pct", "%")
        .replace("Vp", "VP")
    )

### Data

In [6]:
# calitp-analytics-data/data-analyses/rt_vs_schedule/digest
df = pd.read_parquet(
    f"{RT_SCHED_GCS}digest/schedule_vp_metrics.parquet",
    filters=[[("name", "==", name)]],
)

In [8]:
most_recent_date = df.service_date.max()

In [9]:
most_recent_date

Timestamp('2024-03-13 00:00:00')

In [10]:
df.service_date.min()

Timestamp('2023-03-15 00:00:00')

#### Unsure what these metrics mean
* https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/segment_speed_utils/metrics.py
* avg_stop_miles: ??
* n_scheduled_trips (over the course of a day?)
* frequency: # of times the route runs per hour 
* pct_typology: 44% of the route fits into downtown local, the rest of the 54% falls under other categories??
* is_early: # of trips in that route that are early
* is_on_time: # of trips that are ontime.
* n_vp_trips: # of trips in total
* Difference between pct_rt_journey_atleast1vp and pct_sched_journey_atleast1vp?
* rt_sched_journey_ratio: how many times longer/shorter the actual trip took compared to the scheduled trip?
* avg_rt_servie_minutes?

In [11]:
df.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,direction_id,time_period,avg_scheduled_service_minutes,avg_stop_miles,n_scheduled_trips,frequency,road_freq_category,road_typology,pct_typology,service_date,minutes_atleast1_vp,minutes_atleast2_vp,total_rt_service_minutes,total_scheduled_service_minutes,total_vp,vp_in_shape,is_early,is_ontime,is_late,n_vp_trips,vp_per_minute,pct_in_shape,pct_rt_journey_atleast1_vp,pct_rt_journey_atleast2_vp,pct_sched_journey_atleast1_vp,pct_sched_journey_atleast2_vp,rt_sched_journey_ratio,avg_rt_service_minutes,sched_rt_category,speed_mph,name,route_id,route_combined_name,base64_url,organization_source_record_id,organization_name,caltrans_district
14844,239f3baf3dd3b9e9464f66a777f9897d,0.0,all_day,15.0,0.18,54,2.25,very_high,downtown_local,0.44,2023-09-13,863,258,900.19,795.0,1136,925,0,45,8,53,1.26,0.81,0.96,0.29,1.0,0.33,1.13,16.98,schedule_and_vp,8.92,SBMTD Schedule,1,1 West Santa Barbara,aHR0cHM6Ly9zYm10ZC5nb3YvZ29vZ2xlX3RyYW5zaXQvZmVlZC56aXA=,recswCrw6a6htmXJ4,Santa Barbara Metropolitan Transit District,05 - San Luis Obispo
14845,239f3baf3dd3b9e9464f66a777f9897d,0.0,all_day,15.0,0.18,54,2.25,very_high,downtown_local,0.44,2023-10-11,839,242,945.11,735.0,1093,848,0,38,11,49,1.16,0.78,0.89,0.26,1.0,0.33,1.29,19.29,schedule_and_vp,4.23,SBMTD Schedule,1,1 West Santa Barbara,aHR0cHM6Ly9zYm10ZC5nb3YvZ29vZ2xlX3RyYW5zaXQvZmVlZC56aXA=,recswCrw6a6htmXJ4,Santa Barbara Metropolitan Transit District,05 - San Luis Obispo


In [12]:
df.time_period.value_counts()

all_day    731
peak       731
offpeak    451
Name: time_period, dtype: int64

#### Categorize

In [13]:
def frequency_tags(row):
    if row.rt_sched_journey_ratio < 1:
        return "Early"
    elif row.rt_sched_journey_ratio < 1.1:
        return "On Time"
    elif 1.1 <= row.rt_sched_journey_ratio < 1.26:
        return "Late by 1-25% of the scheduled time"
    elif 1.26 <= row.rt_sched_journey_ratio < 1.51:
        return "Late by 26-50% of the scheduled time"
    elif 1.51 <= row.rt_sched_journey_ratio:
        return "Late by 50%+ of the scheduled time"
    else:
        return "No Info"


df["rt_sched_journey_ratio_cat"] = df.apply(frequency_tags, axis=1)

In [14]:
df["rt_sched_journey_ratio_cat"].value_counts()

Early                                   837
On Time                                 470
Late by 1-25% of the scheduled time     340
No Info                                 116
Late by 26-50% of the scheduled time    103
Late by 50%+ of the scheduled time       47
Name: rt_sched_journey_ratio_cat, dtype: int64

In [15]:
df[["rt_sched_journey_ratio", "rt_sched_journey_ratio_cat"]].sample(10)

Unnamed: 0,rt_sched_journey_ratio,rt_sched_journey_ratio_cat
15482,0.85,Early
14846,1.45,Late by 26-50% of the scheduled time
14921,1.11,Late by 1-25% of the scheduled time
15341,1.01,On Time
15641,0.98,Early
15185,0.97,Early
106220,0.94,Early
106551,0.89,Early
15583,,No Info
135854,1.01,On Time


In [16]:
def frequency_tags(row):
    if row.frequency < 2:
        return "<1 trip/hour"
    elif 1 <= row.frequency < 2:
        return "1 trip/hour"
    elif 2 <= row.frequency < 3:
        return "2 trips/hour"
    elif 3 <= row.frequency:
        return "3+ trips/hour"
    else:
        return "No Info"


df["frequency_cat"] = df.apply(frequency_tags, axis=1)

In [17]:
df["frequency_cat"].value_counts()

<1 trip/hour     1615
No Info           180
2 trips/hour       70
3+ trips/hour      48
Name: frequency_cat, dtype: int64

In [18]:
df[["frequency", "frequency_cat"]].sample(10)

Unnamed: 0,frequency,frequency_cat
106191,0.04,<1 trip/hour
106550,0.75,<1 trip/hour
15768,1.75,<1 trip/hour
15579,0.12,<1 trip/hour
106035,0.12,<1 trip/hour
106084,0.12,<1 trip/hour
131543,,No Info
105987,3.75,3+ trips/hour
14952,0.62,<1 trip/hour
106213,0.12,<1 trip/hour


In [19]:
def two_vps_tag(row):
    if row.pct_rt_journey_atleast2_vp < 0.25:
        return "<25% of trip has 2+ pings"
    elif 0.25 <= row.pct_rt_journey_atleast2_vp < 0.51:
        return "25-50% of trip has 2+ pings"
    elif 0.51 <= row.pct_rt_journey_atleast2_vp < 0.76:
        return "51-75% of trip has 2+ pings"
    elif 0.76 <= row.pct_rt_journey_atleast2_vp < 0.99:
        return "76-99% of trip has 2+ pings"
    elif row.pct_rt_journey_atleast2_vp == 1:
        return "100% of trip has 2+ pings"
    else:
        return "No Info"


df["pct_rt_journey_atleast2_vp_cat"] = df.apply(two_vps_tag, axis=1)

In [20]:
df["pct_rt_journey_atleast2_vp_cat"].value_counts()

25-50% of trip has 2+ pings    1257
<25% of trip has 2+ pings       525
No Info                         116
51-75% of trip has 2+ pings      14
76-99% of trip has 2+ pings       1
Name: pct_rt_journey_atleast2_vp_cat, dtype: int64

In [21]:
df[["pct_rt_journey_atleast2_vp", "pct_rt_journey_atleast2_vp_cat"]].sample(10)

Unnamed: 0,pct_rt_journey_atleast2_vp,pct_rt_journey_atleast2_vp_cat
15135,0.35,25-50% of trip has 2+ pings
15147,0.34,25-50% of trip has 2+ pings
15518,0.12,<25% of trip has 2+ pings
131560,0.2,<25% of trip has 2+ pings
106256,0.3,25-50% of trip has 2+ pings
106174,0.2,<25% of trip has 2+ pings
15400,0.41,25-50% of trip has 2+ pings
106451,0.34,25-50% of trip has 2+ pings
14973,0.3,25-50% of trip has 2+ pings
15497,0.43,25-50% of trip has 2+ pings


In [22]:
def spatial_accuracy_tag(row):
    if row.pct_in_shape < 0.5:
        return "<50% of VPs in shape"
    elif 0.5 <= row.pct_in_shape < 0.76:
        return "50-75% of VPs in shape"
    elif 0.76 <= row.pct_in_shape < 1:
        return "76-99% of VPs in shape"
    elif row.pct_in_shape == 1:
        return "100% of VPs in shape"
    else:
        return "No Info"


df["pct_in_shape_cat"] = df.apply(spatial_accuracy_tag, axis=1)

In [23]:
df[["pct_in_shape", "pct_in_shape_cat"]].sample(10)

Unnamed: 0,pct_in_shape,pct_in_shape_cat
106331,1.0,100% of VPs in shape
15008,1.0,100% of VPs in shape
106363,1.0,100% of VPs in shape
131494,0.92,76-99% of VPs in shape
15221,0.88,76-99% of VPs in shape
106399,0.93,76-99% of VPs in shape
15575,1.0,100% of VPs in shape
14897,1.0,76-99% of VPs in shape
15045,0.99,76-99% of VPs in shape
106069,1.0,100% of VPs in shape


In [24]:
def vp_per_min_tag(row):
    if row.vp_per_minute < 1:
        return "<1 ping/minute"
    elif 1 <= row.vp_per_minute < 2:
        return "<3 pings/minute"
    elif 2 <= row.vp_per_minute < 3:
        return "<3 pings/minute"
    elif 3 <= row.vp_per_minute:
        return "3+ pings per minute (target)"
    else:
        return "No Info"


df["vp_per_minute_cat"] = df.apply(vp_per_min_tag, axis=1)

In [25]:
df[["vp_per_minute", "vp_per_minute_cat"]].sample(10)

Unnamed: 0,vp_per_minute,vp_per_minute_cat
106171,1.04,<3 pings/minute
105880,1.32,<3 pings/minute
106277,1.45,<3 pings/minute
106096,1.36,<3 pings/minute
14888,1.11,<3 pings/minute
105946,1.23,<3 pings/minute
15567,1.37,<3 pings/minute
106541,1.12,<3 pings/minute
105848,1.24,<3 pings/minute
15267,1.43,<3 pings/minute


In [26]:
df.vp_per_minute_cat.value_counts()

<3 pings/minute    1740
No Info             116
<1 ping/minute       57
Name: vp_per_minute_cat, dtype: int64

### Test out Altair `extract_data`
* https://altair-viz.github.io/user_guide/transform/index.html

In [27]:
__all__ = ["apply", "extract_data", "transform_chart"]

In [28]:
def reverse_snakecase(df):
    """
    Clean up columns to remove underscores and spaces.
    """
    df.columns = df.columns.str.replace("_", " ").str.strip().str.title()
    return df

In [29]:
def apply(
    df: pd.DataFrame,
    transform: Union[alt.Transform, List[alt.Transform]],
    inplace: bool = False,
) -> pd.DataFrame:
    """Apply transform or transforms to dataframe.

    Parameters
    ----------
    df : pd.DataFrame
    transform : list|dict
        A transform specification or list of transform specifications.
        Each specification must be valid according to Altair's transform
        schema.
    inplace : bool
        If True, then dataframe may be modified in-place. Default: False.

    Returns
    -------
    df_transformed : pd.DataFrame
        The transformed dataframe.

    Example
    -------
    >>> import pandas as pd
    >>> data = pd.DataFrame({'x': range(5), 'y': list('ABCAB')})
    >>> chart = alt.Chart(data).transform_aggregate(sum_x='sum(x)', groupby=['y'])
    >>> apply(data, chart.transform)
       y  sum_x
    0  A      3
    1  B      5
    2  C      2
    """
    if not inplace:
        df = df.copy()
    if transform is alt.Undefined:
        return df
    return visit(transform, df)

In [30]:
def extract_data(
    chart: alt.Chart, apply_encoding_transforms: bool = True
) -> pd.DataFrame:
    """Extract transformed data from a chart.

    This only works with data and transform defined at the
    top level of the chart.

    Parameters
    ----------
    chart : alt.Chart
        The chart instance from which the data and transform
        will be extracted
    apply_encoding_transforms : bool
        If True (default), then apply transforms specified within an
        encoding as well as those specified directly in the transforms
        attribute.

    Returns
    -------
    df_transformed : pd.DataFrame
        The extracted and transformed dataframe.

    Example
    -------
    >>> import pandas as pd
    >>> data = pd.DataFrame({'x': range(5), 'y': list('ABCAB')})
    >>> chart = alt.Chart(data).mark_bar().encode(x='sum(x)', y='y')
    >>> extract_data(chart)
       y  sum_x
    0  A      3
    1  B      5
    2  C      2
    """
    if apply_encoding_transforms:
        chart = extract_transform(chart)
    return apply(to_dataframe(chart.data, chart), chart.transform)

In [31]:
data = pd.DataFrame({"x": range(5), "y": list("ABCAB")})

In [32]:
chart = alt.Chart(data).mark_bar().encode(x="sum(x)", y="y")

In [33]:
chart

In [34]:
extract_data(chart)

Unnamed: 0,y,sum_x
0,A,3
1,B,5
2,C,2


In [35]:
def extract_data_altair(chart):
    chart_dict = chart.to_dict()
    encoding = chart_dict["datasets"]
    df1 = pd.DataFrame(encoding)

    column = df1.columns[0]
    normalized_df = pd.json_normalize(df1[column])
    # Combine the original DataFrame with the extracted values DataFrame
    df2 = pd.concat([df1.drop(column, axis=1), normalized_df], axis=1)
    return df2

### Monthly aggregated service hours by day_type, time_of_day

In [36]:
from segment_speed_utils.project_vars import SCHED_GCS

In [37]:
year = "2023"

In [38]:
monthly_service_df = pd.read_parquet(
    f"{SCHED_GCS}scheduled_service_by_route_{year}.parquet",
    filters=[[("name", "==", name)]],
)

In [39]:
# Paid single digit int with 0
monthly_service_df["month"] = monthly_service_df["month"].astype(str).str.zfill(2)

In [40]:
monthly_service_df["full_date"] = (
    monthly_service_df.month.astype(str) + "-" + monthly_service_df.year.astype(str)
)

In [41]:
monthly_service_df.sample(2)

Unnamed: 0,key,name,source_record_id,route_id,route_short_name,route_long_name,time_of_day,month,year,day_type,n_trips,ttl_service_hours,full_date
2959,c774ce7081d10f3da8986e1c50fb7654,SBMTD Schedule,rectQfIeiKDBeJSAV,19X,19X,Carp SBCC Express,AM Peak,8,2023,3,2,1.3,08-2023
3020,4726f2ad4aea2913ed674f010541c34a,SBMTD Schedule,rectQfIeiKDBeJSAV,3,3,Oak Park,AM Peak,4,2023,4,52,33.73,04-2023


In [42]:
def tag_day(df: pd.DataFrame) -> pd.DataFrame:
    # Function to determine if a date is a weekend day or a weekday
    def which_day(date):
        if date == 1:
            return "Monday"
        elif date == 2:
            return "Tuesday"
        elif date == 3:
            return "Wednesday"
        elif date == 4:
            return "Thursday"
        elif date == 5:
            return "Friday"
        elif date == 6:
            return "Saturday"
        else:
            return "Sunday"

    # Apply the function to each value in the "service_date" column
    df["day_type"] = df["day_type"].apply(which_day)

    return df

In [43]:
monthly_service_df = tag_day(monthly_service_df)

In [44]:
monthly_service = (
    monthly_service_df.groupby(
        [
            "full_date",
            "month",
            "name",
            "day_type",
            "time_of_day",
        ]
    )
    .agg(
        {
            "ttl_service_hours": "mean",
        }
    )
    .reset_index()
)

####  Fix
* Dropdown menu should have the fully spelled months?
* Dropdown menu does not like datetime values, pad single digit months with a 0 and concat it with the year?

In [45]:
def bar_chart_dropdown(
    df: pd.DataFrame,
    x_col: str,
    y_col: str,
    offset_col: str,
    title: str,
    dropdown_col: str,
):
    dropdown_list = sorted(df[dropdown_col].unique().tolist())

    initialize_first_val = sorted(dropdown_list)[0]

    dropdown = alt.binding_select(options=dropdown_list, name=labeling(dropdown_col))

    selector = alt.selection_single(
        name=labeling(dropdown_col), fields=[dropdown_col], bind=dropdown
    )

    ruler = (
        alt.Chart(df)
        .mark_rule(color="red", strokeDash=[10, 7])
        .encode(y=f"mean({y_col}):Q")
    )

    chart = (
        alt.Chart(monthly_service)
        .mark_bar()
        .encode(
            x=alt.X(
                f"{x_col}:N",
                title="Day",
                scale=alt.Scale(
                    domain=[
                        "Monday",
                        "Tuesday",
                        "Wednesday",
                        "Thursday",
                        "Friday",
                        "Saturday",
                        "Sunday",
                    ]
                ),
            ),
            y=alt.Y(f"{y_col}:Q", title=labeling(y_col)),
            xOffset=f"{offset_col}:N",
            color=alt.Color(
                f"{offset_col}:N",
                scale=alt.Scale(
                    range=cp.CALITP_SEQUENTIAL_COLORS,
                ),
            ),
            tooltip=df.columns.tolist(),
        )
    )
    chart = (chart + ruler).properties(title=title, width=600, height=400)
    chart = chart.add_params(selector).transform_filter(selector)
    # data = chart.transformed_data()

    display(chart)

In [46]:
bar_chart_dropdown(
    monthly_service,
    "day_type",
    "ttl_service_hours",
    "time_of_day",
    "Average Total Service Hours across all Routes in 2023",
    "full_date",
)



### Monthly Trends
* https://posit-dev.github.io/great-tables/articles/intro.html

In [47]:
by_date_category = (
    pd.crosstab(
        df.service_date,
        df.sched_rt_category,
        values=df.n_scheduled_trips,
        aggfunc="sum",
    )
    .reset_index()
    .fillna(0)
)

In [48]:
(
    gt.GT(by_date_category, rowname_col="service_date")
    .tab_header(
        title="Daily Trips by GTFS Availability",
        subtitle="Schedule only indicates the trip(s) were found only in schedule data. Vehicle Positions (VP) only indicates the trip(s) were found only in real-time data.",
    )
    .cols_label(
        schedule_only="Schedule Only",
        vp_only="VP Only",
        schedule_and_vp="Schedule and VP",
    )
    .fmt_integer(["schedule_only", "vp_only", "schedule_and_vp"])
    .tab_options(container_width="100%")
    .tab_options(table_font_size="12px")
)

Daily Trips by GTFS Availability,Daily Trips by GTFS Availability,Daily Trips by GTFS Availability,Daily Trips by GTFS Availability
Schedule only indicates the trip(s) were found only in schedule data. Vehicle Positions (VP) only indicates the trip(s) were found only in real-time data.,Schedule only indicates the trip(s) were found only in schedule data. Vehicle Positions (VP) only indicates the trip(s) were found only in real-time data..1,Schedule only indicates the trip(s) were found only in schedule data. Vehicle Positions (VP) only indicates the trip(s) were found only in real-time data..2,Schedule only indicates the trip(s) were found only in schedule data. Vehicle Positions (VP) only indicates the trip(s) were found only in real-time data..3
2023-03-15,4,0,1672
2023-04-12,3,0,1673
2023-05-17,18,0,1658
2023-06-14,2,0,1604
2023-07-12,0,0,1598
2023-08-15,0,0,1612
2023-09-13,14,0,1684
2023-10-11,12,0,1716
2023-12-13,18,0,1710
2024-01-17,14,0,1702


#### GTFS Availability

In [49]:
route_categories = (
    df[df.time_period == "all_day"]
    .groupby("sched_rt_category")
    .agg({"route_combined_name": "nunique"})
    .reset_index()
)

In [50]:
route_categories.sched_rt_category = (
    route_categories.sched_rt_category.str.replace("_", " ")
    .str.title()
    .str.replace("Vp", "VP")
)

In [51]:
(
    gt.GT(data=route_categories.dropna())
    .fmt_integer(columns=["route_combined_name"], compact=True)
    .cols_label(route_combined_name="Total Routes", sched_rt_category="Category")
    .tab_options(container_width="100%")
    .tab_header(
        title="Routes with GTFS Availability",
        subtitle="Schedule only indicates the route(s) were found only in static, schedule data. Vehicle Positions (VP) only indicates the route(s) were found only in real-time data.",
    )
    .tab_options(table_font_size="12px")
)

Routes with GTFS Availability,Routes with GTFS Availability
"Schedule only indicates the route(s) were found only in static, schedule data. Vehicle Positions (VP) only indicates the route(s) were found only in real-time data.","Schedule only indicates the route(s) were found only in static, schedule data. Vehicle Positions (VP) only indicates the route(s) were found only in real-time data..1"
Schedule Only,16
VP Only,3
Schedule And VP,35
Category,Total Routes


#### Route Stats

In [52]:
def route_stats(df: pd.DataFrame) -> pd.DataFrame:
    route_merge_cols = ["route_combined_name", "direction_id"]

    all_day_stats = df[
        (df.service_date == most_recent_date) & (df.time_period == "all_day")
    ][
        route_merge_cols
        + [
            "avg_scheduled_service_minutes",
            "avg_stop_miles",
            "n_scheduled_trips",
            "sched_rt_category",
        ]
    ]

    peak_stats = df[(df.service_date == most_recent_date) & (df.time_period == "peak")][
        route_merge_cols + ["speed_mph", "n_scheduled_trips", "frequency"]
    ].rename(
        columns={
            "speed_mph": "peak_avg_speed",
            "n_scheduled_trips": "peak_scheduled_trips",
            "frequency": "peak_hourly_freq",
        }
    )

    offpeak_stats = df[
        (df.service_date == most_recent_date) & (df.time_period == "offpeak")
    ][route_merge_cols + ["speed_mph", "n_scheduled_trips", "frequency"]].rename(
        columns={
            "speed_mph": "offpeak_avg_speed",
            "n_scheduled_trips": "offpeak_scheduled_trips",
            "frequency": "offpeak_hourly_freq",
        }
    )

    table_df = (
        pd.merge(
            all_day_stats,
            peak_stats,
            on=route_merge_cols,
        )
        .merge(offpeak_stats, on=route_merge_cols)
        .sort_values(["route_combined_name", "direction_id"])
        .reset_index(drop=True)
    )

    numeric_cols = table_df.select_dtypes(include="number").columns
    table_df[numeric_cols] = table_df[numeric_cols].fillna(0)

    return table_df

In [53]:
table_df = route_stats(df)

### Updating  Charts
* checking out another ways to display metrics, which ones can be cut.

In [54]:
sched_df = df[df.sched_rt_category != "vp_only"]
vp_df = df[df.sched_rt_category != "schedule_only"]

sched_service_chart = sched_df[sched_df.time_period == "all_day"]

#### Graph Functions
* What if a subtitle needs to be broken up?

In [55]:
blue_palette = ["#B9D6DF", "#2EA8CE", "#0B405B"]

In [56]:
def grouped_bar_chart(
    df: pd.DataFrame,
    color_col: str,
    y_col: str,
    offset_col: str,
    title: str,
    subtitle: str,
):
    df = df.assign(
        time_period=df.time_period.str.replace("_", " ").str.title()
    ).reset_index(drop=True)

    df[y_col] = df[y_col].fillna(0).astype(int)
    tooltip_cols = [
        "direction_id",
        "time_period",
        "route_combined_name",
        "organization_name",
        "service_date",
        y_col,
    ]
    """ 
    ruler = (
        alt.Chart(df)
        .mark_rule(color="red", strokeDash=[10, 7])
        .encode(y=f"mean({y_col}):Q")
    )
    """
    chart = (
        alt.Chart(df)
        .mark_bar(size=10)
        .encode(
            x=alt.X(
                "yearmonthdate(service_date):O",
                title=["Grouped by Direction ID", "Date"],
                axis=alt.Axis(format="%b %Y"),
            ),
            y=alt.Y(f"{y_col}:Q", title=labeling(y_col)),
            xOffset=alt.X(f"{offset_col}:N", title=labeling(offset_col)),
            color=alt.Color(
                f"{color_col}:N",
                title=labeling(color_col),
                scale=alt.Scale(
                    range=blue_palette,
                ),
            ),
            tooltip=tooltip_cols,
        )
    )
    chart = (chart).properties(
        title={
            "text": [title],
            "subtitle": [subtitle],
        },
        width=500,
        height=300,
    )

    return chart

In [57]:
def heatmap(
    df: pd.DataFrame,
    color_col: str,
    title: str,
    subtitle1: str,
    subtitle2: str,
    subtitle3: str,
):
    df = df.assign(
        time_period=df.time_period.str.replace("_", " ").str.title()
    ).reset_index(drop=True)

    # Grab original column that wasn't categorized
    original_col = color_col.replace("_cat", "")

    tooltip_cols = [
        "direction_id",
        "time_period",
        "route_combined_name",
        "organization_name",
        color_col,
        original_col,
    ]

    # Round
    # df[color_col] = df[color_col].round(1)
    chart = (
        alt.Chart(df)
        .mark_rect(size=30)
        .encode(
            x=alt.X(
                "yearmonthdate(service_date):O",
                axis=alt.Axis(labelAngle=-45, format="%b %Y"),
                title=["Grouped by Direction ID", "Service Date"],
            ),
            y=alt.Y("time_period:O", title=["Time Period"]),
            xOffset=alt.X(f"direction_id:N", title="Direction ID"),
            color=alt.Color(
                f"{color_col}:N",
                title=labeling(color_col),
                scale=alt.Scale(range=cp.CALITP_SEQUENTIAL_COLORS),
            ),
            tooltip=tooltip_cols,
        )
        .properties(
            title={"text": [title], "subtitle": [subtitle1, subtitle2, subtitle3]},
            width=500,
            height=300,
        )
    )

    text = chart.mark_text(baseline="middle").encode(
        alt.Text("direction_id"), color=alt.value("white")
    )

    final_chart = chart + text
    return final_chart

In [58]:
def base_route_chart_ah(
    df: pd.DataFrame, y_col: str, title: str, subtitle: str
) -> alt.Chart:

    df = df.assign(
        time_period=df.time_period.str.replace("_", " ").str.title()
    ).reset_index(drop=True)

    selected_colors = [
        cp.CALITP_CATEGORY_BRIGHT_COLORS[0],  # blue
        cp.CALITP_CATEGORY_BRIGHT_COLORS[3],  # green
        cp.CALITP_CATEGORY_BOLD_COLORS[1],  # orange,
    ]

    # https://stackoverflow.com/questions/26454649/python-round-up-to-the-nearest-ten

    if "pct" in y_col:
        max_y = 1.2
    elif "per_minute" in y_col:
        max_y = round(df[y_col].max())
    else:
        max_y = round(df[y_col].max(), -1) + 5

    df[f"{y_col}_str"] = df[y_col].astype(str)

    ruler = (
        alt.Chart(df)
        .mark_rule(color="red", strokeDash=[10, 7])
        .encode(y=f"mean(speed_mph):Q")
    )

    chart = (
        alt.Chart(df)
        .mark_line(size=5)
        .encode(
            x=alt.X(
                "yearmonthdate(service_date):O",
                title="Date",
                axis=alt.Axis(format="%b %Y"),
            ),
            y=alt.Y(
                f"{y_col}:Q", title=labeling(y_col), scale=alt.Scale(domain=[0, max_y])
            ),
            color=alt.Color(
                "time_period:N",
                title=labeling("time_period"),
                scale=alt.Scale(range=blue_palette),
            ),
            tooltip=[
                "route_combined_name",
                "route_id",
                "direction_id",
                "time_period",
                f"{y_col}_str",
            ],
        )
    )

    chart = (chart + ruler).properties(width=250, height=300)
    chart = chart.facet(
        column=alt.Column("direction_id:N", title=labeling("direction_id")),
    ).properties(
        title={
            "text": [title],
            "subtitle": [subtitle],
        },
    )
    return chart

In [59]:
one_route = sched_df.loc[sched_df.route_combined_name == "1 West Santa Barbara"]

#### Avg Scheduled Minutes'
* How come the average_scheduled_minutes is a lot shorter than total_rt_service_minutes and total_scheduled_service_minutes?
* Kind of a boring chart? Should compare actual run time compared to service minutes?

In [60]:
# one_route[one_route.time_period == "all_day"][['avg_scheduled_service_minutes']]

In [61]:
grouped_bar_chart(
    df=one_route[one_route.time_period == "all_day"],
    color_col="direction_id",
    y_col="avg_scheduled_service_minutes",
    offset_col="direction_id",
    title="Average Scheduled Minutes",
    subtitle="The average minutes a route is scheduled to take.",
)

#### Timeliness #1

In [62]:
one_route[one_route.time_period == "all_day"].head(1)

Unnamed: 0,schedule_gtfs_dataset_key,direction_id,time_period,avg_scheduled_service_minutes,avg_stop_miles,n_scheduled_trips,frequency,road_freq_category,road_typology,pct_typology,service_date,minutes_atleast1_vp,minutes_atleast2_vp,total_rt_service_minutes,total_scheduled_service_minutes,total_vp,vp_in_shape,is_early,is_ontime,is_late,n_vp_trips,vp_per_minute,pct_in_shape,pct_rt_journey_atleast1_vp,pct_rt_journey_atleast2_vp,pct_sched_journey_atleast1_vp,pct_sched_journey_atleast2_vp,rt_sched_journey_ratio,avg_rt_service_minutes,sched_rt_category,speed_mph,name,route_id,route_combined_name,base64_url,organization_source_record_id,organization_name,caltrans_district,rt_sched_journey_ratio_cat,frequency_cat,pct_rt_journey_atleast2_vp_cat,pct_in_shape_cat,vp_per_minute_cat
14844,239f3baf3dd3b9e9464f66a777f9897d,0.0,all_day,15.0,0.18,54,2.25,very_high,downtown_local,0.44,2023-09-13,863,258,900.19,795.0,1136,925,0,45,8,53,1.26,0.81,0.96,0.29,1.0,0.33,1.13,16.98,schedule_and_vp,8.92,SBMTD Schedule,1,1 West Santa Barbara,aHR0cHM6Ly9zYm10ZC5nb3YvZ29vZ2xlX3RyYW5zaXQvZmVlZC56aXA=,recswCrw6a6htmXJ4,Santa Barbara Metropolitan Transit District,05 - San Luis Obispo,Late by 1-25% of the scheduled time,2 trips/hour,25-50% of trip has 2+ pings,76-99% of VPs in shape,<3 pings/minute


In [63]:
avg_rt_service_minutes = 16.98

In [64]:
avg_scheduled_service_minutes = 15

In [65]:
avg_rt_service_minutes / avg_scheduled_service_minutes

1.1320000000000001

In [66]:
one_route.rt_sched_journey_ratio.describe()

count   72.00
mean     1.04
std      0.21
min      0.80
25%      0.85
50%      1.03
75%      1.21
max      1.79
Name: rt_sched_journey_ratio, dtype: float64

In [67]:
heatmap(
    one_route,
    "rt_sched_journey_ratio_cat",
    "Realtime vs. Scheduled Trip Times",
    "Dividing the average of actual trip times by the average of scheduled trip times gives an idea of how on schedule a trip on this route typically is.",
    "A ratio of 1.13 indicates a trip is late 13% over its scheduled time.",
    "",
)

#### Timeliness #2

In [68]:
def timeliness_trips(df: pd.DataFrame):
    to_keep = [
        "service_date",
        "organization_name",
        "direction_id",
        "time_period",
        "route_combined_name",
        "is_early",
        "is_ontime",
        "is_late",
        "n_vp_trips",
    ]
    df = df[to_keep]
    df2 = df.loc[df.time_period != "all_day"].reset_index(drop=True)

    melted_df = df2.melt(
        id_vars=[
            "service_date",
            "organization_name",
            "route_combined_name",
            "time_period",
            "direction_id",
        ],
        value_vars=["is_early", "is_ontime", "is_late"],
    )
    return melted_df

In [69]:
timeliness = timeliness_trips(df)

In [70]:
timeliness.sample()

Unnamed: 0,service_date,organization_name,route_combined_name,time_period,direction_id,variable,value
3159,2023-07-12,Santa Barbara Metropolitan Transit District,25 Ellwood,offpeak,0.0,is_late,1


In [71]:
heatmap(
    one_route,
    "rt_sched_journey_ratio_cat",
    "Realtime vs. Scheduled Trip Times",
    "Dividing the average of actual trip times by the average of scheduled trip times gives an idea of how on schedule a trip on this route typically is.",
    "A ratio of 1.13 indicates a trip is late 13% over its scheduled time.",
    "",
)

In [72]:
# This chart is too small to read 
""" timeliness_2 = (alt.Chart(timeliness_df)
    .mark_bar(size=5)
    .encode(
        x=alt.X(
            "yearmonthdate(service_date):O",
            title=["Grouped by Direction ID","Service Date"],
            axis=alt.Axis(labelAngle=-45, format="%b %Y"),
        ),
        y=alt.Y(
            "value:Q",
            title=labeling("value"),
            scale=alt.Scale(
                domain=[(timeliness_df.value.min()), (timeliness_df.value.max() + 5)]
            ),
        ),
        xOffset=alt.X("direction_id:N", title=labeling("time_period")),
        color=alt.Color(
            "variable:N",
            title=labeling("variable"),
            scale=alt.Scale(range=blue_palette),
        ),
        tooltip=ontime_late3.columns.tolist(),
    )).properties(width=225, height=300).facet(
    column=alt.Column("time_period:N", title=labeling("Timeliness of Trips")),
).add_params(route_selector).transform_filter(route_selector)"""

' timeliness_2 = (alt.Chart(timeliness_df)\n    .mark_bar(size=5)\n    .encode(\n        x=alt.X(\n            "yearmonthdate(service_date):O",\n            title=["Grouped by Direction ID","Service Date"],\n            axis=alt.Axis(labelAngle=-45, format="%b %Y"),\n        ),\n        y=alt.Y(\n            "value:Q",\n            title=labeling("value"),\n            scale=alt.Scale(\n                domain=[(timeliness_df.value.min()), (timeliness_df.value.max() + 5)]\n            ),\n        ),\n        xOffset=alt.X("direction_id:N", title=labeling("time_period")),\n        color=alt.Color(\n            "variable:N",\n            title=labeling("variable"),\n            scale=alt.Scale(range=blue_palette),\n        ),\n        tooltip=ontime_late3.columns.tolist(),\n    )).properties(width=225, height=300).facet(\n    column=alt.Column("time_period:N", title=labeling("Timeliness of Trips")),\n).add_params(route_selector).transform_filter(route_selector)'

#### Total Scheduled Trips
* Maybe don't differentiate this by direction_id??
* Kind of confusing because it's trip for only one direction...
* Similar to frequency already?

In [73]:
110 / 24

4.583333333333333

In [74]:
# IF we take away direction, see how many times a route is scheduled to run
total_scheduled_trip = (
    one_route.groupby(["service_date", "time_period"])
    .agg({"n_scheduled_trips": "mean"})
    .reset_index()
)

In [75]:
total_scheduled_trip.n_scheduled_trips = total_scheduled_trip.n_scheduled_trips / 2

In [76]:
total_scheduled_trip.head()

Unnamed: 0,service_date,time_period,n_scheduled_trips
0,2023-03-15,all_day,27.25
1,2023-03-15,offpeak,13.5
2,2023-03-15,peak,13.75
3,2023-04-12,all_day,27.25
4,2023-04-12,offpeak,13.5


In [77]:
one_route.groupby(["service_date", "direction_id", "time_period"]).agg(
    {"n_scheduled_trips": "max"}
).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n_scheduled_trips
service_date,direction_id,time_period,Unnamed: 3_level_1
2023-03-15,0.0,all_day,54
2023-03-15,0.0,offpeak,27
2023-03-15,0.0,peak,27
2023-03-15,1.0,all_day,55
2023-03-15,1.0,offpeak,27


In [78]:
grouped_bar_chart(
    df=one_route.loc[one_route.time_period != "all_day"],
    color_col="time_period",
    y_col="n_scheduled_trips",
    offset_col="direction_id",
    title="Total Scheduled Trips",
    subtitle="How many times per day is this route is scheduled to run in one particular direction.",
)

#### Frequency
* Maybe shouldn't be a chart since there doesn't seem to be a lot of data for this across a lot of the routes?
* What does frequency mean?
* Simplify down to not take direction_id into consideration?

In [79]:
one_route[["frequency", "frequency_cat"]].sample(5)

Unnamed: 0,frequency,frequency_cat
105821,1.69,<1 trip/hour
14875,3.5,3+ trips/hour
105811,2.29,2 trips/hour
105798,2.25,2 trips/hour
105819,1.69,<1 trip/hour


In [80]:
heatmap(
    one_route,
    "frequency_cat",
    "Frequency of Route",
    "Frequency tracks the number of times per hour this route goes by direction and time period.",
    "For example, a frequency of 2.3 going in the direction of ID 1 means the bus passes by this direction about",
    "twice an hour.",
)

#### Speed MPH
* Needs a different type of chart.

In [81]:
one_route_vp_df = vp_df.loc[vp_df.route_combined_name == "1 West Santa Barbara"]

In [82]:
one_route_vp_df.shape

(72, 43)

In [83]:
one_route_vp_df[
    ["direction_id", "time_period", "service_date", "speed_mph"]
].sort_values(by=["service_date", "time_period"]).head()

Unnamed: 0,direction_id,time_period,service_date,speed_mph
105793,0.0,all_day,2023-03-15,8.75
105811,1.0,all_day,2023-03-15,9.93
105799,0.0,offpeak,2023-03-15,8.99
105817,1.0,offpeak,2023-03-15,10.14
105805,0.0,peak,2023-03-15,8.62


In [84]:
base_route_chart_ah(
    one_route_vp_df,
    "speed_mph",
    "Average Speed",
    "The average miles per hour the bus travels by direction and time of day.",
)

#### VP per Minute (WIP)
* Should be a table?

In [85]:
one_route_vp_df[["vp_per_minute_cat", "vp_per_minute"]].sample(10)

Unnamed: 0,vp_per_minute_cat,vp_per_minute
14845,<3 pings/minute,1.16
105794,<3 pings/minute,1.2
14870,<3 pings/minute,1.27
105812,<3 pings/minute,1.31
14878,<3 pings/minute,1.32
105800,<3 pings/minute,1.21
14865,<3 pings/minute,1.33
14874,<3 pings/minute,1.29
14861,<3 pings/minute,1.23
105824,<3 pings/minute,1.3


In [86]:
heatmap(
    one_route_vp_df,
    "vp_per_minute_cat",
    "Vehicle Positions per Minute",
    "Vehicle positions are recorded each time the GPS on a bus pings.",
    "The number of times vehicle positions collected reflect the density and reliabilty of realtime data",
    "Trips should be in the 3+ VP per minute category.",
)

#### Spatial Accuracy
* Multiple it by 100? Should this be rounded or not?

In [87]:
heatmap(
    one_route_vp_df,
    "pct_in_shape_cat",
    "Spatial Accuracy",
    "By comparing vehicle positions produced by GTFS and against the static scheduled route shape,",
    " we can determine the percentage of actual positions that fell within a reasonable radius.",
    "This percentage reflects the accuracy of the spatial, realtime data collected.",
)

#### At least 2 VP

In [88]:
one_route_vp_df.pct_rt_journey_atleast2_vp.describe()

count   72.00
mean     0.28
std      0.03
min      0.17
25%      0.26
50%      0.28
75%      0.30
max      0.34
Name: pct_rt_journey_atleast2_vp, dtype: float64

In [89]:
heatmap(
    one_route_vp_df,
    "pct_rt_journey_atleast2_vp_cat",
    "% of the Trip with 2+ Vehicle Positions",
    "Dividing the number of minutes with more than 2 vehicle positions by the total duration",
    "to determine the quality of real-time temporal data for a trip. Generally, the GPS should record vehicle positions every 20 seconds.",
    "A percentage of 0.3 indicates only 30% of the trip had 2+ vehicle positions per minute.",
)

#### Text

In [90]:
def create_text_table(df: pd.DataFrame, direction_id: str):

    df = df.loc[df.direction_id == direction_id].reset_index(drop=True)
    df2 = df.melt(
        id_vars=[
            "route_combined_name",
            "direction_id",
        ],
        value_vars=[
            "avg_scheduled_service_minutes",
            "avg_stop_miles",
            "n_scheduled_trips",
            "sched_rt_category",
            "peak_avg_speed",
            "peak_scheduled_trips",
            "peak_hourly_freq",
            "offpeak_avg_speed",
            "offpeak_scheduled_trips",
            "offpeak_hourly_freq",
        ],
    )
    # Create a decoy column to center all the text
    df2["Zero"] = 0

    df2.variable = df2.variable.str.replace("_", " ").str.title()
    df2 = df2.sort_values(by=["direction_id"]).reset_index(drop=True)
    df2["combo_col"] = df2.variable.astype(str) + ": " + df2.value.astype(str)
    text_chart = (
        alt.Chart(df2)
        .mark_text()
        .encode(x=alt.X("Zero:Q", axis=None), y=alt.Y("combo_col", axis=None))
    )

    text_chart = text_chart.encode(text="combo_col:N").properties(
        title=f"Route Statistics for Direction {direction_id}", width=500, height=300
    )
    return text_chart

In [91]:
one_route_table = table_df.loc[
    table_df.route_combined_name == "2510 San Marcos High School"
]

In [92]:
create_text_table(table_df, 0)

#### Putting it all together

In [93]:
def filtered_route(
    df: pd.DataFrame,
) -> alt.Chart:
    """
    https://stackoverflow.com/questions/58919888/multiple-selections-in-altair
    """

    route_dropdown = alt.binding_select(
        options=sorted(df["route_combined_name"].unique().tolist()),
        name="Routes ",
    )

    # Column that controls the bar charts
    route_selector = alt.selection_point(
        fields=["route_combined_name"],
        bind=route_dropdown,
    )

    # Data
    sched_df = df[df.sched_rt_category != "vp_only"]
    vp_df = df[df.sched_rt_category != "schedule_only"]
    route_stats_df = route_stats(df)
    timeliness_df = timeliness_trips(vp_df)
    
    # Charts
    route_stats_dir0 = (
        create_text_table(route_stats_df, 0)
        .add_params(route_selector)
        .transform_filter(route_selector)
    )

    route_stats_dir1 = (
        create_text_table(route_stats_df, 1)
        .add_params(route_selector)
        .transform_filter(route_selector)
    )

    speed = (
        base_route_chart_ah(
            vp_df,
            "speed_mph",
            "Average Speed",
            "The average miles per hour the bus travels by direction and time of day.",
        )
        .add_params(route_selector)
        .transform_filter(route_selector)
    )

    frequency = (
        heatmap(
            sched_df,
            "frequency_cat",
            "Frequency of Route",
            "Frequency tracks the number of times per hour this route goes by direction and time period.",
            "For example, a frequency of 2.3 going in the direction of ID 1 means the bus passes by this direction about",
            "twice an hour.",
        )
        .add_params(route_selector)
        .transform_filter(route_selector)
    )
    timeliness = (
        heatmap(
            sched_df,
            "rt_sched_journey_ratio_cat",
            "Realtime vs. Scheduled Trip Times",
            "Dividing the average of actual trip times by the average of scheduled trip times gives an idea of how on schedule a trip on this route typically is.",
            "A ratio of 1.13 indicates a trip is late 13% over its scheduled time.",
            "",
        )
        .add_params(route_selector)
        .transform_filter(route_selector)
    )
    
    
    total_scheduled_trips = (
        grouped_bar_chart(
            df=sched_df.loc[sched_df.time_period != "all_day"],
            color_col="time_period",
            y_col="n_scheduled_trips",
            offset_col="direction_id",
            title="Total Scheduled Trips",
            subtitle="How many times per day is this route is scheduled to run in one particular direction.",
        )
        .add_params(route_selector)
        .transform_filter(route_selector)
    )
    atleast_2_vp = (
        heatmap(
            vp_df,
            "pct_rt_journey_atleast2_vp_cat",
            "% of trip with 2+ Vehicle Positions",
            "Dividing the number of minutes with more than 2 vehicle positions by the total duration",
            "to determine the quality of real-time temporal data for a trip. Generally, the GPS should record vehicle positions every 20 seconds.",
            "A percentage of 0.3 indicates only 30% of the trip had 2+ vehicle positions per minute.",
        )
        .add_params(route_selector)
        .transform_filter(route_selector)
    )

    spatial_accuracy = (
        heatmap(
            vp_df,
            "pct_in_shape_cat",
            "Spatial Accuracy of Realtime Data",
            "By comparing vehicle positions produced by GTFS and against the static scheduled route shape,",
            " we can determine the percentage of actual positions that fell within a reasonable radius.",
            "This percentage reflects the accuracy of the spatial, realtime data collected.",
        )
        .add_params(route_selector)
        .transform_filter(route_selector)
    )

    chart_list = [
        route_stats_dir0,
        route_stats_dir1,
        speed,
        frequency,
        timeliness,
        timeliness_2,
        total_scheduled_trips,
        atleast_2_vp,
        spatial_accuracy,
    ]

    # 
    chart = alt.vconcat(*chart_list).properties(
  resolve = alt.Resolve(scale=alt.LegendResolveMap(color=alt.ResolveMode('independent')))
  )
    return chart

In [94]:
filtered_route(df)

NameError: name 'timeliness_2' is not defined

#### Actual Faceted Charts (from Tiffany)
* Issues 
    * Faceting doesn't  with daily scheduled trip and average speed.

In [None]:
def base_route_chart(df: pd.DataFrame, y_col: str) -> alt.Chart:
    """ """
    df = df.assign(
        time_period=df.time_period.str.replace("_", " ").str.title()
    ).reset_index(drop=True)

    selected_colors = [
        cp.CALITP_CATEGORY_BRIGHT_COLORS[0],  # blue
        cp.CALITP_CATEGORY_BRIGHT_COLORS[3],  # green
        cp.CALITP_CATEGORY_BOLD_COLORS[1],  # orange,
    ]

    # https://stackoverflow.com/questions/26454649/python-round-up-to-the-nearest-ten

    if "pct" in y_col:
        max_y = 1.2
    elif "per_minute" in y_col:
        max_y = round(df[y_col].max())
    else:
        max_y = round(df[y_col].max(), -1)

    df[y_col] = df[y_col].fillna(0).astype(int)

    chart = (
        alt.Chart(df)
        .mark_line()
        .encode(
            x=alt.X(
                "yearmonthdate(service_date):O",
                title="Date",
                axis=alt.Axis(format="%b %Y"),
            ),
            y=alt.Y(
                f"{y_col}:Q", title=labeling(y_col), scale=alt.Scale(domain=[0, max_y])
            ),
            color=alt.Color(
                "time_period:N",
                title=labeling("time_period"),
                scale=alt.Scale(range=cp.CALITP_SEQUENTIAL_COLORS),
            ),
            tooltip=[
                "route_combined_name",
                "route_id",
                "direction_id",
                "time_period",
                y_col,
            ],
        )
        .facet(
            column=alt.Column("direction_id:N", title=labeling("direction_id")),
        )
        .interactive()
    ).properties(title=labeling(y_col))

    return chart

In [None]:
def filtered_route_charts(
    df: pd.DataFrame,
    control_field: str,
) -> alt.Chart:
    """
    https://stackoverflow.com/questions/58919888/multiple-selections-in-altair
    """
    route_dropdown = alt.binding_select(
        options=sorted(df[control_field].unique().tolist()),
        name="Routes ",
    )

    # Column that controls the bar charts
    route_selector = alt.selection_point(
        fields=[control_field],
        bind=route_dropdown,
    )

    sched_df = df[df.sched_rt_category != "vp_only"]
    vp_df = df[df.sched_rt_category != "schedule_only"]

    sched_service_chart = (
        base_route_chart(
            sched_df[sched_df.time_period == "all_day"], "avg_scheduled_service_minutes"
        )
        .add_params(route_selector)
        .transform_filter(route_selector)
    )

    sched_trips_chart = (
        base_route_chart(sched_df, "n_scheduled_trips")
        .add_params(route_selector)
        .transform_filter(route_selector)
    )

    sched_freq_chart = (
        base_route_chart(sched_df, "frequency")
        .add_params(route_selector)
        .transform_filter(route_selector)
    )

    speeds_chart = (
        base_route_chart(vp_df, "speed_mph")
        .add_params(route_selector)
        .transform_filter(route_selector)
    )

    ping_density_chart = (
        base_route_chart(vp_df, "vp_per_minute")
        .add_params(route_selector)
        .transform_filter(route_selector)
    )

    spatial_accuracy_chart = (
        base_route_chart(vp_df, "pct_in_shape")
        .add_params(route_selector)
        .transform_filter(route_selector)
    )

    atleast1vp_chart = (
        base_route_chart(vp_df, "pct_rt_journey_atleast1_vp")
        .add_params(route_selector)
        .transform_filter(route_selector)
    )

    atleast2vp_chart = (
        base_route_chart(vp_df, "pct_rt_journey_atleast2_vp")
        .add_params(route_selector)
        .transform_filter(route_selector)
    )

    chart_list = [
        sched_service_chart,
        sched_trips_chart,
        sched_freq_chart,
        speeds_chart,
        ping_density_chart,
        spatial_accuracy_chart,
        atleast1vp_chart,
        atleast2vp_chart,
    ]

    chart = alt.vconcat(*chart_list).resolve_scale(y="independent")

    return chart

In [None]:
vp_df.columns

In [None]:
# filtered_route_charts(df, "route_combined_name")

#### Fake Faceted Chart  Doesn't work. The dropdowns don't impact the charts.

In [None]:
def fake_faceted_chart2(df: pd.DataFrame, column_of_interest: str, title=str):

    ruler = (
        alt.Chart(df)
        .mark_rule(color="red", strokeDash=[10, 7])
        .encode(y=f"mean({column_of_interest}):Q")
    )

    dir_0 = df.loc[df.direction_id == 0]
    dir_1 = df.loc[df.direction_id == 1]
    chart_dir_0 = (
        alt.Chart(
            dir_0,
            title=alt.Title(
                title,
                subtitle="Direction 0",
            ),
        )
        .mark_bar()
        .encode(
            x=alt.X("month_year", title=labeling("month_year")),
            xOffset="time_period:N",
            y=alt.Y(f"{column_of_interest}:Q", title=labeling(column_of_interest)),
            color=alt.Color(
                "time_period:N",
                scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
            ),
        )
        .properties(width=300, height=300)
    )
    chart_dir_1 = (
        alt.Chart(
            dir_1,
            title=alt.Title(
                "Title",
                color="white",
                subtitle="Direction 1",
            ),
        )
        .mark_bar()
        .encode(
            x=alt.X("month_year", title=labeling("month_year")),
            xOffset="time_period:N",
            y=alt.Y(f"{column_of_interest}:Q", axis=None),
            color=alt.Color(
                "time_period:N",
                scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
            ),
        )
        .properties(width=300, height=300)
    )
    chart_dir_0 = chart_dir_0 + ruler
    chart_dir_1 = chart_dir_1 + ruler
    final_chart = alt.hconcat(chart_dir_0, chart_dir_1)
    return final_chart

In [None]:
def fake_faceted_chart(
    df: pd.DataFrame,
    control_field: str,
) -> alt.Chart:
    """
    https://stackoverflow.com/questions/58919888/multiple-selections-in-altair
    """
    dropdown_list = df["route_combined_name"].unique().tolist()
    input_dropdown = alt.binding_select(options=dropdown_list, name="Routes")
    selection = alt.selection_point(fields=["route_combined_name"], bind=input_dropdown)

    selection = alt.selection_point(fields=["route_combined_name"], bind=input_dropdown)
    sched_df = df[df.sched_rt_category != "vp_only"]
    vp_df = df[df.sched_rt_category != "schedule_only"]

    daily_trips = fake_faceted_chart2(
        sched_df, "n_scheduled_trips", "Total Scheduled Daily Trips"
    )
    frequency = fake_faceted_chart2(
        sched_df, "frequency", "Frequency of Trips per Hour"
    )
    avg_speeds = fake_faceted_chart2(vp_df, "speed_mph", "Average Speed")

    chart_lists = [daily_trips, frequency, avg_speeds]
    chart = (
        alt.vconcat(*chart_lists)
        .resolve_scale(y="independent")
        .add_params(selection)
        .transform_filter(selection)
    )
    display(chart)

In [None]:
# fake_faceted_chart(df, "route_combined_name")