## Round 1 
* Updating existing charts made by Tiffany. https://gtfs-digest--cal-itp-data-analyses.netlify.app/
* cd rt_segment_speeds && pip install altair_transform && pip install -r requirements.txt && cd ../_shared_utils && make setup_env

Links
* https://github.com/cal-itp/data-analyses/issues/1059
* https://docs.google.com/document/d/1I1WiqlmU06W6iLCi7cZQrOCLILkrEfABEkcU0Jys7f0/edit
* https://route-speeds--cal-itp-data-analyses.netlify.app/name_bay-area-511-muni-schedule/0__report__name_bay-area-511-muni-schedule
* https://posit-dev.github.io/great-tables/get-started/nanoplots.html
* https://docs.pola.rs/py-polars/html/reference/api/polars.from_pandas.html
* https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/_rt_scheduled_utils.py
* https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/_threshold_utils.py

In [29]:
%%capture
# import warnings
# warnings.filterwarnings('ignore')

import _report_utils
import altair as alt
import calitp_data_analysis.magics
import geopandas as gpd
import great_tables as gt
import pandas as pd
from calitp_data_analysis import calitp_color_palette as cp
from great_tables import md
from IPython.display import HTML, Markdown, display
from segment_speed_utils.project_vars import RT_SCHED_GCS
from shared_utils import rt_dates, rt_utils

alt.renderers.enable("html")
alt.data_transformers.enable("default", max_rows=None)
from typing import List, Union

from altair_transform.extract import extract_transform
from altair_transform.transform import visit
from altair_transform.utils import to_dataframe

In [30]:
# Notebook-wide pandas display settings: show up to 100 columns, print floats
# with two decimals, and never truncate rows or cell contents.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [31]:
# Operator feed name passed to the `_report_utils` loaders below.
name = "SBMTD Schedule"

In [32]:
# %%capture_parameters
# name

### General Functions

In [33]:
def labeling(word: str) -> str:
    """Turn a snake_case column name into a human-readable chart label.

    Underscores become spaces and the text is title-cased. Afterwards:

    * a standalone ``N`` word becomes ``Total``
      (``n_scheduled_trips`` -> ``Total Scheduled Trips``),
    * ``Pct`` becomes ``%``,
    * ``Vp`` becomes ``VP``.

    Only a whole-word ``N`` is swapped for ``Total``. Replacing every capital
    ``N`` mangled labels such as ``organization_name`` into
    ``Organization Totalame``.

    Parameters
    ----------
    word : str
        Column name (or any snake_case text) to prettify.

    Returns
    -------
    str
        The label to show on an axis, legend, or facet header.
    """
    title_cased = word.replace("_", " ").title()
    # Splitting and re-joining on a single space keeps the original spacing.
    relabeled = " ".join(
        "Total" if token == "N" else token for token in title_cased.split(" ")
    )
    return relabeled.replace("Pct", "%").replace("Vp", "VP")

In [34]:
# Three-step blue palette (light -> dark) used as the color scale range in the
# chart functions below.
blue_palette = ["#B9D6DF", "#2EA8CE", "#0B405B"]

### Data

In [39]:
# Read the schedule_vp_metrics digest parquet directly from the RT_SCHED_GCS
# path. This frame is not filtered by `name`.
original = pd.read_parquet(
    f"{RT_SCHED_GCS}digest/schedule_vp_metrics.parquet")

In [46]:
test_ops = ['Bay Area 511 Muni Schedule','TART, North Lake Tahoe Schedule',
           'Yolobus Schedule']

In [47]:
original.loc[original.name.isin(test_ops)][['name','caltrans_district']].drop_duplicates()

Unnamed: 0,name,caltrans_district
22408,Yolobus Schedule,03 - Marysville
40276,"TART, North Lake Tahoe Schedule",03 - Marysville
56826,Bay Area 511 Muni Schedule,04 - Oakland


In [7]:
# Source folder: calitp-analytics-data/data-analyses/rt_vs_schedule/digest
# Load the schedule/vehicle-position metrics for the operator chosen in `name`.
df = _report_utils.load_schedule_vp_metrics(name)

In [8]:
most_recent_date = df.service_date.max()

In [9]:
most_recent_date

Timestamp('2024-03-13 00:00:00')

In [10]:
df.service_date.min()

Timestamp('2023-03-15 00:00:00')

#### Unsure what these metrics mean
* https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/segment_speed_utils/metrics.py
* avg_stop_miles: ??
* n_scheduled_trips: over the course of a day
* frequency: # of times the route runs per hour 
* pct_typology: 44% of the route fits into downtown local, and the remaining 56% falls under other categories??
* is_early: # of trips in that route that are early
* is_ontime: # of trips in that route that are on time.
* n_vp_trips: # of trips in total
* Difference between pct_rt_journey_atleast1vp and pct_sched_journey_atleast1vp?
* rt_sched_journey_ratio: how many times longer/shorter the actual trip took compared to the scheduled trip?

In [11]:
df.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,direction_id,time_period,avg_scheduled_service_minutes,avg_stop_miles,n_scheduled_trips,frequency,road_freq_category,road_typology,pct_typology,service_date,minutes_atleast1_vp,minutes_atleast2_vp,total_rt_service_minutes,total_scheduled_service_minutes,total_vp,vp_in_shape,is_early,is_ontime,is_late,n_vp_trips,vp_per_minute,pct_in_shape,pct_rt_journey_atleast1_vp,pct_rt_journey_atleast2_vp,pct_sched_journey_atleast1_vp,pct_sched_journey_atleast2_vp,rt_sched_journey_ratio,avg_rt_service_minutes,sched_rt_category,speed_mph,name,route_id,route_combined_name,base64_url,organization_source_record_id,organization_name,caltrans_district,rt_sched_journey_ratio_cat,frequency_cat,vp_per_minute_cat,ruler_100_pct,ruler_for_vp_per_min
14844,239f3baf3dd3b9e9464f66a777f9897d,0.0,all_day,15.0,0.18,54,2.25,very_high,downtown_local,44.0,2023-09-13,863,258,900.19,795.0,1136,925,0,45,8,53,1.26,81.0,96.0,29.0,100.0,32.0,1.13,16.98,schedule_and_vp,8.92,SBMTD Schedule,1,1 West Santa Barbara,aHR0cHM6Ly9zYm10ZC5nb3YvZ29vZ2xlX3RyYW5zaXQvZmVlZC56aXA=,recswCrw6a6htmXJ4,Santa Barbara Metropolitan Transit District,05 - San Luis Obispo,Late by 1-25% of the scheduled time,2 trips/hour,<3 pings/minute,100,2
14845,239f3baf3dd3b9e9464f66a777f9897d,0.0,all_day,15.0,0.18,54,2.25,very_high,downtown_local,44.0,2023-10-11,839,242,945.11,735.0,1093,848,0,38,11,49,1.16,78.0,89.0,26.0,100.0,33.0,1.29,19.29,schedule_and_vp,4.23,SBMTD Schedule,1,1 West Santa Barbara,aHR0cHM6Ly9zYm10ZC5nb3YvZ29vZ2xlX3RyYW5zaXQvZmVlZC56aXA=,recswCrw6a6htmXJ4,Santa Barbara Metropolitan Transit District,05 - San Luis Obispo,Late by 26-50% of the scheduled time,2 trips/hour,<3 pings/minute,100,2


In [12]:
df.time_period.value_counts()

all_day    731
peak       731
offpeak    451
Name: time_period, dtype: int64

### Test out Altair `extract_data`
* https://altair-viz.github.io/user_guide/transform/index.html

In [13]:
__all__ = ["apply", "extract_data", "transform_chart"]

In [14]:
def apply(
    df: pd.DataFrame,
    transform: Union[alt.Transform, List[alt.Transform]],
    inplace: bool = False,
) -> pd.DataFrame:
    """Run one or more Altair transform specifications against a dataframe.

    Parameters
    ----------
    df : pd.DataFrame
        Input data.
    transform : list|dict
        A single transform specification, or a list of them. Each one must
        be valid under Altair's transform schema.
    inplace : bool
        When True the input dataframe may be modified directly; when False
        (the default) a copy is made first.

    Returns
    -------
    df_transformed : pd.DataFrame
        The dataframe after the transform(s) have been applied.

    Example
    -------
    >>> import pandas as pd
    >>> data = pd.DataFrame({'x': range(5), 'y': list('ABCAB')})
    >>> chart = alt.Chart(data).transform_aggregate(sum_x='sum(x)', groupby=['y'])
    >>> apply(data, chart.transform)
       y  sum_x
    0  A      3
    1  B      5
    2  C      2
    """
    # Work on a copy unless the caller explicitly allows in-place edits.
    frame = df if inplace else df.copy()
    # Nothing to apply when the transform is Altair's Undefined sentinel.
    if transform is alt.Undefined:
        return frame
    # altair_transform's `visit` does the actual evaluation.
    return visit(transform, frame)

In [15]:
def extract_data(
    chart: alt.Chart, apply_encoding_transforms: bool = True
) -> pd.DataFrame:
    """Return the data behind a chart after its transforms have been applied.

    Only data and transforms defined at the top level of the chart are
    handled.

    Parameters
    ----------
    chart : alt.Chart
        Chart whose data and transform are read.
    apply_encoding_transforms : bool
        If True (default), the chart is first passed through
        ``extract_transform`` so that transforms written inside encodings
        (for example ``x='sum(x)'``) are applied along with those in the
        ``transform`` attribute.

    Returns
    -------
    df_transformed : pd.DataFrame
        The extracted and transformed dataframe.

    Example
    -------
    >>> import pandas as pd
    >>> data = pd.DataFrame({'x': range(5), 'y': list('ABCAB')})
    >>> chart = alt.Chart(data).mark_bar().encode(x='sum(x)', y='y')
    >>> extract_data(chart)
       y  sum_x
    0  A      3
    1  B      5
    2  C      2
    """
    source_chart = extract_transform(chart) if apply_encoding_transforms else chart
    raw_frame = to_dataframe(source_chart.data, source_chart)
    return apply(raw_frame, source_chart.transform)

In [16]:
data = pd.DataFrame({"x": range(5), "y": list("ABCAB")})

In [17]:
chart = alt.Chart(data).mark_bar().encode(x="sum(x)", y="y")

In [18]:
chart

In [19]:
extract_data(chart)

Unnamed: 0,y,sum_x
0,A,3
1,B,5
2,C,2


In [20]:
def extract_data_altair(chart):
    """Rebuild a dataframe from the ``datasets`` entry of a chart's dict form.

    The chart is serialised with ``to_dict()``. Its ``"datasets"`` mapping is
    loaded into a dataframe, and the records stored under the first dataset
    column are expanded into one column per key with ``pd.json_normalize``.
    Any other dataset columns are kept alongside the expanded ones.

    Parameters
    ----------
    chart
        Any object whose ``to_dict()`` returns a mapping with a ``"datasets"``
        key.

    Returns
    -------
    pd.DataFrame
        The remaining dataset columns followed by the expanded record columns.
    """
    datasets_frame = pd.DataFrame(chart.to_dict()["datasets"])
    first_dataset = datasets_frame.columns[0]

    # One column per key found in the first dataset's records.
    expanded = pd.json_normalize(datasets_frame[first_dataset])
    remaining = datasets_frame.drop(columns=first_dataset)
    return pd.concat([remaining, expanded], axis=1)

### Monthly aggregated service hours by day_type, time_of_day

In [21]:
from segment_speed_utils.project_vars import SCHED_GCS

In [22]:
year = "2023"

In [23]:
monthly_service_df = _report_utils.load_scheduled_service(year, name)

In [24]:
monthly_service_df.sample()

Unnamed: 0,key,name,source_record_id,route_id,route_short_name,route_long_name,time_of_day,month,year,day_type,n_trips,ttl_service_hours,full_date
7433,2ea46cccb08042ac9a1c878a9ab6f64b,SBMTD Schedule,rectQfIeiKDBeJSAV,25,25,Ellwood,Midday,3,2023,Monday,20,4.83,03-2023


In [27]:
monthly_service_df_summary = _report_utils.summarize_monthly("2023", name)

In [28]:
monthly_service_df_summary

Unnamed: 0,full_date,month,name,day_type,time_of_day,ttl_service_hours
0,01-2023,1,SBMTD Schedule,Friday,AM Peak,7.14
1,01-2023,1,SBMTD Schedule,Friday,Early AM,4.52
2,01-2023,1,SBMTD Schedule,Friday,Evening,5.87
3,01-2023,1,SBMTD Schedule,Friday,Midday,18.87
4,01-2023,1,SBMTD Schedule,Friday,PM Peak,9.82
5,01-2023,1,SBMTD Schedule,Monday,AM Peak,6.83
6,01-2023,1,SBMTD Schedule,Monday,Early AM,3.79
7,01-2023,1,SBMTD Schedule,Monday,Evening,4.38
8,01-2023,1,SBMTD Schedule,Monday,Midday,14.64
9,01-2023,1,SBMTD Schedule,Monday,PM Peak,11.79


####  Fix
* Dropdown menu should have the fully spelled months?
* Dropdown menu does not like datetime values, pad single digit months with a 0 and concat it with the year?

In [25]:
_report_utils.single_bar_chart_dropdown(
    monthly_service_df,
    "day_type",
    "ttl_service_hours",
    "time_of_day",
    "Average Total Service Hours across all Routes in 2023",
    "full_date",
)



#### Route Stats

In [None]:
table_df = _report_utils.route_stats(df)

### Updating  Charts
* Checking out other ways to display metrics and which ones can be cut.

In [None]:
df.sched_rt_category.unique()

### Only use `schedule_and_vp` for charts!!

In [None]:
# Keep only the rows whose sched_rt_category is "schedule_and_vp"; the charts
# below are built from this subset.
df_sched_vp_both = df[df.sched_rt_category == "schedule_and_vp"]

#### Graph Functions
* What if a subtitle needs to be broken up?

In [None]:
def grouped_bar_chart(
    df: pd.DataFrame,
    color_col: str,
    y_col: str,
    offset_col: str,
    title: str,
    subtitle: str,
):
    """Bar chart of ``y_col`` per service date, with bars grouped side by side.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``time_period``, ``service_date`` and the tooltip
        columns listed in the body, plus ``color_col``, ``y_col`` and
        ``offset_col``.
    color_col : str
        Column that colors the bars (nominal).
    y_col : str
        Column plotted on the y axis. Missing values are filled with 0 and
        the column is cast to ``int`` before plotting.
    offset_col : str
        Column used for the x offset, i.e. the within-date grouping.
    title, subtitle : str
        Chart title and subtitle.

    Returns
    -------
    The configured Altair chart (500 x 300).
    """
    # Prettify time_period ("all_day" -> "All Day") on a fresh frame so the
    # caller's dataframe is left untouched.
    plot_df = df.assign(
        time_period=df.time_period.str.replace("_", " ").str.title()
    ).reset_index(drop=True)
    plot_df[y_col] = plot_df[y_col].fillna(0).astype(int)

    hover_fields = [
        "direction_id",
        "time_period",
        "route_combined_name",
        "organization_name",
        "service_date",
        color_col,
        y_col,
    ]

    date_axis = alt.X(
        "yearmonthdate(service_date):O",
        title=["Grouped by Direction ID", "Date"],
        axis=alt.Axis(format="%b %Y"),
    )
    value_axis = alt.Y(f"{y_col}:Q", title=labeling(y_col))
    group_offset = alt.X(f"{offset_col}:N", title=labeling(offset_col))
    bar_color = alt.Color(
        f"{color_col}:N",
        title=labeling(color_col),
        scale=alt.Scale(range=blue_palette),
    )

    bars = (
        alt.Chart(plot_df)
        .mark_bar(size=10)
        .encode(
            x=date_axis,
            y=value_axis,
            xOffset=group_offset,
            color=bar_color,
            tooltip=hover_fields,
        )
    )
    return bars.properties(
        title={"text": [title], "subtitle": [subtitle]},
        width=500,
        height=300,
    )

In [None]:
def heatmap(
    df: pd.DataFrame,
    color_col: str,
    title: str,
    subtitle1: str,
    subtitle2: str,
    subtitle3: str,
):
    """Heatmap of a categorized metric by service date and time period.

    Each cell is positioned by service date (x, offset by ``direction_id``)
    and ``time_period`` (y), colored by ``color_col``. The direction id is
    written in white text on top of each cell.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``time_period``, ``service_date``, ``direction_id``,
        ``route_combined_name``, ``organization_name``, ``color_col`` and the
        column named by ``color_col`` with ``"_cat"`` removed.
    color_col : str
        Categorized column that drives the cell color.
    title : str
        Chart title.
    subtitle1, subtitle2, subtitle3 : str
        Three subtitle lines, shown one under another.

    Returns
    -------
    A layered Altair chart (rectangles plus text labels).
    """
    plot_df = df.assign(
        time_period=df.time_period.str.replace("_", " ").str.title()
    ).reset_index(drop=True)

    # The tooltip shows both the category and the raw value it came from.
    raw_metric_col = color_col.replace("_cat", "")
    hover_fields = [
        "direction_id",
        "time_period",
        "route_combined_name",
        "organization_name",
        color_col,
        raw_metric_col,
    ]

    date_axis = alt.X(
        "yearmonthdate(service_date):O",
        axis=alt.Axis(labelAngle=-45, format="%b %Y"),
        title=["Grouped by Direction ID", "Service Date"],
    )
    cell_color = alt.Color(
        f"{color_col}:N",
        title=labeling(color_col),
        scale=alt.Scale(range=cp.CALITP_SEQUENTIAL_COLORS),
    )

    cells = (
        alt.Chart(plot_df)
        .mark_rect(size=30)
        .encode(
            x=date_axis,
            y=alt.Y("time_period:O", title=["Time Period"]),
            xOffset=alt.X("direction_id:N", title="Direction ID"),
            color=cell_color,
            tooltip=hover_fields,
        )
        .properties(
            title={"text": [title], "subtitle": [subtitle1, subtitle2, subtitle3]},
            width=500,
            height=300,
        )
    )

    # Reuse the cell encodings, swapping the mark for a white direction label.
    direction_labels = cells.mark_text(baseline="middle").encode(
        alt.Text("direction_id"), color=alt.value("white")
    )
    return cells + direction_labels

In [None]:
def base_facet_line(
    df: pd.DataFrame, y_col: str, title: str, subtitle: str
) -> alt.Chart:
    """Line chart of ``y_col`` over service dates, one facet per direction.

    Lines are colored by ``time_period``. A dashed red rule marks the mean of
    ``y_col``. The rule used to be hard-coded to ``mean(speed_mph)``, so it
    drew the wrong reference line for any other metric; for ``speed_mph`` the
    output is unchanged.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``time_period``, ``service_date``, ``direction_id``,
        ``route_combined_name``, ``route_id`` and ``y_col``.
    y_col : str
        Numeric column plotted on the y axis and averaged for the rule.
    title, subtitle : str
        Chart title and subtitle.

    Returns
    -------
    alt.Chart
        The faceted chart (the facets are 250 x 300 each).
    """
    df = df.assign(
        time_period=df.time_period.str.replace("_", " ").str.title()
    ).reset_index(drop=True)

    # Upper bound of the y axis, chosen from the column name:
    #   "pct"        -> fixed 1.2
    #   "per_minute" -> column max rounded to the nearest integer
    #   otherwise    -> column max rounded to the nearest ten, plus 5
    # https://stackoverflow.com/questions/26454649/python-round-up-to-the-nearest-ten
    if "pct" in y_col:
        max_y = 1.2
    elif "per_minute" in y_col:
        max_y = round(df[y_col].max())
    else:
        max_y = round(df[y_col].max(), -1) + 5

    # String copy of the metric, used only in the tooltip.
    df[f"{y_col}_str"] = df[y_col].astype(str)

    tooltip_cols = [
        "route_combined_name",
        "route_id",
        "direction_id",
        "time_period",
        f"{y_col}_str",
    ]

    # Reference line: mean of the plotted metric.
    ruler = (
        alt.Chart(df)
        .mark_rule(color="red", strokeDash=[10, 7])
        .encode(y=f"mean({y_col}):Q")
    )

    chart = (
        alt.Chart(df)
        .mark_line(size=5)
        .encode(
            x=alt.X(
                "yearmonthdate(service_date):O",
                title="Date",
                axis=alt.Axis(format="%b %Y"),
            ),
            y=alt.Y(
                f"{y_col}:Q", title=labeling(y_col), scale=alt.Scale(domain=[0, max_y])
            ),
            color=alt.Color(
                "time_period:N",
                title=labeling("time_period"),
                scale=alt.Scale(range=blue_palette),
            ),
            tooltip=tooltip_cols,
        )
    )

    # Size the layered chart before faceting; the title goes on the facet.
    chart = (chart + ruler).properties(width=250, height=300)
    chart = chart.facet(
        column=alt.Column("direction_id:N", title=labeling("direction_id")),
    ).properties(
        title={
            "text": [title],
            "subtitle": [subtitle],
        }
    )
    return chart

In [None]:
def base_facet_circle(
    df: pd.DataFrame, y_col: str, ruler_col: str, title: str, subtitle: str
) -> alt.Chart:
    """Circle chart of ``y_col`` over service dates, one facet per direction.

    Circles are colored by ``variable`` (the melted column name). A dashed red
    rule is drawn at the values of the ruler column.

    Parameters
    ----------
    df : pd.DataFrame
        Long-format frame containing ``time_period``, ``variable``,
        ``service_date``, ``direction_id``, ``route_combined_name`` and
        ``y_col``.
    y_col : str
        Numeric column plotted on the y axis.
    ruler_col : str
        Column that positions the reference rule. This argument used to be
        ignored in favour of a hard-coded ``ruler_100_pct``. It is now
        honoured; if ``df`` has no column with that name the chart falls back
        to ``ruler_100_pct`` so that existing calls keep rendering the same.
    title, subtitle : str
        Chart title and subtitle.

    Returns
    -------
    alt.Chart
        The faceted chart.
    """
    tooltip_cols = [
        "direction_id",
        "time_period",
        "route_combined_name",
        "service_date",
        f"{y_col}_str",
        "variable",
    ]
    df = df.assign(
        time_period=df.time_period.str.replace("_", " ").str.title(),
        variable=df.variable.str.replace("_", " ").str.title(),
    ).reset_index(drop=True)

    # Upper bound of the y axis, chosen from the column name:
    #   "pct"        -> fixed 1.2
    #   "per_minute" -> column max rounded to the nearest integer
    #   otherwise    -> column max rounded to the nearest ten, plus 5
    # https://stackoverflow.com/questions/26454649/python-round-up-to-the-nearest-ten
    if "pct" in y_col:
        max_y = 1.2
    elif "per_minute" in y_col:
        max_y = round(df[y_col].max())
    else:
        max_y = round(df[y_col].max(), -1) + 5

    # String copy of the metric, used only in the tooltip.
    df[f"{y_col}_str"] = df[y_col].astype(str)

    # Honour the caller's ruler column when it exists; otherwise keep the
    # historical default.
    ruler_field = ruler_col if ruler_col in df.columns else "ruler_100_pct"
    ruler = (
        alt.Chart(df)
        .mark_rule(color="red", strokeDash=[10, 7])
        .encode(y=f"{ruler_field}:Q")
    )

    chart = (
        alt.Chart(df)
        .mark_circle(size=100)
        .encode(
            x=alt.X(
                "yearmonthdate(service_date):O",
                title="Date",
                axis=alt.Axis(format="%b %Y"),
            ),
            y=alt.Y(
                f"{y_col}:Q", title=labeling(y_col), scale=alt.Scale(domain=[0, max_y])
            ),
            color=alt.Color(
                "variable:N",
                title=labeling("variable"),
                scale=alt.Scale(range=blue_palette),
            ),
            tooltip=tooltip_cols,
        )
    )

    chart = chart + ruler
    chart = chart.facet(
        column=alt.Column("direction_id:N", title=labeling("direction_id")),
    ).properties(
        title={
            "text": [title],
            "subtitle": [subtitle],
        }
    )
    return chart

In [None]:
def base_facet_chart(
    df: pd.DataFrame,
    y_col: str,
    color_col: str,
    facet_col: str,
    title: str,
    subtitle: str,
):
    """Bar chart of ``y_col`` per service date, split into column facets.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``service_date``, ``direction_id``, ``time_period``,
        ``route_combined_name``, ``organization_name``, ``y_col``,
        ``color_col`` and ``facet_col``.
    y_col : str
        Numeric column plotted on the y axis.
    color_col : str
        Nominal column that colors the bars.
    facet_col : str
        Nominal column that defines one facet per value.
    title, subtitle : str
        Chart title and subtitle.

    Returns
    -------
    The faceted Altair chart.
    """
    hover_fields = [
        "direction_id",
        "time_period",
        "route_combined_name",
        "organization_name",
        "service_date",
        y_col,
        color_col,
    ]

    date_axis = alt.X(
        "yearmonthdate(service_date):O",
        title=["Service Date"],
        axis=alt.Axis(labelAngle=-45, format="%b %Y"),
    )
    value_axis = alt.Y(
        f"{y_col}:Q",
        title=labeling(y_col),
    )
    bar_color = alt.Color(
        f"{color_col}:N",
        title=labeling(color_col),
        scale=alt.Scale(range=blue_palette),
    )

    bars = (
        alt.Chart(df)
        .mark_bar(size=15, clip=True)
        .encode(x=date_axis, y=value_axis, color=bar_color, tooltip=hover_fields)
    )
    faceted = bars.facet(column=alt.Column(f"{facet_col}:N"))
    return faceted.properties(title={"text": title, "subtitle": subtitle})

In [None]:
def base_facet_with_ruler_chart(
    df: pd.DataFrame, y_col: str, ruler_col: str, title: str, subtitle: str
):
    """Bar chart of ``y_col`` per service date with a reference rule.

    Bars are both positioned and colored by ``y_col``. A dashed red rule is
    drawn at the mean of ``ruler_col``. The layered chart is faceted into one
    column per ``direction_id``.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``service_date``, ``direction_id``, ``time_period``,
        ``route_combined_name``, ``organization_name``, ``y_col`` and
        ``ruler_col``.
    y_col : str
        Numeric column plotted on the y axis and used for the bar color.
    ruler_col : str
        Column whose mean positions the rule.
    title, subtitle : str
        Chart title and subtitle.

    Returns
    -------
    The faceted Altair chart.
    """
    wanted_tooltip_cols = [
        "direction_id",
        "time_period",
        "route_combined_name",
        "organization_name",
        "service_date",
        y_col,
    ]
    # Selecting through the dataframe raises KeyError if a column is missing.
    hover_fields = df[wanted_tooltip_cols].columns.tolist()

    reference_rule = (
        alt.Chart(df)
        .mark_rule(color="red", strokeDash=[10, 7])
        .encode(y=f"mean({ruler_col}):Q")
    )

    date_axis = alt.X(
        "yearmonthdate(service_date):O",
        title=["Service Date"],
        axis=alt.Axis(labelAngle=-45, format="%b %Y"),
    )
    value_axis = alt.Y(
        f"{y_col}:Q",
        title=labeling(y_col),
    )
    value_color = alt.Color(
        f"{y_col}:Q",
        title=labeling(y_col),
        scale=alt.Scale(range=blue_palette),
    )
    bars = (
        alt.Chart(df)
        .mark_bar(size=15, clip=True)
        .encode(x=date_axis, y=value_axis, color=value_color, tooltip=hover_fields)
    )

    layered = bars + reference_rule
    return layered.facet(column=alt.Column("direction_id:N")).properties(
        title={"text": title, "subtitle": [subtitle]}
    )

In [None]:
# Single-route sample ("1 West Santa Barbara") used to prototype each chart
# below before wiring them to the route dropdown.
one_route = df_sched_vp_both.loc[
    df_sched_vp_both.route_combined_name == "1 West Santa Barbara"
]

#### Avg Scheduled Minutes (Done)
* How come `avg_scheduled_service_minutes` is a lot shorter than `total_rt_service_minutes` and `total_scheduled_service_minutes`?
* Kind of a boring chart? Should compare actual run time compared to service minutes?

In [None]:
# one_route[one_route.time_period == "all_day"][['avg_scheduled_service_minutes']]

In [None]:
grouped_bar_chart(
    df=one_route[one_route.time_period == "all_day"],
    color_col="direction_id",
    y_col="avg_scheduled_service_minutes",
    offset_col="direction_id",
    title="Average Scheduled Minutes",
    subtitle="The average minutes a trip is scheduled to run.",
)

#### Timeliness #1 (Don't Use, too  confusing)

In [None]:
one_route[one_route.time_period == "all_day"].head(1)

In [None]:
avg_rt_service_minutes = 16.98

In [None]:
avg_scheduled_service_minutes = 15

In [None]:
avg_rt_service_minutes / avg_scheduled_service_minutes

In [None]:
one_route.rt_sched_journey_ratio.describe()

In [None]:
heatmap(
    one_route,
    "rt_sched_journey_ratio_cat",
    "Realtime vs. Scheduled Trip Times",
    "Dividing the average of actual trip times by the average of scheduled trip times gives an idea of how on schedule a trip on this route typically is.",
    "A ratio of 1.13 indicates a trip is late 13% over its scheduled time.",
    "",
)

#### Timeliness #2 (Done)

In [None]:
timeliness = _report_utils.timeliness_trips(df)

In [None]:
timeliness.sample()

In [None]:
# Timeliness breakdown for direction 0. The subtitle now describes this chart
# (it previously described the schedule/VP category chart) and matches the
# wording used for the same chart inside `filtered_route`.
base_facet_chart(
    timeliness.loc[timeliness.direction_id == 0],
    "value",
    "variable",
    "time_period",
    "Breakdown of Trips by Categories for Direction 0",
    "Categorizing whether a trip is early, late, or ontime. A trip is on time if it arrives 5 minutes later or earlier than scheduled.",
)

#### Total Scheduled Trips (Don't Use, boring)

In [None]:
# If we take away direction, see how many times a route is scheduled to run:
# average n_scheduled_trips per service date and time period, which averages
# over whatever direction rows exist for each pair.
total_scheduled_trip = (
    one_route.groupby(["service_date", "time_period"])
    .agg({"n_scheduled_trips": "mean"})
    .reset_index()
)

In [None]:
# Halve the averaged trip counts (presumably to account for the two
# directions; confirm).
# NOTE: this overwrites the column in place, so re-running the cell halves the
# values again.
total_scheduled_trip.n_scheduled_trips = total_scheduled_trip.n_scheduled_trips / 2

In [None]:
total_scheduled_trip.head()

In [None]:
one_route.groupby(["service_date", "direction_id", "time_period"]).agg(
    {"n_scheduled_trips": "max"}
).head()

In [None]:
grouped_bar_chart(
    df=one_route.loc[one_route.time_period != "all_day"],
    color_col="time_period",
    y_col="n_scheduled_trips",
    offset_col="direction_id",
    title="Total Scheduled Trips",
    subtitle="How many times per day is this route is scheduled to run in one particular direction.",
)

#### Frequency (Done)
* Maybe shouldn't be a chart since there doesn't seem to be a lot of data for this across a lot of the routes?
* What does frequency mean?
* Simplify down to not take direction_id into consideration?

In [None]:
one_route[["frequency", "frequency_cat"]].sample(5)

In [None]:
heatmap(
    one_route,
    "frequency_cat",
    "Frequency of Route",
    "Frequency tracks the number of times per hour this route goes by direction and time period.",
    "For example, a frequency of 2.3 going in the direction of ID 1 means the bus passes by this direction about",
    "twice an hour.",
)

#### Speed MPH (Done)
* Needs a different type of chart.

In [None]:
base_facet_line(
    one_route,
    "speed_mph",
    "Average Speed",
    "The average miles per hour the bus travels by direction and time of day.",
)

#### VP per Minute (Done)
* Heatmap too confusing & detailed

In [None]:
heatmap(
    one_route,
    "vp_per_minute_cat",
    "Vehicle Positions per Minute",
    "Vehicle positions are recorded each time the GPS on a bus pings.",
    "The number of times vehicle positions collected reflect the density and reliabilty of realtime data",
    "Trips should be in the 3+ VP per minute category.",
)

In [None]:
base_facet_with_ruler_chart(
    one_route.loc[one_route.time_period == "all_day"],
    "vp_per_minute",
    "ruler_for_vp_per_min",
    "Vehicle Positions per Minute",
    "Trips should have 2+ vehicle positions per minute.",
)

#### Spatial Accuracy (Done)
* Multiply it by 100? Should this be rounded or not?

In [None]:
base_facet_with_ruler_chart(
    one_route.loc[one_route.time_period == "all_day"],
    "pct_in_shape",
    "ruler_100_pct",
    "Spatial Accuracy",
    "The percentage of vehicle positions that fall within the static scheduled route shape reflects the accuracy of the spatial, realtime data.",
)

#### % RT journey with 1+/2+ vp (goal line = 100%) - use all_day, one chart shared for 1+ and 2+ (Done need subtitle)

In [None]:
one_route[["pct_rt_journey_atleast1_vp", "pct_rt_journey_atleast2_vp"]].sample(3)

In [None]:
def pct_vp_journey(df: pd.DataFrame, col1: str, col2: str) -> pd.DataFrame:
    """Reshape two metric columns into long format for a shared chart.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the identifier columns listed in ``id_cols`` below plus
        ``col1`` and ``col2``.
    col1, col2 : str
        The two columns to stack.

    Returns
    -------
    pd.DataFrame
        One row per original row and metric: the identifier columns, then
        ``variable`` (``col1`` or ``col2``) and ``value``. All other input
        columns are dropped.
    """
    id_cols = [
        "service_date",
        "organization_name",
        "route_combined_name",
        "direction_id",
        "time_period",
        "route_id",
        "ruler_100_pct",
    ]
    metric_cols = [col1, col2]

    # Trim to the needed columns, then stack the two metrics.
    trimmed = df[id_cols + metric_cols]
    return pd.melt(trimmed, id_vars=id_cols, value_vars=metric_cols)

In [None]:
pct_rt = pct_vp_journey(
    one_route, "pct_rt_journey_atleast1_vp", "pct_rt_journey_atleast2_vp"
)

In [None]:
base_facet_circle(
    pct_rt.loc[pct_rt.time_period == "all_day"],
    "value",
    "ruler_100_pct",
    "Percentage of Realtime Trips with 1+ and 2+ Vehicle Positions",
    "The goal is for almost 100% of trips to have 2 or more Vehicle Positions per minute.",
)

#### % schedule journey with 1+/2+ vp (goal line = 100%) - use all_day, one chart shared for 1+ and 2+ (Done need subtitle)

In [None]:
schd_rt = pct_vp_journey(
    one_route, "pct_sched_journey_atleast1_vp", "pct_sched_journey_atleast2_vp"
)

In [None]:
base_facet_circle(
    schd_rt.loc[schd_rt.time_period == "all_day"],
    "value",
    "ruler_100_pct",
    "Percentage of Scheduled Trips with 1+ and 2+ Vehicle Positions",
    "The goal is for almost 100% of trips to have 2 or more Vehicle Positions per minute.",
)

#### Number of trips by sched_vp_category (/operator_schedule_rt_category)
* Why is the crosswalk missing SBMTD schedule??

In [None]:
# Read the operator_schedule_rt_category digest parquet from the RT_SCHED_GCS
# path (not filtered to a single operator).
op_sched_rt_cat = pd.read_parquet(
    f"{RT_SCHED_GCS}digest/operator_schedule_rt_category.parquet"
)

In [None]:
list(df.schedule_gtfs_dataset_key.unique())

In [None]:
test = _report_utils.load_operator_schedule_rt_category("de792182088eecc3d5c0bd3f1df62965")

In [None]:
op_sched_rt_cat.schedule_gtfs_dataset_key.nunique()

In [None]:
# op_routes = pd.read_parquet(
#    f"{RT_SCHED_GCS}digest/operator_routes.parquet")

In [None]:
# Example crosswalk path:
# gs://calitp-analytics-data/data-analyses/gtfs_schedule/crosswalk/gtfs_key_organization_2024-03-13.parquet
# The file name ends in a date, so take the date part of the most recent
# service date to build it.
crosswalk_date = most_recent_date.date()

In [None]:
# Load the crosswalk file for `crosswalk_date` and keep only the two columns
# needed to attach an organization name to a schedule feed key.
crosswalk = pd.read_parquet(
    f"{SCHED_GCS}crosswalk/gtfs_key_organization_{crosswalk_date}.parquet"
)[["schedule_gtfs_dataset_key", "organization_name"]]

In [None]:
crosswalk.shape

In [None]:
# Fill missing counts BEFORE casting. Casting a column that contains NaN to
# int raises, so with the previous astype(int).fillna(0) order the fill could
# never take effect.
op_sched_rt_cat.n_trips = op_sched_rt_cat.n_trips.fillna(0).astype(int)

In [None]:
# op_sched_rt_cat = pd.merge(op_sched_rt_cat, crosswalk, on = "schedule_gtfs_dataset_key", how = "left")

In [None]:
op_sched_rt_cat.sample()

In [None]:
crosswalk.loc[crosswalk.schedule_gtfs_dataset_key == "de792182088eecc3d5c0bd3f1df62965"]

In [None]:
op_sched_rt_cat.head(4)

In [None]:
op_sched_rt_cat.info()

In [None]:
def basic_bar_chart(
    df: pd.DataFrame, y_col: str, color_col: str, title: str, subtitle: str
):
    """Plain bar chart of ``y_col`` per service date, colored by ``color_col``.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``service_date``, ``y_col`` and ``color_col``. Every
        column of ``df`` is shown in the tooltip.
    y_col : str
        Numeric column plotted on the y axis.
    color_col : str
        Nominal column that colors the bars.
    title, subtitle : str
        Chart title and subtitle.

    Returns
    -------
    The configured Altair chart (500 x 300).
    """
    date_axis = alt.X(
        "yearmonthdate(service_date):O",
        title="Service Date",
        axis=alt.Axis(format="%b %Y"),
    )
    value_axis = alt.Y(f"{y_col}:Q", title=labeling(y_col))
    bar_color = alt.Color(
        f"{color_col}:N",
        scale=alt.Scale(range=cp.CALITP_SEQUENTIAL_COLORS),
    )

    bars = (
        alt.Chart(df)
        .mark_bar()
        .encode(
            x=date_axis,
            y=value_axis,
            color=bar_color,
            tooltip=df.columns.tolist(),
        )
    )
    return bars.properties(
        title={"text": title, "subtitle": subtitle},
        width=500,
        height=300,
    )

In [None]:
basic_bar_chart(
    op_sched_rt_cat,
    "n_trips",
    "sched_rt_category",
    "Trips that were found in GTFS, Schedule, or Both Data Sources",
    "",
)

#### Text

In [None]:
def create_text_table(df: pd.DataFrame, direction_id: str):
    """Render route statistics for one direction as lines of text.

    The frame is filtered to rows whose ``direction_id`` equals the argument
    (the notebook passes ``0`` and ``1``), melted so each statistic becomes
    one row, and drawn as a text-only chart with one ``"<Statistic>: <value>"``
    line per row.

    Parameters
    ----------
    df : pd.DataFrame
        Route statistics containing ``route_combined_name``,
        ``direction_id`` and the statistic columns listed in ``stat_cols``.
    direction_id : str
        Value compared against the ``direction_id`` column; it also appears
        in the chart title.

    Returns
    -------
    The text-only Altair chart (500 x 300).
    """
    stat_cols = [
        "avg_scheduled_service_minutes",
        "avg_stop_miles",
        "n_scheduled_trips",
        "sched_rt_category",
        "peak_avg_speed",
        "peak_scheduled_trips",
        "peak_hourly_freq",
        "offpeak_avg_speed",
        "offpeak_scheduled_trips",
        "offpeak_hourly_freq",
    ]

    one_direction = df.loc[df.direction_id == direction_id].reset_index(drop=True)
    long_stats = one_direction.melt(
        id_vars=["route_combined_name", "direction_id"],
        value_vars=stat_cols,
    )

    # Constant x position so every line of text is centered in the chart.
    long_stats["Zero"] = 0
    long_stats["variable"] = long_stats["variable"].str.replace("_", " ").str.title()
    long_stats = long_stats.sort_values(by=["direction_id"]).reset_index(drop=True)
    long_stats["combo_col"] = (
        long_stats["variable"].astype(str) + ": " + long_stats["value"].astype(str)
    )

    return (
        alt.Chart(long_stats)
        .mark_text()
        .encode(
            x=alt.X("Zero:Q", axis=None),
            y=alt.Y("combo_col", axis=None),
            text="combo_col:N",
        )
        .properties(
            title=f"Route Statistics for Direction {direction_id}",
            width=500,
            height=300,
        )
    )

In [None]:
create_text_table(table_df, 0)

#### Putting it all together

In [None]:
def filtered_route(
    df: pd.DataFrame,
) -> alt.Chart:
    """
    Stack every route-level chart into one view driven by a route dropdown.

    https://stackoverflow.com/questions/58919888/multiple-selections-in-altair

    Parameters
    ----------
    df : pd.DataFrame
        Schedule / vehicle-position metrics for one operator (the frame
        returned by ``_report_utils.load_schedule_vp_metrics``).

    Returns
    -------
    alt.Chart
        A vertical concatenation of all charts, each filtered by the same
        ``route_combined_name`` dropdown and each with its own color scale.

    Fixes relative to the previous version
    --------------------------------------
    * ``timeliness_trips`` is called through ``_report_utils`` (it is not
      defined in this notebook).
    * The scheduled-journey frame is built from the ``pct_sched_*`` columns;
      it used to repeat the ``pct_rt_*`` columns.
    * The scheduled-journey chart is given the ``ruler_100_pct`` column
      instead of a name that is not a column.
    * ``total_sched_vp_trips`` is defined before use (see the NOTE below).
    * The speed chart is tied to the dropdown like every other chart.
    """
    route_dropdown = alt.binding_select(
        options=sorted(df["route_combined_name"].unique().tolist()),
        name="Routes ",
    )

    # One selection shared by all charts so a single dropdown controls them.
    route_selector = alt.selection_point(
        fields=["route_combined_name"],
        bind=route_dropdown,
    )

    def _bind_to_route(chart):
        """Attach the shared dropdown to a chart and filter it to that route."""
        return chart.add_params(route_selector).transform_filter(route_selector)

    # ---- Data -------------------------------------------------------------
    # Rows found in both the schedule and the vehicle positions.
    df_sched_vp_both = df[df.sched_rt_category == "schedule_and_vp"].reset_index(
        drop=True
    )

    # Same, restricted to the all_day time period.
    all_day = df_sched_vp_both.loc[
        df_sched_vp_both.time_period == "all_day"
    ].reset_index(drop=True)

    # NOTE(review): `total_sched_vp_trips` was never defined. Given the chart
    # subtitle ("... or both for All Day") it is presumably every
    # sched_rt_category for the all_day period; confirm the intended frame.
    total_sched_vp_trips = df.loc[df.time_period == "all_day"].reset_index(drop=True)

    # Route stats feed the two text tables.
    route_stats_df = _report_utils.route_stats(df)

    # Reshaped frames for the timeliness and journey-coverage charts.
    timeliness_df = _report_utils.timeliness_trips(df_sched_vp_both)
    rt_journey_vp = pct_vp_journey(
        all_day, "pct_rt_journey_atleast1_vp", "pct_rt_journey_atleast2_vp"
    )
    sched_journey_vp = pct_vp_journey(
        all_day, "pct_sched_journey_atleast1_vp", "pct_sched_journey_atleast2_vp"
    )

    # ---- Charts -----------------------------------------------------------
    avg_scheduled_min = _bind_to_route(
        grouped_bar_chart(
            df=all_day,
            color_col="direction_id",
            y_col="avg_scheduled_service_minutes",
            offset_col="direction_id",
            title="Average Scheduled Minutes",
            subtitle="The average minutes a trip is scheduled to run.",
        )
    )

    timeliness_trips_dir_0 = _bind_to_route(
        base_facet_chart(
            timeliness_df.loc[timeliness_df.direction_id == 0],
            "value",
            "variable",
            "time_period",
            "Breakdown of Trips by Categories for Direction 0",
            "Categorizing whether a trip is early, late, or ontime. A trip is on time if it arrives 5 minutes later or earlier than scheduled.",
        )
    )
    timeliness_trips_dir_1 = _bind_to_route(
        base_facet_chart(
            timeliness_df.loc[timeliness_df.direction_id == 1],
            "value",
            "variable",
            "time_period",
            "Breakdown of Trips by Categories for Direction 1",
            "Categorizing whether a trip is early, late, or ontime. A trip is on time if it arrives 5 minutes later or earlier than scheduled.",
        )
    )

    frequency = _bind_to_route(
        heatmap(
            df_sched_vp_both,
            "frequency_cat",
            "Frequency of Route",
            "Frequency tracks the number of times per hour this route goes by direction and time period.",
            "For example, a frequency of 2.3 going in the direction of ID 1 means the bus passes by this direction about",
            "twice an hour.",
        )
    )
    speed = _bind_to_route(
        base_facet_line(
            df_sched_vp_both,
            "speed_mph",
            "Average Speed",
            "The average miles per hour the bus travels by direction and time of day.",
        )
    )
    vp_per_min = _bind_to_route(
        base_facet_with_ruler_chart(
            all_day,
            "vp_per_minute",
            "ruler_for_vp_per_min",
            "Vehicle Positions per Minute",
            "Trips should have 2+ vehicle positions per minute.",
        )
    )
    rt_vp_per_min = _bind_to_route(
        base_facet_circle(
            rt_journey_vp,
            "value",
            "ruler_100_pct",
            "Percentage of Realtime Trips with 1+ and 2+ Vehicle Positions",
            "The goal is for almost 100% of trips to have 2 or more Vehicle Positions per minute.",
        )
    )
    sched_vp_per_min = _bind_to_route(
        base_facet_circle(
            sched_journey_vp,
            "value",
            "ruler_100_pct",
            "Percentage of Scheduled Trips with 1+ and 2+ Vehicle Positions",
            "The goal is for almost 100% of trips to have 2 or more Vehicle Positions per minute.",
        )
    )
    spatial_accuracy = _bind_to_route(
        base_facet_with_ruler_chart(
            all_day,
            "pct_in_shape",
            "ruler_100_pct",
            "Spatial Accuracy",
            "The percentage of vehicle positions that fall within the static scheduled route shape reflects the accuracy of the spatial, realtime data.",
        )
    )

    trip_category = _bind_to_route(
        base_facet_chart(
            total_sched_vp_trips,
            "n_vp_trips",
            "sched_rt_category",
            "direction_id",
            "Breakdown of Trips by Categories",
            "Trips broken down by VP Only (found only in real-time data), Schedule Only (found only in schedule data), or both for All Day.",
        )
    )
    text_dir0 = _bind_to_route(create_text_table(route_stats_df, 0))
    text_dir1 = _bind_to_route(create_text_table(route_stats_df, 1))

    chart_list = [
        avg_scheduled_min,
        timeliness_trips_dir_0,
        timeliness_trips_dir_1,
        frequency,
        speed,
        vp_per_min,
        rt_vp_per_min,
        sched_vp_per_min,
        spatial_accuracy,
        trip_category,
        text_dir0,
        text_dir1,
    ]

    # Each chart keeps its own color scale and legend rather than sharing one
    # across the whole stack.
    return alt.vconcat(*chart_list).resolve_scale(color="independent")

In [None]:
filtered_route(df)