## Round 1 
* Updating existing charts made by Tiffany. https://gtfs-digest--cal-itp-data-analyses.netlify.app/
* cd rt_segment_speeds && pip install altair_transform && pip install -r requirements.txt && cd ../_shared_utils && make setup_env

Links
* https://github.com/cal-itp/data-analyses/issues/1059
* https://docs.google.com/document/d/1I1WiqlmU06W6iLCi7cZQrOCLILkrEfABEkcU0Jys7f0/edit
* https://route-speeds--cal-itp-data-analyses.netlify.app/name_bay-area-511-muni-schedule/0__report__name_bay-area-511-muni-schedule
* https://posit-dev.github.io/great-tables/get-started/nanoplots.html
* https://docs.pola.rs/py-polars/html/reference/api/polars.from_pandas.html
* https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/_rt_scheduled_utils.py
* https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/_threshold_utils.py

In [1]:
%%capture
# import warnings
# warnings.filterwarnings('ignore')

import _section2_utils as section2
import altair as alt
import calitp_data_analysis.magics
import geopandas as gpd
import great_tables as gt
import pandas as pd
from calitp_data_analysis import calitp_color_palette as cp
from great_tables import md
from IPython.display import HTML, Markdown, display
from segment_speed_utils.project_vars import RT_SCHED_GCS, SCHED_GCS
from shared_utils import rt_dates, rt_utils

alt.renderers.enable("html")
alt.data_transformers.enable("default", max_rows=None)
from typing import List, Union

from altair_transform.extract import extract_transform
from altair_transform.transform import visit
from altair_transform.utils import to_dataframe

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
name = "Bay Area 511 AC Transit Schedule"

In [4]:
# %%capture_parameters
# name

### General Functions

In [5]:
red_green_yellow = ["#ec5d3b", "#fde18d", "#7cc665"]

### Data

In [6]:
df = section2.load_schedule_vp_metrics(name)

In [7]:
most_recent_date = df.service_date.max()

#### Unsure what these metrics mean
* https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/segment_speed_utils/metrics.py
* avg_stop_miles: ??
* n_scheduled_trips: over the course of a day
* frequency: # of times the route runs per hour 
* pct_typology: 44% of the route fits into downtown local, the rest of the 54% falls under other categories??
* is_early: # of trips in that route that are early
* is_on_time: # of trips that are ontime.
* n_vp_trips: # of trips in total
* Difference between pct_rt_journey_atleast1vp and pct_sched_journey_atleast1vp?
* rt_sched_journey_ratio: how many times longer/shorter the actual trip took compared to the scheduled trip?

In [8]:
df.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,direction_id,time_period,avg_scheduled_service_minutes,avg_stop_miles,n_scheduled_trips,frequency,service_date,minutes_atleast1_vp,minutes_atleast2_vp,total_rt_service_minutes,total_scheduled_service_minutes,total_vp,vp_in_shape,is_early,is_ontime,is_late,n_vp_trips,vp_per_minute,pct_in_shape,pct_rt_journey_atleast1_vp,pct_rt_journey_atleast2_vp,pct_sched_journey_atleast1_vp,pct_sched_journey_atleast2_vp,rt_sched_journey_ratio,avg_rt_service_minutes,sched_rt_category,speed_mph,name,route_long_name,route_short_name,route_combined_name,route_id,typology,schedule_source_record_id,base64_url,organization_source_record_id,organization_name,caltrans_district,rt_sched_journey_ratio_cat,frequency_cat,vp_per_minute_cat,ruler_100_pct,ruler_for_vp_per_min
270307,444700afe086ed24e3cb888cecd3037c,0.0,all_day,31.37,0.19,63,2.62,2023-03-15,3103,3019,5859.07,1976.0,8552,8551,0,0,63,63,1.46,100.0,53.0,52.0,100.0,100.0,2.97,93.0,schedule_and_vp,11.03,Bay Area 511 AC Transit Schedule,E. 14th St. - Mission,10,10 E. 14th St. - Mission,10,downtown_local,recJjD8JT53sK302o,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1BQw==,recOZgevYf7Jimm9L,Alameda-Contra Costa Transit District,04 - Oakland,Late by 50%+ of the scheduled time,2 trips/hour,<3 pings/minute,100,2
270308,444700afe086ed24e3cb888cecd3037c,0.0,offpeak,31.37,0.19,36,2.25,2023-03-15,1727,1683,4504.88,1100.0,4747,4746,0,0,36,36,1.05,100.0,38.0,37.0,100.0,100.0,4.1,125.14,schedule_and_vp,11.41,Bay Area 511 AC Transit Schedule,E. 14th St. - Mission,10,10 E. 14th St. - Mission,10,downtown_local,recJjD8JT53sK302o,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1BQw==,recOZgevYf7Jimm9L,Alameda-Contra Costa Transit District,04 - Oakland,Late by 50%+ of the scheduled time,2 trips/hour,<3 pings/minute,100,2


In [9]:
df.time_period.value_counts()

all_day    31699
peak       26504
offpeak    23932
Name: time_period, dtype: int64

### Test out Altair `extract_data`
* https://altair-viz.github.io/user_guide/transform/index.html

In [10]:
__all__ = ["apply", "extract_data", "transform_chart"]

In [11]:
def apply(
    df: pd.DataFrame,
    transform: Union[alt.Transform, List[alt.Transform]],
    inplace: bool = False,
) -> pd.DataFrame:
    """Apply transform or transforms to dataframe.

    Parameters
    ----------
    df : pd.DataFrame
    transform : list|dict
        A transform specification or list of transform specifications.
        Each specification must be valid according to Altair's transform
        schema.
    inplace : bool
        If True, then dataframe may be modified in-place. Default: False.

    Returns
    -------
    df_transformed : pd.DataFrame
        The transformed dataframe.

    Example
    -------
    >>> import pandas as pd
    >>> data = pd.DataFrame({'x': range(5), 'y': list('ABCAB')})
    >>> chart = alt.Chart(data).transform_aggregate(sum_x='sum(x)', groupby=['y'])
    >>> apply(data, chart.transform)
       y  sum_x
    0  A      3
    1  B      5
    2  C      2
    """
    if not inplace:
        df = df.copy()
    if transform is alt.Undefined:
        return df
    return visit(transform, df)

In [12]:
def extract_data(
    chart: alt.Chart, apply_encoding_transforms: bool = True
) -> pd.DataFrame:
    """Extract transformed data from a chart.

    This only works with data and transform defined at the
    top level of the chart.

    Parameters
    ----------
    chart : alt.Chart
        The chart instance from which the data and transform
        will be extracted
    apply_encoding_transforms : bool
        If True (default), then apply transforms specified within an
        encoding as well as those specified directly in the transforms
        attribute.

    Returns
    -------
    df_transformed : pd.DataFrame
        The extracted and transformed dataframe.

    Example
    -------
    >>> import pandas as pd
    >>> data = pd.DataFrame({'x': range(5), 'y': list('ABCAB')})
    >>> chart = alt.Chart(data).mark_bar().encode(x='sum(x)', y='y')
    >>> extract_data(chart)
       y  sum_x
    0  A      3
    1  B      5
    2  C      2
    """
    if apply_encoding_transforms:
        chart = extract_transform(chart)
    return apply(to_dataframe(chart.data, chart), chart.transform)

In [13]:
data = pd.DataFrame({"x": range(5), "y": list("ABCAB")})

In [14]:
chart = alt.Chart(data).mark_bar().encode(x="sum(x)", y="y")

In [15]:
chart

In [16]:
extract_data(chart)

Unnamed: 0,y,sum_x
0,A,3
1,B,5
2,C,2


In [17]:
def extract_data_altair(chart):
    chart_dict = chart.to_dict()
    encoding = chart_dict["datasets"]
    df1 = pd.DataFrame(encoding)

    column = df1.columns[0]
    normalized_df = pd.json_normalize(df1[column])
    # Combine the original DataFrame with the extracted values DataFrame
    df2 = pd.concat([df1.drop(column, axis=1), normalized_df], axis=1)
    return df2

In [18]:
df.sched_rt_category.unique()

['schedule_and_vp', 'schedule_only', 'vp_only']
Categories (3, object): ['schedule_only', 'vp_only', 'schedule_and_vp']

### Only use `schedule_and_vp` for charts!!

In [19]:
df_sched_vp_both = df[df.sched_rt_category == "schedule_and_vp"]

In [20]:
df_sched_vp_both.shape

(62961, 44)

#### Graph Functions
* What if a subtitle needs to be broken up?

In [21]:
def grouped_bar_chart(
    df: pd.DataFrame,
    color_col: str,
    y_col: str,
    offset_col: str,
    title: str,
    subtitle: str,
):
    df = df.assign(
        time_period=df.time_period.str.replace("_", " ").str.title()
    ).reset_index(drop=True)

    df[y_col] = df[y_col].fillna(0).astype(int)
    tooltip_cols = [
        "direction_id",
        "time_period",
        "route_combined_name",
        "organization_name",
        "service_date",
        color_col,
        y_col,
    ]
    chart = (
        alt.Chart(df)
        .mark_bar(size=10)
        .encode(
            x=alt.X(
                "yearmonthdate(service_date):O",
                title=["Grouped by Direction ID", "Date"],
                axis=alt.Axis(format="%b %Y"),
            ),
            y=alt.Y(f"{y_col}:Q", title=labeling(y_col)),
            xOffset=alt.X(f"{offset_col}:N", title=labeling(offset_col)),
            color=alt.Color(
                f"{color_col}:N",
                title=labeling(color_col),
                scale=alt.Scale(
                   range=red_green_yellow,
                ),
            ),
            tooltip=tooltip_cols,
        )
    )
    chart = (chart).properties(
        title={
            "text": [title],
            "subtitle": [subtitle],
        },
        width=500,
        height=300,
    )

    return chart

In [22]:
def heatmap(
    df: pd.DataFrame,
    color_col: str,
    title: str,
    subtitle1: str,
    subtitle2: str,
    subtitle3: str,
):
    df = df.assign(
        time_period=df.time_period.str.replace("_", " ").str.title()
    ).reset_index(drop=True)

    # Grab original column that wasn't categorized
    original_col = color_col.replace("_cat", "")

    tooltip_cols = [
        "direction_id",
        "time_period",
        "route_combined_name",
        "organization_name",
        color_col,
        original_col,
    ]

    # Round
    # df[color_col] = df[color_col].round(1)
    chart = (
        alt.Chart(df)
        .mark_rect(size=30)
        .encode(
            x=alt.X(
                "yearmonthdate(service_date):O",
                axis=alt.Axis(labelAngle=-45, format="%b %Y"),
                title=["Grouped by Direction ID", "Service Date"],
            ),
            y=alt.Y("time_period:O", title=["Time Period"]),
            xOffset=alt.X(f"direction_id:N", title="Direction ID"),
            color=alt.Color(
                f"{color_col}:N",
                title=labeling(color_col),
                scale=alt.Scale(range=cp.CALITP_SEQUENTIAL_COLORS),
            ),
            tooltip=tooltip_cols,
        )
        .properties(
            title={"text": [title], "subtitle": [subtitle1, subtitle2, subtitle3]},
            width=500,
            height=300,
        )
    )

    text = chart.mark_text(baseline="middle").encode(
        alt.Text("direction_id"), color=alt.value("white")
    )

    final_chart = chart + text
    return final_chart

In [23]:
def base_facet_line(
    df: pd.DataFrame, y_col: str, title: str, subtitle: str
) -> alt.Chart:

    df = df.assign(
        time_period=df.time_period.str.replace("_", " ").str.title()
    ).reset_index(drop=True)
    # https://stackoverflow.com/questions/26454649/python-round-up-to-the-nearest-ten

    if "pct" in y_col:
        max_y = 1.2
    elif "per_minute" in y_col:
        max_y = round(df[y_col].max())
    else:
        max_y = round(df[y_col].max(), -1) + 5

    df[f"{y_col}_str"] = df[y_col].astype(str)

    tooltip_cols = [
        "route_combined_name",
        "route_id",
        "direction_id",
        "time_period",
        f"{y_col}_str",
    ]
    
    chart = (
        alt.Chart(df)
        .mark_line(size=5)
        .encode(
            x=alt.X(
                "yearmonthdate(service_date):O",
                title="Date",
                axis=alt.Axis(labelAngle=-45, format="%b %Y")
            ),
            y=alt.Y(
                f"{y_col}:Q", title=labeling(y_col), scale=alt.Scale(domain=[0, max_y])
            ),
            color=alt.Color(
                "time_period:N",
                title=labeling("time_period"),
                scale=alt.Scale(range=red_green_yellow),
            ),
            tooltip=tooltip_cols,
        )
    )

    chart = chart.properties(width=250, height=300)
    chart = chart.facet(
        column=alt.Column("direction_id:N", title=labeling("direction_id")),
    ).properties(
        title={
            "text": [title],
            "subtitle": [subtitle],
        }
    )
    return chart

In [24]:
def base_facet_circle(
    df: pd.DataFrame, y_col: str, ruler_col: str, title: str, subtitle: str
) -> alt.Chart:

    tooltip_cols = [
        "direction_id",
        "time_period",
        "route_combined_name",
        "service_date",
        f"{y_col}_str",
        "variable",
    ]
    df = df.assign(
        time_period=df.time_period.str.replace("_", " ").str.title(),
        variable=df.variable.str.replace("_", " ").str.title(),
    ).reset_index(drop=True)
    # https://stackoverflow.com/questions/26454649/python-round-up-to-the-nearest-ten

    if "pct" in y_col:
        max_y = 100
    elif "value" in y_col:
        max_y = 100
    else:
        max_y = round(df[y_col].max(), -1) + 5

    df[f"{y_col}_str"] = df[y_col].astype(str)

    ruler = (
        alt.Chart(df)
        .mark_rule(color="red", strokeDash=[10, 7])
        .encode(y=f"ruler_100_pct:Q")
    )

    chart = (
        alt.Chart(df)
        .mark_circle(size=100)
        .encode(
            x=alt.X(
                "yearmonthdate(service_date):O",
                title="Date",
                axis=alt.Axis(labelAngle=-45, format="%b %Y"),
            ),
            y=alt.Y(
                f"{y_col}:Q", title=labeling(y_col), scale=alt.Scale(domain=[0, max_y])
            ),
            color=alt.Color(
                "variable:N",
                title=labeling("variable"),
                scale=alt.Scale(range=red_green_yellow),
            ),
            tooltip=tooltip_cols,
        )
    )

    chart = chart + ruler
    chart = chart.facet(
        column=alt.Column("direction_id:N", title=labeling("direction_id")),
    ).properties(
        title={
            "text": [title],
            "subtitle": [subtitle],
        }
    )
    return chart

In [25]:
def base_facet_chart(
    df: pd.DataFrame,
    y_col: str,
    color_col: str,
    facet_col: str,
    title: str,
    subtitle: str,
):
    tooltip_cols = [
        "direction_id",
        "time_period",
        "route_combined_name",
        "organization_name",
        "service_date",
        y_col,
        color_col,
    ]
    chart = (
        (
            alt.Chart(df)
            .mark_bar(size=15, clip=True)
            .encode(
                x=alt.X(
                    "yearmonthdate(service_date):O",
                    title=["Service Date"],
                    axis=alt.Axis(labelAngle=-45, format="%b %Y"),
                ),
                y=alt.Y(
                    f"{y_col}:Q",
                    title=labeling(y_col),
                ),
                color=alt.Color(
                    f"{color_col}:N",
                    title=labeling(color_col),
                    scale=alt.Scale(range=red_green_yellow),
                ),
                tooltip=tooltip_cols,
            )
        )
        .facet(
            column=alt.Column(
                f"{facet_col}:N",
            )
        )
        .properties(
            title={
                "text": title,
                "subtitle": subtitle,
            }
        )
    )
    return chart

In [26]:
def base_facet_with_ruler_chart(
    df: pd.DataFrame, y_col: str, ruler_col: str, title: str, subtitle: str
):
    
    if "pct" in y_col:
        max_y = 100
    elif "vp" in y_col:
        max_y = 5
    else:
        max_y = round(df[y_col].max(), -1) + 5

    df[f"{y_col}_str"] = df[y_col].astype(str)
    
    tooltip_cols = [
        "direction_id",
        "time_period",
        "route_combined_name",
        "organization_name",
        "service_date",
        y_col,
    ]
    ruler = (
        alt.Chart(df)
        .mark_rule(color="red", strokeDash=[10, 7])
        .encode(y=f"mean({ruler_col}):Q")
    )
    chart = (
        alt.Chart(df)
        .mark_bar(size=15, clip=True)
        .encode(
            x=alt.X(
                "yearmonthdate(service_date):O",
                title=["Service Date"],
                axis=alt.Axis(labelAngle=-45, format="%b %Y"),
            ),
            y=alt.Y(
                f"{y_col}:Q",
                title=labeling(y_col),
                scale=alt.Scale(domain=[0, max_y]
            )),
            color=alt.Color(
                f"{y_col}:Q",
                title=labeling(y_col),
                scale=alt.Scale(range=red_green_yellow),
            ),
            tooltip=df[tooltip_cols].columns.tolist(),
        )
    )

    chart = chart + ruler
    chart = chart.facet(column=alt.Column("direction_id:N",)).properties(
        title={
            "text": title,
            "subtitle": [subtitle],
        }
    )

    return chart

In [27]:
def labeling(word: str) -> str:
    return (
        word.replace("_", " ")
        .title()
        .replace("Pct", "%")
        .replace("Vp", "VP")
        .replace("Route Combined Name", "Route")
    )

In [35]:
df_sched_vp_both.route_combined_name.unique()

array(['10 E. 14th St. - Mission', '12 MLK Jr. - Temescal - Grand',
       '14 14th St - San Antonio - High St',
       '18 Solano - Shattuck - MLK Jr.',
       '1T Uptown Oakland\\ San Leandro BART Southbound OWL',
       '200 Decoto - Newark Blvd. - Mowry',
       '210 Fremont Blvd. - Mission San Jose',
       '212 Fremont Blvd. - Pacific Commons',
       '215 Osgood - Warm Springs - Landing',
       '216 Niles - Stevenson - NewPark',
       '217 Mission San Jose - Milpitas',
       '232 Mission - Decoto - NewPark', '239 Grimmer - Warm Springs',
       '251 Paseo Padre - Thornton - Cherry',
       '28 Alvarado - Castro Valley - B St.',
       '29 Hollis - Peralta - Lakeshore',
       '33 Piedmont - Harrison - Park Blvd.',
       '34 Estudillo - Davis - Meekland',
       '35 Estudillo - Davis -  Lewelling',
       '36 Dwight - Shellmound - Adeline',
       '376 Cutting - North Richmond Nights',
       '39 Skyline - Dimond - Fruitvale',
       '40 Foothill - Bancroft - Bay Fair',
     

In [29]:
# Has very little data: 652 Skyline High - Elmhurst Bay Area 511 AC Transit Schedule
# Has a lot of data: "18 Solano - Shattuck - MLK Jr."

In [37]:
one_route = df_sched_vp_both.loc[
    df_sched_vp_both.route_combined_name == '652 Skyline High - Elmhurst'
]

In [38]:
one_route.shape

(366, 44)

In [39]:
one_route.sample()

Unnamed: 0,schedule_gtfs_dataset_key,direction_id,time_period,avg_scheduled_service_minutes,avg_stop_miles,n_scheduled_trips,frequency,service_date,minutes_atleast1_vp,minutes_atleast2_vp,total_rt_service_minutes,total_scheduled_service_minutes,total_vp,vp_in_shape,is_early,is_ontime,is_late,n_vp_trips,vp_per_minute,pct_in_shape,pct_rt_journey_atleast1_vp,pct_rt_journey_atleast2_vp,pct_sched_journey_atleast1_vp,pct_sched_journey_atleast2_vp,rt_sched_journey_ratio,avg_rt_service_minutes,sched_rt_category,speed_mph,name,route_long_name,route_short_name,route_combined_name,route_id,typology,schedule_source_record_id,base64_url,organization_source_record_id,organization_name,caltrans_district,rt_sched_journey_ratio_cat,frequency_cat,vp_per_minute_cat,ruler_100_pct,ruler_for_vp_per_min
768438,c499f905e33929a641f083dad55c521e,0.0,all_day,24.0,0.16,1,0.04,2023-04-12,22,22,21.28,24.0,65,65,0,1,0,1,3.05,100.0,100.0,100.0,92.0,92.0,0.89,21.28,schedule_and_vp,11.83,Bay Area 511 AC Transit Schedule,Skyline High - Elmhurst,652,652 Skyline High - Elmhurst,652,rapid,recJjD8JT53sK302o,aHR0cHM6Ly9hcGkuNTExLm9yZy90cmFuc2l0L2RhdGFmZWVkcz9vcGVyYXRvcl9pZD1BQw==,recOZgevYf7Jimm9L,Alameda-Contra Costa Transit District,04 - Oakland,Early,<1 trip/hour,3+ pings per minute (target),100,2


In [40]:
one_route.service_date.value_counts()

2023-04-12    39
2023-05-17    39
2023-09-13    39
2023-11-15    39
2023-12-13    39
2024-01-17    39
2024-02-14    39
2024-03-13    39
2023-08-15    31
2023-10-11    19
2023-03-15     4
Name: service_date, dtype: int64

#### Avg Scheduled Minutes (Done)
* How come the average_scheduled_minutes is a lot shorter than total_rt_service_minutes and total_scheduled_service_minutes?
* Kind of a boring chart? Should compare actual run time compared to service minutes?

In [41]:
# one_route[one_route.time_period == "all_day"][['avg_scheduled_service_minutes']]

In [42]:
grouped_bar_chart(
    df=one_route[one_route.time_period == "all_day"],
    color_col="direction_id",
    y_col="avg_scheduled_service_minutes",
    offset_col="direction_id",
    title="Average Scheduled Minutes",
    subtitle="The average minutes a trip is scheduled to run.",
)

#### Timeliness #2 (Done)

In [43]:
timeliness = section2.timeliness_trips(one_route)

In [44]:
timeliness.sample()

Unnamed: 0,service_date,organization_name,route_combined_name,time_period,direction_id,variable,value
496,2024-01-17,Alameda-Contra Costa Transit District,652 Skyline High - Elmhurst,offpeak,1.0,is_late,1


In [45]:
base_facet_chart(
    timeliness.loc[timeliness.direction_id == 0],
    "value",
    "variable",
    "time_period",
    "Breakdown of Trips by Categories for Direction 0",
    "Trips broken down by VP Only (found only in real-time data), Schedule Only (found only in schedule data), or both.",
)

#### Total Scheduled Trips (Don't Use, boring)

In [46]:
# IF we take away direction, see how many times a route is scheduled to run
total_scheduled_trip = (
    one_route.groupby(["service_date", "time_period"])
    .agg({"n_scheduled_trips": "mean"})
    .reset_index()
)

In [47]:
total_scheduled_trip.n_scheduled_trips = total_scheduled_trip.n_scheduled_trips / 2

In [48]:
total_scheduled_trip.head()

Unnamed: 0,service_date,time_period,n_scheduled_trips
0,2023-03-15,all_day,0.5
1,2023-03-15,offpeak,0.5
2,2023-03-15,peak,0.5
3,2023-04-12,all_day,0.5
4,2023-04-12,offpeak,0.5


In [49]:
one_route.groupby(["service_date", "direction_id", "time_period"]).agg(
    {"n_scheduled_trips": "max"}
).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n_scheduled_trips
service_date,direction_id,time_period,Unnamed: 3_level_1
2023-03-15,0.0,all_day,1
2023-03-15,0.0,peak,1
2023-03-15,1.0,all_day,1
2023-03-15,1.0,offpeak,1
2023-04-12,0.0,all_day,1


In [50]:
grouped_bar_chart(
    df=one_route.loc[one_route.time_period != "all_day"],
    color_col="time_period",
    y_col="n_scheduled_trips",
    offset_col="direction_id",
    title="Total Scheduled Trips",
    subtitle="How many times per day is this route is scheduled to run in one particular direction.",
)

#### Frequency (Done)
* Maybe shouldn't be a chart since there doesn't seem to be a lot of data for this across a lot of the routes?
* What does frequency mean?
* Simplify down to not take direction_id into consideration?

In [51]:
def frequency_chart(df:pd.DataFrame):
    df = df.assign(
        time_period=df.time_period.str.replace("_", " ").str.title()
    ).reset_index(drop=True)
    chart = alt.Chart(df, width=180, height=alt.Step(10)).mark_bar().encode(
    alt.Y(
        "yearmonthdate(service_date):O",
        title="Date",
        axis=alt.Axis(format="%b %Y"),
    ),
    alt.X("frequency:Q", title = labeling("frequency"), 
          axis=None),
    alt.Color("frequency", scale=alt.Scale(range=red_green_yellow))
    .title(labeling("Frequency")),
    alt.Row("time_period:N").title(labeling("time_period")).header(labelAngle=0),
    alt.Column("direction_id:N").title(labeling("direction_id")))
    
    chart = chart.properties(
        title = "Frequency of Trips per Hour")
    return chart

In [52]:
frequency_chart(one_route)

#### Speed MPH (Done)
* Needs a different type of chart.

In [53]:
base_facet_line(
    one_route,
    "speed_mph",
    "Average Speed",
    "The average miles per hour the bus travels by direction and time of day.",
)

#### VP per Minute (WIP)
* Heatmap too confusing & detailed

In [54]:
def add_background():
    # Sample data
    data = pd.DataFrame({
    'x': range(9),
    'y': [0, 0.5, 1.5, 2.5, 3.5,  2, 1, 3, 4,]
    })
    
    # Background rectangle for color bands
    bg_chart = alt.Chart(data).mark_rect().encode(
    y=alt.Y('y:Q', axis = None),
    y2='y2:Q',
    color=alt.Color('y:Q',  scale=alt.Scale(range=red_green_yellow)),
    opacity=alt.value(0.2)  # Adjust opacity as needed
    ).transform_calculate(
    y2='datum.y + 1'
    )
    
    return bg_chart

In [55]:
add_background()

In [56]:
blue_palette = ["#B9D6DF", "#2EA8CE", "#0B405B"]

#### Spatial Accuracy (Done)
* Multiple it by 100? Should this be rounded or not?

In [58]:
base_facet_with_ruler_chart(
    one_route.loc[one_route.time_period == "all_day"],
    "pct_in_shape",
    "ruler_100_pct",
    "Spatial Accuracy",
    "The percentage of vehicle positions that fall within the static scheduled route shape reflects the accuracy of the spatial, realtime data.",
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f"{y_col}_str"] = df[y_col].astype(str)


#### % RT journey with 1+/2+ vp (goal line = 100%) - use all_day, one chart shared for 1+ and 2+ (Done need subtitle)

In [59]:
def pct_vp_journey(df: pd.DataFrame, col1: str, col2: str) -> pd.DataFrame:
    to_keep = [
        "service_date",
        "organization_name",
        "direction_id",
        col1,
        col2,
        "route_combined_name",
        "time_period",
        "route_id",
        "ruler_100_pct",
    ]
    df2 = df[to_keep]

    df3 = df2.melt(
        id_vars=[
            "service_date",
            "organization_name",
            "route_combined_name",
            "direction_id",
            "time_period",
            "route_id",
            "ruler_100_pct",
        ],
        value_vars=[col1, col2],
    )

    return df3

In [60]:
pct_rt = pct_vp_journey(
    one_route, "pct_rt_journey_atleast1_vp", "pct_rt_journey_atleast2_vp"
)

In [61]:
"""base_facet_circle(
    pct_rt.loc[pct_rt.time_period == "all_day"],
    "value",
    "ruler_100_pct",
    "Percentage of Realtime Trips with 1+ and 2+ Vehicle Positions",
    "The goal is for almost 100% of trips to have 2 or more Vehicle Positions per minute.",
)"""

'base_facet_circle(\n    pct_rt.loc[pct_rt.time_period == "all_day"],\n    "value",\n    "ruler_100_pct",\n    "Percentage of Realtime Trips with 1+ and 2+ Vehicle Positions",\n    "The goal is for almost 100% of trips to have 2 or more Vehicle Positions per minute.",\n)'

#### % schedule journey with 1+/2+ vp (goal line = 100%) - use all_day, one chart shared for 1+ and 2+ (Done need subtitle)

In [62]:
schd_rt = pct_vp_journey(
    one_route, "pct_sched_journey_atleast1_vp", "pct_sched_journey_atleast2_vp"
)

In [63]:
 """ base_facet_circle(
    schd_rt.loc[schd_rt.time_period == "all_day"],
    "value",
    "ruler_100_pct",
    "Percentage of Scheduled Trips with 1+ and 2+ Vehicle Positions",
    "The goal is for almost 100% of trips to have 2 or more Vehicle Positions per minute.",
)"""

' base_facet_circle(\n   schd_rt.loc[schd_rt.time_period == "all_day"],\n   "value",\n   "ruler_100_pct",\n   "Percentage of Scheduled Trips with 1+ and 2+ Vehicle Positions",\n   "The goal is for almost 100% of trips to have 2 or more Vehicle Positions per minute.",\n)'

#### Number of trips by sched_vp_category (/operator_schedule_rt_category)
* Why is the crosswalk missing SBMTD schedule??

In [64]:
op_sched_rt_cat = pd.read_parquet(
    f"{RT_SCHED_GCS}digest/operator_schedule_rt_category.parquet"
)

In [65]:
list(df.schedule_gtfs_dataset_key.unique())

['444700afe086ed24e3cb888cecd3037c', 'c499f905e33929a641f083dad55c521e']

In [66]:
test = section2.load_operator_schedule_rt_category("de792182088eecc3d5c0bd3f1df62965")

In [67]:
op_sched_rt_cat.schedule_gtfs_dataset_key.nunique()

272

In [68]:
# op_routes = pd.read_parquet(
#    f"{RT_SCHED_GCS}digest/operator_routes.parquet")

In [69]:
# gs://calitp-analytics-data/data-analyses/gtfs_schedule/crosswalk/gtfs_key_organization_2024-03-13.parquet
crosswalk_date = most_recent_date.date()

In [70]:
crosswalk = pd.read_parquet(
    f"{SCHED_GCS}crosswalk/gtfs_key_organization_{crosswalk_date}.parquet"
)[["schedule_gtfs_dataset_key", "organization_name"]]

In [71]:
crosswalk.shape

(171, 2)

In [72]:
op_sched_rt_cat.n_trips = op_sched_rt_cat.n_trips.astype(int).fillna(0)

In [73]:
# op_sched_rt_cat = pd.merge(op_sched_rt_cat, crosswalk, on = "schedule_gtfs_dataset_key", how = "left")

In [74]:
op_sched_rt_cat.sample()

Unnamed: 0,schedule_gtfs_dataset_key,service_date,sched_rt_category,n_trips
3111,4c6b107352b318297bb39173c796f357,2024-01-17,schedule_only,0


In [75]:
crosswalk.loc[crosswalk.schedule_gtfs_dataset_key == "de792182088eecc3d5c0bd3f1df62965"]

Unnamed: 0,schedule_gtfs_dataset_key,organization_name


In [76]:
op_sched_rt_cat.head(4)

Unnamed: 0,schedule_gtfs_dataset_key,service_date,sched_rt_category,n_trips
0,014d0998350083249a9eb310635548c2,2023-03-15,schedule_only,0
1,014d0998350083249a9eb310635548c2,2023-03-15,vp_only,0
2,014d0998350083249a9eb310635548c2,2023-03-15,schedule_and_vp,0
3,014d0998350083249a9eb310635548c2,2023-04-12,schedule_only,0


In [77]:
op_sched_rt_cat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10608 entries, 0 to 10607
Data columns (total 4 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   schedule_gtfs_dataset_key  10608 non-null  object        
 1   service_date               10608 non-null  datetime64[ns]
 2   sched_rt_category          10608 non-null  category      
 3   n_trips                    10608 non-null  int64         
dtypes: category(1), datetime64[ns](1), int64(1), object(1)
memory usage: 259.2+ KB


In [78]:
def basic_bar_chart(
    df: pd.DataFrame, y_col: str, color_col: str, title: str, subtitle: str
):
    chart = (
        alt.Chart(df)
        .mark_bar()
        .encode(
            x=alt.X(
                f"yearmonthdate(service_date):O",
                title="Service Date",
                axis=alt.Axis(format="%b %Y"),
            ),
            y=alt.Y(f"{y_col}:Q", title=labeling(y_col)),
            color=alt.Color(
                f"{color_col}:N",
                scale=alt.Scale(
                    range=cp.CALITP_SEQUENTIAL_COLORS,
                ),
            ),
            tooltip=df.columns.tolist(),
        )
    ).properties(
        title={
            "text": title,
            "subtitle": subtitle,
        },
        width=500,
        height=300,
    )
    return chart

In [79]:
"""basic_bar_chart(
    op_sched_rt_cat,
    "n_trips",
    "sched_rt_category",
    "Trips that were found in GTFS, Schedule, or Both Data Sources",
    "",
) """

'basic_bar_chart(\n    op_sched_rt_cat,\n    "n_trips",\n    "sched_rt_category",\n    "Trips that were found in GTFS, Schedule, or Both Data Sources",\n    "",\n) '

#### Text

In [80]:
def create_text_table(df: pd.DataFrame, direction_id: str):

    df = df.loc[df.direction_id == direction_id].reset_index(drop=True)
    df2 = df.melt(
        id_vars=[
            "route_combined_name",
            "direction_id",
        ],
        value_vars=[
            "avg_scheduled_service_minutes",
            "avg_stop_miles",
            "n_scheduled_trips",
            "sched_rt_category",
            "peak_avg_speed",
            "peak_scheduled_trips",
            "peak_hourly_freq",
            "offpeak_avg_speed",
            "offpeak_scheduled_trips",
            "offpeak_hourly_freq",
        ],
    )
    # Create a decoy column to center all the text
    df2["Zero"] = 0

    df2.variable = df2.variable.str.replace("_", " ").str.title()
    df2 = df2.sort_values(by=["direction_id"]).reset_index(drop=True)
    df2["combo_col"] = df2.variable.astype(str) + ": " + df2.value.astype(str)
    text_chart = (
        alt.Chart(df2)
        .mark_text()
        .encode(x=alt.X("Zero:Q", axis=None), y=alt.Y("combo_col", axis=None))
    )

    text_chart = text_chart.encode(text="combo_col:N").properties(
        title=f"Route Statistics for Direction {direction_id}", width=500, height=300
    )
    return text_chart

In [81]:
table_df = section2.route_stats(one_route)

In [86]:
section2.route_stats??

[0;31mSignature:[0m [0msection2[0m[0;34m.[0m[0mroute_stats[0m[0;34m([0m[0mdf[0m[0;34m:[0m [0mpandas[0m[0;34m.[0m[0mcore[0m[0;34m.[0m[0mframe[0m[0;34m.[0m[0mDataFrame[0m[0;34m)[0m [0;34m->[0m [0mpandas[0m[0;34m.[0m[0mcore[0m[0;34m.[0m[0mframe[0m[0;34m.[0m[0mDataFrame[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m <no docstring>
[0;31mSource:[0m   
[0;32mdef[0m [0mroute_stats[0m[0;34m([0m[0mdf[0m[0;34m:[0m [0mpd[0m[0;34m.[0m[0mDataFrame[0m[0;34m)[0m [0;34m->[0m [0mpd[0m[0;34m.[0m[0mDataFrame[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0mmost_recent_date[0m [0;34m=[0m [0mdf[0m[0;34m.[0m[0mservice_date[0m[0;34m.[0m[0mmax[0m[0;34m([0m[0;34m)[0m[0;34m[0m
[0;34m[0m    [0mroute_merge_cols[0m [0;34m=[0m [0;34m[[0m[0;34m"route_combined_name"[0m[0;34m,[0m [0;34m"direction_id"[0m[0;34m][0m[0;34m[0m
[0;34m[0m[0;34m[0m
[0;34m[0m    [0mall_day_stats[0m [0;34m=[0m [0mdf[0m[0;3

In [84]:
table_df

Unnamed: 0,route_combined_name,direction_id,avg_scheduled_service_minutes,avg_stop_miles,n_scheduled_trips,sched_rt_category,peak_avg_speed,peak_scheduled_trips,peak_hourly_freq,offpeak_avg_speed,offpeak_scheduled_trips,offpeak_hourly_freq


In [87]:
 most_recent_date = one_route.service_date.max()

In [88]:
most_recent_date

Timestamp('2024-03-13 00:00:00')

In [89]:
route_merge_cols = ["route_combined_name", "direction_id"]

all_day_stats = one_route[
        (one_route.service_date == most_recent_date) & (one_route.time_period == "all_day")
    ][
        route_merge_cols
        + [
            "avg_scheduled_service_minutes",
            "avg_stop_miles",
            "n_scheduled_trips",
            "sched_rt_category",
        ]
    ]

In [91]:
all_day_stats.head()

Unnamed: 0,route_combined_name,direction_id,avg_scheduled_service_minutes,avg_stop_miles,n_scheduled_trips,sched_rt_category
768526,652 Skyline High - Elmhurst,0.0,24.0,0.16,1,schedule_and_vp
768527,652 Skyline High - Elmhurst,0.0,24.0,0.16,1,schedule_and_vp
768528,652 Skyline High - Elmhurst,0.0,24.0,0.16,1,schedule_and_vp
768529,652 Skyline High - Elmhurst,0.0,24.0,0.16,1,schedule_and_vp
768530,652 Skyline High - Elmhurst,0.0,24.0,0.16,1,schedule_and_vp


In [93]:
peak_stats = one_route[(one_route.service_date == most_recent_date) & (one_route.time_period == "peak")][
        route_merge_cols + ["speed_mph", "n_scheduled_trips", "frequency"]
    ].rename(
        columns={
            "speed_mph": "peak_avg_speed",
            "n_scheduled_trips": "peak_scheduled_trips",
            "frequency": "peak_hourly_freq",
        }
    )

In [95]:
peak_stats.head()

Unnamed: 0,route_combined_name,direction_id,peak_avg_speed,peak_scheduled_trips,peak_hourly_freq
768626,652 Skyline High - Elmhurst,0.0,17.59,1,0.12
768627,652 Skyline High - Elmhurst,0.0,17.59,1,0.12
768628,652 Skyline High - Elmhurst,0.0,17.59,1,0.12
768629,652 Skyline High - Elmhurst,0.0,17.59,1,0.12
768630,652 Skyline High - Elmhurst,0.0,17.59,1,0.12


In [94]:
offpeak_stats = one_route[
        (one_route.service_date == most_recent_date) & (one_route.time_period == "offpeak")
    ][route_merge_cols + ["speed_mph", "n_scheduled_trips", "frequency"]].rename(
        columns={
            "speed_mph": "offpeak_avg_speed",
            "n_scheduled_trips": "offpeak_scheduled_trips",
            "frequency": "offpeak_hourly_freq",
        }
    )

In [96]:
offpeak_stats.head()

Unnamed: 0,route_combined_name,direction_id,offpeak_avg_speed,offpeak_scheduled_trips,offpeak_hourly_freq
768808,652 Skyline High - Elmhurst,1.0,16.03,1,0.06
768809,652 Skyline High - Elmhurst,1.0,16.03,1,0.06
768810,652 Skyline High - Elmhurst,1.0,16.03,1,0.06
768811,652 Skyline High - Elmhurst,1.0,16.03,1,0.06
768812,652 Skyline High - Elmhurst,1.0,16.03,1,0.06


In [None]:
    table_df = (
        pd.merge(
            all_day_stats,
            peak_stats,
            on=route_merge_cols,
        )
        .merge(offpeak_stats, on=route_merge_cols)
        .sort_values(["route_combined_name", "direction_id"])
        .reset_index(drop=True)
    )

In [82]:
create_text_table(table_df, 0)



In [83]:
stop

NameError: name 'stop' is not defined

#### Putting it all together

In [None]:
def filtered_route(
    df: pd.DataFrame,
) -> alt.Chart:
    """
    https://stackoverflow.com/questions/58919888/multiple-selections-in-altair
    """

    route_dropdown = alt.binding_select(
        options=sorted(df["route_combined_name"].unique().tolist()),
        name="Routes ",
    )

    # Column that controls the bar charts
    route_selector = alt.selection_point(
        fields=["route_combined_name"],
        bind=route_dropdown,
    )

    # Data
    # Filter for only schedule and vp
    df_sched_vp_both = df[df.sched_rt_category == "schedule_and_vp"].reset_index(
        drop=True
    )

    # Filter for only rows categorized as found in schedule and vp and all_day
    all_day = df_sched_vp_both.loc[
        df_sched_vp_both.time_period == "all_day"
    ].reset_index(drop=True)

    # Create route stats table for the text tables
    route_stats_df = section2.route_stats(df)

    # Manipulate the df for some of the metrics
    timeliness_df = section2.timeliness_trips(df_sched_vp_both)
    rt_journey_vp = pct_vp_journey(
        all_day, "pct_rt_journey_atleast1_vp", "pct_rt_journey_atleast2_vp"
    )
    sched_journey_vp = pct_vp_journey(
        all_day, "pct_rt_journey_atleast1_vp", "pct_rt_journey_atleast2_vp"
    )

    # Charts
    avg_scheduled_min = (
        grouped_bar_chart(
            df=all_day,
            color_col="direction_id",
            y_col="avg_scheduled_service_minutes",
            offset_col="direction_id",
            title="Average Scheduled Minutes",
            subtitle="The average minutes a trip is scheduled to run.",
        )
        .add_params(route_selector)
        .transform_filter(route_selector)
    )

    timeliness_trips_dir_0 = (
        base_facet_chart(
            timeliness_df.loc[timeliness_df.direction_id == 0],
            "value",
            "variable",
            "time_period",
            "Breakdown of Trips by Categories for Direction 0",
            "Categorizing whether a trip is early, late, or ontime. A trip is on time if it arrives 5 minutes later or earlier than scheduled.",
        )
        .add_params(route_selector)
        .transform_filter(route_selector)
    )
    timeliness_trips_dir_1 = (
        base_facet_chart(
            timeliness_df.loc[timeliness_df.direction_id == 1],
            "value",
            "variable",
            "time_period",
            "Breakdown of Trips by Categories for Direction 1",
            "Categorizing whether a trip is early, late, or ontime. A trip is on time if it arrives 5 minutes later or earlier than scheduled.",
        )
        .add_params(route_selector)
        .transform_filter(route_selector)
    )

    frequency = (frequency_chart(df_sched_vp_both)
        .add_params(route_selector)
        .transform_filter(route_selector)
    )
    speed = base_facet_line(
        df_sched_vp_both,
        "speed_mph",
        "Average Speed",
        "The average miles per hour the bus travels by direction and time of day.",
    )
    vp_per_min = (
        base_facet_with_ruler_chart(
            all_day,
            "vp_per_minute",
            "ruler_for_vp_per_min",
            "Vehicle Positions per Minute",
            "Trips should have 2+ vehicle positions per minute.",
        )
        .add_params(route_selector)
        .transform_filter(route_selector)
    )
    rt_vp_per_min = (
        base_facet_circle(
            rt_journey_vp,
            "value",
            "ruler_100_pct",
            "Percentage of Realtime Trips with 1+ and 2+ Vehicle Positions",
            "The goal is for almost 100% of trips to have 2 or more Vehicle Positions per minute.",
        )
        .add_params(route_selector)
        .transform_filter(route_selector)
    )
    sched_vp_per_min = (
        base_facet_circle(
            sched_journey_vp,
            "value",
            "sched_journey_vp",
            "Percentage of Scheduled Trips with 1+ and 2+ Vehicle Positions",
            "The goal is for almost 100% of trips to have 2 or more Vehicle Positions per minute.",
        )
        .add_params(route_selector)
        .transform_filter(route_selector)
    )
    spatial_accuracy = (
        base_facet_with_ruler_chart(
            all_day,
            "pct_in_shape",
            "ruler_100_pct",
            "Spatial Accuracy",
            "The percentage of vehicle positions that fall within the static scheduled route shape reflects the accuracy of the spatial, realtime data.",
        )
        .add_params(route_selector)
        .transform_filter(route_selector)
    )

    text_dir0 = (
        create_text_table(route_stats_df, 0)
        .add_params(route_selector)
        .transform_filter(route_selector)
    )
    text_dir1 = (
        create_text_table(route_stats_df, 1)
        .add_params(route_selector)
        .transform_filter(route_selector)
    )
    chart_list = [
        avg_scheduled_min,
        timeliness_trips_dir_0,
        timeliness_trips_dir_1,
        frequency,
        speed,
        vp_per_min,
        rt_vp_per_min,
        sched_vp_per_min,
        spatial_accuracy,
        text_dir0,
        text_dir1,
    ]

    #
    chart = alt.vconcat(*chart_list).properties(
        resolve=alt.Resolve(
            scale=alt.LegendResolveMap(color=alt.ResolveMode("independent"))
        )
    )
    return chart

In [None]:
# filtered_route(df_sched_vp_both.head(10000))