## Round 1 
* Updating the existing charts made by Tiffany: https://gtfs-digest--cal-itp-data-analyses.netlify.app/
* Environment setup: `cd rt_segment_speeds && pip install altair_transform && pip install -r requirements.txt && cd ../_shared_utils && make setup_env`

Links
* https://github.com/cal-itp/data-analyses/issues/1059
* https://docs.google.com/document/d/1I1WiqlmU06W6iLCi7cZQrOCLILkrEfABEkcU0Jys7f0/edit
* https://route-speeds--cal-itp-data-analyses.netlify.app/name_bay-area-511-muni-schedule/0__report__name_bay-area-511-muni-schedule
* https://posit-dev.github.io/great-tables/get-started/nanoplots.html
* https://docs.pola.rs/py-polars/html/reference/api/polars.from_pandas.html
* https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/_rt_scheduled_utils.py
* https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/_threshold_utils.py

In [1]:
import _section3_utils as section3
import _report_utils
import altair as alt
import calitp_data_analysis.magics
import geopandas as gpd
import great_tables as gt
import pandas as pd
from calitp_data_analysis import calitp_color_palette as cp
from great_tables import md
from IPython.display import HTML, Markdown, display
from segment_speed_utils.project_vars import RT_SCHED_GCS, SCHED_GCS
from shared_utils import rt_dates, rt_utils, catalog_utils

alt.renderers.enable("html")
alt.data_transformers.enable("default", max_rows=None)
from typing import List, Union

from altair_transform.extract import extract_transform
from altair_transform.transform import visit
from altair_transform.utils import to_dataframe

In [2]:
# Notebook-wide pandas display settings.
pd.set_option("display.max_columns", 100)  # show up to 100 columns
pd.set_option("display.float_format", "{:.2f}".format)  # two decimals for floats
pd.set_option("display.max_rows", None)  # never truncate rows
pd.set_option("display.max_colwidth", None)  # never truncate cell text

In [3]:
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

In [4]:
org_name = "North County Transit District"

In [5]:
# Set drop down menu to be on the upper right
display(
    HTML(
        """
<style>
form.vega-bindings {
  position: absolute;
  right: 0px;
  top: 0px;
}
</style>
"""
    )
)

### Original File 

In [6]:
schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"
    

In [7]:
# Read the digest table straight from storage for this operator, keeping only
# three identifying columns, so its row count can be inspected.
og_filters = [[("organization_name", "==", org_name)]]
og_columns = ["organization_name", "caltrans_district", "name"]
og = pd.read_parquet(schd_vp_url, filters=og_filters, columns=og_columns)

In [8]:
og.shape

(16067, 3)

### Duplicates Dropped

In [9]:
df = section3.load_schedule_vp_metrics(org_name)

In [10]:
df.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,direction_id,time_period,avg_scheduled_service_minutes,avg_stop_miles,n_scheduled_trips,frequency,service_date,minutes_atleast1_vp,minutes_atleast2_vp,total_rt_service_minutes,total_scheduled_service_minutes,total_vp,vp_in_shape,is_early,is_ontime,is_late,n_vp_trips,vp_per_minute,pct_in_shape,pct_rt_journey_atleast1_vp,pct_rt_journey_atleast2_vp,pct_sched_journey_atleast1_vp,pct_sched_journey_atleast2_vp,rt_sched_journey_ratio,avg_rt_service_minutes,sched_rt_category,speed_mph,name,route_long_name,route_short_name,route_combined_name,route_id,typology,schedule_source_record_id,base64_url,organization_source_record_id,organization_name,caltrans_district,rt_sched_journey_ratio_cat,frequency_cat,vp_per_minute_cat,ruler_100_pct,ruler_for_vp_per_min
0,1e93c380452cfb80eac5e14e4f227992,0.0,all_day,110.41,0.31,32,1.33,2023-04-12,4332,4262,4448.13,3533.0,12504,10595,0,0,32,32,2.81,85.0,97.0,96.0,100.0,100.0,1.26,139.0,schedule_and_vp,16.01,North County Schedule,Oceanside - UTC/VA/UCSD,101,101 Oceanside - UTC/VA/UCSD,301,rapid,recltDJn10nOSilgD,aHR0cHM6Ly93d3cuZ29uY3RkLmNvbS9nb29nbGVfdHJhbnNpdC56aXA=,recRBcrX4ZvTyvSnm,North County Transit District,11 - San Diego,Late by 26-50% of the scheduled time,<1 trip/hour,<3 pings/minute,100,2
1,1e93c380452cfb80eac5e14e4f227992,0.0,all_day,110.41,0.31,32,1.33,2023-05-17,4334,4237,4405.71,3533.0,12383,10466,0,0,32,32,2.81,84.0,98.0,96.0,100.0,100.0,1.25,137.68,schedule_and_vp,15.83,North County Schedule,Oceanside - UTC/VA/UCSD,101,101 Oceanside - UTC/VA/UCSD,301,rapid,recltDJn10nOSilgD,aHR0cHM6Ly93d3cuZ29uY3RkLmNvbS9nb29nbGVfdHJhbnNpdC56aXA=,recRBcrX4ZvTyvSnm,North County Transit District,11 - San Diego,Late by 1-25% of the scheduled time,<1 trip/hour,<3 pings/minute,100,2


In [11]:
df.time_period.value_counts()

all_day    966
peak       954
offpeak    811
Name: time_period, dtype: int64

In [12]:
df.sched_rt_category.unique()

['schedule_and_vp']
Categories (3, object): ['schedule_only', 'vp_only', 'schedule_and_vp']

### Check for Duplicates

In [13]:
len(df)

2731

In [15]:
categories_only = df[['sched_rt_category','direction_id','time_period','service_date','route_combined_name','n_scheduled_trips']]

In [16]:
categories_only = categories_only.loc[categories_only.time_period == "all_day"]

In [17]:
march_2024 = df.loc[df.service_date == "2024-03-13"]

In [19]:
categories_only.loc[categories_only.service_date == "2024-03-13"].n_scheduled_trips.sum()

1401

In [20]:
section3.trips_by_gtfs(df)

Daily Trips by GTFS Availability,Daily Trips by GTFS Availability,Daily Trips by GTFS Availability,Daily Trips by GTFS Availability
Schedule only indicates the trip(s) were found only in schedule data. Vehicle Positions (VP) only indicates the trip(s) were found only in real-time data.,Schedule only indicates the trip(s) were found only in schedule data. Vehicle Positions (VP) only indicates the trip(s) were found only in real-time data..1,Schedule only indicates the trip(s) were found only in schedule data. Vehicle Positions (VP) only indicates the trip(s) were found only in real-time data..2,Schedule only indicates the trip(s) were found only in schedule data. Vehicle Positions (VP) only indicates the trip(s) were found only in real-time data..3
2023-03-15,0,0,1307
2023-04-12,0,0,1338
2023-05-17,0,0,1343
2023-06-14,0,0,1326
2023-07-12,0,0,1310
2023-08-15,0,0,1334
2023-09-13,0,0,1342
2023-10-11,0,0,1391
2023-11-15,0,0,1392
2023-12-13,0,0,1392


### One Route

In [None]:
df.route_combined_name.unique()

In [None]:
one_route = df.loc[
    df.route_combined_name == '6 Wellness Express'
]

In [None]:
one_route.shape

In [None]:
one_route.sample()

In [None]:
one_route_all_day = one_route.loc[one_route.time_period == "all_day"]

In [None]:
one_route_all_day.service_date.value_counts()

#### Avg Scheduled Minutes (Done)
* Why is `avg_scheduled_service_minutes` so much shorter than `total_rt_service_minutes` and `total_scheduled_service_minutes`?
* Kind of a boring chart? Should it compare actual run time to scheduled service minutes instead?

In [None]:
import yaml

with open("readable.yml") as f:
    readable_dict = yaml.safe_load(f)

In [None]:
readable_dict['avg_scheduled_min']['title']

In [None]:
section3.grouped_bar_chart(
    df=one_route[one_route.time_period == "all_day"].drop_duplicates(),
    color_col="direction_id",
    y_col="avg_scheduled_service_minutes",
    offset_col="direction_id",
    title=readable_dict['avg_scheduled_min']['title'],
    subtitle=readable_dict['avg_scheduled_min']['subtitle'],
)

#### Timeliness #2 (Done)

In [None]:
timeliness = section3.timeliness_trips(one_route)

In [None]:
timeliness.sample()

In [None]:
# Timeliness chart for direction 1. The subtitle describes the early / late /
# on-time breakdown (same wording as in filtered_route) rather than the
# VP-only / schedule-only categories, which this chart does not show.
section3.base_facet_chart(
    timeliness.loc[timeliness.direction_id == 1].drop_duplicates(),
    "value",
    "variable",
    "time_period",
    "Breakdown of Trips by Categories for Direction 1",
    "Categorizing whether a trip is early, late, or ontime. A trip is on time if it arrives 5 minutes later or earlier than scheduled.",
).interactive()

#### Total Scheduled Trips (Don't Use, boring)

In [None]:
# IF we take away direction, see how many times a route is scheduled to run
total_scheduled_trip = (
    one_route.groupby(["service_date", "time_period"])
    .agg({"n_scheduled_trips": "mean"})
    .reset_index()
)

In [None]:
total_scheduled_trip.n_scheduled_trips = total_scheduled_trip.n_scheduled_trips / 2

In [None]:
total_scheduled_trip.head()

In [None]:
one_route.groupby(["service_date", "direction_id", "time_period"]).agg(
    {"n_scheduled_trips": "max"}
).head()

In [None]:
# Scheduled trip counts split by peak / offpeak (the all_day rows are excluded
# so the bars are not double counted), offset by direction.
section3.grouped_bar_chart(
    df=one_route.loc[one_route.time_period != "all_day"].drop_duplicates(),
    color_col="time_period",
    y_col="n_scheduled_trips",
    offset_col="direction_id",
    title="Total Scheduled Trips",
    subtitle="How many times per day this route is scheduled to run in one particular direction.",
)

#### Frequency (Done)
* Maybe shouldn't be a chart since there doesn't seem to be a lot of data for this across a lot of the routes?
* What does frequency mean?
* Simplify down to not take direction_id into consideration?

In [None]:
# Bar chart of `frequency` per service date for the selected route, faceted
# into rows by time_period and columns by direction_id.
alt.Chart(one_route, width=180, height=alt.Step(10)).mark_bar().encode(
    alt.Y(
        "yearmonthdate(service_date):O",
        title="Date",
        axis=alt.Axis(format="%b %Y"),
    ),
    # axis=None hides the x axis; the same value is also encoded as color and
    # listed in the tooltip.
    alt.X("frequency:Q", title=_report_utils.labeling("frequency"), axis=None),
    alt.Color("frequency:Q", scale=alt.Scale(range=_report_utils.red_green_yellow)).title(
        _report_utils.labeling("Frequency")
    ),
    # One facet row per time period, one facet column per direction.
    alt.Row("time_period:N").title(_report_utils.labeling("time_period")).header(labelAngle=0),
    alt.Column("direction_id:N").title(_report_utils.labeling("direction_id")),
    tooltip=["yearmonthdate(service_date)", "frequency", "time_period", "direction_id"]
)

#### Speed MPH (Done)
* Needs a different type of chart.

In [None]:
section3.base_facet_line(
    one_route,
    "speed_mph",
    "Average Speed",
    "The average miles per hour the bus travels by direction and time of day.",
)

#### VP per Minute (WIP)
* Heatmap too confusing & detailed

In [None]:
def add_background() -> alt.Chart:
    """
    Build a faint layer of colored horizontal bands to sit behind another chart.

    Each sample row becomes a rectangle running from `y` to `y + 1`, colored by
    its `y` value using the red/green/yellow palette from `_report_utils`.
    No x encoding is set on the rectangles.

    Returns:
        An Altair chart of semi-transparent (opacity 0.2) rectangles.
    """
    # Sample data
    data = pd.DataFrame(
        {
            "x": range(9),
            "y": [0, 0.5, 1.5, 2.5, 3.5, 2, 1, 3, 4],
        }
    )

    # Background rectangle for color bands
    bg_chart = (
        alt.Chart(data)
        .mark_rect()
        .encode(
            y=alt.Y("y:Q", axis=None),
            y2="y2:Q",
            # The palette lives in _report_utils; the bare name
            # `red_green_yellow` is not defined in this notebook.
            color=alt.Color(
                "y:Q", scale=alt.Scale(range=_report_utils.red_green_yellow)
            ),
            opacity=alt.value(0.2),  # Adjust opacity as needed
        )
        # Upper edge of each band is one unit above its lower edge.
        .transform_calculate(y2="datum.y + 1")
    )

    return bg_chart

In [None]:
section3.base_facet_with_ruler_chart(
                one_route.drop_duplicates(),
                "vp_per_minute",
                "ruler_for_vp_per_min",
                readable_dict['vp_per_min']['title'],
                readable_dict['vp_per_min']['subtitle'],
            )

In [None]:
blue_palette = ["#B9D6DF", "#2EA8CE", "#0B405B"]

#### Spatial Accuracy (Done)
* Multiply it by 100? Should this be rounded or not?

In [None]:
section3.base_facet_with_ruler_chart(
    one_route.loc[one_route.time_period == "all_day"].drop_duplicates(),
    "pct_in_shape",
    "ruler_100_pct",
    "Spatial Accuracy",
    "The percentage of vehicle positions that fall within the static scheduled route shape reflects the accuracy of the spatial, realtime data.",
)

#### % RT journey with 1+/2+ vp (goal line = 100%) - use all_day, one chart shared for 1+ and 2+ (Done need subtitle)

In [None]:
def pct_vp_journey(df: pd.DataFrame, col1: str, col2: str) -> pd.DataFrame:
    """
    Reshape two metric columns from wide to long format.

    Args:
        df: frame holding the identifier columns listed below plus
            `col1` and `col2`.
        col1: name of the first metric column to unpivot.
        col2: name of the second metric column to unpivot.

    Returns:
        A long frame with the identifier columns, a `variable` column holding
        the original column name (`col1` or `col2`) and a `value` column
        holding the metric. All other columns of `df` are dropped.
    """
    identifier_cols = [
        "service_date",
        "organization_name",
        "route_combined_name",
        "direction_id",
        "time_period",
        "route_id",
        "ruler_100_pct",
    ]
    metric_cols = [col1, col2]

    # Subset first so unrelated columns never reach the melt.
    subset = df[identifier_cols + metric_cols]

    return subset.melt(id_vars=identifier_cols, value_vars=metric_cols)

In [None]:
pct_rt = pct_vp_journey(
    one_route, "pct_rt_journey_atleast1_vp", "pct_rt_journey_atleast2_vp"
)

#### % schedule journey with 1+/2+ vp (goal line = 100%) - use all_day, one chart shared for 1+ and 2+ (Done need subtitle)

In [None]:
schd_rt = pct_vp_journey(
    one_route, "pct_sched_journey_atleast1_vp", "pct_sched_journey_atleast2_vp"
)

In [None]:
""" base_facet_circle(
    schd_rt.loc[schd_rt.time_period == "all_day"],
    "value",
    "ruler_100_pct",
    "Percentage of Scheduled Trips with 1+ and 2+ Vehicle Positions",
    "The goal is for almost 100% of trips to have 2 or more Vehicle Positions per minute.",
)"""

#### Number of trips by sched_vp_category (/operator_schedule_rt_category)
* Why is the crosswalk missing SBMTD schedule??

In [None]:
op_sched_rt_cat = pd.read_parquet(
    f"{RT_SCHED_GCS}digest/operator_schedule_rt_category.parquet"
)

In [None]:
list(df.schedule_gtfs_dataset_key.unique())

In [None]:
test = section3.load_operator_schedule_rt_category("de792182088eecc3d5c0bd3f1df62965")

In [None]:
op_sched_rt_cat.schedule_gtfs_dataset_key.nunique()

In [None]:
def basic_bar_chart(
    df: pd.DataFrame, y_col: str, color_col: str, title: str, subtitle: str
) -> alt.Chart:
    """
    Draw a bar chart of `y_col` over service dates, colored by `color_col`.

    Args:
        df: data to plot; must contain `service_date`, `y_col` and `color_col`.
            Every column of `df` is shown in the tooltip.
        y_col: quantitative column plotted on the y axis.
        color_col: nominal column used to color the bars.
        title: chart title.
        subtitle: chart subtitle.

    Returns:
        A 500x300 Altair bar chart.
    """
    chart = (
        alt.Chart(df)
        .mark_bar()
        .encode(
            x=alt.X(
                "yearmonthdate(service_date):O",
                title="Service Date",
                axis=alt.Axis(format="%b %Y"),
            ),
            # `labeling` is a helper in _report_utils; the bare name is not
            # defined in this notebook.
            y=alt.Y(f"{y_col}:Q", title=_report_utils.labeling(y_col)),
            color=alt.Color(
                f"{color_col}:N",
                scale=alt.Scale(
                    range=cp.CALITP_SEQUENTIAL_COLORS,
                ),
            ),
            tooltip=df.columns.tolist(),
        )
    ).properties(
        title={
            "text": title,
            "subtitle": subtitle,
        },
        width=500,
        height=300,
    )
    return chart

In [None]:
"""basic_bar_chart(
    op_sched_rt_cat,
    "n_trips",
    "sched_rt_category",
    "Trips that were found in GTFS, Schedule, or Both Data Sources",
    "",
) """

#### Text

In [None]:
table_df = section3.route_stats(one_route)

In [None]:
section3.create_text_table(table_df, 0)

In [None]:
section3.create_text_table(table_df, 1)

#### Putting it all together

In [None]:
def filtered_route(
    df: pd.DataFrame,
) -> alt.Chart:
    """
    Build the route-level report: every chart stacked vertically and driven
    by a single route dropdown.

    https://stackoverflow.com/questions/58919888/multiple-selections-in-altair

    Args:
        df: schedule vs. vehicle-position metrics for one operator. This
            function reads `route_combined_name` and `time_period` directly;
            the remaining columns are consumed by the `section3` helpers.

    Returns:
        The vertically concatenated Altair chart, with the color scale
        resolved independently for each sub-chart.
    """
    routes_list = df["route_combined_name"].unique().tolist()

    route_dropdown = alt.binding_select(
        options=routes_list,
        name="Routes",
    )

    # Column that controls the bar charts
    route_selector = alt.selection_point(
        fields=["route_combined_name"],
        bind=route_dropdown,
    )

    def _bind_route(chart):
        """Attach the route dropdown to `chart` and filter it to the chosen route."""
        return chart.add_params(route_selector).transform_filter(route_selector)

    # Filter for only the all_day rows
    all_day = df.loc[df.time_period == "all_day"].reset_index(drop=True)

    # Create route stats table for the text tables
    route_stats_df = section3.route_stats(df)

    # Manipulate the df for some of the metrics
    timeliness_df = section3.timeliness_trips(df)
    rt_journey_vp = section3.pct_vp_journey(
        all_day, "pct_rt_journey_atleast1_vp", "pct_rt_journey_atleast2_vp"
    )
    # Scheduled-journey coverage uses the pct_sched_* columns; reusing the
    # pct_rt_* columns here would just duplicate the realtime chart.
    sched_journey_vp = section3.pct_vp_journey(
        all_day, "pct_sched_journey_atleast1_vp", "pct_sched_journey_atleast2_vp"
    )

    # Charts
    avg_scheduled_min = _bind_route(
        section3.grouped_bar_chart(
            df=all_day,
            color_col="direction_id",
            y_col="avg_scheduled_service_minutes",
            offset_col="direction_id",
            title="Average Scheduled Minutes",
            subtitle="The average minutes a trip is scheduled to run.",
        )
    )

    timeliness_trips_dir_0 = _bind_route(
        section3.base_facet_chart(
            timeliness_df.loc[timeliness_df.direction_id == 0],
            "value",
            "variable",
            "time_period",
            "Breakdown of Trips by Categories for Direction 0",
            "Categorizing whether a trip is early, late, or ontime. A trip is on time if it arrives 5 minutes later or earlier than scheduled.",
        )
    )
    timeliness_trips_dir_1 = _bind_route(
        section3.base_facet_chart(
            timeliness_df.loc[timeliness_df.direction_id == 1],
            "value",
            "variable",
            "time_period",
            "Breakdown of Trips by Categories for Direction 1",
            "Categorizing whether a trip is early, late, or ontime. A trip is on time if it arrives 5 minutes later or earlier than scheduled.",
        )
    )

    frequency = _bind_route(section3.frequency_chart(df))

    speed = _bind_route(
        section3.base_facet_line(
            df,
            "speed_mph",
            "Average Speed",
            "The average miles per hour the bus travels by direction and time of day.",
        )
    )

    vp_per_min = _bind_route(
        section3.base_facet_with_ruler_chart(
            all_day,
            "vp_per_minute",
            "ruler_for_vp_per_min",
            "Vehicle Positions per Minute",
            "Trips should have 2+ vehicle positions per minute.",
        )
    )

    rt_vp_per_min = _bind_route(
        section3.base_facet_circle(
            rt_journey_vp,
            "value",
            "ruler_100_pct",
            "Percentage of Realtime Trips with 1+ and 2+ Vehicle Positions",
            "The goal is for almost 100% of trips to have 2 or more Vehicle Positions per minute.",
        )
    )
    sched_vp_per_min = _bind_route(
        section3.base_facet_circle(
            sched_journey_vp,
            "value",
            # Ruler column carried through pct_vp_journey, same as the
            # realtime chart above ("sched_journey_vp" is not a column).
            "ruler_100_pct",
            "Percentage of Scheduled Trips with 1+ and 2+ Vehicle Positions",
            "The goal is for almost 100% of trips to have 2 or more Vehicle Positions per minute.",
        )
    )
    spatial_accuracy = _bind_route(
        section3.base_facet_with_ruler_chart(
            all_day,
            "pct_in_shape",
            "ruler_100_pct",
            "Spatial Accuracy",
            "The percentage of vehicle positions that fall within the static scheduled route shape reflects the accuracy of the spatial, realtime data.",
        )
    )

    text_dir0 = _bind_route(section3.create_text_table(route_stats_df, 0))
    text_dir1 = _bind_route(section3.create_text_table(route_stats_df, 1))

    chart_list = [
        avg_scheduled_min,
        timeliness_trips_dir_0,
        timeliness_trips_dir_1,
        frequency,
        speed,
        vp_per_min,
        rt_vp_per_min,
        sched_vp_per_min,
        spatial_accuracy,
        text_dir0,
        text_dir1,
    ]

    # Independent color resolution so the sub-charts do not share one color scale.
    chart = alt.vconcat(*chart_list).properties(
        resolve=alt.Resolve(
            scale=alt.LegendResolveMap(color=alt.ResolveMode("independent"))
        )
    )
    return chart

In [None]:
filtered_route(df)

In [None]:
# LOTS of duplicates??
len(df)

In [None]:
len(df.drop_duplicates())

In [None]:
df.service_date.describe()