# GTFS Digest Tweaks [PR](https://github.com/cal-itp/data-analyses/issues/1416)

In [1]:
import _report_utils
import _section1_utils as section1
import _section2_utils as section2
import geopandas as gpd
import merge_data
import merge_operator_data
import numpy as np
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers
from segment_speed_utils.project_vars import COMPILED_CACHED_VIEWS, PROJECT_CRS
from shared_utils import (
    catalog_utils,
    portfolio_utils,
    rt_dates,
    rt_utils,
    time_helpers,
)
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS

In [2]:
# Imports needed by this cell, gathered ahead of the statements that use them.
import _report_utils
import altair as alt
import yaml

# Data dictionary: rebinds the GTFS_DATA_DICT name (also imported from
# update_vars above) to the "gtfs_analytics_data" catalog.
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")

# Readable names: chart titles and subtitles are looked up from this mapping.
with open("readable.yml") as f:
    readable_dict = yaml.safe_load(f)

# Color palette: chart color domains and ranges are looked up from this mapping.
with open("color_palettes.yml") as f:
    color_dict = yaml.safe_load(f)

# Use the default Altair data transformer with max_rows=None
# (presumably to lift Altair's row limit for the larger frames charted below).
alt.data_transformers.enable("default", max_rows=None)

In [3]:
# Notebook-wide pandas display settings: show up to 100 columns, render floats
# with two decimals, and do not cap the number of rows or the column width.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [4]:
analysis_date_list = [rt_dates.DATES["feb2025"]]

In [5]:
analysis_date = rt_dates.DATES["feb2025"]

In [6]:
schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"

In [7]:
sdi = "San Diego Metropolitan Transit System"

In [8]:
df = section2.load_schedule_vp_metrics(sdi)

In [9]:
# Dropdown listing every distinct value of the "Route" column.
routes_list = df["Route"].unique().tolist()
route_dropdown = alt.binding_select(name="Routes: ", options=routes_list)

# Selection parameter bound to the dropdown. It starts on the first route in
# the list and is the parameter the charts below add and filter on.
xcol_param = alt.selection_point(
    bind=route_dropdown,
    fields=["Route"],
    value=routes_list[0],
)

In [10]:
def clean_data_charts(df: pd.DataFrame, y_col: str) -> pd.DataFrame:
    """
    Do some basic cleaning to a dataframe before charting it.

    The work is done on a copy, so the frame passed in is left untouched.

    Args:
        df: dataframe with a string "Period" column and a numeric `y_col`
            column.
        y_col: name of the column that will be plotted on the y-axis.

    Returns:
        A new dataframe where "Period" has underscores replaced by spaces and
        is title-cased, `y_col` has missing values filled with 0 and is cast
        to int, and a `{y_col}_str` column holds `y_col` as strings.
    """
    # Copy first: callers hand in filtered slices (`some_df.loc[...]`), and
    # assigning columns onto a slice either warns or leaks the changes back
    # into the caller's data.
    cleaned = df.copy()

    cleaned["Period"] = cleaned["Period"].str.replace("_", " ").str.title()
    cleaned[y_col] = cleaned[y_col].fillna(0).astype(int)
    # String version of the y value, kept alongside the numeric one.
    cleaned[f"{y_col}_str"] = cleaned[y_col].astype(str)

    return cleaned


def set_y_axis(df, y_col):
    """
    Pick the upper bound of the y-axis from the name of the plotted column.

    Rules, checked in this order against `y_col`:
      * contains "%"      -> 100
      * contains "VP"     -> 3
      * contains "Minute" -> the column maximum, rounded to a whole number
      * anything else     -> the column maximum rounded to the nearest ten,
                             plus 5
    """
    # Fixed ceilings never need to look at the data.
    if "%" in y_col:
        return 100
    if "VP" in y_col:
        return 3

    # Data-driven ceilings are based on the largest value in the column.
    column_peak = df[y_col].max()
    if "Minute" in y_col:
        return round(column_peak)
    return round(column_peak, -1) + 5

## Don't Delete: Testing

In [11]:
# display(section2.filtered_route(df))

## Change embedded averages from text to charts.

###  The average length of a route is X miles.

In [12]:
# Load first dataset
operator_profiles = section1.load_operator_ntd_profile(sdi)

In [13]:
name = section1.organization_name_crosswalk(sdi)

In [14]:
operator_route_map = section1.load_operator_map(name)

In [15]:
# operator_route_map.drop(columns = ["geometry"]).head()

In [16]:
def find_percentiles(df: pd.DataFrame, col_of_interest: str) -> pd.DataFrame:
    """
    Categorize each row by the quartile bucket its `col_of_interest` value
    falls into.

    The frame passed in is not modified; all work happens on a copy.

    Args:
        df: dataframe holding a numeric `col_of_interest` column.
        col_of_interest: name of the column to bucket.

    Returns:
        A dataframe with two extra columns:
          * `{col_of_interest}_percentile`: the short bucket name
            ("25th percentile", "50th percentile", "< 75th percentile",
            "> 75th percentile").
          * "percentile_group": a display label for that bucket.
        Rows that match none of the buckets (values that are missing or not
        above zero) are dropped.
    """
    bucket_col = f"{col_of_interest}_percentile"
    bucket_names = [
        "25th percentile",
        "50th percentile",
        "< 75th percentile",
        "> 75th percentile",
    ]

    # Quartile cut points of the column.
    p25, p50, p75 = (
        np.float64(df[col_of_interest].quantile(q)) for q in (0.25, 0.50, 0.75)
    )

    # Lookup table from bucket name to display label. The bounds in the labels
    # are the cut points truncated to whole numbers, and the unit is always
    # written as "miles".
    percentile_df = pd.DataFrame(
        data={
            bucket_col: bucket_names,
            "percentile_group": [
                f"25 percentile (<{p25.astype(int)} miles)",
                f"26-50th percentile ({p25.astype(int) + 0.1}-{p50.astype(int)} miles)",
                f"51-75th percentile ({p50.astype(int) + 0.1}-{p75.astype(int)} miles)",
                f"76th percentile (>{p75.astype(int) + 0.1} miles)",
            ],
        }
    )

    # Classify every row at once. np.select takes the first matching
    # condition, so the order below matters; rows matching nothing get "Zero".
    values = df[col_of_interest]
    conditions = [
        ((values > 0) & (values <= p25)).to_numpy(),
        ((values > p25) & (values <= p50)).to_numpy(),
        ((values > p50) & (values <= p75)).to_numpy(),
        (values > p75).to_numpy(),
    ]
    labels = np.select(conditions, bucket_names, default="Zero")

    # Work on a copy so the caller's frame does not gain the bucket column.
    categorized = df.copy()
    categorized[bucket_col] = labels

    # Delete out rows without usable values for the column of interest.
    categorized = categorized.loc[categorized[bucket_col] != "Zero"]

    # Attach the display label for each bucket.
    return pd.merge(categorized, percentile_df, on=bucket_col)

In [17]:
operator_route_map = find_percentiles(operator_route_map, "Service Miles")

In [18]:
operator_route_map['Service Miles_percentile'].value_counts()

25th percentile      29
> 75th percentile    29
< 75th percentile    28
50th percentile      28
Name: Service Miles_percentile, dtype: int64

In [20]:
operator_route_map.columns

Index(['shape_array_key', 'geometry', 'feed_key', 'schedule_gtfs_dataset_key',
       'dir_0_1', 'route_key', 'Service meters', 'Service Miles',
       'is_downtown_local', 'is_local', 'is_coverage', 'is_rapid',
       'is_express', 'is_rail', 'Organization ID', 'Organization',
       'Transit Operator', 'route_long_name', 'route_short_name', 'Route',
       'Route ID', 'Service Miles_percentile', 'percentile_group'],
      dtype='object')

In [25]:
def route_length_percentile(gdf: gpd.GeoDataFrame) -> pd.DataFrame:
    """
    Count the distinct routes that fall into each percentile group.

    Groups `gdf` by its "percentile_group" column and returns one row per
    group, with the number of unique "Route" values in a column named
    "Total Unique Routes".
    """
    routes_per_group = gdf.groupby(["percentile_group"]).agg({"Route": "nunique"})
    routes_per_group = routes_per_group.reset_index()
    return routes_per_group.rename(columns={"Route": "Total Unique Routes"})

In [21]:
# Count unique routes per percentile group with the shared helper instead of
# repeating its groupby/agg/rename chain inline.
percentile_agg = route_length_percentile(operator_route_map)

In [22]:
percentile_agg 

Unnamed: 0,percentile_group,Total Unique Routes
0,25 percentile (<6 miles),29
1,26-50th percentile (6.1-9 miles),28
2,51-75th percentile (9.1-13 miles),28
3,76th percentile (>13.1 miles),26


In [24]:
section1.basic_bar_chart(
    percentile_agg,
    
    "Total Unique Routes",
    "percentile_group",
    readable_dict["route_percentiles"]["title"],
    readable_dict["route_percentiles"]["subtitle"],
)

## Show Trips per Day

In [None]:
df.head(1)

In [None]:
df[["Date", "Route", "Period", "# scheduled trips", "# Trips with VP"]].head()

In [None]:
# Create color scale
color_scale = alt.Scale(
    domain=color_dict["spatial_accuracy_domain"],
    range=color_dict["spatial_accuracy_range"],
)

In [None]:
section2.grouped_bar_chart(
    df.loc[(df.dir_0_1 == 0) & (df.Period != "All Day")],
    color_col="Period",
    y_col="# scheduled trips",
    offset_col="Period",
    title=readable_dict["trips_per_day_graph"],
    subtitle="",
    range_color=color_dict["spatial_accuracy_range"],
    title_inc_dir=True,
).add_params(xcol_param).transform_filter(xcol_param)

In [None]:
(
    alt.Chart(df.loc[(df.dir_0_1 == 0) & (df.Period != "All Day")])
    .mark_bar(size=3, clip=True)
    .encode(
        y=alt.Y(
            "yearmonthdate(Date):O",
            title=["Date"],
            axis=alt.Axis(format="%b %Y"),
        ),
        x=alt.X(
            "# scheduled trips:Q",
            title=_report_utils.labeling("# scheduled trips"),
            scale=alt.Scale(domain=[0, 240]),
        ),
        color=alt.Color(
            "# scheduled trips:Q",
            scale=color_scale,
            title=_report_utils.labeling("# scheduled trips"),
        ),
        tooltip=["Date", "Route", "# scheduled trips", "Period", "Direction"],
    )
).facet(column=alt.Column("Period:N")).add_params(xcol_param).transform_filter(
    xcol_param
)

## Timeliness of Trips, change to line chart
* Why don't the early, on-time, and late trips add up?

In [None]:
df.columns

In [None]:
timeliness_df2 = df.melt(
    id_vars=[
        "Date",
        "Organization",
        "Route",
        "Period",
        "Direction",
        "dir_0_1",
        "# Trips with VP",
    ],
    value_vars=[
        "# Early Arrival Trips",
        "# On-Time Trips",
        "# Late Trips",
    ],
)

In [None]:
def base_facet_line(
    df: pd.DataFrame,
    y_col: str,
    color_col: str,
    facet_col: str,
    title: str,
    subtitle: str,
    range_color: dict,
) -> alt.Chart:
    """
    Build a line chart of `y_col` by date, split into one facet column per
    value of `facet_col`.

    Args:
        df: data to plot; needs "Period", "Route", "Date" and "Direction"
            columns in addition to `y_col`, `color_col` and `facet_col`.
        y_col: column plotted on the y-axis.
        color_col: column that sets the line color.
        facet_col: column that splits the chart into facets.
        title: chart title.
        subtitle: chart subtitle.
        range_color: passed straight to `alt.Scale(range=...)` for the color
            encoding. NOTE(review): annotated as dict, but the notebook passes
            an entry of the color palette YAML - confirm the expected type.

    Returns:
        The faceted chart with title and subtitle applied.
    """
    # The y-axis ceiling is derived from the values as passed in, before the
    # cleaning step fills and casts them.
    y_ceiling = set_y_axis(df, y_col)
    df = clean_data_charts(df, y_col)

    x_encoding = alt.X(
        "yearmonthdate(Date):O",
        title="Date",
        axis=alt.Axis(labelAngle=-45, format="%b %Y"),
    )
    y_encoding = alt.Y(
        f"{y_col}:Q",
        title=_report_utils.labeling(y_col),
        scale=alt.Scale(domain=[0, y_ceiling]),
    )
    color_encoding = alt.Color(
        f"{color_col}:N",
        title=_report_utils.labeling(color_col),
        scale=alt.Scale(range=range_color),
    )
    # The tooltip shows the string version of the y value added by the
    # cleaning step.
    hover_fields = ["Period", "Route", "Date", f"{y_col}_str", "Direction"]

    line = alt.Chart(df).mark_line(size=3).encode(
        x=x_encoding,
        y=y_encoding,
        color=color_encoding,
        tooltip=hover_fields,
    )
    # Size applies to each individual facet panel.
    line = line.properties(width=200, height=250)

    faceted = line.facet(
        column=alt.Column(f"{facet_col}:N", title=_report_utils.labeling(facet_col)),
    )
    return faceted.properties(
        title={
            "text": [title],
            "subtitle": [subtitle],
        }
    )

In [None]:
timeliness_df2["pct_trips"] = (
    timeliness_df2.value / timeliness_df2["# Trips with VP"]
) * 100

In [None]:
timeliness_df2.Period.unique()

In [None]:
(
    base_facet_line(
        timeliness_df2.loc[timeliness_df2.dir_0_1 == 0],
        "pct_trips",
        "variable",
        "Period",
        readable_dict["timeliness_trips_graph"]["title"],
        readable_dict["timeliness_trips_graph"]["subtitle"],
        range_color=color_dict["tri_color"],
    )
    .add_params(xcol_param)
    .transform_filter(xcol_param)
)

In [None]:
timeliness_df2.loc[
    (timeliness_df2.Route == "12 City College - Skyline Hills")
    & (timeliness_df2.Date == "2023-03-15")
].sort_values(by=["Date", "Direction", "Period", "variable"])

In [None]:
4 / 75

## Move text table before the rest of the charts. (Done)

In [None]:
# Inspect one route in the melted timeliness frame. The notebook only defines
# `timeliness_df2`, so that is the frame queried here.
timeliness_df2.loc[timeliness_df2.Route == "1 Fashion Valley - La Mesa"]