## Round 1 
* https://github.com/cal-itp/data-analyses/issues/1059
* cd rt_segment_speeds && pip install altair_transform && pip install -r requirements.txt && cd ../_shared_utils && make setup_env
* https://docs.google.com/document/d/1I1WiqlmU06W6iLCi7cZQrOCLILkrEfABEkcU0Jys7f0/edit
* https://route-speeds--cal-itp-data-analyses.netlify.app/name_bay-area-511-muni-schedule/0__report__name_bay-area-511-muni-schedule
* https://posit-dev.github.io/great-tables/get-started/nanoplots.html
* https://docs.pola.rs/py-polars/html/reference/api/polars.from_pandas.html
* https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/_rt_scheduled_utils.py

In [None]:
%%capture
# import warnings
# warnings.filterwarnings('ignore')

import altair as alt
import calitp_data_analysis.magics
import geopandas as gpd
import great_tables as gt
import pandas as pd
from calitp_data_analysis import calitp_color_palette as cp
from great_tables import md
from IPython.display import HTML, Markdown, display
from segment_speed_utils.project_vars import RT_SCHED_GCS
from shared_utils import rt_dates, rt_utils

alt.renderers.enable("html")
alt.data_transformers.enable("default", max_rows=None)
from typing import List, Union

from altair_transform.extract import extract_transform
from altair_transform.transform import visit
from altair_transform.utils import to_dataframe

In [None]:
# Notebook display settings: show up to 100 columns, render floats with two
# decimal places, and do not limit the number of rows or the column width.
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [None]:
name = "SBMTD Schedule"

In [None]:
# %%capture_parameters
# name

### General Functions

In [None]:
def labeling(word: str) -> str:
    """
    Turn a snake_case column name into a human-readable label.

    Underscores become spaces and every word is title-cased. A standalone
    "N" (what the `n_` prefix in e.g. `n_scheduled_trips` turns into) is
    spelled out as "Total". Only the whole word is replaced, so labels that
    merely contain a capital N (e.g. "Name") are left intact.

    Parameters
    ----------
    word : str
        Column name or short phrase to prettify.

    Returns
    -------
    str
        The display label.
    """
    import re

    label = word.replace("_", " ").title()
    # \b keeps the replacement to the lone word "N" instead of every capital N.
    return re.sub(r"\bN\b", "Total", label)

### Data

In [None]:
# calitp-analytics-data/data-analyses/rt_vs_schedule/digest
# Load the schedule vs. vehicle-position metrics digest, keeping only the rows
# whose `name` equals the operator chosen above. The filter is handed to the
# parquet reader instead of being applied after loading.
df = pd.read_parquet(
    f"{RT_SCHED_GCS}digest/schedule_vp_metrics.parquet",
    filters=[[("name", "==", name)]],
)

In [None]:
most_recent_date = df.service_date.max()

In [None]:
most_recent_date

In [None]:
df.service_date.min()

In [None]:
df.head(2)

In [None]:
df["month_year"] = df.service_date.dt.strftime("%m/%Y")

### Test out Altair `extract_data`
* https://github.com/altair-viz/altair-transform/blob/master/altair_transform/core.py#L55
* Why do you want to use this?

In [None]:
# NOTE(review): presumably carried over from the altair-transform module linked
# above; `transform_chart` is not defined in the visible part of this notebook.
__all__ = ["apply", "extract_data", "transform_chart"]

In [None]:
def reverse_snakecase(df):
    """
    Make column names presentable: underscores become spaces, surrounding
    whitespace is trimmed and every word is title-cased.

    The columns are renamed on the DataFrame that was passed in, and that
    same DataFrame is returned.
    """
    spaced = df.columns.str.replace("_", " ")
    trimmed = spaced.str.strip()
    df.columns = trimmed.str.title()
    return df

In [None]:
def apply(
    df: pd.DataFrame,
    transform: Union[alt.Transform, List[alt.Transform]],
    inplace: bool = False,
) -> pd.DataFrame:
    """Run one or more Altair transforms against a dataframe.

    Parameters
    ----------
    df : pd.DataFrame
        The data to transform.
    transform : list|dict
        A single transform specification or a list of them. Every
        specification has to be valid under Altair's transform schema.
    inplace : bool
        When True the dataframe passed in may be modified directly;
        otherwise (the default) a copy is transformed.

    Returns
    -------
    df_transformed : pd.DataFrame
        The dataframe after the transforms were applied. If ``transform``
        is ``alt.Undefined`` the (possibly copied) input is returned as is.

    Example
    -------
    >>> import pandas as pd
    >>> data = pd.DataFrame({'x': range(5), 'y': list('ABCAB')})
    >>> chart = alt.Chart(data).transform_aggregate(sum_x='sum(x)', groupby=['y'])
    >>> apply(data, chart.transform)
       y  sum_x
    0  A      3
    1  B      5
    2  C      2
    """
    frame = df if inplace else df.copy()
    if transform is not alt.Undefined:
        return visit(transform, frame)
    # Nothing to apply: hand back the frame untouched.
    return frame

In [None]:
def extract_data(
    chart: alt.Chart, apply_encoding_transforms: bool = True
) -> pd.DataFrame:
    """Return the data behind a chart after its transforms have been applied.

    Only data and transforms declared at the top level of the chart are
    taken into account.

    Parameters
    ----------
    chart : alt.Chart
        Chart whose data and transforms are read.
    apply_encoding_transforms : bool
        If True (default), transforms implied by the encodings (such as
        ``x='sum(x)'``) are applied in addition to those listed in the
        chart's ``transform`` attribute.

    Returns
    -------
    df_transformed : pd.DataFrame
        The chart's data with the transforms applied.

    Example
    -------
    >>> import pandas as pd
    >>> data = pd.DataFrame({'x': range(5), 'y': list('ABCAB')})
    >>> chart = alt.Chart(data).mark_bar().encode(x='sum(x)', y='y')
    >>> extract_data(chart)
       y  sum_x
    0  A      3
    1  B      5
    2  C      2
    """
    # Optionally rewrite encoding shorthands into explicit transforms first.
    source_chart = extract_transform(chart) if apply_encoding_transforms else chart
    frame = to_dataframe(source_chart.data, source_chart)
    return apply(frame, source_chart.transform)

In [None]:
data = pd.DataFrame({"x": range(5), "y": list("ABCAB")})

In [None]:
chart = alt.Chart(data).mark_bar().encode(x="sum(x)", y="y")

In [None]:
chart

In [None]:
extract_data(chart)

### Monthly aggregated service hours by day_type, time_of_day

In [None]:
from segment_speed_utils.project_vars import SCHED_GCS

In [None]:
year = "2023"

In [None]:
f"{SCHED_GCS}scheduled_service_by_route_{year}.parquet"

In [None]:
# Load the `scheduled_service_by_route_{year}` parquet, keeping only the rows
# whose `name` matches the selected operator.
monthly_service_df = pd.read_parquet(
    f"{SCHED_GCS}scheduled_service_by_route_{year}.parquet",
    filters=[[("name", "==", name)]],
)

In [None]:
monthly_service_df.shape

In [None]:
monthly_service_df.sample()

In [None]:
monthly_service_df.day_type.value_counts()

In [None]:
# Text key combining month and year, formatted as "<month>-<year>".
monthly_service_df["full_date"] = (
    monthly_service_df.month.astype(str) + "-" + monthly_service_df.year.astype(str)
)

In [None]:
def tag_day(df: pd.DataFrame) -> pd.DataFrame:
    """
    Replace the numeric codes in `day_type` with day names.

    Codes 1-6 map to Monday-Saturday and any other value is labeled
    "Sunday". Values that are already day names are kept, so running the
    cell a second time no longer turns every row into "Sunday".

    NOTE(review): which code stands for Sunday is not visible here; the
    catch-all "Sunday" for anything outside 1-6 is kept from the original.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a `day_type` column. It is overwritten in place.

    Returns
    -------
    pd.DataFrame
        The same DataFrame, with `day_type` holding day names.
    """
    day_names = {
        1: "Monday",
        2: "Tuesday",
        3: "Wednesday",
        4: "Thursday",
        5: "Friday",
        6: "Saturday",
    }
    already_named = set(day_names.values()) | {"Sunday"}

    def which_day(value):
        # Already converted on a previous run: leave untouched.
        if isinstance(value, str) and value in already_named:
            return value
        return day_names.get(value, "Sunday")

    df["day_type"] = df["day_type"].apply(which_day)

    return df

In [None]:
monthly_service_df = tag_day(monthly_service_df)

In [None]:
monthly_service_df.sample(3)

In [None]:
# Mean `ttl_service_hours` for every full_date / name / day_type / time_of_day
# combination; `reset_index` turns the group keys back into regular columns.
monthly_service = (
    monthly_service_df.groupby(
        [
            "full_date",
            "name",
            "day_type",
            "time_of_day",
        ]
    )
    .agg(
        {
            "ttl_service_hours": "mean",
        }
    )
    .reset_index()
)

In [None]:
monthly_service.shape

In [None]:
def extract_data_altair(chart):
    """
    Rebuild a DataFrame from the data embedded in a chart's spec.

    The chart is serialized with `to_dict()` and its "datasets" entry is
    loaded into a DataFrame (one column per dataset, one record per cell).
    The records of the first dataset are expanded into regular columns; any
    other dataset columns are kept as they are, to the left of them.
    """
    datasets = chart.to_dict()["datasets"]
    raw = pd.DataFrame(datasets)

    first_dataset = raw.columns[0]
    # Each cell of the first column is a record dict; flatten into columns.
    records = pd.json_normalize(raw[first_dataset])
    remaining = raw.drop(first_dataset, axis=1)
    return pd.concat([remaining, records], axis=1)

In [None]:
def bar_chart_dropdown(
    df: pd.DataFrame,
    x_col: str,
    y_col: str,
    offset_col: str,
    title: str,
    dropdown_col: str,
):
    """
    Grouped bar chart with a dropdown filter and a dashed red rule at the mean.

    The chart and the data extracted from its spec are displayed, and the
    chart is also returned so the caller can keep a reference to it.

    Parameters
    ----------
    df : pd.DataFrame
        Data for both the bars and the mean rule.
    x_col : str
        Nominal column for the x axis.
    y_col : str
        Quantitative column for the bar height; the rule shows its mean.
    offset_col : str
        Nominal column used to offset and color the bars within each x group.
    title : str
        Chart title.
    dropdown_col : str
        Column whose sorted unique values populate the dropdown; the chart
        is filtered to the selected value.

    Returns
    -------
    The layered Altair chart that was displayed.
    """
    route_dropdown = alt.binding_select(
        options=sorted(df[dropdown_col].unique().tolist()),
        name=labeling(dropdown_col),
    )

    # Column that controls the bar charts
    route_selector = alt.selection_point(
        fields=[dropdown_col],
        bind=route_dropdown,
    )

    ruler = (
        alt.Chart(df)
        .mark_rule(color="red", strokeDash=[10, 7])
        .encode(y=f"mean({y_col}):Q")
    )

    # Build the bars from the `df` argument; this used to read the
    # notebook-level `monthly_service`, ignoring whatever was passed in.
    chart = (
        alt.Chart(df)
        .mark_bar()
        .encode(
            x=alt.X(f"{x_col}:N", title=labeling(x_col)),
            y=alt.Y(f"{y_col}:Q", title=labeling(y_col)),
            xOffset=f"{offset_col}:N",
            color=alt.Color(
                f"{offset_col}:N",
                scale=alt.Scale(
                    range=cp.CALITP_SEQUENTIAL_COLORS,
                ),
            ),
        )
    )
    chart = (chart + ruler).properties(title=title, width=600, height=400)
    chart = chart.add_params(route_selector).transform_filter(route_selector)
    data = extract_data_altair(chart)
    display(chart, data)
    return chart

In [None]:
# Service hours by day type, bars split by time of day, with a dropdown on
# `full_date`.
# NOTE(review): the fifth argument is the chart title and is currently the
# literal string "title" — looks like a placeholder; confirm the wording.
monthly_chart = bar_chart_dropdown(
    monthly_service,
    "day_type",
    "ttl_service_hours",
    "time_of_day",
    "title",
    "full_date",
)

### Monthly Trends
* https://posit-dev.github.io/great-tables/articles/intro.html

In [None]:
# Sum of `n_scheduled_trips` per service date (rows) and `sched_rt_category`
# (columns). Date/category combinations without data are filled with 0.
by_date_category = (
    pd.crosstab(
        df.service_date,
        df.sched_rt_category,
        values=df.n_scheduled_trips,
        aggfunc="sum",
    )
    .reset_index()
    .fillna(0)
)

In [None]:
by_date_category = reverse_snakecase(by_date_category)

In [None]:
by_date_category.columns

In [None]:
# Table of daily trip counts: the service date labels each row and the three
# category columns sit under one spanner, formatted as integers.
# NOTE(review): the column labels are hard-coded — presumably this fails when
# the selected operator lacks one of the three categories; confirm.
(
    gt.GT(by_date_category, rowname_col="Service Date")
    .tab_spanner(
        label="Daily Trips by GTFS Availability",
        columns=["Schedule Only", "Vp Only", "Schedule And Vp"],
    )
    .fmt_integer(["Schedule Only", "Vp Only", "Schedule And Vp"])
    .tab_options(container_width="100%")
    .tab_options(table_font_size="12px")
)

In [None]:
# Number of distinct routes per `sched_rt_category`, counted on the
# `all_day` rows only.
route_categories = (
    df[df.time_period == "all_day"]
    .groupby("sched_rt_category")
    .agg({"route_combined_name": "nunique"})
    .reset_index()
)

In [None]:
route_categories.sched_rt_category = route_categories.sched_rt_category.str.replace(
    "_", " "
).str.title()

In [None]:
# Test with nanoplots

#### GTFS Availability
* Change Category values to something more understandable?

In [None]:
# Table of route counts per category: rows with missing values are dropped,
# the two columns are relabeled, and cells are colored with the listed palette
# (`na_color` is used for missing values).
(
    gt.GT(data=route_categories.dropna())
    .fmt_integer(columns=["route_combined_name"], compact=True)
    .cols_label(route_combined_name="# Routes", sched_rt_category="Category")
    .tab_options(container_width="100%")
    .tab_header(
        title=md("Routes with GTFS Availability"),
    )
    .tab_options(table_font_size="12px")
).data_color(
    palette=[
        "#2EA8CE",
        "#EB9F3C",
        "#F4D837",
    ],
    na_color="#FFE4C4",
)

#### Route Stats

In [None]:
# Keys shared by the all-day, peak and offpeak tables that are merged below.
route_merge_cols = ["route_combined_name", "direction_id"]

# All-day rows for the most recent service date, reduced to the merge keys
# plus the schedule metrics shown in the route table.
all_day_stats = df[
    (df.service_date == most_recent_date) & (df.time_period == "all_day")
][
    route_merge_cols
    + [
        "avg_scheduled_service_minutes",
        "avg_stop_miles",
        "n_scheduled_trips",
        "sched_rt_category",
    ]
]

In [None]:
all_day_stats.head(2)

In [None]:
# Peak-period rows for the most recent service date; the metric columns get a
# `peak_` prefix so they stay distinct after merging with the other tables.
peak_stats = df[(df.service_date == most_recent_date) & (df.time_period == "peak")][
    route_merge_cols + ["speed_mph", "n_scheduled_trips", "frequency"]
].rename(
    columns={
        "speed_mph": "peak_avg_speed",
        "n_scheduled_trips": "peak_scheduled_trips",
        "frequency": "peak_hourly_freq",
    }
)

In [None]:
peak_stats.head(2)

In [None]:
# Offpeak rows for the most recent service date; the metric columns get an
# `offpeak_` prefix, mirroring `peak_stats`.
offpeak_stats = df[
    (df.service_date == most_recent_date) & (df.time_period == "offpeak")
][route_merge_cols + ["speed_mph", "n_scheduled_trips", "frequency"]].rename(
    columns={
        "speed_mph": "offpeak_avg_speed",
        "n_scheduled_trips": "offpeak_scheduled_trips",
        "frequency": "offpeak_hourly_freq",
    }
)

In [None]:
offpeak_stats.head(2)

In [None]:
# Combine the all-day, peak and offpeak tables into one row per
# route/direction, sorted by route name and direction.
# NOTE(review): no `how=` is given, so both merges use pandas' default inner
# join — a route/direction missing from the peak or offpeak table is dropped.
# Confirm that is intended.
table_df = (
    pd.merge(
        all_day_stats,
        peak_stats,
        on=route_merge_cols,
    )
    .merge(offpeak_stats, on=route_merge_cols)
    .sort_values(["route_combined_name", "direction_id"])
    .reset_index(drop=True)
)

In [None]:
table_df.columns

In [None]:
table_df.sample()

In [None]:
# Replace missing values with 0 in the numeric columns only; non-numeric
# columns are left untouched.
numeric_cols = table_df.select_dtypes(include="number").columns
table_df[numeric_cols] = table_df[numeric_cols].fillna(0)

#### Updating Already Made Charts
#### Scheduled Minutes...Can just be a table?

In [None]:
# Rows usable for schedule-based metrics (everything except "vp_only") and
# rows usable for vehicle-position metrics (everything except "schedule_only").
sched_df = df[df.sched_rt_category != "vp_only"]
vp_df = df[df.sched_rt_category != "schedule_only"]

# Despite the name this is a DataFrame, not a chart: the all-day rows of sched_df.
sched_service_chart = sched_df[sched_df.time_period == "all_day"]

In [None]:
sched_service_chart.sample()

In [None]:
sched_service_chart.direction_id.value_counts()

#### Daily scheduled trips
* .facet(column=alt.Column("direction_id:N", title="direction_id"))

In [None]:
# Bars of scheduled trips per month, offset and colored by time period.
# Every column of sched_df is included in the tooltip.
facet_test1 = (
    (
        alt.Chart(sched_df)
        .mark_bar()
        .encode(
            x=alt.X("month_year"),
            xOffset="time_period:N",
            y=alt.Y("n_scheduled_trips:Q"),
            color=alt.Color(
                "time_period:N", scale=alt.Scale(range=cp.CALITP_SEQUENTIAL_COLORS)
            ),
            tooltip=sched_df.columns.to_list(),
        )
    )
).properties(width=300, height=300)

In [None]:
facet_test1.facet(column=alt.Column("direction_id:N", title="direction_id"))

In [None]:
column_of_interest = "n_scheduled_trips"

In [None]:
# fake_faceted_chart2(sched_df, "n_scheduled_trips", "Total Scheduled Daily Trips")

#### Frequency
* What does Frequency mean?

In [None]:
# fake_faceted_chart2(sched_df, "frequency", "Frequency of Trips per Hour")

In [None]:
sched_df.sample()

#### Average Speed

In [None]:
# fake_faceted_chart2(vp_df, "speed_mph", "Average Speed")

#### Fake Faceted Chart
* List dropdown doesn't work. Sadly.

In [None]:
def fake_faceted_chart2(df: pd.DataFrame, column_of_interest: str, title: str = ""):
    """
    Imitate a chart faceted by direction: two bar charts side by side.

    One panel is built from the rows with `direction_id == 0` and one from
    the rows with `direction_id == 1`. Both are filtered by the same route
    dropdown and both get a dashed red rule at the mean of
    `column_of_interest`. The rule is computed from all of `df` and is not
    filtered by the dropdown.

    Parameters
    ----------
    df : pd.DataFrame
        Needs `route_combined_name`, `direction_id`, `time_period`,
        `month_year`, the other tooltip columns and `column_of_interest`.
    column_of_interest : str
        Quantitative column plotted on the y axis.
    title : str
        Title shown above the direction 0 panel. The parameter used to be
        written `title=str`, which made the `str` type itself the default;
        it is now an ordinary string parameter with an empty default.

    Returns
    -------
    The horizontally concatenated Altair chart.
    """
    dropdown_list = df["route_combined_name"].unique().tolist()
    input_dropdown = alt.binding_select(options=dropdown_list, name="Routes")
    selection = alt.selection_point(fields=["route_combined_name"], bind=input_dropdown)

    ruler = (
        alt.Chart(df)
        .mark_rule(color="red", strokeDash=[10, 7])
        .encode(y=f"mean({column_of_interest}):Q")
    )

    tooltip_cols = [
        "direction_id",
        "time_period",
        "route_combined_name",
        "organization_name",
        "caltrans_district",
        "day_type",
        "month_year",
        column_of_interest,
    ]

    def _direction_panel(data, panel_title, y_encoding):
        # One panel; the two directions differ only in data, title and y axis.
        return (
            alt.Chart(data, title=panel_title)
            .mark_bar()
            .encode(
                x=alt.X("month_year", title=labeling("month_year")),
                xOffset="time_period:N",
                y=y_encoding,
                color=alt.Color(
                    "time_period:N",
                    scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
                ),
                tooltip=tooltip_cols,
            )
            .add_params(selection)
            .transform_filter(selection)
            .properties(width=300, height=300)
        )

    chart_dir_0 = _direction_panel(
        df.loc[df.direction_id == 0],
        alt.Title(
            title,
            subtitle="Direction 0",
        ),
        alt.Y(f"{column_of_interest}:Q", title=labeling(column_of_interest)),
    )
    # The second panel carries a white placeholder title (presumably so it
    # takes up the same vertical space as the first) and no y axis.
    chart_dir_1 = _direction_panel(
        df.loc[df.direction_id == 1],
        alt.Title(
            "Title",
            color="white",
            subtitle="Direction 1",
        ),
        alt.Y(f"{column_of_interest}:Q", axis=None),
    )

    chart_dir_0 = chart_dir_0 + ruler
    chart_dir_1 = chart_dir_1 + ruler
    final_chart = alt.hconcat(chart_dir_0, chart_dir_1)
    return final_chart

In [None]:
avg_speeds = fake_faceted_chart2(vp_df, "speed_mph", "Average Speed")

In [None]:
# extract_data(avg_speeds)

In [None]:
def filtered_route_charts1(
    df: pd.DataFrame,
    control_field: str,
) -> None:
    """
    Display daily trips, trip frequency and average speed, stacked vertically.

    https://stackoverflow.com/questions/58919888/multiple-selections-in-altair

    Each row comes from `fake_faceted_chart2`, which builds its own route
    dropdown. The dropdown and selection that used to be created here were
    never attached to any chart and have been removed.

    Parameters
    ----------
    df : pd.DataFrame
        Metrics table with a `sched_rt_category` column plus the columns
        `fake_faceted_chart2` reads.
    control_field : str
        Currently unused, kept so existing calls keep working;
        `fake_faceted_chart2` always filters on `route_combined_name`.

    Returns
    -------
    None
        The combined chart is displayed, not returned.
    """
    # Schedule-based metrics skip "vp_only" rows; speeds skip "schedule_only" rows.
    sched_df = df[df.sched_rt_category != "vp_only"]
    vp_df = df[df.sched_rt_category != "schedule_only"]

    daily_trips = fake_faceted_chart2(
        sched_df, "n_scheduled_trips", "Total Scheduled Daily Trips"
    )
    frequency = fake_faceted_chart2(
        sched_df, "frequency", "Frequency of Trips per Hour"
    )
    avg_speeds = fake_faceted_chart2(vp_df, "speed_mph", "Average Speed")

    chart_lists = [daily_trips, frequency, avg_speeds]
    # Independent y scales: the three metrics have unrelated magnitudes.
    chart = alt.vconcat(*chart_lists).resolve_scale(y="independent")
    display(chart)

In [None]:
filtered_route_charts1(df, "route_combined_name")

#### Actual Faceted Charts
* Issue: the ruler meant to guide people's eyes does not work once all the charts are concatenated with `vconcat` so that one dropdown menu controls them.


In [None]:
def base_facet(df: pd.DataFrame, column_of_interest: str, title: str):
    """
    Bar chart of `column_of_interest` by month, faceted by direction.

    Bars are offset and colored by time period. A dashed red rule at the
    mean of `column_of_interest` is layered on top when the layered chart
    can be faceted; otherwise only the bars are faceted.

    Parameters
    ----------
    df : pd.DataFrame
        Data for both the bars and the rule. The bars used to be built from
        the notebook-level `sched_df` regardless of this argument.
    column_of_interest : str
        Quantitative column plotted on the y axis.
    title : str
        Chart title; it is passed through `labeling` before use.

    Returns
    -------
    The faceted Altair chart.
    """
    tooltip_cols = [
        "direction_id",
        "time_period",
        "route_combined_name",
        "organization_name",
        "caltrans_district",
        "day_type",
        "month_year",
        column_of_interest,
    ]

    ruler = (
        alt.Chart(df)
        .mark_rule(color="red", strokeDash=[10, 7])
        .encode(y=f"mean({column_of_interest}):Q")
    )

    chart1 = (
        alt.Chart(df)
        .mark_bar()
        .encode(
            x=alt.X("month_year", title=labeling("Date")),
            xOffset="time_period:N",
            y=alt.Y(f"{column_of_interest}:Q", title=labeling(column_of_interest)),
            color=alt.Color(
                "time_period:N",
                scale=alt.Scale(range=cp.CALITP_SEQUENTIAL_COLORS),
            ),
            tooltip=tooltip_cols,
        )
        .properties(title=labeling(title), width=300, height=300)
        .interactive()
    )

    facet_column = alt.Column("direction_id:N", title="Direction ID")
    try:
        chart1 = (chart1 + ruler).facet(column=facet_column)
    except ValueError:
        # The layered chart could not be faceted (e.g. the layers do not
        # share top-level data): fall back to the bars without the ruler.
        chart1 = chart1.facet(column=facet_column)
    return chart1

In [None]:
# base_facet(sched_df, "n_scheduled_trips", "Total Scheduled Daily Trips")

In [None]:
freq_df = base_facet(sched_df, "frequency", "Frequency of Trips per Hour")

In [None]:
# freq_df

##### Can't use extract_data with faceted chart
* Or charts created using `hconcat`.

In [None]:
# extract_data(freq_df)

In [None]:
# base_facet(vp_df, "speed_mph", "Average Speed")

In [None]:
def filtered_route_charts2(
    df: pd.DataFrame,
    control_field: str,
) -> None:
    """
    Display the faceted daily-trips and frequency charts under one dropdown.

    https://stackoverflow.com/questions/58919888/multiple-selections-in-altair

    The earlier `extract_data` calls on the faceted charts were removed:
    their results were never used, and this notebook notes that
    `extract_data` cannot be used with faceted charts.

    Parameters
    ----------
    df : pd.DataFrame
        Metrics table with `sched_rt_category`, `control_field` and the
        columns `base_facet` reads.
    control_field : str
        Column whose sorted unique values populate the dropdown and on which
        both charts are filtered.

    Returns
    -------
    None
        The combined chart is displayed, not returned.
    """
    route_dropdown = alt.binding_select(
        options=sorted(df[control_field].unique().tolist()),
        name="Routes ",
    )

    # Column that controls the bar charts
    route_selector = alt.selection_point(
        fields=[control_field],
        bind=route_dropdown,
    )

    # Both charts are schedule-based, so "vp_only" rows are excluded.
    sched_df = df[df.sched_rt_category != "vp_only"]

    daily_trips = (
        base_facet(sched_df, "n_scheduled_trips", "Total Scheduled Daily Trips")
        .add_params(route_selector)
        .transform_filter(route_selector)
    )
    frequency = (
        base_facet(sched_df, "frequency", "Frequency of Trips per Hour")
        .add_params(route_selector)
        .transform_filter(route_selector)
    )

    chart_lists = [daily_trips, frequency]
    chart = alt.vconcat(*chart_lists).resolve_scale(y="independent")

    display(chart)

In [None]:
# filtered_route_charts2(df, "route_combined_name")

#### Test 3

In [None]:
def grouped_bar_chart(
    df: pd.DataFrame,
    x_col: str,
    y_col: str,
    offset_col: str,
    title: str,
):
    """
    Grouped bar chart with a dashed red rule at the mean of `y_col`.

    `x_col` is plotted as a nominal x axis and `y_col` as a quantitative
    y axis; `offset_col` both offsets the bars within each x group and
    colors them. Returns the layered chart (bars plus rule), 600x400.
    """
    mean_line = (
        alt.Chart(df)
        .mark_rule(color="red", strokeDash=[10, 7])
        .encode(y=f"mean({y_col}):Q")
    )

    offset_field = f"{offset_col}:N"
    bar_colors = alt.Scale(range=cp.CALITP_SEQUENTIAL_COLORS)
    bars = (
        alt.Chart(df)
        .mark_bar()
        .encode(
            x=alt.X(f"{x_col}:N", title=labeling(x_col)),
            y=alt.Y(f"{y_col}:Q", title=labeling(y_col)),
            xOffset=offset_field,
            color=alt.Color(offset_field, scale=bar_colors),
        )
    )

    layered = bars + mean_line
    return layered.properties(title=title, width=600, height=400)

In [None]:
def filtered_route_charts3(
    df: pd.DataFrame,
    control_field: str,
) -> None:
    """
    Display daily trips, frequency and average speed under one dropdown.

    https://stackoverflow.com/questions/58919888/multiple-selections-in-altair

    Each chart is a `grouped_bar_chart` by month with bars grouped by
    direction; all three are filtered by the same dropdown selection.

    Parameters
    ----------
    df : pd.DataFrame
        Metrics table with `sched_rt_category`, `control_field`,
        `month_year`, `direction_id` and the three metric columns.
    control_field : str
        Column whose sorted unique values populate the dropdown and on which
        the charts are filtered.

    Returns
    -------
    None
        The combined chart is displayed, not returned.
    """
    route_dropdown = alt.binding_select(
        options=sorted(df[control_field].unique().tolist()),
        name="Routes ",
    )

    # Column that controls the bar charts
    route_selector = alt.selection_point(
        fields=[control_field],
        bind=route_dropdown,
    )

    # Schedule-based metrics skip "vp_only" rows; speeds skip "schedule_only" rows.
    sched_df = df[df.sched_rt_category != "vp_only"]
    vp_df = df[df.sched_rt_category != "schedule_only"]

    def _with_route_filter(chart):
        # Attach the shared dropdown and filter the chart by its selection.
        return chart.add_params(route_selector).transform_filter(route_selector)

    daily_trips_chart = _with_route_filter(
        grouped_bar_chart(
            sched_df,
            "month_year",
            "n_scheduled_trips",
            "direction_id",
            "Total Daily Trips",
        )
    )
    frequency_chart = _with_route_filter(
        grouped_bar_chart(
            sched_df,
            "month_year",
            "frequency",
            "direction_id",
            "Frequency of Route Run",
        )
    )
    speed_chart = _with_route_filter(
        grouped_bar_chart(
            vp_df, "month_year", "speed_mph", "direction_id", "Average Speed"
        )
    )

    chart_lists = [daily_trips_chart, frequency_chart, speed_chart]
    chart = alt.vconcat(*chart_lists).resolve_scale(y="independent")

    display(chart)
    

In [None]:
# Notebook-level copy of the body of `filtered_route_charts3`, so the
# individual charts can be inspected in the cells below.
# `route_selector` was only ever defined inside functions, so it is created
# here as well; otherwise this cell fails with a NameError.
route_dropdown = alt.binding_select(
    options=sorted(df["route_combined_name"].unique().tolist()),
    name="Routes ",
)
route_selector = alt.selection_point(
    fields=["route_combined_name"],
    bind=route_dropdown,
)

daily_trips_chart = (
    grouped_bar_chart(
        sched_df, "month_year", "n_scheduled_trips", "direction_id", "Total Daily Trips"
    )
    .add_params(route_selector)
    .transform_filter(route_selector)
)
frequency_chart = (
    grouped_bar_chart(
        sched_df, "month_year", "frequency", "direction_id", "Frequency of Route Run"
    )
    .add_params(route_selector)
    .transform_filter(route_selector)
)

speed_chart = (
    grouped_bar_chart(vp_df, "month_year", "speed_mph", "direction_id", "Average Speed")
    .add_params(route_selector)
    .transform_filter(route_selector)
)

In [None]:
chart_lists = [daily_trips_chart, frequency_chart, speed_chart]
all_charts = alt.vconcat(*chart_lists).resolve_scale(y="independent")

In [None]:
all_charts

In [None]:
all_charts_dict = all_charts.to_dict()

In [None]:
# filtered_route_charts3(df, 'route_combined_name')