## Round 1 
* Updating existing charts made by Tiffany. https://gtfs-digest--cal-itp-data-analyses.netlify.app/
* Environment setup: `cd rt_segment_speeds && pip install altair_transform && pip install -r requirements.txt && cd ../_shared_utils && make setup_env`

Links
* https://github.com/cal-itp/data-analyses/issues/1059
* https://docs.google.com/document/d/1I1WiqlmU06W6iLCi7cZQrOCLILkrEfABEkcU0Jys7f0/edit
* https://route-speeds--cal-itp-data-analyses.netlify.app/name_bay-area-511-muni-schedule/0__report__name_bay-area-511-muni-schedule
* https://posit-dev.github.io/great-tables/get-started/nanoplots.html
* https://docs.pola.rs/py-polars/html/reference/api/polars.from_pandas.html
* https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/_rt_scheduled_utils.py
* https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/_threshold_utils.py

In [1]:
import _report_utils
import _section2_utils as section2
import altair as alt
import geopandas as gpd
import pandas as pd
from segment_speed_utils.project_vars import RT_SCHED_GCS, SCHED_GCS
from shared_utils import catalog_utils, rt_dates, rt_utils

alt.data_transformers.enable("default", max_rows=None)

DataTransformerRegistry.enable('default')

In [2]:
# Notebook-wide pandas display settings: show up to 100 columns, format floats
# with two decimals, and do not truncate rows or cell contents.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data")
import yaml

with open("readable.yml") as f:
    readable_dict = yaml.safe_load(f)

In [4]:
with open("color_palettes.yml") as f:
    color_dict = yaml.safe_load(f)

In [5]:
org_name = "City of Fairfield"

### Original File 

In [6]:
schd_vp_url = f"{GTFS_DATA_DICT.digest_tables.dir}{GTFS_DATA_DICT.digest_tables.route_schedule_vp}.parquet"

In [7]:
og = pd.read_parquet(schd_vp_url)

In [8]:
og.columns

Index(['schedule_gtfs_dataset_key', 'direction_id', 'time_period',
       'avg_scheduled_service_minutes', 'avg_stop_miles', 'n_scheduled_trips',
       'frequency', 'is_express', 'is_rapid', 'is_rail', 'is_coverage',
       'is_downtown_local', 'is_local', 'service_date', 'typology',
       'minutes_atleast1_vp', 'minutes_atleast2_vp',
       'total_rt_service_minutes', 'total_scheduled_service_minutes',
       'total_vp', 'vp_in_shape', 'is_early', 'is_ontime', 'is_late',
       'n_vp_trips', 'vp_per_minute', 'pct_in_shape',
       'pct_rt_journey_atleast1_vp', 'pct_rt_journey_atleast2_vp',
       'pct_sched_journey_atleast1_vp', 'pct_sched_journey_atleast2_vp',
       'rt_sched_journey_ratio', 'avg_rt_service_minutes', 'sched_rt_category',
       'speed_mph', 'name', 'route_long_name', 'route_short_name',
       'route_combined_name', 'route_id', 'schedule_source_record_id',
       'base64_url', 'organization_source_record_id', 'organization_name',
       'caltrans_district'],
     

### Check Out Duplicates

In [9]:
df = section2.load_schedule_vp_metrics(org_name)

In [10]:
# Take an explicit copy of the all-day rows: the chart helper this frame is
# passed to assigns columns on it ("Period", the y column, ...), and doing
# that on an unguarded slice makes pandas emit SettingWithCopyWarning.
all_day = df.loc[df["Period"] == "all_day"].copy()

In [11]:
df.frequency_in_minutes.describe()

count   476.00
mean    103.44
std      86.37
min      26.67
25%      51.28
50%      80.00
75%     120.00
max     750.00
Name: frequency_in_minutes, dtype: float64

In [12]:
df.columns

Index(['schedule_gtfs_dataset_key', 'Direction', 'Period',
       'Average Scheduled Service (trip minutes)',
       'Average Stop Distance (miles)', '# scheduled trips', 'Trips per Hour',
       'is_express', 'is_rapid', 'is_rail', 'is_coverage', 'is_downtown_local',
       'is_local', 'Date', 'Route typology', '# Minutes with 1+ VP per Minute',
       '# Minutes with 2+ VP per Minute', 'Aggregate Actual Service Minutes',
       'Aggregate Scheduled Service Minutes (all trips)', '# VP',
       '# VP within Scheduled Shape', '# Early Arrival Trips',
       '# On-Time Trips', '# Late Trips', '# Trips with VP',
       'Average VP per Minute', '% VP within Scheduled Shape',
       'pct_rt_journey_atleast1_vp', 'pct_rt_journey_atleast2_vp',
       '% Scheduled Trip w/ 1+ VP/Minute', '% Scheduled Trip w/ 2+ VP/Minute',
       'Realtime versus Scheduled Service Ratio',
       'Average Actual Service (Trip Minutes)', 'GTFS Availability',
       'Speed (MPH)', 'Transit Operator', 'route_long_n

In [13]:
df.frequency_in_minutes.describe()

count   476.00
mean    103.44
std      86.37
min      26.67
25%      51.28
50%      80.00
75%     120.00
max     750.00
Name: frequency_in_minutes, dtype: float64

In [14]:
df['Route'].unique()

array(['1 FTC/Fairfield Walmart',
       '2 Solano Town Center/Fairfield Vacaville Train Station',
       '3 FTC/Solano Town Center/Fairfield Walmart',
       '4 Smart & Final/Travis Air Force Base',
       '6 Solano Town Center/East Tabor & Sunset',
       '7 FTC/Solano College/Cordelia Library',
       '8 Cordelia Library/Oakbrook Dr', '8T Cordelia School Tripper'],
      dtype=object)

### One Route

In [15]:
one_route = df.loc[df["Route"] == '1 FTC/Fairfield Walmart']

In [16]:
# one_route.shape

In [17]:
one_route_all_day = one_route.loc[one_route["Period"] == "all_day"]

In [18]:
one_route_all_day["Route"].values[0]

'1 FTC/Fairfield Walmart'

#### Cleaning up Code

In [19]:
def set_y_axis(df, y_col):
    """
    Pick the upper bound of the y-axis for the metric stored in `y_col`.

    The column label decides the rule, checked in this order:
    * contains "%": fixed at 100.
    * contains "VP": fixed at 3.
    * contains "Minute": the column maximum rounded to the nearest integer.
    * anything else: the column maximum rounded to the nearest ten, plus 5.
    """
    # Fixed ceilings never need to look at the data.
    if "%" in y_col:
        return 100
    if "VP" in y_col:
        return 3

    # Data-driven ceilings are derived from the largest observed value.
    column_max = df[y_col].max()
    if "Minute" in y_col:
        return round(column_max)
    return round(column_max, -1) + 5

#### Try to make only the first route show.

In [20]:
all_day.shape

(164, 48)

In [21]:
# Chart "% VP within Scheduled Shape" for the all-day rows against the
# "ruler_100_pct" column. The last two arguments are six breakpoints (0-100)
# and six hex colours -- presumably the colour-scale domain and range; confirm
# against section2.base_facet_with_ruler_chart.
# NOTE(review): the "title" entry of readable_dict is passed for both the 4th
# and 5th arguments; the speed graph call passes "title" then "subtitle", so the
# 5th argument here was probably meant to be the "subtitle" entry -- confirm the
# key exists in readable.yml before changing it.
# freq_range: ["#7cc665","#d2d776", "#fde18d", "#fcaa5f", "#ec5d3b"]
spatial_accuray_chart = section2.base_facet_with_ruler_chart(
    all_day,
    "% VP within Scheduled Shape",
    "ruler_100_pct",
    readable_dict["spatial_accuracy_graph"]["title"],
    readable_dict["spatial_accuracy_graph"]["title"],
    [0,20, 40, 60, 80, 100],
    ["#ec5d3b","#ec5d3b","#fcaa5f","#fde18d", "#d2d776","#7cc665",]
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Period"] = df["Period"].str.replace("_", " ").str.title()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[y_col] = df[y_col].fillna(0).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f"{y_col}_str"] = df[y_col].astype(str)


In [22]:
spatial_accuray_chart

In [23]:
 speed_graph = (
        section2.base_facet_line(
            df,
            "Speed (MPH)",
            readable_dict["speed_graph"]["title"],
            readable_dict["speed_graph"]["subtitle"],
        ))

In [24]:
speed_graph

In [25]:
df.columns

Index(['schedule_gtfs_dataset_key', 'Direction', 'Period',
       'Average Scheduled Service (trip minutes)',
       'Average Stop Distance (miles)', '# scheduled trips', 'Trips per Hour',
       'is_express', 'is_rapid', 'is_rail', 'is_coverage', 'is_downtown_local',
       'is_local', 'Date', 'Route typology', '# Minutes with 1+ VP per Minute',
       '# Minutes with 2+ VP per Minute', 'Aggregate Actual Service Minutes',
       'Aggregate Scheduled Service Minutes (all trips)', '# VP',
       '# VP within Scheduled Shape', '# Early Arrival Trips',
       '# On-Time Trips', '# Late Trips', '# Trips with VP',
       'Average VP per Minute', '% VP within Scheduled Shape',
       'pct_rt_journey_atleast1_vp', 'pct_rt_journey_atleast2_vp',
       '% Scheduled Trip w/ 1+ VP/Minute', '% Scheduled Trip w/ 2+ VP/Minute',
       'Realtime versus Scheduled Service Ratio',
       'Average Actual Service (Trip Minutes)', 'GTFS Availability',
       'Speed (MPH)', 'Transit Operator', 'route_long_n

### Dropdown selection in Altair (reference: https://stackoverflow.com/questions/70937066/make-dropdown-selection-responsive-for-y-axis-altair-python)

In [26]:
unique_routes = list(df['Route'].unique())

In [27]:
dropdown = alt.binding_select(
    options = unique_routes,
    name='Route: '
)

In [28]:
xcol_param = alt.param(
    value=unique_routes[0],
    bind=dropdown
)


In [29]:
# Show only the route picked in the dropdown.
# The dropdown options are route names (values of the "Route" column), not
# column names, so `datum[<param>]` would look up a field that does not exist.
# Comparing the "Route" field against the parameter is what filters the chart.
speed_graph.add_params(
    xcol_param
).transform_filter(
    alt.datum.Route == xcol_param
)

In [30]:
speed_graph

### Sample

In [31]:
import altair as alt
from vega_datasets import data

dropdown = alt.binding_select(
    options=['Miles_per_Gallon', 'Displacement', 'Weight_in_lbs', 'Acceleration'],
    name='X-axis column '
)
xcol_param = alt.param(
    value='Miles_per_Gallon',
    bind=dropdown
)

alt.Chart(data.cars.url).mark_circle().encode(
    x=alt.X('x:Q', title=''),
    y='Horsepower:Q',
    color='Origin:N'
).transform_calculate(
    x=f'datum[{xcol_param.name}]'
).add_params(
    xcol_param
)

In [32]:
xcol_param.name

'param_2'

In [33]:
alt.Chart(data.cars.url).mark_circle().encode(
    x=alt.X('x:Q', title=''),
    y='Horsepower:Q',
    color='Origin:N'
)

In [34]:
# Toy frame for trying out a dropdown whose options are VALUES of column "a".
source = pd.DataFrame({
    'a': ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I'],
    'b': [28, 55, 43, 91, 81, 53, 19, 87, 52]
})

source_unique_vals = list(source.a.unique())

dropdown = alt.binding_select(
    options=source_unique_vals,
    name='X-axis column '
)

xcol_param = alt.param(
    value=source_unique_vals[0],
    bind=dropdown
)

# The parameter holds a value of "a" (e.g. 'A'), not a column name, so the
# chart is filtered on `a == <selected value>`; `datum[<param>]` would try to
# read a field named 'A', which the frame does not have.
alt.Chart(source).mark_bar().encode(
    x='a',
    y='b'
).add_params(
    xcol_param
).transform_filter(
    alt.datum.a == xcol_param
)

In [35]:
from vega_datasets import data
cars_data = data.cars()

In [36]:
cars_unique_vals = list(cars_data['Origin'].unique())

dropdown = alt.binding_select(
    options=cars_unique_vals,
    name='Select Origin: '
)

xcol_param = alt.param(
    value=cars_unique_vals[0],
    bind=dropdown
)


In [37]:
# Scatter of horsepower vs. mileage, restricted to the origin chosen in the
# "Select Origin: " dropdown. The dropdown options are values of the "Origin"
# column, so the selection is applied as a row filter on that column rather
# than being used as a field name in a calculate transform.
alt.Chart(cars_data).mark_circle(size=60).encode(
    x='Horsepower',
    y='Miles_per_Gallon',
    color='Origin',
    tooltip=['Name', 'Origin', 'Horsepower', 'Miles_per_Gallon']
).add_params(
    xcol_param
).transform_filter(
    alt.datum.Origin == xcol_param
)

#### Text

In [None]:
table_df = section2.route_stats(df)

In [None]:
def create_text_table(df: pd.DataFrame, direction: float):
    """
    Render the route statistics for one direction as a text-only Altair chart.

    Rows of `df` whose "Direction" equals `direction` are de-duplicated and
    melted into one "<statistic>: <value>" string per row; each string is
    drawn as a text mark. When no rows match, the chart returned by
    `section2.create_data_unavailable_chart()` is returned instead.
    """
    id_columns = ["Route", "Direction"]
    statistic_columns = [
        "Average Scheduled Service (Trip Minutes)",
        "Average Stop Distance (Miles)",
        "# Scheduled Trips",
        "Gtfs Availability",
        "Peak Avg Speed",
        "Peak Scheduled Trips",
        "Peak Hourly Freq",
        "Offpeak Avg Speed",
        "Offpeak Scheduled Trips",
        "Trips Per Hour",
    ]

    one_direction = (
        df.loc[df["Direction"] == direction].drop_duplicates().reset_index(drop=True)
    )

    # Nothing to display for this direction.
    if len(one_direction) == 0:
        return section2.create_data_unavailable_chart()

    stats_long = one_direction.melt(id_vars=id_columns, value_vars=statistic_columns)

    # Constant x position so every text row sits in the same column.
    stats_long["Zero"] = 0

    # One display string per statistic, with the raw availability code swapped
    # for its readable label.
    labels = stats_long["variable"].astype(str) + ": " + stats_long["value"].astype(str)
    stats_long["combo_col"] = labels.str.replace(
        "schedule_and_vp", "Schedule and Realtime Data"
    )

    return (
        alt.Chart(stats_long)
        .mark_text()
        .encode(
            x=alt.X("Zero:Q", axis=None),
            y=alt.Y("combo_col", axis=None),
            text="combo_col:N",
        )
        .properties(
            title=f"Route Statistics for Direction {direction}",
            width=500,
            height=300,
        )
    )

In [None]:
create_text_table(table_df, 0)

#### Putting it all together

In [None]:
def _apply_route_selector(chart, route_selector):
    """Register the route dropdown on `chart` and filter the chart by it."""
    return chart.add_params(route_selector).transform_filter(route_selector)


def filtered_route_test(
    df: pd.DataFrame,
) -> alt.Chart:
    """
    Build every route-level chart and stack them vertically, all driven by a
    single "Routes" dropdown.

    https://stackoverflow.com/questions/58919888/multiple-selections-in-altair

    Parameters
    ----------
    df : pd.DataFrame
        Metrics frame. This function reads its "Route" and "Period" columns
        directly and hands the frame (or its all-day subset) to the chart
        helpers.

    Returns
    -------
    The result of `alt.vconcat` over the eleven charts, each one bound to and
    filtered by the same route selection.

    Notes
    -----
    The chart helpers are called through `section2` because that module alias
    is the only place this notebook brings them into scope; `create_text_table`
    is the version defined in this notebook.
    """
    # Create dropdown
    routes_list = df["Route"].unique().tolist()

    route_dropdown = alt.binding_select(
        options=routes_list,
        name="Routes",
    )
    # Point selection on "Route" that every chart below is filtered by.
    route_selector = alt.selection_point(
        fields=["Route"],
        bind=route_dropdown,
    )

    # Keep only the all-day rows.
    # NOTE(review): earlier cells in this notebook filter on "all_day"; confirm
    # which spelling `df` carries when this function is called, otherwise this
    # subset is empty.
    all_day = df.loc[df["Period"] == "All Day"].reset_index(drop=True)

    # Create route stats table for the text tables
    route_stats_df = section2.route_stats(df)

    # Manipulate the df for some of the metrics
    timeliness_df = section2.timeliness_trips(df)

    rt_journey_vp = section2.pct_vp_journey(
        all_day,
        "% Actual Trip Minutes with 1+ VP per Minute",
        "% Actual Trip Minutes with 2+ VP per Minute",
    )
    sched_journey_vp = section2.pct_vp_journey(
        all_day,
        "% Scheduled Trip Minutes with 1+ VP per Minute",
        "% Scheduled Trip Minutes with 2+ VP per Minute",
    )

    avg_scheduled_min_graph = _apply_route_selector(
        section2.grouped_bar_chart(
            df=all_day,
            color_col="Direction",
            y_col="Average Scheduled Service (trip minutes)",
            offset_col="Direction",
            title=readable_dict["avg_scheduled_min_graph"]["title"],
            subtitle=readable_dict["avg_scheduled_min_graph"]["subtitle"],
        ),
        route_selector,
    )

    timeliness_trips_dir_0 = _apply_route_selector(
        section2.base_facet_chart(
            timeliness_df.loc[timeliness_df["Direction"] == 0],
            "value",
            "variable",
            "Period",
            readable_dict["timeliness_trips_dir_0_graph"]["title"],
            readable_dict["timeliness_trips_dir_0_graph"]["subtitle"],
        ),
        route_selector,
    )

    # NOTE(review): the direction-1 chart reuses the direction-0 subtitle key;
    # confirm this is intentional.
    timeliness_trips_dir_1 = _apply_route_selector(
        section2.base_facet_chart(
            timeliness_df.loc[timeliness_df["Direction"] == 1],
            "value",
            "variable",
            "Period",
            readable_dict["timeliness_trips_dir_1_graph"]["title"],
            readable_dict["timeliness_trips_dir_0_graph"]["subtitle"],
        ),
        route_selector,
    )

    frequency_graph = _apply_route_selector(
        section2.frequency_chart(df),
        route_selector,
    )

    speed_graph = _apply_route_selector(
        section2.base_facet_line(
            df,
            "Speed (MPH)",
            readable_dict["speed_graph"]["title"],
            readable_dict["speed_graph"]["subtitle"],
        ),
        route_selector,
    )

    # NOTE(review): an earlier cell calls base_facet_with_ruler_chart with two
    # extra list arguments (breakpoints and colours); confirm the five-argument
    # form used here and for the spatial accuracy chart is still accepted.
    vp_per_min_graph = _apply_route_selector(
        section2.base_facet_with_ruler_chart(
            all_day,
            "Average VP per Minute",
            "ruler_for_vp_per_min",
            readable_dict["vp_per_min_graph"]["title"],
            readable_dict["vp_per_min_graph"]["subtitle"],
        ),
        route_selector,
    )

    rt_vp_per_min_graph = _apply_route_selector(
        section2.base_facet_circle(
            rt_journey_vp,
            "% of Actual Trip Minutes",
            "Category",
            "ruler_100_pct",
            readable_dict["rt_vp_per_min_graph"]["title"],
            readable_dict["rt_vp_per_min_graph"]["subtitle"],
        ),
        route_selector,
    )

    # NOTE(review): the scheduled chart uses the "% of Actual Trip Minutes"
    # column and the rt_vp_per_min_graph subtitle; confirm both are intended.
    sched_vp_per_min = _apply_route_selector(
        section2.base_facet_circle(
            sched_journey_vp,
            "% of Actual Trip Minutes",
            "Category",
            "ruler_100_pct",
            readable_dict["sched_vp_per_min_graph"]["title"],
            readable_dict["rt_vp_per_min_graph"]["subtitle"],
        ),
        route_selector,
    )

    # NOTE(review): the "title" entry is passed for both title and subtitle;
    # presumably the second should be the "subtitle" entry -- confirm the key
    # exists in readable.yml.
    spatial_accuracy = _apply_route_selector(
        section2.base_facet_with_ruler_chart(
            all_day,
            "% VP within Scheduled Shape",
            "ruler_100_pct",
            readable_dict["spatial_accuracy_graph"]["title"],
            readable_dict["spatial_accuracy_graph"]["title"],
        ),
        route_selector,
    )

    text_dir0 = _apply_route_selector(
        create_text_table(route_stats_df, 0),
        route_selector,
    )

    text_dir1 = _apply_route_selector(
        create_text_table(route_stats_df, 1),
        route_selector,
    )

    chart_list = [
        avg_scheduled_min_graph,
        timeliness_trips_dir_0,
        timeliness_trips_dir_1,
        frequency_graph,
        speed_graph,
        vp_per_min_graph,
        rt_vp_per_min_graph,
        sched_vp_per_min,
        spatial_accuracy,
        text_dir0,
        text_dir1,
    ]

    chart = alt.vconcat(*chart_list)

    return chart

In [None]:
filtered_route_test(df)