# [route-direction viz wrangling](https://github.com/orgs/cal-itp/projects/30/views/1?pane=issue&itemId=105612776)


In [1]:
import merge_data
import numpy as np
import pandas as pd
import yaml
from segment_speed_utils import gtfs_schedule_wrangling, helpers, time_series_utils
from segment_speed_utils.project_vars import COMPILED_CACHED_VIEWS, PROJECT_CRS
from shared_utils import (
    catalog_utils,
    dask_utils,
    gtfs_utils_v2,
    portfolio_utils,
    publish_utils,
    rt_dates,
)
from update_vars import GTFS_DATA_DICT, RT_SCHED_GCS, SCHED_GCS, SEGMENT_GCS

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
analysis_date_list = rt_dates.y2024_dates

In [4]:
FILE = GTFS_DATA_DICT.schedule_tables.gtfs_key_crosswalk

crosswalk_cols = [
    "schedule_gtfs_dataset_key",
    "name",
    "organization_name",
]

df = time_series_utils.concatenate_datasets_across_dates(
    SCHED_GCS, FILE, analysis_date_list, data_type="df", columns=crosswalk_cols
).drop_duplicates()

* This operator has multiple organization_names. 
* This is not correct, we want to keep only one. 
* Which one is the right one to select?
* By mapping Foothill Schedule originally, we are merely going back to name. 

In [5]:
df.loc[df.name.str.contains("Foothill")]

Unnamed: 0,schedule_gtfs_dataset_key,name,organization_name,service_date
99,f74424acf8c41e4c1e9fd42838c4875c,Foothill Schedule,City of Duarte,2024-02-14
100,f74424acf8c41e4c1e9fd42838c4875c,Foothill Schedule,Foothill Transit,2024-02-14
97,f74424acf8c41e4c1e9fd42838c4875c,Foothill Schedule,City of Duarte,2024-03-13
98,f74424acf8c41e4c1e9fd42838c4875c,Foothill Schedule,Foothill Transit,2024-03-13
100,f74424acf8c41e4c1e9fd42838c4875c,Foothill Schedule,City of Duarte,2024-04-17
101,f74424acf8c41e4c1e9fd42838c4875c,Foothill Schedule,Foothill Transit,2024-04-17
97,f74424acf8c41e4c1e9fd42838c4875c,Foothill Schedule,City of Duarte,2024-05-22
98,f74424acf8c41e4c1e9fd42838c4875c,Foothill Schedule,Foothill Transit,2024-05-22
90,f74424acf8c41e4c1e9fd42838c4875c,Foothill Schedule,City of Duarte,2024-06-12
91,f74424acf8c41e4c1e9fd42838c4875c,Foothill Schedule,Foothill Transit,2024-06-12


In [6]:
with open("../_shared_utils/shared_utils/portfolio_organization_name.yml", "r") as f:
    PORTFOLIO_ORGANIZATIONS_DICT = yaml.safe_load(f)

In [7]:
df2 = df.pipe(
    portfolio_utils.standardize_portfolio_organization_names,
    PORTFOLIO_ORGANIZATIONS_DICT,
)

* This step is not done. Don't move onto the deploying this. 
* Two rows for every service date. How do we deal with this?
* This will cause Foothll Transit twice because City of Duarte and Foothill Transit. 
* Add all the columns I'm creating in the same place.
* Write a function that aggregates this to portfolio_organization_name and service_date to deal with this duplicate
* Use City and County of SF and Foothill Transit. 
* Probably need to drop some rows, combine some rows. 
* Don't create all new columns in the chart making step. 
* Create data after we get to the `portfolio_organization_name`
* This is the only grain I'll be visualizing. 
* I should only have 4 datasets and they should all be by `portfolio_organization_name`
* When I'm building quarterly stuff, I am adding even more rows. 
* Charts should only use 4 datasets at the new grain using the `portfolio_organization_name`
* `organization_name` needs to be dropped eventually. 
* I might still have `name` but not sure. 
* We don't need `schedule_gtfs_Dataset_key` and we don't need `name`.
* Look at Evan's list, but look at my other list with LA Metro which includes LA Bus/Rail.
    * Check everything.
* City of Duarte and Foothill have the same info so we can delete
* We only need schd key, name, service_date, portfolio_org_name & de duplicate 

In [8]:
df2.loc[df2.name.str.contains("Foothill")]

Unnamed: 0,schedule_gtfs_dataset_key,name,organization_name,service_date,portfolio_organization_name
99,f74424acf8c41e4c1e9fd42838c4875c,Foothill Schedule,City of Duarte,2024-02-14,Foothill Transit
100,f74424acf8c41e4c1e9fd42838c4875c,Foothill Schedule,Foothill Transit,2024-02-14,Foothill Transit
97,f74424acf8c41e4c1e9fd42838c4875c,Foothill Schedule,City of Duarte,2024-03-13,Foothill Transit
98,f74424acf8c41e4c1e9fd42838c4875c,Foothill Schedule,Foothill Transit,2024-03-13,Foothill Transit
100,f74424acf8c41e4c1e9fd42838c4875c,Foothill Schedule,City of Duarte,2024-04-17,Foothill Transit
101,f74424acf8c41e4c1e9fd42838c4875c,Foothill Schedule,Foothill Transit,2024-04-17,Foothill Transit
97,f74424acf8c41e4c1e9fd42838c4875c,Foothill Schedule,City of Duarte,2024-05-22,Foothill Transit
98,f74424acf8c41e4c1e9fd42838c4875c,Foothill Schedule,Foothill Transit,2024-05-22,Foothill Transit
90,f74424acf8c41e4c1e9fd42838c4875c,Foothill Schedule,City of Duarte,2024-06-12,Foothill Transit
91,f74424acf8c41e4c1e9fd42838c4875c,Foothill Schedule,Foothill Transit,2024-06-12,Foothill Transit


In [24]:
df2.loc[df2.name.str.contains("Foothill")].columns

Index(['schedule_gtfs_dataset_key', 'name', 'organization_name',
       'service_date', 'portfolio_organization_name'],
      dtype='object')

* De duplicating on `key`, `name`, and `portfolio_organization_name`
* I can get it to one row but then I would have to check if `route_name-direction` are unique to the portfolio_organization_name
* What if Basin Transit GMV and Morongo Basin use both Route ID 1? Don't mix them up if they are using the same route ID name.
* These are different feeds so there is no gurantee that they have to pick uniqe route IDS
* If route IDS aren't unique, then we have to leave it as 2 rows 
* Just leave it as two rows since this will fix both the top and bottom examples, but this means I will have to unpack any route IDs that are repated across different `schedule_gtfs_dataset_keys`

In [9]:
df2.loc[df2.organization_name.str.contains("Basin")].sort_values(
    by=["portfolio_organization_name", "service_date"]
)

Unnamed: 0,schedule_gtfs_dataset_key,name,organization_name,service_date,portfolio_organization_name
152,b0760015c9fcd0500c4fddd5b9bb115b,Morongo Basin Schedule,Basin Transit,2024-01-17,Basin Transit
158,b0760015c9fcd0500c4fddd5b9bb115b,Morongo Basin Schedule,Basin Transit,2024-02-14,Basin Transit
157,b0760015c9fcd0500c4fddd5b9bb115b,Morongo Basin Schedule,Basin Transit,2024-03-13,Basin Transit
160,b0760015c9fcd0500c4fddd5b9bb115b,Morongo Basin Schedule,Basin Transit,2024-04-17,Basin Transit
154,b0760015c9fcd0500c4fddd5b9bb115b,Morongo Basin Schedule,Basin Transit,2024-05-22,Basin Transit
145,b0760015c9fcd0500c4fddd5b9bb115b,Morongo Basin Schedule,Basin Transit,2024-06-12,Basin Transit
160,b0760015c9fcd0500c4fddd5b9bb115b,Morongo Basin Schedule,Basin Transit,2024-07-17,Basin Transit
174,b0760015c9fcd0500c4fddd5b9bb115b,Morongo Basin Schedule,Basin Transit,2024-08-14,Basin Transit
178,b0760015c9fcd0500c4fddd5b9bb115b,Morongo Basin Schedule,Basin Transit,2024-09-18,Basin Transit
179,b0760015c9fcd0500c4fddd5b9bb115b,Morongo Basin Schedule,Basin Transit,2024-10-16,Basin Transit


## (2) time-series operator route-direction data becomes portfolio_organization route-direction data

In [12]:
DIGEST_RT_SCHED = GTFS_DATA_DICT.digest_tables.route_schedule_vp

In [13]:
schd_vp_df = pd.read_parquet(f"{RT_SCHED_GCS}{DIGEST_RT_SCHED}.parquet")

In [33]:
schd_vp_df.columns

Index(['schedule_gtfs_dataset_key', 'route_id', 'direction_id', 'time_period',
       'avg_scheduled_service_minutes', 'avg_stop_miles', 'n_scheduled_trips',
       'frequency', 'service_date', 'is_express', 'is_ferry', 'is_rail',
       'is_coverage', 'is_local', 'is_downtown_local', 'is_rapid', 'typology',
       'name', 'combined_name', 'recent_combined_name', 'recent_route_id',
       'route_primary_direction', 'minutes_atleast1_vp', 'minutes_atleast2_vp',
       'total_rt_service_minutes', 'total_scheduled_service_minutes',
       'total_vp', 'vp_in_shape', 'is_early', 'is_ontime', 'is_late',
       'n_vp_trips', 'vp_per_minute', 'pct_in_shape',
       'pct_rt_journey_atleast1_vp', 'pct_rt_journey_atleast2_vp',
       'pct_sched_journey_atleast1_vp', 'pct_sched_journey_atleast2_vp',
       'rt_sched_journey_ratio', 'avg_rt_service_minutes', 'sched_rt_category',
       'speed_mph', 'schedule_source_record_id', 'base64_url',
       'organization_source_record_id', 'organization_name

In [22]:
schd_vp_df2 = schd_vp_df.drop_duplicates(
    subset=[
        "schedule_gtfs_dataset_key",
        "name",
        "service_date",
        "route_id",
        "direction_id",
    ]
)

In [23]:
len(schd_vp_df2), len(schd_vp_df)

(99735, 708563)

In [27]:
schd_vp_df2.name = schd_vp_df2.name.fillna("None")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  schd_vp_df2.name = schd_vp_df2.name.fillna("None")


In [31]:
schd_vp_df2.organization_name = schd_vp_df2.organization_name.fillna("None")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  schd_vp_df2.organization_name = schd_vp_df2.organization_name.fillna("None")


In [29]:
schd_vp_df2.loc[schd_vp_df2.name.str.contains("Foothill")][
    [
        "schedule_gtfs_dataset_key",
        "name",
        "organization_name",
        "service_date",
        "portfolio_organization_name",
    ]
].drop_duplicates().sort_values(by=["service_date", "organization_name"])

Unnamed: 0,schedule_gtfs_dataset_key,name,organization_name,service_date,portfolio_organization_name
673154,f74424acf8c41e4c1e9fd42838c4875c,Foothill Schedule,City of Duarte,2023-03-15,Foothill Transit
673158,f74424acf8c41e4c1e9fd42838c4875c,Foothill Schedule,City of Duarte,2023-04-12,Foothill Transit
673162,f74424acf8c41e4c1e9fd42838c4875c,Foothill Schedule,City of Duarte,2023-05-17,Foothill Transit
673166,f74424acf8c41e4c1e9fd42838c4875c,Foothill Schedule,City of Duarte,2023-06-14,Foothill Transit
673168,f74424acf8c41e4c1e9fd42838c4875c,Foothill Schedule,City of Duarte,2023-07-12,Foothill Transit
673170,f74424acf8c41e4c1e9fd42838c4875c,Foothill Schedule,City of Duarte,2023-08-15,Foothill Transit
673174,f74424acf8c41e4c1e9fd42838c4875c,Foothill Schedule,Foothill Transit,2023-09-13,Foothill Transit
673178,f74424acf8c41e4c1e9fd42838c4875c,Foothill Schedule,Foothill Transit,2023-10-11,Foothill Transit
673182,f74424acf8c41e4c1e9fd42838c4875c,Foothill Schedule,City of Duarte,2023-11-15,Foothill Transit
673186,f74424acf8c41e4c1e9fd42838c4875c,Foothill Schedule,City of Duarte,2023-12-13,Foothill Transit


In [32]:
schd_vp_df2.loc[schd_vp_df2.organization_name.str.contains("Basin Transit")][
    [
        "schedule_gtfs_dataset_key",
        "name",
        "organization_name",
        "service_date",
        "portfolio_organization_name",
    ]
].drop_duplicates().sort_values(by=["service_date", "organization_name"])

Unnamed: 0,schedule_gtfs_dataset_key,name,organization_name,service_date,portfolio_organization_name
491279,b0760015c9fcd0500c4fddd5b9bb115b,Morongo Basin Schedule,Basin Transit,2023-11-15,Basin Transit
491280,b0760015c9fcd0500c4fddd5b9bb115b,Morongo Basin Schedule,Basin Transit,2023-12-13,Basin Transit
491281,b0760015c9fcd0500c4fddd5b9bb115b,Morongo Basin Schedule,Basin Transit,2024-01-17,Basin Transit
491282,b0760015c9fcd0500c4fddd5b9bb115b,Morongo Basin Schedule,Basin Transit,2024-02-14,Basin Transit
491283,b0760015c9fcd0500c4fddd5b9bb115b,Morongo Basin Schedule,Basin Transit,2024-03-13,Basin Transit
491284,b0760015c9fcd0500c4fddd5b9bb115b,Morongo Basin Schedule,Basin Transit,2024-04-17,Basin Transit
491285,b0760015c9fcd0500c4fddd5b9bb115b,Morongo Basin Schedule,Basin Transit,2024-05-22,Basin Transit
491286,b0760015c9fcd0500c4fddd5b9bb115b,Morongo Basin Schedule,Basin Transit,2024-06-12,Basin Transit
491287,b0760015c9fcd0500c4fddd5b9bb115b,Morongo Basin Schedule,Basin Transit,2024-07-17,Basin Transit
491288,b0760015c9fcd0500c4fddd5b9bb115b,Morongo Basin Schedule,Basin Transit,2024-08-14,Basin Transit


In [36]:
def portfolio_organization_name_grain(df: pd.DataFrame) -> pd.DataFrame:
    """
    Input df after merge_data_sources_by_route_direction().
    Change grain from "schedule_gtfs_dataset_key", "name", "organization_name" to
    "portfolio_organization_name". Do some light cleaning.
    """
    # Drop duplicates for these following rows
    df2 = df.drop_duplicates(
        subset=[
            "schedule_gtfs_dataset_key",
            "name",
            "service_date",
            "route_id",
            "direction_id",
        ]
    )
    # Drop name, organization_name, and schd_key because we are now using the
    # portfolio_organization_name grain
    df2 = df.drop(columns=["schedule_gtfs_dataset_key", "name", "organization_name"])

    # Do some light cleaning like rounding and multiplying % cols
    df2 = clean_schedule_vp_df(df2)

    # Create the new column here?
    df2["headway_in_minutes"] = 60 / df2.frequency
    return df2

## (3) data wrangling for visualizations - create columns, replace values, rename columns
* What's the difference between `recent_route_id` vs. `route_id` and `recent_combined_name` vs. `combined_name`

In [34]:
# Lifted from my `report_operator_section2_utils`
def clean_schedule_vp_df(df: pd.DataFrame) -> pd.DataFrame:
    # Delete duplicates
    df = df.drop_duplicates().reset_index(drop=True)

    # Round float columns
    float_columns = df.select_dtypes(include=["float"])
    for i in float_columns:
        df[i] = df[i].round(2)

    # Multiply percent columns to 100%
    pct_cols = df.columns[df.columns.str.contains("pct")].tolist()
    for i in pct_cols:
        df[i] = df[i] * 100

    # Replace column names
    df = _report_utils.replace_column_names(df)
    return df

## (4) route dropdown to visualize time-series trends
Route dropdown chart (TODO)

    TODO: Below is pseudo code to get you started.
        6-7 chart type functions
        2 chart config functions
        8 functions for each metric listed below created in the route dropdown
        1 wrapper function of the 8 functions

Charts in this section:

    avg scheduled service
    timeliness by direction
    frequency / headway
    avg speed by direction
    vp per minute
    % scheduled journey with vp
    spatial accuracy
    text of other metrics by direction


### Config functions (2)

In [61]:
def set_y_axis(df, y_col):
    """
    Set y_axis automatically depending on the
    column used to generate the y_axis.
    """
    if "%" in y_col:
        max_y = 100
    elif "VP" in y_col:
        max_y = 3
    elif "Minute" in y_col:
        max_y = round(df[y_col].max())
    else:
        max_y = round(df[y_col].max(), -1) + 5
    return max_y

In [46]:
def configure_chart(
    chart: alt.Chart, width: int, height: int, title: str, subtitle: str
) -> alt.Chart:
    """
    Adjust width, height, title, and subtitle
    """
    chart2 = chart.properties(
        width=height,
        height=250,
        title={
            "text": [title],
            "subtitle": [subtitle],
        },
    )
    return chart2

### Chart 1: Ruler Chart

In [48]:
def create_ruler_chart(df: pd.DataFrame, ruler_type: str) -> pd.DataFrame:
    # Add the ruler columns
    if ruler_type == "spatial accuracy ruler":
        df["ruler_100_pct"] = 100
    else:
        df["ruler_for_vp_per_min"] = 2

    ruler = (
        alt.Chart(df)
        .mark_rule(color="red", strokeDash=[10, 7])
        .encode(y=f"mean(ruler_100_pct):Q")
    )
    return ruler

### Chart 2: Pie
* Used only 1x for Total Routes by Typology. 


In [38]:
import altair as alt

In [47]:
def basic_pie_chart(df: pd.DataFrame, color_col: str, theta_col: str) -> alt.Chart:
    chart = (
        alt.Chart(df)
        .mark_arc()
        .encode(
            theta=theta_col,
            color=alt.Color(
                color_col,
                title=_report_utils.labeling(color_col),
                scale=alt.Scale(range=color_dict["full_color_scale"]),
            ),
            tooltip=df.columns.tolist(),
        )
    )

    return chart

In [66]:
def create_route_typology(df):
    chart = basic_pie_chart(df, blah, blah)
    chart = configure_chart(chart, 400, 250, "title", "title")
    return chart

### Chart 3: Bar
* Used for 
* Comparing longest and shortest route.
* Metrics for all routes (plus a ruler)
* Spatial Accuracy (plus a ruler)
* Frequency of Trips 

In [45]:
def create_bar_chart(
    df: pd.DataFrame,
    x_col: str,
    y_col: str,
    color_col: str,
    color_scheme: list,
) -> alt.Chart:
    # Set y-axis
    max_y = set_y_axis(df, y_col)

    chart = (
        alt.Chart(df)
        .mark_bar()
        .encode(
            x=alt.X(
                x_col,
                title=_report_utils.labeling(x_col),
                axis=alt.Axis(labelAngle=-45, format="%b %Y"),
                scale=alt.Scale(domain=[0, max_y]),
            ),
            y=alt.Y(y_col, title=_report_utils.labeling(y_col)),
            color=alt.Color(
                color_col,
                legend=None,
                title=_report_utils.labeling(color_col),
                scale=alt.Scale(
                    range=color_scheme,
                ),
            ),
            tooltip=list(df.columns),
        )
    )

    return chart

In [44]:
def longest_shortest_route_chart(gdf: pd.DataFrame) -> alt.Chart:
    gdf2 = shortest_longest_route(gdf)
    chart = create_bar_chart(
        df=gdf2,
        x_col="Service Miles",
        y_col="Route",
        color_col="Route",
        color_scheme=["TBD"],
    )

    chart = configure_chart(chart, width=400, height=250, "title", "subtitle")
    return chart

In [57]:
def spatial_accuracy_chart(df: pd.DataFrame) -> alt.Chart:
    ruler_chart = create_ruler_chart(df, "spatial accuracy ruler")
    base_chart = create_bar_chart(
        df=df,
        x_col="Quarter",
        y_col="% VP within Scheduled Shape",
        color_col="% VP within Scheduled Shape",
        color_scheme=["TBD"],
    )

    # Combine
    chart = ruler_chart + base_chart
    # Facet
    chart = chart.facet(
        column=alt.Column(
            "Direction:N",
        )
    )
    chart = configure_chart(
        chart, width=400, height=250, title="title", subtitle="subtitle"
    )

    return chart

In [58]:
def vp_per_minute_chart(df: pd.DataFrame) -> alt.Chart:
    ruler_chart = create_ruler_chart(df, "vp_per_minute")
    base_chart = create_bar_chart(
        df=df,
        x_col="Quarter",
        y_col="% VP within Scheduled Shape",
        color_col="% VP within Scheduled Shape",
        color_scheme=["TBD"],
    )

    # Combine
    chart = ruler_chart + base_chart
    # Facet
    chart = chart.facet(
        column=alt.Column(
            "Direction:N",
        )
    )
    chart = configure_chart(
        chart, width=400, height=250, title="title", subtitle="subtitle"
    )

    return chart

In [59]:
def headway_chart(df: pd.DataFrame) -> alt.Chart:
    base_chart = create_bar_chart(
        df=peak_offpeak_df,
        x_col="headway_in_minutes",
        y_col="Date",
        color_col="headway_in_minutes",
        color_scheme=["TBD"],
    )

    # Facet
    base_chart = base_chart.facet(
        column=alt.Column(
            "Period:N",
        )
    )

    base_chart = configure_chart(
        base_chart, width=400, height=250, title="title", subtitle="subtitle"
    )

    return base_chart

In [62]:
def total_scheduled_trips_chart(df: pd.DataFrame) -> alt.Chart:
    base_chart = create_bar_chart(
        df=df,
        x_col="# scheduled trips",
        y_col="Date",
        color_col="Period",
        color_scheme=["TBD"],
    )

    base_chart = configure_chart(
        base_chart, width=400, height=250, title="title", subtitle="subtitle"
    )

    return base_chart

### Chart 4: Line

In [67]:
def create_line_chart(
    df: pd.DataFrame,
    x_col: str,
    y_col: str,
    color_col: str,
    color_scheme: list,
):
    # Set y-axis
    max_y = set_y_axis(df, y_col)

    # Set chart
    chart = (
        alt.Chart(df)
        .mark_line(size=3)
        .encode(
            x=alt.X(
                x_col,
                title=_report_utils.labeling(x_col),
                axis=alt.Axis(labelAngle=-45, format="%b %Y"),
            ),
            y=alt.Y(
                f"{y_col}:Q",
                title=_report_utils.labeling(y_col),
                scale=alt.Scale(domain=[0, max_y]),
            ),
            color=alt.Color(
                f"{color_col}:N",
                title=_report_utils.labeling(color_col),
                scale=alt.Scale(range=color_scheme),
            ),
            tooltip=list(df.columns),
        )
    )
    return chart

In [68]:
def timeliness_chart(df) -> alt.Chart:
    base_chart = create_line_chart(
        df=df,
        x_col=x_col,
        y_col=y_col,
        title="title",
        subtitle="subtitle",
        color_scheme=["color"],
    )

    # Set title/width
    base_chart = configure_chart(
        base_chart, width=200, height=250, title="title", subtitle="subtitle"
    )

    # Facet
    base_chart = base_chart.facet(
        column=alt.Column(
            "Period:N",
        )
    )
    return base_chart

In [69]:
def speed_chart(df) -> alt.Chart:
    base_chart = create_line_chart(
        df=df,
        x_col=x_col,
        y_col=y_col,
        title="title",
        subtitle="subtitle",
        color_scheme=["color"],
    )

    # Set title/width
    base_chart = configure_chart(
        base_chart, width=200, height=250, title="title", subtitle="subtitle"
    )

    # Facet
    base_chart = base_chart.facet(
        column=alt.Column(
            "Period:N",
        )
    )
    return base_chart

### Chart 5: Divider

In [63]:
def divider_chart(df: pd.DataFrame, title: str):
    """
    This chart creates only one line of text.
    I use this to divide charts thematically.
    """
    df = df.head(1)
    # Create a text chart using Altair
    chart = (
        alt.Chart(df)
        .mark_text(
            align="center",
            baseline="middle",
            fontSize=14,
            fontWeight="bold",
            text=title,
        )
        .properties(width=400, height=100)
    )

    return chart

### Chart 6: Text

In [64]:
def reshape_df_text_table(df: pd.DataFrame) -> pd.DataFrame:
    # Reshape dataframe before plotting
    df2 = df.melt(
        id_vars=[
            "Route",
            "Direction",
        ],
        value_vars=[
            "Average Scheduled Service (Trip Minutes)",
            "Average Stop Distance (Miles)",
            "# Scheduled Trips",
            "Peak Avg Speed",
            "Peak Scheduled Trips",
            "Peak Hourly Freq",
            "Offpeak Avg Speed",
            "Offpeak Scheduled Trips",
            "Trips Per Hour",
        ],
    )

    # Create a decoy column so all the text will be centered.
    df2["Zero"] = 0

    # Combine columns so the column title and variable will be aligned.
    # Ex: "Trips Per Hour: 0.56". This column is what will show up on the
    # graphs.
    df2["combo_col"] = df2.variable.astype(str) + ": " + df2.value.astype(str)

    # Clean up
    df2.combo_col = df2.combo_col.str.replace(
        "schedule_and_vp",
        "Schedule and Realtime Data",
    ).str.replace("Gtfs", "GTFS")

    return df2

In [65]:
def text_table(df: pd.DataFrame, direction: int) -> alt.Chart:

    # Filter dataframe for direction
    df = df.loc[df["Dir 0 1"] == direction].drop_duplicates().reset_index(drop=True)

    # Grab variables to use for title/subtitle
    direction = df.Direction.iloc[0]
    most_recent_date = df.Date.iloc[0].strftime("%B %Y")

    # Melt so the dataframe can be plotted
    df2 = reshape_df_text_table(df)

    # Create the chart
    text_chart = (
        alt.Chart(df2)
        .mark_text()
        .encode(x=alt.X("Zero:Q", axis=None), y=alt.Y("combo_col", axis=None))
    )

    text_chart = text_chart.encode(text="combo_col:N")

    # Configure this
    text_chart = configure_chart(
        text_chart,
        width=400,
        height=250,
        title=f"{title}{direction} Vehicles",
        subtitle=f"{subtitle} {most_recent_date}",
    )

    return text_chart

### Chart 7: Grouped Bar Chart


In [70]:
def create_grouped_bar(
    df: pd.DataFrame, x_col: str, y_col: str, offset_col: str, color_col: str
) -> alt.Chart:
    chart = (
        alt.Chart(df)
        .mark_bar(size=5)
        .encode(
            x=alt.X(
                x_col,
                title=_report_utils.labeling(x_col),
                axis=alt.Axis(labelAngle=-45, format="%b %Y"),
            ),
            y=alt.Y(f"{y_col}:Q", title=_report_utils.labeling(y_col)),
            xOffset=alt.X(f"{offset_col}:N", title=_report_utils.labeling(offset_col)),
            color=alt.Color(
                f"{color_col}:N",
                title=_report_utils.labeling(color_col),
                scale=alt.Scale(range=range_color),
            ),
            tooltip=list(df.columns),
        )
    )
    return chart

In [73]:
def avg_scheduled_minutes_chart(df: pd.DataFrame) -> alt.Chart:
    chart = create_grouped_bar(df, x_col, y_col, offset_col, color_col)
    # Configure this
    chart = configure_chart(
        chart,
        width=400,
        height=250,
        title="title",
        subtitle="subtitle",
    )
    return chart

### Chart 8: Circles

In [75]:
def create_circle_chart(
    df: pd.DataFrame,
    x_col: str,
    y_col: str,
    color_col: str,
    color_scheme: list,
) -> alt.Chart:

    chart = (
        alt.Chart(df)
        .mark_circle(size=150)
        .encode(
            x=alt.X(
                x_col,
                title=_report_utils.labeling(x_col),
                axis=alt.Axis(labelAngle=-45),
            ),
            y=alt.Y(
                f"{y_col}:Q",
                title=_report_utils.labeling(y_col),
                scale=alt.Scale(domain=[0, max_y]),
            ),
            color=alt.Color(
                f"{color_col}:N",
                title=_report_utils.labeling(color_col),
                scale=alt.Scale(range=color_scheme),
            ),
            tooltip=list(df.columns),
        )
    )

#### Chart 10 

In [77]:
def schd_trip_with_vps(df: pd.DataFrame) -> alt.Chart:
    chart = create_circle_chart(
        df=df,
        x_col="",
        y_col="",
        color_col="",
        color_scheme=["TBD"],
    )

    chart = chart.facet(
        column=alt.Column(
            "Direction:N",
        )
    )

    chart = configure_chart(
        chart, width=400, height=250, title="title", subtitle="subtitle"
    )
    return chart

## One large function


In [78]:
def route_dropdown(df: pd.DataFrame):
    # Chart 1 and 2 but I need one text chart for each direction
    text_chart = text_table(df, direction)
    # Chart 3 for average scheduled minutes per route, quarterly
    avg_scheduled_minutes_chart = avg_scheduled_minutes_chart(df)
    # Charts 4 and 5, need one chart for each direction
    timeliness_chart = timeliness_chart(df)
    # Charts 6 and 7, need one chart for each direction
    headway_chart = headway_chart(df)
    # Charts 8 and 9, need one chart for each direction
    total_scheduled_trips_chart = total_scheduled_trips_chart(df)
    # Chart 10
    speed_chart = speed_chart(df)
    # Chart 11
    vp_per_minute_chart = vp_per_minute_chart(df)
    # Chart 12
    schd_trip_with_vps = schd_trip_with_vps(df)
    # Chart 13
    spatial_accuracy_chart = spatial_accuracy_chart(df)
    charts_list = ["chart1", "chart2"]
    chart = alt.vconcat(*chart_list)

    # Also need to add some of the divider charts
    return chart