## Scheduled vs. RT total trip time

In [22]:
import datetime

import _rt_scheduled_utils as rt_scheduled_utils
import _threshold_utils as threshold_utils
import altair as alt
import dask.dataframe as dd
import geopandas as gpd
import numpy as np
import pandas as pd
from calitp_data_analysis.sql import to_snakecase
from segment_speed_utils import helpers, sched_rt_utils
from segment_speed_utils.project_vars import RT_SCHED_GCS, SEGMENT_GCS, analysis_date
from shared_utils import calitp_color_palette as cp

In [2]:
# Notebook-wide display settings: show up to 100 columns, every row,
# untruncated cell contents, and floats rounded to two decimals.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
import gcsfs

fs = gcsfs.GCSFileSystem()

In [4]:
import intake

catalog = intake.open_catalog("./catalog.yml")

### Deliverable 1 - Dataframe

#### Original file from `b1_rt_trip_diagnostics`

In [5]:
rt_scheduled_df = rt_scheduled_utils.find_metrics(analysis_date)

In [10]:
# Final df - saved to GCS as well
final = rt_scheduled_utils.final_df(rt_scheduled_df, analysis_date)

In [11]:
final.shape

(98105, 20)

In [12]:
# No more negative trips
len(final.loc[final.actual_trip_duration_minutes < 0])

0

### Deliverable 2 - Charts 

#### Operator

In [13]:
# Exclude "none" operators which are only scheduled data
# for the operator charts
# Single .loc lookup (row mask + column list) instead of chained indexing.
operator_wo_none = final.loc[
    final["_gtfs_dataset_name"] != "None", ["_gtfs_dataset_name"]
]

In [14]:
dropdown_list = operator_wo_none._gtfs_dataset_name.unique().tolist()

In [15]:
initialize_first_op = sorted(dropdown_list)[0]

In [16]:
input_dropdown = alt.binding_select(options=sorted(dropdown_list), name="Operator")

In [17]:
# Single-value selection on the operator name, driven by the dropdown widget.
# It starts on `initialize_first_op`, so the operator charts open on one
# operator rather than on an unfiltered view.
selection = alt.selection_single(
    name="Operator",
    fields=["_gtfs_dataset_name"],
    bind=input_dropdown,
    init={"_gtfs_dataset_name": initialize_first_op},
)

In [18]:
# One row per operator and rounded RT-proportion bucket: the operator's trip
# count (max within the group) and the number of distinct trips in the bucket.
rt_scheduled = (
    final.groupby(["_gtfs_dataset_name", "rounded_rt_data_proportion_percentage"])
    .agg(
        rt_trip_counts_by_operator=("rt_trip_counts_by_operator", "max"),
        total_trips=("trip_id", "nunique"),
    )
    .reset_index()
)

In [19]:
# Share of the operator's trips that fall in each bucket, as a percentage.
rt_scheduled["percentage_of_trips"] = (
    rt_scheduled["total_trips"]
    .div(rt_scheduled["rt_trip_counts_by_operator"])
    .mul(100)
)

In [23]:
# Bar chart of `percentage_of_trips` per rounded RT-proportion bucket.
# The operator dropdown selection is attached and used as a filter, so only
# the rows of the selected operator are drawn.
rt_scheduled_chart = (
    alt.Chart(rt_scheduled)
    .mark_bar(size=40)
    .encode(
        x=alt.X(
            "rounded_rt_data_proportion_percentage:N",
            # Fixed domain keeps every bucket on the axis even when empty.
            scale=alt.Scale(domain=[0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]),
        ),
        y=alt.Y("percentage_of_trips", scale=alt.Scale(domain=[0, 100])),
        color=alt.Color(
            "rounded_rt_data_proportion_percentage",
            scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
            legend=None,
        ),
        tooltip=rt_scheduled.columns.tolist(),
    )
    .properties(title="% of RT minutes vs. Scheduled")
    .add_selection(selection)
    .transform_filter(selection)
)

In [24]:
threshold_utils.chart_size(rt_scheduled_chart, 500, 400)

In [25]:
# One row per operator, trip category and RT category: the operator's trip
# count (max within the group) and the number of distinct trips in the group.
rt_trip_time = (
    final.groupby(["_gtfs_dataset_name", "trip_category", "rt_category"])
    .agg(
        rt_trip_counts_by_operator=("rt_trip_counts_by_operator", "max"),
        total_trips=("trip_id", "nunique"),
    )
    .reset_index()
)

In [26]:
# Store the distinct-trip counts as plain integers.
rt_trip_time["total_trips"] = rt_trip_time["total_trips"].astype(int)

In [27]:
# Share of the operator's trips in each trip/RT category, as a percentage.
rt_trip_time["percentage_of_trips"] = (
    rt_trip_time["total_trips"] / rt_trip_time["rt_trip_counts_by_operator"]
) * 100

In [28]:
# Bar chart of `percentage_of_trips` per trip category, coloured by RT
# category. The operator dropdown selection is attached and used as a filter,
# so only the rows of the selected operator are drawn.
rt_trip_time_chart = (
    (
        alt.Chart(rt_trip_time)
        .mark_bar()
        .encode(
            x=alt.X("trip_category", axis=alt.Axis(labelAngle=-45)),
            y=alt.Y("percentage_of_trips", scale=alt.Scale(domain=[0, 100])),
            color=alt.Color(
                "rt_category", scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS)
            ),
            tooltip=rt_trip_time.columns.tolist(),
        )
        .properties(title="RT Trip Duration and % of RT vs. Scheduled")
    )
    .add_selection(selection)
    .transform_filter(selection)
    .interactive()
)

In [29]:
threshold_utils.chart_size(rt_trip_time_chart, 500, 400)

#### Statewide
* Why does `len(final.loc[final.rounded_rt_data_proportion_percentage == 0])` give a different result than my graph?
* Because the chart data was aggregated with `trip_id: nunique`, whereas `len(...)` counts rows (the equivalent of `trip_id: count`).

In [30]:
# Statewide denominator: number of non-null trip_id rows in `final`.
# This is a row count, not a distinct-trip count; the grouped frames above use
# `nunique`, so their totals need not add up to this figure.
all_trips = final.trip_id.count()

In [31]:
all_trips

98105

In [32]:
# for i in [0,10,20,30,40,50,60,70,80,90,100]:
#    print(len(final.loc[final.rounded_rt_data_proportion_percentage == i]))

In [33]:
# Collapse the per-operator bucket counts into one statewide total per bucket.
total_sw_rt = (
    rt_scheduled.groupby("rounded_rt_data_proportion_percentage")["total_trips"]
    .sum()
    .reset_index()
)

In [34]:
# Bucket total as a percentage of all trips statewide.
total_sw_rt["Percentage of Usable Trips"] = (
    total_sw_rt["total_trips"] / all_trips
) * 100

In [35]:
total_sw_rt

Unnamed: 0,rounded_rt_data_proportion_percentage,total_trips,Percentage of Usable Trips
0,0,25719,26.22
1,10,249,0.25
2,20,389,0.4
3,30,821,0.84
4,40,718,0.73
5,50,590,0.6
6,60,816,0.83
7,70,1615,1.65
8,80,3772,3.84
9,90,6978,7.11


In [36]:
# Statewide bar chart: percentage of all trips falling in each rounded
# RT-proportion bucket, passed through the shared chart_size helper
# with 500 and 400 as its size arguments.
threshold_utils.chart_size(
    alt.Chart(total_sw_rt)
    .mark_bar(size=30)
    .encode(
        x=alt.X(
            "rounded_rt_data_proportion_percentage:N",
            # Fixed domain keeps every bucket on the axis even when empty.
            scale=alt.Scale(domain=[0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]),
        ),
        y=alt.Y(
            "Percentage of Usable Trips",
        ),
        color=alt.Color(
            "rounded_rt_data_proportion_percentage",
            scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
            legend=None,
        ),
        tooltip=total_sw_rt.columns.tolist(),
    ),
    500,
    400,
)

In [37]:
# Collapse the per-operator counts into one statewide total per
# RT category / trip category pair.
sw_rt_trips = (
    rt_trip_time.groupby(["rt_category", "trip_category"])["total_trips"]
    .sum()
    .reset_index()
)

In [38]:
# Category total as a percentage of all trips statewide.
sw_rt_trips["Percentage of Usable Trips"] = (
    sw_rt_trips["total_trips"] / all_trips
) * 100

In [39]:
# Statewide bar chart: percentage of all trips per trip category, coloured by
# RT category, passed through the shared chart_size helper
# with 500 and 400 as its size arguments.
threshold_utils.chart_size(
    alt.Chart(sw_rt_trips)
    .mark_bar()
    .encode(
        x=alt.X("trip_category", axis=alt.Axis(labelAngle=-45)),
        y="Percentage of Usable Trips",
        color=alt.Color(
            "rt_category", scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS)
        ),
        tooltip=sw_rt_trips.columns.tolist(),
    ),
    500,
    400,
).interactive()

In [40]:
def summary_valid_trips_by_cutoff(
    df,
    time_col: str,
    phrase: str,
    time_cutoffs: list,
    ping_cutoffs: list,
    pings: bool = True,
):
    """
    Find percentage & number of trips that meet trip time elapsed
    and pings.

    For every cutoff (or every time-cutoff / ping-cutoff pair when `pings`
    is True) the trips with `df[time_col] >= cutoff` (and
    `df.pings_per_minute >= ping cutoff`) are counted per operator as
    distinct `trip_id` values.

    Parameters
    ----------
    df : DataFrame with the columns `gtfs_dataset_key`,
        `_gtfs_dataset_name`, `rt_trip_counts_by_operator`, `trip_id`,
        `time_col`, and (only when `pings` is True) `pings_per_minute`.
    time_col : column compared against each value of `time_cutoffs`.
    phrase : text appended to the cutoff value in the `cutoff` label.
    time_cutoffs : thresholds applied to `time_col` (inclusive, >=).
    ping_cutoffs : thresholds applied to `pings_per_minute` (inclusive);
        ignored when `pings` is False.
    pings : whether to combine each time cutoff with each ping cutoff.

    Returns
    -------
    DataFrame sorted by `_gtfs_dataset_name`, one row per operator and
    cutoff combination that has at least one qualifying trip, with
    `n_trips`, `trip_cutoff`, `pings` (only when `pings` is True),
    `cutoff` and `percentage_usable_trips`
    (= n_trips / rt_trip_counts_by_operator * 100). When no cutoff
    combination is supplied, an empty frame with those columns is returned.
    """
    group_by_cols = [
        "gtfs_dataset_key",
        "_gtfs_dataset_name",
        "rt_trip_counts_by_operator",
    ]

    def _count_valid_trips(mask):
        # Distinct trips per operator among the rows selected by `mask`.
        return (
            df[mask]
            .groupby(group_by_cols)
            .trip_id.nunique()
            .reset_index()
            .rename(columns={"trip_id": "n_trips"})
        )

    # Collect the per-cutoff frames and concatenate once at the end, instead
    # of re-copying a growing frame on every iteration.
    summaries = []
    for t in time_cutoffs:
        meets_time = df[time_col] >= t
        if pings:
            for s in ping_cutoffs:
                valid = _count_valid_trips(meets_time & (df.pings_per_minute >= s))
                summaries.append(
                    valid.assign(
                        trip_cutoff=t,
                        pings=s,
                        cutoff=f"{t} {phrase} & {s} ping(s)+ per minute",
                    )
                )
        else:
            valid = _count_valid_trips(meets_time)
            summaries.append(valid.assign(trip_cutoff=t, cutoff=f"{t}{phrase}"))

    if not summaries:
        # No cutoffs requested: return an empty, correctly-shaped frame
        # rather than failing on a missing `n_trips` column.
        label_cols = ["trip_cutoff", "pings", "cutoff"] if pings else ["trip_cutoff", "cutoff"]
        return pd.DataFrame(
            columns=group_by_cols + ["n_trips"] + label_cols + ["percentage_usable_trips"]
        )

    final = pd.concat(summaries, axis=0)
    final = final.assign(
        percentage_usable_trips=final.n_trips.divide(final.rt_trip_counts_by_operator)
        * 100
    )
    final = final.sort_values("_gtfs_dataset_name")
    return final

In [41]:
def create_valid_stats(df):
    """
    Build the two per-operator usable-trip summaries used by the charts.

    Returns a tuple of two cleaned frames:
    1. trips meeting 15/30/60-minute thresholds on
       `actual_trip_duration_minutes`, crossed with 1/2/3 pings per minute;
    2. trips meeting 20/40/60/80/100 thresholds on `rt_data_proportion`,
       crossed with the same ping thresholds.
    Both are passed through `threshold_utils.pre_clean` before returning.
    """
    ping_thresholds = [1, 2, 3]

    # Summaries by cutoff
    duration_summary = summary_valid_trips_by_cutoff(
        df=df,
        time_col="actual_trip_duration_minutes",
        phrase="+ min",
        time_cutoffs=[15, 30, 60],
        ping_cutoffs=ping_thresholds,
    )
    rt_share_summary = summary_valid_trips_by_cutoff(
        df=df,
        time_col="rt_data_proportion",
        phrase="%+",
        time_cutoffs=[20, 40, 60, 80, 100],
        ping_cutoffs=ping_thresholds,
        pings=True,
    )

    # Tidy both frames for display
    duration_clean = threshold_utils.pre_clean(duration_summary)
    rt_share_clean = threshold_utils.pre_clean(rt_share_summary)

    return duration_clean, rt_share_clean

In [42]:
def operator_visuals(df):
    """
    Build the per-operator charts, stacked vertically: usable trips by
    trip time & pings on top, usable trips by RT vs. scheduled share below.

    Both charts are filtered by one shared "Operator" dropdown whose first
    entry ("All") applies no filter.
    """
    duration_stats, rt_share_stats = create_valid_stats(df)

    # Dropdown: operator names in sorted order, preceded by an "All" entry
    name_col = "Gtfs Dataset Name"
    operator_names = duration_stats[name_col].sort_values().unique().tolist()
    operator_menu = alt.binding_select(
        options=[None, *operator_names],
        labels=["All", *operator_names],
        name="Operator",
    )
    operator_pick = alt.selection_single(fields=[name_col], bind=operator_menu)

    # Tooltip fields for each chart
    duration_tooltip = [
        "Gtfs Dataset Key",
        "Gtfs Dataset Name",
        "N Trips",
        "Cutoff",
        "Percentage Usable Trips",
    ]
    rt_share_tooltip = [
        "Gtfs Dataset Key",
        "Gtfs Dataset Name",
        "Trip Cutoff",
        "Percentage Usable Trips",
        "N Trips",
    ]

    # Base bar charts
    duration_chart = threshold_utils.bar_chart(
        duration_stats,
        "Percentage Usable Trips",
        "Cutoff",
        duration_tooltip,
        "Pings and RT Trip Time Elapsed",
    )
    rt_share_chart = threshold_utils.bar_chart(
        rt_share_stats,
        "Percentage Usable Trips",
        "Cutoff",
        rt_share_tooltip,
        "Real Time vs. Scheduled Data Percentage",
    )

    def _finish(chart):
        # Attach the dropdown filter, enable pan/zoom, then apply sizing.
        filtered = (
            chart.add_selection(operator_pick)
            .transform_filter(operator_pick)
            .interactive()
        )
        return threshold_utils.chart_size(filtered, 500, 400)

    duration_chart = _finish(duration_chart)
    rt_share_chart = _finish(rt_share_chart)

    return duration_chart & rt_share_chart

In [43]:
# operator_visuals(final)

* Double check with Unitrans

In [44]:
# len(final[(final["_gtfs_dataset_name"] == "Unitrans Vehicle Positions") & (final["pings_per_minute"] > 1) & (final["actual_trip_duration_minutes"] > 15)])

In [45]:
# len(final.loc[final._gtfs_dataset_name == "Unitrans Vehicle Positions"])

In [46]:
# final.loc[final._gtfs_dataset_name == "Unitrans Vehicle Positions"][['actual_trip_duration_minutes', 'rt_data_proportion','pings_per_minute']].sort_values('actual_trip_duration_minutes')

In [47]:
# final[(final["_gtfs_dataset_name"] == "Unitrans Vehicle Positions") & (final["pings_per_minute"] > 1) & (final["actual_trip_duration_minutes"] > 15)]

* Check with Redding

In [48]:
# final[(final["_gtfs_dataset_name"] == "Redding Vehicle Positions")].shape

In [49]:
# final[(final["_gtfs_dataset_name"] == "Redding Vehicle Positions") & (final["rt_data_proportion"] < 20)]

In [50]:
# len(final[(final["_gtfs_dataset_name"] == "Redding Vehicle Positions") & (final["rt_data_proportion"] > 20)])

In [51]:
def statewide_visuals(df):
    """
    Return the statewide usable-trip charts, stacked vertically: trip time &
    pings on top, RT vs. scheduled share below.

    For each cutoff label the numerator is the number of qualifying trips
    summed over all operators; the denominator is the sum of
    `rt_trip_counts_by_operator` over every distinct operator in `df`.
    (Previously both figures were the per-cutoff `max`, i.e. values of single
    operators, which is not a statewide share.)
    """
    # Prepare valid stats
    pings_trip_duration_ops, rt_data_available = create_valid_stats(df)

    # Statewide denominator, taken from the input frame so that operators with
    # no qualifying trip at a given cutoff still count towards the total.
    # NOTE(review): this includes every operator present in `df`; confirm
    # whether schedule-only operators should be excluded first.
    all_trips = (
        df[["gtfs_dataset_key", "_gtfs_dataset_name", "rt_trip_counts_by_operator"]]
        .drop_duplicates()
        .rt_trip_counts_by_operator.sum()
    )

    # Summarize to statewide level
    def statewide(stats):
        # Per-operator rows -> one row per cutoff with statewide totals.
        summary = stats.groupby(["Cutoff"]).agg({"N Trips": "sum"}).reset_index()
        # Keep the original column order: Cutoff, All Trips, N Trips.
        summary.insert(1, "All Trips", all_trips)

        summary["Percentage of Usable Trips"] = (
            summary["N Trips"] / summary["All Trips"] * 100
        )
        return summary

    # Aggregate valid stats to statewide level
    statewide_pings = statewide(pings_trip_duration_ops)
    statewide_rt = statewide(rt_data_available)

    # Charts
    pings_chart = threshold_utils.bar_chart(
        statewide_pings,
        "Percentage of Usable Trips",
        "Cutoff",
        statewide_pings.columns.tolist(),
        "Statewide Pings and RT Trip Time Elapsed",
    )
    rt_chart = threshold_utils.bar_chart(
        statewide_rt,
        "Percentage of Usable Trips",
        "Cutoff",
        statewide_rt.columns.tolist(),
        "Statewide RT vs. Scheduled Data",
    )

    # Clean Charts
    pings_chart = threshold_utils.chart_size(pings_chart.interactive(), 500, 400)
    rt_chart = threshold_utils.chart_size(rt_chart.interactive(), 500, 400)

    return pings_chart & rt_chart

In [52]:
# statewide_visuals(final)