## V2 

In [1]:
import geopandas as gpd
import numpy as np
import pandas as pd
from calitp.sql import to_snakecase



In [2]:
import altair as alt
import threshold_utils
from shared_utils import calitp_color_palette as cp
from shared_utils import geography_utils, rt_utils, styleguide, utils

In [3]:
import intake
# Open the intake catalog whose entries (trips, route_lines, longest_shape,
# crosswalk, trip_stats) are read by the functions in this notebook.
catalog = intake.open_catalog("./catalog_threshold.yml")

In [4]:
# Notebook display settings: show up to 100 columns, two decimal places for
# floats, and no truncation of rows or of column contents.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

### Tests
* Keep name? 

In [5]:
# Keep name?
def clean_trips():
    """
    Read the trips table from the catalog and return its unique
    feed / name / route / direction / shape combinations.

    Returns
    -------
    DataFrame with columns `feed_key`, `name`, `route_id`,
    `direction_id`, `shape_id`, de-duplicated and re-indexed from 0.
    """
    keep_cols = ["feed_key", "name", "route_id", "direction_id", "shape_id"]

    trips = catalog.trips.read()

    return trips[keep_cols].drop_duplicates().reset_index(drop=True)

In [6]:
# trips = clean_trips()

In [7]:
# trips.sample()

In [8]:
def clean_routelines():
    """
    Read the route lines from the catalog, drop the `shape_array_key`
    column and any duplicate rows, and add `actual_route_length`,
    the length of each row's geometry.
    """
    routelines = (
        catalog.route_lines.read()
        .drop(columns=["shape_array_key"])
        .drop_duplicates()
        .reset_index(drop=True)
    )

    # NOTE(review): the length is presumably in the units of the geometry's
    # CRS -- confirm the data is in a projected CRS.
    return routelines.assign(actual_route_length=routelines.geometry.length)

In [9]:
# routelines = clean_routelines()

In [10]:
# routelines.drop(columns = ['geometry']).sample(5)

In [11]:
def clean_longest_shape():
    """
    Read the longest-shape table from the catalog and rename its
    `route_length` column to `longest_route_length`.
    """
    return catalog.longest_shape.read().rename(
        columns={"route_length": "longest_route_length"}
    )

In [12]:
# longest_shape = clean_longest_shape()

In [13]:
# Drop name?
# crosswalk = catalog.crosswalk.read()

In [14]:
# crosswalk.sample()

In [15]:
# Get outer merge counts including name
# pd.merge(trips, crosswalk, how="outer",
#         on = ["name","feed_key", "route_id", "direction_id"],
#         indicator=True)[["_merge"]].value_counts()

In [16]:
# Get outer merge counts excluding name
# pd.merge(trips, crosswalk, how="outer",
#         on = ["feed_key", "route_id", "direction_id"],
#         indicator=True)[["_merge"]].value_counts()

In [17]:
# m1 = trips.merge(crosswalk, how="inner", on=["feed_key", "route_id", "name", "direction_id"])

In [18]:
# routelines.sample()

In [19]:
# m2 = m1.merge(routelines, how="inner", on=["feed_key", "shape_id"])

In [20]:
"""
m3 = m2.merge(longest_shape.drop(columns = ["geometry"]),
              how="inner", on=["feed_key", "gtfs_dataset_key", "direction_id",
                               "route_id","route_dir_identifier", "name"])"""

'\nm3 = m2.merge(longest_shape.drop(columns = ["geometry"]),\n              how="inner", on=["feed_key", "gtfs_dataset_key", "direction_id",\n                               "route_id","route_dir_identifier", "name"])'

In [21]:
"""m3["route_length_percentage"] = (
        (m3.actual_route_length / m3.longest_route_length) * 100
    ).astype(int)"""

'm3["route_length_percentage"] = (\n        (m3.actual_route_length / m3.longest_route_length) * 100\n    ).astype(int)'

In [22]:
# m3.sample(3).drop(columns = ['geometry','geometry_arrowized'])

In [23]:
# m3.gtfs_dataset_key.nunique(), m3.name.nunique()

In [24]:
""" m4 = (
        m3.groupby(
            [
                "route_id",
                "name",
                "gtfs_dataset_key",
                "route_dir_identifier",
                "shape_id",
                "longest_shape_id",
                "route_length_percentage",
            ]
        )
        .agg({"segment_sequence": "count"})
        .rename(columns={"segment_sequence": "total_segments"})
        .reset_index()
    )"""

' m4 = (\n        m3.groupby(\n            [\n                "route_id",\n                "name",\n                "gtfs_dataset_key",\n                "route_dir_identifier",\n                "shape_id",\n                "longest_shape_id",\n                "route_length_percentage",\n            ]\n        )\n        .agg({"segment_sequence": "count"})\n        .rename(columns={"segment_sequence": "total_segments"})\n        .reset_index()\n    )'

### Function
* Best way is using `gtfs_dataset_key` or `name`?

In [25]:
def merge_trips_routes_longest_shape():
    """
    Inner-join trips, the crosswalk, route lines, and the longest-shape
    table, then summarize each shape against its route's longest shape.

    Returns
    -------
    DataFrame with one row per combination of `route_id`, `name`,
    `gtfs_dataset_key`, `route_dir_identifier`, `shape_id`,
    `longest_shape_id`, and `route_length_percentage`, plus
    `total_segments`: the count of non-null `segment_sequence`
    values in that group.
    """
    trips = clean_trips()
    crosswalk = catalog.crosswalk.read()
    routelines = clean_routelines()
    longest_shape = clean_longest_shape()

    crosswalk_keys = ["feed_key", "route_id", "name", "direction_id"]
    routeline_keys = ["feed_key", "shape_id"]
    longest_shape_keys = [
        "feed_key",
        "gtfs_dataset_key",
        "direction_id",
        "route_id",
        "route_dir_identifier",
        "name",
    ]
    summary_keys = [
        "route_id",
        "name",
        "gtfs_dataset_key",
        "route_dir_identifier",
        "shape_id",
        "longest_shape_id",
        "route_length_percentage",
    ]

    with_crosswalk = trips.merge(crosswalk, how="inner", on=crosswalk_keys)
    with_routelines = with_crosswalk.merge(routelines, how="inner", on=routeline_keys)
    merged = with_routelines.merge(
        longest_shape.drop(columns=["geometry"]),
        how="inner",
        on=longest_shape_keys,
    )

    # Shape length as a percentage of the longest shape's length;
    # astype(int) truncates the fractional part.
    length_ratio = merged["actual_route_length"] / merged["longest_route_length"]
    merged["route_length_percentage"] = (length_ratio * 100).astype(int)

    # Count the segment rows that fall in each group.
    segment_counts = merged.groupby(summary_keys).agg({"segment_sequence": "count"})

    return segment_counts.rename(
        columns={"segment_sequence": "total_segments"}
    ).reset_index()

In [26]:
def calculate_longest_shape(operator: str):
    """
    For a single operator, return the maximum `route_length_percentage`
    (shape length relative to the route's longest shape) for every
    `name` / `route_id` / `shape_id` combination.

    Parameters
    ----------
    operator : value matched exactly against the `name` column.
    """
    merged = merge_trips_routes_longest_shape()

    operator_rows = merged.loc[merged.name == operator].reset_index(drop=True)

    by_shape = operator_rows.groupby(["name", "route_id", "shape_id"])

    return by_shape.agg({"route_length_percentage": "max"}).reset_index()

In [27]:
def summary_stats_route_length():
    """
    Summarize `route_length_percentage` per operator name.

    Returns a long-format DataFrame with columns `Name`, `variable`
    (one of Mean, Median, Min, Max), and `value`.
    """
    stat_names = ["mean", "median", "min", "max"]

    # One row per name/route/shape, keeping the largest percentage.
    per_shape = (
        merge_trips_routes_longest_shape()
        .groupby(["name", "route_id", "shape_id"])
        .agg({"route_length_percentage": "max"})
        .reset_index()
    )

    per_operator = (
        per_shape.groupby("name")
        .agg({"route_length_percentage": stat_names})
        .reset_index()
    )

    # Flatten the two-level columns; the former `name` column is left with
    # an empty label, which is then renamed.
    per_operator.columns = per_operator.columns.droplevel()
    per_operator = per_operator.rename(columns={"": "Name"})

    long_stats = pd.melt(per_operator, id_vars=["Name"], value_vars=stat_names)
    long_stats["variable"] = long_stats["variable"].str.title()

    return long_stats

In [28]:
# Long-format mean/median/min/max of route_length_percentage per operator name.
route_length = summary_stats_route_length()

In [29]:
# route_length = pd.melt(route_length, id_vars=['Name'], value_vars=['mean', 'median','min','max'])

### Operator

* just 1 boxplot or dot plot for the operator for scheduled shapes % route length
* i've also pared back the summary stats to accompany each chart (just the crucial stuff)
* Only allow the Altair selection on operator, not on time or segment: I'm not going to remember the values when moving from selection to selection, so I'd rather see everything at once.
    * For usable trips remove time/segment. 
    * Thought the workflow would be running per operator as opposed to running all operators and selecting from the dropdown menu?
    * If the dropdown menu should be operator -> have to combine all the dataframes together.
* If you find it too crowded, I suggest paring down what's included in `SEGMENT_CUTOFFS`. We're likely never going to go up to 0.9, so you can probably scale it back.
* Continue with `alt.vconcat`, but I want the single boxplot or dotplot to be aligned the same way as the bar chart.
  The table can be printed below (outside of the charts) if that's easier.
    * Wants charts to run in the same direction.
    * Rotate dotplot instead of increasing up and down. Increasing should be left to right. 
    

* Box plot looks super bare.

#### Shape ID vs. Longest Shape ID Lengths 
* Play with jitter, opacity, size. 

In [30]:
def length_comparison_dot(df):
    """
    Jittered dot plot of `Value` colored by `Variable`, faceted into one
    column per `Name`, with a dropdown bound to `Name` that filters the
    chart.

    Parameters
    ----------
    df : frame passed through `threshold_utils.clean_up_columns`; the
        result must contain `Name`, `Variable`, and `Value`.
        NOTE(review): presumably clean_up_columns title-cases the column
        names -- verify in threshold_utils.
    """
    df = threshold_utils.clean_up_columns(df)

    # Dropdown bound to the sorted, unique operator names.
    operator_names = df["Name"].sort_values().unique().tolist()
    operator_dropdown = alt.binding_select(options=operator_names, name="Name")
    operator_selection = alt.selection_single(fields=["Name"], bind=operator_dropdown)

    jitter_axis = alt.X(
        "jitter:Q",
        title=None,
        axis=alt.Axis(values=[0], ticks=False, grid=False, labels=False),
        scale=alt.Scale(),
    )
    variable_color = alt.Color(
        "Variable:N",
        scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
        legend=None,
    )
    operator_column = alt.Column(
        "Name:N",
        header=alt.Header(
            labelAngle=90,
            titleOrient="top",
            labelOrient="bottom",
            labelAlign="right",
            labelPadding=2,
        ),
    )

    chart = (
        alt.Chart(df, width=5)
        .mark_circle(size=200)
        .encode(
            x=jitter_axis,
            y=alt.Y("Value:Q"),
            color=variable_color,
            tooltip=["Name", "Variable", "Value"],
            column=operator_column,
        )
        .transform_calculate(
            # Generate Gaussian jitter with a Box-Muller transform
            jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
        )
        .configure_facet(spacing=0)
        .configure_view(stroke=None)
        .properties(title="Routes by Length for Operator")
        .add_selection(operator_selection)
        .transform_filter(operator_selection)
    )
    return chart

In [31]:
# Render the dot plot of the per-operator route length summary statistics.
length_comparison_dot(route_length)

In [32]:
# Source data for the boxplots: the merged trips / route lines / longest shape summary.
boxplot = merge_trips_routes_longest_shape()

In [33]:
# Keep the largest route_length_percentage per name / route_id / shape_id.
# This is the same aggregation used inside calculate_longest_shape() and
# summary_stats_route_length().
boxplot = (
    boxplot.groupby(["name", "route_id", "shape_id"])
    .agg({"route_length_percentage": "max"})
    .reset_index()
)

In [152]:
# Horizontal min-max boxplot of route_length_percentage, one row per operator name.
# NOTE(review): the two numbers passed to chart_size are presumably
# width and height -- confirm in threshold_utils.
threshold_utils.chart_size(
    alt.Chart(boxplot)
    .mark_boxplot(extent="min-max")
    .encode(
        x=alt.X("route_length_percentage:Q"),
        y=alt.Y("name:N"),
        color=alt.Color(
            "name",
            scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
            legend=None,
        ),
    )
    .properties(title="Test"),
    1000,
    1000,
)

In [68]:
# Same min-max boxplot, restricted to the rows whose name is "Redding Schedule".
threshold_utils.chart_size(
    alt.Chart(boxplot.loc[boxplot.name == "Redding Schedule"])
    .mark_boxplot(extent="min-max")
    .encode(
        x=alt.X("route_length_percentage:Q"),
        y=alt.Y("name:N"),
        color=alt.Color(
            "name",
            scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
            legend=None,
        ),
    )
    .properties(title="Test"),
    500,
    300,
)

#### Trip Time and Segment

In [35]:
def merge_trip_diagnostics_with_total_segments():
    """
    Attach the number of distinct segments per route direction to the
    trip stats table and derive per-trip diagnostics.

    Added columns
    -------------
    total_segments : distinct `segment_sequence` values per
        `gtfs_dataset_key` / `name` / `route_dir_identifier`.
    pct_vp_segments : `num_segments_with_vp` divided by `total_segments`.
    trip_time : `trip_end - trip_start` in whole minutes
        (seconds / 60, truncated by the int cast).
    total_trips : distinct `trip_id` values per
        `gtfs_dataset_key` / `name`.
    """
    trip_diagnostics = catalog.trip_stats.read()
    segments = catalog.longest_shape.read()

    segment_counts = (
        segments.groupby(["gtfs_dataset_key", "name", "route_dir_identifier"])
        .segment_sequence.nunique()
        .reset_index()
        .rename(columns={"segment_sequence": "total_segments"})
    )

    # validate="m:1" raises if the segment counts are not unique per merge key.
    merged = trip_diagnostics.merge(
        segment_counts,
        on=["gtfs_dataset_key", "route_dir_identifier"],
        how="inner",
        validate="m:1",
    )

    elapsed_seconds = (merged.trip_end - merged.trip_start) / np.timedelta64(1, "s")
    trip_minutes = (elapsed_seconds / 60).astype(int)
    trips_per_operator = merged.groupby(
        ["gtfs_dataset_key", "name"]
    ).trip_id.transform("nunique")

    return merged.assign(
        pct_vp_segments=merged.num_segments_with_vp.divide(merged.total_segments),
        trip_time=trip_minutes,
        total_trips=trips_per_operator,
    )

In [36]:
# Trip-level diagnostics (trip_time, pct_vp_segments, total_trips) for every operator.
all_ops = merge_trip_diagnostics_with_total_segments()

In [37]:
def summary_valid_trips_by_cutoff(df, time_cutoffs: list, segment_cutoffs: list):
    """
    Count, per operator, the trips that satisfy every combination of a
    minimum trip time and a minimum share of segments with vehicle positions.

    Parameters
    ----------
    df : DataFrame with columns `gtfs_dataset_key`, `name`, `total_trips`,
        `trip_id`, `trip_time`, and `pct_vp_segments`.
    time_cutoffs : minimum `trip_time` values to test.
    segment_cutoffs : minimum `pct_vp_segments` values to test,
        given as fractions (e.g. 0.3 for 30%).

    Returns
    -------
    DataFrame with one row per operator and cutoff pair that has at least
    one qualifying trip. Columns: `gtfs_dataset_key`, `name`, `total_trips`,
    `n_trips`, `trip_cutoff`, `segment_cutoff`, `cutoff` (label), and
    `pct_usable_trips` (`n_trips / total_trips`). If either cutoff list is
    empty, an empty DataFrame with these columns is returned.
    """
    group_cols = ["gtfs_dataset_key", "name", "total_trips"]

    # Collect the per-cutoff frames and concatenate once at the end,
    # instead of growing a DataFrame inside the loop.
    frames = []
    for t in time_cutoffs:
        for s in segment_cutoffs:
            valid = (
                df[(df.trip_time >= t) & (df.pct_vp_segments >= s)]
                .groupby(group_cols)
                .trip_id.nunique()
                .reset_index()
                .rename(columns={"trip_id": "n_trips"})
            )

            frames.append(
                valid.assign(
                    trip_cutoff=t,
                    segment_cutoff=s,
                    # `:g` keeps float noise out of the label:
                    # 0.3 * 100 is 30.000000000000004, which should read "30".
                    cutoff=f"{t}+ min & {s * 100:g}%+ segments",
                )
            )

    if not frames:
        return pd.DataFrame(
            columns=[
                *group_cols,
                "n_trips",
                "trip_cutoff",
                "segment_cutoff",
                "cutoff",
                "pct_usable_trips",
            ]
        )

    final = pd.concat(frames, axis=0)

    return final.assign(pct_usable_trips=final.n_trips.divide(final.total_trips))

In [69]:
# Minimum trip durations to test (trip_time is computed in minutes).
TIME_CUTOFFS = [5, 10, 15]

# Minimum fraction of segments with vehicle positions to test.
SEGMENT_CUTOFFS = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]

In [70]:
# Usable-trip counts per operator for every time/segment cutoff pair.
valid_stats = summary_valid_trips_by_cutoff(all_ops, TIME_CUTOFFS, SEGMENT_CUTOFFS)

In [71]:
def operator_trip_chart(df):
    """
    Horizontal bar chart of `Pct Usable Trips` for each `Cutoff`,
    filtered by a dropdown bound to `Name`.

    Parameters
    ----------
    df : frame passed through `threshold_utils.clean_up_columns`; the
        result must contain `Name`, `Cutoff`, `Pct Usable Trips`, and
        `N Trips`.

    NOTE(review): a later cell in this notebook defines another function
    with this same name; once that cell runs, this definition is shadowed.
    """
    df = threshold_utils.clean_up_columns(df)

    # Dropdown bound to the sorted, unique operator names.
    operator_names = df["Name"].sort_values().unique().tolist()
    operator_dropdown = alt.binding_select(options=operator_names, name="Name")
    operator_selection = alt.selection_single(fields=["Name"], bind=operator_dropdown)

    bars = (
        alt.Chart(df)
        .mark_bar()
        .encode(
            x=alt.X(
                "Pct Usable Trips:Q",
                sort=alt.SortField("Pct Usable Trips", order="descending"),
            ),
            y=alt.Y(
                "Cutoff:N",
                sort=alt.SortField("Pct Usable Trips", order="descending"),
            ),
            color=alt.Color(
                "Cutoff:N",
                scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
            ),
            tooltip=["Name", "Cutoff", "Pct Usable Trips", "N Trips"],
        )
    )

    filtered = (
        bars.properties(title="Percentage of Usable Trips")
        .add_selection(operator_selection)
        .transform_filter(operator_selection)
    )

    return threshold_utils.chart_size(filtered, 500, 300)

In [72]:
# Render the usable-trips bar chart with its operator dropdown.
operator_trip_chart(valid_stats)

### Statewide
#### Statewide Routes by Cutoff

In [73]:
# Merged trips / route lines / longest shape summary, kept at module level for inspection.
# NOTE(review): statewide_summary() calls merge_trips_routes_longest_shape() itself,
# so this global does not appear to be read by the cells shown here.
trips_routes_shape = merge_trips_routes_longest_shape()

In [74]:
# Trip stats table, kept at module level for inspection.
# NOTE(review): statewide_summary() reads catalog.trip_stats itself,
# so this global does not appear to be read by the cells shown here.
trip_stats = catalog.trip_stats.read()

In [75]:
# trip_stats.columns, trips_routes_shape.columns

In [76]:
def statewide_summary():
    """
    Join the trip stats table to the trips / routes / longest-shape
    summary and derive per-trip diagnostics.

    Added columns
    -------------
    pct_vp_segments : `num_segments_with_vp` divided by `total_segments`.
    trip_time : `trip_end - trip_start` in minutes (not truncated).
    total_routes : number of distinct `trip_id` values per `route_id`.
        NOTE(review): despite its name this counts trips, and it groups
        on `route_id` alone -- confirm that is intended when route ids
        repeat across operators.
    """
    trips_routes_shape = merge_trips_routes_longest_shape()
    trip_stats = catalog.trip_stats.read()

    merged = trip_stats.merge(
        trips_routes_shape.drop(columns=["route_length_percentage"]),
        how="inner",
        on=["gtfs_dataset_key", "route_dir_identifier"],
    )

    elapsed_seconds = (merged.trip_end - merged.trip_start) / np.timedelta64(1, "s")
    trips_per_route = merged.groupby("route_id").trip_id.transform("nunique")

    return merged.assign(
        pct_vp_segments=merged.num_segments_with_vp.divide(merged.total_segments),
        trip_time=elapsed_seconds / 60,
        total_routes=trips_per_route,
    )

In [77]:
# Trip-level frame for all operators; input to the route threshold functions below.
statewide = statewide_summary()

In [153]:
# Preview the first three rows of the statewide frame.
statewide.head(3)

Unnamed: 0,gtfs_dataset_key,_gtfs_dataset_name,trip_id,route_dir_identifier,trip_start,trip_end,num_segments_with_vp,route_id,name,shape_id,longest_shape_id,total_segments,pct_vp_segments,trip_time,total_routes
0,02af2d11f5bd44434c581540e9e857d8,LA DOT VehiclePositions,183-00dqgv6kf,3371121095,2023-01-18 17:02:55+00:00,2023-01-18 17:30:57+00:00,7,4446,LA DOT Schedule,20569,20569,7,1.0,28.03,237
1,02af2d11f5bd44434c581540e9e857d8,LA DOT VehiclePositions,183-04vq6dp71,3371121095,2023-01-18 18:40:40+00:00,2023-01-18 19:10:40+00:00,6,4446,LA DOT Schedule,20569,20569,7,0.86,30.0,237
2,02af2d11f5bd44434c581540e9e857d8,LA DOT VehiclePositions,183-0541i4kgy,3371121095,2023-01-18 16:07:58+00:00,2023-01-18 16:34:01+00:00,7,4446,LA DOT Schedule,20569,20569,7,1.0,26.05,237


In [79]:
def route_thresholds_time_only(df, time_cutoffs: list):
    """
    For each minimum trip time, count the distinct routes that have at
    least one trip meeting it, as a share of all distinct routes in `df`.

    Parameters
    ----------
    df : DataFrame with columns `route_id` and `trip_time`.
    time_cutoffs : minimum `trip_time` values to test.

    Returns
    -------
    DataFrame with one row per cutoff and columns `index` (always
    "route_id"), `Total Routes in Category`, `route_cutoff`, `cutoff`
    (label), `total_routes`, `pct_usable_routes`, and `calitp_itp_id`
    (always "all operators"). If `time_cutoffs` is empty, an empty
    DataFrame with these columns is returned.
    """
    total_unique_routes = df.route_id.nunique()

    # Collect the per-cutoff frames and concatenate once at the end,
    # instead of growing a DataFrame inside the loop.
    frames = []
    for t in time_cutoffs:
        # nunique() on the one-column frame yields a Series indexed by
        # "route_id"; reset_index() turns it into `index` / 0 columns.
        valid = (
            df[df.trip_time >= t][["route_id"]]
            .nunique()
            .reset_index()
            .rename(columns={0: "Total Routes in Category"})
        )

        frames.append(valid.assign(route_cutoff=t, cutoff=f"{t}+ min"))

    if not frames:
        return pd.DataFrame(
            columns=[
                "index",
                "Total Routes in Category",
                "route_cutoff",
                "cutoff",
                "total_routes",
                "pct_usable_routes",
                "calitp_itp_id",
            ]
        )

    all_operators = pd.concat(frames, axis=0)

    return all_operators.assign(
        total_routes=total_unique_routes,
        pct_usable_routes=all_operators["Total Routes in Category"].divide(
            total_unique_routes
        ),
        calitp_itp_id="all operators",
    )

In [101]:
# Statewide share of usable routes for each trip-time cutoff.
statewide_thresholds_time_only = route_thresholds_time_only(statewide, TIME_CUTOFFS)

In [102]:
# Spot-check one row; sample() is called without a seed, so the row shown varies between runs.
statewide_thresholds_time_only.sample()

Unnamed: 0,index,Total Routes in Category,route_cutoff,cutoff,total_routes,pct_usable_routes,calitp_itp_id
0,route_id,1008,15,15+ min,1022,0.99,all operators


In [105]:
# Bar chart of the statewide share of usable routes per trip-time cutoff.
threshold_utils.chart_size(
    alt.Chart(statewide_thresholds_time_only)
    .mark_bar()
    .encode(
        x=alt.X(
            "cutoff:N",
            sort=alt.SortField("pct_usable_routes", order="descending"),
        ),
        y=alt.Y(
            "pct_usable_routes:Q",
            sort=alt.SortField("pct_usable_routes", order="descending"),
        ),
        color=alt.Color(
            "cutoff:N",
            scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
        ),
        tooltip=[
            "total_routes",
            "cutoff",
            "Total Routes in Category",
            "pct_usable_routes",
        ],
    )
    .properties(title="Percentage of Usable Routes for the State"),
    500,
    300,
)

In [80]:
def route_thresholds(df, time_cutoffs: list, segment_cutoffs: list):
    """
    For each pair of minimum trip time and minimum share of segments with
    vehicle positions, count the distinct routes that have at least one
    qualifying trip, as a share of all distinct routes in `df`.

    Parameters
    ----------
    df : DataFrame with columns `route_id`, `trip_time`, and
        `pct_vp_segments`.
    time_cutoffs : minimum `trip_time` values to test.
    segment_cutoffs : minimum `pct_vp_segments` values to test,
        given as fractions (e.g. 0.3 for 30%).

    Returns
    -------
    DataFrame with one row per cutoff pair and columns `index` (always
    "route_id"), `Total Routes in Category`, `time_cutoff`,
    `segment_cutoff`, `route_cutoff` (label), `total_routes`, and
    `pct_usable_routes`. If either cutoff list is empty, an empty
    DataFrame with these columns is returned.
    """
    total_unique_routes = df.route_id.nunique()

    # Collect the per-cutoff frames and concatenate once at the end,
    # instead of growing a DataFrame inside the loop.
    frames = []
    for t in time_cutoffs:
        for s in segment_cutoffs:
            # nunique() on the one-column frame yields a Series indexed by
            # "route_id"; reset_index() turns it into `index` / 0 columns.
            valid = (
                df[(df.trip_time >= t) & (df.pct_vp_segments >= s)][["route_id"]]
                .nunique()
                .reset_index()
                .rename(columns={0: "Total Routes in Category"})
            )

            frames.append(
                valid.assign(
                    time_cutoff=t,
                    segment_cutoff=s,
                    # `:g` keeps float noise out of the label:
                    # 0.3 * 100 is 30.000000000000004, which should read "30".
                    route_cutoff=f"{t}+ min & {s * 100:g}%+ segments",
                    total_routes=total_unique_routes,
                    pct_usable_routes=valid["Total Routes in Category"].divide(
                        total_unique_routes
                    ),
                )
            )

    if not frames:
        return pd.DataFrame(
            columns=[
                "index",
                "Total Routes in Category",
                "time_cutoff",
                "segment_cutoff",
                "route_cutoff",
                "total_routes",
                "pct_usable_routes",
            ]
        )

    return pd.concat(frames, axis=0)

In [81]:
# Statewide share of usable routes for every time/segment cutoff pair.
statewide_thresholds = route_thresholds(statewide, TIME_CUTOFFS, SEGMENT_CUTOFFS)

In [94]:
# Horizontal bar chart of the statewide share of usable routes per time/segment cutoff pair.
(
    alt.Chart(statewide_thresholds)
    .mark_bar()
    .encode(
        x=alt.X(
            "pct_usable_routes:Q",
            sort=alt.SortField("pct_usable_routes", order="descending"),
        ),
        y=alt.Y(
            "route_cutoff:N",
            sort=alt.SortField("pct_usable_routes", order="descending"),
        ),
        color=alt.Color(
            "route_cutoff:N",
            scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
        ),
        tooltip=[
            "total_routes",
            "route_cutoff",
            "Total Routes in Category",
            "pct_usable_routes",
        ],
    )
    .properties(title="Percentage of Usable Routes for the State")
)

#### Route Thresholds by Operators

In [110]:
# Start from an empty frame for the per-operator route threshold results
# (time and segment cutoffs).
operator_df = pd.DataFrame()

In [84]:
# Restrict the statewide trips to two operators to prototype the per-operator loop.
subset = statewide.loc[
    statewide["name"].isin(
        ["Big Blue Bus Schedule", "Bay Area 511 AC Transit Schedule"]
    )
].reset_index(drop=True)

In [122]:
# Per-operator route thresholds: for each operator in `subset`, count the
# routes with at least one trip meeting each time/segment cutoff pair.
operator_frames = []
for o in subset.name.unique().tolist():
    # These values depend only on the operator, so compute them once
    # rather than inside every cutoff iteration.
    operator_trips = subset.loc[subset.name == o].reset_index(drop=True)
    total_unique_routes = operator_trips.route_id.nunique()

    for t in TIME_CUTOFFS:
        for s in SEGMENT_CUTOFFS:
            valid = (
                operator_trips[
                    (operator_trips.trip_time >= t)
                    & (operator_trips.pct_vp_segments >= s)
                ][["route_id"]]
                .nunique()
                .reset_index()
                .rename(columns={0: "Total Routes in Category"})
            )

            valid = valid.assign(
                time_cutoff=t,
                segment_cutoff=s,
                name=o,
                # `:g` keeps float noise out of the label:
                # 0.3 * 100 is 30.000000000000004, which should read "30".
                route_cutoff=f"{t}+ min & {s * 100:g}%+ segments",
                total_routes=total_unique_routes,
                pct_usable_routes=valid["Total Routes in Category"].divide(
                    total_unique_routes
                ),
            )

            operator_frames.append(valid)

# Rebuild the result from scratch so that re-running this cell does not
# append a second copy of every row.
operator_df = (
    pd.concat(operator_frames, axis=0) if operator_frames else pd.DataFrame()
)

In [89]:
# Operators present in the per-operator results. The frame built by the loop
# above is `operator_df`; `empty_df` is not defined anywhere in this notebook.
operator_df.name.sort_values().unique().tolist()

['Bay Area 511 AC Transit Schedule', 'Big Blue Bus Schedule']

In [149]:
# Dropdown menu 1
# The chart reads `operator_df`, the frame built by the per-operator loop;
# `empty_df` is not defined anywhere in this notebook.
dropdown1 = alt.binding_select(
    options=operator_df["name"].sort_values().unique().tolist(), name="name"
)
selection1 = alt.selection_single(fields=["name"], bind=dropdown1)

(
    alt.Chart(operator_df)
    .mark_bar()
    .encode(
        x=alt.X(
            "pct_usable_routes:Q",
            sort=alt.SortField("pct_usable_routes", order="descending"),
        ),
        y=alt.Y(
            "route_cutoff:N",
            sort=alt.SortField("pct_usable_routes", order="descending"),
        ),
        color=alt.Color(
            "route_cutoff:N",
            scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
        ),
        # Same tooltip fields as the statewide chart, plus the operator name.
        tooltip=[
            "name",
            "total_routes",
            "route_cutoff",
            "Total Routes in Category",
            "pct_usable_routes",
        ],
    )
    .properties(title="Percentage of Usable Routes")
)  # .add_selection(selection1).transform_filter(selection1)

In [135]:
# Start from an empty frame for the per-operator route threshold results
# (time cutoffs only).
operator_df2 = pd.DataFrame()

In [142]:
# Per-operator route thresholds using the trip-time cutoffs only: for each
# operator in `statewide`, count the routes with at least one trip that
# meets each time cutoff.
operator_frames_time_only = []
for o in statewide.name.unique().tolist():
    # These values depend only on the operator, so compute them once
    # rather than inside every cutoff iteration.
    operator_trips = statewide.loc[statewide.name == o].reset_index(drop=True)
    total_unique_routes = operator_trips.route_id.nunique()

    for t in TIME_CUTOFFS:
        valid = (
            operator_trips[operator_trips.trip_time >= t][["route_id"]]
            .nunique()
            .reset_index()
            .rename(columns={0: "Total Routes in Category"})
        )

        # No `segment_cutoff` column here: this loop applies no segment
        # filter, and the previous `segment_cutoff=s` only picked up a
        # stale `s` left over from an earlier cell's loop.
        valid = valid.assign(
            time_cutoff=t,
            name=o,
            route_cutoff=f"{t}+ min",
            total_routes=total_unique_routes,
            pct_usable_routes=valid["Total Routes in Category"].divide(
                total_unique_routes
            ),
        )

        operator_frames_time_only.append(valid)

# Rebuild the result from scratch so that re-running this cell does not
# append a second copy of every row.
operator_df2 = (
    pd.concat(operator_frames_time_only, axis=0)
    if operator_frames_time_only
    else pd.DataFrame()
)

In [143]:
def operator_trip_chart(df):
    """
    Horizontal bar chart of `Pct Usable Routes` for each `Route Cutoff`,
    filtered by a dropdown bound to `Name`.

    Parameters
    ----------
    df : frame passed through `threshold_utils.clean_up_columns`; the
        result must contain `Name`, `Route Cutoff`, `Pct Usable Routes`,
        `Total Routes`, and `Total Routes In Category`.

    NOTE(review): this redefines `operator_trip_chart`, shadowing the
    earlier trips-based chart function of the same name in this notebook.
    The name is kept so the existing call still works; consider renaming
    it (e.g. `operator_route_chart`) together with its call.
    """
    df = threshold_utils.clean_up_columns(df)

    # Dropdown menu 1
    dropdown1 = alt.binding_select(
        options=df["Name"].sort_values().unique().tolist(), name="Name"
    )
    selection1 = alt.selection_single(fields=["Name"], bind=dropdown1)

    chart = (
        alt.Chart(df)
        .mark_bar()
        .encode(
            x=alt.X(
                "Pct Usable Routes:Q",
                sort=alt.SortField("Pct Usable Routes", order="descending"),
            ),
            y=alt.Y(
                "Route Cutoff:N",
                sort=alt.SortField("Pct Usable Routes", order="descending"),
            ),
            color=alt.Color(
                "Route Cutoff:N",
                scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
            ),
            tooltip=[
                "Name",
                "Route Cutoff",
                "Pct Usable Routes",
                "Total Routes",
                "Total Routes In Category",
            ],
        )
        # This chart plots routes, so the title says routes rather than trips.
        .properties(title="Percentage of Usable Routes")
        .add_selection(selection1)
        .transform_filter(selection1)
    )

    chart = threshold_utils.chart_size(chart, 500, 300)
    return chart

In [144]:
# Render the per-operator usable-routes chart (time cutoffs only) with its operator dropdown.
operator_trip_chart(operator_df2)