### V2 Recommendations

In [1]:
import geopandas as gpd
import numpy as np
import pandas as pd
from calitp.sql import to_snakecase



In [2]:
import altair as alt
import _threshold_utils as threshold_utils
from shared_utils import calitp_color_palette as cp
from shared_utils import geography_utils, rt_utils, styleguide, utils

In [3]:
import intake
# Open the intake catalog (YAML next to this notebook) used to read datasets below.
catalog = intake.open_catalog(
    "./catalog_threshold.yml"
)

In [4]:
# Notebook display settings: up to 100 columns, floats with two decimals,
# and no truncation of rows or of cell contents.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

#### Load in data

In [5]:
# Route-length summary statistics, produced by the shared helper module.
route_length = (
    threshold_utils.summary_stats_route_length()
)

In [6]:
# Run the shared pre-cleaning step; later cells rely on the cleaned column
# names (e.g. "Name", "Variable").
route_length = threshold_utils.pre_clean(
    route_length
)

In [7]:
# Trip diagnostics merged with total segment counts (built by the helper module).
time_segments = (
    threshold_utils.merge_trip_diagnostics_with_total_segments()
)

In [8]:
# Sanity check: keep one row per operator name, then add up total_trips.
# NOTE(review): presumably total_trips repeats on every row of an operator — confirm.
one_row_per_operator = time_segments.drop_duplicates(subset="name")
one_row_per_operator["total_trips"].sum()

69874

In [9]:
# Candidate thresholds evaluated in this notebook.
# TIME_CUTOFFS are compared against trip time in minutes; SEGMENT_CUTOFFS are
# compared against the fraction of segments with vehicle positions.
TIME_CUTOFFS = [5, 10, 15]
SEGMENT_CUTOFFS = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]

In [11]:
# Usable-trip summary for every combination of time and segment cutoff.
valid_stats = threshold_utils.summary_valid_trips_by_cutoff(
    time_segments,
    TIME_CUTOFFS,
    SEGMENT_CUTOFFS,
)

In [13]:
# Same pre-cleaning step as applied to route_length.
valid_stats = threshold_utils.pre_clean(
    valid_stats
)

#### Delete any operators who do not have RT information

In [None]:
# Collect the operator names found in each dataset as sets, so the operators
# without RT information can be found with a set difference.
routelengthlist = set(route_length["Name"].unique().tolist())
tripslist = set(valid_stats["Name"].unique().tolist())

In [None]:
# Operators present in route_length but absent from valid_stats.
operators_wo_rt = list(routelengthlist.difference(tripslist))

In [None]:
# Keep only operators that also appear in valid_stats.
has_rt = ~route_length["Name"].isin(operators_wo_rt)
route_length = route_length.loc[has_rt].reset_index(drop=True)

In [None]:
# Sorted, de-duplicated operator names; these populate the dropdown that
# controls the charts.
dropdown_list = (
    route_length["Name"]
    .sort_values()
    .unique()
    .tolist()
)

In [None]:
# Dropdown menu 1: an "All" entry (value None) followed by every operator.
dropdown = alt.binding_select(
    options=[None, *dropdown_list],
    labels=["All", *dropdown_list],
    name="Operator",
)
# Single selection on "Name", driven by the dropdown above.
selection = alt.selection_single(fields=["Name"], bind=dropdown)
  

##### Dotplot

In [None]:
# Dot plot of rounded route-length percentage per operator, coloured by the
# summary statistic ("Variable") and filtered by the operator dropdown.
route_len_dot = (
    alt.Chart(route_length)
    .mark_circle(opacity=1, size=200)
    .transform_window(id="rank()", groupby=["Variable"])
    .encode(
        alt.X(
            "Rounded Route Length Percentage:O",
            sort="ascending",
            scale=alt.Scale(domain=[0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]),
            axis=alt.Axis(ticks=False, grid=True),
        ),
        alt.Y("Name:N"),
        color=alt.Color(
            "Variable:N",
            scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
            legend=None,
        ),
        tooltip=["Name", "Variable", "Route Length Percentage"],
    )
    .properties(title="Length of Shape ID versus Longest Shape ID")
    .add_selection(selection)
    .transform_filter(selection)
)
route_len_dot = threshold_utils.chart_size(route_len_dot, 500, 300)

##### Create text table for Routes

In [None]:
# Constant x-position so every label of the text "table" sits in one column.
route_length["Zero"] = 0
# Label text of the form "<Variable>-<Route Length Percentage>%".
route_length["Full Information"] = (
    route_length["Variable"]
    + "-"
    + route_length["Route Length Percentage"].astype(str)
    + "%"
)

In [None]:
# Median of the route-length percentage across all rows.
route_length["Route Length Percentage"].median()

In [None]:
# Text "table" of route-length statistics: every row is placed at x = 0 and
# identified by its "Full Information" label; filtered by the operator dropdown.
route_len_table = (
    alt.Chart(route_length)
    .mark_circle()
    .encode(
        x=alt.X("Zero:Q", axis=None),
        y=alt.Y("Full Information", axis=None),
    )
    .properties(title="Summary Statistics")
    .add_selection(selection)
    .transform_filter(selection)
)

# Replace the circle mark with text so the label itself is drawn.
route_len_table = route_len_table.mark_text(
    align="center", baseline="middle", dx=5
).encode(text="Full Information:N")
    

In [None]:
# Same 500 x 300 sizing call as the dot plot.
route_len_table = threshold_utils.chart_size(
    route_len_table, 500, 300
)

##### Find most lenient and stringent of each operator

In [None]:
# Highest and lowest "Percentage Usable Trips" per operator across all cutoffs.
valid_stats_leniency = (
    valid_stats
    .groupby(["Name"])
    .agg({"Percentage Usable Trips": ["max", "min"]})
    .reset_index()
)

In [None]:
# Flatten the two-level columns left by .agg(): drop the outer level so the
# columns become "", "max", "min".
valid_stats_leniency.columns = valid_stats_leniency.columns.droplevel(0)

In [None]:
# The operator column lost its label when the outer level was dropped; name it.
valid_stats_leniency = valid_stats_leniency.rename(
    columns={"": "name"}
)

In [None]:
# Apply the shared pre-cleaning step. The helper must be reached through the
# imported module: a bare `pre_clean` is never defined in this notebook and
# raises NameError on a fresh kernel.
valid_stats_leniency = threshold_utils.pre_clean(valid_stats_leniency)

In [None]:
# Constant x-position for the text "table" built from this frame.
valid_stats_leniency["Zero"] = 0

In [None]:
# One label per operator giving its highest and lowest share of usable trips.
valid_stats_leniency["Full Information"] = (
    "Most Lenient: "
    + valid_stats_leniency["Max"].astype(str)
    + "%"
    + " Most Stringent: "
    + valid_stats_leniency["Min"].astype(str)
    + "%"
)

In [None]:
# Text "table" with the most lenient / most stringent percentages per operator,
# filtered by the operator dropdown.
trip_stats_table = (
    alt.Chart(valid_stats_leniency)
    .mark_circle()
    .encode(
        x=alt.X("Zero:Q", axis=None),
        y=alt.Y("Full Information", axis=None),
    )
    .properties(title="Percentage of Trips Kept")
    .add_selection(selection)
    .transform_filter(selection)
)

# Replace the circle mark with text so the label itself is drawn.
trip_stats_table = trip_stats_table.mark_text(
    align="center", baseline="middle", dx=5
).encode(text="Full Information:N")
trip_stats_table = threshold_utils.chart_size(trip_stats_table, 500, 300)

##### Trip Stats

In [None]:
# Horizontal bars: share of usable trips for each cutoff combination, sorted
# from highest to lowest and filtered by the operator dropdown.
usable_desc = alt.SortField("Percentage Usable Trips", order="descending")
trip_stats_chart = (
    alt.Chart(valid_stats)
    .mark_bar()
    .encode(
        x=alt.X("Percentage Usable Trips:Q", sort=usable_desc),
        y=alt.Y("Cutoff:N", sort=usable_desc),
        color=alt.Color(
            "Cutoff:N",
            scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
            legend=None,
        ),
        tooltip=["Name", "Cutoff", "Percentage Usable Trips", "N Trips"],
    )
    .properties(title="Percentage of Usable Trips")
    .add_selection(selection)
    .transform_filter(selection)
)
trip_stats_chart = threshold_utils.chart_size(trip_stats_chart, 500, 300)

In [None]:
# Stack the four operator-level charts vertically; they share one dropdown.
(
    route_len_dot
    & route_len_table
    & trip_stats_chart
    & trip_stats_table
)

#### Bind operator-level graphs together (V2)

In [None]:
# Brush for selection
brush = alt.selection(type='interval')

In [None]:
# Same dot plot as above, but brushed with an interval selection in place of
# the operator dropdown.
boxplot2 = (
    alt.Chart(route_length)
    .mark_circle(opacity=1, size=150)
    .transform_window(id="rank()", groupby=["Variable"])
    .encode(
        alt.X(
            "Rounded Route Length Percentage:O",
            sort="ascending",
            scale=alt.Scale(domain=list(range(0, 101, 10))),
            axis=alt.Axis(ticks=False, grid=True),
        ),
        alt.Y("Name:N"),
        color=alt.Color(
            "Variable:N",
            scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
            legend=None,
        ),
        tooltip=["Name", "Variable", "Route Length Percentage"],
    )
    .properties(title="Length of Shape ID versus Longest Shape ID")
    .add_selection(brush)
)


In [None]:
# Sized 500 x 1000 here, versus 500 x 300 for the dropdown-driven charts.
boxplot2 = threshold_utils.chart_size(
    boxplot2, 500, 1000
)

In [None]:
# Base chart for the data table below the dot plot: number the rows, keep the
# brushed ones, rank those, and show only rows whose rank is below 5.
ranked_text = (
    alt.Chart(route_length)
    .mark_text()
    .encode(y=alt.Y("row_number:O", axis=None))
    .transform_window(row_number="row_number()")
    .transform_filter(brush)
    .transform_window(rank="rank(row_number)")
    .transform_filter(alt.datum.rank < 5)
)

In [None]:
# One text column per field; each reuses the ranked/filtered base chart.
operator = ranked_text.encode(text="Name:N").properties(title="Operator")
variable = ranked_text.encode(text="Variable:N").properties(
    title="Summary Statistics"
)
route_length_percentage = ranked_text.encode(
    text="Route Length Percentage:Q"
).properties(title="Route Length Percentage")

In [None]:
# Combine the three text columns side by side into one data table.
text = alt.hconcat(
    operator,
    variable,
    route_length_percentage,
)

In [None]:
# Dot plot on top, table of brushed rows underneath.
alt.vconcat(boxplot2, text)

### Statewide
#### Statewide Routes by Cutoff

In [None]:
# Usable trips per cutoff combination, summed over all operators.
statewide_sum = (
    valid_stats
    .groupby(["Cutoff"])
    .agg({"N Trips": "sum"})
    .reset_index()
)

In [None]:
# Statewide denominator: take each operator's largest "Total Trips" value once,
# then add the operators together.
trips_per_operator = valid_stats.groupby("Name")["Total Trips"].max()
total_trips_state = trips_per_operator.sum()

In [None]:
# Share of all trips statewide that remain usable under each cutoff, in percent.
statewide_sum["Percentage of Usable Trips"] = (
    statewide_sum["N Trips"] / total_trips_state * 100
)

In [None]:
# Horizontal bars: statewide share of usable trips per cutoff combination,
# sorted from highest to lowest.
statewide_desc = alt.SortField("Percentage of Usable Trips", order="descending")
statewide_chart = (
    alt.Chart(statewide_sum)
    .mark_bar()
    .encode(
        x=alt.X("Percentage of Usable Trips:Q", sort=statewide_desc),
        y=alt.Y("Cutoff:N", sort=statewide_desc),
        color=alt.Color(
            "Cutoff:N",
            scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
            legend=None,
        ),
        tooltip=["Cutoff", "Percentage of Usable Trips", "N Trips"],
    )
    .properties(title="Percentage of Usable Trips Across All Operators")
)

In [None]:
# Same 500 x 300 sizing call as the operator-level charts.
statewide_chart = threshold_utils.chart_size(
    statewide_chart, 500, 300
)

In [None]:
# Render the statewide usable-trips chart as this cell's output.
statewide_chart

#### How many routes will be cut with each threshold?
* TODO: identify the exact routes that are cut (see `find_cut_routes` under "Find specific missing routes").

In [None]:
def routes_left_thresholds():
    """
    Count how many routes survive, and how many are cut, for every
    combination of TIME_CUTOFFS x SEGMENT_CUTOFFS.

    A trip passes a combination when its trip time (minutes) is at least the
    time cutoff and its share of segments with vehicle positions is at least
    the segment cutoff. A route counts as "left" when at least one of its
    trips passes.

    Returns
    -------
    pd.DataFrame
        One row per cutoff combination with columns:
        ``index`` (always "route_id", a by-product of ``reset_index``),
        ``Total Routes in Category``, ``route_cutoff`` (label),
        ``total_routes``, ``percentage_of_routes_left`` and ``missing_routes``.
    """
    # NOTE(review): the original called a bare `merge_trips_routes_longest_shape()`,
    # which is not defined in this notebook (NameError on a fresh kernel).
    # Presumably it lives in `_threshold_utils` with the other helpers — confirm.
    trips_routes_shape = threshold_utils.merge_trips_routes_longest_shape()
    trip_stats = catalog.trip_stats.read()

    m1 = trip_stats.merge(
        trips_routes_shape.drop(columns=["route_length_percentage"]),
        how="inner",
        on=["gtfs_dataset_key", "route_dir_identifier"],
    )

    m1 = m1.assign(
        # Fraction (0-1) of a trip's segments that have vehicle positions.
        pct_vp_segments=m1.num_segments_with_vp.divide(m1.total_segments),
        # Trip duration converted from a timedelta to minutes.
        trip_time=((m1.trip_end - m1.trip_start) / np.timedelta64(1, "s")) / 60,
    )

    # NOTE(review): routes are counted by route_id alone; if route_ids repeat
    # across operators, distinct routes are merged in these counts — confirm
    # whether (operator, route_id) should be the unit instead.
    total_unique_routes = m1.route_id.nunique()

    # Collect one small frame per combination and concatenate once at the end,
    # rather than growing a DataFrame inside the loop.
    per_cutoff = []
    for t in TIME_CUTOFFS:
        for s in SEGMENT_CUTOFFS:
            passes = (m1.trip_time >= t) & (m1.pct_vp_segments >= s)
            valid = (
                m1.loc[passes, ["route_id"]]
                .nunique()
                .reset_index()
                .rename(columns={0: "Total Routes in Category"})
            )
            # `s` is a fraction, so scale it by 100 for the label; the original
            # label printed e.g. "0.1% segments" where 10% was meant.
            valid = valid.assign(route_cutoff=f"{t} min, {s * 100:g}% segments")
            per_cutoff.append(valid)

    routes = pd.concat(per_cutoff, axis=0)

    routes = routes.assign(
        total_routes=total_unique_routes,
        percentage_of_routes_left=(
            routes["Total Routes in Category"].divide(total_unique_routes)
        )
        * 100,
        missing_routes=total_unique_routes - routes["Total Routes in Category"],
    )

    return routes

In [None]:
# Route counts kept and cut for every time x segment cutoff combination.
routes_left = (
    routes_left_thresholds()
)

In [None]:
# Horizontal bars: percentage of routes left under each cutoff combination,
# sorted from highest to lowest. Column names are tidied by the helper first.
routes_left_display = threshold_utils.clean_up_columns(routes_left)
routes_left_desc = alt.SortField("Percentage Of Routes Left", order="descending")
statewide_routes_chart = (
    alt.Chart(routes_left_display)
    .mark_bar()
    .encode(
        x=alt.X("Percentage Of Routes Left:Q", sort=routes_left_desc),
        y=alt.Y("Route Cutoff:N", sort=routes_left_desc),
        color=alt.Color(
            "Route Cutoff:N",
            scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
            legend=None,
        ),
        tooltip=["Route Cutoff", "Percentage Of Routes Left", "Missing Routes"],
    )
    .properties(title="Percentage of Routes Left after Applying Thresholds")
)

In [None]:
# Size the routes chart (500 x 300) and display the result.
threshold_utils.chart_size(
    statewide_routes_chart, 500, 300
)

### Recommendation: trips of at least 15 minutes with vehicle positions on at least 70% of segments

In [None]:
# Median trip time across all trips, for context on the time cutoff.
time_segments["trip_time"].median()

In [None]:
# Distribution of each operator's best-case share of usable trips.
valid_stats_leniency["Max"].describe()

In [None]:
# Median across operators of the highest share of usable trips.
valid_stats_leniency["Max"].median()

In [None]:
# Median across operators of the lowest share of usable trips.
valid_stats_leniency["Min"].median()

### Find specific missing routes

In [None]:
def find_cut_routes(trip_time: int, segments_pct: float):
    """
    List the routes that disappear entirely once the thresholds are applied.

    A trip is kept when its trip time (minutes) is at least ``trip_time`` and
    its share of segments with vehicle positions is at least ``segments_pct``.
    A route is "cut" when none of its trips is kept.

    Parameters
    ----------
    trip_time : int
        Minimum trip duration in minutes.
    segments_pct : float
        Minimum fraction (0-1) of a trip's segments with vehicle positions.

    Returns
    -------
    pd.DataFrame
        One row per cut (operator name, route_id) pair, sorted by operator and
        route, after ``threshold_utils.pre_clean``.
    """
    # NOTE(review): the original called bare `merge_trips_routes_longest_shape()`
    # and `pre_clean()`, neither of which is defined in this notebook (NameError
    # on a fresh kernel). `pre_clean` is used via `threshold_utils` elsewhere;
    # the merge helper presumably lives in the same module — confirm.
    trips_routes_shape = threshold_utils.merge_trips_routes_longest_shape()
    trip_stats = catalog.trip_stats.read()

    m1 = trip_stats.merge(
        trips_routes_shape.drop(columns=["route_length_percentage"]),
        how="inner",
        on=["gtfs_dataset_key", "route_dir_identifier"],
    )

    m1 = m1.assign(
        # Fraction (0-1) of a trip's segments that have vehicle positions.
        pct_vp_segments=m1.num_segments_with_vp.divide(m1.total_segments),
        # Trip duration converted from a timedelta to minutes.
        trip_time=((m1.trip_end - m1.trip_start) / np.timedelta64(1, "s")) / 60,
    )

    # Identify a route by (operator, route_id). Comparing route_id alone lets a
    # route kept by one operator hide a cut route of another operator that
    # happens to use the same route_id.
    route_cols = ["name", "route_id"]

    all_routes = m1[route_cols].drop_duplicates()

    # Routes with at least one trip passing both thresholds.
    passes = (m1["trip_time"] >= trip_time) & (m1["pct_vp_segments"] >= segments_pct)
    kept_routes = m1.loc[passes, route_cols].drop_duplicates()

    # Anti-join: keep the routes that have no match among the kept routes.
    missing_routes_df = (
        all_routes.merge(kept_routes, on=route_cols, how="left", indicator=True)
        .query("_merge == 'left_only'")
        .drop(columns="_merge")
        # Sort first, then reset, so the returned index runs 0..n-1 in order.
        .sort_values(by=route_cols)
        .reset_index(drop=True)
    )

    missing_routes_df = threshold_utils.pre_clean(missing_routes_df)
    return missing_routes_df

In [None]:
# Routes cut under the recommended thresholds: 15-minute trips, 70% of segments.
find_cut_routes(trip_time=15, segments_pct=0.7)