## V2 
### Altair Stuff
* https://stackoverflow.com/questions/62109475/how-to-deselect-drop-down-box-of-altair

In [1]:
import geopandas as gpd
import numpy as np
import pandas as pd
from calitp.sql import to_snakecase



In [2]:
import altair as alt
import threshold_utils
from shared_utils import calitp_color_palette as cp
from shared_utils import geography_utils, rt_utils, styleguide, utils

In [3]:
import intake
catalog = intake.open_catalog("./catalog_threshold.yml")

In [4]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

### Tests
* Keep name? 

In [5]:
# Keep name?
def clean_trips():
    df = catalog.trips.read()

    subset = [
        "feed_key",
        "name",
        "route_id",
        "direction_id",
        "shape_id",
    ]

    df = df[subset]

    df = df.drop_duplicates().reset_index(drop=True)

    return df

In [6]:
def clean_routelines():
    df = catalog.route_lines.read()

    df = df.drop(columns=["shape_array_key"])
    
    df = (df.drop_duplicates()).reset_index(drop=True)

    # Calculate length of geometry
    df = df.assign(actual_route_length=(df.geometry.length))

    return df

In [7]:
def clean_longest_shape():
    df = catalog.longest_shape.read()

    df = df.rename(columns={"route_length": "longest_route_length"})

    return df

### Function for shape id vs longest shape id length
* Best way is using `gtfs_dataset_key` or `name`?

In [8]:
def merge_trips_routes_longest_shape():
    """
    Merge and find the shape_id's length
    versus the longest shape_id's length.
    Count segments.
    """
    trips = clean_trips()
    crosswalk = catalog.crosswalk.read()
    routelines = clean_routelines()
    longest_shape = clean_longest_shape()

    m1 = (
        trips.merge(
            crosswalk, how="inner", on=["feed_key", "route_id", "name", "direction_id"]
        )
        .merge(routelines, how="inner", on=["feed_key", "shape_id"])
        .merge(
            longest_shape.drop(columns=["geometry"]),
            how="inner",
            on=[ "feed_key","gtfs_dataset_key","direction_id","route_id","route_dir_identifier","name"],
        )
    )

    # Calculate out proportion of route length against longest.
    m1["route_length_percentage"] = (
        (m1["actual_route_length"] / m1["longest_route_length"]) * 100
    ).astype(int)

    # Count number of segments that appear in the longest shape.
    m1 = (
        m1.groupby(
            [
                "route_id",
                "name",
                "gtfs_dataset_key",
                "route_dir_identifier",
                "shape_id",
                "longest_shape_id",
                "route_length_percentage",
            ]
        )
        .agg({"segment_sequence": "count"})
        .rename(columns={"segment_sequence": "total_segments"})
        .reset_index()
    )

    return m1

In [9]:
def calculate_longest_shape(operator: str):
    """
    Get total segments of the longest shape and
    shape_id's length compared to the longest
    shape_id's length for a route.
    """
    df = merge_trips_routes_longest_shape()

    df = df.loc[df.name == operator].reset_index(drop=True)

    agg = (
        df.groupby(["name", "route_id", "shape_id"])
        .agg({"route_length_percentage": "max"})
        .reset_index()
    )

    return agg

In [10]:
def summary_stats_route_length():
    """
    Get mean, median, max, and min longest shape_id
    versus actual shape_id  of route length for every operator.
    All in one dataframe.
    """
    df = merge_trips_routes_longest_shape()

    df = (
        df.groupby(["name", "route_id", "shape_id"])
        .agg({"route_length_percentage": "max"})
        .reset_index()
    )
    
    # Get summary stats
    df = (
        df.groupby("name")
        .agg({"route_length_percentage": ["mean", "median", "min", "max"]})
        .reset_index()
    )
    
    # Drop index
    df.columns = df.columns.droplevel()
 
    df = df.rename(columns={"": "name",})
    
    # Melt to long df
    df = pd.melt(df, id_vars=["name"], value_vars=["mean", "median", "min", "max"])
    df = df.rename(columns={"value": "route_length_percentage",})
    
    # Title case variable col
    df.variable = df.variable.str.title()
    
    # Round value col for axis
    df['rounded_route_length_percentage'] = ((df.route_length_percentage/100)*10).astype(int)*10
    
    # Sort values by name and mean/median
    df = df.sort_values(['name','variable']).reset_index(drop = True)
    return df

In [11]:
route_length = summary_stats_route_length()

### Precleaning Function

In [12]:
def pre_clean(df):
    df = df.round(1)
    df = threshold_utils.clean_up_columns(df)
    return df

In [13]:
route_length = pre_clean(route_length)

In [14]:
route_length.head()

Unnamed: 0,Name,Variable,Route Length Percentage,Rounded Route Length Percentage
0,Antelope Valley Transit Authority Schedule,Max,100.0,100
1,Antelope Valley Transit Authority Schedule,Mean,94.4,90
2,Antelope Valley Transit Authority Schedule,Median,100.0,100
3,Antelope Valley Transit Authority Schedule,Min,17.0,10
4,Banning Pass Schedule,Max,100.0,100


### Operator 

* just 1 boxplot or dot plot for the operator for scheduled shapes % route length
* i've also pared back the summary stats to accompany each chart (just the crucial stuff)
* only allow altair selection on operator, not time or segment, i'm not going to remember what it is moving from selection to selection, so i'd rather see it at once. 
    * For usable trips remove time/segment. 
    * Thought the workflow would be running per operator as opposed to running all operators and selecting from the dropdown menu?
    * If the dropdown menu should be operator -> have to combine all the dataframes together.
* if you find it too crowded, i suggest paring down what's * included in SEGMENT_CUTOFFS ....likely we're never going to go up to 0.9, so you can probably scale it back
* continue with the alt.vconcat, but i want the single boxplot or dotplot to be aligned the same way as the bar
the table can be printed below (outside of the charts) if that's easier
    * Wants charts to run in the same direction.
    * Rotate dotplot instead of increasing up and down. Increasing should be left to right. 
    

* Box plot looks super bare.

#### Shape ID vs. Longest Shape ID Lengths 
* https://stackoverflow.com/questions/75414637/altair-panel-chart-dynamic-dropdown-filter-doesnt-work

##### Boxplot

In [15]:
# boxplot = merge_trips_routes_longest_shape()

In [16]:
"""
boxplot = (
    boxplot.groupby(["name", "route_id", "shape_id"])
    .agg({"route_length_percentage": "max"})
    .reset_index()
)
"""

'\nboxplot = (\n    boxplot.groupby(["name", "route_id", "shape_id"])\n    .agg({"route_length_percentage": "max"})\n    .reset_index()\n)\n'

In [17]:
"""
threshold_utils.chart_size(
    (
        alt.Chart(boxplot)
        .mark_boxplot(extent="min-max")
        .encode(
            x="route_length_percentage:Q",
            y="name:N",
            color=alt.Color(
                "name",
                scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
                legend=None,
            ),
        )
    ).properties(title=f"Test"),
    1000,
    1000,
)"""

'\nthreshold_utils.chart_size(\n    (\n        alt.Chart(boxplot)\n        .mark_boxplot(extent="min-max")\n        .encode(\n            x="route_length_percentage:Q",\n            y="name:N",\n            color=alt.Color(\n                "name",\n                scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),\n                legend=None,\n            ),\n        )\n    ).properties(title=f"Test"),\n    1000,\n    1000,\n)'

#### Trip Time and Segment

In [18]:
def merge_trip_diagnostics_with_total_segments():
    trip_diagnostics = catalog.trip_stats.read()
    
    # Load in longest shape
    segments = catalog.longest_shape.read()
    
    # Count # of segments by longest recorded shape.
    # For each route direction and operator.
    total_segments_by_shape = (
        segments.groupby(["gtfs_dataset_key", "name", "route_dir_identifier"])
        .segment_sequence.nunique()
        .reset_index()
        .rename(columns={"segment_sequence": "total_segments"})
    )
    
    df = pd.merge(
        trip_diagnostics,
        total_segments_by_shape,
        on=["gtfs_dataset_key", "route_dir_identifier"],
        how="inner",
        validate="m:1",
    )
    
    # Find the total of segments that appear vs. what 'should' appear,
    # trip time, and number of trips the operator made in total.
    df = df.assign(
        pct_vp_segments=df.num_segments_with_vp.divide(df.total_segments),
        trip_time=((df.trip_end - df.trip_start) / np.timedelta64(1, "s") / 60).astype(
            int
        ),
        total_trips=df.groupby(["gtfs_dataset_key", "name"]).trip_id.transform(
            "nunique"
        ),
    )

    return df

In [19]:
all_ops = merge_trip_diagnostics_with_total_segments()

In [20]:
all_ops.drop_duplicates(subset = 'name')['total_trips'].sum()

69874

In [21]:
def summary_valid_trips_by_cutoff(df, time_cutoffs: list, segment_cutoffs: list):
    """
    Find percentage of trips that meet trip time 
    and percentage of segments by
    """
    final = pd.DataFrame()

    for t in time_cutoffs:
        for s in segment_cutoffs:
            valid = (
                df[(df.trip_time >= t) & (df.pct_vp_segments >= s)]
                .groupby(["gtfs_dataset_key", "name", "total_trips"])
                .trip_id.nunique()
                .reset_index()
                .rename(columns={"trip_id": "n_trips"})
            )

            valid = valid.assign(
                trip_cutoff=t, segment_cutoff=s*100, cutoff=f"{t}+ min & {s*100}%+ segments"
            )

            final = pd.concat([final, valid], axis=0)

    final = final.assign(percentage_usable_trips=final.n_trips.divide(final.total_trips) * 100)

    return final

In [22]:
TIME_CUTOFFS = [5, 10, 15]
SEGMENT_CUTOFFS = [
    0.1,
    0.2,
    0.3,
    0.4,
    0.5,
    0.6,
    0.7,
    0.8
]

In [23]:
valid_stats = summary_valid_trips_by_cutoff(all_ops, TIME_CUTOFFS, SEGMENT_CUTOFFS)

In [24]:
valid_stats = pre_clean(valid_stats)

#### Bind Operator-level graphs together. V1.

In [25]:
route_length = threshold_utils.clean_up_columns(route_length)

In [26]:
valid_stats = threshold_utils.clean_up_columns(valid_stats)

In [27]:
# Column that controls the bar charts
dropdown_list = route_length["Name"].sort_values().unique().tolist()

In [28]:
# Dropdown menu 1
dropdown = alt.binding_select(options=[None] + dropdown_list, labels = ['All'] + dropdown_list, name = "Operator")    
selection = alt.selection_single(fields=["Name"], bind=dropdown)
  

##### Dotplot

In [29]:
route_length.sample()

Unnamed: 0,Name,Variable,Route Length Percentage,Rounded Route Length Percentage
219,Madera County Connection Schedule,Min,88.0,80


In [129]:
route_len_dot = (alt.Chart(route_length).mark_circle(opacity=1, size = 200).transform_window(
    id='rank()',
    groupby=['Variable']).encode(
  
    alt.X('Rounded Route Length Percentage:O', sort='ascending',
          scale=alt.Scale(domain=[0,10,20,30,40,50,60,70,80,90,100])),
    alt.Y('Name:N'), 
    color=alt.Color("Variable:N", scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS), legend=None),
    tooltip = ['Name', 'Variable', 'Route Length Percentage'])
             .properties(title = "Length of Shape ID versus Longest Shape ID").add_selection(selection).transform_filter(selection))
route_len_dot = threshold_utils.chart_size(route_len_dot, 500, 300)

In [164]:
dot_chart = (alt.Chart(route_length).mark_circle(opacity=1, size = 200).transform_window(
    id='rank()',
    groupby=['Variable']).encode(
  
    alt.X('Rounded Route Length Percentage:O', sort='ascending',
          scale=alt.Scale(domain=[0,10,20,30,40,50,60,70,80,90,100])),
    alt.Y('Name:N'), 
    color=alt.Color("Variable:N", scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS), legend=None),
    tooltip = ['Name', 'Variable', 'Route Length Percentage'])
             .properties(title = "Length of Shape ID versus Longest Shape ID"))

##### Add percentile grids
https://stackoverflow.com/questions/72559887/add-vertical-lines-with-same-encodings-as-corresponding-curve-in-altair

In [96]:
p25 = route_length['Route Length Percentage'].quantile(0.25).astype(float)

In [154]:
p25

86.625

In [98]:
p50 = route_length['Route Length Percentage'].quantile(0.50).astype(float)

In [97]:
p75 = route_length['Route Length Percentage'].quantile(0.75).astype(float)

In [170]:
p25_line = alt.Chart(pd.DataFrame({'x': [p25]})).mark_rule(clip = False, strokeDash=[10, 10]).encode(x='x')

In [136]:
p50_line = alt.Chart(pd.DataFrame({'x': [p50]})).mark_rule(clip = False,strokeDash=[10, 10]).encode(x='x')

In [175]:
p75_line = alt.Chart(pd.DataFrame({'x': [p75]})).mark_rule(clip = False,strokeDash=[10, 10]).encode(x='x')

In [184]:
base = alt.Chart(
    pd.DataFrame({'25th':[p25]})
).mark_rule(color='black', strokeDash=[4, 2]).encode(
    x='x'
)

In [185]:
base

In [176]:
(route_len_dot + p25_line + p50_line + p75_line)

In [179]:
(route_len_dot + base)

##### Create text table for Routes

In [31]:
route_length["Zero"] = 0
route_length["Full Information"] = route_length["Variable"] + '-' + route_length["Route Length Percentage"].astype(int).astype(str) + "%"

In [32]:
route_length['Route Length Percentage'].median()

100.0

In [33]:
route_len_table = (
        (alt.Chart(route_length)
            .mark_circle()
            .encode(x=alt.X("Zero:Q", axis=None), 
            y=alt.Y("Full Information", axis=None))
            .properties(title=f"Summary Statistics")).add_selection(selection).transform_filter(selection))
    
route_len_table = (route_len_table.mark_text(
        align="center",
        baseline="middle",
        dx=5)
        .encode(text="Full Information:N")
           )
    

In [34]:
route_len_table = threshold_utils.chart_size(route_len_table, 500,300)

##### Find most lenient and stringent of each operator

In [35]:
valid_stats_leniency = valid_stats.groupby(["Name"]).agg({"Percentage Usable Trips": ["max", "min"]}).reset_index()

In [36]:
valid_stats_leniency.columns = valid_stats_leniency.columns.droplevel()

In [37]:
valid_stats_leniency = valid_stats_leniency.rename(columns={"": "name",})

In [38]:
valid_stats_leniency = pre_clean(valid_stats_leniency)

In [39]:
valid_stats_leniency["Zero"] = 0

In [40]:
valid_stats_leniency['Full Information'] = 'Most Lenient: ' + valid_stats_leniency.Max.astype(str) + "%"  + ' Most Stringent: ' + valid_stats_leniency.Min.astype(str) + "%" 

In [41]:
trip_stats_table = (
        (alt.Chart(valid_stats_leniency)
            .mark_circle()
            .encode(x=alt.X("Zero:Q", axis=None), 
            y=alt.Y("Full Information", axis=None))
            .properties(title=f"Percentage of Trips Kept")).add_selection(selection).transform_filter(selection))
    
trip_stats_table = (trip_stats_table.mark_text(
        align="center",
        baseline="middle",
        dx=5)
        .encode(text="Full Information:N")
           )
trip_stats_table =  threshold_utils.chart_size(trip_stats_table, 500,300) 

#####  Trip Stats

In [42]:
trip_stats_chart = (
        alt.Chart(valid_stats)
        .mark_bar()
        .encode(
            x=alt.X(
                "Percentage Usable Trips:Q",
                sort=alt.SortField("Percentage Usable Trips", order="descending"),
            ),
            y=alt.Y(
                "Cutoff:N", sort=alt.SortField("Percentage Usable Trips", order="descending")
            ),
            color=alt.Color(
                "Cutoff:N", scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS), legend=None
            ),
            tooltip=[
                "Name",
                "Cutoff",
                "Percentage Usable Trips",
                "N Trips",
            ],
        )
        .properties(title=f"Percentage of Usable Trips")
        .add_selection(selection)
        .transform_filter(selection)
    )
trip_stats_chart = threshold_utils.chart_size(trip_stats_chart, 500, 300)

In [43]:
route_len_dot & route_len_table & trip_stats_chart & trip_stats_table

In [44]:
routelengthlist = set(route_length.Name.unique().tolist())
tripslist = set(valid_stats.Name.unique().tolist())
routelengthlist - tripslist

{'Bay Area 511 ACE Schedule',
 'Bay Area 511 BART Schedule',
 'Bay Area 511 Capitol Corridor Schedule',
 'Bay Area 511 Commute.org Schedule',
 'Bay Area 511 Golden Gate Ferry Schedule',
 'Bay Area 511 Rio Vista Delta Breeze Schedule',
 'Bay Area 511 SFO AirTrain Schedule',
 'Bay Area 511 South San Francisco Shuttle Schedule',
 'Bay Area 511 Treasure Island Ferry Schedule',
 'Bay Area 511 Vacaville City Coach Schedule',
 'Merced GMV Schedule',
 'Sacramento Schedule',
 'TIME GMV Schedule',
 'eTrans Schedule'}

In [45]:
tripslist - routelengthlist 

set()

#### Bind Operator-level graphs together. V2

In [46]:
# Brush for selection
brush = alt.selection(type='interval')

In [47]:
boxplot2 = (alt.Chart(route_length).mark_circle(opacity=1, size = 150).transform_window(
    id='rank()',
    groupby=['Variable']).encode(
    alt.X('Rounded Route Length Percentage:O', sort='ascending', axis=alt.Axis(ticks=False, grid=False)),
          alt.Y('Name:N'), 
    color=alt.Color("Variable:N", scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS), legend=None),
    tooltip = ['Name', 'Variable', 'Route Length Percentage'])
     .properties(title = "Length of Shape ID versus Longest Shape ID").add_selection(brush))


In [48]:
# c1[['Name','Variable','Route Length Percentage']].dropna()

In [49]:
boxplot2 = threshold_utils.chart_size(boxplot2,500, 1000)

In [50]:
ranked_text = alt.Chart(route_length).mark_text().encode(
    y=alt.Y('row_number:O',axis=None)
).transform_window(
    row_number='row_number()'
).transform_filter(
    brush
).transform_window(
    rank='rank(row_number)'
).transform_filter(
    alt.datum.rank < 10
)

In [51]:
operator = ranked_text.encode(text='Name:N').properties(title='Operator')
variable = ranked_text.encode(text='Variable:N').properties(title='Summary Statistics')
route_length_percentage = ranked_text.encode(text='Route Length Percentage:Q').properties(title='Route Length Percentage')

In [52]:
text = alt.hconcat(operator, variable, route_length_percentage) # Combine data tables

In [53]:
alt.vconcat(boxplot2,
    text,trip_stats_chart, trip_stats_table
)

### Statewide
#### Statewide Routes by Cutoff

In [54]:
statewide_sum = valid_stats.groupby(['Cutoff']).agg({'N Trips':'sum'}).reset_index()

In [55]:
total_trips_state = valid_stats.groupby('Name')['Total Trips'].max().sum()

In [56]:
statewide_sum['Percentage of Usable Trips'] = statewide_sum['N Trips']/total_trips_state * 100

In [57]:
statewide_chart = (
        alt.Chart(statewide_sum)
        .mark_bar()
        .encode(
            x=alt.X(
                "Percentage of Usable Trips:Q",
                sort=alt.SortField("Percentage of Usable Trips", order="descending"),
            ),
            y=alt.Y(
                "Cutoff:N", sort=alt.SortField("Percentage of Usable Trips", order="descending")
            ),
            color=alt.Color(
                "Cutoff:N", scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS), legend=None
            ),
            tooltip=[
                "Cutoff",
                "Percentage of Usable Trips",
                "N Trips",
            ],
        )
        .properties(title="Percentage of Usable Trips Across All Operators")
    )

In [58]:
statewide_chart = threshold_utils.chart_size(statewide_chart, 500, 300)

In [59]:
statewide_chart

#### How many routes will be cut with each threshold?
* Find exact routes??

In [60]:
def routes_left_thresholds():
    """
    Find number of routes that are cut
    and are left after applying thresholds
    """
    trips_routes_shape = merge_trips_routes_longest_shape()
    trip_stats = catalog.trip_stats.read()
    
    m1 = trip_stats.merge(
        trips_routes_shape.drop(columns=["route_length_percentage"]),
        how="inner",
        on=["gtfs_dataset_key", "route_dir_identifier"],
    )
    
    m1 = m1.assign(
        pct_vp_segments=m1.num_segments_with_vp.divide(m1.total_segments),
        trip_time=((m1.trip_end - m1.trip_start) / np.timedelta64(1, "s")) / 60,
    )
    
    total_unique_routes = m1.route_id.nunique() 
    
    routes = pd.DataFrame()
    for t in TIME_CUTOFFS:
        for s in SEGMENT_CUTOFFS:
            valid = (
                m1[(m1.trip_time >= t) & (m1.pct_vp_segments >= s)][["route_id"]]
                    .nunique()
                    .reset_index()
                    .rename(columns={0: "Total Routes in Category"})
                )

            valid = valid.assign(route_cutoff=f"{t} min, {s}% segments")

            routes = pd.concat([routes, valid], axis=0)
            
    routes = routes.assign(
            total_routes=total_unique_routes,
            percentage_of_routes_left=(routes["Total Routes in Category"].divide(
                total_unique_routes))*100,
            missing_routes = total_unique_routes - routes["Total Routes in Category"],)
    
    return routes

In [61]:
routes_left = routes_left_thresholds()

In [62]:
statewide_routes_chart = (
        alt.Chart(threshold_utils.clean_up_columns(routes_left))
        .mark_bar()
        .encode(
            x=alt.X(
                "Percentage Of Routes Left:Q",
                sort=alt.SortField("Percentage Of Routes Left", order="descending"),
            ),
            y=alt.Y(
                "Route Cutoff:N", sort=alt.SortField("Percentage Of Routes Left", order="descending")
            ),
            color=alt.Color(
                "Route Cutoff:N", scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS), legend=None
            ),
            tooltip=[
                "Route Cutoff",
                "Percentage Of Routes Left",
                "Missing Routes"
            ],
        )
        .properties(title="Percentage of Routes Left after Applying Thresholds")
    )

In [63]:
threshold_utils.chart_size(statewide_routes_chart, 500, 300)

### Recs - 15 minutes, 70% of segments

In [64]:
all_ops.trip_time.median()

60.0

In [65]:
valid_stats_leniency.Max.describe()

count    69.00
mean     97.31
std       3.97
min      79.80
25%      96.80
50%      98.70
75%      99.80
max     100.00
Name: Max, dtype: float64

In [66]:
valid_stats_leniency.Max.median()

98.7

In [67]:
valid_stats_leniency.Min.median()

84.7

##### Find specific missing routes?

In [68]:
trips_routes_shape = merge_trips_routes_longest_shape()
trip_stats = catalog.trip_stats.read()
    
m1 = trip_stats.merge(
        trips_routes_shape.drop(columns=["route_length_percentage"]),
        how="inner",
        on=["gtfs_dataset_key", "route_dir_identifier"],
    )
    
m1 = m1.assign(
        pct_vp_segments=m1.num_segments_with_vp.divide(m1.total_segments),
        trip_time=((m1.trip_end - m1.trip_start) / np.timedelta64(1, "s")) / 60,
    )

In [69]:
# Double checks
test_missing_routes = m1[(m1["trip_time"] >= 15 ) & (m1["pct_vp_segments"] >= 0.7)][['name','route_id']].drop_duplicates()

In [70]:
routes_left_after_threshold = set(test_missing_routes.route_id.tolist())

In [71]:
all_routes = set(m1.route_id.unique().tolist())

In [72]:
missing_routes = list(all_routes - routes_left_after_threshold)

In [73]:
m1[m1["route_id"].isin(missing_routes)]['route_id'].nunique()

28

In [74]:
m1[m1["route_id"].isin(missing_routes)]['name'].unique()

array(['Bay Area 511 Santa Clara Transit Schedule', 'Lake Schedule',
       'Bay Area 511 County Connection Schedule',
       'Antelope Valley Transit Authority Schedule',
       'Bay Area 511 Caltrain Schedule',
       'Bay Area 511 AC Transit Schedule',
       'Bay Area 511 Tri-Valley Wheels Schedule',
       'Bay Area 511 Vine Transit Schedule', 'Santa Clarita Schedule',
       'VCTC GMV Schedule', 'Eastern Sierra Schedule', 'SBMTD Schedule',
       'Bay Area 511 San Francisco Bay Ferry Schedule',
       'StanRTA Schedule', 'Humboldt Schedule'], dtype=object)

In [75]:
m1[m1["route_id"].isin(missing_routes)][['name','route_id',]].drop_duplicates()

Unnamed: 0,name,route_id
10254,Bay Area 511 Santa Clara Transit Schedule,ACE Gray
10255,Bay Area 511 Santa Clara Transit Schedule,ACE Green
10257,Bay Area 511 Santa Clara Transit Schedule,ACE Orange
10261,Bay Area 511 Santa Clara Transit Schedule,ACE Purple
10262,Bay Area 511 Santa Clara Transit Schedule,ACE Red
10304,Bay Area 511 Santa Clara Transit Schedule,ACE Violet
11820,Lake Schedule,2033
20520,Bay Area 511 County Connection Schedule,92X
21022,Antelope Valley Transit Authority Schedule,790
36697,Bay Area 511 Caltrain Schedule,L3


In [76]:
for o in subset.name.unique().tolist():
    for t in TIME_CUTOFFS:
        for s in SEGMENT_CUTOFFS:
            valid = subset.loc[subset.name == o].reset_index(drop = True)
            total_unique_routes = valid.route_id.nunique() 
            operator = valid.head(1).iloc[0]["name"]
            valid = (
                    valid[(valid.trip_time >= t) & (valid.pct_vp_segments >= s)][["route_id"]]
                    .nunique()
                    .reset_index()
                    .rename(columns={0: "Total Routes in Category"})
                )

            valid = valid.assign(
                time_cutoff=t,
                segment_cutoff=s,
                name = operator,
                route_cutoff=f"{t}+ min & {s*100}%+ segments",
                total_routes=total_unique_routes,
                pct_usable_routes=valid["Total Routes in Category"].divide(
                    total_unique_routes
                ),
            )

            operator_df = pd.concat([operator_df, valid], axis=0)

NameError: name 'subset' is not defined

In [None]:
empty_df.name.sort_values().unique().tolist()