## RT trip diagnostics: thresholds for usable trips 
### To Do
* Add operator name to itp_id.

In [1]:
import altair as alt
import dask.dataframe as dd
import dask_geopandas as dg
import geopandas as gpd
import numpy as np
import pandas as pd
from calitp.sql import to_snakecase



In [2]:
from shared_utils import calitp_color_palette as cp
from shared_utils import geography_utils, styleguide, utils

In [3]:
# Save files to GCS
from calitp.storage import get_fs
fs = get_fs()

In [4]:
# Record start and end time
import datetime
from loguru import logger

In [5]:
import intake
catalog = intake.open_catalog("./catalog_threshold.yml")

In [6]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

### Operator Level
#### Shape ID Length vs. Longest shape ID Length
* Using GTFS schedule data, by route_id-shape_id, calculate the route_length of each shape_id as a proportion of the longest shape_id. 
* For <b>each route_id</b>, what's the shortest shape_id length, in proportion to the longest shape_id's length. if it's 100%, then all shape_ids are equal length for that route. if it's 50%, there's a short trip that exists that only runs 50% of the length and turns around.

<b>How</b>
* Need table `trips` from compile cached views -> shape ID and route ID and direction ID -> merge in segments crosswalk with route direction identifier 
* Shapes table -> attach route dir identifier
* Merge in longest shape line using  routes and direction take the fraction. 

In [7]:
def clean_trips():
    df = catalog.trips.read()

    subset = [
        "calitp_itp_id",
        "route_id",
        "direction_id",
        "shape_id",
    ]

    df = df[subset]

    df = df.drop_duplicates().reset_index(drop=True)

    return df

In [8]:
def clean_routelines():
    df = catalog.route_lines.read()

    # Drop CalITP since it's no longer needed
    df = df.drop(columns=["calitp_url_number"])
    df = (df.drop_duplicates()).reset_index(drop=True)

    # Calculate length of geometry
    df = df.assign(actual_route_length=(df.geometry.length))

    return df

In [9]:
def clean_longest_shape():
    df = catalog.longest_shape.read()

    df = df.rename(columns={"route_length": "longest_route_length"})

    return df

In [10]:
def merge_trips_routes_longest_shape():
    trips = clean_trips()
    crosswalk = catalog.crosswalk.read()
    routelines = clean_routelines()
    longest_shape = clean_longest_shape()

    m1 = (
        trips.merge(
            crosswalk, how="inner", on=["calitp_itp_id", "route_id", "direction_id"]
        )
        .merge(routelines, how="inner", on=["calitp_itp_id", "shape_id"])
        .merge(
            longest_shape.drop(columns=["geometry"]),
            how="inner",
            on=["calitp_itp_id", "direction_id", "route_id", "route_dir_identifier"],
        )
    )

    # Calculate out proportion of route length against longest.
    m1["route_length_percentage"] = (
        (m1["actual_route_length"] / m1["longest_route_length"]) * 100
    ).astype(int)

    # Count number of segments that appear in the longest shape.
    m1 = (
        m1.groupby(
            [
                "route_id",
                "calitp_itp_id",
                "route_dir_identifier",
                "shape_id",
                "longest_shape_id",
                "route_length_percentage",
            ]
        )
        .agg({"segment_sequence": "count"})
        .rename(columns={"segment_sequence": "total_segments"})
        .reset_index()
    )
   
    return m1

In [11]:
def calculate_longest_shape(operator: int):
    """
    Get total segments of the longest shape and
    shape_id's length compared to the longest
    shape_id's length for a route.
    """
    df = merge_trips_routes_longest_shape()

    df = df.loc[df.calitp_itp_id == operator].reset_index(drop=True)
    
    agg = (
        df.groupby(["calitp_itp_id", "route_id", "shape_id"])
        .agg({"route_length_percentage": "max"})
        .reset_index()
    )

    return df

In [56]:
muni = calculate_longest_shape(282)

In [12]:
def clean_up_columns(df):
    df.columns = df.columns.str.replace("_", " ").str.strip().str.title()
    return df

In [13]:
def chart_size(chart: alt.Chart, chart_width: int, chart_height: int) -> alt.Chart:
    chart = chart.properties(width=chart_width, height=chart_height)
    return chart

In [14]:
def length_comparison_boxplot(df):
    df = clean_up_columns(df)
    operator = df.head(1).iloc[0]["Calitp Itp Id"]
    chart = (
        alt.Chart(df)
        .mark_boxplot(extent="min-max")
        .encode(
            x="Route Id:N",
            y="Route Length Percentage:Q",
            color=alt.Color(
                "Route Id",
                scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
                legend=None,
            ),
        )
    ).properties(title=f"Longest vs. Recorded Route Lengths for Operator {operator}")

    chart = chart_size(chart, 1200, 300)
    return chart

In [15]:
# length_comparison_boxplot(muni)

In [16]:
def length_comparison_dot(df):

    df = clean_up_columns(df)
    operator = df.head(1).iloc[0]["Calitp Itp Id"]

    chart = (
        alt.Chart(df, width=5)
        .mark_circle(size=200)
        .encode(
            x=alt.X(
                "jitter:Q",
                title=None,
                axis=alt.Axis(values=[0], ticks=False, grid=False, labels=False),
                scale=alt.Scale(),
            ),
            y=alt.Y("Route Length Percentage:Q"),
            color=alt.Color(
                "Route Id:N",
                scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
                legend=None,
            ),
            tooltip=["Route Length Percentage", "Route Id"],
            column=alt.Column(
                "Route Id:N",
                header=alt.Header(
                    labelAngle=90,
                    titleOrient="top",
                    labelOrient="bottom",
                    labelAlign="right",
                    labelPadding=2,
                ),
            ),
        )
        .transform_calculate(
            # Generate Gaussian jitter with a Box-Muller transform
            jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
        )
        .configure_facet(spacing=0)
        .configure_view(stroke=None)
        .properties(title=f"Routes by Length for Operator {operator}")
    )
    return chart

In [57]:
length_comparison_dot(muni)

### Cut Off by Trips Tiffany's Code
* Double check that my changes are correct.

In [18]:
def merge_trip_diagnostics_with_total_segments():
    trip_diagnostics = catalog.trip_stats.read()
    segments = catalog.longest_shape.read()

    total_segments_by_shape = (
        segments.groupby(["calitp_itp_id", "route_dir_identifier"])
        .segment_sequence.nunique()
        .reset_index()
        .rename(columns={"segment_sequence": "total_segments"})
    )

    df = pd.merge(
        trip_diagnostics,
        total_segments_by_shape,
        on=["calitp_itp_id", "route_dir_identifier"],
        how="inner",
        validate="m:1",
    )

    # Should  this be divided by 60 to be in minutes for the latter chunk of code?
    df = df.assign(
        pct_vp_segments=df.num_segments_with_vp.divide(df.total_segments),
        trip_time=((df.trip_end - df.trip_start) / np.timedelta64(1, "s") / 60).astype(
            int
        ),
        total_trips=df.groupby("calitp_itp_id").trip_id.transform("nunique"),
    )

    return df

In [19]:
all_operators = merge_trip_diagnostics_with_total_segments()

In [20]:
def summary_valid_trips_by_cutoff(df, time_cutoffs: list, segment_cutoffs: list):
    final = pd.DataFrame()

    for t in time_cutoffs:
        for s in segment_cutoffs:
            valid = (
                df[(df.trip_time >= t) & (df.pct_vp_segments >= s)]
                .groupby(["calitp_itp_id", "total_trips"])
                .trip_id.nunique()
                .reset_index()
                .rename(columns={"trip_id": "n_trips"})
            )

            valid = valid.assign(
                trip_cutoff=t, segment_cutoff=s, cutoff=f"{t}+ min & {s*100}%+ segments"
            )

            final = pd.concat([final, valid], axis=0)

    final = final.assign(pct_usable_trips=final.n_trips.divide(final.total_trips))

    return final

In [21]:
TIME_CUTOFFS = [5, 10, 15]
SEGMENT_CUTOFFS = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

In [22]:
valid_stats = summary_valid_trips_by_cutoff(all_operators, TIME_CUTOFFS, SEGMENT_CUTOFFS)

In [23]:
valid_stats.head()

Unnamed: 0,calitp_itp_id,total_trips,n_trips,trip_cutoff,segment_cutoff,cutoff,pct_usable_trips
0,4,5206,5190,5,0.1,5+ min & 10.0%+ segments,1.0
1,30,44,44,5,0.1,5+ min & 10.0%+ segments,1.0
2,45,91,90,5,0.1,5+ min & 10.0%+ segments,0.99
3,75,112,110,5,0.1,5+ min & 10.0%+ segments,0.98
4,110,278,278,5,0.1,5+ min & 10.0%+ segments,1.0


In [24]:
# Filter out for just Muni
# muni_only = all_operators.loc[all_operators.calitp_itp_id == 282]

In [25]:
# muni_only.head(2)

In [35]:
# muni_test = muni_only[(muni_only["trip_time"] >= 5) & (muni_only["pct_vp_segments"]>=0.75)]

In [58]:
# Filter out for just Muni.
muni_trip_stats = valid_stats.loc[valid_stats.calitp_itp_id == 282].reset_index(
  drop=True)

In [59]:
def summary_stats_chart(df, column_to_sum:str):
    
    table = df[column_to_sum].describe().astype(int).reset_index().rename(columns = {'index':'Statistics'})
    median = {'Statistics': 'Median', column_to_sum: table[column_to_sum].median()}
    table = table.append(median, ignore_index = True)
    
    table['Zero'] = 0
    table['Full Information'] = table['Statistics'] + '-' + table[column_to_sum].astype(str)
    
    title= column_to_sum.replace("_", " ").title()
    
    chart = (
        alt.Chart(table)
            .mark_circle()
            .encode(x=alt.X("Zero:Q", axis=None), 
            y=alt.Y("Full Information", axis=None))
            .properties(title=f"{title} Summary Stats"))
    
    chart = (chart.mark_text(
        align="center",
        baseline="middle",
        dx=5)
        .encode(text="Full Information:N")
           )
    
    chart = chart_size(chart, 500,300)
    
    return chart

In [60]:
def operator_trip_chart(df):
    df = clean_up_columns(df)
    operator = df.head(1).iloc[0]["Calitp Itp Id"]
    # Dropdown menu 1
    dropdown1 = alt.binding_select(
        options=df["Segment Cutoff"].unique().tolist(), name="Segment"
    )
    selection1 = alt.selection_single(fields=["Segment Cutoff"], bind=dropdown1)

    # Dropdown 2
    dropdown2 = alt.binding_select(
        options=df["Trip Cutoff"].unique().tolist(), name="Time"
    )
    selection2 = alt.selection_single(fields=["Trip Cutoff"], bind=dropdown2)

    chart = (
        alt.Chart(df)
        .mark_bar()
        .encode(
            x=alt.X("Pct Usable Trips:Q", sort=alt.SortField(
                 "Pct Usable Trips", order="descending")), 
            y=alt.Y("Cutoff:N",  
                  sort=alt.SortField("Pct Usable Trips", order="descending")),
            color=alt.Color(
                "Cutoff:N", scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS)
            ),
            tooltip=["Cutoff", "Pct Usable Trips", "N Trips"],
        )
        .properties(title=f"Percentage of Usable Trips for Operator {operator}")
        .add_selection(selection1)
        .transform_filter(selection1)
    )

    chart = chart.add_selection(selection2).transform_filter(selection2)
    chart = chart_size(chart, 500, 300)
    return chart

In [61]:
# operator_trip_chart(muni_trip_stats)

In [62]:
def operator_stats(operator:int,time_cutoffs:list, segment_cutoffs: list):
    # Get longest shape
    longest_shape = calculate_longest_shape(operator)
    
    # Get trip diagnositics
    trips = merge_trip_diagnostics_with_total_segments()
    trips = trips.loc[trips.calitp_itp_id == operator].reset_index(drop=True)
    
    # Calculate out bins
    valid_stats = summary_valid_trips_by_cutoff(trips, time_cutoffs, segment_cutoffs)
    
    # Create charts
    length_chart = length_comparison_boxplot(longest_shape)
    trips_chart = operator_trip_chart(valid_stats)
    
    # Get tables
    length_summary_chart = summary_stats_chart(longest_shape, 'Route Length Percentage')
    trips_summary_chart = summary_stats_chart(trips, 'trip_time')
    
    return alt.vconcat(trips_chart,trips_summary_chart, length_chart, length_summary_chart, center=True)

In [63]:
operator_stats(4,TIME_CUTOFFS, SEGMENT_CUTOFFS)

### Statewide

In [64]:
def statewide_threshold():
    trips_routes_shape = merge_trips_routes_longest_shape()
    trip_stats = catalog.trip_stats.read()
    
    m1 = trip_stats.merge(
    trips_routes_shape.drop(columns=["route_length_percentage"]),
    how="inner",
    on=["calitp_itp_id", "route_dir_identifier"])
    
    m1 = m1.assign(
    pct_vp_segments=m1.num_segments_with_vp.divide(m1.total_segments),
    trip_time=((m1.trip_end - m1.trip_start) / np.timedelta64(1, "s")) / 60,
    total_trips=m1.groupby("route_id").trip_id.transform("nunique"))
    
    return m1

In [65]:
statewide = statewide_threshold()

In [66]:
statewide.route_id.nunique()

819

In [67]:
statewide.head()

Unnamed: 0,calitp_itp_id,trip_id,route_dir_identifier,trip_start,trip_end,num_segments_with_vp,route_id,shape_id,longest_shape_id,total_segments,pct_vp_segments,trip_time,total_trips
0,110,t_5284733_b_77443_tn_1,389670377,2022-10-12 07:35:11,2022-10-12 08:02:12,8,978,p_1306149,p_2495,30,0.27,27.02,20
1,110,t_5284733_b_77443_tn_1,389670377,2022-10-12 07:35:11,2022-10-12 08:02:12,8,978,p_1306152,p_2495,30,0.27,27.02,20
2,110,t_5284733_b_77443_tn_1,389670377,2022-10-12 07:35:11,2022-10-12 08:02:12,8,978,p_2495,p_2495,15,0.53,27.02,20
3,110,t_5284733_b_77443_tn_1,389670377,2022-10-12 07:35:11,2022-10-12 08:02:12,8,978,p_2495,p_2495,15,0.53,27.02,20
4,110,t_5284733_b_77443_tn_2,389670377,2022-10-12 08:36:09,2022-10-12 09:01:29,8,978,p_1306149,p_2495,30,0.27,25.33,20


In [68]:
def summary_routes_cut_off(df, time_cutoffs: list, segment_cutoffs: list):
    
    total_unique_routes = df.route_id.nunique()
    
    # Find stats for all operators
    all_operators = pd.DataFrame()
    for t in time_cutoffs:
            valid = (
                df[(df.trip_time >= t)]
                [['route_id']]
                .nunique()
                .reset_index()
                .rename(columns = {0:'Total Routes in Category'})
            )
                
            valid = valid.assign(
                trip_cutoff=t, cutoff=f"{t}+ min"
            )

            all_operators = pd.concat([all_operators, valid], axis=0)
            
    all_operators = all_operators.assign(
        total_routes= total_unique_routes, 
        pct_usable_routes=all_operators["Total Routes in Category"].divide(total_unique_routes),
        calitp_itp_id = "all operators")
 

    return all_operators

In [69]:
statewide_test = summary_routes_cut_off(statewide,  TIME_CUTOFFS, SEGMENT_CUTOFFS)

In [70]:
statewide_test

Unnamed: 0,index,Total Routes in Category,trip_cutoff,cutoff,total_routes,pct_usable_routes,calitp_itp_id
0,route_id,813,5,5+ min,819,0.99,all operators
0,route_id,809,10,10+ min,819,0.99,all operators
0,route_id,803,15,15+ min,819,0.98,all operators


In [73]:
# statewide.groupby("calitp_itp_id").route_id.nunique()

In [81]:
statewide.loc[statewide.calitp_itp_id == 282].route_id.nunique() 

64

In [77]:
operators = pd.DataFrame()

In [79]:
for t in TIME_CUTOFFS:
     for i in statewide.calitp_itp_id.unique().tolist():
        total_routes_one_operator = statewide.loc[statewide.calitp_itp_id == i].route_id.nunique()    
        valid = (
                statewide[statewide.calitp_itp_id == i]
                .groupby(["calitp_itp_id"])
                .route_id.nunique()
                .reset_index()
                .rename(columns={"route_id": "Total Routes in Category"})
            )

        valid = valid.assign(
                trip_cutoff=t, cutoff=f"{t}+ min",
                total_routes = total_routes_one_operator, 
                pct_usable_routes=valid["Total Routes in Category"].divide(total_routes_one_operator))

        operators = pd.concat([operators, valid], axis=0)

In [80]:
operators.sort_values('calitp_itp_id')

Unnamed: 0,calitp_itp_id,Total Routes in Category,trip_cutoff,cutoff,total_routes,pct_usable_routes
4,,,,,127,
30,,,,,3,
45,,,,,3,
75,,,,,6,
110,,,,,16,
126,,,,,16,
127,,,,,18,
135,,,,,11,
148,,,,,14,
159,,,,,9,


In [76]:
(alt.Chart(statewide_test)
        .mark_bar()
        .encode(
            x="pct_usable_routes:Q",
            y="cutoff:N",
            color=alt.Color("cutoff"),
            tooltip=["cutoff", "pct_usable_routes", "Total Routes in Category"],
        )
        .properties(title="% of Usable Routes"))


### Cutoff Recommendations

### Notes
Github
* For each operator, what's the % of RT trip_ids that would remain after those thresholds are used? Make a chart function that takes a single operator. Produce charts for all operators. Is the time or geographic coverage that's driving this excluding of trips? What is a recommended threshold to use?
* For short trips, do they tend to be 50% of the longest route length? 40% 30%? 

Meeting
* Filter out for trips that provide useful information before attaching segments to it. 
* How many shape ID's for that route are usable? 
* What's the typical threshold of the actual length of the route versus the longest length we have on record?
* Example: How many 10 minute unique trip IDs will remain and segments will remain after filtering ones that don't provide insights?
* % of segments that actually show up reflects how much of a trip was recorded in GTFS. 

Already Answered Notes/Questions
* What is the calitp url number? What does 0 or 1 mean? V1, operator has different feeds. 
    * 0 could be primary, 1 is backup. This column will be deleted in V2. 
* Do you think that most shape IDS are going to be less than 100% of the length of the longest shape ID? 
    * Not necessarily, shape ID can be a short version of the trip.
* What’s the difference between direction ID and route dir identifier? What does the 0 and 1 mean in direction ID?
    * We don't know where the bus is going, so just do 0 and 1.
    * Route dir identifier: captures route info and direction it is going to capture all the trips. Helps with groupby. 
    * We don't want to stick with trip id, we need to get to route level. 
    * Don't want to lose info on the direction. 
    * Have to distinguish direction or else it'll look like the bus is going backwards when plotting.
    * RT data comes with direction id and can get which direction it ran in from schedule data. 
    * Attach route, join coordinate data to segments. 
    * Use segments and average out trips that occurred on that segment. 
* Ask about graph on Slack. 
* Should I use this `get_routelines` from `A1_vehicle_positions`. 
    * Just read it directly from GCS, don't need buffer.
* Why would the same route ID for the other direction have more segments? 
   * Can have a layover. 
   * A segment must be 1000 meters or less.
* The `route_dir_identifier` is used for segments to cut segments
for both directions the route runs.

* How come there are so many different timestamps within a 30 second increments of each either within the same segment? GTFS pings every 30 seconds.