## RT trip diagnostics: thresholds for usable trips 
### To Do
* Add operator name to itp_id.

In [1]:
import altair as alt
import dask.dataframe as dd
import dask_geopandas as dg
import geopandas as gpd
import numpy as np
import pandas as pd
from calitp.sql import to_snakecase

# from shared_utils import calitp_color_palette as cp
# from shared_utils import geography_utils, styleguide, utils



In [2]:
# Save files to GCS
from calitp.storage import get_fs

fs = get_fs()

In [3]:
# Record start and end time
import datetime

from loguru import logger

In [4]:
import intake

catalog = intake.open_catalog("./catalog_threshold.yml")

In [5]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

### Load Files

In [6]:
GCS_DASK_PATH = "gs://calitp-analytics-data/data-analyses/dask_test/"
GCS_RT_PATH = "gs://calitp-analytics-data/data-analyses/rt_delay/"

In [7]:
analysis_date = "2022-10-12"

In [8]:
agency = 282

In [9]:
# operator = pd.read_parquet(
#    f"{GCS_DASK_PATH}vp_sjoin/vp_segment_{agency}_{analysis_date}.parquet"
# )

### Task 1
* Using GTFS schedule data, by route_id-shape_id, calculate the route_length of each shape_id as a proportion of the longest shape_id. 
* For <b>each route_id</b>, what is the shortest shape_id length as a proportion of the longest shape_id's length? If it's 100%, then all shape_ids are of equal length for that route. If it's 50%, there is a short trip that runs only 50% of the length and turns around.

<b>How</b>
* Need table `trips` from compile cached views -> shape ID and route ID and direction ID -> merge in segments crosswalk with route direction identifier 
* Shapes table -> attach route dir identifier
* Merge in the longest shape line on route and direction, then take each shape's length as a fraction of the longest shape's length.

In [10]:
def clean_trips():
    """Return the unique operator / route / direction / shape combinations.

    Reads the `trips` entry of the module-level `catalog`, keeps only the
    four identifier columns, drops duplicate rows and renumbers the index
    from 0.
    """
    id_columns = [
        "calitp_itp_id",
        "route_id",
        "direction_id",
        "shape_id",
    ]

    unique_trips = (
        catalog.trips.read()[id_columns].drop_duplicates().reset_index(drop=True)
    )

    return unique_trips

In [11]:
def clean_routelines():
    """Return de-duplicated route lines with their geometry length attached.

    Reads the `route_lines` entry of the module-level `catalog`. The
    `calitp_url_number` column is removed *before* de-duplicating, so rows
    that differ only in that column collapse into one. The length of each
    geometry is stored in `actual_route_length`.
    """
    routelines = (
        catalog.route_lines.read()
        .drop(columns=["calitp_url_number"])
        .drop_duplicates()
        .reset_index(drop=True)
        # NOTE(review): length is in the units of the data's CRS - presumably
        # a projected CRS; confirm before comparing against other lengths.
        .assign(actual_route_length=lambda lines: lines.geometry.length)
    )

    return routelines

In [12]:
def clean_longest_shape():
    """Read the `longest_shape` catalog entry with its length column renamed.

    `route_length` becomes `longest_route_length` so it cannot be confused
    with the per-shape length once the tables are merged.
    """
    return catalog.longest_shape.read().rename(
        columns={"route_length": "longest_route_length"}
    )

In [13]:
def merge_trips_routes_longest_shape():
    """Compare every shape's length with the longest shape of its route/direction.

    Joins (all inner joins):
      1. unique trips        -> crosswalk      on operator, route, direction
      2. result              -> route lines    on operator, shape
      3. result              -> longest shape  on operator, direction, route,
                                               route_dir_identifier

    Returns one row per route_id / calitp_itp_id / route_dir_identifier /
    shape_id / longest_shape_id / route_length_percentage with
    `total_segments`, the count of non-null `segment_sequence` rows in that
    group. `route_length_percentage` is truncated to a whole number.
    """
    trips = clean_trips()
    crosswalk = catalog.crosswalk.read()
    routelines = clean_routelines()
    longest_shape = clean_longest_shape()

    trips_with_route_dir = trips.merge(
        crosswalk, how="inner", on=["calitp_itp_id", "route_id", "direction_id"]
    )
    trips_with_length = trips_with_route_dir.merge(
        routelines, how="inner", on=["calitp_itp_id", "shape_id"]
    )
    # The longest shape's own geometry is not needed, only its length.
    shapes_vs_longest = trips_with_length.merge(
        longest_shape.drop(columns=["geometry"]),
        how="inner",
        on=["calitp_itp_id", "direction_id", "route_id", "route_dir_identifier"],
    )

    # Shape length as a percentage of the longest shape; astype(int) truncates.
    shapes_vs_longest = shapes_vs_longest.assign(
        route_length_percentage=lambda frame: (
            (frame["actual_route_length"] / frame["longest_route_length"]) * 100
        ).astype(int)
    )

    group_keys = [
        "route_id",
        "calitp_itp_id",
        "route_dir_identifier",
        "shape_id",
        "longest_shape_id",
        "route_length_percentage",
    ]

    # Count the segment rows that were attached to each shape.
    segment_counts = (
        shapes_vs_longest.groupby(group_keys)
        .agg(total_segments=("segment_sequence", "count"))
        .reset_index()
    )

    return segment_counts

In [14]:
trips_routes_shape = merge_trips_routes_longest_shape()

In [15]:
trips_routes_shape.shape

(8231, 7)

In [16]:
# One row per operator / route / shape: route_dir_identifier is collapsed by
# keeping the largest route_length_percentage seen for that shape.
agg1 = (
    trips_routes_shape.groupby(["calitp_itp_id", "route_id", "shape_id"])
    .agg(route_length_percentage=("route_length_percentage", "max"))
    .reset_index()
)

In [17]:
agg1.shape

(7832, 4)

In [18]:
agg1.head()

Unnamed: 0,calitp_itp_id,route_id,shape_id,route_length_percentage
0,4,10,shp-10-09,100
1,4,10,shp-10-10,100
2,4,12,shp-12-14,100
3,4,12,shp-12-56,100
4,4,14,shp-14-14,21


In [19]:
dropdown = agg1.calitp_itp_id.unique().tolist()

In [20]:
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [21]:
# Drop-down menu listing every operator present in agg1
input_dropdown = alt.binding_select(options=dropdown, name='Operator')

# The column tied to the drop-down menu. The selection starts on the operator
# configured in `agency`, so the first render shows a single operator.
selection = alt.selection_single(
    fields=['calitp_itp_id'],
    bind=input_dropdown,
    init={'calitp_itp_id': agency},
)

# Chart the full table and let the selection do the filtering. Previously the
# data was pre-filtered to operator 282, so picking any other operator in the
# drop-down produced an empty chart.
chart1 = (
    alt.Chart(agg1)
    .mark_bar()
    .encode(
        x='route_length_percentage:Q',
        y='shape_id:N',
        color=alt.Color('route_id:N', legend=None),
        tooltip=['route_id', 'shape_id', 'calitp_itp_id', 'route_length_percentage'],
    )
    .properties(title="test")
    .add_selection(selection)
    .transform_filter(selection)
)


In [22]:
# chart1

### Cut Off by Trips 

In [23]:
def merge_trip_diagnostics_with_total_segments(analysis_date: str = "2022-10-12"):
    """Attach segment totals and derived trip metrics to the RT trip diagnostics.

    Parameters
    ----------
    analysis_date : str, default "2022-10-12"
        Date suffix of the `trip_diagnostics_<date>.parquet` file to read
        from GCS. The default reproduces the previously hard-coded file.

    Returns
    -------
    pandas.DataFrame
        The trip diagnostics rows that match a (calitp_itp_id,
        route_dir_identifier) in `longest_shape`, plus:

        * total_segments  - distinct `segment_sequence` values for that
          operator / route_dir_identifier
        * pct_vp_segments - num_segments_with_vp / total_segments
        * trip_time       - trip_end minus trip_start in whole minutes
          (truncated)
        * total_trips     - distinct trip_ids for the operator

    Raises
    ------
    pandas.errors.MergeError
        If (calitp_itp_id, route_dir_identifier) is not unique on the
        segment side (`validate="m:1"`).
    """
    trip_diagnostics = pd.read_parquet(
        "gs://calitp-analytics-data/data-analyses/rt_segment_speeds/"
        f"trip_diagnostics_{analysis_date}.parquet",
    )

    segments = catalog.longest_shape.read()

    # Number of distinct segments per operator / route direction.
    total_segments_by_route_dir = (
        segments.groupby(["calitp_itp_id", "route_dir_identifier"])
        .segment_sequence.nunique()
        .reset_index()
        .rename(columns={"segment_sequence": "total_segments"})
    )

    df = pd.merge(
        trip_diagnostics,
        total_segments_by_route_dir,
        on=["calitp_itp_id", "route_dir_identifier"],
        how="inner",
        validate="m:1",
    )

    df = df.assign(
        pct_vp_segments=df.num_segments_with_vp.divide(df.total_segments),
        # Seconds -> minutes; astype(int) truncates the fractional minute.
        trip_time=((df.trip_end - df.trip_start) / np.timedelta64(1, "s") / 60).astype(
            int
        ),
        total_trips=df.groupby("calitp_itp_id").trip_id.transform("nunique"),
    )

    return df

In [26]:
all_operators = merge_trip_diagnostics_with_total_segments()

In [27]:
operator_282_trips = all_operators.loc[
    all_operators.calitp_itp_id == 282
].reset_index(drop=True)

In [28]:
operator_282_trips.trip_id.nunique()

8578

In [29]:
operator_282_trips.head()

Unnamed: 0,calitp_itp_id,trip_id,route_dir_identifier,trip_start,trip_end,num_segments_with_vp,total_segments,pct_vp_segments,trip_time,total_trips
0,282,11108746,4013410901,2022-10-12 15:05:36,2022-10-12 15:47:45,9,10,0.9,42,8578
1,282,11108747,4013410901,2022-10-12 14:57:24,2022-10-12 15:41:44,9,10,0.9,44,8578
2,282,11108748,4013410901,2022-10-12 14:49:23,2022-10-12 15:31:16,10,10,1.0,41,8578
3,282,11108749,4013410901,2022-10-12 14:45:02,2022-10-12 15:27:16,10,10,1.0,42,8578
4,282,11108750,4013410901,2022-10-12 14:36:35,2022-10-12 15:23:40,9,10,0.9,47,8578


In [30]:
def summary_valid_trips_by_cutoff(df, time_cutoffs: list, segment_cutoffs: list):
    """Count, per operator, the trips that survive each time/segment cutoff pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns `calitp_itp_id`, `trip_id`, `trip_time`,
        `pct_vp_segments` and `total_trips`.
    time_cutoffs : list
        Minimum `trip_time` values to test (inclusive).
    segment_cutoffs : list
        Minimum `pct_vp_segments` values to test (inclusive), as fractions.

    Returns
    -------
    pandas.DataFrame
        One row per operator and cutoff pair with `n_trips` (distinct
        trip_ids passing both cutoffs), `trip_cutoff`, `segment_cutoff`, a
        display label `cutoff`, and `pct_usable_trips` = n_trips /
        total_trips. An operator with no trip passing a given pair has no
        row for that pair. If either cutoff list is empty, an empty frame
        with the same columns is returned.
    """
    output_columns = [
        "calitp_itp_id",
        "total_trips",
        "n_trips",
        "trip_cutoff",
        "segment_cutoff",
        "cutoff",
        "pct_usable_trips",
    ]

    # Collect the per-cutoff frames and concatenate once at the end instead of
    # growing a DataFrame inside the loop.
    frames = []

    for t in time_cutoffs:
        for s in segment_cutoffs:
            valid = (
                df[(df.trip_time >= t) & (df.pct_vp_segments >= s)]
                .groupby(["calitp_itp_id", "total_trips"])
                .trip_id.nunique()
                .reset_index()
                .rename(columns={"trip_id": "n_trips"})
            )

            # round() keeps labels such as "50.0%" unchanged while preventing
            # float noise like "55.00000000000001%".
            valid = valid.assign(
                trip_cutoff=t,
                segment_cutoff=s,
                cutoff=f"{t}+ min & {round(s * 100, 1)}%+ segments",
            )

            frames.append(valid)

    if not frames:
        return pd.DataFrame(columns=output_columns)

    final = pd.concat(frames, axis=0)

    final = final.assign(pct_usable_trips=final.n_trips.divide(final.total_trips))

    return final

In [31]:
TIME_CUTOFFS = [5, 10, 15]
SEGMENT_CUTOFFS = [0.1, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.75]

In [32]:
valid_stats = summary_valid_trips_by_cutoff(
    all_operators, TIME_CUTOFFS, SEGMENT_CUTOFFS
)

In [33]:
valid_stats.columns

Index(['calitp_itp_id', 'total_trips', 'n_trips', 'trip_cutoff',
       'segment_cutoff', 'cutoff', 'pct_usable_trips'],
      dtype='object')

In [34]:
def operator_tripchart(df, chart_title:str):
    """Bar chart of `pct_usable_trips` per cutoff label with three drop-down filters.

    `df` needs the columns `calitp_itp_id`, `segment_cutoff`, `trip_cutoff`,
    `cutoff`, `pct_usable_trips` and `n_trips`. The drop-downs (Operator,
    Segment, Time) are populated from the unique values of the first three
    columns, and each one filters the chart on its column.
    """
    # (column the drop-down filters on, label shown next to the drop-down)
    dropdown_specs = [
        ("calitp_itp_id", "Operator"),
        ("segment_cutoff", "Segment"),
        ("trip_cutoff", "Time"),
    ]

    selections = []
    for column, label in dropdown_specs:
        menu = alt.binding_select(options=df[column].unique().tolist(), name=label)
        selections.append(alt.selection_single(fields=[column], bind=menu))

    chart = (
        alt.Chart(df)
        .mark_bar()
        .encode(
            x="pct_usable_trips:Q",
            y="cutoff:N",
            color=alt.Color("pct_usable_trips"),
            tooltip=["cutoff", "calitp_itp_id", "pct_usable_trips", "n_trips"],
        )
        .properties(title=chart_title)
    )

    # Register each drop-down and filter on it, in the order declared above.
    for picked in selections:
        chart = chart.add_selection(picked).transform_filter(picked)

    return chart

In [35]:
chart1 = operator_tripchart(valid_stats, '% Usable Trips by Operator')

In [36]:
chart1

In [37]:
# Find the total number of segments in the specific operator file
# vs. what was recorded in `longest_shape`
# NOTE(review): `m2` is not created by any cell above, so this cell raises
# NameError on a fresh Restart & Run All (see the recorded output below).
# The cell that built `m2` appears to have been removed - restore it or
# delete the `m2` cells.
m2["segment_proportion"] = ((m2.number_of_segments / m2.segment_sequence) * 100).astype(
    "int64"
)

NameError: name 'm2' is not defined

In [None]:
m2.sample()

In [None]:
m2.segment_proportion.value_counts().head()

In [None]:
m2.loc[m2.route_id == "U"]

In [None]:
m2.loc[m2.route_dir_identifier == 4105021223].shape_id.nunique()

In [None]:
m2.loc[m2.route_dir_identifier == 4105021223].longest_shape_id.nunique()

In [None]:
m2.loc[m2.route_dir_identifier == 4105021223].trip_id.nunique()

In [None]:
m2.loc[m2.route_dir_identifier == 4105021223].sample(5)

In [None]:
m2.loc[m2.trip_id == "6566020"]

In [None]:
operator_4.loc[operator_4.trip_id == "6566020"].head()

In [None]:
operator_4_metrics.loc[operator_4_metrics.trip_id == "6566020"]

In [None]:
# Can't find 1244740981 in this list.
# operator_4.route_dir_identifier.unique().tolist()

In [None]:
# Total route ids using longest_shape/trips/routelines.
routelines_final.loc[routelines_final.calitp_itp_id == 4][["route_id"]].nunique()

In [None]:
m2.route_id.nunique()

In [None]:
merged_routeid = set(m2.route_id.unique().tolist())

In [None]:
routelines_routeid = set(routelines_final.route_id.unique().tolist())

In [None]:
merged_routeid - routelines_routeid

In [None]:
# routelines_routeid - merged_routeid

### Cut Off by Routes 
* Test with Muni

In [38]:
def operator_routes(df, chart_title:str):
    """Bar chart of `usable_trip_percentage` per cutoff label with four drop-down filters.

    `df` needs the columns `route_id`, `segment_cutoff`, `trip_cutoff`,
    `calitp_itp_id`, `cutoff`, `usable_trip_percentage` and `n_trips`. The
    drop-downs (Route, Segment, Time, Operator) are populated from the
    unique values of the first four columns, and each one filters the chart
    on its column.
    """
    # (column the drop-down filters on, label shown next to the drop-down)
    dropdown_specs = [
        ("route_id", "Route"),
        ("segment_cutoff", "Segment"),
        ("trip_cutoff", "Time"),
        ("calitp_itp_id", "Operator"),
    ]

    selections = []
    for column, label in dropdown_specs:
        menu = alt.binding_select(options=df[column].unique().tolist(), name=label)
        selections.append(alt.selection_single(fields=[column], bind=menu))

    chart = (
        alt.Chart(df)
        .mark_bar()
        .encode(
            x="usable_trip_percentage:Q",
            y="cutoff:N",
            color=alt.Color("usable_trip_percentage"),
            tooltip=["cutoff", "route_id", "usable_trip_percentage", "n_trips"],
        )
        .properties(title=chart_title)
    )

    # Register each drop-down and filter on it, in the order declared above.
    for picked in selections:
        chart = chart.add_selection(picked).transform_filter(picked)

    return chart

In [39]:
trip_diagnostics = pd.read_parquet(
    "gs://calitp-analytics-data/data-analyses/rt_segment_speeds/trip_diagnostics_2022-10-12.parquet",
)

In [40]:
#operator_282 = trip_diagnostics.loc[trip_diagnostics.calitp_itp_id == 282].reset_index(
#    drop=True
#)

In [41]:
#operator_282.shape

In [42]:
#operator_282.head()

In [43]:
# Attach route / shape / total_segments information to every RT trip via its
# operator and route_dir_identifier. route_length_percentage is not needed here.
m1 = pd.merge(
    trip_diagnostics,
    trips_routes_shape.drop(columns=["route_length_percentage"]),
    on=["calitp_itp_id", "route_dir_identifier"],
    how="inner",
)

In [44]:
m1.shape, trips_routes_shape.shape

((173714, 10), (8231, 7))

In [45]:
# total_trips is the number of distinct trips per operator-route. Grouping on
# route_id alone pooled trips from different operators that reuse the same
# route_id, inflating the denominator of usable_trip_percentage, which is
# later computed per (calitp_itp_id, route_id).
m1 = m1.assign(
    pct_vp_segments=m1.num_segments_with_vp.divide(m1.total_segments),
    # Seconds -> minutes; astype(int) truncates the fractional minute.
    trip_time=((m1.trip_end - m1.trip_start) / np.timedelta64(1, "s") / 60).astype(int),
    total_trips=m1.groupby(["calitp_itp_id", "route_id"]).trip_id.transform("nunique"),
)

In [46]:
m1.route_id.nunique()

819

In [47]:
test = pd.DataFrame()

In [48]:
# Build one summary frame per (time, segment) cutoff pair and combine them
# once. `test` is rebuilt from scratch here, so re-running this cell does not
# append duplicate rows.
cutoff_frames = []

for t in TIME_CUTOFFS:
    for s in SEGMENT_CUTOFFS:
        valid = (
            m1[(m1.trip_time >= t) & (m1.pct_vp_segments >= s)]
            .groupby(["calitp_itp_id", "route_id", "total_trips"])
            .trip_id.nunique()
            .reset_index()
            .rename(columns={"trip_id": "n_trips"})
        )

        valid = valid.assign(
            trip_cutoff=t, segment_cutoff=s, cutoff=f"{t}+ min & {s*100}%+ segments"
        )

        cutoff_frames.append(valid)

test = pd.concat(cutoff_frames, axis=0)

# Computed once after the loop rather than on every iteration; astype(int)
# truncates to a whole percent.
test["usable_trip_percentage"] = ((test.n_trips / test.total_trips) * 100).astype(int)

In [49]:
test.shape

(31653, 8)

In [50]:
test.sample()

Unnamed: 0,calitp_itp_id,route_id,total_trips,n_trips,trip_cutoff,segment_cutoff,cutoff,usable_trip_percentage
285,182,212-13157,146,145,10,0.5,10+ min & 50.0%+ segments,99


In [51]:
# Check that results below make sense.
# route_38[(route_38['pct_vp_segments'] > 0.70 ) & (route_38['trip_time'] > 5)][['trip_id']].nunique()

In [52]:
test_chart = operator_routes(test, 'Muni by Route')

In [55]:
# test_chart

### Notes
Github
* For each operator, what's the % of RT trip_ids that would remain after those thresholds are used? Make a chart function that takes a single operator. Produce charts for all operators. Is it the time coverage or the geographic coverage that's driving the exclusion of trips? What is a recommended threshold to use?
* For short trips, do they tend to be 50% of the longest route length? 40% 30%? 

Meeting
* Filter out for trips that provide useful information before attaching segments to it. 
* How many shape IDs for that route are usable? 
* What's the typical threshold of the actual length of the route versus the longest length we have on record?
* Example: How many 10 minute unique trip IDs will remain and segments will remain after filtering ones that don't provide insights?
* % of segments that actually show up reflects how much of a trip was recorded in GTFS. 

Already Answered Notes/Questions
* What is the calitp url number? What does 0 or 1 mean? V1, operator has different feeds. 
    * 0 could be primary, 1 is backup. This column will be deleted in V2. 
* Do you think that most shape IDS are going to be less than 100% of the length of the longest shape ID? 
    * Not necessarily, shape ID can be a short version of the trip.
* What’s the difference between direction ID and route dir identifier? What does the 0 and 1 mean in direction ID?
    * We don't know where the bus is going, so just do 0 and 1.
    * Route dir identifier: captures route info and direction it is going to capture all the trips. Helps with groupby. 
    * We don't want to stick with trip id, we need to get to route level. 
    * Don't want to lose info on the direction. 
    * Have to distinguish direction or else it'll look like the bus is going backwards when plotting.
    * RT data comes with direction id and can get which direction it ran in from schedule data. 
    * Attach route, join coordinate data to segments. 
    * Use segments and average out trips that occurred on that segment. 
* Ask about graph on Slack. 
* Should I use `get_routelines` from `A1_vehicle_positions`?
    * Just read it directly from GCS, don't need buffer.
* Why would the same route ID for the other direction have more segments? 
   * Can have a layover. 
   * A segment must be 1000 meters or less.
* The `route_dir_identifier` is used to cut segments for both directions the route runs.

* How come there are so many different timestamps within 30 seconds of each other within the same segment? GTFS pings every 30 seconds.

In [None]:
len(m2)

In [None]:
(m2.proportion_route_length.value_counts() / len(m2) * 100).head(15)

In [None]:
(m2.segment_proportion.value_counts() / len(m2) * 100).head(15)

In [None]:
m2.minutes_elapsed.describe()

In [None]:
p25_time = m2.minutes_elapsed.quantile(0.25).astype(int)
p50_time = m2.minutes_elapsed.quantile(0.50).astype(int)
p75_time = m2.minutes_elapsed.quantile(0.75).astype(int)

In [None]:
def trip_duration(row):
    """Label a trip as short, medium or long from its `minutes_elapsed`.

    Thresholds come from the module-level `p25_time` and `p75_time`:

    * minutes_elapsed <= p25_time             -> "Short Trip <= {p25_time} min"
    * p25_time < minutes_elapsed <= p75_time  -> "Medium Trip <= {p75_time} min"
    * anything else                           -> "Long Trip > {p75_time} min"

    Trips with zero or negative elapsed minutes are treated as short; they
    used to fall through to the "Long Trip" label. A missing (NaN) value
    fails both comparisons and still ends up in the last branch.
    """
    minutes = row.minutes_elapsed

    if minutes <= p25_time:
        return f"Short Trip <= {p25_time} min"
    elif minutes <= p75_time:
        return f"Medium Trip <= {p75_time} min"
    else:
        return f"Long Trip > {p75_time} min"

In [None]:
m2["trip_duration_categories"] = m2.apply(lambda x: trip_duration(x), axis=1)

In [None]:
m2.trip_duration_categories.value_counts()

In [None]:
test = m2.loc[m2.segment_proportion < 100][["segment_proportion"]]

In [None]:
test.describe()

In [None]:
p25_length = test.segment_proportion.quantile(0.25).astype(int)
p75_length = test.segment_proportion.quantile(0.75).astype(int)

In [None]:
def shape_id_comparison(row):
    """Bucket a row by its `segment_proportion` against the p25/p75 thresholds.

    Thresholds come from the module-level `p25_length` and `p75_length`:

    * segment_proportion <= p25_length               -> " <={p25_length}% of segments appear"
    * p25_length < segment_proportion <= p75_length  -> "<= {p75_length}% of segments appear"
    * anything else                                  -> ">= {p75_length}% of segments appear"

    A proportion of 0 (or below) now lands in the lowest bucket; it used to
    fall through to the highest one. The label text is unchanged because
    `usable` matches the lowest-bucket label exactly, leading space included.
    """
    proportion = row.segment_proportion

    if proportion <= p25_length:
        return f" <={p25_length}% of segments appear"
    elif proportion <= p75_length:
        return f"<= {p75_length}% of segments appear"
    else:
        return f">= {p75_length}% of segments appear"

In [None]:
m2["shapeid_vs_longest_shapeid_length"] = m2.apply(
    lambda x: shape_id_comparison(x), axis=1
)

In [None]:
m2.shapeid_vs_longest_shapeid_length.value_counts()

In [None]:
m2.loc[m2.trip_id == "6566020"]

In [None]:
len(m2), len(m2.drop_duplicates())

In [None]:
m2.loc[m2.route_dir_identifier == 2184919314].minutes_elapsed.describe()

In [None]:
m2.loc[m2.route_dir_identifier == 2184919314][
    ["trip_id", "minutes_elapsed", "trip_duration_categories"]
].head(10)

In [None]:
def usable(row):
    """Return "Unusable" for rows in the lowest segment bucket, else "Usable".

    The lowest bucket is identified by an exact match on the label text
    built from the module-level `p25_length` (leading space included).
    """
    lowest_bucket_label = f" <={p25_length}% of segments appear"

    return (
        "Unusable"
        if row.shapeid_vs_longest_shapeid_length == lowest_bucket_label
        else "Usable"
    )

In [None]:
m2["usable_y_n"] = m2.apply(lambda x: usable(x), axis=1)