## RT trip diagnostics: thresholds for usable trips 

In [223]:
import dask.dataframe as dd
import dask_geopandas as dg
import geopandas as gpd
import pandas as pd
from calitp.sql import to_snakecase
from shared_utils import geography_utils, utils

In [224]:
# Save files to GCS
from calitp.storage import get_fs

fs = get_fs()

In [225]:
# Record start and end time
import datetime

from loguru import logger

In [226]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

### Look at files

In [227]:
GCS_DASK_PATH = "gs://calitp-analytics-data/data-analyses/dask_test/"

In [228]:
GCS_RT_PATH = "gs://calitp-analytics-data/data-analyses/rt_delay/"

In [229]:
analysis_date = "2022-10-12"

In [230]:
# Read in route lines
routelines = gpd.read_parquet(
    f"{GCS_RT_PATH}compiled_cached_views/routelines_{analysis_date}.parquet"
)

In [231]:
len(routelines), routelines.shape_id.nunique(), routelines.calitp_itp_id.nunique()

(9430, 6353, 175)

In [232]:
routelines.sample().drop(columns=["geometry"])

Unnamed: 0,calitp_itp_id,calitp_url_number,shape_id
7903,310,1,10


In [233]:
# Read in longest_shape of each route
longest_shape = gpd.read_parquet(f"{GCS_DASK_PATH}longest_shape_segments.parquet")

In [234]:
len(
    longest_shape
), longest_shape.longest_shape_id.nunique(), longest_shape.calitp_itp_id.nunique()

(126896, 3960, 175)

In [235]:
longest_shape.sample().drop(columns=["geometry", "geometry_arrowized"])

Unnamed: 0,calitp_itp_id,calitp_url_number,route_id,direction_id,longest_shape_id,route_dir_identifier,route_length,segment_sequence
88806,294,0,101,0,105886,1497132652,41734.59,8


In [236]:
# Says missing geospatial data
segments_crosswalks = pd.read_parquet(
    f"{GCS_DASK_PATH}segments_route_direction_crosswalk.parquet"
)

In [237]:
len(segments_crosswalks)

5150

In [238]:
segments_crosswalks.sample()

Unnamed: 0,calitp_itp_id,route_id,direction_id,route_dir_identifier
2105,323,Ventura County Line,1,1840695654


In [239]:
# Read in one segment for ONE itp id first
# when using gpd.read_parquert() says
# Missing geo metadata in Parquet/Feather file.
# Use pandas.read_parquet/read_feather() instead.
segment_148 = pd.read_parquet(
    f"{GCS_DASK_PATH}vp_sjoin/vp_segment_148_{analysis_date}.parquet"
)

In [240]:
# segment_148 = gpd.GeoDataFrame(
#    segment_148, geometry=gpd.points_from_xy(segment_148.lon, segment_148.lat))

In [241]:
len(segment_148)

22692

In [242]:
segment_148 = segment_148.sort_values(["calitp_itp_id", "trip_id", "segment_sequence"])

In [243]:
segment_148.sample()

Unnamed: 0,calitp_itp_id,calitp_url_number,vehicle_timestamp,trip_id,route_dir_identifier,segment_sequence,lon,lat
263,148,0,2022-10-12 11:33:50,930,1342713973,0,30415.83,-187927.38


### Task 1
* Using GTFS schedule data, by route_id-shape_id, calculate the route_length of each shape_id as a proportion of the longest shape_id. 
* For each route_id, what's the shortest shape_id length, in proportion to the longest shape_id's length. if it's 100%, then all shape_ids are equal length for that route. if it's 50%, there's a short trip that exists that only runs 50% of the length and turns around.

Notes
* Is it correct to join on `shape_id` and `longest_shape_id`.

In [244]:
routelines.crs == longest_shape.crs

True

In [245]:
# Calculate length of geometry
routelines = routelines.assign(
    route_length=(routelines.geometry.to_crs(geography_utils.CA_StatePlane).length)
)

In [246]:
# routelines.drop(columns = ["geometry"]).sort_values(["calitp_itp_id","shape_id"]).head(200)

In [247]:
# Dissolve so only one row for each calitp_id/shape_id.
routelines_diss = routelines.dissolve(
    by=[
        "calitp_itp_id",
        "shape_id",
    ],
    aggfunc={
        "route_length": "sum",
    },
).reset_index()

In [248]:
routelines_diss.shape_id.nunique(), len(routelines_diss)

(6353, 7685)

In [249]:
# Dissolve so only one row for each calitp_id/shape_id/route_id
longest_shape_diss = longest_shape.dissolve(
    by=["calitp_itp_id", "longest_shape_id", "route_id", "route_dir_identifier"],
    aggfunc={
        "route_length": "sum",
    },
).reset_index()

In [250]:
# longest_shape.drop(columns = ["geometry", "geometry_arrowized"]).sort_values(["calitp_itp_id","route_id"]).head(200)

In [251]:
len(longest_shape_diss)

5150

In [252]:
# routelines_diss.drop(columns = ["geometry"]).sort_values("calitp_itp_id").head(100)

In [253]:
# Do an inner merge.
m1 = routelines_diss.merge(
    longest_shape_diss,
    how="inner",
    left_on=["calitp_itp_id", "shape_id"],
    right_on=["calitp_itp_id", "longest_shape_id"],
    suffixes=("_routelines", "_longest_line"),
)

In [254]:
len(m1)

5150

In [255]:
# Make sure this is a gdf
m1 = m1.set_geometry("geometry_routelines")

In [256]:
# Calculate out proportion of route length against longest.
m1["proportion_route_length"] = (
    m1["route_length_routelines"] / m1["route_length_longest_line"]
) * 100

In [257]:
m1.proportion_route_length.describe()

count   5150.00
mean      32.88
std       32.66
min        1.06
25%       12.62
50%       23.43
75%       41.06
max      657.58
Name: proportion_route_length, dtype: float64

In [258]:
# m1.drop(columns = ['geometry_routelines','geometry_longest_line']).sort_values(["calitp_itp_id", "route_id", "proportion_route_length"]).head(100)

### Task 2
* Testing with 148 Kings County Area Public Transit Agency
* Calculate time of trips?


Questions 
* How to define short trips - between segments? Or the entire trip as a whole? 
    * If trip as a whole, then just subtract the last segment minus 0?
* Are we trying to exclude short trips?

In [259]:
len(segment_148)

22692

In [260]:
segment_148_min = (
    segment_148.groupby(["calitp_itp_id", "trip_id", "route_dir_identifier",])
    .agg({"vehicle_timestamp": "min"})
    .rename(columns={"vehicle_timestamp": "min_time"})
    .reset_index()
)

In [261]:
segment_148_max = (
    segment_148.groupby(["calitp_itp_id", "trip_id", "route_dir_identifier",])
    .agg({"vehicle_timestamp": "max"})
    .rename(columns={"vehicle_timestamp": "max_time"})
    .reset_index()
)

In [262]:
segment_148_m = segment_148_max.merge(
    segment_148_min,
    how="inner",
    on=["calitp_itp_id", "trip_id", "route_dir_identifier"],
)

In [263]:
# https://stackoverflow.com/questions/51491724/calculate-difference-of-2-dates-in-minutes-in-pandas
segment_148_m['minutes_elapsed'] = (segment_148_m.max_time - segment_148_m.min_time).dt.total_seconds() / 60

In [264]:
segment_148_m.minutes_elapsed.describe()

count   232.00
mean     40.54
std      27.08
min      21.00
25%      28.48
50%      30.00
75%      32.50
max     189.50
Name: minutes_elapsed, dtype: float64

In [265]:
segment_148_m.route_dir_identifier.nunique()

14

In [266]:
segment_148_m.minutes_elapsed.describe()

count   232.00
mean     40.54
std      27.08
min      21.00
25%      28.48
50%      30.00
75%      32.50
max     189.50
Name: minutes_elapsed, dtype: float64

In [267]:
# Maybe anything below 25% percentile is a short trip to throw away
p25 = segment_148_m.minutes_elapsed.quantile(0.25).astype(float)

In [268]:
segment_148_filtered = (segment_148_m.loc[segment_148_m.minutes_elapsed > p25]).reset_index(drop = True)

In [269]:
f"{len(segment_148_filtered)/len(segment_148_m)}% of rows are left"

'0.7327586206896551% of rows are left'

In [270]:
segment_148_filtered.shape

(170, 6)

In [271]:
segment_148_filtered.columns

Index(['calitp_itp_id', 'trip_id', 'route_dir_identifier', 'max_time',
       'min_time', 'minutes_elapsed'],
      dtype='object')

In [272]:
m1.columns

Index(['calitp_itp_id', 'shape_id', 'geometry_routelines',
       'route_length_routelines', 'longest_shape_id', 'route_id',
       'route_dir_identifier', 'geometry_longest_line',
       'route_length_longest_line', 'proportion_route_length'],
      dtype='object')

In [273]:
segments_crosswalks.columns

Index(['calitp_itp_id', 'route_id', 'direction_id', 'route_dir_identifier'], dtype='object')

In [275]:
segment_148_filtered = segment_148_filtered.merge(
    segments_crosswalks,
    how="inner",
    on=['calitp_itp_id','route_dir_identifier'],
)

In [None]:
# segment_148_filtered

In [277]:
m2 = segment_148_filtered.merge(
    m1,
    how="inner",
    on=['calitp_itp_id','route_dir_identifier', "route_id"],
    indicator = True
)

In [None]:
# m2.drop(columns = ['geometry_routelines','geometry_longest_line'])

### Questions 
* For each operator, what's the % of RT trip_ids that would remain after those thresholds are used? Make a chart function that takes a single operator. Produce charts for all operators. Is the time or geographic coverage that's driving this excluding of trips? What is a recommended threshold to use?
* For short trips, do they tend to be 50% of the longest route length? 40% 30%? Have this handy to inform question 1.
