## RT trip diagnostics: thresholds for usable trips 

In [1]:
import dask.dataframe as dd
import dask_geopandas as dg
import geopandas as gpd
import pandas as pd
from calitp.sql import to_snakecase
from shared_utils import geography_utils, utils



In [2]:
# Save files to GCS
from calitp.storage import get_fs
fs = get_fs()

In [3]:
# Record start and end time
import datetime
from loguru import logger

In [4]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

### Look at files

In [5]:
GCS_DASK_PATH = "gs://calitp-analytics-data/data-analyses/dask_test/"

In [6]:
GCS_RT_PATH = "gs://calitp-analytics-data/data-analyses/rt_delay/"

In [7]:
analysis_date = "2022-10-12"

* Should I use this `get_routelines` from `A1_vehicle_positions`

In [8]:
# Read in route lines
routelines = gpd.read_parquet(
    f"{GCS_RT_PATH}compiled_cached_views/routelines_{analysis_date}.parquet"
)

In [9]:
len(routelines), routelines.shape_id.nunique(), routelines.calitp_itp_id.nunique()

(9430, 6353, 175)

In [10]:
routelines.sample().drop(columns=["geometry"])

Unnamed: 0,calitp_itp_id,calitp_url_number,shape_id
6715,290,1,350093


* The `route_dir_identifier` is used for segments to cut segments
for both directions the route runs.


In [11]:
# Read in longest_shape of each route
longest_shape = gpd.read_parquet(f"{GCS_DASK_PATH}longest_shape_segments.parquet")

In [12]:
len(
    longest_shape
), longest_shape.longest_shape_id.nunique(), longest_shape.calitp_itp_id.nunique()

(126896, 3960, 175)

In [13]:
longest_shape.sort_values(['calitp_itp_id', 'route_id']).head(1).drop(columns=["geometry", "geometry_arrowized"])

Unnamed: 0,calitp_itp_id,calitp_url_number,route_id,direction_id,longest_shape_id,route_dir_identifier,route_length,segment_sequence
0,4,0,10,0,shp-10-10,2184919314,11800.07,0


In [14]:
# Says missing geospatial data
segments_crosswalks = pd.read_parquet(
    f"{GCS_DASK_PATH}segments_route_direction_crosswalk.parquet"
)

In [15]:
len(segments_crosswalks)

5150

In [16]:
segments_crosswalks.sample()

Unnamed: 0,calitp_itp_id,route_id,direction_id,route_dir_identifier
1358,361,2144,0,1192375714


In [17]:
# Read in one segment for ONE itp id first
# when using gpd.read_parquert() says
# Missing geo metadata in Parquet/Feather file.
# Use pandas.read_parquet/read_feather() instead.
segment_148 = pd.read_parquet(
    f"{GCS_DASK_PATH}vp_sjoin/vp_segment_148_{analysis_date}.parquet"
)

In [18]:
# segment_148 = gpd.GeoDataFrame(
#    segment_148, geometry=gpd.points_from_xy(segment_148.lon, segment_148.lat))

In [19]:
len(segment_148)

22692

In [20]:
segment_148 = segment_148.sort_values(["calitp_itp_id", "trip_id", "segment_sequence"])

In [21]:
segment_148.sample()

Unnamed: 0,calitp_itp_id,calitp_url_number,vehicle_timestamp,trip_id,route_dir_identifier,segment_sequence,lon,lat
4811,148,0,2022-10-12 14:44:58,970,3249179650,21,17297.31,-190957.71


### Task 1
* Using GTFS schedule data, by route_id-shape_id, calculate the route_length of each shape_id as a proportion of the longest shape_id. 
* For each route_id, what's the shortest shape_id length, in proportion to the longest shape_id's length. if it's 100%, then all shape_ids are equal length for that route. if it's 50%, there's a short trip that exists that only runs 50% of the length and turns around.

Notes
* Is it correct to join on `shape_id` and `longest_shape_id`.

In [22]:
routelines.crs == longest_shape.crs

True

In [23]:
# Calculate length of geometry
routelines = routelines.assign(
    route_length=(routelines.geometry.to_crs(geography_utils.CA_StatePlane).length)
)

* Why are there two shape IDS with diffrent Cal ITP url numbers? 
* What do the Cal ITP url numbers signify?

In [24]:
routelines.calitp_url_number.value_counts()

0    7348
1    2011
2      71
Name: calitp_url_number, dtype: int64

In [63]:
routelines.drop(columns = ["geometry"]).sort_values(["calitp_itp_id","shape_id"]).head()

Unnamed: 0,calitp_itp_id,calitp_url_number,shape_id,route_length
45,4,0,shp-10-09,40538.08
578,4,1,shp-10-09,40538.08
61,4,0,shp-10-10,38768.87
427,4,1,shp-10-10,38768.87
75,4,0,shp-12-14,57472.16


In [26]:
# Dissolve so only one row for each calitp_id/shape_id.
routelines_diss = routelines.dissolve(by=[
        "calitp_itp_id",
        "shape_id",
    ],
    aggfunc={
        "route_length": "sum",
    },
).reset_index()

In [27]:
routelines_diss.shape_id.nunique(), len(routelines_diss), len(routelines)

(6353, 7685, 9430)

In [73]:
# routelines_diss.drop(columns = ["geometry"]).sort_values(["calitp_itp_id","shape_id"]).head()

In [75]:
# longest_shape.drop(columns = ["geometry", "geometry_arrowized"]).sort_values(["calitp_itp_id","longest_shape_id"]).head(200)

In [49]:
# Dissolve so only one row for each calitp_id/shape_id/route_id
longest_shape_diss = longest_shape.dissolve(
    by=["calitp_itp_id", "longest_shape_id", "route_id", "route_dir_identifier"],
    aggfunc={
        "route_length": "sum",
    },
).reset_index()

In [55]:
# longest_shape_diss.drop(columns = ["geometry"]).sort_values(["calitp_itp_id", "route_id"]).head(10)

In [51]:
# Do an inner merge.
m1 = routelines_diss.merge(
    longest_shape_diss,
    how="inner",
    left_on=["calitp_itp_id", "shape_id"],
    right_on=["calitp_itp_id", "longest_shape_id"],
    suffixes=("_routelines", "_longest_line"),
)

In [52]:
# Make sure this is a gdf? Is this important?
m1 = m1.set_geometry("geometry_routelines")

In [53]:
# Calculate out proportion of route length against longest.
m1["proportion_route_length"] = (
    m1["route_length_routelines"] / m1["route_length_longest_line"]
) * 100

In [54]:
m1.proportion_route_length.describe()

count   5150.00
mean      24.97
std       22.50
min        1.06
25%       11.33
50%       19.34
75%       29.88
max      328.79
Name: proportion_route_length, dtype: float64

In [71]:
# m1.drop(columns = ['geometry_routelines','geometry_longest_line']).sort_values(["proportion_route_length"], ascending = False).head()

### Task 2
* Testing with 148 Kings County Area Public Transit Agency
* Calculate time of trips?


Questions 
* How to define short trips - between segments? Or the entire trip as a whole? 
    * If trip as a whole, then just subtract the last segment minus 0?
* Are we trying to exclude short trips?

In [76]:
len(segment_148)

22692

In [78]:
segment_148.head(1)

Unnamed: 0,calitp_itp_id,calitp_url_number,vehicle_timestamp,trip_id,route_dir_identifier,segment_sequence,lon,lat
287,148,0,2022-10-12 18:26:26,100,4023814891,0,31139.56,-187848.08


In [82]:
merge_cols = ["calitp_itp_id", "trip_id", "route_dir_identifier",]

In [77]:
segment_148_min = (
    segment_148.groupby(merge_cols)
    .agg({"vehicle_timestamp": "min"})
    .rename(columns={"vehicle_timestamp": "min_time"})
    .reset_index()
)

In [79]:
segment_148_max = (
    segment_148.groupby(merge_cols)
    .agg({"vehicle_timestamp": "max"})
    .rename(columns={"vehicle_timestamp": "max_time"})
    .reset_index()
)

In [83]:
segment_148_m = segment_148_max.merge(
    segment_148_min,
    how="inner",
    on=merge_cols,
)

In [84]:
# https://stackoverflow.com/questions/51491724/calculate-difference-of-2-dates-in-minutes-in-pandas
segment_148_m['minutes_elapsed'] = (segment_148_m.max_time - segment_148_m.min_time).dt.total_seconds() / 60

In [85]:
segment_148_m.minutes_elapsed.describe()

count   232.00
mean     40.54
std      27.08
min      21.00
25%      28.48
50%      30.00
75%      32.50
max     189.50
Name: minutes_elapsed, dtype: float64

In [88]:
segment_148_m.route_dir_identifier.nunique(), segment_148_m.route_dir_identifier.nunique()

(14, 14)

In [117]:
len(segment_148_m)

232

In [103]:
segment_148_m.head()

Unnamed: 0,calitp_itp_id,trip_id,route_dir_identifier,max_time,min_time,minutes_elapsed
0,148,100,4023814891,2022-10-12 18:57:26,2022-10-12 18:26:26,31.0
1,148,101,4023814891,2022-10-12 19:23:26,2022-10-12 18:57:56,25.5
2,148,102,4023814891,2022-10-12 19:55:26,2022-10-12 19:23:56,31.5
3,148,103,3304839464,2022-10-12 07:08:47,2022-10-12 06:42:16,26.52
4,148,104,3304839464,2022-10-12 07:39:17,2022-10-12 07:09:16,30.02


In [107]:
segment_148_m = segment_148_m.merge(
    segments_crosswalks,
    how="inner",
    on=['calitp_itp_id','route_dir_identifier'],
)

In [97]:
# segment_148_filtered

In [110]:
m2 = segment_148_m.merge(
    m1,
    how="left",
    on=['calitp_itp_id','route_dir_identifier', "route_id"]
)

In [119]:
len(m2), len(segment_148_m)

(232, 232)

In [116]:
# Why did only one route_dir_identifier remain? Shouldn't there be 2 different ones for each route...
m2.drop(columns = ['geometry_routelines','geometry_longest_line', 'max_time','min_time'])

Unnamed: 0,calitp_itp_id,trip_id,route_dir_identifier,minutes_elapsed,route_id,direction_id,shape_id,route_length_routelines,longest_shape_id,route_length_longest_line,proportion_route_length
0,148,100,4023814891,31.0,1,0,42,32036.9,42,97707.24,32.79
1,148,101,4023814891,25.5,1,0,42,32036.9,42,97707.24,32.79
2,148,102,4023814891,31.5,1,0,42,32036.9,42,97707.24,32.79
3,148,76,4023814891,36.0,1,0,42,32036.9,42,97707.24,32.79
4,148,77,4023814891,28.52,1,0,42,32036.9,42,97707.24,32.79
5,148,78,4023814891,34.0,1,0,42,32036.9,42,97707.24,32.79
6,148,79,4023814891,26.5,1,0,42,32036.9,42,97707.24,32.79
7,148,80,4023814891,31.0,1,0,42,32036.9,42,97707.24,32.79
8,148,81,4023814891,28.48,1,0,42,32036.9,42,97707.24,32.79
9,148,82,4023814891,31.0,1,0,42,32036.9,42,97707.24,32.79


In [122]:
(m2.sort_values(by = ["route_id", "minutes_elapsed"], ascending = [True, False])
 .drop_duplicates(subset = ["route_id","route_dir_identifier", "shape_id", "longest_shape_id"])
 .drop(columns = ['geometry_routelines','geometry_longest_line', 'max_time','min_time'])
) 

Unnamed: 0,calitp_itp_id,trip_id,route_dir_identifier,minutes_elapsed,route_id,direction_id,shape_id,route_length_routelines,longest_shape_id,route_length_longest_line,proportion_route_length
21,148,94,4023814891,36.5,1,0,42,32036.9,42,97707.24,32.79
199,148,949,4057616601,189.5,12,0,11,592745.19,11,32699338.23,1.81
201,148,951,3904987544,113.02,13,0,17,317803.63,17,9400870.98,3.38
116,148,259,2810247007,66.0,14,0,16,140603.47,16,1842697.37,7.63
231,148,983,3197879838,95.5,15,0,35,225434.37,35,4733416.4,4.76
119,148,263,2360190108,130.48,17,0,34,398081.57,34,14808611.82,2.69
36,148,112,3304839464,35.0,2,0,49,33114.89,49,111058.71,29.82
219,148,971,3249179650,94.0,20,0,71,126981.41,71,1507731.89,8.42
72,148,148,2867321087,32.0,3,1,40,27783.45,40,76158.06,36.48
104,148,229,3852881464,41.0,4,1,20,37529.58,20,137260.8,27.34


### Questions 
* For each operator, what's the % of RT trip_ids that would remain after those thresholds are used? Make a chart function that takes a single operator. Produce charts for all operators. Is the time or geographic coverage that's driving this excluding of trips? What is a recommended threshold to use?
* For short trips, do they tend to be 50% of the longest route length? 40% 30%? Have this handy to inform question 1.
