## RT trip diagnostics: thresholds for usable trips 

In [1]:
import dask.dataframe as dd
import dask_geopandas as dg
import geopandas as gpd
import pandas as pd
from calitp.sql import to_snakecase
from shared_utils import geography_utils, utils



In [2]:
# Save files to GCS
from calitp.storage import get_fs

fs = get_fs()

In [3]:
# Record start and end time
import datetime

from loguru import logger

In [4]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

### Look at files

In [5]:
GCS_DASK_PATH = "gs://calitp-analytics-data/data-analyses/dask_test/"

In [6]:
GCS_RT_PATH = "gs://calitp-analytics-data/data-analyses/rt_delay/"

In [7]:
analysis_date = "2022-10-12"

In [8]:
# Read in route lines
routelines = gpd.read_parquet(
    f"{GCS_RT_PATH}compiled_cached_views/routelines_{analysis_date}.parquet"
)

In [9]:
len(routelines), routelines.shape_id.nunique(), routelines.calitp_itp_id.nunique()

(9430, 6353, 175)

In [10]:
routelines.sample().drop(columns=["geometry"])

Unnamed: 0,calitp_itp_id,calitp_url_number,shape_id
1500,76,0,p_498594


In [11]:
# Read in longest_shape of each route
longest_shape = gpd.read_parquet(f"{GCS_DASK_PATH}longest_shape_segments.parquet")

In [12]:
len(
    longest_shape
), longest_shape.longest_shape_id.nunique(), longest_shape.calitp_itp_id.nunique()

(126896, 3960, 175)

In [13]:
longest_shape.sample().drop(columns=["geometry", "geometry_arrowized"])

Unnamed: 0,calitp_itp_id,calitp_url_number,route_id,direction_id,longest_shape_id,route_dir_identifier,route_length,segment_sequence
41060,182,0,150-13157,1,1500197_JUNE22,4257327450,20749.79,19


In [14]:
# Says missing geospatial data
segments_crosswalks = pd.read_parquet(
    f"{GCS_DASK_PATH}segments_route_direction_crosswalk.parquet"
)

In [15]:
len(segments_crosswalks)

5150

In [16]:
segments_crosswalks.sample()

Unnamed: 0,calitp_itp_id,route_id,direction_id,route_dir_identifier
5022,228,1,1,4196048753


In [68]:
# Read in one segment for ONE itp id
# when using gpd.read_parquert() says
# Missing geo metadata in Parquet/Feather file.
# Use pandas.read_parquet/read_feather() instead.
segment_148 = pd.read_parquet(
    f"{GCS_DASK_PATH}vp_sjoin/vp_segment_148_{analysis_date}.parquet"
)

In [None]:
# segment_148 = gpd.GeoDataFrame(
#    segment_148, geometry=gpd.points_from_xy(segment_148.lon, segment_148.lat))

In [70]:
len(segment_148)

22692

In [80]:
segment_148 = segment_148.sort_values(["calitp_itp_id", "trip_id", "segment_sequence"])

In [92]:
segment_148.sample()

Unnamed: 0,calitp_itp_id,calitp_url_number,vehicle_timestamp,trip_id,route_dir_identifier,segment_sequence,lon,lat,max_timestamp
1437,148,0,2022-10-12 13:41:20,934,1342713973,4,27716.41,-187610.45,2022-10-12 13:41:20


### Task 1
* Using GTFS schedule data, by route_id-shape_id, calculate the route_length of each shape_id as a proportion of the longest shape_id. 

In [20]:
# Calculate length of geometry
routelines = routelines.assign(
    route_length=(routelines.geometry.to_crs(geography_utils.CA_StatePlane).length)
)

In [21]:
# Dissolve so only one row for each calitp_id/shape_id.
routelines_diss = routelines.dissolve(
    by=[
        "calitp_itp_id",
        "shape_id",
    ],
    aggfunc={
        "route_length": "sum",
    },
).reset_index()

In [27]:
routelines_diss.shape_id.nunique(), len(routelines_diss)

(6353, 7685)

In [36]:
# Dissolve so only one row for each calitp_id/shape_id/route_id
longest_shape_diss = longest_shape.dissolve(
    by=["calitp_itp_id", "longest_shape_id", "route_id"],
    aggfunc={
        "route_length": "sum",
    },
).reset_index()

In [38]:
len(longest_shape_diss)

5150

In [28]:
# routelines_diss.drop(columns = ["geometry"]).sort_values("calitp_itp_id").head(100)

In [42]:
# Do an inner merge.
m1 = routelines_diss.merge(
    longest_shape_diss,
    how="inner",
    left_on=["calitp_itp_id", "shape_id"],
    right_on=["calitp_itp_id", "longest_shape_id"],
    suffixes=("_routelines", "_longest_line"),
)

In [46]:
# Make sure this is a gdf
m1 = m1.set_geometry("geometry_routelines")

In [53]:
# Calculate out proportion of route length against longest.
m1["proportion_route_length"] = (
    m1["route_length_routelines"] / m1["route_length_longest_line"]
) * 100

In [55]:
# m1.drop(columns = ['geometry_routelines','geometry_longest_line']).head(100)

In [56]:
m1.proportion_route_length.describe()

count   5150.00
mean      32.88
std       32.66
min        1.06
25%       12.62
50%       23.43
75%       41.06
max      657.58
Name: proportion_route_length, dtype: float64

### Task 2
* Testing with 148 Kings County Area Public Transit Agency
* Calculate time of trips?


Questions 
* How to define short trips - between segments? Or the entire trip as a whole? 
    * If trip as a whole, then just subtract the last segment minus 0?
* Are we trying to exclude short trips?

In [171]:
len(segment_148)

22692

In [172]:
segment_148_min = (
    segment_148.groupby(["calitp_itp_id", "trip_id", "route_dir_identifier",])
    .agg({"vehicle_timestamp": "min"})
    .rename(columns={"vehicle_timestamp": "min_time"})
    .reset_index()
)

In [174]:
segment_148_max = (
    segment_148.groupby(["calitp_itp_id", "trip_id", "route_dir_identifier",])
    .agg({"vehicle_timestamp": "max"})
    .rename(columns={"vehicle_timestamp": "max_time"})
    .reset_index()
)

In [175]:
segment_148_m = segment_148_max.merge(
    segment_148_min,
    how="inner",
    on=["calitp_itp_id", "trip_id", "route_dir_identifier"],
)

In [176]:
# https://stackoverflow.com/questions/51491724/calculate-difference-of-2-dates-in-minutes-in-pandas
segment_148_m['minutes_elapsed'] = (segment_148_m.max_time - segment_148_m.min_time).dt.total_seconds() / 60

In [177]:
segment_148_m.minutes_elapsed.describe()

count   232.00
mean     40.54
std      27.08
min      21.00
25%      28.48
50%      30.00
75%      32.50
max     189.50
Name: minutes_elapsed, dtype: float64

In [178]:
segment_148_m.route_dir_identifier.nunique()

14

In [179]:
segment_148_m.minutes_elapsed.describe()

count   232.00
mean     40.54
std      27.08
min      21.00
25%      28.48
50%      30.00
75%      32.50
max     189.50
Name: minutes_elapsed, dtype: float64

In [180]:
# Maybe anything below 25% percentile is a short trip to throw away
p25 = segment_148_m.minutes_elapsed.quantile(0.25).astype(float)

In [181]:
segment_148_filtered = (segment_148_m.loc[segment_148_m.minutes_elapsed > p25]).reset_index(drop = True)

In [182]:
f"{len(segment_148_filtered)/len(segment_148_m)}% of rows are left"

'0.7327586206896551% of rows are left'

In [185]:
segment_148_filtered.head()

Unnamed: 0,calitp_itp_id,trip_id,route_dir_identifier,max_time,min_time,minutes_elapsed
0,148,100,4023814891,2022-10-12 18:57:26,2022-10-12 18:26:26,31.0
1,148,102,4023814891,2022-10-12 19:55:26,2022-10-12 19:23:56,31.5
2,148,104,3304839464,2022-10-12 07:39:17,2022-10-12 07:09:16,30.02
3,148,105,3304839464,2022-10-12 08:09:17,2022-10-12 07:39:47,29.5
4,148,106,3304839464,2022-10-12 08:41:17,2022-10-12 08:09:47,31.5


### Questions 
* For each operator, what's the % of RT trip_ids that would remain after those thresholds are used? Make a chart function that takes a single operator. Produce charts for all operators. Is the time or geographic coverage that's driving this excluding of trips? What is a recommended threshold to use?
* For short trips, do they tend to be 50% of the longest route length? 40% 30%? Have this handy to inform question 1.
