# Identify GTFS shape overlap issues

Known issue from RT speedmaps: vehicle positions are "snapped" to shapes in the wrong order when a route starts and ends in the same place. As a result, the calculated speeds are implausible and the affected trips are dropped.

Goal: Identify how many shapes have this issue.

In [7]:
import os
# 800_000_000_000 bytes = 800 GB (decimal).
# NOTE(review): presumably the BigQuery bytes-scanned cap read by calitp, which
# is why it is set before calitp is imported -- confirm.
os.environ["CALITP_BQ_MAX_BYTES"] = str(800_000_000_000)

from calitp.tables import tbl
import geopandas as gpd
import numpy as np
import pandas as pd
import shared_utils
import datetime as dt
from siuba import * 

# Show every column when a DataFrame is displayed instead of eliding the middle ones.
pd.set_option("display.max_columns", None)

import folium

In [8]:
# Testing parameters: KART (ITP ID 148) is an operator known to have the issue.
analysis_operator = 148
analysis_dt = dt.date(year=2022, month=6, day=1)

In [3]:
# Print the signature and docstring of the helper that returns route shapes for a date.
help(shared_utils.gtfs_utils.get_route_shapes)

Help on function get_route_shapes in module shared_utils.gtfs_utils:

get_route_shapes(selected_date: str | datetime.date, itp_id_list: list[int] = None, get_df: bool = True, crs: str = 'EPSG:4326', trip_df: siuba.sql.verbs.LazyTbl | pandas.core.frame.DataFrame = None, custom_filtering: dict = None) -> geopandas.geodataframe.GeoDataFrame
    Return a subset of geography_utils.make_routes_gdf()
    to only have the `shape_id` values present on a selected day.
    
    geography_utils.make_routes_gdf() only selects based on calitp_extracted_at
    and calitp_deleted_at date range.
    
    Allow a pre-existing trips table to be supplied.
    If not, run a fresh trips query.
    
    Custom_filtering doesn't filter in the query (which relies on trips query),
    but can filter out after it's a gpd.GeoDataFrame



In [None]:
# Route shape geometries for the test operator on the analysis date.
gdf = shared_utils.gtfs_utils.get_route_shapes(
    selected_date=analysis_dt,
    itp_id_list=[analysis_operator],
)

In [12]:
# Work from individual shape points rather than line segments: pull the shape
# point rows that were live on the analysis date for the test operator, then
# order them along each shape.
df = (
    tbl.views.gtfs_schedule_dim_shapes()
    >> filter(
        _.calitp_itp_id == analysis_operator,
        _.calitp_extracted_at <= analysis_dt,
        _.calitp_deleted_at > analysis_dt,
    )
    >> collect()
    >> arrange(_.shape_id, _.shape_pt_sequence)
)

In [13]:
# Peek at the first three shape-point rows.
df.head(3)

Unnamed: 0,calitp_itp_id,calitp_url_number,shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled,calitp_extracted_at,calitp_hash,shape_key,calitp_deleted_at
93,148,0,1,36.326554,-119.65269,1,,2021-04-15,AdmaRua/Amev0QtY7fOHiw==,1790136768008789276,2099-01-01
39,148,0,1,36.325585,-119.652414,2,,2021-04-15,amQi46eNe4KUFFe+n6Ir3g==,5741240405724154559,2099-01-01
43,148,0,1,36.325546,-119.652668,3,,2021-04-15,D+YbcrRTdl81KqxqupJYcA==,-4413887979557434232,2099-01-01


In [67]:
# Turn each shape point into a point geometry (lon/lat, EPSG:4326), drop the
# extract/delete bookkeeping columns, then reproject to the shared
# CA_NAD83Albers CRS.
gdf_shapepts = (
    gpd.GeoDataFrame(
        df,
        geometry=gpd.points_from_xy(x=df.shape_pt_lon, y=df.shape_pt_lat),
        crs="EPSG:4326",
    )
    >> select(-_.calitp_extracted_at, -_.calitp_deleted_at)
).to_crs(shared_utils.geography_utils.CA_NAD83Albers)

In [62]:
def findoverlap(gdf):
    """Compare one shape's start point with its last two points.

    Intended to receive the points of a single shape (for example through
    ``groupby("shape_id").apply(findoverlap)``).

    The second-to-last point is located by rank among the distinct
    ``shape_pt_sequence`` values rather than as ``max - 1``: GTFS only
    requires sequence values to increase along a shape, so numbering with
    gaps (10, 20, 30, ...) must not break the lookup.

    Args:
        gdf: Frame with a ``shape_pt_sequence`` column and a ``geometry``
            column whose values provide ``.distance(other)``.

    Returns:
        A copy of ``gdf`` with these columns appended, in this order:

        * ``firstpt`` / ``lastpt`` / ``secondlastpt`` -- boolean row markers.
        * ``firstlastdist`` -- distance from the first to the last point.
        * ``first2ndlastdist`` -- distance from the first to the
          second-to-last point.
        * ``overlap`` -- 1 when ``firstlastdist > first2ndlastdist``, else 0.

        Distances are whatever ``geometry.distance`` returns (CRS units for
        projected geometries) and are repeated on every row.  When the shape
        has fewer than two distinct sequence values the comparison is
        undefined: both distances are NaN and ``overlap`` is 0.
    """
    seq = gdf["shape_pt_sequence"]
    # Distinct sequence values in ascending order; position -2 is the true
    # second-to-last point even when the numbering has gaps.
    ranked = np.sort(seq.dropna().unique())
    has_two_points = len(ranked) >= 2

    flagged = gdf.assign(
        firstpt=seq == seq.min(),
        lastpt=seq == seq.max(),
        secondlastpt=(seq == ranked[-2]) if has_two_points else False,
    )

    if not has_two_points:
        return flagged.assign(
            firstlastdist=np.nan,
            first2ndlastdist=np.nan,
            overlap=0,
        )

    geom = flagged.geometry
    # Plain boolean arrays avoid any index-alignment surprises.
    first_geom = geom[flagged["firstpt"].to_numpy()].iloc[0]
    last_geom = geom[flagged["lastpt"].to_numpy()].iloc[0]
    second_last_geom = geom[flagged["secondlastpt"].to_numpy()].iloc[0]

    first_last = first_geom.distance(last_geom)
    first_second_last = first_geom.distance(second_last_geom)

    # NOTE(review): for a shape with exactly two points the second-to-last
    # point IS the first point (distance 0), so the shape is flagged whenever
    # its two points differ -- confirm whether that is wanted.
    return flagged.assign(
        firstlastdist=first_last,
        first2ndlastdist=first_second_last,
        overlap=int(first_last > first_second_last),
    )

In [68]:
# Score every shape independently.  group_keys=False pins the flat 0..n-1 row
# index: newer pandas versions would otherwise prepend shape_id as an index
# level, leaving shape_id both an index level and a column.
gdf_shapepts = (
    gdf_shapepts.sort_values(["shape_id", "shape_pt_sequence"], ignore_index=True)
    .groupby("shape_id", group_keys=False)
    .apply(findoverlap)
)

# One row per shape; the distance and overlap columns are identical on every
# row of a shape, so any row carries the shape-level result.
list_shapepts = gdf_shapepts >> distinct(_.shape_id, _keep_all=True)

In [70]:
# Display the one-row-per-shape summary (distances and overlap flag).
list_shapepts

Unnamed: 0,calitp_itp_id,calitp_url_number,shape_id,shape_pt_lat,shape_pt_lon,shape_pt_sequence,shape_dist_traveled,calitp_hash,shape_key,geometry,firstpt,lastpt,secondlastpt,firstlastdist,first2ndlastdist,overlap
0,148,0,1,36.326554,-119.65269,1,,AdmaRua/Amev0QtY7fOHiw==,1790136768008789276,POINT (31139.731 -187773.181),True,False,False,47.165487,112.032982,0
1,148,0,11,36.325647,-119.652688,1,,H0etB3jZFB0u5MwpyF4cfg==,2301619185161438766,POINT (31140.280 -187873.972),True,False,False,85.138834,11.366133,1
2,148,0,12,36.325647,-119.652688,1,,/M+RlTLRHUtT3eLdDH1jkA==,8783299102294375557,POINT (31140.280 -187873.972),True,False,False,89.078562,11.366133,1
3,148,0,16,36.325647,-119.65269,1,,q47o82IZBBUFqyDAe9rP0w==,6505970922370712830,POINT (31140.100 -187873.972),True,False,False,82.324743,71.850506,1
4,148,0,17,36.325647,-119.65269,1,,wlrNi0ATbfigeqwNKY2xLg==,-7015007738816517595,POINT (31140.100 -187873.972),True,False,False,73.963219,41.648726,1
5,148,0,18,36.325647,-119.652689,1,,c5cMVKqjj7AM2IZjPAkxEQ==,6304920504287781450,POINT (31140.190 -187873.972),True,False,False,103.187243,11.380623,1
6,148,0,20,36.326554,-119.65269,1,,C/yOp31lBQMr1oixIBKFNg==,29689480193606411,POINT (31139.731 -187773.181),True,False,False,81.445516,9.444226,1
7,148,0,21,36.325647,-119.652688,1,,wu00tKjv/xSg1jdEoJEXyA==,6673110228731400608,POINT (31140.280 -187873.972),True,False,False,78.041508,11.366133,1
8,148,0,22,36.884997,-119.800181,1,,vBBQfWtpfLBF7jHUFywgGA==,-3062798826945553631,POINT (17784.942 -125746.500),True,False,False,63524.727595,63438.656597,1
9,148,0,23,36.326555,-119.652683,1,,UW7mOwCJu7nlhMSOVzdtTQ==,8942536727946352534,POINT (31140.358 -187773.068),True,False,False,103.037264,49.631444,1


In [77]:
# Pull out a single shape and map its points, coloured by their sequence number.
gdf_shapepts_filter = gdf_shapepts >> filter(_.shape_id == "49")

gdf_shapepts_filter.explore(column="shape_pt_sequence")