In [1]:
import datetime
import _speed_utils as speed_utils
import dask.dataframe as dd
import numpy as np
import geopandas as gpd
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers, segment_calcs,sched_rt_utils
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    PROJECT_CRS,
    SEGMENT_GCS,
    #analysis_date,
)
from scripts import (A1_sjoin_vp_segments, A2_valid_vehicle_positions)
from shared_utils import calitp_color_palette as cp


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
CONFIG_PATH = './scripts/config.yml'

In [4]:
STOP_SEG_DICT = helpers.get_parameters(CONFIG_PATH, "stop_segments")

In [5]:
# Adding analysis_date here since there aren't any files for June yet
analysis_date = '2023-07-12'

### Observations (7/12)
* Step 1: Flagging
    * There are 2,704,812 rows in the dataframe original. About 10% of those rows are flagged as having zeroes in meters elapsed and seconds elapsed. 
    * There are around 4566 routes. About 57% of these routes had at least one trip with one or more rows flagged as zero.
* I took 2 passes at trying to understand why both these columns recorded zeroes.

* Step 2: `vp_pared_stops`. 
    * I grouped `vp_pared_stops` by 'shape_array_key','trip_id', and 'location_timestamp_local' OR `x` and `y`. I counted the number of unique stop sequences after grouping. If this trio had more than one unique stop sequence, that meant the timestamp or location recorded between sequences was duplicated.
    * Only around 9% of rows were flagged as having (obviously) repeated timestamps and locations. 
    * For all of these rows, both the timestamp and location were duplicated. 
    * All the routes that were flagged in step one needed a further look in step 3.
    
* Step 3: `vp_usable`
    * For one route and trip, find: all the recorded vehicle positions, sjoin of vps to segments,
    and the first and last points kept. 
    * Plot the three gdfs in a map to visually inspect what's happening.
    * Compare the sample route and trips with the trip with the highest percentage of non division by 0 rows to see what's going on. 
    
* Buckets of errors (all based on `stage0 vp`). 
    * There is only one recorded point in that segment in the raw data.
        * Fix: use the timestamp that comes after it.
        * <img src= "./speeds_images/only_one_pt_collected.png" width = 300>
    * Points are shared between segments
        * Use p20/p50/p80.
        * <img src= "./speeds_images/shared_vp.png" width = 300>
    * Points recorded are really far out and they don't touch the buffered segments.
        * Fix: figure out % of vehicle positions that are too far out.
        * <img src= "./speeds_images/dots_not_on_seg.png" width = 300>
    * No data captured for that segment at all.
        * Use p20/p50/p80. 
        * <img src= "./speeds_images/no_dots_collected.png" width = 300> 

### Flagging

In [6]:
# Flagged: all the rows in the df flagged
# divide_by_zero: only the rows that have 0 for meters and sec elapsed
# trips_count: % of trips with 1+ division by 0 row for a route
# route_most_populated_df: the trip for a route with the smallest % of rows that are divided by 0
# flagged, divide_by_zero, trips_count, route_most_populated_df = speed_utils.flagging_stage(analysis_date)

In [7]:
# divide_by_zero.loop_or_inlining.value_counts()

### Fixes  - % of vehicle positions that are too far away from the shapes to be joined

#### SJOIN 1
* Agg3 only contains normal at the moment.

In [8]:
USABLE_VP = STOP_SEG_DICT["stage1"]
INPUT_FILE_PREFIX = STOP_SEG_DICT["stage2"]
GROUPING_COL = STOP_SEG_DICT["grouping_col"]

In [9]:
normal_shapes = A2_valid_vehicle_positions.identify_stop_segment_cases(
        analysis_date, GROUPING_COL, 0)

In [10]:
abnormal_shapes = A2_valid_vehicle_positions.identify_stop_segment_cases(
        analysis_date, GROUPING_COL, 1)

In [11]:
datetime.datetime.now() 

datetime.datetime(2023, 7, 17, 15, 47, 17, 468154)

In [12]:
# Using A2 since I need the VPs to sjoin
# Only the pared down stops are saved out

usable_abnormal_vp = A2_valid_vehicle_positions.merge_usable_vp_with_sjoin_vpidx(
        abnormal_shapes,
        f"{USABLE_VP}_{analysis_date}",
        f"{INPUT_FILE_PREFIX}_{analysis_date}",
        GROUPING_COL
    )

    

In [13]:
usable_abnormal_vp = usable_abnormal_vp.assign(
    identifier = usable_abnormal_vp.gtfs_dataset_key.astype(str) + '/' + usable_abnormal_vp.shape_array_key.astype(str) + '/' + usable_abnormal_vp.trip_id.astype(str)
)

In [14]:
datetime.datetime.now() 

datetime.datetime(2023, 7, 17, 15, 53, 36, 365688)

In [16]:
agg = usable_abnormal_vp.groupby(['identifier']).vp_idx.nunique()

In [17]:
# agg = agg.compute()

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)


In [18]:
agg.head()

00accf770009aafd5dc103ff2eeddb37/0fbe3ecc523f44905177477254aea2a5/t_5634903_b_80735_tn_0     63
00accf770009aafd5dc103ff2eeddb37/1425c95c43a72622ab1c98372c0cb392/t_5634944_b_80735_tn_0    181
00accf770009aafd5dc103ff2eeddb37/1425c95c43a72622ab1c98372c0cb392/t_5635032_b_80735_tn_0     77
00accf770009aafd5dc103ff2eeddb37/14a663c742e8ad923c4e323e688563b6/t_5634935_b_80735_tn_0     54
00accf770009aafd5dc103ff2eeddb37/14a663c742e8ad923c4e323e688563b6/t_5634936_b_80735_tn_0    132
Name: vp_idx, dtype: int64

In [19]:
agg2 = agg.to_frame()

In [20]:
agg3 = agg2.reset_index()

In [21]:
agg3.shape

(10605, 2)

In [23]:
agg3.sample()

Unnamed: 0,index,vp_idx
3114,5222fe2cf728fd3f16b2ff51e133fe8c/9fa8c5e81b6f6cb056385afd0eddafbb/45-JP3n8Hrcy1,108


In [24]:
agg3["gtfs_dataset_key"] = agg3["index"].str.split("/").str[0]

In [25]:
agg3["trip_id"] = agg3["index"].str.split("/").str[2]

In [26]:
agg3 = agg3.drop(columns = ['index'])

In [27]:
agg3 = agg3.rename(columns = {'vp_idx':'number_of_unique_vps_sjoin'})

In [28]:
agg3.to_parquet(
    f"{SEGMENT_GCS}special_vps_testing.parquet"
)

In [None]:
#agg3 = pd.read_parquet(f"{SEGMENT_GCS}normalvps_testing.parquet")

#### A1 sjoin

In [None]:
# This is the end result from A1_sjoin
# Not using this since no trip_id information
#f"{SEGMENT_GCS}vp_sjoin/{INPUT_FILE_PREFIX}_{analysis_date}"

In [None]:
#vp_to_seg = dd.read_parquet(
#        f"{SEGMENT_GCS}vp_sjoin/{INPUT_FILE_PREFIX}_{analysis_date}",
#    )

In [None]:
#vp_to_seg = vp_to_seg.compute()

In [None]:
#vp_to_seg.sample()

In [None]:
# len(vp_to_seg)

In [None]:
# len(vp_to_seg[vp_to_seg.shape_array_key == "aafeeea21721ee9be95c6f794dabdac3"])

#### Usable VPS

In [29]:
f"{USABLE_VP}_{analysis_date}"

'vp_usable_2023-07-12'

In [30]:
usable_vp = dd.read_parquet(
        f"{SEGMENT_GCS}{USABLE_VP}_{analysis_date}"
    )

In [31]:
type(usable_vp)

dask.dataframe.core.DataFrame

In [34]:
datetime.datetime.now() 

datetime.datetime(2023, 7, 17, 16, 4, 34, 46605)

In [38]:
usable_vp = usable_vp.assign(
    identifier = usable_vp.gtfs_dataset_key.astype(str) + '/' + usable_vp.trip_id.astype(str)
)

In [39]:
# Using 
usable_vp = usable_vp.groupby(['identifier']).vp_idx.nunique()

In [None]:
usable_vp = usable_vp.compute()

##### Merge

In [None]:
agg3.sample()

In [None]:
pd.merge(usable_vp3, agg3, on = ['gtfs_dataset_key','trip_id'], how = 'outer', indicator = True)[['_merge']].value_counts()

In [None]:
merge3 =  pd.merge(usable_vp3, agg3,  on = ['gtfs_dataset_key','trip_id'], how = 'inner')

In [None]:
merge3['percent_of_positions_retained'] = merge3.number_of_unique_vps_sjoin/merge3.number_of_unique_vps_all_pts * 100

In [None]:
merge3.percent_of_positions_retained.describe()

In [None]:
TRIP_GROUPING_COLS = STOP_SEG_DICT["trip_grouping_cols"]

In [None]:
crosswalk = sched_rt_utils.crosswalk_scheduled_trip_grouping_with_rt_key(
        analysis_date, 
        ["feed_key", "trip_id"] + TRIP_GROUPING_COLS
    )

In [None]:
crosswalk = crosswalk.compute()

In [None]:
merge4 =  pd.merge(merge3, crosswalk,  on = ['gtfs_dataset_key','trip_id'], how = 'left')

In [None]:
merge4.sort_values(['percent_of_positions_retained']).head()

#### All Vehicle Positions

In [None]:
stop running

In [None]:
# VP all without paring
STG_0_FILE = STOP_SEG_DICT['stage0']
    

In [None]:
f"{STG_0_FILE}_{analysis_date}/"

In [None]:
all_pts = helpers.import_vehicle_positions(
            SEGMENT_GCS,
            f"{STG_0_FILE}_{analysis_date}/",
            "df",
            columns = ["gtfs_dataset_key", "trip_id","geometry"],
            partitioned = False
        )
all_pts = all_pts.compute()

In [None]:
all_pts.sample(5)

In [None]:
all_pts2 = (all_pts
        .groupby(['gtfs_dataset_key','trip_id'])
        .agg({'geometry':'nunique'}).reset_index()
        .rename(columns = {'geometry':'number_of_unique_vps_all_pts'})
        .reset_index(drop = True)
       )

In [None]:
len(all_pts[(all_pts.gtfs_dataset_key == "cdd2ad81863b6d4ad51676a1cb781ea8") & (all_pts.trip_id == "10686020")])

In [None]:
all_pts2[(all_pts2.gtfs_dataset_key == "cdd2ad81863b6d4ad51676a1cb781ea8") & (all_pts2.trip_id == "10686020")]

In [None]:
all_pts2[(all_pts2.gtfs_dataset_key == "593953c37ce48a9449bb90808ba0c1e1") & (all_pts2.trip_id == "t5FE-b3EA-sl13")]

In [None]:
all_pts2[(all_pts2.gtfs_dataset_key == "65d9589130415c685b89f4f7c2d8bd7e") & (all_pts2.trip_id == "10035002751332-DEC22")]

In [None]:
# How did this pop up in the grouped df if it doesn't even exist??
all_pts[(all_pts.gtfs_dataset_key == "00accf770009aafd5dc103ff2eeddb37") & (all_pts.trip_id == "-1160024104")]

In [None]:
all_pts3 = all_pts.drop(columns = ['geometry']).drop_duplicates().reset_index(drop = True)

In [None]:
len(all_pts3)

In [None]:
all_pts2.shape

In [None]:
len(all_pts)

##### Merge Sjoin with All Vehicles

In [None]:
# Merge to get rid of weird rows??
# These are all points 
pd.merge(all_pts3, all_pts2, on = ['gtfs_dataset_key','trip_id'], how = 'outer', indicator = True)[['_merge']].value_counts()

In [None]:
merge1 = pd.merge(all_pts3, all_pts2, on = ['gtfs_dataset_key','trip_id'], how = 'inner')

In [None]:
merge1.sample()

In [None]:
merge1.shape

In [None]:
# These are sjoin positions
agg3.sample()

In [None]:
agg3.shape

In [None]:
pd.merge(merge1, agg3, on = ['gtfs_dataset_key','trip_id'], how = 'outer', indicator = True)[['_merge']].value_counts()

In [None]:
merge2 = pd.merge(merge1, agg3, on = ['gtfs_dataset_key','trip_id'], how = 'inner')

In [None]:
merge2.sample(5)

### Investigate 

In [None]:
stop running

In [None]:
stage3_df = speed_utils.flag_stage3(divide_by_zero, analysis_date)

In [None]:
stage3_df.shape_array_key.nunique()

In [None]:
sort_by = ['_gtfs_dataset_name','shape_array_key','trip_id','stop_sequence']

In [None]:
preview_cols = sort_by + ['stop_id','gtfs_dataset_key','location_timestamp_local','pair','stage3_flag']

#### Find routes with a lot of n_trips that need to be evaluated.

In [None]:
shape_trips = speed_utils.count_trips_routes(stage3_df)

In [None]:
shape_trips.head(5)

In [None]:
gtfs_key1= "6c2d7daaf979779fa2089c6395baf98b"
trip_id1 = "904463"
shape_key1 = "5a788bd9c9aa5c5465875689a626baa9"

#### Example Trip 1
* Understanding the result from flag_stage3().
* Looking at AC Transit: stop sequences 6 and 7 have different stop_ids. However, their time stamps and locations are the same.
* It looks like they share the same point.
* However, this isn't due to paring too many points: there just aren't enough points to choose from in the raw data.
* Also sequence 2 is extremely long.

In [None]:
stage3_df[(stage3_df.stage3_flag != 'check in stage 2')].sort_values(by =sort_by)[preview_cols].head(2)

In [None]:
test_route1 = "03bc2779a66739316156b459ffc3eefa"
test_gtfs_key1 = "cdd2ad81863b6d4ad51676a1cb781ea8"
test_trip1 = "11776020"

In [None]:
# % of trips with problematic rows for this route
trips_count[trips_count.shape_array_key == test_route1]

In [None]:
speed_utils.original_df_rows(flagged, test_trip1, test_route1)

In [None]:
speed_utils.stage2_trouble_shooting(flagged_df= stage3_df,
                        date = analysis_date,
                        route = test_route1,
                        trip = test_trip1,
                        gtfs_key = test_gtfs_key1)

#### Example Trip 2
* Underestanding the result from `flag_stage3()`. 
* Same thing as Ex Trip 1 except kooking at LA Metro: stop sequences 45 and 46 have different stop_ids. However, their time stamps and locations are the same. 
* Same issue: segments 45 and 46 are sharing points. There aren't enough points captured.
* In general, it looks like this route doesn't have a lot of rows that are ok.

In [None]:
test_route2 = "38c814829dff816aa87c606c3aab4f45"
test_gtfs_key2 = "65d9589130415c685b89f4f7c2d8bd7e"
test_trip2 = "10294000051654-DEC22"

In [None]:
# Original number of rows for this trip
len(flagged[(flagged.trip_id == test_trip2) & (flagged.shape_array_key == test_route2)])

In [None]:
# Rows with zeroes...a lot of them.
len(divide_by_zero[(divide_by_zero.trip_id == test_trip2) & (divide_by_zero.shape_array_key == test_route2)])

In [None]:
stage3_df[(stage3_df.stage3_flag != 'check in stage 2') & (stage3_df.shape_array_key == test_route2) 
   & (stage3_df.stop_sequence.isin([45,46]))].sort_values(by =sort_by)[preview_cols]

In [None]:
# % of trips with problematic rows for this route, lots of them
trips_count[trips_count.shape_array_key == test_route2]

In [None]:
speed_utils.original_df_rows(m1, test_trip2, test_route2)

In [None]:
speed_utils.stage2_trouble_shooting(flagged_df= stage3_df,
                        date = analysis_date,
                        route = test_route2,
                        trip = test_trip2,
                        gtfs_key = test_gtfs_key2,)

In [None]:
# See which trip has the most rows that are ok for this route
route_most_populated_df[route_most_populated_df.shape_array_key == test_route2]

In [None]:
# Plot the trip from this route with the highest % of ok rows
speed_utils.stage2_trouble_shooting(flagged_df= stage3_df,
                        date = analysis_date,
                        route = test_route2,
                        trip = "10294000051724-DEC22",
                        gtfs_key = test_gtfs_key2,)

#### Example Trip 3
* Choosing a route/trip with high n_trips that isn't Muni or LA Metro to shake things up.
* San Diego Vehicle Positions
* Segments 44 and 45 don't have any rows in the original dataframe for trip 16938440.

In [None]:
test_route3 = "1fc55d9df0cd785dddc864bf1b72976f"
test_gtfs_key3 = "a4f6fd5552107e05fe9743ac7cce2c55"
test_trip3 = "16938440"

In [None]:
speed_utils.original_df_rows(flagged, test_trip3, test_route3)

In [None]:
speed_utils.stage2_trouble_shooting(flagged_df= stage3_df,
                        date = analysis_date,
                        route = test_route3,
                        trip = test_trip3,
                        gtfs_key = test_gtfs_key3,)

In [None]:
# Trip from this route with the most ok rows
route_most_populated_df[route_most_populated_df.shape_array_key == test_route3]

In [None]:
# Test the trip with the most ok rows
speed_utils.stage2_trouble_shooting(flagged_df= stage3_df,
                        date = analysis_date,
                        route = test_route3,
                        trip = "16938341",
                        gtfs_key = test_gtfs_key3,)

In [None]:
speed_utils.original_df_rows(flagged, "16938341", test_route3)