In [1]:
import datetime
import _speed_utils as speed_utils
import dask.dataframe as dd
import numpy as np
import geopandas as gpd
import pandas as pd
from segment_speed_utils import helpers, sched_rt_utils
from segment_speed_utils.project_vars import (
    SEGMENT_GCS,
    analysis_date,
)
from datetime import timedelta

from scripts import (A1_sjoin_vp_segments, A2_valid_vehicle_positions, cut_road_segments)
from shared_utils import calitp_color_palette as cp


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [2]:
import gcsfs
fs = gcsfs.GCSFileSystem()

In [3]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [4]:
CONFIG_PATH = './scripts/config.yml'

In [5]:
STOP_SEG_DICT = helpers.get_parameters(CONFIG_PATH, "stop_segments")

In [6]:
analysis_date = '2023-07-12'

### Observations (7/12)
* Step 1: Flagging
    * There are 2,704,812 rows in the dataframe original. About 10% of those rows are flagged as having zeroes in meters elapsed and seconds elapsed. 
    * There are around 4566 routes. About 57% of these routes had at least one trip with one or more rows flagged as zero.
* I took 2 passes at trying to understand why both these columns recorded zeroes.

* Step 2: `vp_pared_stops`. 
    * I grouped `vp_pared_stops` by 'shape_array_key','trip_id', and 'location_timestamp_local' OR `x` and `y`. I counted the number of unique stop sequences after grouping. If this trio had more than one unique stop sequence, that meant the timestamp or location recorded between sequences was duplicated.
    * Only around 9% of rows were flagged as having (obviously) repeated timestamps and locations. 
    * For all of these rows, both the timestamp and location were duplicated. 
    * All the routes that were flagged in step one needed a further look in step 3.
    
* Step 3: `vp_usable`
    * For one route and trip, find: all the recorded vehicle positions, sjoin of vps to segments,
    and the first and last points kept. 
    * Plot the three gdfs in a map to visually inspect what's happening.
    * Compare the sample route and trips with the trip with the highest percentage of non division by 0 rows to see what's going on. 
    
* Buckets of errors (all based on `stage0 vp`). 
    * There is only one recorded point in that segment in the raw data.
        * Fix: use the timestamp that comes after it.
        * <img src= "./speeds_images/only_one_pt_collected.png" width = 300>
    * Points are shared between segments
        * Use p20/p50/p80.
        * <img src= "./speeds_images/shared_vp.png" width = 300>
    * Points recorded are really far out and they don't touch the buffered segments.
        * Fix: figure out % of vehicle positions that are too far out.
        * <img src= "./speeds_images/dots_not_on_seg.png" width = 300>
    * No data captured for that segment at all.
        * Use p20/p50/p80. 
        * <img src= "./speeds_images/no_dots_collected.png" width = 300> 

### Flagging

In [7]:
# Flagged: all the rows in the df flagged
# divide_by_zero: only the rows that have 0 for meters and sec elapsed
# trips_count: % of trips with 1+ division by 0 row for a route
# route_most_populated_df: the trip for a route with the smallest % of rows that are divided by 0
flagged, divide_by_zero, trips_count, route_most_populated_df = speed_utils.flagging_stage(analysis_date)

2023-07-25 15:26:44.994977
There are 2517857 rows in the original dataframe
sec_cat      meters_cat    
sec is avg   meters is avg     1873776
             meters is high     101374
             meters is low      127169
sec is high  meters is avg       78053
             meters is high      24519
             meters is low       23738
sec is low   meters is low      289228
dtype: int64
ok                 78.45
division by 0      11.49
meters too low      5.99
seconds too high    4.07
Name: flag, dtype: float64
22727 unique trips flagged.
2578 routes flagged out of 4087.
63.0780523611451 routes have 1+ row that has zeroes for meters/sec elapsed
14 operators are not flagged.
36.92194763885491% of routes have 1+ division by 0 row
Took 0:04:21.469724


In [8]:
# shape_trips = speed_utils.count_trips_routes(stage3_df)

NameError: name 'stage3_df' is not defined

In [9]:
stage3_df = speed_utils.flag_stage3(divide_by_zero, analysis_date)

2023-07-25 15:32:28.433483
check in stage 2                   265623
repeated timestamps & locations     23593
repeated locations                     12
Name: stage3_flag, dtype: int64
Have to check 91.83861866762554 % of rows in stage 2
Took 0:00:56.208809


### Fix 1  - % of vehicle positions that are too far away from the shapes to be joined
* https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/scripts/A0_preprocessing.py

In [10]:
INPUT_FILE_PREFIX = STOP_SEG_DICT["stage0"]

In [11]:
# 14_600_897 rows
original = dd.read_parquet(
        f"{SEGMENT_GCS}{INPUT_FILE_PREFIX}_{analysis_date}.parquet"
    )

In [12]:
# len(original)

#### Crosswalk

In [13]:
TRIP_GROUPING_COLS = STOP_SEG_DICT["trip_grouping_cols"]

In [14]:
crosswalk = sched_rt_utils.crosswalk_scheduled_trip_grouping_with_rt_key(
        analysis_date, 
        ["feed_key", "trip_id"] + TRIP_GROUPING_COLS
    )

In [15]:
crosswalk = crosswalk.compute()

In [16]:
crosswalk.sample()

Unnamed: 0,feed_key,trip_id,shape_array_key,gtfs_dataset_key
48167,a5d208ac54d606decccdaf14f70d81de,t_5283742_b_77396_tn_0,312e365f526f773ebafdaeb5da96aea3,d25781d68d011222a8bfb2de64cc6f7d


#### A1 sjoin
* Contains the columns:
    * vp_idx
    * shape_array_key
    * stop_sequence

In [17]:
USABLE_VP = STOP_SEG_DICT["stage1"]
INPUT_FILE_PREFIX = STOP_SEG_DICT["stage2"]
GROUPING_COL = STOP_SEG_DICT["grouping_col"]

In [18]:
def load_sjoin(analysis_date:str):
    INPUT_FILE_PREFIX = STOP_SEG_DICT["stage2"]
    df = dd.read_parquet(
      f"{SEGMENT_GCS}vp_sjoin/{INPUT_FILE_PREFIX}_{analysis_date}",
    )
    return df

In [19]:
# This is the end result from A1_sjoin
f"{SEGMENT_GCS}vp_sjoin/{INPUT_FILE_PREFIX}_{analysis_date}"

'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/vp_sjoin/vp_stop_segment_2023-07-12'

#### Sjoin
['vp_idx', 'shape_array_key', 'stop_sequence']

In [20]:
vp_to_seg = dd.read_parquet(
      f"{SEGMENT_GCS}vp_sjoin/{INPUT_FILE_PREFIX}_{analysis_date}",
    )

In [21]:
# vp_to_seg = vp_to_seg.compute()

In [22]:
# vp_to_seg.columns

In [23]:
# 24_973_725
# len(vp_to_seg)

In [24]:
# 11_350_051
# vp_to_seg.vp_idx.nunique().compute()

#### Usable VPS
* _gtfs_dataset_name	
* schedule_gtfs_dataset_key
* trip_id	
* trip_instance_key	
* location_timestamp	
* location_timestamp_local	
* hour	
* gtfs_dataset_key	
* x	y	
* vp_idx

In [25]:
usable_vp = dd.read_parquet(
        f"{SEGMENT_GCS}{USABLE_VP}_{analysis_date}"
    )

In [26]:
usable_vp = usable_vp.drop_duplicates(subset = ['vp_idx'])

In [27]:
# 14_579_242
# usable_vp.vp_idx.nunique().compute()

In [28]:
# 14,579,242
# len(usable_vp)

In [29]:
# https://stackoverflow.com/questions/49139371/slicing-out-a-few-rows-from-a-dask-dataframe
npart = round(len(usable_vp)/1_000_000)
parted_usable_vp = usable_vp.repartition(npartitions=npart)

In [30]:
#npart

In [31]:
#my_list = [n for n in range(0,npart)]

In [32]:
def load_usable_vp(analysis_date:str):
    USABLE_VP = STOP_SEG_DICT["stage1"]
    usable_vp = dd.read_parquet(
        f"{SEGMENT_GCS}{USABLE_VP}_{analysis_date}"
    )
    
    usable_vp = usable_vp.drop_duplicates(subset = ['vp_idx'])
    
    return usable_vp

In [33]:
# usable_vp1 = parted_usable_vp.partitions[0]

In [34]:
# usable_vp15 = parted_usable_vp.partitions[14]

#### Merge test2 with A1sjoin

In [35]:
# outer_merge = dd.merge(usable_vp1, vp_to_seg, on = ['vp_idx'], how = 'outer', indicator = True)

In [36]:
def percent_sjoin_pts(usable_vp: dd.DataFrame, sjoin_vps: dd.DataFrame):
    start = datetime.datetime.now()
    
    # Use a left join because these are the vps that were not properly joined
    merge1 = dd.merge(usable_vp, sjoin_vps, on = ['vp_idx'], how = 'left', indicator = True)
    
    # Drop duplicated vp idx
    # Since same pt can attach to multiple segs
    merge1 = merge1.drop_duplicates(subset = ['vp_idx'])

    # First groupby, have to use observed = True because
    # merge and gtfs dataset key are the categorical data type
    agg1 = (merge1.groupby(['gtfs_dataset_key','trip_id','_merge'],observed=True,)
                .agg({'trip_instance_key':'count'})
                .reset_index()
               )
    
    agg1 = agg1.compute() 
    
    # Create new columns so the dataframe will be the way I like it
    agg1['left_only'] = None
    agg1['both'] = None
    agg1['pts_not_in_sjoin'] = np.where(agg1._merge == 'left_only', agg1.trip_instance_key, agg1.left_only)
    agg1['sjoin_pts'] = np.where(agg1._merge == 'both', agg1.trip_instance_key, agg1.both)
   
    # One row for each route/trip id
    agg2 = (agg1
            .groupby(['gtfs_dataset_key','trip_id'], observed=True, group_keys=False)
            .agg({'pts_not_in_sjoin':'sum','sjoin_pts':'sum'})
            .reset_index()
           )
    
    print(f"{len(agg1)-len(agg2)} rows dropped, now {len(agg2)} rows")
    
    # Add some additional columns
    agg2 = agg2.fillna(0)
    agg2['all_pts'] = agg2.pts_not_in_sjoin + agg2.sjoin_pts
    agg2['percent_of_pts'] = agg2.sjoin_pts/agg2.all_pts * 100
    
    # Filter out vpidx already found 
    new_usable_vps = merge1[['vp_idx']].drop_duplicates().compute()
    new_usable_vps = new_usable_vps.vp_idx.to_list()
    
    end = datetime.datetime.now()
    print(f"Time lapsed: {end-start}")

    return new_usable_vps, agg2

In [37]:
# new_usable_vps, merge1 = percent_sjoin_pts(usable_vp1,vp_to_seg)

In [38]:
# new_usable_vps2, merge2 = percent_sjoin_pts(usable_vp15,vp_to_seg)

#### Full merge
* Not working as intended. Stops right at the final file.

In [39]:
# https://www.geeksforgeeks.org/python-reversing-list/
def reverse(lst):
    new_list = lst[::-1]
    return new_list

In [40]:
def percent_sjoin_all_pts(analysis_date:str):
    
    start = datetime.datetime.now()
    print(start)
    
    # Load usable vp
    usable_vps_og = load_usable_vp(analysis_date)
    
    # Load sjoin
    sjoin_og = load_sjoin(analysis_date)
    
    # Break it apart
    # https://stackoverflow.com/questions/49139371/slicing-out-a-few-rows-from-a-dask-dataframe
    npart = round(len(usable_vps_og)/500_000)
    usable_vps_og = usable_vps_og.repartition(npartitions=npart)
    my_list = [n for n in range(0,npart)]
    
    for i in reverse(my_list):
        my_results = []
        try:
            sjoin_og = sjoin_og[~sjoin_og.vp_idx.isin(remaining_vp_idx_list)].reset_index(drop = True)
        except:
            sjoin_og
            
        # Apply function
        remaining_vp_idx_list, results = percent_sjoin_pts(usable_vps_og.partitions[i], sjoin_og)
        my_results.append(results)
        
        # Save
        results.to_parquet(f"{SEGMENT_GCS}ah_testing/part_{i}.parquet")
        print(f"done with {i}")
        
    
    final = pd.concat(my_results, axis=0).reset_index(drop=True)
    end = datetime.datetime.now()
    print(f"Time lapsed: {end-start}")
    return final
        

In [41]:
# test = percent_sjoin_all_pts(analysis_date)

In [42]:
def find_files(phrase_to_find: str, file_path:str) -> list:
    """
    Grab a list of files that contain the
    phrase inputted. 
    """
    # Create a list of all the files in my folder
    all_files_in_folder = fs.ls(file_path)

    # Grab only files with the string "Verizon_no_coverage_"
    my_files = [i for i in all_files_in_folder if phrase_to_find in i]

    # String to add to read the files
    my_string = "gs://"
    my_files = [my_string + i for i in my_files]
    
    # Extract digit of parquet 
    return my_files


In [43]:
file_names = find_files('part',f"{SEGMENT_GCS}ah_testing/")

In [44]:
all_file_numbers = []

In [45]:
for file in file_names:
    # https://stackoverflow.com/questions/11339210/how-to-get-integer-values-from-a-string-in-python
        file_number = "".join(i for i in file if i.isdigit())
        all_file_numbers.append(file_number)
      

In [46]:
dataframes_list = []

In [47]:
#for i in all_file_numbers:
    #temp_df = dd.read_parquet(f"{SEGMENT_GCS}ah_testing/part_{i}.parquet")
    #dataframes_list.append(temp_df)

In [48]:
# final = dd.concat(dataframes_list, axis=0).reset_index(drop=True)

In [49]:
# final.head()

In [50]:
# final.shape

In [51]:
# final_less_than_100 = final[final.percent_of_pts != 100].reset_index(drop = True)

In [52]:
# final_less_than_100.shape

In [53]:
# final_less_than_100.percent_of_pts.describe()

In [54]:
# final = pd.merge(final , crosswalk, on = ['trip_id','gtfs_dataset_key'], how = 'left')

In [55]:
# final.percent_of_pts.describe()

#### Check for missing operators/trips

In [56]:
# final.gtfs_dataset_key.nunique(), final.trip_id.nunique()

In [57]:
usable_vp = usable_vp.compute()

In [58]:
usable_vp.gtfs_dataset_key.nunique(), usable_vp.trip_id.nunique()

(89, 70978)

In [59]:
# og = set(usable_vp.trip_id.unique().tolist())
# agg = set(final.trip_id.unique().tolist())


In [60]:
# len(og-agg)

### Fix 2 - Keep timestamps nearby if there aren't enough points
* Use the flagged_df to see which routes have a very low % of points.
* For those routes without enough points, find the timestamps a few minutes around. 
* How do I know which point is wrong though??
* Can use stage3 dataframe, since those have something wrong with them...

In [61]:
def route_most_populated(flagged:pd.DataFrame)-> pd.DataFrame:
    """
    For each route, the "quality" of vehicle positions varies by trip.
    Find the trip with the highest percentage of "ok" rows. Ok is defined
    as a row with non-zero values populated for meters_elapsed and 
    sec_elapsed
    
    Args:
        flagged: df from categorize_meters_speeds_pandas()
    """
    # First aggregation to count number of stops by flag
    agg1 = (flagged
        .groupby(['gtfs_dataset_key','shape_array_key','trip_id','flag'])
        .agg({'stop_sequence':'nunique'})
        .rename(columns = {'stop_sequence':'number_of_rows'})
        .reset_index()
       )
    
    # Create separate cols for the number of rows that are ok and rows that are division by 0
    # https://stackoverflow.com/questions/49161120/set-value-of-one-pandas-column-based-on-value-in-another-column
    agg1['division_by_zero'] = None
    agg1['ok'] = None
    agg1['division_by_zero'] = np.where(agg1.flag == 'division by 0', agg1.number_of_rows, agg1.division_by_zero)
    agg1['ok'] = np.where(agg1.flag != 'division by 0', agg1.number_of_rows, agg1.ok)
    agg1['division_by_zero'] = agg1['division_by_zero'].fillna(0)
    agg1['ok'] = agg1['ok'].fillna(0)
    
    # Aggregate again to simplify the df 
    agg1 = agg1.drop(columns = ['flag','number_of_rows'])
    agg2 = (agg1
            .groupby(['gtfs_dataset_key','shape_array_key','trip_id'])
            .agg({'division_by_zero':'sum','ok':'sum'})
            .reset_index()
           )
    
    # Find total rows for that trip
    agg2['total_rows'] = agg2.division_by_zero + agg2.ok
    
    # Find total % of rows that are ok
    agg2['percent_of_ok_rows'] = (agg2.ok/agg2.total_rows * 100)
    
    return agg2

In [62]:
trips_more_points = route_most_populated(flagged)

In [63]:
trips_more_points.shape

(70244, 7)

In [64]:
trips_more_points.head()

Unnamed: 0,gtfs_dataset_key,shape_array_key,trip_id,division_by_zero,ok,total_rows,percent_of_ok_rows
0,00accf770009aafd5dc103ff2eeddb37,01757ff15f8d471eecfe9cf0bec4d039,t_5634867_b_80735_tn_0,9,62,71,87.32
1,00accf770009aafd5dc103ff2eeddb37,01757ff15f8d471eecfe9cf0bec4d039,t_5634886_b_80735_tn_0,0,12,12,100.0
2,00accf770009aafd5dc103ff2eeddb37,01757ff15f8d471eecfe9cf0bec4d039,t_5635016_b_80735_tn_0,10,70,80,87.5
3,00accf770009aafd5dc103ff2eeddb37,01757ff15f8d471eecfe9cf0bec4d039,t_5635109_b_80735_tn_0,5,65,70,92.86
4,00accf770009aafd5dc103ff2eeddb37,01757ff15f8d471eecfe9cf0bec4d039,t_5635110_b_80735_tn_0,11,69,80,86.25


In [65]:
average_per_route = (trips_more_points
                     .groupby(['gtfs_dataset_key','shape_array_key'])
                     .agg({'percent_of_ok_rows':'mean'}).reset_index()
                     .rename(columns = {'percent_of_ok_rows':'avg_percent_ok_rows'})
                    )

In [66]:
average_per_route.shape

(4114, 3)

In [67]:
# merge
merge1 = pd.merge(trips_more_points,average_per_route, on =['gtfs_dataset_key','shape_array_key'], how = "inner") 

In [68]:
len(merge1[merge1.percent_of_ok_rows != 100])

46181

In [69]:
# https://stackoverflow.com/questions/63445182/how-to-compare-two-columns-using-pandas
# maybe I shouldn't filter this?? find all rows that aren't 100.
comparison_column = np.where(merge1["percent_of_ok_rows"] >= merge1["avg_percent_ok_rows"],0,1)

In [70]:
merge1['comparison'] = comparison_column

In [71]:
rows_fix2 = merge1[merge1.comparison == 1].reset_index(drop = True)

In [72]:
rows_fix2.shape

(22558, 9)

In [73]:
fix2_routes = list(rows_fix2.shape_array_key.unique())

In [74]:
fix2_trips = list(rows_fix2.trip_id.unique())

In [75]:
fix2_keys = list(rows_fix2.gtfs_dataset_key.unique())

In [76]:
len(fix2_routes), len(fix2_trips), len(fix2_keys)

(2565, 21739, 61)

In [77]:
rows_fix2.sample(5)

Unnamed: 0,gtfs_dataset_key,shape_array_key,trip_id,division_by_zero,ok,total_rows,percent_of_ok_rows,avg_percent_ok_rows,comparison
1421,2f15f573b6bb4c856fd31175aa24a342,5523d85a07e35f44a543e32d346a5a45,40001,2,22,24,91.67,95.08,1
2712,4340d7ccf61d3a169068ffef1e310daf,b4840044dca249c22e9ea8b75768e9f5,t33E-b10-sl2,4,11,15,73.33,81.68,1
13256,a4f6fd5552107e05fe9743ac7cce2c55,92ae9763f0123204d152619f1d16252a,17100248,11,30,41,73.17,74.64,1
8086,65d9589130415c685b89f4f7c2d8bd7e,f38f13e9bfc2c3b4e07469ac92a6d841,10125001521233-JUNE23,24,26,50,52.0,81.55,1
12319,a4f6fd5552107e05fe9743ac7cce2c55,51cb4f5382fedaca0065818f12eb77ba,17073374,9,9,18,50.0,51.33,1


In [78]:
# Find first and last point or use vp usable??
usable_vp.shape

(14579242, 11)

#### Filter the trips that need to be fixed.

In [79]:
usable_vp2 = usable_vp[(usable_vp.trip_id.isin(fix2_trips))].reset_index(drop = True)

In [80]:
divide_by_zero2 = divide_by_zero[divide_by_zero.trip_id.isin(fix2_trips)].reset_index(drop = True)

In [90]:
usable_vp2.trip_id.nunique(), divide_by_zero2.trip_id.nunique()

(21739, 21739)

In [91]:
len(divide_by_zero2), len(usable_vp2)

(185760, 4062461)

#### Test with one trip
* Mapped in example 1
* Duplicate the next that has a changed location within reason

In [82]:
test_trip = "t120-sl9-p84-r1A"

In [89]:
test_shape = "21aedea4c0d05c570e042903cc62c9cf"

In [84]:
rows_fix2[rows_fix2.trip_id == test_trip]

Unnamed: 0,gtfs_dataset_key,shape_array_key,trip_id,division_by_zero,ok,total_rows,percent_of_ok_rows,avg_percent_ok_rows,comparison
16335,bc21582ac1ad2b7de0647dfd5b76f67c,21aedea4c0d05c570e042903cc62c9cf,t120-sl9-p84-r1A,3,10,13,76.92,86.16,1


In [85]:
test_divide_zero = divide_by_zero2[divide_by_zero2.trip_id == test_trip]

In [86]:
test_zero_stop_seq = list(test_divide_zero.stop_sequence.unique())

In [115]:
test_flagged = flagged[flagged.trip_id == test_trip]

In [137]:
test_flagged.flag.value_counts()

ok                  7
division by 0       3
meters too low      2
seconds too high    1
Name: flag, dtype: int64

In [142]:
seq_to_fix = test_flagged[test_flagged.flag == 'division by 0'][['stop_sequence']].drop_duplicates()

In [139]:
seq_to_fix

Unnamed: 0,stop_sequence
311049,200
311073,270
311096,360


##### Usable

In [92]:
test_usable_vp = usable_vp2[usable_vp2.trip_id == test_trip]

In [93]:
test_usable_vp.shape

(125, 11)

In [114]:
test_usable_vp.sample()

Unnamed: 0,_gtfs_dataset_name,schedule_gtfs_dataset_key,trip_id,trip_instance_key,location_timestamp,location_timestamp_local,hour,gtfs_dataset_key,x,y,vp_idx
57,Petaluma Vehicle Positions,ddad56d2731ac6296304cecfba77d88e,t120-sl9-p84-r1A,5c58cdd4b2302bee66f69218ad1baa0a,2023-07-12 23:09:12+00:00,2023-07-12 16:09:12,16,0a17ae4ebd7b4570cbb8e63a44b49536,-122.67,38.27,61135


##### Pared

In [94]:
# test_vpidx = list(test_usable_vp.vp_idx.unique())

In [95]:
PARED = STOP_SEG_DICT["stage3"]

In [96]:
normal = pd.read_parquet(f"{SEGMENT_GCS}{PARED}_normal_{analysis_date}")

In [97]:
special = pd.read_parquet(f"{SEGMENT_GCS}{PARED}_special_{analysis_date}")

In [98]:
all_pared = pd.concat([normal, special], axis=0)

In [99]:
all_pared = all_pared.sort_values(['gtfs_dataset_key','shape_array_key','trip_id','stop_sequence']).reset_index()

In [100]:
test_pared =  all_pared[(all_pared.trip_id == test_trip)]

In [101]:
test_pared.shape

(26, 14)

In [102]:
test_pared.columns

Index(['__null_dask_index__', 'vp_idx', '_gtfs_dataset_name',
       'schedule_gtfs_dataset_key', 'trip_id', 'trip_instance_key',
       'location_timestamp', 'location_timestamp_local', 'hour',
       'gtfs_dataset_key', 'x', 'y', 'shape_array_key', 'stop_sequence'],
      dtype='object')

In [103]:
# https://stackoverflow.com/questions/53065104/how-can-i-subtract-3-hours-from-a-datetime-in-a-pandas-dataframe-column
test_pared['add_min'] = test_pared.location_timestamp_local + timedelta(minutes=5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [122]:
test_pared['location_sum'] = test_pared.x.astype(str) + test_pared.y.astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [124]:
test_one_seq = test_pared[test_pared.stop_sequence == 200]

In [125]:
test_one_seq.sample()

Unnamed: 0,__null_dask_index__,vp_idx,_gtfs_dataset_name,schedule_gtfs_dataset_key,trip_id,trip_instance_key,location_timestamp,location_timestamp_local,hour,gtfs_dataset_key,x,y,shape_array_key,stop_sequence,add_min,loc,location_sum
3874851,4342,11573428,Bay Area 511 Petaluma VehiclePositions,5ecac12d61709ec34b5f30a34fdbab64,t120-sl9-p84-r1A,3016185266a75e3365d76454874cc68e,2023-07-12 23:00:38+00:00,2023-07-12 16:00:38,16,bc21582ac1ad2b7de0647dfd5b76f67c,-122.64,38.26,21aedea4c0d05c570e042903cc62c9cf,200,2023-07-12 16:05:38,-122.6392738.2568,-122.6392738.2568


In [105]:
time_constraint = test_one_seq['location_timestamp_local'] - pd.to_timedelta(2, unit='min')

In [106]:
time_constraint = time_constraint.iloc[0]

In [107]:
time_constraint

Timestamp('2023-07-12 15:58:38')

In [126]:
loc_constraint = test_one_seq.location_sum.iloc[0]

In [127]:
loc_constraint

'-122.6392738.2568'

In [128]:
intersect_df2 = test_pared[(test_pared.location_timestamp_local <= time_constraint) & (test_pared.location_sum != loc_constraint)] 

In [132]:
intersect_df2 = intersect_df2.sort_values(['gtfs_dataset_key','shape_array_key','stop_sequence'])

In [134]:
# Keep only last 
intersect_df2 = intersect_df2.tail(1)

In [None]:
intersect_df2.

In [135]:
intersect_df2

Unnamed: 0,__null_dask_index__,vp_idx,_gtfs_dataset_name,schedule_gtfs_dataset_key,trip_id,trip_instance_key,location_timestamp,location_timestamp_local,hour,gtfs_dataset_key,x,y,shape_array_key,stop_sequence,add_min,loc,location_sum
3874847,19033,11573421,Bay Area 511 Petaluma VehiclePositions,5ecac12d61709ec34b5f30a34fdbab64,t120-sl9-p84-r1A,3016185266a75e3365d76454874cc68e,2023-07-12 22:58:08+00:00,2023-07-12 15:58:08,15,bc21582ac1ad2b7de0647dfd5b76f67c,-122.63,38.25,21aedea4c0d05c570e042903cc62c9cf,120,2023-07-12 16:03:08,-122.6310838.252224,-122.6310838.252224


### Investigate - Don't Delete

In [None]:
stop

In [None]:
# stage3_df = speed_utils.flag_stage3(divide_by_zero, analysis_date)

In [None]:
stage3_df.shape_array_key.nunique()

In [None]:
sort_by = ['_gtfs_dataset_name','shape_array_key','trip_id','stop_sequence']

In [None]:
preview_cols = sort_by + ['stop_id','gtfs_dataset_key','location_timestamp_local','pair','stage3_flag']

#### Find routes with a lot of n_trips that need to be evaluated.

In [None]:
shape_trips.head(5)

In [None]:
def count_all_pts_sjoin(flagged: pd.DataFrame, gtfs_key:str, trip:str, route:str):
    unique_trips = speed_utils.import_unique_trips(
    gtfs_key,trip,route)
    
    all_pts = speed_utils.import_vehicle_positions(
    unique_trips, gtfs_key, trip)
    
    segs = speed_utils.import_segments(flagged,route, gtfs_key, trip)
    
    sjoin = speed_utils.sjoin_vp_segments(segs, all_pts)
    print(f"{all_pts.geometry.nunique()} points for all vehicle positions, {sjoin.geometry_left.nunique()} after sjoin.")

#### Example Trip 1
* Understanding the result from flag_stage3().
* Looking at AC Transit: stop sequences 6 and 7 have different stop_ids. However, their time stamps and locations are the same.
* It looks like they share the same point.
* However, this isn't due to paring too many points: there just aren't enough points to choose from in the raw data.
* Also sequence 2 is extremely long.

In [None]:
test_route1 = "21aedea4c0d05c570e042903cc62c9cf"
test_gtfs_key1 = "bc21582ac1ad2b7de0647dfd5b76f67c"
test_trip1 = "t120-sl9-p84-r1A"

In [None]:
# % of trips with problematic rows for this route
trips_count[trips_count.shape_array_key == test_route1]

In [None]:
speed_utils.original_df_rows(flagged, test_trip1, test_route1)

In [None]:
speed_utils.stage2_trouble_shooting(flagged_df= stage3_df,
                        date = analysis_date,
                        route = test_route1,
                        trip = test_trip1,
                        gtfs_key = test_gtfs_key1)

#### Example Trip 2
* Route is missing part of the segment?

In [None]:
test_route2 = "2fcc8c55ad61684b2e73860522d0626b"
test_gtfs_key2 = "0faa34840bb65e96b7f83b7f379c2edd"
test_trip2 = "1_Trip4_H_COVID"

In [None]:
merge4[merge4.trip_id == test_trip2][preview_cols2]

In [None]:
# Original number of rows for this trip
#len(flagged[(flagged.trip_id == test_trip2) & (flagged.shape_array_key == test_route2)])

In [None]:
# Rows with zeroes...a lot of them.
#len(divide_by_zero[(divide_by_zero.trip_id == test_trip2) & (divide_by_zero.shape_array_key == test_route2)])

In [None]:
#stage3_df[(stage3_df.stage3_flag != 'check in stage 2') & (stage3_df.shape_array_key == test_route2) 
#   & (stage3_df.stop_sequence.isin([45,46]))].sort_values(by =sort_by)[preview_cols]

In [None]:
# % of trips with problematic rows for this route, lots of them
#trips_count[trips_count.shape_array_key == test_route2]

In [None]:
#speed_utils.original_df_rows(m1, test_trip2, test_route2)

In [None]:
count_all_pts_sjoin(stage3_df, test_gtfs_key2, test_trip2, test_route2)

In [None]:
86/125

In [None]:
speed_utils.stage2_trouble_shooting(flagged_df= stage3_df,
                     date = analysis_date,
                       route = test_route2,
                    trip = test_trip2,
                      gtfs_key = test_gtfs_key2,)

In [None]:
# See which trip has the most rows that are ok for this route
#route_most_populated_df[route_most_populated_df.shape_array_key == test_route2]

In [None]:
# Plot the trip from this route with the highest % of ok rows
#speed_utils.stage2_trouble_shooting(flagged_df= stage3_df,
#                        date = analysis_date,
#                        route = test_route2,
 #                       trip = "10294000051724-DEC22",
 #                       gtfs_key = test_gtfs_key2,)

#### Example Trip 3
* One results says 100% of usable vps are sjoined, othe says only 60%
* Which one is more accurate? Seems like result from my second test is more accurate (a1sjoin)

In [None]:
shape_key3 = "7522dcf861b71950ebce7fc12d3b7e61"
gtfs_key3 = "0af37e731f00a843fb9a0fe286f8d958"
test_trip3 = "1093534"

In [None]:
merge4[merge4.trip_id == test_trip3][preview_cols2]

In [None]:
count_all_pts_sjoin(stage3_df, gtfs_key3, test_trip3, shape_key3)

In [None]:
# stage3_df[(stage3_df.shape_array_key == shape_key3) & (stage3_df.trip_id == test_trip3)].sort_values(by =sort_by)[preview_cols]

In [None]:
# speed_utils.original_df_rows(flagged, test_trip3, gtfs_key3)

In [None]:
speed_utils.stage2_trouble_shooting(flagged_df= stage3_df,
                       date = analysis_date,
                       route = shape_key3,
                       trip = test_trip3,
                       gtfs_key = gtfs_key3,)

In [None]:
# Trip from this route with the most ok rows
#route_most_populated_df[route_most_populated_df.shape_array_key == test_route3]

In [None]:
# Test the trip with the most ok rows
#speed_utils.stage2_trouble_shooting(flagged_df= stage3_df,
#                        date = analysis_date,
#                        route = test_route3,
#                        trip = "16938341",
 #                       gtfs_key = test_gtfs_key3,)

In [None]:
#speed_utils.original_df_rows(flagged, "16938341", test_route3)

#### Example Trip 4
* See which result is more accurate.
* SEems like second one is more accurate

In [None]:
gtfs_key4 = "00e412908245377894949d292fb79610"
trip_id4= "t_1524229_b_30719_tn_0"
shape_key4 = "0d53f0e2ed64d9ce4e0c8e63aa102a41"

In [None]:
merge4[merge4.trip_id == trip_id4][preview_cols2]

In [None]:
count_all_pts_sjoin(stage3_df, gtfs_key4, trip_id4, shape_key4)

In [None]:
82/84

In [None]:
speed_utils.original_df_rows(flagged, trip_id4, shape_key4)

In [None]:
# Test the trip with the most ok rows
speed_utils.stage2_trouble_shooting(flagged_df= stage3_df,
                        date = analysis_date,
                        route = shape_key4,
                        trip = trip_id4,
                        gtfs_key = gtfs_key4,)

#### Example Trip 5
* Checking out the difference
* Segments are not all captured.

In [None]:
merge4[merge4.trip_id == "1093295"][preview_cols2]

In [None]:
route5 = "df6aff9f6c51360bdf4819865e53681d"
operator5 = "0af37e731f00a843fb9a0fe286f8d958"
trip5 = "1093295"

In [None]:
count_all_pts_sjoin(stage3_df, operator5, trip5, route5)

In [None]:
149/252

In [None]:
# Test the trip with the most ok rows
speed_utils.stage2_trouble_shooting(flagged_df= stage3_df,
                        date = analysis_date,
                        route = route5,
                        trip = trip5,
                        gtfs_key = operator5,)

#### Example Trip 6
*  231% of positions retained, 104 vehicle positions in sjoin but 45 in original. 
* Also part of the segments is missing

In [None]:
route6= "ca68b32661ba4f531c66249bfe6a78e9"
operator6 = "3b0ddd2a33e5998da450917623a5c545"
trip6 = "t_5653186_b_30571_tn_1"

In [None]:
merge1[(merge1.trip_id == trip6)]

In [None]:
count_all_pts_sjoin(stage3_df, operator6, trip6, route6)

In [None]:
81/110

In [None]:
speed_utils.stage2_trouble_shooting(flagged_df= stage3_df,
                        date = analysis_date,
                        route = route6,
                        trip = trip6,
                        gtfs_key = operator6,)

#### Example Trip 7

In [None]:

route7= "377aa54acee92ffd7de6c9f15cb98f3f"
operator7 = "1aec012cf85cb59b80880a01b2d1b1ef"
trip7 = "3098"

In [None]:
merge1[(merge1.gtfs_dataset_key == "1aec012cf85cb59b80880a01b2d1b1ef") & (merge1.trip_id == "3098")]

In [None]:
count_all_pts_sjoin(stage3_df, operator7, trip7, route7)

In [None]:
67/70

In [None]:
speed_utils.stage2_trouble_shooting(flagged_df= stage3_df,
                        date = analysis_date,
                        route = route7,
                        trip = trip7,
                        gtfs_key = operator7,)

### Draft

In [None]:
stop running

#### Test 1
* Breaking apart usable_vp to be similar in format as what I did with A2 Sjoin

In [None]:
test = parted_usable_vp.partitions[0]

In [None]:
test1 = parted_usable_vp.partitions[1]

In [None]:
test = test.assign(
    identifier = test.gtfs_dataset_key.astype(str) + '/' + test.trip_id.astype(str)
)

In [None]:
# test = test.drop_duplicates(subset = ['vp_idx']) then do a count

In [None]:
# Using 
test = test.groupby(['identifier']).vp_idx.nunique()

In [None]:
# Using 
test1 = test1.groupby(['gtfs_dataset_key','trip_id']).vp_idx.nunique()

In [None]:
test1 = test1.compute()

In [None]:
test1.head

In [None]:
test1 = test1.reset_index()

In [None]:
test1.vp_idx.describe()

In [None]:
test = test.compute()

In [None]:
test.head()

In [None]:
test = test.to_frame()

In [None]:
test = test.reset_index()

In [None]:
test["gtfs_dataset_key"] = test["identifier"].str.split("/").str[0]

In [None]:
test["trip_id"] = test["identifier"].str.split("/").str[1]

In [None]:
test.head()

In [None]:
test.vp_idx.describe()

### A2 Sjoin
* Valid trips that are `sjoin` to segments.

In [None]:
USABLE_VP = STOP_SEG_DICT["stage1"]
INPUT_FILE_PREFIX = STOP_SEG_DICT["stage2"]
GROUPING_COL = STOP_SEG_DICT["grouping_col"]

In [None]:
normal_shapes = A2_valid_vehicle_positions.identify_stop_segment_cases(
        analysis_date, GROUPING_COL, 0)

In [None]:
abnormal_shapes = A2_valid_vehicle_positions.identify_stop_segment_cases(
        analysis_date, GROUPING_COL, 1)

In [None]:
# datetime.datetime.now() 

In [None]:
# Using A2 since I need the VPs to sjoin
# Only the pared down stops are saved out
# This takes a super long time
"""
usable_abnormal_vp = A2_valid_vehicle_positions.merge_usable_vp_with_sjoin_vpidx(
        abnormal_shapes,
        f"{USABLE_VP}_{analysis_date}",
        f"{INPUT_FILE_PREFIX}_{analysis_date}",
        GROUPING_COL
    )

 """   

In [None]:
#usable_abnormal_vp = usable_abnormal_vp.assign(
#    identifier = usable_abnormal_vp.gtfs_dataset_key.astype(str) + '/' + usable_abnormal_vp.shape_array_key.astype(str) + '/' + usable_abnormal_vp.trip_id.astype(str)
#)

In [None]:
#datetime.datetime.now() 

In [None]:
#agg = usable_abnormal_vp.groupby(['identifier']).vp_idx.nunique()

In [None]:
# agg = agg.compute()

In [None]:
#agg2 = agg.to_frame()

In [None]:
#agg3 = agg2.reset_index()

In [None]:
#agg3.shape

In [None]:
#agg3.sample()

In [None]:
#agg3["gtfs_dataset_key"] = agg3["index"].str.split("/").str[0]

In [None]:
#agg3["trip_id"] = agg3["index"].str.split("/").str[2]

In [None]:
#agg3 = agg3.drop(columns = ['index'])

In [None]:
#agg3 = agg3.rename(columns = {'vp_idx':'number_of_unique_vps_sjoin'})

In [None]:
#agg3.to_parquet(
#    f"{SEGMENT_GCS}special_vps_testing.parquet"
#)

In [None]:
#agg3 = pd.read_parquet(f"{SEGMENT_GCS}normalvps_testing.parquet")

In [None]:
normal = pd.read_parquet(f"{SEGMENT_GCS}normal_vps_testing.parquet")

In [None]:
normal = normal.drop(columns = ['identifier'])

In [None]:
special = pd.read_parquet(f"{SEGMENT_GCS}special_vps_testing.parquet")

In [None]:
sjoin_results = pd.concat([special, normal])

In [None]:
sjoin_results.sample()

In [None]:
stop running

In [None]:
# VP all without paring
STG_0_FILE = STOP_SEG_DICT['stage0']
    

In [None]:
f"{STG_0_FILE}_{analysis_date}/"

In [None]:
all_pts = helpers.import_vehicle_positions(
            SEGMENT_GCS,
            f"{STG_0_FILE}_{analysis_date}/",
            "df",
            columns = ["gtfs_dataset_key", "trip_id","geometry"],
            partitioned = False
        )
all_pts = all_pts.compute()

In [None]:
all_pts.sample(5)

In [None]:
all_pts2 = (all_pts
        .groupby(['gtfs_dataset_key','trip_id'])
        .agg({'geometry':'nunique'}).reset_index()
        .rename(columns = {'geometry':'number_of_unique_vps_all_pts'})
        .reset_index(drop = True)
       )

In [None]:
len(all_pts[(all_pts.gtfs_dataset_key == "cdd2ad81863b6d4ad51676a1cb781ea8") & (all_pts.trip_id == "10686020")])

In [None]:
all_pts2[(all_pts2.gtfs_dataset_key == "cdd2ad81863b6d4ad51676a1cb781ea8") & (all_pts2.trip_id == "10686020")]

In [None]:
all_pts2[(all_pts2.gtfs_dataset_key == "593953c37ce48a9449bb90808ba0c1e1") & (all_pts2.trip_id == "t5FE-b3EA-sl13")]

In [None]:
all_pts2[(all_pts2.gtfs_dataset_key == "65d9589130415c685b89f4f7c2d8bd7e") & (all_pts2.trip_id == "10035002751332-DEC22")]

In [None]:
# How did this pop up in the grouped df if it doesn't even exist??
all_pts[(all_pts.gtfs_dataset_key == "00accf770009aafd5dc103ff2eeddb37") & (all_pts.trip_id == "-1160024104")]

In [None]:
all_pts3 = all_pts.drop(columns = ['geometry']).drop_duplicates().reset_index(drop = True)

In [None]:
len(all_pts3)

In [None]:
all_pts2.shape

In [None]:
len(all_pts)

##### Merge Sjoin with All Vehicles

In [None]:
# Merge to get rid of weird rows??
# These are all points 
pd.merge(all_pts3, all_pts2, on = ['gtfs_dataset_key','trip_id'], how = 'outer', indicator = True)[['_merge']].value_counts()

In [None]:
merge1 = pd.merge(all_pts3, all_pts2, on = ['gtfs_dataset_key','trip_id'], how = 'inner')

In [None]:
merge1.sample()

In [None]:
merge1.shape

In [None]:
# These are sjoin positions
agg3.sample()

In [None]:
agg3.shape

In [None]:
pd.merge(merge1, agg3, on = ['gtfs_dataset_key','trip_id'], how = 'outer', indicator = True)[['_merge']].value_counts()

In [None]:
merge2 = pd.merge(merge1, agg3, on = ['gtfs_dataset_key','trip_id'], how = 'inner')

In [None]:
merge2.sample(5)

#### Merge test1 with A2sjoin results

In [None]:
len(sjoin_results)

In [None]:
sjoin_results.gtfs_dataset_key.nunique(),sjoin_results.trip_id.nunique()

In [None]:
test.gtfs_dataset_key.nunique(),test.trip_id.nunique()

In [None]:
test.sample()

In [None]:
sjoin_results.sample()

In [None]:
# How is it possible there are some that are right_only?
# Lots of left only results b/c this is the full df. 
# test is just a subset
pd.merge(sjoin_results, test, on = ['gtfs_dataset_key','trip_id'], how = 'outer', indicator = True)[['_merge']].value_counts()

In [None]:
# Should be inner
merge1 =  pd.merge(test, sjoin_results,  on = ['gtfs_dataset_key','trip_id'], how = 'left')

In [None]:
merge1.sample()

In [None]:
len(merge1)

In [None]:
merge1['percent_of_positions_retained'] = merge1.number_of_unique_vps_sjoin/merge1.vp_idx * 100

In [None]:
merge1.percent_of_positions_retained.describe()

In [None]:
merge1 = merge1.fillna(0)

##### Why are 94 of the rows over 100??
* Same point can joined to the segment. 
* Use nunique

In [None]:
merge1.sample(5)

In [None]:
len(merge1[merge1.percent_of_positions_retained > 100])

In [None]:
len(merge1[merge1.percent_of_positions_retained == 100])

In [None]:
merge1[merge1.percent_of_positions_retained > 100].sample(5)

#### Compare results

In [None]:
agg2.sample()

In [None]:
merge1.sample()

In [None]:
pd.merge(agg2, merge1, on = ['gtfs_dataset_key','trip_id'], how = 'outer', indicator = True)[['_merge']].value_counts()

In [None]:
merge4 = pd.merge(agg2, merge1, on = ['gtfs_dataset_key','trip_id'], how = 'inner')

In [None]:
len(merge4)

In [None]:
merge4.sample()

In [None]:
merge4['difference'] = (merge4.percent_of_pts - merge4.percent_of_positions_retained)

In [None]:
merge4.difference.describe()

In [None]:
merge4.percent_of_positions_retained.describe()

In [None]:
preview_cols2 = ['shape_array_key','gtfs_dataset_key', 'trip_id', 'pts_not_in_sjoin', 'sjoin_pts', 'all_pts',
        'percent_of_pts', 'vp_idx', 'number_of_unique_vps_sjoin', 'percent_of_positions_retained', 'difference']

In [None]:
crosswalk.sample()

In [None]:
merge4.sample()

In [None]:
pd.merge(merge4, crosswalk, on =['gtfs_dataset_key','trip_id'], how = 'outer', indicator = True)[['_merge']].value_counts()

In [None]:
merge4 = pd.merge(merge4, crosswalk, on =['gtfs_dataset_key','trip_id'], how = 'left')