In [1]:
import datetime
import _speed_utils as speed_utils
import dask.dataframe as dd
import numpy as np
import geopandas as gpd
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers, segment_calcs,sched_rt_utils
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    PROJECT_CRS,
    SEGMENT_GCS,
    analysis_date,
)
from scripts import (A1_sjoin_vp_segments, A2_valid_vehicle_positions)
from shared_utils import calitp_color_palette as cp


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
CONFIG_PATH = './scripts/config.yml'

In [4]:
STOP_SEG_DICT = helpers.get_parameters(CONFIG_PATH, "stop_segments")

In [5]:
analysis_date = '2023-07-12'

### Observations (7/12)
* Step 1: Flagging
    * There are 2,704,812 rows in the original dataframe. About 10% of those rows are flagged as having zeroes in both meters elapsed and seconds elapsed.
    * There are around 4566 routes. About 57% of these routes had at least one trip with one or more rows flagged as zero.
* I took 2 passes at trying to understand why both these columns recorded zeroes.

* Step 2: `vp_pared_stops`. 
    * I grouped `vp_pared_stops` by 'shape_array_key','trip_id', and 'location_timestamp_local' OR `x` and `y`. I counted the number of unique stop sequences after grouping. If this trio had more than one unique stop sequence, that meant the timestamp or location recorded between sequences was duplicated.
    * Only around 9% of rows were flagged as having (obviously) repeated timestamps and locations. 
    * For all of these rows, both the timestamp and location were duplicated. 
    * All the routes that were flagged in step one needed a further look in step 3.
    
* Step 3: `vp_usable`
    * For one route and trip, find: all the recorded vehicle positions, sjoin of vps to segments,
    and the first and last points kept. 
    * Plot the three gdfs in a map to visually inspect what's happening.
    * Compare the sample route and trips with the trip with the highest percentage of non division by 0 rows to see what's going on. 
    
* Buckets of errors (all based on `stage0 vp`). 
    * There is only one recorded point in that segment in the raw data.
        * Fix: use the timestamp that comes after it.
        * <img src= "./speeds_images/only_one_pt_collected.png" width = 300>
    * Points are shared between segments
        * Use p20/p50/p80.
        * <img src= "./speeds_images/shared_vp.png" width = 300>
    * Points recorded are really far out and they don't touch the buffered segments.
        * Fix: figure out % of vehicle positions that are too far out.
        * <img src= "./speeds_images/dots_not_on_seg.png" width = 300>
    * No data captured for that segment at all.
        * Use p20/p50/p80. 
        * <img src= "./speeds_images/no_dots_collected.png" width = 300> 

### Flagging

In [6]:
# Flagged: all the rows in the df flagged
# divide_by_zero: only the rows that have 0 for meters and sec elapsed
# trips_count: % of trips with 1+ division by 0 row for a route
# route_most_populated_df: the trip for a route with the smallest % of rows that are divided by 0
# flagged, divide_by_zero, trips_count, route_most_populated_df = speed_utils.flagging_stage(analysis_date)

In [7]:
#divide_by_zero.loop_or_inlining.value_counts()

### Fix 1  - % of vehicle positions that are too far away from the shapes to be joined
* https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/scripts/A0_preprocessing.py

In [8]:
INPUT_FILE_PREFIX = STOP_SEG_DICT["stage0"]

In [9]:
# 14_600_897 rows
original = dd.read_parquet(
        f"{SEGMENT_GCS}{INPUT_FILE_PREFIX}_{analysis_date}.parquet"
    )

In [10]:
# len(original)

#### Crosswalk

In [11]:
TRIP_GROUPING_COLS = STOP_SEG_DICT["trip_grouping_cols"]

In [12]:
crosswalk = sched_rt_utils.crosswalk_scheduled_trip_grouping_with_rt_key(
        analysis_date, 
        ["feed_key", "trip_id"] + TRIP_GROUPING_COLS
    )

In [13]:
crosswalk = crosswalk.compute()

In [14]:
crosswalk.sample()

Unnamed: 0,feed_key,trip_id,shape_array_key,gtfs_dataset_key
12697,fd90fb59d425f917fa504ca6f0f6958c,10993010,fb66cbb0f831849c10a567f83fa4a273,a93588d23992dd2f6456dcdace8b9988


#### A1 sjoin
* Contains the columns:
    * vp_idx
    * shape_array_key
    * stop_sequence

In [15]:
USABLE_VP = STOP_SEG_DICT["stage1"]
INPUT_FILE_PREFIX = STOP_SEG_DICT["stage2"]
GROUPING_COL = STOP_SEG_DICT["grouping_col"]

In [16]:
def load_sjoin(analysis_date:str):
    """
    Read the vp-to-segment spatial join output for one analysis date.

    The parquet location is built from `SEGMENT_GCS`, the `vp_sjoin/` folder,
    the "stage2" prefix stored in `STOP_SEG_DICT`, and `analysis_date`.

    Returns the dask DataFrame produced by `dd.read_parquet`.
    """
    stage2_prefix = STOP_SEG_DICT["stage2"]
    sjoin_path = f"{SEGMENT_GCS}vp_sjoin/{stage2_prefix}_{analysis_date}"
    return dd.read_parquet(sjoin_path)

In [17]:
# This is the end result from A1_sjoin
f"{SEGMENT_GCS}vp_sjoin/{INPUT_FILE_PREFIX}_{analysis_date}"

'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/vp_sjoin/vp_stop_segment_2023-07-12'

In [18]:
# Reuse the loader defined above instead of repeating the path logic;
# it reads the same vp_sjoin/<stage2 prefix>_<analysis_date> parquet.
vp_to_seg = load_sjoin(analysis_date)

In [19]:
# vp_to_seg = vp_to_seg.compute()

In [20]:
# 24_973_725
# len(vp_to_seg)

In [21]:
# 11_350_051
# vp_to_seg.vp_idx.nunique().compute()

#### Usable VPS
* _gtfs_dataset_name	
* schedule_gtfs_dataset_key
* trip_id	
* trip_instance_key	
* location_timestamp	
* location_timestamp_local	
* hour	
* gtfs_dataset_key	
* x	y	
* vp_idx

In [22]:
usable_vp = dd.read_parquet(
        f"{SEGMENT_GCS}{USABLE_VP}_{analysis_date}"
    )

In [23]:
usable_vp = usable_vp.drop_duplicates(subset = ['vp_idx'])

In [24]:
# 14_579_242
# usable_vp.vp_idx.nunique().compute()

In [25]:
# 14,579,242
# len(usable_vp)

In [26]:
# https://stackoverflow.com/questions/49139371/slicing-out-a-few-rows-from-a-dask-dataframe
npart = round(len(usable_vp)/1_000_000)
parted_usable_vp = usable_vp.repartition(npartitions=npart)

In [27]:
#npart

In [28]:
#my_list = [n for n in range(0,npart)]

In [29]:
def load_usable_vp(analysis_date:str):
    """
    Read the "stage1" vehicle positions for one analysis date and keep a
    single row per `vp_idx`.

    The parquet location is built from `SEGMENT_GCS`, the "stage1" prefix
    stored in `STOP_SEG_DICT`, and `analysis_date`.

    Returns a dask DataFrame de-duplicated on `vp_idx`.
    """
    stage1_prefix = STOP_SEG_DICT["stage1"]
    vp_path = f"{SEGMENT_GCS}{stage1_prefix}_{analysis_date}"

    return dd.read_parquet(vp_path).drop_duplicates(subset = ['vp_idx'])

In [30]:
usable_vp1 = parted_usable_vp.partitions[0]

#### Merge test2 with A1sjoin

In [31]:
# outer_merge = dd.merge(usable_vp1, vp_to_seg, on = ['vp_idx'], how = 'outer', indicator = True)

In [57]:
def percent_sjoin_pts(usable_vp: dd.DataFrame, sjoin_vps: dd.DataFrame):
    """
    Count, for every operator/trip, how many vehicle positions (vps) were
    matched to a segment by the spatial join and how many were not.

    Args:
        usable_vp: vps to evaluate. The merged frame must carry `vp_idx`,
            `gtfs_dataset_key`, `trip_id` and `trip_instance_key`.
        sjoin_vps: spatial join output, matched to `usable_vp` on `vp_idx`.

    Returns:
        A tuple `(vp_idx_list, summary)`:
        * vp_idx_list: list of the unique `vp_idx` values in `usable_vp`
          (i.e. the vps that have now been processed).
        * summary: pandas DataFrame with one row per
          gtfs_dataset_key/trip_id and the columns `pts_not_in_sjoin`,
          `sjoin_pts`, `all_pts` and `percent_of_pts` (share of the trip's
          vps found in the sjoin, 0-100).

    Side effects:
        Prints the number of rows collapsed by the second groupby and the
        elapsed wall-clock time.
    """
    start = datetime.datetime.now()

    # Left join: rows flagged `left_only` are vps that never made it into
    # the sjoin; rows flagged `both` did.
    merge1 = dd.merge(
        usable_vp, sjoin_vps, on = ['vp_idx'], how = 'left', indicator = True
    )

    # The same point can attach to multiple segments, so keep one row per
    # vp_idx. All duplicates of a vp_idx share the same `_merge` value.
    merge1 = merge1.drop_duplicates(subset = ['vp_idx'])

    # observed=True is required because `_merge` and `gtfs_dataset_key`
    # are categorical.
    agg1 = (merge1.groupby(['gtfs_dataset_key','trip_id','_merge'], observed=True)
                .agg({'trip_instance_key':'count'})
                .reset_index()
                .compute()
           )

    # Split the count into one column per merge status. Using 0 (rather than
    # None) as the filler keeps both columns numeric, so the sums and the
    # percentage below are numeric too.
    agg1['pts_not_in_sjoin'] = np.where(
        agg1._merge == 'left_only', agg1.trip_instance_key, 0
    )
    agg1['sjoin_pts'] = np.where(
        agg1._merge == 'both', agg1.trip_instance_key, 0
    )

    # One row for each operator/trip id
    agg2 = (agg1
            .groupby(['gtfs_dataset_key','trip_id'], observed=True, group_keys=False)
            .agg({'pts_not_in_sjoin':'sum','sjoin_pts':'sum'})
            .reset_index()
           )

    print(f"{len(agg1)-len(agg2)} rows dropped, now {len(agg2)} rows")

    agg2['all_pts'] = agg2.pts_not_in_sjoin + agg2.sjoin_pts
    # A trip whose counts are all 0 yields NaN here instead of raising.
    agg2['percent_of_pts'] = agg2.sjoin_pts/agg2.all_pts * 100

    # vp_idx values handled in this call; `merge1` is already unique on
    # vp_idx, so no second de-duplication is needed.
    new_usable_vps = merge1['vp_idx'].compute().to_list()

    end = datetime.datetime.now()
    print(f"Time lapsed: {end-start}")

    return new_usable_vps, agg2

In [70]:
new_usable_vps, merge1 = percent_sjoin_pts(usable_vp1,vp_to_seg)

ArrowInvalid: Parquet magic bytes not found in footer. Either the file is corrupted or this is not a parquet file.

In [44]:
type(merge1)

dask.dataframe.core.DataFrame

In [None]:
len(merge1)

In [None]:
# Merge with crosswalk
crosswalk.sample()

In [None]:
merge1 = pd.merge(merge1, crosswalk, on = ['trip_id','gtfs_dataset_key'], how = 'left')

In [None]:
merge1.sample(10)

In [None]:
merge1[(merge1.gtfs_dataset_key == "1aec012cf85cb59b80880a01b2d1b1ef") & (merge1.trip_id == "3098")]

In [None]:
merge1[(merge1.trip_id == "t_5653186_b_30571_tn_1")]

In [None]:
merge1[(merge1.trip_id == "t2DB-b1-sl2")]

In [None]:
merge1[(merge1.trip_id == "223")]

In [None]:
merge1[(merge1.trip_id == "1705020")]

#### Check for missing operators/trips

In [None]:
merge1.gtfs_dataset_key.nunique(), merge1.trip_id.nunique()

In [None]:
usable_vp1 = usable_vp1.compute()

In [None]:
usable_vp1.gtfs_dataset_key.nunique(), usable_vp1.trip_id.nunique()

In [None]:
og_gtfs = set(usable_vp1.trip_id.unique().tolist())
agg_gtfs = set(merge1.trip_id.unique().tolist())
og_gtfs - agg_gtfs

#### Full merge

In [52]:
# https://www.geeksforgeeks.org/python-reversing-list/
def reverse(lst):
    """Return a reversed copy of `lst` (built by slicing); the input is not modified."""
    return lst[::-1]

In [59]:
def percent_sjoin_all_pts(analysis_date:str):
    """
    Run `percent_sjoin_pts` over all usable vehicle positions for
    `analysis_date`, one partition of roughly 1 million rows at a time.

    For each partition (processed from last to first) the per-trip summary
    is written to `{SEGMENT_GCS}ah_testing/part_{i}.parquet`. Before each
    subsequent partition, the vp_idx values returned by the previous call
    are removed from the sjoin table.

    Args:
        analysis_date: date string used to locate the input files.

    Returns:
        pandas DataFrame: the summaries of every partition concatenated.
        NOTE(review): partitions are cut by row count, so a trip whose points
        span two partitions may appear once per partition - confirm whether
        a final re-aggregation by gtfs_dataset_key/trip_id is wanted.
    """
    start = datetime.datetime.now()
    print(start)

    # Load usable vp
    usable_vps_og = load_usable_vp(analysis_date)

    # Load sjoin
    sjoin_og = load_sjoin(analysis_date)

    # Break it apart
    # https://stackoverflow.com/questions/49139371/slicing-out-a-few-rows-from-a-dask-dataframe
    # Always keep at least one partition: round() gives 0 for small inputs.
    npart = max(1, round(len(usable_vps_og)/1_000_000))
    usable_vps_og = usable_vps_og.repartition(npartitions=npart)

    # Collect every partition's summary. This list must live outside the
    # loop, otherwise only the last partition survives to the concat.
    my_results = []

    # vp_idx values handled by the previous iteration (none before the first)
    remaining_vp_idx_list = None

    for i in reversed(range(npart)):
        if remaining_vp_idx_list is not None:
            # Drop sjoin rows for vps that were already processed
            sjoin_og = sjoin_og[~sjoin_og.vp_idx.isin(remaining_vp_idx_list)].reset_index(drop = True)

        # Apply function
        remaining_vp_idx_list, results = percent_sjoin_pts(usable_vps_og.partitions[i], sjoin_og)
        my_results.append(results)

        # Save
        results.to_parquet(f"{SEGMENT_GCS}ah_testing/part_{i}.parquet")
        print(f"done with {i}")

    final = pd.concat(my_results, axis=0).reset_index(drop=True)
    end = datetime.datetime.now()
    print(f"Time lapsed: {end-start}")
    return final
        

In [61]:
# test = percent_sjoin_all_pts(analysis_date)

In [66]:
pt14 = pd.read_parquet(f"gs://calitp-analytics-data/data-analyses/rt_segment_speeds/ah_testing/part_14.parquet")

In [69]:
pt14.percent_of_pts.describe()

count   7040.00
mean      85.81
std       27.16
min        0.00
25%       85.11
50%       99.31
75%      100.00
max      100.00
Name: percent_of_pts, dtype: float64

### Investigate - Don't Delete

In [None]:
# Deliberate halt: stops "Run All" here so the investigation cells below are only run by hand.
raise RuntimeError("Stop here: the cells below are meant to be run manually.")

In [None]:
stage3_df = speed_utils.flag_stage3(divide_by_zero, analysis_date)

In [None]:
stage3_df.shape_array_key.nunique()

In [None]:
sort_by = ['_gtfs_dataset_name','shape_array_key','trip_id','stop_sequence']

In [None]:
preview_cols = sort_by + ['stop_id','gtfs_dataset_key','location_timestamp_local','pair','stage3_flag']

#### Find routes with a lot of n_trips that need to be evaluated.

In [None]:
shape_trips = speed_utils.count_trips_routes(stage3_df)

In [None]:
shape_trips.head(5)

In [None]:
def count_all_pts_sjoin(flagged: pd.DataFrame, gtfs_key:str, trip:str, route:str):
    """
    Print how many unique vehicle-position geometries exist for one trip
    before and after they are spatially joined to the trip's segments.

    All loading and joining is delegated to `speed_utils`; this function
    only chains the calls and prints the two `nunique()` counts. It returns
    nothing.
    """
    trip_lookup = speed_utils.import_unique_trips(gtfs_key, trip, route)
    vehicle_positions = speed_utils.import_vehicle_positions(trip_lookup, gtfs_key, trip)
    segments = speed_utils.import_segments(flagged, route, gtfs_key, trip)
    joined = speed_utils.sjoin_vp_segments(segments, vehicle_positions)

    n_all = vehicle_positions.geometry.nunique()
    n_joined = joined.geometry_left.nunique()
    print(f"{n_all} points for all vehicle positions, {n_joined} after sjoin.")

#### Example Trip 1
* Understanding the result from flag_stage3().
* Looking at AC Transit: stop sequences 6 and 7 have different stop_ids. However, their time stamps and locations are the same.
* It looks like they share the same point.
* However, this isn't due to paring too many points: there just aren't enough points to choose from in the raw data.
* Also sequence 2 is extremely long.

In [None]:
test_route1 = "e22aab342fa9be03e18fbbd8fed80659"
test_gtfs_key1 = "c0e3039da063db95ebabd3fe4ee611a4"
test_trip1 = "11359079_M11"

In [None]:
# % of trips with problematic rows for this route
trips_count[trips_count.shape_array_key == test_route1]

In [None]:
speed_utils.original_df_rows(flagged, test_trip1, test_route1)

In [None]:
speed_utils.stage2_trouble_shooting(flagged_df= stage3_df,
                        date = analysis_date,
                        route = test_route1,
                        trip = test_trip1,
                        gtfs_key = test_gtfs_key1)

#### Example Trip 2
* Route is missing part of the segment?

In [None]:
test_route2 = "2fcc8c55ad61684b2e73860522d0626b"
test_gtfs_key2 = "0faa34840bb65e96b7f83b7f379c2edd"
test_trip2 = "1_Trip4_H_COVID"

In [None]:
merge4[merge4.trip_id == test_trip2][preview_cols2]

In [None]:
# Original number of rows for this trip
#len(flagged[(flagged.trip_id == test_trip2) & (flagged.shape_array_key == test_route2)])

In [None]:
# Rows with zeroes...a lot of them.
#len(divide_by_zero[(divide_by_zero.trip_id == test_trip2) & (divide_by_zero.shape_array_key == test_route2)])

In [None]:
#stage3_df[(stage3_df.stage3_flag != 'check in stage 2') & (stage3_df.shape_array_key == test_route2) 
#   & (stage3_df.stop_sequence.isin([45,46]))].sort_values(by =sort_by)[preview_cols]

In [None]:
# % of trips with problematic rows for this route, lots of them
#trips_count[trips_count.shape_array_key == test_route2]

In [None]:
#speed_utils.original_df_rows(m1, test_trip2, test_route2)

In [None]:
count_all_pts_sjoin(stage3_df, test_gtfs_key2, test_trip2, test_route2)

In [None]:
86/125

In [None]:
speed_utils.stage2_trouble_shooting(flagged_df= stage3_df,
                     date = analysis_date,
                       route = test_route2,
                    trip = test_trip2,
                      gtfs_key = test_gtfs_key2,)

In [None]:
# See which trip has the most rows that are ok for this route
#route_most_populated_df[route_most_populated_df.shape_array_key == test_route2]

In [None]:
# Plot the trip from this route with the highest % of ok rows
#speed_utils.stage2_trouble_shooting(flagged_df= stage3_df,
#                        date = analysis_date,
#                        route = test_route2,
 #                       trip = "10294000051724-DEC22",
 #                       gtfs_key = test_gtfs_key2,)

#### Example Trip 3
* One result says 100% of usable vps are sjoined; the other says only 60%.
* Which one is more accurate? Seems like result from my second test is more accurate (a1sjoin)

In [None]:
shape_key3 = "7522dcf861b71950ebce7fc12d3b7e61"
gtfs_key3 = "0af37e731f00a843fb9a0fe286f8d958"
test_trip3 = "1093534"

In [None]:
merge4[merge4.trip_id == test_trip3][preview_cols2]

In [None]:
count_all_pts_sjoin(stage3_df, gtfs_key3, test_trip3, shape_key3)

In [None]:
# stage3_df[(stage3_df.shape_array_key == shape_key3) & (stage3_df.trip_id == test_trip3)].sort_values(by =sort_by)[preview_cols]

In [None]:
# speed_utils.original_df_rows(flagged, test_trip3, gtfs_key3)

In [None]:
speed_utils.stage2_trouble_shooting(flagged_df= stage3_df,
                       date = analysis_date,
                       route = shape_key3,
                       trip = test_trip3,
                       gtfs_key = gtfs_key3,)

In [None]:
# Trip from this route with the most ok rows
#route_most_populated_df[route_most_populated_df.shape_array_key == test_route3]

In [None]:
# Test the trip with the most ok rows
#speed_utils.stage2_trouble_shooting(flagged_df= stage3_df,
#                        date = analysis_date,
#                        route = test_route3,
#                        trip = "16938341",
 #                       gtfs_key = test_gtfs_key3,)

In [None]:
#speed_utils.original_df_rows(flagged, "16938341", test_route3)

#### Example Trip 4
* See which result is more accurate.
* Seems like the second one is more accurate.

In [None]:
gtfs_key4 = "00e412908245377894949d292fb79610"
trip_id4= "t_1524229_b_30719_tn_0"
shape_key4 = "0d53f0e2ed64d9ce4e0c8e63aa102a41"

In [None]:
merge4[merge4.trip_id == trip_id4][preview_cols2]

In [None]:
count_all_pts_sjoin(stage3_df, gtfs_key4, trip_id4, shape_key4)

In [None]:
82/84

In [None]:
speed_utils.original_df_rows(flagged, trip_id4, shape_key4)

In [None]:
# Test the trip with the most ok rows
speed_utils.stage2_trouble_shooting(flagged_df= stage3_df,
                        date = analysis_date,
                        route = shape_key4,
                        trip = trip_id4,
                        gtfs_key = gtfs_key4,)

#### Example Trip 5
* Checking out the difference
* Segments are not all captured.

In [None]:
merge4[merge4.trip_id == "1093295"][preview_cols2]

In [None]:
route5 = "df6aff9f6c51360bdf4819865e53681d"
operator5 = "0af37e731f00a843fb9a0fe286f8d958"
trip5 = "1093295"

In [None]:
count_all_pts_sjoin(stage3_df, operator5, trip5, route5)

In [None]:
149/252

In [None]:
# Test the trip with the most ok rows
speed_utils.stage2_trouble_shooting(flagged_df= stage3_df,
                        date = analysis_date,
                        route = route5,
                        trip = trip5,
                        gtfs_key = operator5,)

#### Example Trip 6
*  231% of positions retained, 104 vehicle positions in sjoin but 45 in original. 
* Also part of the segments is missing

In [None]:
route6= "ca68b32661ba4f531c66249bfe6a78e9"
operator6 = "3b0ddd2a33e5998da450917623a5c545"
trip6 = "t_5653186_b_30571_tn_1"

In [None]:
merge1[(merge1.trip_id == trip6)]

In [None]:
count_all_pts_sjoin(stage3_df, operator6, trip6, route6)

In [None]:
81/110

In [None]:
speed_utils.stage2_trouble_shooting(flagged_df= stage3_df,
                        date = analysis_date,
                        route = route6,
                        trip = trip6,
                        gtfs_key = operator6,)

#### Example Trip 7

In [None]:

route7= "377aa54acee92ffd7de6c9f15cb98f3f"
operator7 = "1aec012cf85cb59b80880a01b2d1b1ef"
trip7 = "3098"

In [None]:
merge1[(merge1.gtfs_dataset_key == "1aec012cf85cb59b80880a01b2d1b1ef") & (merge1.trip_id == "3098")]

In [None]:
count_all_pts_sjoin(stage3_df, operator7, trip7, route7)

In [None]:
67/70

In [None]:
speed_utils.stage2_trouble_shooting(flagged_df= stage3_df,
                        date = analysis_date,
                        route = route7,
                        trip = trip7,
                        gtfs_key = operator7,)

### Fix 2 - Keep timestamps nearby if there aren't enough points

### Draft

In [None]:
# Deliberate halt: stops "Run All" before the draft cells below.
# An explicit raise keeps the notebook parseable (the bare words were a SyntaxError).
raise RuntimeError("Stop running: the cells below are drafts.")

#### Test 1
* Breaking apart usable_vp to be similar in format as what I did with A2 Sjoin

In [None]:
test = parted_usable_vp.partitions[0]

In [None]:
test1 = parted_usable_vp.partitions[1]

In [None]:
test = test.assign(
    identifier = test.gtfs_dataset_key.astype(str) + '/' + test.trip_id.astype(str)
)

In [None]:
# test = test.drop_duplicates(subset = ['vp_idx']) then do a count

In [None]:
# Using 
test = test.groupby(['identifier']).vp_idx.nunique()

In [None]:
# Using 
test1 = test1.groupby(['gtfs_dataset_key','trip_id']).vp_idx.nunique()

In [None]:
test1 = test1.compute()

In [None]:
# Call head() - without the parentheses the cell shows the bound-method repr instead of the first rows.
test1.head()

In [None]:
test1 = test1.reset_index()

In [None]:
test1.vp_idx.describe()

In [None]:
test = test.compute()

In [None]:
test.head()

In [None]:
test = test.to_frame()

In [None]:
test = test.reset_index()

In [None]:
test["gtfs_dataset_key"] = test["identifier"].str.split("/").str[0]

In [None]:
test["trip_id"] = test["identifier"].str.split("/").str[1]

In [None]:
test.head()

In [None]:
test.vp_idx.describe()

### A2 Sjoin
* Valid trips that are `sjoin` to segments.

In [None]:
USABLE_VP = STOP_SEG_DICT["stage1"]
INPUT_FILE_PREFIX = STOP_SEG_DICT["stage2"]
GROUPING_COL = STOP_SEG_DICT["grouping_col"]

In [None]:
normal_shapes = A2_valid_vehicle_positions.identify_stop_segment_cases(
        analysis_date, GROUPING_COL, 0)

In [None]:
abnormal_shapes = A2_valid_vehicle_positions.identify_stop_segment_cases(
        analysis_date, GROUPING_COL, 1)

In [None]:
# datetime.datetime.now() 

In [None]:
# Using A2 since I need the VPs to sjoin
# Only the pared down stops are saved out
# This takes a super long time
"""
usable_abnormal_vp = A2_valid_vehicle_positions.merge_usable_vp_with_sjoin_vpidx(
        abnormal_shapes,
        f"{USABLE_VP}_{analysis_date}",
        f"{INPUT_FILE_PREFIX}_{analysis_date}",
        GROUPING_COL
    )

 """   

In [None]:
#usable_abnormal_vp = usable_abnormal_vp.assign(
#    identifier = usable_abnormal_vp.gtfs_dataset_key.astype(str) + '/' + usable_abnormal_vp.shape_array_key.astype(str) + '/' + usable_abnormal_vp.trip_id.astype(str)
#)

In [None]:
#datetime.datetime.now() 

In [None]:
#agg = usable_abnormal_vp.groupby(['identifier']).vp_idx.nunique()

In [None]:
# agg = agg.compute()

In [None]:
#agg2 = agg.to_frame()

In [None]:
#agg3 = agg2.reset_index()

In [None]:
#agg3.shape

In [None]:
#agg3.sample()

In [None]:
#agg3["gtfs_dataset_key"] = agg3["index"].str.split("/").str[0]

In [None]:
#agg3["trip_id"] = agg3["index"].str.split("/").str[2]

In [None]:
#agg3 = agg3.drop(columns = ['index'])

In [None]:
#agg3 = agg3.rename(columns = {'vp_idx':'number_of_unique_vps_sjoin'})

In [None]:
#agg3.to_parquet(
#    f"{SEGMENT_GCS}special_vps_testing.parquet"
#)

In [None]:
#agg3 = pd.read_parquet(f"{SEGMENT_GCS}normalvps_testing.parquet")

In [None]:
normal = pd.read_parquet(f"{SEGMENT_GCS}normal_vps_testing.parquet")

In [None]:
normal = normal.drop(columns = ['identifier'])

In [None]:
special = pd.read_parquet(f"{SEGMENT_GCS}special_vps_testing.parquet")

In [None]:
sjoin_results = pd.concat([special, normal])

In [None]:
sjoin_results.sample()

In [None]:
# Deliberate halt: stops "Run All" before the cells below.
# An explicit raise keeps the notebook parseable (the bare words were a SyntaxError).
raise RuntimeError("Stop running: the cells below are not meant to run automatically.")

In [None]:
# VP all without paring
STG_0_FILE = STOP_SEG_DICT['stage0']
    

In [None]:
f"{STG_0_FILE}_{analysis_date}/"

In [None]:
all_pts = helpers.import_vehicle_positions(
            SEGMENT_GCS,
            f"{STG_0_FILE}_{analysis_date}/",
            "df",
            columns = ["gtfs_dataset_key", "trip_id","geometry"],
            partitioned = False
        )
all_pts = all_pts.compute()

In [None]:
all_pts.sample(5)

In [None]:
all_pts2 = (all_pts
        .groupby(['gtfs_dataset_key','trip_id'])
        .agg({'geometry':'nunique'}).reset_index()
        .rename(columns = {'geometry':'number_of_unique_vps_all_pts'})
        .reset_index(drop = True)
       )

In [None]:
len(all_pts[(all_pts.gtfs_dataset_key == "cdd2ad81863b6d4ad51676a1cb781ea8") & (all_pts.trip_id == "10686020")])

In [None]:
all_pts2[(all_pts2.gtfs_dataset_key == "cdd2ad81863b6d4ad51676a1cb781ea8") & (all_pts2.trip_id == "10686020")]

In [None]:
all_pts2[(all_pts2.gtfs_dataset_key == "593953c37ce48a9449bb90808ba0c1e1") & (all_pts2.trip_id == "t5FE-b3EA-sl13")]

In [None]:
all_pts2[(all_pts2.gtfs_dataset_key == "65d9589130415c685b89f4f7c2d8bd7e") & (all_pts2.trip_id == "10035002751332-DEC22")]

In [None]:
# How did this pop up in the grouped df if it doesn't even exist??
all_pts[(all_pts.gtfs_dataset_key == "00accf770009aafd5dc103ff2eeddb37") & (all_pts.trip_id == "-1160024104")]

In [None]:
all_pts3 = all_pts.drop(columns = ['geometry']).drop_duplicates().reset_index(drop = True)

In [None]:
len(all_pts3)

In [None]:
all_pts2.shape

In [None]:
len(all_pts)

##### Merge Sjoin with All Vehicles

In [None]:
# Merge to get rid of weird rows??
# These are all points 
pd.merge(all_pts3, all_pts2, on = ['gtfs_dataset_key','trip_id'], how = 'outer', indicator = True)[['_merge']].value_counts()

In [None]:
merge1 = pd.merge(all_pts3, all_pts2, on = ['gtfs_dataset_key','trip_id'], how = 'inner')

In [None]:
merge1.sample()

In [None]:
merge1.shape

In [None]:
# These are sjoin positions
agg3.sample()

In [None]:
agg3.shape

In [None]:
pd.merge(merge1, agg3, on = ['gtfs_dataset_key','trip_id'], how = 'outer', indicator = True)[['_merge']].value_counts()

In [None]:
merge2 = pd.merge(merge1, agg3, on = ['gtfs_dataset_key','trip_id'], how = 'inner')

In [None]:
merge2.sample(5)

#### Merge test1 with A2sjoin results

In [None]:
len(sjoin_results)

In [None]:
sjoin_results.gtfs_dataset_key.nunique(),sjoin_results.trip_id.nunique()

In [None]:
test.gtfs_dataset_key.nunique(),test.trip_id.nunique()

In [None]:
test.sample()

In [None]:
sjoin_results.sample()

In [None]:
# How is it possible there are some that are right_only?
# Lots of left only results b/c this is the full df. 
# test is just a subset
pd.merge(sjoin_results, test, on = ['gtfs_dataset_key','trip_id'], how = 'outer', indicator = True)[['_merge']].value_counts()

In [None]:
# Should be inner
merge1 =  pd.merge(test, sjoin_results,  on = ['gtfs_dataset_key','trip_id'], how = 'left')

In [None]:
merge1.sample()

In [None]:
len(merge1)

In [None]:
merge1['percent_of_positions_retained'] = merge1.number_of_unique_vps_sjoin/merge1.vp_idx * 100

In [None]:
merge1.percent_of_positions_retained.describe()

In [None]:
merge1 = merge1.fillna(0)

##### Why are 94 of the rows over 100??
* The same point can be joined to more than one segment.
* Use nunique

In [None]:
merge1.sample(5)

In [None]:
len(merge1[merge1.percent_of_positions_retained > 100])

In [None]:
len(merge1[merge1.percent_of_positions_retained == 100])

In [None]:
merge1[merge1.percent_of_positions_retained > 100].sample(5)

#### Compare results

In [None]:
agg2.sample()

In [None]:
merge1.sample()

In [None]:
pd.merge(agg2, merge1, on = ['gtfs_dataset_key','trip_id'], how = 'outer', indicator = True)[['_merge']].value_counts()

In [None]:
merge4 = pd.merge(agg2, merge1, on = ['gtfs_dataset_key','trip_id'], how = 'inner')

In [None]:
len(merge4)

In [None]:
merge4.sample()

In [None]:
merge4['difference'] = (merge4.percent_of_pts - merge4.percent_of_positions_retained)

In [None]:
merge4.difference.describe()

In [None]:
merge4.percent_of_positions_retained.describe()

In [None]:
preview_cols2 = ['shape_array_key','gtfs_dataset_key', 'trip_id', 'pts_not_in_sjoin', 'sjoin_pts', 'all_pts',
        'percent_of_pts', 'vp_idx', 'number_of_unique_vps_sjoin', 'percent_of_positions_retained', 'difference']

In [None]:
crosswalk.sample()

In [None]:
merge4.sample()

In [None]:
pd.merge(merge4, crosswalk, on =['gtfs_dataset_key','trip_id'], how = 'outer', indicator = True)[['_merge']].value_counts()

In [None]:
merge4 = pd.merge(merge4, crosswalk, on =['gtfs_dataset_key','trip_id'], how = 'left')