In [1]:
import datetime
import _speed_utils as speed_utils
import dask.dataframe as dd
import numpy as np
import geopandas as gpd
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers, segment_calcs
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    PROJECT_CRS,
    SEGMENT_GCS,
    #analysis_date,
)
from scripts import (A1_sjoin_vp_segments, A2_valid_vehicle_positions)
from shared_utils import calitp_color_palette as cp


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [2]:
# Notebook-wide pandas display settings: show up to 100 columns, render floats
# with two decimals, and never truncate rows or cell contents.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [3]:
# Relative path of the config file whose parameters are loaded in the next cell.
CONFIG_PATH = './scripts/config.yml'

In [4]:
# Parameters for the "stop_segments" entry of the config; later cells look up
# keys such as "stage1", "stage2" and "grouping_col" on it.
STOP_SEG_DICT = helpers.get_parameters(CONFIG_PATH, "stop_segments")

In [5]:
# analysis_date is set by hand here (its import from project_vars is commented
# out in the imports cell) since there aren't any files for June yet.
analysis_date = '2023-05-17'

### Observations (7/12)
* Step 1: Flagging
    * There are 2,704,812 rows in the original dataframe. About 10% of those rows are flagged as having zeroes in both meters elapsed and seconds elapsed.
    * There are around 4566 routes. About 57% of these routes had at least one trip with one or more rows flagged as zero.
* I took 2 passes at trying to understand why both these columns recorded zeroes.

* Step 2: `vp_pared_stops`. 
    * I grouped `vp_pared_stops` by 'shape_array_key','trip_id', and 'location_timestamp_local' OR `x` and `y`. I counted the number of unique stop sequences after grouping. If this trio had more than one unique stop sequence, that meant the timestamp or location recorded between sequences was duplicated.
    * Only around 9% of rows were flagged as having (obviously) repeated timestamps and locations. 
    * For all of these rows, both the timestamp and location were duplicated. 
    * All the routes that were flagged in step one needed a further look in step 3.
    
* Step 3: `vp_usable`
    * For one route and trip, find: all the recorded vehicle positions, sjoin of vps to segments,
    and the first and last points kept. 
    * Plot the three gdfs in a map to visually inspect what's happening.
    * Compare the sample route and trips with the trip with the highest percentage of non division by 0 rows to see what's going on. 
    
* Buckets of errors (all based on `stage0 vp`). 
    * There is only one recorded point in that segment in the raw data.
        * Fix: use the timestamp that comes after it.
        * <img src= "./speeds_images/only_one_pt_collected.png" width = 300>
    * Points are shared between segments
        * Fix: use p20/p50/p80.
        * <img src= "./speeds_images/shared_vp.png" width = 300>
    * Points recorded are really far out and they don't touch the buffered segments.
        * Fix: figure out % of vehicle positions that are too far out.
        * <img src= "./speeds_images/dots_not_on_seg.png" width = 300>
    * No data captured for that segment at all.
        * Fix: use p20/p50/p80.
        * <img src= "./speeds_images/no_dots_collected.png" width = 300> 

### Flagging

In [6]:
# Outputs of speed_utils.flagging_stage():
#   flagged: all the rows in the df, flagged
#   divide_by_zero: only the rows that have 0 for meters and sec elapsed
#   trips_count: % of trips with 1+ division by 0 row for a route
#   route_most_populated_df: the trip for a route with the smallest % of rows that are divided by 0
# NOTE(review): the call below is commented out, yet cells in the "Investigate"
# section read flagged, divide_by_zero, trips_count and route_most_populated_df.
# On a fresh kernel those cells raise NameError unless this line is re-enabled.
# flagged, divide_by_zero, trips_count, route_most_populated_df = speed_utils.flagging_stage(analysis_date)

In [7]:
# divide_by_zero.loop_or_inlining.value_counts()

### Fixes

#### VPs too far out 

In [8]:
# Pull the stage-1 / stage-2 entries and the grouping column out of the config.
USABLE_VP, INPUT_FILE_PREFIX, GROUPING_COL = (
    STOP_SEG_DICT["stage1"],
    STOP_SEG_DICT["stage2"],
    STOP_SEG_DICT["grouping_col"],
)

In [9]:
# NOTE(review): the trailing 0 presumably selects the "normal" (non loop /
# inlining) shapes -- confirm against identify_stop_segment_cases.
normal_shapes = A2_valid_vehicle_positions.identify_stop_segment_cases(
    analysis_date,
    GROUPING_COL,
    0,
)

In [10]:
# Combine the stage-1 vehicle positions with the stage-2 results for the
# shapes selected above; both file names are suffixed with the analysis date.
usable_normal_vp = A2_valid_vehicle_positions.merge_usable_vp_with_sjoin_vpidx(
    normal_shapes,
    f"{USABLE_VP}_{analysis_date}",
    f"{INPUT_FILE_PREFIX}_{analysis_date}",
    GROUPING_COL,
)
    

In [12]:
# Build a single "operator/shape/trip" string key so rows can be grouped on
# one column instead of three.
usable_normal_vp = usable_normal_vp.assign(
    identifier=(
        usable_normal_vp["gtfs_dataset_key"].astype(str)
        + '/'
        + usable_normal_vp["shape_array_key"].astype(str)
        + '/'
        + usable_normal_vp["trip_id"].astype(str)
    )
)

In [13]:
# List the available columns (reference for the groupbys that follow).
usable_normal_vp.columns

Index(['vp_idx', '_gtfs_dataset_name', 'trip_id', 'location_timestamp',
       'gtfs_dataset_key', 'location_timestamp_local', 'activity_date', 'hour',
       'x', 'y', 'shape_array_key', 'stop_sequence', 'identifier'],
      dtype='object')

In [14]:
# Number of distinct vehicle positions (vp_idx) per operator/shape/trip identifier.
agg = (
    usable_normal_vp
    .groupby(['identifier'])['vp_idx']
    .nunique()
)

In [18]:
# Materialise the lazy groupby result (the output further down shows a pandas Series).
# NOTE(review): rebinding `agg` makes this cell non-re-runnable -- a second run
# would call .compute() on the already-computed pandas Series.
agg = agg.compute()

In [23]:
# Sanity check that .compute() returned an in-memory pandas object.
type(agg)

pandas.core.series.Series

In [22]:
# Preview the per-identifier counts of distinct vp_idx values.
agg.head()

identifier
00accf770009aafd5dc103ff2eeddb37/0e69f3b447f85898af234663d28cf1e4/t_5562081_b_80156_tn_0     99
00accf770009aafd5dc103ff2eeddb37/1780c72496eee5d86aea18655a99431e/t_5561633_b_80156_tn_0    289
00accf770009aafd5dc103ff2eeddb37/1780c72496eee5d86aea18655a99431e/t_5561634_b_80156_tn_0    279
00accf770009aafd5dc103ff2eeddb37/1780c72496eee5d86aea18655a99431e/t_5561651_b_80156_tn_0     31
00accf770009aafd5dc103ff2eeddb37/1780c72496eee5d86aea18655a99431e/t_5561652_b_80156_tn_0    216
Name: vp_idx, dtype: int64

In [None]:
# Number of *distinct* vehicle positions (vp_idx) per operator / shape / trip.
# The previous 'count' aggregation counted rows, so a vp_idx appearing on more
# than one row (presumably once per stop_sequence it was joined to -- confirm)
# was counted repeatedly even though the column is named "unique".
# Rows are de-duplicated on the group keys + vp_idx first, so counting gives
# the same answer as nunique:
# https://docs.dask.org/en/latest/generated/dask.dataframe.groupby.SeriesGroupBy.nunique.html
vp_agg = (
    usable_normal_vp[['gtfs_dataset_key', 'shape_array_key', 'trip_id', 'vp_idx']]
    .drop_duplicates()
    .groupby(['gtfs_dataset_key', 'shape_array_key', 'trip_id'])
    .agg({'vp_idx': 'count'})
    .reset_index()
    .rename(columns={'vp_idx': 'number_of_unique_vps_sjoin'})
)

In [None]:
# Materialise the lazy frame in memory.
# NOTE(review): rebinding the same name makes this cell non-re-runnable, and any
# earlier cell re-run after it will see the computed object instead of the lazy one.
usable_normal_vp = usable_normal_vp.compute()

In [None]:
# All vehicle positions from the stage-0 file, before any paring; only the
# operator key, trip id and geometry columns are read, then computed in memory.
STG_0_FILE = STOP_SEG_DICT['stage0']

all_pts = helpers.import_vehicle_positions(
    SEGMENT_GCS,
    f"{STG_0_FILE}_{analysis_date}/",
    "df",
    columns=["gtfs_dataset_key", "trip_id", "geometry"],
    partitioned=False,
).compute()

In [None]:
# NOTE(review): same config key ("stage1") as USABLE_VP defined earlier in this notebook.
STG1 = STOP_SEG_DICT['stage1']

In [None]:
# Stage-1 vehicle positions with the trip grouping columns from the config attached.
usable_vps = A1_sjoin_vp_segments.add_grouping_col_to_vp(
    vp_file_name=f"{STG1}_{analysis_date}",
    analysis_date=analysis_date,
    trip_grouping_cols=STOP_SEG_DICT['trip_grouping_cols'],
)

In [None]:
# Peek at one random row of the computed frame.
usable_normal_vp.sample()

In [None]:
# Count distinct geometries (raw vehicle positions) per operator / shape / trip.
# NOTE(review): `all_pts_merge` is not defined in any visible cell, so this raises
# NameError. `all_pts` was loaded with only gtfs_dataset_key, trip_id and geometry,
# so it has no shape_array_key -- presumably a merge step that adds it is missing.
all_pts_agg = (all_pts_merge
        .groupby(['gtfs_dataset_key','shape_array_key','trip_id'])
        .agg({'geometry':'nunique'}).reset_index()
        .rename(columns = {'geometry':'number_of_unique_vps_all_pts'})
        .reset_index(drop = True)
       )

#### Redo sjoin to retain more info.
* Use pared stops with segments 

In [None]:
# sjoin_df = speed_utils.import_stage_2(analysis_date)

In [None]:
# sjoin_df.shape

In [None]:
# sjoin_df = speed_utils.sjoin_vp_segments(seg, all_points)

In [None]:
# NOTE(review): not referenced by any visible cell; the following cell reads the
# same "segments_file" key into FILE instead.
SEGMENT_FILE = STOP_SEG_DICT["segments_file"]

In [None]:
# Load the segments parquet for the analysis date and reproject it to the project CRS.
FILE = STOP_SEG_DICT['segments_file']
segs = (
    gpd.read_parquet(f"{SEGMENT_GCS}{FILE}_{analysis_date}.parquet")
    .to_crs(PROJECT_CRS)
)

In [None]:
# Number of segment rows / columns loaded.
segs.shape

In [None]:
# NOTE(review): identical to an earlier cell that already builds `usable_vps`
# with the same arguments; re-running it only repeats the work.
usable_vps = A1_sjoin_vp_segments.add_grouping_col_to_vp(
    vp_file_name = f"{STG1}_{analysis_date}",
    analysis_date= analysis_date,
    trip_grouping_cols = STOP_SEG_DICT['trip_grouping_cols']) 

In [None]:
# TODO: unfinished cell. It previously read `merge1 = ` with no right-hand
# side, which is a SyntaxError; the intended merge still needs to be written.

In [None]:
# Peek at one random row of the vehicle positions fed into the spatial join.
usable_vps.sample()

In [None]:
# NOTE(review): leftover debugging output -- has no effect on the analysis.
print('hi')

In [None]:
# 2:45
# NOTE(review): "2:45" is presumably this cell's run time -- confirm.
# Spatial join of the stage-1 vehicle positions to the segments loaded above.
sjoin_1 = speed_utils.sjoin_vp_segments(segs, usable_vps)

### Investigate 

In [None]:
# Deliberate stopping point for "Run All". The bare text `stop running` was a
# SyntaxError, which also makes the notebook unparseable by tooling; raise an
# explicit, self-describing error instead. The cells below additionally need the
# outputs of speed_utils.flagging_stage(), whose call is commented out above.
raise RuntimeError(
    "stop running: cells below need the flagging_stage() outputs "
    "(flagged, divide_by_zero, trips_count, route_most_populated_df)"
)

In [None]:
# NOTE(review): `divide_by_zero` comes from speed_utils.flagging_stage(), whose
# call is commented out earlier in this notebook; define it before running this.
stage3_df = speed_utils.flag_stage3(divide_by_zero, analysis_date)

In [None]:
# Number of distinct shapes present in stage3_df.
stage3_df.shape_array_key.nunique()

In [None]:
# Column order used to sort rows before previewing them.
sort_by = [
    '_gtfs_dataset_name',
    'shape_array_key',
    'trip_id',
    'stop_sequence',
]

In [None]:
# Columns shown in previews: the sort keys followed by stop / timestamp / flag details.
preview_cols = [
    *sort_by,
    'stop_id',
    'gtfs_dataset_key',
    'location_timestamp_local',
    'pair',
    'stage3_flag',
]

#### Find routes with a lot of n_trips that need to be evaluated.

In [None]:
# Per the section heading, used to find routes with many trips that need evaluating.
shape_trips = speed_utils.count_trips_routes(stage3_df)

In [None]:
# Preview the first five rows.
shape_trips.head(5)

#### Example Trip 1
* Understanding the result from flag_stage3().
* Looking at AC Transit: stop sequences 6 and 7 have different stop_ids. However, their time stamps and locations are the same.
* It looks like they share the same point.
* However, this isn't due to paring too many points: there just aren't enough points to choose from in the raw data.
* Also sequence 2 is extremely long.

In [None]:
# First two rows (in sort order) whose stage3_flag is anything other than
# 'check in stage 2'.
(
    stage3_df.loc[stage3_df.stage3_flag != 'check in stage 2']
    .sort_values(by=sort_by)
    [preview_cols]
    .head(2)
)

In [None]:
# Shape (route), operator key and trip inspected in Example Trip 1.
test_route1, test_gtfs_key1, test_trip1 = (
    "03bc2779a66739316156b459ffc3eefa",
    "cdd2ad81863b6d4ad51676a1cb781ea8",
    "11776020",
)

In [None]:
# % of trips with problematic rows for this route
# NOTE(review): `trips_count` is an output of the commented-out flagging_stage() call.
trips_count[trips_count.shape_array_key == test_route1]

In [None]:
# Rows of the flagged dataframe for the Example Trip 1 trip / shape.
speed_utils.original_df_rows(flagged, test_trip1, test_route1)

In [None]:
# Troubleshooting output for Example Trip 1.
speed_utils.stage2_trouble_shooting(
    flagged_df=stage3_df,
    date=analysis_date,
    route=test_route1,
    trip=test_trip1,
    gtfs_key=test_gtfs_key1,
)

#### Example Trip 2
* Understanding the result from `flag_stage3()`.
* Same thing as Ex Trip 1, except looking at LA Metro: stop sequences 45 and 46 have different stop_ids. However, their time stamps and locations are the same.
* Same issue: segments 45 and 46 are sharing points. There aren't enough points captured.
* In general, it looks like this route doesn't have a lot of rows that are ok.

In [None]:
# Shape (route), operator key and trip inspected in Example Trip 2.
test_route2, test_gtfs_key2, test_trip2 = (
    "38c814829dff816aa87c606c3aab4f45",
    "65d9589130415c685b89f4f7c2d8bd7e",
    "10294000051654-DEC22",
)

In [None]:
# Original number of rows for this trip
len(
    flagged.loc[
        (flagged.trip_id == test_trip2)
        & (flagged.shape_array_key == test_route2)
    ]
)

In [None]:
# Rows with zeroes...a lot of them.
len(
    divide_by_zero.loc[
        (divide_by_zero.trip_id == test_trip2)
        & (divide_by_zero.shape_array_key == test_route2)
    ]
)

In [None]:
# Stop sequences 45 and 46 of the Example Trip 2 shape, excluding rows flagged
# 'check in stage 2'.
(
    stage3_df.loc[
        (stage3_df.stage3_flag != 'check in stage 2')
        & (stage3_df.shape_array_key == test_route2)
        & (stage3_df.stop_sequence.isin([45, 46]))
    ]
    .sort_values(by=sort_by)
    [preview_cols]
)

In [None]:
# % of trips with problematic rows for this route, lots of them
# NOTE(review): `trips_count` is an output of the commented-out flagging_stage() call.
trips_count[trips_count.shape_array_key == test_route2]

In [None]:
# Rows of the flagged dataframe for the Example Trip 2 trip / shape.
# `m1` was never defined in this notebook; the equivalent calls for the other
# example trips all pass `flagged`, so use it here too.
speed_utils.original_df_rows(flagged, test_trip2, test_route2)

In [None]:
# Troubleshooting output for Example Trip 2.
speed_utils.stage2_trouble_shooting(
    flagged_df=stage3_df,
    date=analysis_date,
    route=test_route2,
    trip=test_trip2,
    gtfs_key=test_gtfs_key2,
)

In [None]:
# See which trip has the most rows that are ok for this route
# NOTE(review): `route_most_populated_df` is an output of the commented-out
# flagging_stage() call.
route_most_populated_df[route_most_populated_df.shape_array_key == test_route2]

In [None]:
# Plot the trip from this route with the highest % of ok rows
speed_utils.stage2_trouble_shooting(
    flagged_df=stage3_df,
    date=analysis_date,
    route=test_route2,
    trip="10294000051724-DEC22",
    gtfs_key=test_gtfs_key2,
)

#### Example Trip 3
* Choosing a route/trip with high n_trips that isn't Muni or LA Metro to shake things up.
* San Diego Vehicle Positions
* Segments 44 and 45 don't have any rows in the original dataframe for trip 16938440.

In [None]:
# Shape (route), operator key and trip inspected in Example Trip 3.
test_route3, test_gtfs_key3, test_trip3 = (
    "1fc55d9df0cd785dddc864bf1b72976f",
    "a4f6fd5552107e05fe9743ac7cce2c55",
    "16938440",
)

In [None]:
# Rows of the flagged dataframe for the Example Trip 3 trip / shape.
speed_utils.original_df_rows(flagged, test_trip3, test_route3)

In [None]:
# Troubleshooting output for Example Trip 3.
speed_utils.stage2_trouble_shooting(
    flagged_df=stage3_df,
    date=analysis_date,
    route=test_route3,
    trip=test_trip3,
    gtfs_key=test_gtfs_key3,
)

In [None]:
# Trip from this route with the most ok rows
# NOTE(review): `route_most_populated_df` is an output of the commented-out
# flagging_stage() call.
route_most_populated_df[route_most_populated_df.shape_array_key == test_route3]

In [None]:
# Test the trip with the most ok rows
speed_utils.stage2_trouble_shooting(
    flagged_df=stage3_df,
    date=analysis_date,
    route=test_route3,
    trip="16938341",
    gtfs_key=test_gtfs_key3,
)

In [None]:
# Rows of the flagged dataframe for trip 16938341 (the trip plotted in the previous cell).
speed_utils.original_df_rows(flagged, "16938341", test_route3)