In [1]:
import datetime
import dask.dataframe as dd
import numpy as np
import geopandas as gpd
import pandas as pd
import altair as alt
from segment_speed_utils import gtfs_schedule_wrangling, helpers, segment_calcs,sched_rt_utils
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    PROJECT_CRS,
    SEGMENT_GCS,  
    analysis_date,
)
from scripts import (A1_sjoin_vp_segments, A2_valid_vehicle_positions,B2_avg_speeds_by_segment)
from shared_utils import calitp_color_palette as cp, rt_utils, geography_utils
import _threshold_utils as threshold_utils
import _rt_scheduled_utils as rt_scheduled_utils
CONFIG_PATH = './scripts/config.yml'
STOP_SEG_DICT = helpers.get_parameters(CONFIG_PATH, "stop_segments")


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In the next release, GeoPandas will switch to using Shapely by default, even if PyGEOS is installed. If you only have PyGEOS installed to get speed-ups, this switch should be smooth. However, if you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
analysis_date

'2023-08-15'

In [4]:
dictionary = helpers.get_parameters(CONFIG_PATH, "stop_segments")

## % of Meters
* start with the speeds_stop_segments_{analysis_date} parquet (which is produced in B1_speeds_by_segment_trip ).
* grab in stop_segments_{analysis_date} (in CRS 3310 already), and you can get the segment's length.
* merge with the speeds by segment-trip, which contains the meters_elapsed column
* calculate pct where meters_elapsed/segment_length
* show me some charts around this, a couple of descriptives to make sure these are all ok
* give me rough descriptives of how many rows we're dropping if we go with keeping at least 30%, 40%, 50%

### Open these files just for testing

In [5]:
def import_speeds_segs(analysis_date:str, 
                       max_speed_cutoff: int, 
                       dict_inputs:dict)-> pd.DataFrame:
    FILE = dict_inputs['stage4']
    df = pd.read_parquet(f"{SEGMENT_GCS}{FILE}_{analysis_date}")
    
    return df

In [6]:
def load_segments(analysis_date:str, dict_inputs:dict) -> gpd.GeoDataFrame:
    # Load in ALL segments, find the length
    FILE = dict_inputs['segments_file']
    segments = gpd.read_parquet(f"{SEGMENT_GCS}{FILE}_{analysis_date}.parquet")
    
    return segments

## Open Files
* Add `import_speeds_segs`,`calculate_segment_length`,`merge_segments_speeds` into `B2`

In [7]:
segments_gdf = load_segments(analysis_date, dictionary)

In [51]:
speeds = import_speeds_segs(analysis_date,70,  dictionary)

In [52]:
def calculate_segment_length(segments:gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """
    Find the length of segments
    """
    segments = segments.assign(
        meters_length=(segments.geometry.length)
    )
    #df = segments.drop(columns = ['geometry','district','district_name'])
    
    return segments

In [53]:
def merge_segments_speeds(speeds: pd.DataFrame,
                          segments:gpd.GeoDataFrame, 
                          ) -> pd.DataFrame:
    """
    Merge speeds and segments, calculate length.
    """
    segments = calculate_segment_length(segments)
    
    merge_cols = ['shape_array_key','stop_sequence','schedule_gtfs_dataset_key']
    merge1 = pd.merge(segments, speeds, on = merge_cols, how = "inner")
    
    merge1['percent'] = merge1.meters_elapsed/merge1.meters_length * 100
    
    merge1.percent = merge1.percent.fillna(0)
    return merge1

In [54]:
m1 = merge_segments_speeds(speeds, segments_gdf)

## Visualizing Speed
* https://nbviewer.org/github/cal-itp/data-analyses/blob/filter-speeds-avgs/rt_segment_speeds/18_speed_distribution.ipynb
* https://analysis.calitp.org/rt/district_07-los-angeles/9__speedmaps__district_07-los-angeles__itp_id_300.html

In [None]:
def valid_trips_by_cutoff(df, percentages:list):
    """
    Returns a table of how many trips/routes/operators/etc are kept 
    after applying a certain percentile cutoff for the 
    length of segment that's covered.
    
    Ex: if a segment has 40% of its length covered in RT 
    data, it falls in the 20th percentile.
    
    Args:
        df: result from  merge_segments_speeds(speeds, segments_gdf)
        percentages: the percentiles in floats like 0.2, 0.25, 0.3
    """
    final = pd.DataFrame()
    og_len = len(df)
    og_trips = df.trip_id.nunique()
    og_shape_array_key = df.shape_array_key.nunique()
    
    for i in percentages:
        # Round up percent to nearest 5. Ex: 43 becomes 45.
        percent = myround(df.percent.quantile(i).astype(float))
        temp = (df[(df.percent >= percent)])
        
        temp = temp.assign(
            percentile = f"Min. of {percent}% of seg length covered")
        
        temp = (temp
                .groupby(['percentile'])
                .agg({'gtfs_dataset_name':'count',
                      'speed_mph':'mean',
                      'shape_array_key':'nunique',
                      'trip_id':'nunique',
                      'gtfs_dataset_key':'nunique'})
                .reset_index()
                .rename(columns = {'gtfs_dataset_name':'n_rows',
                                  'speed_mph':'mean_speed_mph',
                                  'shape_array_key':'n_kept_routes',
                                  'trip_id':'n_kept_trips',
                                  'gtfs_dataset_key':'n_kept_operators'})
               )
        
        final = pd.concat([final, temp], axis=0)

    
    final = final.assign(
        percentage_kept_rows=final.n_rows.divide(og_len) * 100,
        percentage_kept_trips = final.n_kept_trips.divide(og_trips) * 100,
        percentage_kept_routes = final.n_kept_routes.divide(og_shape_array_key)*100)
    
    round_cols = ['percentage_kept_rows', 'percentage_kept_trips', 'percentage_kept_routes']
    final[round_cols] = final[round_cols].round(0)
    return final

### % of rows kept

In [None]:
test = valid_trips_by_cutoff(m1, [.1,.2,.3,.4,.5,.6,.7])

In [None]:
test = threshold_utils.pre_clean(test)

In [None]:
test

In [None]:
# Main chart
def bar_chart(df, x_column: str, y_column:str, title:str):
    chart = (alt.Chart(df)
         .mark_bar()
         .encode(x=x_column, y= y_column, 
          color=alt.Color(y_column, 
          scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
          legend=None),
          tooltip = df.columns.tolist())
         .properties(title = title)
            )
    chart = threshold_utils.chart_size((chart), 400,300)
    return chart
    

In [None]:
bar_chart(test, 'Percentage Kept Rows','Percentile', 'Rows Kept After % Segment Cutoff')

## Edited B2 function

In [8]:
avg_test = B2_avg_speeds_by_segment.speeds_with_segment_geom(analysis_date, 
                                    70, 
                                    dictionary,
                                    0.55)

In [9]:
STG5_FILE = dictionary['stage5']
og_avg = gpd.read_parquet(f"{SEGMENT_GCS}{STG5_FILE}_{analysis_date}.parquet")

In [10]:
# See unique stops-keys-routes. They should have no more than 2.
avg_test.groupby(['shape_array_key','schedule_gtfs_dataset_key','stop_sequence']).agg({'district':'count'}).sort_values('district', ascending = False).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,district
shape_array_key,schedule_gtfs_dataset_key,stop_sequence,Unnamed: 3_level_1
80fe7dbf91c2382be0bf2e2db58d713a,dbbe8ee4864a2715a40749605395d584,25,2
a09172ff2aceb45a7f5eb9e2a6eb7618,1e93c380452cfb80eac5e14e4f227992,14,2
a0a07c99396126a30fafe0c8338de8d2,d9d0325e50e50064e3cc8384b1751d67,13,2
a0a07c99396126a30fafe0c8338de8d2,d9d0325e50e50064e3cc8384b1751d67,12,2
a0a07c99396126a30fafe0c8338de8d2,d9d0325e50e50064e3cc8384b1751d67,11,2


* There are 3000 more rows

In [11]:
og_avg.shape

(264433, 13)

In [12]:
# 261593 w dropping dups at the end
avg_test.shape

(261593, 13)

In [13]:
len(avg_test)-len(og_avg)

-2840

## Checks

### Seeing why all the rows were cut

In [14]:
# Test 3 contains missing sequences that were added back on
# From "add_back_missing_stops"
# avg_test.loc[avg_test.time_of_day == None].sample(3).drop(columns = ['geometry'])

In [55]:
def look_at_missing_row(shape, sequence):
    display(og_avg.loc[(og_avg.shape_array_key == shape) & (og_avg.stop_sequence == sequence)].drop(columns = ['geometry']))
    display(m1.loc[(m1.shape_array_key == shape) & (m1.stop_sequence == sequence)][['percent', 'speed_mph']])
    display(avg_test.loc[(avg_test.shape_array_key == shape) & (avg_test.stop_sequence == sequence)].drop(columns = ['geometry']))

In [18]:
missing_shape2  = "4d006755475d7c2a03c525995311ec16"
missing_stop_seq2 = 29

In [56]:
look_at_missing_row(missing_shape2, missing_stop_seq2)

Unnamed: 0,shape_array_key,stop_sequence,schedule_gtfs_dataset_key,stop_id,loop_or_inlining,district,district_name,p50_mph,n_trips,p20_mph,p80_mph,time_of_day
79495,4d006755475d7c2a03c525995311ec16,29,97b8c635bbc4cdd02bc5c27d0e83df3c,e6650363-497d-4797-8931-a83c0bc389dc,0,3,District 3 - Marysville,18.28,1,18.28,18.28,all_day


Unnamed: 0,percent,speed_mph
816051,52.16,18.28


Unnamed: 0,shape_array_key,stop_sequence,schedule_gtfs_dataset_key,stop_id,loop_or_inlining,district,district_name,p50_mph,n_trips,p20_mph,p80_mph,time_of_day
78558,4d006755475d7c2a03c525995311ec16,29,97b8c635bbc4cdd02bc5c27d0e83df3c,e6650363-497d-4797-8931-a83c0bc389dc,0,3,District 3 - Marysville,,,,,


## Comparison
* i still want a left join for segments, but only for segments that have RT trips ever (that end up going missing, after the 45% rule)

* i don't want a left join to show all segments ever, bc segments are cut from scheduled shapes. more operators have schedule data than RT. so i don't want a mess of segments at the end of the left join off schedule, but a left join for "RT shapes"

In [20]:
# Results from original script
rt_segs_agg = (og_avg
            .groupby(['shape_array_key','schedule_gtfs_dataset_key'])
            .agg({'stop_sequence':'nunique'})
            .reset_index()
            .rename(columns = {'stop_sequence':'total_stops'})
            .add_prefix('og_')
           )

In [21]:
# My test after filtering but adding back in cut rows
avg_test_agg = (avg_test
            .groupby(['shape_array_key','schedule_gtfs_dataset_key'])
            .agg({'stop_sequence':'nunique'})
            .reset_index()
            .rename(columns = {'stop_sequence':'total_stops'})
            .add_prefix('testing_')
           )

In [22]:
avg_test_agg.shape, rt_segs_agg.shape

((4729, 3), (4723, 3))

In [23]:
m1 = (pd
      .merge(rt_segs_agg, avg_test_agg, 
             left_on = ['og_shape_array_key', 'og_schedule_gtfs_dataset_key'],
             right_on = ['testing_shape_array_key', 'testing_schedule_gtfs_dataset_key'], 
             how = 'outer', indicator = True)
     )

In [24]:
m1['total_thrown_out_stops'] = m1.og_total_stops - m1.testing_total_stops

In [25]:
m1.sample(3)

Unnamed: 0,og_shape_array_key,og_schedule_gtfs_dataset_key,og_total_stops,testing_shape_array_key,testing_schedule_gtfs_dataset_key,testing_total_stops,_merge,total_thrown_out_stops
3085,a7bf8eb28b02ca73c1c24c502410a3da,3f3f36b4c41cc6b5df3eb7f5d8ea6e3c,34.0,a7bf8eb28b02ca73c1c24c502410a3da,3f3f36b4c41cc6b5df3eb7f5d8ea6e3c,35,both,-1.0
4237,e5d93362670f8af60349929c61a0f0fd,eb9acbcb42315399bb54df78adfd3dac,21.0,e5d93362670f8af60349929c61a0f0fd,eb9acbcb42315399bb54df78adfd3dac,22,both,-1.0
4097,ddee90f73f4761e12b2d848ad8c2ab04,0139b1253130b33adcd4b3a4490530d2,4.0,ddee90f73f4761e12b2d848ad8c2ab04,0139b1253130b33adcd4b3a4490530d2,5,both,-1.0


In [26]:
m1._merge.value_counts()

both          4723
right_only       6
left_only        0
Name: _merge, dtype: int64

In [27]:
m1.total_thrown_out_stops.describe()

count   4723.00
mean      -1.20
std        2.21
min      -50.00
25%       -1.00
50%       -1.00
75%       -1.00
max        0.00
Name: total_thrown_out_stops, dtype: float64

In [28]:
m1.loc[m1.total_thrown_out_stops < -40]

Unnamed: 0,og_shape_array_key,og_schedule_gtfs_dataset_key,og_total_stops,testing_shape_array_key,testing_schedule_gtfs_dataset_key,testing_total_stops,_merge,total_thrown_out_stops
3673,c8b41fafd9be5e579ad230bd28f74d17,7cc0cb1871dfd558f11a2885c145d144,6.0,c8b41fafd9be5e579ad230bd28f74d17,7cc0cb1871dfd558f11a2885c145d144,56,both,-50.0


In [29]:
# avg_test.loc[avg_test.shape_array_key == "c8b41fafd9be5e579ad230bd28f74d17"].drop(columns = ['geometry'])

In [30]:
# og_avg.loc[og_avg.shape_array_key == "c8b41fafd9be5e579ad230bd28f74d17"].drop(columns = ['geometry'])

In [31]:
# merge1.loc[merge1.shape_array_key == "c8b41fafd9be5e579ad230bd28f74d17"][['stop_sequence','meters_elapsed','sec_elapsed','speed_mph','percent']].sort_values(by = ['stop_sequence'])

#### Seeing all the sequences thrown out

In [32]:
merge_cols = ['shape_array_key', 'stop_sequence', 'schedule_gtfs_dataset_key',
       'stop_id', 'loop_or_inlining']

In [33]:
subset = ['shape_array_key', 'stop_sequence', 'schedule_gtfs_dataset_key',
       'stop_id', 'loop_or_inlining', 'p50_mph', 'n_trips',
       'p20_mph', 'p80_mph', 'time_of_day']

In [34]:
og_avg2 = og_avg[subset]

In [35]:
og_avg2.shape

(264433, 10)

In [36]:
avg_test2 = avg_test[subset]

In [37]:
avg_test2.shape

(261593, 10)

In [38]:
m2 = pd.merge(og_avg2, avg_test2, on = merge_cols, how = 'outer', indicator = True)

In [39]:
m2._merge.value_counts()

both          483813
right_only      5681
left_only          0
Name: _merge, dtype: int64

### Seeing which routes were thrown out

In [40]:
og_routes = set(og_avg.shape_array_key.unique().tolist())
test_routes = set(avg_test.shape_array_key.unique().tolist())

new_routes = list(test_routes - og_routes)

In [41]:
new_routes

['3995aa113b7940405836a9d24e295ebb',
 '3adfce7b37ad5dc1dd831e9acf5a3b90',
 'ac713007941051bd6aa80018b8158260',
 'd79dacb8b4fe1f5416ec106f4ac5672d',
 'e80cd0231a2f6cf9b714a932b0a16707',
 'e4af5a81c32da5b8813e995742d7ca1b']

In [42]:
new_routes_df = avg_test.loc[avg_test.shape_array_key.isin(new_routes)]

In [43]:
new_routes_df.shape_array_key.nunique()

6

In [44]:
new_routes_df.shape

(11, 13)

In [45]:
new_routes_df.drop(columns = ['geometry'])

Unnamed: 0,shape_array_key,stop_sequence,schedule_gtfs_dataset_key,stop_id,loop_or_inlining,district,district_name,p50_mph,n_trips,p20_mph,p80_mph,time_of_day
57887,3995aa113b7940405836a9d24e295ebb,1,1adf7a7bde86b42ed014f6de74c7132e,91021,0,11,District 11 - San Diego,,,,,
57888,3995aa113b7940405836a9d24e295ebb,2,1adf7a7bde86b42ed014f6de74c7132e,99591,0,11,District 11 - San Diego,,,,,
59548,3adfce7b37ad5dc1dd831e9acf5a3b90,4,7cc0cb1871dfd558f11a2885c145d144,13868,0,4,District 4 - Oakland,,,,,
171408,ac713007941051bd6aa80018b8158260,3,7cc0cb1871dfd558f11a2885c145d144,17181,0,4,District 4 - Oakland,,,,,
171409,ac713007941051bd6aa80018b8158260,4,7cc0cb1871dfd558f11a2885c145d144,15361,0,4,District 4 - Oakland,,,,,
171410,ac713007941051bd6aa80018b8158260,5,7cc0cb1871dfd558f11a2885c145d144,15359,0,4,District 4 - Oakland,,,,,
171411,ac713007941051bd6aa80018b8158260,6,7cc0cb1871dfd558f11a2885c145d144,15369,0,4,District 4 - Oakland,,,,,
216542,d79dacb8b4fe1f5416ec106f4ac5672d,1,07d3b79f14cec8099119e1eb649f065b,6702259,1,3,District 3 - Marysville,,,,,
231919,e4af5a81c32da5b8813e995742d7ca1b,1,07d3b79f14cec8099119e1eb649f065b,6702259,0,3,District 3 - Marysville,,,,,
235493,e80cd0231a2f6cf9b714a932b0a16707,1,09e16227fc42c4fe90204a9d11581034,7764510,1,4,District 4 - Oakland,,,,,


In [46]:
type(new_routes_df)

geopandas.geodataframe.GeoDataFrame

In [47]:
new_routes_df.explore('shape_array_key')

## Sample segments and routes for Big Blue Bus

In [48]:
test_operator = "Big Blue Bus VehiclePositions"
test_org = "City of Santa Monica"
test_key = "6c2d7daaf979779fa2089c6395baf98b"

In [49]:
#pub_df = pd.read_parquet(
#    f"{SEGMENT_GCS}export/avg_speeds_stop_segments_{analysis_date}_tabular.parquet", 
#    filters = [[("agency", "==", test_org)]]
#)

In [50]:
# Dark orange
shape_id1  = "26375"
stop_seq1 = 7
shape_array1 = pub_df[pub_df.shape_id==shape_id1].shape_array_key.iloc[0]

NameError: name 'pub_df' is not defined

In [None]:
# Light yellow 
shape_id2 = "26342"
stop_seq2 = 23
shape_array2 = pub_df[pub_df.shape_id==shape_id2].shape_array_key.iloc[0]

In [None]:
# Dark Red
shape_id3 = "26393"
stop_seq3 = 32
shape_array3 = pub_df[pub_df.shape_id==shape_id3].shape_array_key.iloc[0]

In [None]:
# Light orange
shape_id4 = "26372"
stop_seq4 = 14
shape_array4 = pub_df[pub_df.shape_id==shape_id4].shape_array_key.iloc[0]

In [None]:
# Green
shape_id5 = "26400"
stop_seq5= 8
shape_array5 = pub_df[pub_df.shape_id==shape_id5].shape_array_key.iloc[0]

In [None]:
def one_route_map(avg_speeds:gpd.GeoDataFrame, shape_array_key:str):
    display(avg_speeds[avg_speeds.shape_array_key==shape_array_key].explore(
    "p50_mph", 
    tiles = "CartoDB Positron",
    cmap = rt_utils.ZERO_THIRTY_COLORSCALE,
    style_kwds = {'weight':5}))
    

In [None]:
def compare_average(avg_test:pd.DataFrame,
                    og_avg:pd.DataFrame, 
                    shape_array_key:str,
                    stop_sequence:int):
    
    print(f"route {shape_array_key}, sequence {stop_sequence}")
    drop_cols = ['shape_array_key','stop_sequence']
    print('Original')
    original = og_avg[(og_avg.shape_array_key == shape_array_key) &(og_avg.stop_sequence == stop_sequence)].drop(columns = drop_cols)
    display(original.drop(columns = ['geometry']))
    
    print('New')
    new = avg_test[(avg_test.shape_array_key == shape_array_key) & (avg_test.stop_sequence == stop_sequence)].drop(columns = drop_cols)
    display(new.drop(columns = ['geometry']))    
    # display(new.explore(width = 300, height = 300))

In [None]:
og_avg.p50_mph.describe()

In [None]:
avg_test.p50_mph.describe()

* Strangely enough, the 50th percentile speed became lower
* Sometimes the lower the length, the higher the speed?

In [None]:
# compare_average(avg_test, og_avg, shape_array3, stop_seq3)

In [None]:
# merge1.loc[(merge1.shape_array_key == shape_array3) & (merge1.stop_sequence == stop_seq3)][merge1_preview_cols].sort_values(['percent'])

* This is missing because the only row that was non-zero only covered 27% of the length
* Maybe up the threshold?

In [None]:
# compare_average(avg_test, og_avg, shape_array4, stop_seq4)

In [None]:
# merge1.loc[(merge1.shape_array_key == shape_array4) & (merge1.stop_sequence == stop_seq4)][merge1_preview_cols]

In [None]:
# compare_average(avg_test, og_avg, shape_array5, stop_seq5)

### Check using Big Blue Bus