In [37]:
import datetime
import dask.dataframe as dd
import numpy as np
import geopandas as gpd
import pandas as pd
import altair as alt
from segment_speed_utils import gtfs_schedule_wrangling, helpers, segment_calcs,sched_rt_utils
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    PROJECT_CRS,
    SEGMENT_GCS,  
    analysis_date,
)
from scripts import (A1_sjoin_vp_segments, A2_valid_vehicle_positions,B2_avg_speeds_by_segment)
from shared_utils import calitp_color_palette as cp, rt_utils, geography_utils
import _threshold_utils as threshold_utils
import _rt_scheduled_utils as rt_scheduled_utils
CONFIG_PATH = './scripts/config.yml'
STOP_SEG_DICT = helpers.get_parameters(CONFIG_PATH, "stop_segments")

In [38]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

## % of Meters
* start with the speeds_stop_segments_{analysis_date} parquet (which is produced in B1_speeds_by_segment_trip ).
* grab in stop_segments_{analysis_date} (in CRS 3310 already), and you can get the segment's length.
* merge with the speeds by segment-trip, which contains the meters_elapsed column
* calculate pct where meters_elapsed/segment_length
* show me some charts around this, a couple of descriptives to make sure these are all ok
* give me rough descriptives of how many rows we're dropping if we go with keeping at least 30%, 40%, 50%

### Open these files just for testing

In [39]:
def import_speeds_segs(analysis_date:str, 
                       max_speed_cutoff: int, 
                       dict_inputs:dict)-> pd.DataFrame:
    FILE = dict_inputs['stage4']
    df = pd.read_parquet(f"{SEGMENT_GCS}{FILE}_{analysis_date}", 
        filters = [[("speed_mph", "<=", max_speed_cutoff)]])
    
    return df

In [40]:
def load_segments(analysis_date:str, dict_inputs:dict) -> gpd.GeoDataFrame:
    # Load in ALL segments, find the length
    FILE = dict_inputs['segments_file']
    segments = gpd.read_parquet(f"{SEGMENT_GCS}{FILE}_{analysis_date}.parquet")
    
    return segments

### Open Files
* Add `import_speeds_segs`,`calculate_segment_length`,`merge_segments_speeds` into `B2`

In [41]:
# dict_inputs = helpers.get_parameters(CONFIG_PATH, "stop_segments")

In [42]:
# dict_inputs

In [43]:
dictionary = helpers.get_parameters(CONFIG_PATH, "stop_segments")

In [44]:
speeds = import_speeds_segs(analysis_date,70, dictionary)

In [45]:
speeds.sample()

Unnamed: 0_level_0,gtfs_dataset_key,gtfs_dataset_name,trip_id,trip_instance_key,schedule_gtfs_dataset_key,shape_array_key,stop_sequence,min_time,min_dist,max_time,max_dist,meters_elapsed,sec_elapsed,speed_mph
__null_dask_index__,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
9999,80e31832b8c72f53603cccacc8a6a798,VCTC Vehicle Positions,152-61,b784199ad5fc10a9f3b96e6c663a2159,1770249a5a2e770ca90628434d4934b1,23e0200e326571108abc3cb1066bcfc6,13,28078.0,0.0,29056.0,292.72,292.72,978.0,0.67


In [46]:
segments_gdf = load_segments(analysis_date, dictionary)

In [47]:
def calculate_segment_length(segments:gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    # Load in ALL segments, find the length
    segments = segments.assign(
        meters_length=(segments.geometry.length)
    )
    df = segments.drop(columns = ['geometry','district','district_name'])
    
    return df

In [48]:
def merge_segments_speeds(speeds: pd.DataFrame,
                          segments:gpd.GeoDataFrame, 
                          ) -> pd.DataFrame:
    """
    Merge speeds and segments. Calculate length
    """
    segments = calculate_segment_length(segments)
    
    merge_cols = ['shape_array_key','stop_sequence','schedule_gtfs_dataset_key']
    merge1 = pd.merge(segments, speeds, on = merge_cols, how = "inner")
    
    merge1['percent'] = merge1.meters_elapsed/merge1.meters_length * 100
    
    merge1.percent = merge1.percent.fillna(0)
    return merge1

In [49]:
merge1 = merge_segments_speeds(speeds, segments_gdf)

In [50]:
merge1.columns

Index(['shape_array_key', 'stop_id', 'stop_sequence', 'loop_or_inlining',
       'schedule_gtfs_dataset_key', 'seg_idx', 'geometry_arrowized',
       'meters_length', 'gtfs_dataset_key', 'gtfs_dataset_name', 'trip_id',
       'trip_instance_key', 'min_time', 'min_dist', 'max_time', 'max_dist',
       'meters_elapsed', 'sec_elapsed', 'speed_mph', 'percent'],
      dtype='object')

In [51]:
def myround(x, base=5):
    return base * round(x/base)

In [52]:
def valid_trips_by_cutoff(df, percentages:list):
    final = pd.DataFrame()
    og_len = len(df)
    og_trips = df.trip_id.nunique()
    og_shape_array_key = df.shape_array_key.nunique()
    
    for i in percentages:
        # Round up percent to nearest 5. Ex: 43 becomes 45.
        percent = myround(df.percent.quantile(i).astype(float))
        temp = (df[(df.percent >= percent)])
        
        temp = temp.assign(
            percentile = f"Min. of {percent}% of seg length covered")
        
        temp = (temp
                .groupby(['percentile'])
                .agg({'gtfs_dataset_name':'count',
                      'speed_mph':'mean',
                      'shape_array_key':'nunique',
                     'trip_id':'nunique',
                     'gtfs_dataset_key':'nunique'})
                .reset_index()
                .rename(columns = {'gtfs_dataset_name':'n_rows',
                                  'speed_mph':'mean_speed_mph',
                                  'shape_array_key':'n_kept_routes',
                                  'trip_id':'n_kept_trips',
                                  'gtfs_dataset_key':'n_kept_operators'})
               )
        
        final = pd.concat([final, temp], axis=0)

    
    final = final.assign(
        percentage_kept_rows=final.n_rows.divide(og_len) * 100,
        percentage_kept_trips = final.n_kept_trips.divide(og_trips) * 100,
        percentage_kept_routes = final.n_kept_routes.divide(og_shape_array_key)*100)
    
    round_cols = ['percentage_kept_rows', 'percentage_kept_trips', 'percentage_kept_routes']
    final[round_cols] = final[round_cols].round(0)
    return final

In [53]:
test = valid_trips_by_cutoff(merge1, [.1,.2,.3,.4,.5,.6,.7])

In [54]:
#test

## Visualizing Speed
* https://nbviewer.org/github/cal-itp/data-analyses/blob/filter-speeds-avgs/rt_segment_speeds/18_speed_distribution.ipynb
* https://analysis.calitp.org/rt/district_07-los-angeles/9__speedmaps__district_07-los-angeles__itp_id_300.html

### % of rows kept

In [55]:
test = threshold_utils.pre_clean(test)

In [56]:
test

Unnamed: 0,Percentile,N Rows,Mean Speed Mph,N Kept Routes,N Kept Trips,N Kept Operators,Percentage Kept Rows,Percentage Kept Trips,Percentage Kept Routes
0,Min. of 10% of seg length covered,2026035,11.8,4080,65939,74,90.0,100.0,100.0
0,Min. of 40% of seg length covered,1820711,12.3,4066,65709,74,81.0,99.0,99.0
0,Min. of 60% of seg length covered,1573312,12.2,4055,65424,74,70.0,99.0,99.0
0,Min. of 75% of seg length covered,1308198,11.8,4039,64949,74,58.0,98.0,99.0
0,Min. of 85% of seg length covered,1070347,11.5,4019,64120,74,48.0,97.0,98.0
0,Min. of 90% of seg length covered,914280,11.2,3987,63142,74,41.0,95.0,98.0
0,Min. of 95% of seg length covered,698923,10.8,3917,61293,74,31.0,93.0,96.0


In [57]:
# Main chart
def bar_chart(df, x_column: str, y_column:str, title:str):
    chart = (alt.Chart(df)
         .mark_bar()
         .encode(x=x_column, y= y_column, 
          color=alt.Color(y_column, 
          scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
          legend=None),
          tooltip = df.columns.tolist())
         .properties(title = title)
            )
    chart = threshold_utils.chart_size((chart), 400,300)
    return chart
    

In [58]:
bar_chart(test, 'Percentage Kept Rows','Percentile', 'Rows Kept After % Segment Cutoff')

## Only keep speeds that meet a certain threshold
* Put it in `speeds_with_segment_geom` in `B2`

In [59]:
columns_to_keep = ['shape_array_key', 'stop_sequence', 'gtfs_dataset_key',
      'gtfs_dataset_name', 'trip_id', 'min_time', 'min_dist', 'max_time',
       'max_dist', 'meters_elapsed', 'sec_elapsed', 'speed_mph',
      'trip_instance_key', 'schedule_gtfs_dataset_key',]

In [60]:
def speeds_length_filter(speeds: pd.DataFrame,
                         segments: gpd.GeoDataFrame,
                         columns_to_keep: list,
                         percentile: float = 0.20) -> pd.DataFrame:
    
    # df = merge_segments_speeds(segments, analysis_date, max_speed_cut_off,dict_inputs)
    df= merge_segments_speeds(speeds, segments_gdf)
    
    percent = df.percent.quantile(percentile).astype(float)
    
    df = (df[(df.percent >= percent)])
    
    df = df[columns_to_keep]
    
    return df 

### Edited B2 function

In [61]:
# speeds.sample()

In [62]:
# speeds_filtered = speeds_length_filter(speeds, segments_gdf, 0.20)

In [63]:
# speeds_filtered.columns

In [64]:
# test = add_back_missing_stops(speeds_filtered, speeds)

In [65]:
# test.shape

In [66]:
# test.sample()

In [67]:
# concat1 = pd.concat([test, speeds_filtered])

In [68]:
# concat1.shape

In [69]:
# speeds.shape

In [93]:
def add_back_missing_stops(filtered_speeds:pd.DataFrame, 
                           original_speeds:pd.DataFrame,
                           columns_to_merge:list) -> pd.DataFrame:
    
    merge1 = pd.merge(original_speeds, filtered_speeds, on = columns_to_merge, how = "left", indicator = True)
    
    # Filter out for left only speeds that were deleted
    left_only = merge1.loc[merge1._merge == 'left_only']
    left_only = left_only[columns_to_merge].reset_index(drop = True)
    
    return left_only 


In [108]:
def speeds_with_segment_geom(
    analysis_date: str, 
    max_speed_cutoff: int = 70,
    dict_inputs: dict = {},
    percentile:float = 0.20,
) -> gpd.GeoDataFrame: 
    """
    Import the segment-trip table. 
    Average the speed_mph across all trips present in the segment.
    """
    SEGMENT_FILE = dict_inputs["segments_file"]
    SEGMENT_IDENTIFIER_COLS = dict_inputs["segment_identifier_cols"]
    SPEEDS_FILE = dict_inputs["stage4"]
    
    # Merge in segment geometry
    segments = helpers.import_segments(
        SEGMENT_GCS,
        f"{SEGMENT_FILE}_{analysis_date}",
        columns = SEGMENT_IDENTIFIER_COLS + [
            "schedule_gtfs_dataset_key", 
            "stop_id",
            "loop_or_inlining",
            "geometry", 
            "district", "district_name"
        ]
    )
    
    # Read in speeds
    df = pd.read_parquet(
        f"{SEGMENT_GCS}{SPEEDS_FILE}_{analysis_date}", 
        filters = [[("speed_mph", "<=", max_speed_cutoff), 
                    ("sec_elapsed", ">", 0), 
                    ("meters_elapsed", ">", 0)
                   ]]
    )
    
    df2 = df[df.speed_mph.notna() ].reset_index(drop=True)
    
    # Filter out abnormally high and low speeds
    # Threshold defaults to throwing away the bottom 20% of rows with low speeds.
    columns_keep_merge = ['shape_array_key', 'stop_sequence', 'schedule_gtfs_dataset_key',
      'gtfs_dataset_name', 'trip_id', 'min_time', 'min_dist', 'max_time',
       'max_dist', 'meters_elapsed', 'sec_elapsed', 'speed_mph',
      'trip_instance_key']
    
    df3 = speeds_length_filter(df2, segments, columns_keep_merge, percentile)
    time_of_day_df = sched_rt_utils.get_trip_time_buckets(analysis_date)

    df4 = pd.merge(
        df3, 
        time_of_day_df, 
        on = "trip_instance_key", 
        how = "inner"
    )
    
    all_day = B2_avg_speeds_by_segment.calculate_avg_speeds(
        df4, 
        SEGMENT_IDENTIFIER_COLS
    )
    peak = B2_avg_speeds_by_segment.calculate_avg_speeds(
        df4[df4.time_of_day.isin(["AM Peak", "PM Peak"])], 
        SEGMENT_IDENTIFIER_COLS
    )
    
    stats = pd.concat([
        all_day.assign(time_of_day = "all_day"),
        peak.assign(time_of_day = "peak")
    ], axis=0)
    
    # Add back in rows that were filtered out for 
    # segment length
    missing = add_back_missing_stops(stats, df, ['shape_array_key', 'stop_sequence'])
    
    # Concat & fill in NA 
    stats2 = pd.concat([missing, stats])
    stats2 = stats2.fillna(0) 
    
    # Merge in segment geometry with a changed CRS
    segments = segments.to_crs(geography_utils.WGS84)
    
    gdf = pd.merge(
        segments,
        stats2,
        on = SEGMENT_IDENTIFIER_COLS,
        how = "inner"
    )
    return gdf

### Check out speeds

In [109]:
# Teakes around 7 minutes
avg_test = speeds_with_segment_geom(analysis_date, 
                                    70, 
                                    dictionary,
                                    0.2)

In [110]:
avg_test.columns

Index(['shape_array_key', 'stop_sequence', 'schedule_gtfs_dataset_key',
       'stop_id', 'loop_or_inlining', 'geometry', 'district', 'district_name',
       'p50_mph', 'n_trips', 'p20_mph', 'p80_mph', 'time_of_day'],
      dtype='object')

In [99]:
STG5_FILE = dictionary['stage5']
og_avg = gpd.read_parquet(f"{SEGMENT_GCS}{STG5_FILE}_{analysis_date}.parquet")
og_avg = og_avg.drop(columns=["district", "district_name"])

In [111]:
og_avg.columns

Index(['shape_array_key', 'stop_sequence', 'schedule_gtfs_dataset_key',
       'stop_id', 'loop_or_inlining', 'geometry', 'p50_mph', 'n_trips',
       'p20_mph', 'p80_mph', 'time_of_day'],
      dtype='object')

In [100]:
STG5_FILE

'avg_speeds_stop_segments'

In [112]:
og_avg.shape

(230532, 11)

In [113]:
avg_test.shape

(236547, 13)

In [None]:
merge1_preview_cols = ['meters_length', 
       'trip_id', 'min_time', 'min_dist', 'max_time', 'max_dist',
       'meters_elapsed', 'sec_elapsed', 'speed_mph', 
       'percent']

## Sample segments and routes for Big Blue Bus

In [None]:
test_operator = "Big Blue Bus VehiclePositions"
test_org = "City of Santa Monica"
test_key = "6c2d7daaf979779fa2089c6395baf98b"

In [None]:
pub_df = pd.read_parquet(
    f"{SEGMENT_GCS}export/avg_speeds_stop_segments_{analysis_date}_tabular.parquet", 
    filters = [[("agency", "==", test_org)]]
)

In [None]:
# Dark orange
shape_id1  = "26375"
stop_seq1 = 7
shape_array1 = pub_df[pub_df.shape_id==shape_id1].shape_array_key.iloc[0]

In [None]:
# Light yellow 
shape_id2 = "26342"
stop_seq2 = 23
shape_array2 = pub_df[pub_df.shape_id==shape_id2].shape_array_key.iloc[0]

In [None]:
# Dark Red
shape_id3 = "26393"
stop_seq3 = 32
shape_array3 = pub_df[pub_df.shape_id==shape_id3].shape_array_key.iloc[0]

In [None]:
# Light orange
shape_id4 = "26372"
stop_seq4 = 14
shape_array4 = pub_df[pub_df.shape_id==shape_id4].shape_array_key.iloc[0]

In [None]:
# Green
shape_id5 = "26400"
stop_seq5= 8
shape_array5 = pub_df[pub_df.shape_id==shape_id5].shape_array_key.iloc[0]

In [None]:
def one_route_map(avg_speeds:gpd.GeoDataFrame, shape_array_key:str):
    display(avg_speeds[avg_speeds.shape_array_key==shape_array_key].explore(
    "p50_mph", 
    tiles = "CartoDB Positron",
    cmap = rt_utils.ZERO_THIRTY_COLORSCALE,
    style_kwds = {'weight':5}))
    

## Checks

In [None]:
def compare_average(avg_test:pd.DataFrame,
                    og_avg:pd.DataFrame, 
                    shape_array_key:str,
                    stop_sequence:int):
    
    print(f"route {shape_array_key}, sequence {stop_sequence}")
    drop_cols = ['shape_array_key','geometry','stop_sequence']
    print('Original')
    display(og_avg[(og_avg.shape_array_key == shape_array_key) &(og_avg.stop_sequence == stop_sequence)].drop(columns = drop_cols))
    
    print('New')
    display(avg_test[(avg_test.shape_array_key == shape_array_key) & (avg_test.stop_sequence == stop_sequence)].drop(columns = drop_cols))     

In [None]:
og_avg.p50_mph.describe()

In [None]:
avg_test.p50_mph.describe()

In [None]:
compare_average(avg_test, og_avg, shape_array1, stop_seq1)

In [None]:
compare_average(avg_test, og_avg, shape_array2, stop_seq2)

* Strangely enough, the 50th percentile speed became lower
* Sometimes the lower the length, the higher the speed?

In [None]:
compare_average(avg_test, og_avg, shape_array3, stop_seq3)

In [None]:
merge1.loc[(merge1.shape_array_key == shape_array3) & (merge1.stop_sequence == stop_seq3)][merge1_preview_cols].sort_values(['percent'])

* This is missing because the only row that was non-zero only covered 27% of the length
* Maybe up the threshold?

In [None]:

compare_average(avg_test, og_avg, shape_array4, stop_seq4)

In [None]:
merge1.loc[(merge1.shape_array_key == shape_array4) & (merge1.stop_sequence == stop_seq4)][merge1_preview_cols]

In [None]:
compare_average(avg_test, og_avg, shape_array5, stop_seq5)

## Comparison
* i still want a left join for segments, but only for segments that have RT trips ever (that end up going missing, after the 45% rule)

* i don't want a left join to show all segments ever, bc segments are cut from scheduled shapes. more operators have schedule data than RT. so i don't want a mess of segments at the end of the left join off schedule, but a left join for "RT shapes"

In [None]:
# Merge1: ALL rows even before cutting off max speed of 70 and segments with less than 
# 40% of the segment
rt_segs_agg = (merge1
            .groupby(['shape_array_key','schedule_gtfs_dataset_key'])
            .agg({'stop_sequence':'nunique'})
            .reset_index()
            .rename(columns = {'stop_sequence':'total_stops'})
            .add_prefix('og_')
           )

In [None]:
rt_segs_agg.sample()

In [None]:
# Rows that are above 70 mph and under 40% coverage of a segment are thrown away
avg_test_agg = (avg_test
            .groupby(['shape_array_key','schedule_gtfs_dataset_key'])
            .agg({'stop_sequence':'nunique'})
            .reset_index()
            .rename(columns = {'stop_sequence':'total_stops'})
            .add_prefix('testing_')
           )

In [None]:
avg_test_agg.sample()

In [None]:
avg_test_agg.shape, rt_segs_agg.shape

In [None]:
m1 = (pd
      .merge(rt_segs_agg, avg_test_agg, 
             left_on = ['og_shape_array_key', 'og_schedule_gtfs_dataset_key'],
             right_on = ['testing_shape_array_key', 'testing_schedule_gtfs_dataset_key'], 
             how = 'outer', indicator = True)
     )

In [None]:
m1['total_thrown_out_stops'] = m1.og_total_stops - m1.testing_total_stops

In [None]:
m1.sample(3)

In [None]:
m1.loc[m1._merge == 'left_only'].sample(3)

* 28 routes are missing.

In [None]:
m1._merge.value_counts()

In [None]:
m1.total_thrown_out_stops.describe()

#### Seeing all the sequences thrown out

In [None]:
subset = ['shape_array_key','schedule_gtfs_dataset_key', 'stop_sequence']

In [None]:
merge1.columns

In [None]:
merge2 = merge1[subset]

In [None]:
avg_test2 = avg_test[subset]

In [None]:
m2 = pd.merge(merge2, avg_test2, on = subset, how = 'outer', indicator = True)

In [None]:
m2._merge.value_counts()

In [None]:
m2.columns

In [None]:
thrown_out_sequences = m2.loc[m2._merge == "left_only"].reset_index()

In [None]:
thrown_out_sequences.sample()

In [None]:
thrown_out_sequences2 = (thrown_out_sequences
            .groupby(['shape_array_key'])
            .agg({'stop_sequence':'nunique'})
            .reset_index()
            .rename(columns = {'stop_sequence':'total_missing_stops'})
           )

In [None]:
thrown_out_sequences2.shape

In [None]:
thrown_out_sequences2.head()

In [None]:
thrown_out_sequences2.total_missing_stops.describe()