In [1]:
import datetime
import dask.dataframe as dd
import numpy as np
import geopandas as gpd
import pandas as pd
import altair as alt
from segment_speed_utils import gtfs_schedule_wrangling, helpers, segment_calcs,sched_rt_utils
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    PROJECT_CRS,
    SEGMENT_GCS,  
    analysis_date,
)
from scripts import (A1_sjoin_vp_segments, A2_valid_vehicle_positions,B2_avg_speeds_by_segment)
from shared_utils import calitp_color_palette as cp, rt_utils, geography_utils
import _threshold_utils as threshold_utils
import _rt_scheduled_utils as rt_scheduled_utils
CONFIG_PATH = './scripts/config.yml'
STOP_SEG_DICT = helpers.get_parameters(CONFIG_PATH, "stop_segments")


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In the next release, GeoPandas will switch to using Shapely by default, even if PyGEOS is installed. If you only have PyGEOS installed to get speed-ups, this switch should be smooth. However, if you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
analysis_date

'2023-07-12'

## % of Meters
* start with the speeds_stop_segments_{analysis_date} parquet (which is produced in B1_speeds_by_segment_trip ).
* grab in stop_segments_{analysis_date} (in CRS 3310 already), and you can get the segment's length.
* merge with the speeds by segment-trip, which contains the meters_elapsed column
* calculate pct where meters_elapsed/segment_length
* show me some charts around this, a couple of descriptives to make sure these are all ok
* give me rough descriptives of how many rows we're dropping if we go with keeping at least 30%, 40%, 50%

### Open these files just for testing

In [4]:
def import_speeds_segs(analysis_date:str, 
                       max_speed_cutoff: int, 
                       dict_inputs:dict)-> pd.DataFrame:
    FILE = dict_inputs['stage4']
    df = pd.read_parquet(f"{SEGMENT_GCS}{FILE}_{analysis_date}", 
        filters = [[("speed_mph", "<=", max_speed_cutoff), 
                    ("sec_elapsed", ">", 0), 
                    ("meters_elapsed", ">", 0)
                   ]])
    
    return df

In [5]:
def load_segments(analysis_date:str, dict_inputs:dict) -> gpd.GeoDataFrame:
    # Load in ALL segments, find the length
    FILE = dict_inputs['segments_file']
    segments = gpd.read_parquet(f"{SEGMENT_GCS}{FILE}_{analysis_date}.parquet")
    
    return segments

## Sample segments and routes for Big Blue Bus

In [6]:
test_operator = "Big Blue Bus VehiclePositions"
test_org = "City of Santa Monica"
test_key = "6c2d7daaf979779fa2089c6395baf98b"

In [7]:
pub_df = pd.read_parquet(
    f"{SEGMENT_GCS}export/avg_speeds_stop_segments_{analysis_date}_tabular.parquet", 
    filters = [[("agency", "==", test_org)]]
)

In [8]:
# Dark orange
shape_id1  = "26375"
stop_seq1 = 7
shape_array1 = pub_df[pub_df.shape_id==shape_id1].shape_array_key.iloc[0]

In [9]:
# Light yellow 
shape_id2 = "26342"
stop_seq2 = 23
shape_array2 = pub_df[pub_df.shape_id==shape_id2].shape_array_key.iloc[0]

In [10]:
# Dark Red
shape_id3 = "26393"
stop_seq3 = 32
shape_array3 = pub_df[pub_df.shape_id==shape_id3].shape_array_key.iloc[0]

In [11]:
# Light orange
shape_id4 = "26372"
stop_seq4 = 14
shape_array4 = pub_df[pub_df.shape_id==shape_id4].shape_array_key.iloc[0]

In [12]:
# Green
shape_id5 = "26400"
stop_seq5= 8
shape_array5 = pub_df[pub_df.shape_id==shape_id5].shape_array_key.iloc[0]

In [13]:
def one_route_map(avg_speeds:gpd.GeoDataFrame, shape_array_key:str):
    display(avg_speeds[avg_speeds.shape_array_key==shape_array_key].explore(
    "p50_mph", 
    tiles = "CartoDB Positron",
    cmap = rt_utils.ZERO_THIRTY_COLORSCALE,
    style_kwds = {'weight':5}))
    

## Open Files
* Add `import_speeds_segs`,`calculate_segment_length`,`merge_segments_speeds` into `B2`

In [14]:
dictionary = helpers.get_parameters(CONFIG_PATH, "stop_segments")

In [15]:
segments_gdf = load_segments(analysis_date, dictionary)

In [16]:
speeds = import_speeds_segs(analysis_date,70,  dictionary)

In [17]:
def calculate_segment_length(segments:gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """
    Find the length of segments
    """
    segments = segments.assign(
        meters_length=(segments.geometry.length)
    )
    df = segments.drop(columns = ['geometry','district','district_name'])
    
    return df

In [18]:
def merge_segments_speeds(speeds: pd.DataFrame,
                          segments:gpd.GeoDataFrame, 
                          ) -> pd.DataFrame:
    """
    Merge speeds and segments, calculate length.
    """
    segments = calculate_segment_length(segments)
    
    merge_cols = ['shape_array_key','stop_sequence','schedule_gtfs_dataset_key']
    merge1 = pd.merge(segments, speeds, on = merge_cols, how = "inner")
    
    merge1['percent'] = merge1.meters_elapsed/merge1.meters_length * 100
    
    merge1.percent = merge1.percent.fillna(0)
    return merge1

In [19]:
merge1 = merge_segments_speeds(speeds, segments_gdf)

In [20]:
def myround(x, base=5):
    return base * round(x/base)

In [21]:
def valid_trips_by_cutoff(df, percentages:list):
    """
    Returns a table of how many trips/routes/operators/etc are kept 
    after applying a certain percentile cutoff for the 
    length of segment that's covered.
    
    Ex: if a segment has 40% of its length covered in RT 
    data, it falls in the 20th percentile.
    
    Args:
        df: result from  merge_segments_speeds(speeds, segments_gdf)
        percentages: the percentiles in floats like 0.2, 0.25, 0.3
    """
    final = pd.DataFrame()
    og_len = len(df)
    og_trips = df.trip_id.nunique()
    og_shape_array_key = df.shape_array_key.nunique()
    
    for i in percentages:
        # Round up percent to nearest 5. Ex: 43 becomes 45.
        percent = myround(df.percent.quantile(i).astype(float))
        temp = (df[(df.percent >= percent)])
        
        temp = temp.assign(
            percentile = f"Min. of {percent}% of seg length covered")
        
        temp = (temp
                .groupby(['percentile'])
                .agg({'gtfs_dataset_name':'count',
                      'speed_mph':'mean',
                      'shape_array_key':'nunique',
                      'trip_id':'nunique',
                      'gtfs_dataset_key':'nunique'})
                .reset_index()
                .rename(columns = {'gtfs_dataset_name':'n_rows',
                                  'speed_mph':'mean_speed_mph',
                                  'shape_array_key':'n_kept_routes',
                                  'trip_id':'n_kept_trips',
                                  'gtfs_dataset_key':'n_kept_operators'})
               )
        
        final = pd.concat([final, temp], axis=0)

    
    final = final.assign(
        percentage_kept_rows=final.n_rows.divide(og_len) * 100,
        percentage_kept_trips = final.n_kept_trips.divide(og_trips) * 100,
        percentage_kept_routes = final.n_kept_routes.divide(og_shape_array_key)*100)
    
    round_cols = ['percentage_kept_rows', 'percentage_kept_trips', 'percentage_kept_routes']
    final[round_cols] = final[round_cols].round(0)
    return final

In [22]:
test = valid_trips_by_cutoff(merge1, [.1,.2,.3,.4,.5,.6,.7])

## Visualizing Speed
* https://nbviewer.org/github/cal-itp/data-analyses/blob/filter-speeds-avgs/rt_segment_speeds/18_speed_distribution.ipynb
* https://analysis.calitp.org/rt/district_07-los-angeles/9__speedmaps__district_07-los-angeles__itp_id_300.html

### % of rows kept

In [23]:
test = threshold_utils.pre_clean(test)

In [24]:
test

Unnamed: 0,Percentile,N Rows,Mean Speed Mph,N Kept Routes,N Kept Trips,N Kept Operators,Percentage Kept Rows,Percentage Kept Trips,Percentage Kept Routes
0,Min. of 35% of seg length covered,1865253,12.2,4067,65747,74,90.0,99.0,100.0
0,Min. of 55% of seg length covered,1646070,12.2,4060,65524,74,79.0,99.0,99.0
0,Min. of 65% of seg length covered,1493619,12.1,4053,65322,74,72.0,99.0,99.0
0,Min. of 80% of seg length covered,1198016,11.7,4034,64656,74,58.0,98.0,99.0
0,Min. of 85% of seg length covered,1070347,11.5,4019,64120,74,51.0,97.0,98.0
0,Min. of 90% of seg length covered,914280,11.2,3987,63142,74,44.0,96.0,98.0
0,Min. of 95% of seg length covered,698923,10.8,3917,61293,74,34.0,93.0,96.0


In [25]:
# Main chart
def bar_chart(df, x_column: str, y_column:str, title:str):
    chart = (alt.Chart(df)
         .mark_bar()
         .encode(x=x_column, y= y_column, 
          color=alt.Color(y_column, 
          scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
          legend=None),
          tooltip = df.columns.tolist())
         .properties(title = title)
            )
    chart = threshold_utils.chart_size((chart), 400,300)
    return chart
    

In [26]:
bar_chart(test, 'Percentage Kept Rows','Percentile', 'Rows Kept After % Segment Cutoff')

## Only keep speeds that meet a certain threshold
* Put it in `speeds_with_segment_geom` in `B2`

In [29]:
def speeds_length_filter(speeds: pd.DataFrame,
                         segments: gpd.GeoDataFrame,
                         columns_to_keep: list,
                         percentile: float = 0.20) -> pd.DataFrame:
    
    """
    Do an inner merge on speeds and segments. 
    Filter out segment lengths that don't meet a 
    particular percentile we set. 
    """
    # Do an inner merge 
    df= merge_segments_speeds(speeds, segments_gdf)
    
    # Round the percent
    percent = myround(df.percent.quantile(percentile).astype(float))
    
    df2 = (df[(df.percent >= percent)])
    
    df2 = df2[columns_to_keep]
    
    # Keep both unfiltered and filtered
    return df, df2

In [30]:
# test1, test2 = speeds_length_filter(speeds, segments_gdf, columns_to_keep, 0.10)

In [31]:
# test1.shape

In [32]:

# test2.shape

In [33]:
# test2.columns

### Edited B2 function

In [34]:
columns_to_keep = ['shape_array_key', 'stop_sequence', 'gtfs_dataset_key',
      'gtfs_dataset_name', 'trip_id', 'min_time', 'min_dist', 'max_time',
       'max_dist', 'meters_elapsed', 'sec_elapsed', 'speed_mph',
      'trip_instance_key', 'schedule_gtfs_dataset_key',]

In [37]:
def add_back_missing_stops(filtered_speeds:pd.DataFrame, 
                           original_speeds:pd.DataFrame,
                           columns_to_merge:list) -> pd.DataFrame:
    
  
    merge1 = pd.merge(original_speeds, filtered_speeds, on = columns_to_merge, how = "left", indicator = True)
    print(merge1._merge.value_counts())
    
    
    # Filter out for left only speeds that were deleted
    left_only = merge1.loc[merge1._merge == 'left_only']
    
    # Check that filtering was right
    # print(len(original_speeds)-len(filtered_speeds) == len(left_only))
    
    # Only need to keep one instance of a deleted stop seq.
    left_only = left_only[columns_to_merge].drop_duplicates().reset_index(drop = True)

    return left_only 


In [38]:
# test3 = add_back_missing_stops(test2, test1, columns_to_keep)

In [39]:
# test3.shape

In [40]:
# test3.sample(3)

### Question
* When should I add back missing stops, right now I'm doing it after you are done calculating all the averages but maybe this should happen at a different point of the process.

In [89]:
def speeds_with_segment_geom(
    analysis_date: str, 
    max_speed_cutoff: int = 70,
    dict_inputs: dict = {},
    percentile:float = 0.20,
) -> gpd.GeoDataFrame: 
    """
    Import the segment-trip table. 
    Average the speed_mph across all trips present in the segment.
    """
    SEGMENT_FILE = dict_inputs["segments_file"]
    SEGMENT_IDENTIFIER_COLS = dict_inputs["segment_identifier_cols"]
    SPEEDS_FILE = dict_inputs["stage4"]
    
    # Merge in segment geometry
    segments = helpers.import_segments(
        SEGMENT_GCS,
        f"{SEGMENT_FILE}_{analysis_date}",
        columns = SEGMENT_IDENTIFIER_COLS + [
            "schedule_gtfs_dataset_key", 
            "stop_id",
            "loop_or_inlining",
            "geometry", 
            "district", "district_name"
        ]
    )
    
    # check CRS
    segments["segment_length"] = segments.geometry.length
    
    # Read in speeds
    df = pd.read_parquet(
        f"{SEGMENT_GCS}{SPEEDS_FILE}_{analysis_date}", 
        #filters = [[("speed_mph", "<=", max_speed_cutoff), 
        #            ("sec_elapsed", ">", 0), 
        #            ("meters_elapsed", ">", 0)
        #           ]]
    )
    
    #df2 = df[df.speed_mph.notna() ].reset_index(drop=True)
    
    unique_segments = df[SEGMENT_IDENTIFIER_COLS].drop_duplicates()
    
    # do a merge with segments
    
    
    df2 = df2.assign(
        pct_seg = df2.meters_elapsed.divide(df2.segment_length)
    )
    
    df3 = df2[(df2.pct_seg >= some_threshold) & (df2.speed_mph.notna()) & 
              (df2.sec_elapsed > 0) & (df2.meters_elapsed > 0)]
    
    # Filter out abnormally high and low speeds
    # Threshold defaults to throwing away the bottom 20% of rows with low speeds.
    columns_keep_merge = ['shape_array_key', 'stop_sequence', 'schedule_gtfs_dataset_key',
      'gtfs_dataset_name', 'trip_id', 'min_time', 'min_dist', 'max_time',
       'max_dist', 'meters_elapsed', 'sec_elapsed', 'speed_mph',
      'trip_instance_key']
    all_speeds, df3 = speeds_length_filter(df2, segments, columns_keep_merge, percentile)
    
    time_of_day_df = sched_rt_utils.get_trip_time_buckets(analysis_date)

    df4 = pd.merge(
        df3, 
        time_of_day_df, 
        on = "trip_instance_key", 
        how = "inner"
    )
    
    all_day = B2_avg_speeds_by_segment.calculate_avg_speeds(
        df4, 
        SEGMENT_IDENTIFIER_COLS
    )
    peak = B2_avg_speeds_by_segment.calculate_avg_speeds(
        df4[df4.time_of_day.isin(["AM Peak", "PM Peak"])], 
        SEGMENT_IDENTIFIER_COLS
    )
    
    stats = pd.concat([
        all_day.assign(time_of_day = "all_day"),
        peak.assign(time_of_day = "peak")
    ], axis=0)
    
    ## newstuff 
    # start with segments with geom (scheduled, we have way too many)
    # merge against unique_segments (these are present in RT...inner join)...we have geom for segments we care about
    # merge against stats (left join)
    
    # Add back in rows that were filtered out for 
    # segment length
    missing = add_back_missing_stops(stats, all_speeds, ['shape_array_key', 'stop_sequence'])
    
    # Concat & fill in NA 
    stats2 = pd.concat([missing, stats])
    stats2 = stats2.fillna(stats2.dtypes.replace({'float64': 0.0, 'object': 'None'}))
                           
    # Merge in segment geometry with a changed CRS
    segments = segments.to_crs(geography_utils.WGS84)
    
    gdf = pd.merge(
        segments,
        stats2,
        on = SEGMENT_IDENTIFIER_COLS,
        how = "inner"
    )
    
    return gdf

SyntaxError: invalid syntax (2594350709.py, line 49)

### Check out speeds

In [42]:
# 2:12
avg_test = speeds_with_segment_geom(analysis_date, 
                                    70, 
                                    dictionary,
                                    0.1)

both          4074061
left_only        9225
right_only          0
Name: _merge, dtype: int64


In [43]:
STG5_FILE = dictionary['stage5']
og_avg = gpd.read_parquet(f"{SEGMENT_GCS}{STG5_FILE}_{analysis_date}.parquet")

In [44]:
STG5_FILE

'avg_speeds_stop_segments'

In [45]:
avg_test.columns == og_avg.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True])

* There are 3000 less rows

In [46]:
avg_test.sample(3).drop(columns=['geometry'])

Unnamed: 0,shape_array_key,stop_sequence,schedule_gtfs_dataset_key,stop_id,loop_or_inlining,district,district_name,p50_mph,n_trips,p20_mph,p80_mph,time_of_day
196428,ddfd6767d8d7e9a8fba6734f8c47c310,2,1ebafaca8716652559b2017b6eedc4ef,821079,0,4,District 4 - Oakland,1.34,11.0,1.24,2.02,all_day
97752,6d7a6034a6d16c6821dd5572063c0b85,23,3f3f36b4c41cc6b5df3eb7f5d8ea6e3c,880,0,7,District 7 - Los Angeles,15.42,3.0,14.7,21.56,peak
174950,c607b07eae80cceddb1e8903400ea058,5,fb467982dcc77a7f9199bebe709bb700,64175,0,4,District 4 - Oakland,23.36,7.0,15.81,25.04,all_day


In [47]:
# See unique stops-keys-routes. They should have no more than 2.
avg_test.groupby(['shape_array_key','schedule_gtfs_dataset_key','stop_sequence']).agg({'district':'count'}).sort_values('district', ascending = False).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,district
shape_array_key,schedule_gtfs_dataset_key,stop_sequence,Unnamed: 3_level_1
000b8c60f7767e8214f6ef6638d2cb83,78b44303c1714f6c6a4801637c2a5c9d,2,2
a02184b3fd23d678b7c12ed197936df7,3f3f36b4c41cc6b5df3eb7f5d8ea6e3c,14,2
a02184b3fd23d678b7c12ed197936df7,3f3f36b4c41cc6b5df3eb7f5d8ea6e3c,25,2
a02184b3fd23d678b7c12ed197936df7,3f3f36b4c41cc6b5df3eb7f5d8ea6e3c,24,2
a02184b3fd23d678b7c12ed197936df7,3f3f36b4c41cc6b5df3eb7f5d8ea6e3c,23,2


In [48]:
og_avg.shape

(230532, 13)

In [49]:
avg_test.shape

(227055, 13)

## Checks

### Checking one sequence that was deemed "missing"

In [50]:
# Test 3 contains missing sequences that were added back on
# From "add_back_missing_stops"
avg_test.loc[avg_test.time_of_day == 'None'].sample(3).drop(columns = ['geometry'])

Unnamed: 0,shape_array_key,stop_sequence,schedule_gtfs_dataset_key,stop_id,loop_or_inlining,district,district_name,p50_mph,n_trips,p20_mph,p80_mph,time_of_day
58561,41f674db173a1acc97aea4dac0187576,17,3f3f36b4c41cc6b5df3eb7f5d8ea6e3c,2253,0,7,District 7 - Los Angeles,0.0,0.0,0.0,0.0,
64950,4782a824634ba68149e036632f1d9651,3,43d8d305ee692724a532f30ea63a1cbe,1214,0,5,District 5 - San Luis Obispo,0.0,0.0,0.0,0.0,
44755,31108a486d64929d7a910564fe9ee9b2,2,eb9acbcb42315399bb54df78adfd3dac,490,0,5,District 5 - San Luis Obispo,0.0,0.0,0.0,0.0,


In [51]:
missing_shape1  = "609c3fe2d34fe2f239c12f998ae0fdb6"
missing_stop_seq1 = 33

In [52]:
def look_at_missing_row(shape, sequence):
    display(og_avg.loc[(og_avg.shape_array_key == shape) & (og_avg.stop_sequence == sequence)].drop(columns = ['geometry']))
    display(merge1.loc[(merge1.shape_array_key == shape) & (merge1.stop_sequence == sequence)][['percent', 'speed_mph']])
    display(avg_test.loc[(avg_test.shape_array_key == shape) & (avg_test.stop_sequence == sequence)].drop(columns = ['geometry']))

* Strange this was considered missing...how -->

In [53]:
look_at_missing_row(missing_shape1, missing_stop_seq1)

Unnamed: 0,shape_array_key,stop_sequence,schedule_gtfs_dataset_key,stop_id,loop_or_inlining,district,district_name,p50_mph,n_trips,p20_mph,p80_mph,time_of_day
88024,609c3fe2d34fe2f239c12f998ae0fdb6,33,34adcb78913b80f60f3295dc84561706,716,0,7,District 7 - Los Angeles,1.29,1,1.29,1.29,all_day


Unnamed: 0,percent,speed_mph
822919,17.24,1.29


Unnamed: 0,shape_array_key,stop_sequence,schedule_gtfs_dataset_key,stop_id,loop_or_inlining,district,district_name,p50_mph,n_trips,p20_mph,p80_mph,time_of_day
86745,609c3fe2d34fe2f239c12f998ae0fdb6,33,34adcb78913b80f60f3295dc84561706,716,0,7,District 7 - Los Angeles,0.0,0.0,0.0,0.0,


In [54]:
missing_shape2  = "4d006755475d7c2a03c525995311ec16"
missing_stop_seq2 = 29

In [55]:
look_at_missing_row(missing_shape2, missing_stop_seq2)

Unnamed: 0,shape_array_key,stop_sequence,schedule_gtfs_dataset_key,stop_id,loop_or_inlining,district,district_name,p50_mph,n_trips,p20_mph,p80_mph,time_of_day
70709,4d006755475d7c2a03c525995311ec16,29,97b8c635bbc4cdd02bc5c27d0e83df3c,e6650363-497d-4797-8931-a83c0bc389dc,0,3,District 3 - Marysville,15.25,1,15.25,15.25,all_day


Unnamed: 0,percent,speed_mph
651082,21.76,15.25


Unnamed: 0,shape_array_key,stop_sequence,schedule_gtfs_dataset_key,stop_id,loop_or_inlining,district,district_name,p50_mph,n_trips,p20_mph,p80_mph,time_of_day
69699,4d006755475d7c2a03c525995311ec16,29,97b8c635bbc4cdd02bc5c27d0e83df3c,e6650363-497d-4797-8931-a83c0bc389dc,0,3,District 3 - Marysville,0.0,0.0,0.0,0.0,


### Check using Big Blue Bus

In [56]:
def compare_average(avg_test:pd.DataFrame,
                    og_avg:pd.DataFrame, 
                    shape_array_key:str,
                    stop_sequence:int):
    
    print(f"route {shape_array_key}, sequence {stop_sequence}")
    drop_cols = ['shape_array_key','stop_sequence']
    print('Original')
    original = og_avg[(og_avg.shape_array_key == shape_array_key) &(og_avg.stop_sequence == stop_sequence)].drop(columns = drop_cols)
    display(original.drop(columns = ['geometry']))
    
    print('New')
    new = avg_test[(avg_test.shape_array_key == shape_array_key) & (avg_test.stop_sequence == stop_sequence)].drop(columns = drop_cols)
    display(new.drop(columns = ['geometry']))    
    display(new.explore(width = 300, height = 300))

In [57]:
og_avg.p50_mph.describe()

count   230532.00
mean        11.99
std          8.74
min          0.00
25%          5.99
50%          9.99
75%         15.80
max         69.96
Name: p50_mph, dtype: float64

In [58]:
avg_test.p50_mph.describe()

count   227055.00
mean        12.66
std          8.86
min          0.00
25%          6.58
50%         10.74
75%         16.67
max         69.96
Name: p50_mph, dtype: float64

In [59]:
# compare_average(avg_test, og_avg, shape_array1, stop_seq1)

In [60]:
# compare_average(avg_test, og_avg, shape_array2, stop_seq2)

* Strangely enough, the 50th percentile speed became lower
* Sometimes the lower the length, the higher the speed?

In [61]:
# compare_average(avg_test, og_avg, shape_array3, stop_seq3)

In [62]:
# merge1.loc[(merge1.shape_array_key == shape_array3) & (merge1.stop_sequence == stop_seq3)][merge1_preview_cols].sort_values(['percent'])

* This is missing because the only row that was non-zero only covered 27% of the length
* Maybe up the threshold?

In [63]:

# compare_average(avg_test, og_avg, shape_array4, stop_seq4)

In [64]:
# merge1.loc[(merge1.shape_array_key == shape_array4) & (merge1.stop_sequence == stop_seq4)][merge1_preview_cols]

In [65]:
# compare_average(avg_test, og_avg, shape_array5, stop_seq5)

## Comparison
* i still want a left join for segments, but only for segments that have RT trips ever (that end up going missing, after the 45% rule)

* i don't want a left join to show all segments ever, bc segments are cut from scheduled shapes. more operators have schedule data than RT. so i don't want a mess of segments at the end of the left join off schedule, but a left join for "RT shapes"

In [66]:
# Results from original script
rt_segs_agg = (og_avg
            .groupby(['shape_array_key','schedule_gtfs_dataset_key'])
            .agg({'stop_sequence':'nunique'})
            .reset_index()
            .rename(columns = {'stop_sequence':'total_stops'})
            .add_prefix('og_')
           )

In [67]:
rt_segs_agg.sample()

Unnamed: 0,og_shape_array_key,og_schedule_gtfs_dataset_key,og_total_stops
826,3464e30013d6dcad2a7e528bfb579c94,6d2c723904a2a0797ca993f541de70e6,5


In [68]:
# My test after filtering but adding back in cut rows
avg_test_agg = (avg_test
            .groupby(['shape_array_key','schedule_gtfs_dataset_key'])
            .agg({'stop_sequence':'nunique'})
            .reset_index()
            .rename(columns = {'stop_sequence':'total_stops'})
            .add_prefix('testing_')
           )

In [69]:
avg_test_agg.sample()

Unnamed: 0,testing_shape_array_key,testing_schedule_gtfs_dataset_key,testing_total_stops
54,0368a6697218759992a4b160df70d84b,c499f905e33929a641f083dad55c521e,23


In [70]:
avg_test_agg.shape, rt_segs_agg.shape

((4085, 3), (4085, 3))

In [71]:
m1 = (pd
      .merge(rt_segs_agg, avg_test_agg, 
             left_on = ['og_shape_array_key', 'og_schedule_gtfs_dataset_key'],
             right_on = ['testing_shape_array_key', 'testing_schedule_gtfs_dataset_key'], 
             how = 'outer', indicator = True)
     )

In [72]:
m1['total_thrown_out_stops'] = m1.og_total_stops - m1.testing_total_stops

In [73]:
m1.sample(3)

Unnamed: 0,og_shape_array_key,og_schedule_gtfs_dataset_key,og_total_stops,testing_shape_array_key,testing_schedule_gtfs_dataset_key,testing_total_stops,_merge,total_thrown_out_stops
3445,d8e74922c8732a6e5342bf90f5f538d3,3f3f36b4c41cc6b5df3eb7f5d8ea6e3c,80,d8e74922c8732a6e5342bf90f5f538d3,3f3f36b4c41cc6b5df3eb7f5d8ea6e3c,80,both,0
3458,d99bc2f12cc465a53ecbcc0d90b85bb7,78b44303c1714f6c6a4801637c2a5c9d,1,d99bc2f12cc465a53ecbcc0d90b85bb7,78b44303c1714f6c6a4801637c2a5c9d,1,both,0
2421,97de9c795bcfc94ec822d0842b7352e1,13cc00cd32512520df2bf2ed36cb08a6,17,97de9c795bcfc94ec822d0842b7352e1,13cc00cd32512520df2bf2ed36cb08a6,17,both,0


In [74]:
m1._merge.value_counts()

both          4085
left_only        0
right_only       0
Name: _merge, dtype: int64

* 28 routes are missing.

In [75]:
m1.total_thrown_out_stops.describe()

count   4085.00
mean       0.00
std        0.00
min        0.00
25%        0.00
50%        0.00
75%        0.00
max        0.00
Name: total_thrown_out_stops, dtype: float64

#### Seeing all the sequences thrown out

In [76]:
og_avg.columns

Index(['shape_array_key', 'stop_sequence', 'schedule_gtfs_dataset_key',
       'stop_id', 'loop_or_inlining', 'geometry', 'district', 'district_name',
       'p50_mph', 'n_trips', 'p20_mph', 'p80_mph', 'time_of_day'],
      dtype='object')

In [77]:
merge_cols = ['shape_array_key', 'stop_sequence', 'schedule_gtfs_dataset_key',
       'stop_id', 'loop_or_inlining']

In [78]:
subset = ['shape_array_key', 'stop_sequence', 'schedule_gtfs_dataset_key',
       'stop_id', 'loop_or_inlining', 'p50_mph', 'n_trips',
       'p20_mph', 'p80_mph', 'time_of_day']

In [79]:
og_avg2 = og_avg[subset]

In [80]:
og_avg2.shape

(230532, 10)

In [81]:
avg_test2 = avg_test[subset]

In [82]:
avg_test2.shape

(227055, 10)

In [83]:
m2 = pd.merge(og_avg2, avg_test2, on = merge_cols, how = 'outer', indicator = True)

* Why are some showing up in others...
* Left: original, right: average.

In [84]:
m2._merge.value_counts()

both          427852
left_only          0
right_only         0
Name: _merge, dtype: int64

In [85]:
one_subset = m2.loc[(m2.shape_array_key == "000b8c60f7767e8214f6ef6638d2cb83") & (m2.stop_id == "2197")]

In [86]:
one_subset.shape

(4, 16)

In [87]:
one_subset.drop_duplicates().shape

(4, 16)

In [88]:
one_subset.drop_duplicates()

Unnamed: 0,shape_array_key,stop_sequence,schedule_gtfs_dataset_key,stop_id,loop_or_inlining,p50_mph_x,n_trips_x,p20_mph_x,p80_mph_x,time_of_day_x,p50_mph_y,n_trips_y,p20_mph_y,p80_mph_y,time_of_day_y,_merge
40,000b8c60f7767e8214f6ef6638d2cb83,12,78b44303c1714f6c6a4801637c2a5c9d,2197,0,0.63,7,0.26,2.31,all_day,2.65,2.0,2.62,2.68,all_day,both
41,000b8c60f7767e8214f6ef6638d2cb83,12,78b44303c1714f6c6a4801637c2a5c9d,2197,0,0.63,7,0.26,2.31,all_day,2.65,2.0,2.62,2.68,peak,both
42,000b8c60f7767e8214f6ef6638d2cb83,12,78b44303c1714f6c6a4801637c2a5c9d,2197,0,0.77,6,0.23,2.59,peak,2.65,2.0,2.62,2.68,all_day,both
43,000b8c60f7767e8214f6ef6638d2cb83,12,78b44303c1714f6c6a4801637c2a5c9d,2197,0,0.77,6,0.23,2.59,peak,2.65,2.0,2.62,2.68,peak,both
