In [1]:
import datetime
import dask.dataframe as dd
import numpy as np
import geopandas as gpd
import pandas as pd
import altair as alt
from segment_speed_utils import gtfs_schedule_wrangling, helpers, segment_calcs,sched_rt_utils
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    PROJECT_CRS,
    SEGMENT_GCS,  
    analysis_date,
)
from scripts import (A1_sjoin_vp_segments, A2_valid_vehicle_positions,B2_avg_speeds_by_segment)
from shared_utils import calitp_color_palette as cp, rt_utils, geography_utils
import _threshold_utils as threshold_utils
import _rt_scheduled_utils as rt_scheduled_utils
CONFIG_PATH = './scripts/config.yml'
STOP_SEG_DICT = helpers.get_parameters(CONFIG_PATH, "stop_segments")


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In the next release, GeoPandas will switch to using Shapely by default, even if PyGEOS is installed. If you only have PyGEOS installed to get speed-ups, this switch should be smooth. However, if you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [2]:
# Notebook display settings: up to 100 columns, two-decimal floats,
# and no truncation of rows or of long cell text.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [3]:
analysis_date

'2023-07-12'

## % of Meters
* start with the speeds_stop_segments_{analysis_date} parquet (which is produced in B1_speeds_by_segment_trip ).
* grab in stop_segments_{analysis_date} (in CRS 3310 already), and you can get the segment's length.
* merge with the speeds by segment-trip, which contains the meters_elapsed column
* calculate pct where meters_elapsed/segment_length
* show me some charts around this, a couple of descriptives to make sure these are all ok
* give me rough descriptives of how many rows we're dropping if we go with keeping at least 30%, 40%, 50%

### Open these files just for testing

In [4]:
def import_speeds_segs(analysis_date:str, 
                       max_speed_cutoff: int, 
                       dict_inputs:dict)-> pd.DataFrame:
    """
    Read the segment-trip speeds table for one analysis date,
    keeping only rows whose speed_mph is at or below the cutoff.
    
    Args:
        analysis_date: date string appended to the file name.
        max_speed_cutoff: upper bound applied to speed_mph while reading.
        dict_inputs: config dictionary; its 'stage4' entry names the speeds file.
    """
    speeds_path = f"{SEGMENT_GCS}{dict_inputs['stage4']}_{analysis_date}"
    speed_filter = [[("speed_mph", "<=", max_speed_cutoff)]]
    
    return pd.read_parquet(speeds_path, filters = speed_filter)

In [5]:
def load_segments(analysis_date:str, dict_inputs:dict) -> gpd.GeoDataFrame:
    """
    Read every segment for one analysis date.
    
    Args:
        analysis_date: date string appended to the file name.
        dict_inputs: config dictionary; its 'segments_file' entry
            names the segments parquet (without date or extension).
    """
    segments_path = f"{SEGMENT_GCS}{dict_inputs['segments_file']}_{analysis_date}.parquet"
    
    return gpd.read_parquet(segments_path)

### Open Files
* Add `import_speeds_segs`,`calculate_segment_length`,`merge_segments_speeds` into `B2`

In [22]:
dictionary = helpers.get_parameters(CONFIG_PATH, "stop_segments")

In [23]:
segments_gdf = load_segments(analysis_date, dictionary)

In [24]:
speeds = import_speeds_segs(analysis_date,70,  dictionary)

In [25]:
speeds.columns

Index(['gtfs_dataset_key', 'gtfs_dataset_name', 'trip_id', 'trip_instance_key',
       'schedule_gtfs_dataset_key', 'shape_array_key', 'stop_sequence',
       'min_time', 'min_dist', 'max_time', 'max_dist', 'meters_elapsed',
       'sec_elapsed', 'speed_mph'],
      dtype='object')

In [26]:
segments_gdf.columns

Index(['shape_array_key', 'geometry', 'stop_id', 'stop_sequence',
       'loop_or_inlining', 'schedule_gtfs_dataset_key', 'seg_idx', 'district',
       'district_name', 'geometry_arrowized'],
      dtype='object')

In [27]:
def calculate_segment_length(segments:gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """
    Add each segment's length as `meters_length`, then drop the
    geometry and district columns.
    
    The length is whatever `geometry.length` returns in the segments' CRS;
    the column name assumes a meter-based CRS -- TODO confirm.
    """
    unneeded_cols = ['geometry','district','district_name']
    segment_lengths = segments.geometry.length
    
    with_length = segments.assign(meters_length = segment_lengths)
    
    return with_length.drop(columns = unneeded_cols)

In [28]:
def merge_segments_speeds(speeds: pd.DataFrame,
                          segments:gpd.GeoDataFrame, 
                          ) -> pd.DataFrame:
    """
    Inner-join segment lengths onto the segment-trip speeds and
    compute `percent`: meters_elapsed as a percentage of the
    segment's length. Missing percentages are set to 0.
    """
    join_keys = ['shape_array_key','stop_sequence','schedule_gtfs_dataset_key']
    
    segments_with_length = calculate_segment_length(segments)
    joined = pd.merge(segments_with_length, speeds, on = join_keys, how = "inner")
    
    # Share of the segment's length covered by this trip, in percent
    pct_covered = joined["meters_elapsed"] / joined["meters_length"] * 100
    
    return joined.assign(percent = pct_covered.fillna(0))

In [29]:
merge1 = merge_segments_speeds(speeds, segments_gdf)

In [30]:
def myround(x, base=5):
    """
    Round x to the nearest multiple of `base`.
    
    Relies on the built-in round(), so a value exactly halfway
    between two multiples follows round()'s tie handling.
    """
    n_multiples = round(x / base)
    return base * n_multiples

In [31]:
def valid_trips_by_cutoff(df, percentages:list):
    """
    Returns a table of how many rows/trips/routes/operators are kept 
    after requiring a minimum share of each segment's length 
    to be covered.
    
    Each entry of `percentages` is a quantile of the `percent` column.
    The value found at that quantile is rounded to the nearest 
    multiple of 5 (via myround) and used as the cutoff: rows whose 
    `percent` is at or above the cutoff are kept.
    
    Ex: if the 20th percentile of `percent` is 43, the cutoff is 45 
    and only rows covering at least 45% of their segment are kept.
    
    Args:
        df: result from  merge_segments_speeds(speeds, segments_gdf)
        percentages: the quantiles as floats like 0.2, 0.25, 0.3
        
    Returns:
        One row per entry in `percentages` with the cutoff label, 
        the counts of kept rows/routes/trips/operators, the mean speed, 
        and the kept rows/trips/routes as a rounded percentage of `df`. 
        An empty table with the same columns if `percentages` is empty.
    """
    agg_spec = {'gtfs_dataset_name':'count',
                'speed_mph':'mean',
                'shape_array_key':'nunique',
                'trip_id':'nunique',
                'gtfs_dataset_key':'nunique'}
    new_names = {'gtfs_dataset_name':'n_rows',
                 'speed_mph':'mean_speed_mph',
                 'shape_array_key':'n_kept_routes',
                 'trip_id':'n_kept_trips',
                 'gtfs_dataset_key':'n_kept_operators'}
    round_cols = ['percentage_kept_rows', 'percentage_kept_trips', 'percentage_kept_routes']
    
    # Totals before any cutoff, used as denominators below
    og_len = len(df)
    og_trips = df.trip_id.nunique()
    og_shape_array_key = df.shape_array_key.nunique()
    
    # Collect one summary row per quantile and concatenate once at the end
    summaries = []
    for i in percentages:
        # Round the quantile value to the nearest 5. Ex: 43 becomes 45.
        percent = myround(df.percent.quantile(i).astype(float))
        
        kept = df[df.percent >= percent].assign(
            percentile = f"Min. of {percent}% of seg length covered")
        
        summaries.append(
            kept
            .groupby(['percentile'])
            .agg(agg_spec)
            .reset_index()
            .rename(columns = new_names)
        )
    
    # Nothing requested: return an empty table instead of failing on
    # the missing columns below
    if not summaries:
        return pd.DataFrame(
            columns = ['percentile'] + list(new_names.values()) + round_cols)
    
    final = pd.concat(summaries, axis=0)
    
    final = final.assign(
        percentage_kept_rows=final.n_rows.divide(og_len) * 100,
        percentage_kept_trips = final.n_kept_trips.divide(og_trips) * 100,
        percentage_kept_routes = final.n_kept_routes.divide(og_shape_array_key)*100)
    
    final[round_cols] = final[round_cols].round(0)
    return final

In [32]:
test = valid_trips_by_cutoff(merge1, [.1,.2,.3,.4,.5,.6,.7])

## Visualizing Speed
* https://nbviewer.org/github/cal-itp/data-analyses/blob/filter-speeds-avgs/rt_segment_speeds/18_speed_distribution.ipynb
* https://analysis.calitp.org/rt/district_07-los-angeles/9__speedmaps__district_07-los-angeles__itp_id_300.html

### % of rows kept

In [33]:
test = threshold_utils.pre_clean(test)

In [34]:
test

Unnamed: 0,Percentile,N Rows,Mean Speed Mph,N Kept Routes,N Kept Trips,N Kept Operators,Percentage Kept Rows,Percentage Kept Trips,Percentage Kept Routes
0,Min. of 10% of seg length covered,2026035,11.8,4080,65939,74,90.0,100.0,100.0
0,Min. of 40% of seg length covered,1820711,12.3,4066,65709,74,81.0,99.0,99.0
0,Min. of 60% of seg length covered,1573312,12.2,4055,65424,74,70.0,99.0,99.0
0,Min. of 75% of seg length covered,1308198,11.8,4039,64949,74,58.0,98.0,99.0
0,Min. of 85% of seg length covered,1070347,11.5,4019,64120,74,48.0,97.0,98.0
0,Min. of 90% of seg length covered,914280,11.2,3987,63142,74,41.0,95.0,98.0
0,Min. of 95% of seg length covered,698923,10.8,3917,61293,74,31.0,93.0,96.0


In [35]:
# Main chart
def bar_chart(df, x_column: str, y_column:str, title:str):
    """
    Build an Altair bar chart of `x_column` against `y_column`.
    
    Bars are colored by `y_column` using the Cal-ITP bright category 
    colors (legend hidden), and every column of `df` is shown in the tooltip.
    The chart is passed through threshold_utils.chart_size with 400 and 300
    (presumably width and height -- confirm in _threshold_utils).
    """
    bar_colors = alt.Color(
        y_column, 
        scale = alt.Scale(range = cp.CALITP_CATEGORY_BRIGHT_COLORS),
        legend = None,
    )
    
    bars = alt.Chart(df).mark_bar().encode(
        x = x_column, 
        y = y_column, 
        color = bar_colors,
        tooltip = df.columns.tolist(),
    )
    
    titled = bars.properties(title = title)
    return threshold_utils.chart_size(titled, 400, 300)
    

In [36]:
bar_chart(test, 'Percentage Kept Rows','Percentile', 'Rows Kept After % Segment Cutoff')

## Only keep speeds that meet a certain threshold
* Put it in `speeds_with_segment_geom` in `B2`

In [37]:
def speeds_length_filter(speeds: pd.DataFrame,
                         segments: gpd.GeoDataFrame,
                         columns_to_keep: list,
                         percentile: float = 0.20) -> "tuple[pd.DataFrame, pd.DataFrame]":
    
    """
    Do an inner merge on speeds and segments, then keep only the 
    rows that cover enough of their segment's length.
    
    The cutoff is the value of the `percent` column (share of segment 
    length covered) at the given quantile; rows at or above it are kept.
    
    Args:
        speeds: segment-trip speeds.
        segments: segments with geometry, used to find segment length.
        columns_to_keep: columns retained in the filtered result.
        percentile: quantile of `percent` used as the minimum, 
            as a float like 0.20.
    
    Returns:
        (all merged rows, filtered rows restricted to `columns_to_keep`)
    """
    # Merge against the segments passed in, so the result does not 
    # depend on a notebook-level variable.
    df = merge_segments_speeds(speeds, segments)
    
    percent = df.percent.quantile(percentile).astype(float)
    
    df2 = df[df.percent >= percent]
    
    df2 = df2[columns_to_keep]
    
    return df, df2

In [38]:
# test1, test2 = speeds_length_filter(speeds, segments_gdf, columns_to_keep, 0.20)

In [39]:
# test1.shape

In [40]:
# test2.shape

In [41]:
# test2.columns

### Edited B2 function

In [42]:
columns_to_keep = ['shape_array_key', 'stop_sequence', 'gtfs_dataset_key',
      'gtfs_dataset_name', 'trip_id', 'min_time', 'min_dist', 'max_time',
       'max_dist', 'meters_elapsed', 'sec_elapsed', 'speed_mph',
      'trip_instance_key', 'schedule_gtfs_dataset_key',]

In [43]:
columns_to_keep2 = ['shape_array_key', 'stop_sequence', 'gtfs_dataset_key',
      'gtfs_dataset_name', 'schedule_gtfs_dataset_key',]

In [44]:
def add_back_missing_stops(filtered_speeds:pd.DataFrame, 
                           original_speeds:pd.DataFrame,
                           columns_to_merge:list) -> pd.DataFrame:
    """
    Find the rows of `original_speeds` whose keys no longer appear in 
    `filtered_speeds`.
    
    Prints the left-join match counts, then returns only the 
    `columns_to_merge` columns of the unmatched rows, with a fresh index.
    """
    joined = pd.merge(
        original_speeds, 
        filtered_speeds, 
        on = columns_to_merge, 
        how = "left", 
        indicator = True
    )
    print(joined["_merge"].value_counts())
    
    # Rows present only on the left were dropped by the filter
    is_missing = joined["_merge"] == 'left_only'
    
    return joined.loc[is_missing][columns_to_merge].reset_index(drop = True)


In [45]:
#test3 = add_back_missing_stops(test2, test1, columns_to_keep)

In [46]:
#test3.sample()

In [47]:
def speeds_with_segment_geom(
    analysis_date: str, 
    max_speed_cutoff: int = 70,
    dict_inputs: dict = {},
    percentile:float = 0.20,
) -> gpd.GeoDataFrame: 
    """
    Import the segment-trip table, filter it by segment-length coverage, 
    summarize speeds per segment, and attach segment geometry.
    
    Steps:
      1. Import segments and read speeds with speed_mph <= max_speed_cutoff, 
         sec_elapsed > 0 and meters_elapsed > 0; drop rows with no speed.
      2. Keep rows that meet the length-coverage cutoff (speeds_length_filter).
      3. Attach time-of-day buckets by trip_instance_key (inner merge).
      4. Summarize speeds with calculate_avg_speeds for all trips 
         ("all_day") and for AM Peak / PM Peak trips ("peak").
      5. Append the segment keys that had speeds before the filter 
         but have no summary afterwards, and fill their empty columns.
      6. Inner-merge the result onto the segments, reprojected to 
         geography_utils.WGS84.
    
    Args:
        analysis_date: date string appended to the file names.
        max_speed_cutoff: rows with a higher speed_mph are not read.
        dict_inputs: config dictionary; must contain "segments_file", 
            "segment_identifier_cols" and "stage4".
            NOTE(review): the default is a mutable {} and would 
            raise KeyError on the first lookup below.
        percentile: quantile passed to speeds_length_filter.
    
    Returns:
        GeoDataFrame of segments merged with the per-segment speed summary.
    """
    SEGMENT_FILE = dict_inputs["segments_file"]
    SEGMENT_IDENTIFIER_COLS = dict_inputs["segment_identifier_cols"]
    SPEEDS_FILE = dict_inputs["stage4"]
    
    # Import segment identifiers and geometry
    segments = helpers.import_segments(
        SEGMENT_GCS,
        f"{SEGMENT_FILE}_{analysis_date}",
        columns = SEGMENT_IDENTIFIER_COLS + [
            "schedule_gtfs_dataset_key", 
            "stop_id",
            "loop_or_inlining",
            "geometry", 
            "district", "district_name"
        ]
    )
    
    # Read in speeds
    df = pd.read_parquet(
        f"{SEGMENT_GCS}{SPEEDS_FILE}_{analysis_date}", 
        filters = [[("speed_mph", "<=", max_speed_cutoff), 
                    ("sec_elapsed", ">", 0), 
                    ("meters_elapsed", ">", 0)
                   ]]
    )
    
    df2 = df[df.speed_mph.notna() ].reset_index(drop=True)
    
    # Keep only rows that cover enough of their segment's length.
    # `percentile` is the quantile of the `percent` column
    # (share of segment length covered) used as the minimum.
    columns_keep_merge = ['shape_array_key', 'stop_sequence', 'schedule_gtfs_dataset_key',
      'gtfs_dataset_name', 'trip_id', 'min_time', 'min_dist', 'max_time',
       'max_dist', 'meters_elapsed', 'sec_elapsed', 'speed_mph',
      'trip_instance_key']
    all_speeds, df3 = speeds_length_filter(df2, segments, columns_keep_merge, percentile)
    
    time_of_day_df = sched_rt_utils.get_trip_time_buckets(analysis_date)

    # Inner merge: trips without a time-of-day bucket are dropped
    df4 = pd.merge(
        df3, 
        time_of_day_df, 
        on = "trip_instance_key", 
        how = "inner"
    )
    
    all_day = B2_avg_speeds_by_segment.calculate_avg_speeds(
        df4, 
        SEGMENT_IDENTIFIER_COLS
    )
    peak = B2_avg_speeds_by_segment.calculate_avg_speeds(
        df4[df4.time_of_day.isin(["AM Peak", "PM Peak"])], 
        SEGMENT_IDENTIFIER_COLS
    )
    
    stats = pd.concat([
        all_day.assign(time_of_day = "all_day"),
        peak.assign(time_of_day = "peak")
    ], axis=0)
    
    # Add back in rows that were filtered out for 
    # segment length
    # NOTE(review): the keys here are hard-coded while the final merge 
    # uses SEGMENT_IDENTIFIER_COLS -- confirm they are the same columns.
    missing = add_back_missing_stops(stats, all_speeds, ['shape_array_key', 'stop_sequence'])
    
    # Concat & fill in NA 
    stats2 = pd.concat([missing, stats])
    
    # NA columns: rows coming from `missing` only carry the two key columns,
    # so their float columns are filled with 0.0 and object columns with 'None'
    stats2 = stats2.fillna(stats2.dtypes.replace({'float64': 0.0, 'object': 'None'}))
    stats2 = stats2.drop_duplicates() 
                           
    # Merge in segment geometry with a changed CRS
    segments = segments.to_crs(geography_utils.WGS84)
    
    gdf = pd.merge(
        segments,
        stats2,
        on = SEGMENT_IDENTIFIER_COLS,
        how = "inner"
    )
    
    return gdf

### Check out speeds

In [48]:
# 2:12
avg_test = speeds_with_segment_geom(analysis_date, 
                                    70, 
                                    dictionary,
                                    0.1)

both          4075096
left_only        8818
right_only          0
Name: _merge, dtype: int64


In [49]:
avg_test.columns

Index(['shape_array_key', 'stop_sequence', 'schedule_gtfs_dataset_key',
       'stop_id', 'loop_or_inlining', 'geometry', 'district', 'district_name',
       'p50_mph', 'n_trips', 'p20_mph', 'p80_mph', 'time_of_day'],
      dtype='object')

In [50]:
type(avg_test)

geopandas.geodataframe.GeoDataFrame

In [51]:
STG5_FILE = dictionary['stage5']
og_avg = gpd.read_parquet(f"{SEGMENT_GCS}{STG5_FILE}_{analysis_date}.parquet")

In [52]:
STG5_FILE

'avg_speeds_stop_segments'

In [53]:
avg_test.columns

Index(['shape_array_key', 'stop_sequence', 'schedule_gtfs_dataset_key',
       'stop_id', 'loop_or_inlining', 'geometry', 'district', 'district_name',
       'p50_mph', 'n_trips', 'p20_mph', 'p80_mph', 'time_of_day'],
      dtype='object')

In [54]:
og_avg.columns

Index(['shape_array_key', 'stop_sequence', 'schedule_gtfs_dataset_key',
       'stop_id', 'loop_or_inlining', 'geometry', 'district', 'district_name',
       'p50_mph', 'n_trips', 'p20_mph', 'p80_mph', 'time_of_day'],
      dtype='object')

* There are 6000 more rows if I don't drop the duplicates.
* There are 6000 fewer rows if I do. 
* Look at one sequence/route to see what's happening.

In [55]:
og_avg.shape

(230532, 13)

In [56]:
avg_test.shape

(227178, 13)

In [57]:
merge1_preview_cols = ['meters_length', 
       'trip_id', 'min_time', 'min_dist', 'max_time', 'max_dist',
       'meters_elapsed', 'sec_elapsed', 'speed_mph', 
       'percent']

In [58]:
og_avg.loc[(og_avg.shape_array_key == "a350e6fbbc6447015de2c60b93c1cc2c") & (og_avg.stop_sequence == 75)]

Unnamed: 0,shape_array_key,stop_sequence,schedule_gtfs_dataset_key,stop_id,loop_or_inlining,geometry,district,district_name,p50_mph,n_trips,p20_mph,p80_mph,time_of_day
146517,a350e6fbbc6447015de2c60b93c1cc2c,75,7cc0cb1871dfd558f11a2885c145d144,17853,1,"LINESTRING (-122.49547 37.76455, -122.49548 37.76456, -122.49549 37.76464, -122.49442 37.76469, -122.49359 37.76473)",4,District 4 - Oakland,4.24,13,1.61,5.01,all_day
146518,a350e6fbbc6447015de2c60b93c1cc2c,75,7cc0cb1871dfd558f11a2885c145d144,17853,1,"LINESTRING (-122.49547 37.76455, -122.49548 37.76456, -122.49549 37.76464, -122.49442 37.76469, -122.49359 37.76473)",4,District 4 - Oakland,4.27,8,2.49,5.01,peak


In [59]:
# Original df 
merge1.loc[(merge1.shape_array_key == "a350e6fbbc6447015de2c60b93c1cc2c") & (merge1.stop_sequence == 75)].drop(columns = ['geometry_arrowized']).sample(3)

Unnamed: 0,shape_array_key,stop_id,stop_sequence,loop_or_inlining,schedule_gtfs_dataset_key,seg_idx,meters_length,gtfs_dataset_key,gtfs_dataset_name,trip_id,trip_instance_key,min_time,min_dist,max_time,max_dist,meters_elapsed,sec_elapsed,speed_mph,percent
1460544,a350e6fbbc6447015de2c60b93c1cc2c,17853,75,1,7cc0cb1871dfd558f11a2885c145d144,119094,177.03,c0e3039da063db95ebabd3fe4ee611a4,Bay Area 511 Muni VehiclePositions,11351974_M11,1bc76d545c95e14495e47e16c295df01,37421.0,57.19,37436.0,57.19,0.0,15.0,0.0,0.0
1460559,a350e6fbbc6447015de2c60b93c1cc2c,17853,75,1,7cc0cb1871dfd558f11a2885c145d144,119094,177.03,c0e3039da063db95ebabd3fe4ee611a4,Bay Area 511 Muni VehiclePositions,11351997_M11,a6bdf24eaa2d8b20b732af4efcd63fb9,49778.0,0.3,49793.0,0.3,0.0,15.0,0.0,0.0
1460550,a350e6fbbc6447015de2c60b93c1cc2c,17853,75,1,7cc0cb1871dfd558f11a2885c145d144,119094,177.03,c0e3039da063db95ebabd3fe4ee611a4,Bay Area 511 Muni VehiclePositions,11351982_M11,9c1e6f526af54f2dfb07adac6aab6ec5,41739.0,0.0,41754.0,0.0,0.0,15.0,0.0,0.0


In [60]:
test = avg_test.loc[(avg_test.shape_array_key == "a350e6fbbc6447015de2c60b93c1cc2c") & (avg_test.stop_sequence == 75)]

In [61]:
test.shape

(2, 13)

In [62]:
test.drop_duplicates()

Unnamed: 0,shape_array_key,stop_sequence,schedule_gtfs_dataset_key,stop_id,loop_or_inlining,geometry,district,district_name,p50_mph,n_trips,p20_mph,p80_mph,time_of_day
144320,a350e6fbbc6447015de2c60b93c1cc2c,75,7cc0cb1871dfd558f11a2885c145d144,17853,1,"LINESTRING (-122.49547 37.76455, -122.49548 37.76456, -122.49549 37.76464, -122.49442 37.76469, -122.49359 37.76473)",4,District 4 - Oakland,4.27,10.0,3.68,6.41,all_day
144321,a350e6fbbc6447015de2c60b93c1cc2c,75,7cc0cb1871dfd558f11a2885c145d144,17853,1,"LINESTRING (-122.49547 37.76455, -122.49548 37.76456, -122.49549 37.76464, -122.49442 37.76469, -122.49359 37.76473)",4,District 4 - Oakland,4.61,6.0,4.24,5.08,peak


## Sample segments and routes for Big Blue Bus

In [63]:
test_operator = "Big Blue Bus VehiclePositions"
test_org = "City of Santa Monica"
test_key = "6c2d7daaf979779fa2089c6395baf98b"

In [64]:
pub_df = pd.read_parquet(
    f"{SEGMENT_GCS}export/avg_speeds_stop_segments_{analysis_date}_tabular.parquet", 
    filters = [[("agency", "==", test_org)]]
)

In [65]:
# Dark orange
shape_id1  = "26375"
stop_seq1 = 7
shape_array1 = pub_df[pub_df.shape_id==shape_id1].shape_array_key.iloc[0]

In [66]:
# Light yellow 
shape_id2 = "26342"
stop_seq2 = 23
shape_array2 = pub_df[pub_df.shape_id==shape_id2].shape_array_key.iloc[0]

In [67]:
# Dark Red
shape_id3 = "26393"
stop_seq3 = 32
shape_array3 = pub_df[pub_df.shape_id==shape_id3].shape_array_key.iloc[0]

In [68]:
# Light orange
shape_id4 = "26372"
stop_seq4 = 14
shape_array4 = pub_df[pub_df.shape_id==shape_id4].shape_array_key.iloc[0]

In [69]:
# Green
shape_id5 = "26400"
stop_seq5= 8
shape_array5 = pub_df[pub_df.shape_id==shape_id5].shape_array_key.iloc[0]

In [70]:
def one_route_map(avg_speeds:gpd.GeoDataFrame, shape_array_key:str):
    """
    Display an interactive map of one shape's segments, 
    colored by p50_mph.
    """
    one_shape = avg_speeds[avg_speeds.shape_array_key==shape_array_key]
    
    route_map = one_shape.explore(
        "p50_mph", 
        tiles = "CartoDB Positron",
        cmap = rt_utils.ZERO_THIRTY_COLORSCALE,
        style_kwds = {'weight':5},
    )
    display(route_map)
    

## Checks

In [71]:
def compare_average(avg_test:pd.DataFrame,
                    og_avg:pd.DataFrame, 
                    shape_array_key:str,
                    stop_sequence:int):
    """
    Show the original and the new average speeds for one segment 
    side by side, followed by a small map of the new rows.
    
    Args:
        avg_test: new averages (needs a geometry column and .explore()).
        og_avg: original averages.
        shape_array_key, stop_sequence: identify the segment to compare.
    """
    def _select_segment(averages):
        # Rows for the requested segment, without the key columns
        is_match = (
            (averages.shape_array_key == shape_array_key) 
            & (averages.stop_sequence == stop_sequence)
        )
        return averages[is_match].drop(columns = ['shape_array_key','stop_sequence'])
    
    print(f"route {shape_array_key}, sequence {stop_sequence}")
    
    print('Original')
    display(_select_segment(og_avg).drop(columns = ['geometry']))
    
    print('New')
    new_rows = _select_segment(avg_test)
    display(new_rows.drop(columns = ['geometry']))
    display(new_rows.explore(width = 300, height = 300))

In [72]:
og_avg.p50_mph.describe()

count   230532.00
mean        11.99
std          8.74
min          0.00
25%          5.99
50%          9.99
75%         15.80
max         69.96
Name: p50_mph, dtype: float64

In [73]:
avg_test.p50_mph.describe()

count   227178.00
mean        12.66
std          8.86
min          0.00
25%          6.57
50%         10.73
75%         16.65
max         69.96
Name: p50_mph, dtype: float64

In [74]:
# compare_average(avg_test, og_avg, shape_array1, stop_seq1)

In [75]:
# compare_average(avg_test, og_avg, shape_array2, stop_seq2)

* Strangely enough, the 50th percentile speed became lower
* Sometimes the lower the length, the higher the speed?

In [76]:
# compare_average(avg_test, og_avg, shape_array3, stop_seq3)

In [77]:
# merge1.loc[(merge1.shape_array_key == shape_array3) & (merge1.stop_sequence == stop_seq3)][merge1_preview_cols].sort_values(['percent'])

* This is missing because the only row that was non-zero only covered 27% of the length
* Maybe up the threshold?

In [78]:

# compare_average(avg_test, og_avg, shape_array4, stop_seq4)

In [79]:
# merge1.loc[(merge1.shape_array_key == shape_array4) & (merge1.stop_sequence == stop_seq4)][merge1_preview_cols]

In [80]:
# compare_average(avg_test, og_avg, shape_array5, stop_seq5)

## Comparison
* i still want a left join for segments, but only for segments that have RT trips ever (that end up going missing, after the 45% rule)

* i don't want a left join to show all segments ever, bc segments are cut from scheduled shapes. more operators have schedule data than RT. so i don't want a mess of segments at the end of the left join off schedule, but a left join for "RT shapes"

In [81]:
# Results from original script
# Number of distinct stop sequences per shape and schedule feed,
# with every column prefixed by 'og_'
rt_segs_agg = (og_avg
            .groupby(['shape_array_key','schedule_gtfs_dataset_key'])
            .agg(total_stops = ('stop_sequence', 'nunique'))
            .reset_index()
            .add_prefix('og_')
           )

In [82]:
rt_segs_agg.sample()

Unnamed: 0,og_shape_array_key,og_schedule_gtfs_dataset_key,og_total_stops
850,35c43764e78df2a990a0e6c89b1736d5,587e730fac4db21d54037e0f12b0dd5d,29


In [83]:
# My test after filtering but adding back in cut rows
# Number of distinct stop sequences per shape and schedule feed,
# with every column prefixed by 'testing_'
avg_test_agg = (avg_test
            .groupby(['shape_array_key','schedule_gtfs_dataset_key'])
            .agg(total_stops = ('stop_sequence', 'nunique'))
            .reset_index()
            .add_prefix('testing_')
           )

In [84]:
avg_test_agg.sample()

Unnamed: 0,testing_shape_array_key,testing_schedule_gtfs_dataset_key,testing_total_stops
1327,5339b0cc395600141acccb36b5887e0e,3f3f36b4c41cc6b5df3eb7f5d8ea6e3c,69


In [85]:
avg_test_agg.shape, rt_segs_agg.shape

((4085, 3), (4085, 3))

In [86]:
m1 = (pd
      .merge(rt_segs_agg, avg_test_agg, 
             left_on = ['og_shape_array_key', 'og_schedule_gtfs_dataset_key'],
             right_on = ['testing_shape_array_key', 'testing_schedule_gtfs_dataset_key'], 
             how = 'outer', indicator = True)
     )

In [87]:
m1['total_thrown_out_stops'] = m1.og_total_stops - m1.testing_total_stops

In [88]:
m1.sample(3)

Unnamed: 0,og_shape_array_key,og_schedule_gtfs_dataset_key,og_total_stops,testing_shape_array_key,testing_schedule_gtfs_dataset_key,testing_total_stops,_merge,total_thrown_out_stops
3183,c964f5984d4b4c618fd95f49ea21cbb3,7cc0cb1871dfd558f11a2885c145d144,39,c964f5984d4b4c618fd95f49ea21cbb3,7cc0cb1871dfd558f11a2885c145d144,39,both,0
1933,78a645e72f5571d189d5d8fc3008aae7,13cc00cd32512520df2bf2ed36cb08a6,2,78a645e72f5571d189d5d8fc3008aae7,13cc00cd32512520df2bf2ed36cb08a6,2,both,0
607,274042f8582a0378749cf9302fa513c9,dbbe8ee4864a2715a40749605395d584,44,274042f8582a0378749cf9302fa513c9,dbbe8ee4864a2715a40749605395d584,44,both,0


In [89]:
m1._merge.value_counts()

both          4085
left_only        0
right_only       0
Name: _merge, dtype: int64

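* 28 routes are missing. (Note: this does not match the output above, where all 4,085 routes are in `both` and `left_only`/`right_only` are 0 — the comment is probably left over from an earlier run.)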

In [90]:
m1.total_thrown_out_stops.describe()

count   4085.00
mean       0.00
std        0.00
min        0.00
25%        0.00
50%        0.00
75%        0.00
max        0.00
Name: total_thrown_out_stops, dtype: float64

#### Seeing all the sequences thrown out

In [91]:
og_avg.columns

Index(['shape_array_key', 'stop_sequence', 'schedule_gtfs_dataset_key',
       'stop_id', 'loop_or_inlining', 'geometry', 'district', 'district_name',
       'p50_mph', 'n_trips', 'p20_mph', 'p80_mph', 'time_of_day'],
      dtype='object')

In [92]:
merge_cols = ['shape_array_key', 'stop_sequence', 'schedule_gtfs_dataset_key',
       'stop_id', 'loop_or_inlining']

In [93]:
subset = ['shape_array_key', 'stop_sequence', 'schedule_gtfs_dataset_key',
       'stop_id', 'loop_or_inlining', 'p50_mph', 'n_trips',
       'p20_mph', 'p80_mph', 'time_of_day']

In [94]:
og_avg2 = og_avg[subset]

In [95]:
og_avg2.shape

(230532, 10)

In [96]:
avg_test2 = avg_test[subset]

In [97]:
avg_test2.shape

(227178, 10)

In [98]:
m2 = pd.merge(og_avg2, avg_test2, on = merge_cols, how = 'outer', indicator = True)

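* Why are some showing up in others... `merge_cols` does not include `time_of_day`, so each segment's `all_day` and `peak` rows from one table match both rows from the other table. Every segment therefore appears 2 x 2 = 4 times (see `one_subset` below). Adding `time_of_day` to the merge keys would give a one-to-one comparison.
* Left (`_x` columns): original averages (`og_avg2`); right (`_y` columns): filtered averages (`avg_test2`).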

In [99]:
m2._merge.value_counts()

both          428098
left_only          0
right_only         0
Name: _merge, dtype: int64

In [100]:
one_subset = m2.loc[(m2.shape_array_key == "000b8c60f7767e8214f6ef6638d2cb83") & (m2.stop_id == "2197")]

In [101]:
one_subset.shape

(4, 16)

In [102]:
one_subset.drop_duplicates().shape

(4, 16)

In [103]:
one_subset.drop_duplicates()

Unnamed: 0,shape_array_key,stop_sequence,schedule_gtfs_dataset_key,stop_id,loop_or_inlining,p50_mph_x,n_trips_x,p20_mph_x,p80_mph_x,time_of_day_x,p50_mph_y,n_trips_y,p20_mph_y,p80_mph_y,time_of_day_y,_merge
40,000b8c60f7767e8214f6ef6638d2cb83,12,78b44303c1714f6c6a4801637c2a5c9d,2197,0,0.63,7,0.26,2.31,all_day,2.65,2.0,2.62,2.68,all_day,both
41,000b8c60f7767e8214f6ef6638d2cb83,12,78b44303c1714f6c6a4801637c2a5c9d,2197,0,0.63,7,0.26,2.31,all_day,2.65,2.0,2.62,2.68,peak,both
42,000b8c60f7767e8214f6ef6638d2cb83,12,78b44303c1714f6c6a4801637c2a5c9d,2197,0,0.77,6,0.23,2.59,peak,2.65,2.0,2.62,2.68,all_day,both
43,000b8c60f7767e8214f6ef6638d2cb83,12,78b44303c1714f6c6a4801637c2a5c9d,2197,0,0.77,6,0.23,2.59,peak,2.65,2.0,2.62,2.68,peak,both
