In [1]:
# USE_PYGEOS has to be set BEFORE geopandas is imported for the first time:
# geopandas picks its geometry backend at import, so setting the variable
# afterwards has no effect (the PyGEOS warning this cell used to emit on
# `import geopandas as gpd` showed the setting was being applied too late).
import os
os.environ['USE_PYGEOS'] = '0'

import datetime
import dask.dataframe as dd
import numpy as np
import geopandas
import geopandas as gpd
import pandas as pd
import altair as alt
from segment_speed_utils import gtfs_schedule_wrangling, helpers, segment_calcs,sched_rt_utils
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    PROJECT_CRS,
    SEGMENT_GCS,  
    analysis_date,
)
from scripts import (A1_sjoin_vp_segments, A2_valid_vehicle_positions,B2_avg_speeds_by_segment)
from shared_utils import calitp_color_palette as cp, rt_utils, geography_utils
import _threshold_utils as threshold_utils
import _rt_scheduled_utils as rt_scheduled_utils

# Pipeline configuration for the stop-segments workflow.
CONFIG_PATH = './scripts/config.yml'
STOP_SEG_DICT = helpers.get_parameters(CONFIG_PATH, "stop_segments")

In the next release, GeoPandas will switch to using Shapely by default, even if PyGEOS is installed. If you only have PyGEOS installed to get speed-ups, this switch should be smooth. However, if you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [2]:
# Notebook-wide pandas display settings: show up to 100 columns, format floats
# with two decimals, and never truncate rows or cell contents.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

## % of Meters
* start with the speeds_stop_segments_{analysis_date} parquet (which is produced in B1_speeds_by_segment_trip ).
* load stop_segments_{analysis_date} (already in CRS 3310) to get each segment's length.
* merge with the speeds by segment-trip, which contains the meters_elapsed column
* calculate pct = meters_elapsed / segment_length
* show me some charts around this, a couple of descriptives to make sure these are all ok
* give me rough descriptives of how many rows we're dropping if we go with keeping at least 30%, 40%, 50%

### Open these files just for testing

In [3]:
def import_speeds_segs(analysis_date:str, 
                       max_speed_cutoff: int, 
                       dict_inputs:dict)-> pd.DataFrame:
    """
    Read the segment-trip speeds table for one analysis date.

    Args:
        analysis_date: date string appended to the file name.
        max_speed_cutoff: only rows with speed_mph <= this value are read.
        dict_inputs: config dict; its 'stage4' entry is the speeds file stem.

    Returns:
        DataFrame of the rows that pass the speed filter.
    """
    # The path is built without a file extension, exactly as before.
    speeds_path = f"{SEGMENT_GCS}{dict_inputs['stage4']}_{analysis_date}"
    speed_filter = [[("speed_mph", "<=", max_speed_cutoff)]]

    return pd.read_parquet(speeds_path, filters = speed_filter)

In [4]:
def load_segments(analysis_date:str, dict_inputs:dict) -> gpd.GeoDataFrame:
    """
    Read the full segments GeoDataFrame for one analysis date.

    The file stem comes from dict_inputs['segments_file']. No filtering or
    length calculation happens here.
    """
    segments_path = f"{SEGMENT_GCS}{dict_inputs['segments_file']}_{analysis_date}.parquet"

    return gpd.read_parquet(segments_path)

### Open Files
* Add `import_speeds_segs`,`calculate_segment_length`,`merge_segments_speeds` into `B2`

In [5]:
# dict_inputs = helpers.get_parameters(CONFIG_PATH, "stop_segments")

In [6]:
# dict_inputs

In [7]:
dictionary = helpers.get_parameters(CONFIG_PATH, "stop_segments")

In [8]:
# Segment-trip speeds, already capped at 70 mph by the parquet filter.
speeds = import_speeds_segs(
    analysis_date,
    max_speed_cutoff=70,
    dict_inputs=dictionary,
)

In [9]:
segments_gdf = load_segments(analysis_date, dictionary)

In [10]:
def calculate_segment_length(segments:gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """
    Add each segment's geometric length as `meters_length`, then drop the
    geometry and district columns so only tabular attributes remain.

    The length is whatever `geometry.length` returns in the frame's CRS.
    The column name assumes a metric CRS; the notebook notes say the segments
    are in CRS 3310 (TODO confirm for other inputs).
    """
    non_tabular_cols = ['geometry','district','district_name']
    with_length = segments.assign(meters_length = segments.geometry.length)

    return with_length.drop(columns = non_tabular_cols)

In [11]:
def merge_segments_speeds(speeds: pd.DataFrame,
                          segments:gpd.GeoDataFrame, 
                          ) -> pd.DataFrame:
    """
    Attach segment lengths to the segment-trip speeds and compute `percent`,
    the share of the segment length covered by each row
    (meters_elapsed / meters_length * 100). Missing values become 0.

    Only rows whose shape_array_key, stop_sequence and
    schedule_gtfs_dataset_key appear in both inputs are kept (inner join).
    """
    join_keys = ['shape_array_key','stop_sequence','schedule_gtfs_dataset_key']
    segment_lengths = calculate_segment_length(segments)

    merged = pd.merge(segment_lengths, speeds, on = join_keys, how = "inner")

    pct_covered = merged.meters_elapsed / merged.meters_length * 100
    merged['percent'] = pct_covered.fillna(0)

    return merged

In [12]:
merge1 = merge_segments_speeds(speeds, segments_gdf)

In [13]:
def myround(x, base=5):
    """
    Round `x` to the nearest multiple of `base` (default 5).

    Uses Python's built-in round(), so exact halves go to the even multiple
    count (e.g. 7.5 -> 10, 2.5 -> 0). This is nearest-rounding, not rounding up.
    """
    n_multiples = round(x / base)
    return base * n_multiples

In [14]:
def valid_trips_by_cutoff(df, percentages:list):
    """
    Summarise how much data survives different segment-coverage cutoffs.

    For every quantile in `percentages`, the matching quantile of `df.percent`
    is rounded to the nearest multiple of 5 and used as a minimum-coverage
    cutoff. Rows with `percent` at or above the cutoff are summarised into
    one output row.

    Args:
        df: frame with at least the columns percent, trip_id, shape_array_key,
            gtfs_dataset_name, gtfs_dataset_key and speed_mph.
        percentages: quantiles (0-1) of `percent` to evaluate.

    Returns:
        One row per quantile with the cutoff label (`percentile`), the kept
        row count (`n_rows`, a count of non-null gtfs_dataset_name), the mean
        speed, the distinct routes / trips / operators kept, and the kept
        percentages of rows, trips and routes relative to `df`, rounded to
        whole numbers.

    Raises:
        ValueError: if `percentages` is empty.
    """
    if len(percentages) == 0:
        raise ValueError("percentages must contain at least one quantile")

    og_len = len(df)
    og_trips = df.trip_id.nunique()
    og_shape_array_key = df.shape_array_key.nunique()

    # Collect the per-cutoff summaries and concatenate once at the end,
    # instead of growing a DataFrame inside the loop.
    summaries = []
    for i in percentages:
        # Round the quantile to the NEAREST multiple of 5 (43 -> 45, 42 -> 40).
        percent = myround(df.percent.quantile(i).astype(float))
        temp = (df[(df.percent >= percent)])

        temp = temp.assign(
            percentile = f"Min. of {percent}% of seg length covered")

        temp = (temp
                .groupby(['percentile'])
                .agg({'gtfs_dataset_name':'count',
                      'speed_mph':'mean',
                      'shape_array_key':'nunique',
                      'trip_id':'nunique',
                      'gtfs_dataset_key':'nunique'})
                .reset_index()
                .rename(columns = {'gtfs_dataset_name':'n_rows',
                                   'speed_mph':'mean_speed_mph',
                                   'shape_array_key':'n_kept_routes',
                                   'trip_id':'n_kept_trips',
                                   'gtfs_dataset_key':'n_kept_operators'})
               )

        summaries.append(temp)

    final = pd.concat(summaries, axis=0)

    final = final.assign(
        percentage_kept_rows=final.n_rows.divide(og_len) * 100,
        percentage_kept_trips = final.n_kept_trips.divide(og_trips) * 100,
        percentage_kept_routes = final.n_kept_routes.divide(og_shape_array_key)*100)

    round_cols = ['percentage_kept_rows', 'percentage_kept_trips', 'percentage_kept_routes']
    final[round_cols] = final[round_cols].round(0)
    return final

In [15]:
# Evaluate retention at the 10th through 70th percentiles of segment coverage.
cutoff_quantiles = [.1,.2,.3,.4,.5,.6,.7]
test = valid_trips_by_cutoff(merge1, cutoff_quantiles)

In [16]:
#test

## Visualizing Speed
* https://nbviewer.org/github/cal-itp/data-analyses/blob/filter-speeds-avgs/rt_segment_speeds/18_speed_distribution.ipynb
* https://analysis.calitp.org/rt/district_07-los-angeles/9__speedmaps__district_07-los-angeles__itp_id_300.html

### % of rows kept

In [17]:
test = threshold_utils.pre_clean(test)

In [18]:
test

Unnamed: 0,Percentile,N Rows,Mean Speed Mph,N Kept Routes,N Kept Trips,N Kept Operators,Percentage Kept Rows,Percentage Kept Trips,Percentage Kept Routes
0,Min. of 10% of seg length covered,2026035,11.8,4080,65939,74,90.0,100.0,100.0
0,Min. of 40% of seg length covered,1820711,12.3,4066,65709,74,81.0,99.0,99.0
0,Min. of 60% of seg length covered,1573312,12.2,4055,65424,74,70.0,99.0,99.0
0,Min. of 75% of seg length covered,1308198,11.8,4039,64949,74,58.0,98.0,99.0
0,Min. of 85% of seg length covered,1070347,11.5,4019,64120,74,48.0,97.0,98.0
0,Min. of 90% of seg length covered,914280,11.2,3987,63142,74,41.0,95.0,98.0
0,Min. of 95% of seg length covered,698923,10.8,3917,61293,74,31.0,93.0,96.0


In [19]:
# Main chart
def bar_chart(df, x_column: str, y_column:str, title:str):
    """
    Build an Altair bar chart of `x_column` against `y_column`.

    Bars are coloured by `y_column` using the Cal-ITP bright category palette
    (legend hidden), every column of `df` is shown in the tooltip, and the
    chart is passed through threshold_utils.chart_size with 400 and 300
    (presumably width and height; confirm against that helper).
    """
    bar_colors = alt.Color(
        y_column,
        scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
        legend=None,
    )
    bars = alt.Chart(df).mark_bar().encode(
        x=x_column,
        y=y_column,
        color=bar_colors,
        tooltip=df.columns.tolist(),
    )
    titled = bars.properties(title=title)

    return threshold_utils.chart_size(titled, 400, 300)
    

In [20]:
bar_chart(test, 'Percentage Kept Rows','Percentile', 'Rows Kept After % Segment Cutoff')

## Only keep speeds that meet a certain threshold
* Put it in `speeds_with_segment_geom` in `B2`

In [21]:
def speeds_length_filter(speeds: pd.DataFrame,
                         segments: gpd.GeoDataFrame,
                         percentile: float = 0.20) -> pd.DataFrame:
    """
    Keep only the segment-trip rows that cover enough of their segment.

    Speeds are merged with segment lengths (see merge_segments_speeds), the
    `percentile` quantile of the coverage `percent` is taken as the cutoff,
    and rows with `percent` at or above that cutoff are kept.

    Args:
        speeds: segment-trip speeds.
        segments: segments with geometry; used to derive segment length.
        percentile: quantile (0-1) of coverage used as the minimum cutoff.

    Returns:
        The surviving rows, restricted to the original speeds columns.
    """
    # Use the `segments` argument. The previous version read the notebook
    # global `segments_gdf`, silently ignoring whatever the caller passed in.
    df = merge_segments_speeds(speeds, segments)

    percent = df.percent.quantile(percentile).astype(float)

    df = df[df.percent >= percent]

    columns_to_keep = ['shape_array_key', 'stop_sequence', 'gtfs_dataset_key',
       'gtfs_dataset_name', 'trip_id', 'min_time', 'min_dist', 'max_time',
       'max_dist', 'meters_elapsed', 'sec_elapsed', 'speed_mph',
       'trip_instance_key', 'schedule_gtfs_dataset_key',]

    return df[columns_to_keep]

### Edited B2 function

In [22]:
def speeds_with_segment_geom(
    analysis_date: str, 
    max_speed_cutoff: int = 70,
    dict_inputs: dict = {},
    percentile:float = 0.20,
) -> gpd.GeoDataFrame: 
    """
    Build per-segment speed statistics with segment geometry attached.

    Steps, in order:
    1. Import segments (identifier columns, stop / district attributes, geometry).
    2. Read the segment-trip speeds, keeping rows with
       speed_mph <= max_speed_cutoff, sec_elapsed > 0 and meters_elapsed > 0,
       then drop rows with a null speed.
    3. Pass the speeds through speeds_length_filter with `percentile`.
    4. Inner-join the trip time-of-day buckets on trip_instance_key.
    5. Compute speed statistics over all trips ("all_day") and over
       AM Peak / PM Peak trips only ("peak").
    6. Reproject the segments to WGS84 and inner-join the statistics onto them.

    Args:
        analysis_date: date string used in the input file names.
        max_speed_cutoff: upper bound on speed_mph applied when reading speeds.
        dict_inputs: config dict; must contain "segments_file",
            "segment_identifier_cols" and "stage4". The `{}` default is only a
            placeholder, and calling without a real config raises KeyError.
        percentile: quantile forwarded to speeds_length_filter.

    Returns:
        GeoDataFrame with one row per segment and time_of_day value.
    """
    SEGMENT_FILE = dict_inputs["segments_file"]
    SEGMENT_IDENTIFIER_COLS = dict_inputs["segment_identifier_cols"]
    SPEEDS_FILE = dict_inputs["stage4"]
    
    # Import segments, including geometry (needed for the length filter and
    # for the final output)
    segments = helpers.import_segments(
        SEGMENT_GCS,
        f"{SEGMENT_FILE}_{analysis_date}",
        columns = SEGMENT_IDENTIFIER_COLS + [
            "schedule_gtfs_dataset_key", 
            "stop_id",
            "loop_or_inlining",
            "geometry", 
            "district", "district_name"
        ]
    )
    
    # Read in speeds; the speed cap and the positive time / distance
    # requirements are applied as parquet filters
    df = pd.read_parquet(
        f"{SEGMENT_GCS}{SPEEDS_FILE}_{analysis_date}", 
        filters = [[("speed_mph", "<=", max_speed_cutoff), 
                    ("sec_elapsed", ">", 0), 
                    ("meters_elapsed", ">", 0)
                   ]]
    )
    
    df = df[df.speed_mph.notna() ].reset_index(drop=True)
    
    # Drop rows that cover too little of their segment. This is a coverage
    # filter, not a speed filter: `percentile` is the quantile of the
    # percent-of-segment-covered column below which rows are discarded
    # (default: the bottom 20% by coverage).
    df = speeds_length_filter(df, segments, percentile)
    
    time_of_day_df = sched_rt_utils.get_trip_time_buckets(analysis_date)

    df2 = pd.merge(
        df, 
        time_of_day_df, 
        on = "trip_instance_key", 
        how = "inner"
    )
    
    all_day = B2_avg_speeds_by_segment.calculate_avg_speeds(
        df2, 
        SEGMENT_IDENTIFIER_COLS
    )
    
    # Peak statistics use only trips bucketed as AM Peak or PM Peak
    peak = B2_avg_speeds_by_segment.calculate_avg_speeds(
        df2[df2.time_of_day.isin(["AM Peak", "PM Peak"])], 
        SEGMENT_IDENTIFIER_COLS
    )
    
    stats = pd.concat([
        all_day.assign(time_of_day = "all_day"),
        peak.assign(time_of_day = "peak")
    ], axis=0)
    
    # Merge in segment geometry with a changed CRS
    segments = segments.to_crs(geography_utils.WGS84)
    
    # Inner join: segments with no surviving speed rows are not in the output
    gdf = pd.merge(
        segments,
        stats,
        on = SEGMENT_IDENTIFIER_COLS,
        how = "inner"
    )
    
    return gdf

### Check out speeds

In [23]:
# Takes around 7 minutes
avg_test = speeds_with_segment_geom(
    analysis_date,
    max_speed_cutoff=70,
    dict_inputs=dictionary,
    percentile=0.2,
)

In [25]:
# Stage-5 averages already on disk, used as the "original" baseline.
# The district columns are dropped after loading.
STG5_FILE = dictionary['stage5']
og_avg_path = f"{SEGMENT_GCS}{STG5_FILE}_{analysis_date}.parquet"
og_avg = gpd.read_parquet(og_avg_path).drop(columns=["district", "district_name"])

In [26]:
merge1_preview_cols = ['meters_length', 
       'trip_id', 'min_time', 'min_dist', 'max_time', 'max_dist',
       'meters_elapsed', 'sec_elapsed', 'speed_mph', 
       'percent']

## Sample segments and routes for Big Blue Bus

In [28]:
test_operator = "Big Blue Bus VehiclePositions"
test_org = "City of Santa Monica"
test_key = "6c2d7daaf979779fa2089c6395baf98b"

In [29]:
# Exported tabular averages, restricted to the test organization's rows.
pub_df_path = f"{SEGMENT_GCS}export/avg_speeds_stop_segments_{analysis_date}_tabular.parquet"
agency_filter = [[("agency", "==", test_org)]]
pub_df = pd.read_parquet(pub_df_path, filters = agency_filter)

In [30]:
# Dark orange
shape_id1  = "26375"
stop_seq1 = 7
shape_array1 = pub_df[pub_df.shape_id==shape_id1].shape_array_key.iloc[0]

In [31]:
# Light yellow 
shape_id2 = "26342"
stop_seq2 = 23
shape_array2 = pub_df[pub_df.shape_id==shape_id2].shape_array_key.iloc[0]

In [32]:
# Dark Red
shape_id3 = "26393"
stop_seq3 = 32
shape_array3 = pub_df[pub_df.shape_id==shape_id3].shape_array_key.iloc[0]

In [33]:
# Light orange
shape_id4 = "26372"
stop_seq4 = 14
shape_array4 = pub_df[pub_df.shape_id==shape_id4].shape_array_key.iloc[0]

In [34]:
# Green
shape_id5 = "26400"
stop_seq5= 8
shape_array5 = pub_df[pub_df.shape_id==shape_id5].shape_array_key.iloc[0]

In [35]:
def one_route_map(avg_speeds:gpd.GeoDataFrame, shape_array_key:str):
    """
    Display an interactive map of one shape's segments, coloured by the
    `p50_mph` column with the zero-to-thirty colour scale.
    """
    one_shape = avg_speeds[avg_speeds.shape_array_key==shape_array_key]
    speed_map = one_shape.explore(
        "p50_mph",
        tiles = "CartoDB Positron",
        cmap = rt_utils.ZERO_THIRTY_COLORSCALE,
        style_kwds = {'weight':5},
    )
    display(speed_map)
    

## Checks

In [36]:
def compare_average(avg_test:pd.DataFrame,
                    og_avg:pd.DataFrame, 
                    shape_array_key:str,
                    stop_sequence:int):
    """
    Print and display the rows for one segment (shape_array_key and
    stop_sequence) from the original averages, then from the new averages.
    The key and geometry columns are hidden so the two are easy to compare.
    """
    print(f"route {shape_array_key}, sequence {stop_sequence}")
    drop_cols = ['shape_array_key','geometry','stop_sequence']

    # Same output order as before: the original table first, then the new one.
    for label, averages in (('Original', og_avg), ('New', avg_test)):
        print(label)
        is_segment = ((averages.shape_array_key == shape_array_key)
                      & (averages.stop_sequence == stop_sequence))
        display(averages[is_segment].drop(columns = drop_cols))

In [37]:
og_avg.p50_mph.describe()

count   230532.00
mean        11.99
std          8.74
min          0.00
25%          5.99
50%          9.99
75%         15.80
max         69.96
Name: p50_mph, dtype: float64

In [38]:
avg_test.p50_mph.describe()

count   215863.00
mean        13.06
std          8.73
min          0.00
25%          6.98
50%         11.05
75%         16.95
max         69.96
Name: p50_mph, dtype: float64

In [39]:
compare_average(avg_test, og_avg, shape_array1, stop_seq1)

route 5a788bd9c9aa5c5465875689a626baa9, sequence 7
Original


Unnamed: 0,schedule_gtfs_dataset_key,stop_id,loop_or_inlining,p50_mph,n_trips,p20_mph,p80_mph,time_of_day
82804,dbbe8ee4864a2715a40749605395d584,894,0,8.94,1,8.94,8.94,all_day
82805,dbbe8ee4864a2715a40749605395d584,894,0,8.94,1,8.94,8.94,peak


New


Unnamed: 0,schedule_gtfs_dataset_key,stop_id,loop_or_inlining,district,district_name,p50_mph,n_trips,p20_mph,p80_mph,time_of_day
77681,dbbe8ee4864a2715a40749605395d584,894,0,7,District 7 - Los Angeles,8.94,1,8.94,8.94,all_day
77682,dbbe8ee4864a2715a40749605395d584,894,0,7,District 7 - Los Angeles,8.94,1,8.94,8.94,peak


In [40]:
compare_average(avg_test, og_avg, shape_array2, stop_seq2)

route 5d34851ee46adb62216152f8a16fe7d0, sequence 23
Original


Unnamed: 0,schedule_gtfs_dataset_key,stop_id,loop_or_inlining,p50_mph,n_trips,p20_mph,p80_mph,time_of_day
84959,dbbe8ee4864a2715a40749605395d584,149,0,13.72,19,8.43,15.66,all_day
84960,dbbe8ee4864a2715a40749605395d584,149,0,13.17,8,5.74,14.9,peak


New


Unnamed: 0,schedule_gtfs_dataset_key,stop_id,loop_or_inlining,district,district_name,p50_mph,n_trips,p20_mph,p80_mph,time_of_day
79691,dbbe8ee4864a2715a40749605395d584,149,0,7,District 7 - Los Angeles,13.86,16,9.26,15.9,all_day
79692,dbbe8ee4864a2715a40749605395d584,149,0,7,District 7 - Los Angeles,13.85,6,12.62,15.51,peak


* Strangely enough, the 50th percentile speed became lower
* Sometimes the lower the length, the higher the speed?

In [41]:
compare_average(avg_test, og_avg, shape_array3, stop_seq3)

route 94e02a46331c8b449aedb4469f49764a, sequence 32
Original


Unnamed: 0,schedule_gtfs_dataset_key,stop_id,loop_or_inlining,p50_mph,n_trips,p20_mph,p80_mph,time_of_day
132715,dbbe8ee4864a2715a40749605395d584,1648,0,1.9,23,0.91,9.85,all_day
132716,dbbe8ee4864a2715a40749605395d584,1648,0,1.58,12,0.62,6.55,peak


New


Unnamed: 0,schedule_gtfs_dataset_key,stop_id,loop_or_inlining,district,district_name,p50_mph,n_trips,p20_mph,p80_mph,time_of_day
124137,dbbe8ee4864a2715a40749605395d584,1648,0,7,District 7 - Los Angeles,1.27,10,1.08,3.63,all_day
124138,dbbe8ee4864a2715a40749605395d584,1648,0,7,District 7 - Los Angeles,1.27,5,1.2,3.89,peak


In [42]:
merge1.loc[(merge1.shape_array_key == shape_array3) & (merge1.stop_sequence == stop_seq3)][merge1_preview_cols].sort_values(['percent'])

Unnamed: 0,meters_length,trip_id,min_time,min_dist,max_time,max_dist,meters_elapsed,sec_elapsed,speed_mph,percent
1320710,583.74,905067,68367.0,430.11,68412.0,468.28,38.17,45.0,1.9,6.54
1320723,583.74,905083,41431.0,249.29,41473.0,434.91,185.61,42.0,9.89,31.8
1320732,583.74,905092,27786.0,267.36,27875.0,459.47,192.11,89.0,4.83,32.91
1320716,583.74,905076,51917.0,0.0,52006.0,207.87,207.87,89.0,5.22,35.61
1320724,583.74,905084,40531.0,103.57,40576.0,323.28,219.72,45.0,10.92,37.64
1320719,583.74,905079,47491.0,88.57,47536.0,314.28,225.72,45.0,11.22,38.67
1320712,583.74,905070,62911.0,352.35,64288.0,582.05,229.7,1377.0,0.37,39.35
1320711,583.74,905069,65038.0,328.54,66192.0,583.74,255.2,1154.0,0.49,43.72
1320714,583.74,905073,57165.0,306.14,58508.0,580.88,274.74,1343.0,0.46,47.06
1320727,583.74,905087,35278.0,83.85,35367.0,361.36,277.52,89.0,6.98,47.54


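* The segment compared next is missing from the new averages because its only row with a non-zero speed covered just ~26% of the segment length (26.27 in the `percent` column of the table that follows).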
* Maybe up the threshold?

In [43]:

compare_average(avg_test, og_avg, shape_array4, stop_seq4)

route 50d9dd7ba11f08a8c86130778d6cabc4, sequence 14
Original


Unnamed: 0,schedule_gtfs_dataset_key,stop_id,loop_or_inlining,p50_mph,n_trips,p20_mph,p80_mph,time_of_day
74633,dbbe8ee4864a2715a40749605395d584,916,0,11.34,1,11.34,11.34,all_day


New


Unnamed: 0,schedule_gtfs_dataset_key,stop_id,loop_or_inlining,district,district_name,p50_mph,n_trips,p20_mph,p80_mph,time_of_day


In [44]:
merge1.loc[(merge1.shape_array_key == shape_array4) & (merge1.stop_sequence == stop_seq4)][merge1_preview_cols]

Unnamed: 0,meters_length,trip_id,min_time,min_dist,max_time,max_dist,meters_elapsed,sec_elapsed,speed_mph,percent
756102,386.01,904427,53824.0,284.59,53844.0,386.01,101.42,20.0,11.34,26.27
756103,386.01,904430,27500.0,386.01,27510.0,386.01,0.0,10.0,0.0,0.0
756104,386.01,904452,75250.0,0.0,75339.0,0.0,0.0,89.0,0.0,0.0


In [45]:
compare_average(avg_test, og_avg, shape_array5, stop_seq5)

route 21a802700dadbc2e249b595773a50363, sequence 8
Original


Unnamed: 0,schedule_gtfs_dataset_key,stop_id,loop_or_inlining,p50_mph,n_trips,p20_mph,p80_mph,time_of_day
29471,dbbe8ee4864a2715a40749605395d584,579,0,22.24,6,20.76,25.12,all_day
29472,dbbe8ee4864a2715a40749605395d584,579,0,22.94,4,18.22,27.1,peak


New


Unnamed: 0,schedule_gtfs_dataset_key,stop_id,loop_or_inlining,district,district_name,p50_mph,n_trips,p20_mph,p80_mph,time_of_day
27691,dbbe8ee4864a2715a40749605395d584,579,0,7,District 7 - Los Angeles,21.72,3,17.33,26.73,all_day
27692,dbbe8ee4864a2715a40749605395d584,579,0,7,District 7 - Los Angeles,22.23,2,17.53,26.93,peak


## Comparison
* I still want a left join for segments, but only onto segments that have ever had RT trips, so that the ones that go missing after the 45% rule still show up.

* I don't want a left join that shows every segment, because segments are cut from scheduled shapes and more operators have schedule data than RT data. A left join off the schedule would leave a mess of segments at the end, so the left join should be off "RT shapes" instead.

In [64]:
# merge1: every segment-trip row BEFORE the segment-coverage cutoff is applied.
# These rows were already limited to speed_mph <= 70 when `speeds` was read.
# Count the distinct stop sequences per shape / operator feed.
og_group_keys = ['shape_array_key','schedule_gtfs_dataset_key']
og_stops_per_shape = merge1.groupby(og_group_keys).agg({'stop_sequence':'nunique'})
rt_segs_agg = (og_stops_per_shape
               .reset_index()
               .rename(columns = {'stop_sequence':'total_stops'})
               .add_prefix('og_')
              )

In [65]:
rt_segs_agg.sample()

Unnamed: 0,og_shape_array_key,og_schedule_gtfs_dataset_key,og_total_stops
2800,b1f2084455c083bb63d9e6fd19154bd9,7cc0cb1871dfd558f11a2885c145d144,26


In [63]:
# avg_test: averages computed AFTER dropping rows above 70 mph and rows under
# the segment-coverage cutoff. Count the distinct stop sequences that remain
# per shape / operator feed.
testing_group_keys = ['shape_array_key','schedule_gtfs_dataset_key']
testing_stops_per_shape = avg_test.groupby(testing_group_keys).agg({'stop_sequence':'nunique'})
avg_test_agg = (testing_stops_per_shape
                .reset_index()
                .rename(columns = {'stop_sequence':'total_stops'})
                .add_prefix('testing_')
               )

In [66]:
avg_test_agg.sample()

Unnamed: 0,testing_shape_array_key,testing_schedule_gtfs_dataset_key,testing_total_stops
3843,f2a531fcb272ac5dd97c3b914a3507f5,5456c80d420043e15c8eb7368a8a4d89,15


In [49]:
avg_test_agg.shape, rt_segs_agg.shape

((4060, 3), (4088, 3))

In [68]:
# Shape-level comparison: an outer join keeps shapes that appear on only one
# side, and `_merge` records which side each row came from.
og_keys = ['og_shape_array_key', 'og_schedule_gtfs_dataset_key']
testing_keys = ['testing_shape_array_key', 'testing_schedule_gtfs_dataset_key']
m1 = pd.merge(
    rt_segs_agg,
    avg_test_agg,
    left_on = og_keys,
    right_on = testing_keys,
    how = 'outer',
    indicator = True,
)

In [70]:
m1['total_thrown_out_stops'] = m1.og_total_stops - m1.testing_total_stops

In [75]:
m1.sample(3)

Unnamed: 0,og_shape_array_key,og_schedule_gtfs_dataset_key,og_total_stops,testing_shape_array_key,testing_schedule_gtfs_dataset_key,testing_total_stops,_merge,total_thrown_out_stops
3220,cb62fe67480015ead2335637cc9186cb,0139b1253130b33adcd4b3a4490530d2,30,cb62fe67480015ead2335637cc9186cb,0139b1253130b33adcd4b3a4490530d2,26.0,both,4.0
2514,9ddb6474b9b2af72d4e1a8c99fc768c4,1770249a5a2e770ca90628434d4934b1,5,9ddb6474b9b2af72d4e1a8c99fc768c4,1770249a5a2e770ca90628434d4934b1,4.0,both,1.0
2534,9ef649fec1b82fc03432099552fe0cc6,eb9acbcb42315399bb54df78adfd3dac,40,9ef649fec1b82fc03432099552fe0cc6,eb9acbcb42315399bb54df78adfd3dac,39.0,both,1.0


In [76]:
m1.loc[m1._merge == 'left_only'].sample(3)

Unnamed: 0,og_shape_array_key,og_schedule_gtfs_dataset_key,og_total_stops,testing_shape_array_key,testing_schedule_gtfs_dataset_key,testing_total_stops,_merge,total_thrown_out_stops
2944,ba44c4ca0a8cffeb99a85555699c2af4,8f152d5328f38ce505eb9c647e08375e,3,,,,left_only,
3426,d79dacb8b4fe1f5416ec106f4ac5672d,07d3b79f14cec8099119e1eb649f065b,1,,,,left_only,
2606,a3f8b36fe1105d028d1dc627f4961e5c,770072d7a8d356b529ef34fe01715bcb,2,,,,left_only,


* 28 shapes (routes) that had RT rows before filtering have no segments left afterwards (`left_only` in the counts below).

In [71]:
m1._merge.value_counts()

both          4060
left_only       28
right_only       0
Name: _merge, dtype: int64

In [73]:
m1.total_thrown_out_stops.describe()

count   4060.00
mean       3.18
std        3.85
min        0.00
25%        1.00
50%        2.00
75%        4.00
max       55.00
Name: total_thrown_out_stops, dtype: float64

#### Seeing all the sequences thrown out

In [50]:
subset = ['shape_array_key','schedule_gtfs_dataset_key', 'stop_sequence']

In [51]:
merge2 = merge1[subset]

In [52]:
avg_test2 = avg_test[subset]

In [53]:
m2 = pd.merge(merge2, avg_test2, on = subset, how = 'outer', indicator = True)

In [54]:
m2._merge.value_counts()

both          4272884
left_only       74121
right_only          0
Name: _merge, dtype: int64

In [55]:
# Stop-segment rows that exist before filtering but not in the new averages.
# This must come from `m2`, the stop_sequence-level merge. `m1` is the
# shape-level merge with og_/testing_-prefixed columns, so it has no
# `shape_array_key` or `stop_sequence` column for the groupby that follows.
thrown_out_sequences = m2.loc[m2._merge == "left_only"].reset_index()

In [56]:
# Number of distinct stop sequences lost per shape.
missing_per_shape = (thrown_out_sequences
                     .groupby(['shape_array_key'])
                     .agg({'stop_sequence':'nunique'}))
thrown_out_sequences2 = (missing_per_shape
                         .reset_index()
                         .rename(columns = {'stop_sequence':'total_missing_stops'})
                        )

In [57]:
thrown_out_sequences2.shape

(3816, 2)

In [58]:
thrown_out_sequences2.head()

Unnamed: 0,shape_array_key,total_missing_stops
0,000b8c60f7767e8214f6ef6638d2cb83,3
1,001fe9bb4ed5b0bbccf1040fb5168dd2,1
2,00292832fd6aa3873503acd72b70dddc,8
3,002a87c755b2feafe8d51d142ecd6d49,1
4,00333fc7d5684277975b1adcdd72be8b,1


In [59]:
thrown_out_sequences2.total_missing_stops.describe()

count   3816.00
mean       3.41
std        3.88
min        1.00
25%        1.00
50%        2.00
75%        4.00
max       55.00
Name: total_missing_stops, dtype: float64