In [None]:
import datetime
import dask.dataframe as dd
import numpy as np
import geopandas as gpd
import pandas as pd
import altair as alt
from segment_speed_utils import gtfs_schedule_wrangling, helpers, segment_calcs,sched_rt_utils
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    PROJECT_CRS,
    SEGMENT_GCS,  
    analysis_date,
)
from scripts import (A1_sjoin_vp_segments, A2_valid_vehicle_positions,B2_avg_speeds_by_segment)
from shared_utils import calitp_color_palette as cp, rt_utils, geography_utils
import _threshold_utils as threshold_utils
import _rt_scheduled_utils as rt_scheduled_utils
# Pipeline config file; the path is relative to the notebook's working directory.
CONFIG_PATH = './scripts/config.yml'
# Parameters read from the "stop_segments" entry of the config
# (later cells look up file names and identifier columns from the same entry).
STOP_SEG_DICT = helpers.get_parameters(CONFIG_PATH, "stop_segments")

In [None]:
# Notebook-wide display settings: show up to 100 columns, two decimals for
# floats, and never truncate rows or cell contents.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [None]:
# Echo the analysis date imported from project_vars so the run is traceable.
analysis_date

## % of Meters
* start with the speeds_stop_segments_{analysis_date} parquet (which is produced in B1_speeds_by_segment_trip ).
* read in stop_segments_{analysis_date} (already in CRS 3310) to get each segment's length.
* merge with the speeds by segment-trip, which contains the meters_elapsed column
* calculate pct = meters_elapsed / segment_length
* show me some charts around this, a couple of descriptives to make sure these are all ok
* give me rough descriptives of how many rows we're dropping if we go with keeping at least 30%, 40%, 50%

### Open these files just for testing

In [None]:
def import_speeds_segs(analysis_date:str, 
                       max_speed_cutoff: int, 
                       dict_inputs:dict)-> pd.DataFrame:
    """
    Read the segment-trip speeds table for one date, filtered at read time.

    Args:
        analysis_date: date suffix of the speeds file.
        max_speed_cutoff: rows with speed_mph above this value are not read.
        dict_inputs: config parameters; the "stage4" entry names the speeds file.

    Returns:
        Rows with speed_mph <= max_speed_cutoff, sec_elapsed > 0 and
        meters_elapsed > 0.
    """
    speeds_path = f"{SEGMENT_GCS}{dict_inputs['stage4']}_{analysis_date}"
    
    # One inner list: every condition in it must hold for a row to be read.
    row_filters = [[
        ("speed_mph", "<=", max_speed_cutoff),
        ("sec_elapsed", ">", 0),
        ("meters_elapsed", ">", 0),
    ]]
    
    return pd.read_parquet(speeds_path, filters = row_filters)

In [None]:
def load_segments(analysis_date:str, dict_inputs:dict) -> gpd.GeoDataFrame:
    """
    Read every segment for one date from the segments parquet file.

    The file name comes from the "segments_file" entry of `dict_inputs`.
    Nothing is filtered or computed here; segment length is derived later
    by `calculate_segment_length`.
    """
    segments_path = f"{SEGMENT_GCS}{dict_inputs['segments_file']}_{analysis_date}.parquet"
    
    return gpd.read_parquet(segments_path)

## Sample segments and routes for Big Blue Bus

In [None]:
# Sample operator used for spot checks below.
# NOTE(review): presumably these are the feed name, the organization name and a
# dataset key for the same agency — confirm which key column `test_key` matches.
test_operator = "Big Blue Bus VehiclePositions"
test_org = "City of Santa Monica"
test_key = "6c2d7daaf979779fa2089c6395baf98b"

In [None]:
# Exported average speeds by stop segment, limited at read time to rows whose
# `agency` equals the sample organization.
pub_df = pd.read_parquet(
    f"{SEGMENT_GCS}export/avg_speeds_stop_segments_{analysis_date}_tabular.parquet", 
    filters = [[("agency", "==", test_org)]]
)

In [None]:
# Dark orange
# Sample shape and stop sequence to spot check. `shape_array1` is the
# shape_array_key of the first `pub_df` row with this shape_id; `.iloc[0]`
# raises IndexError if the shape_id is not present.
shape_id1  = "26375"
stop_seq1 = 7
shape_array1 = pub_df[pub_df.shape_id==shape_id1].shape_array_key.iloc[0]

In [None]:
# Light yellow 
# Second sample: shape_array_key of the first `pub_df` row with this shape_id
# (IndexError if the shape_id is not present).
shape_id2 = "26342"
stop_seq2 = 23
shape_array2 = pub_df[pub_df.shape_id==shape_id2].shape_array_key.iloc[0]

In [None]:
# Dark Red
# Third sample: shape_array_key of the first `pub_df` row with this shape_id
# (IndexError if the shape_id is not present).
shape_id3 = "26393"
stop_seq3 = 32
shape_array3 = pub_df[pub_df.shape_id==shape_id3].shape_array_key.iloc[0]

In [None]:
# Light orange
# Fourth sample: shape_array_key of the first `pub_df` row with this shape_id
# (IndexError if the shape_id is not present).
shape_id4 = "26372"
stop_seq4 = 14
shape_array4 = pub_df[pub_df.shape_id==shape_id4].shape_array_key.iloc[0]

In [None]:
# Green
# Fifth sample: shape_array_key of the first `pub_df` row with this shape_id
# (IndexError if the shape_id is not present).
shape_id5 = "26400"
stop_seq5= 8
shape_array5 = pub_df[pub_df.shape_id==shape_id5].shape_array_key.iloc[0]

In [None]:
def one_route_map(avg_speeds:gpd.GeoDataFrame, shape_array_key:str):
    """
    Display an interactive map of the segments belonging to one shape.

    Segments are colored by their `p50_mph` value using the zero-to-thirty
    color scale from rt_utils. The map is displayed; nothing is returned.
    """
    is_this_shape = avg_speeds.shape_array_key == shape_array_key
    shape_segments = avg_speeds[is_this_shape]
    
    speed_map = shape_segments.explore(
        "p50_mph",
        tiles = "CartoDB Positron",
        cmap = rt_utils.ZERO_THIRTY_COLORSCALE,
        style_kwds = {'weight': 5},
    )
    display(speed_map)
    

## Open Files
* Add `import_speeds_segs`,`calculate_segment_length`,`merge_segments_speeds` into `B2`

In [None]:
# Same call that built STOP_SEG_DICT in the first cell, so `dictionary` holds
# the same "stop_segments" parameters.
dictionary = helpers.get_parameters(CONFIG_PATH, "stop_segments")

In [None]:
# All segments for the analysis date, with geometry.
segments_gdf = load_segments(analysis_date, dictionary)

In [None]:
# Segment-trip speeds; 70 is the max speed_mph kept by the read-time filter.
speeds = import_speeds_segs(analysis_date,70,  dictionary)

In [None]:
def calculate_segment_length(segments:gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """
    Add each segment's length as `meters_length`, then drop the geometry
    and district columns.

    The length is in the units of the segments' CRS (the notebook notes the
    segments are already in CRS 3310). Because the geometry column is
    dropped, the result is tabular only.
    """
    dropped_cols = ['geometry','district','district_name']
    with_length = segments.assign(meters_length = segments.geometry.length)
    
    return with_length.drop(columns = dropped_cols)

In [None]:
def merge_segments_speeds(speeds: pd.DataFrame,
                          segments:gpd.GeoDataFrame, 
                          ) -> pd.DataFrame:
    """
    Merge segment-trip speeds onto segment lengths and compute coverage.

    Args:
        speeds: segment-trip speeds; must contain `meters_elapsed` and the
            merge columns below.
        segments: segments with geometry; their length is computed with
            `calculate_segment_length`.

    Returns:
        Inner merge of the two tables with an added `percent` column,
        meters_elapsed / meters_length * 100. Rows where that ratio is
        undefined (NaN) or infinite (zero-length segment) get 0.
    """
    segments = calculate_segment_length(segments)
    
    merge_cols = ['shape_array_key','stop_sequence','schedule_gtfs_dataset_key']
    merge1 = pd.merge(segments, speeds, on = merge_cols, how = "inner")
    
    percent = merge1.meters_elapsed.divide(merge1.meters_length) * 100
    
    # Dividing by a zero-length segment gives +/-inf, which fillna alone would
    # let through and which would pass every coverage cutoff. Treat it like
    # any other undefined coverage.
    percent = percent.replace([np.inf, -np.inf], np.nan).fillna(0)
    
    return merge1.assign(percent = percent)

In [None]:
# One row per segment-trip, with `meters_length` and the coverage `percent`.
merge1 = merge_segments_speeds(speeds, segments_gdf)

In [None]:
def myround(x, base=5):
    """
    Snap `x` to the nearest multiple of `base` (43 -> 45, 42 -> 40).

    Relies on the built-in `round`, so a value exactly halfway between two
    multiples goes to the even one (12.5 -> 10 with base 5).
    """
    n_steps = round(x / base)
    return base * n_steps

In [None]:
def valid_trips_by_cutoff(df, percentages:list):
    """
    Returns a table of how many rows/trips/routes/operators are kept
    after requiring a minimum share of each segment's length to be covered.

    For every quantile in `percentages`, the matching value of `df.percent`
    is looked up and rounded to the nearest multiple of 5 (43 becomes 45,
    42 becomes 40). Rows whose `percent` is at least that cutoff are kept
    and summarized.

    Ex: if the 20th percentile of `percent` is 40, passing 0.2 reports
    what is kept when at least 40% of a segment's length is covered.

    Args:
        df: result from merge_segments_speeds(speeds, segments_gdf). Needs
            the columns percent, trip_id, shape_array_key, speed_mph,
            gtfs_dataset_name and gtfs_dataset_key.
        percentages: quantiles as floats like 0.2, 0.25, 0.3

    Returns:
        One row per requested quantile with the cutoff label, the counts
        kept, the mean speed, and the share of rows/trips/routes kept
        (0-100, rounded to whole numbers). A cutoff that no row reaches
        contributes no row. An empty frame with the same columns is
        returned when `df` has no rows or `percentages` is empty.
    """
    summary_cols = ['percentile', 'n_rows', 'mean_speed_mph',
                    'n_kept_routes', 'n_kept_trips', 'n_kept_operators']
    round_cols = ['percentage_kept_rows', 'percentage_kept_trips', 'percentage_kept_routes']
    
    # Nothing to summarize: a quantile of an empty column is NaN and cannot
    # be rounded, and with no quantiles there is nothing to concatenate.
    if len(df) == 0 or len(percentages) == 0:
        return pd.DataFrame(columns = summary_cols + round_cols)
    
    og_len = len(df)
    og_trips = df.trip_id.nunique()
    og_shape_array_key = df.shape_array_key.nunique()
    
    summaries = []
    for i in percentages:
        # Nearest multiple of 5 (the same rule as `myround` with base 5).
        # float() keeps the label an integer regardless of the numpy scalar type.
        percent = 5 * round(float(df.percent.quantile(i)) / 5)
        
        temp = df[df.percent >= percent].assign(
            percentile = f"Min. of {percent}% of seg length covered")
        
        # The label is constant within `temp`, so this collapses to one row.
        summaries.append(
            temp
            .groupby(['percentile'])
            .agg(n_rows = ('gtfs_dataset_name', 'count'),
                 mean_speed_mph = ('speed_mph', 'mean'),
                 n_kept_routes = ('shape_array_key', 'nunique'),
                 n_kept_trips = ('trip_id', 'nunique'),
                 n_kept_operators = ('gtfs_dataset_key', 'nunique'))
            .reset_index()
        )
    
    # Concatenate once instead of growing a frame inside the loop.
    final = pd.concat(summaries, axis=0)
    
    final = final.assign(
        percentage_kept_rows = final.n_rows.divide(og_len) * 100,
        percentage_kept_trips = final.n_kept_trips.divide(og_trips) * 100,
        percentage_kept_routes = final.n_kept_routes.divide(og_shape_array_key) * 100)
    
    final[round_cols] = final[round_cols].round(0)
    return final

In [None]:
# Summarize what survives cutoffs taken at the 10th through 70th percentiles
# of segment coverage.
test = valid_trips_by_cutoff(merge1, [.1,.2,.3,.4,.5,.6,.7])

## Visualizing Speed
* https://nbviewer.org/github/cal-itp/data-analyses/blob/filter-speeds-avgs/rt_segment_speeds/18_speed_distribution.ipynb
* https://analysis.calitp.org/rt/district_07-los-angeles/9__speedmaps__district_07-los-angeles__itp_id_300.html

### % of rows kept

In [None]:
# NOTE(review): this overwrites `test`, so re-running the cell applies
# pre_clean to an already-cleaned table. The chart call further down uses
# title-cased column names ('Percentage Kept Rows'), so pre_clean presumably
# renames columns for display — confirm in _threshold_utils.
test = threshold_utils.pre_clean(test)

In [None]:
# Display the summary table after pre_clean.
test

In [None]:
# Main chart
def bar_chart(df, x_column: str, y_column:str, title:str):
    """
    Build an Altair bar chart of `x_column` against `y_column`.

    Bars are colored by `y_column` with the Cal-ITP bright category palette
    (no legend), and the tooltip lists every column of `df`. The chart is
    passed through threshold_utils.chart_size with 400 and 300
    (presumably width and height — confirm in _threshold_utils).
    """
    bar_color = alt.Color(
        y_column,
        scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
        legend=None,
    )
    bars = alt.Chart(df).mark_bar().encode(
        x=x_column,
        y=y_column,
        color=bar_color,
        tooltip=df.columns.tolist(),
    )
    titled = bars.properties(title = title)
    
    return threshold_utils.chart_size(titled, 400, 300)
    

In [None]:
# One bar per cutoff label (y) showing the share of rows kept (x).
bar_chart(test, 'Percentage Kept Rows','Percentile', 'Rows Kept After % Segment Cutoff')

### Edited B2 function

In [None]:
def speeds_with_segment_geom(
    analysis_date: str, 
    max_speed_cutoff: int = 70,
    dict_inputs: dict = {},
    percent_segment_covered:float = 0.55,
) -> gpd.GeoDataFrame: 
    """
    Import the segment-trip table, keep the trips that cover enough of their
    segment, and average the speed_mph across all trips present in the segment.

    Segments that appear in the RT speeds table but lose every trip to the
    filters are added back with placeholder values (0.0 for float64 columns,
    'None' for object columns) so they still appear in the output.

    Args:
        analysis_date: date suffix of the segment and speeds files.
        max_speed_cutoff: rows with speed_mph above this value are dropped.
        dict_inputs: config parameters; must provide "segments_file",
            "segment_identifier_cols" and "stage4".
        percent_segment_covered: minimum meters_elapsed / segment_length
            (a fraction, not 0-100) a segment-trip needs to be kept.

    Returns:
        Segment geometry, reprojected to geography_utils.WGS84, merged with
        the all_day and peak speed stats plus the placeholder rows.
    """
    SEGMENT_FILE = dict_inputs["segments_file"]
    SEGMENT_IDENTIFIER_COLS = dict_inputs["segment_identifier_cols"]
    SPEEDS_FILE = dict_inputs["stage4"]
    
    # Load in segment geometry
    segments = helpers.import_segments(
        SEGMENT_GCS,
        f"{SEGMENT_FILE}_{analysis_date}",
        columns = SEGMENT_IDENTIFIER_COLS + [
            "schedule_gtfs_dataset_key", 
            "stop_id",
            "loop_or_inlining",
            "geometry", 
            "district", "district_name"
        ]
    )
    
    # CRS is 3310, calculate the length
    segments["segment_length"] = segments.geometry.length
    
    # Read in speeds
    df = pd.read_parquet(
        f"{SEGMENT_GCS}{SPEEDS_FILE}_{analysis_date}"
    )
    
    # Find only unique segments with rt data before filtering
    unique_segments = df[SEGMENT_IDENTIFIER_COLS].drop_duplicates()
    
    # Do a merge with segments
    merge_cols = ['shape_array_key','stop_sequence','schedule_gtfs_dataset_key']
    df2 = pd.merge(segments, df, on = merge_cols, how = "inner")
    
    # Find share of meters elapsed vs. total segment length
    df2 = df2.assign(
        pct_seg = df2.meters_elapsed.divide(df2.segment_length)
    )
    
    # Keep segment-trips that cover enough of the segment, have a usable
    # speed at or below the cutoff, and actually moved in time and distance.
    df3 = df2[
        (df2.pct_seg >= percent_segment_covered)
        & (df2.speed_mph.notna())
        & (df2.speed_mph <= max_speed_cutoff)
        & (df2.sec_elapsed > 0)
        & (df2.meters_elapsed > 0)
    ]
    
    time_of_day_df = sched_rt_utils.get_trip_time_buckets(analysis_date)

    df4 = pd.merge(
        df3, 
        time_of_day_df, 
        on = "trip_instance_key", 
        how = "inner"
    )
    
    all_day = B2_avg_speeds_by_segment.calculate_avg_speeds(
        df4, 
        SEGMENT_IDENTIFIER_COLS
    )
    peak = B2_avg_speeds_by_segment.calculate_avg_speeds(
        df4[df4.time_of_day.isin(["AM Peak", "PM Peak"])], 
        SEGMENT_IDENTIFIER_COLS
    )
    
    stats = pd.concat([
        all_day.assign(time_of_day = "all_day"),
        peak.assign(time_of_day = "peak")
    ], axis=0)
    
    # Add back segments that had RT data but no trip surviving the filters.
    # Only segments seen in RT are considered (not every scheduled segment),
    # and only the identifier columns are carried so nothing collides with
    # the segment columns in the final merge, which attaches the geometry.
    stats_keys = stats[SEGMENT_IDENTIFIER_COLS].drop_duplicates()
    missing = pd.merge(
        unique_segments,
        stats_keys,
        on = SEGMENT_IDENTIFIER_COLS,
        how = "left",
        indicator = True
    )
    
    # Left-only rows are the RT segments that did not show up in the stats df
    missing = (missing.loc[missing._merge == "left_only", SEGMENT_IDENTIFIER_COLS]
               .reset_index(drop = True)
              )
    
    # Concat & fill in NA: 0.0 for float64 columns, 'None' for object columns.
    # Columns of any other dtype are left untouched.
    stats2 = pd.concat([missing, stats])
    fill_values = {}
    for col, col_dtype in stats2.dtypes.items():
        if col_dtype == "float64":
            fill_values[col] = 0.0
        elif col_dtype == "object":
            fill_values[col] = "None"
    stats2 = stats2.fillna(fill_values)
    stats2 = stats2.drop(columns = ['segment_length'], errors = "ignore")
    
    # Merge in segment geometry with a changed CRS
    segments = segments.to_crs(geography_utils.WGS84)
    
    gdf = pd.merge(
        segments,
        stats2,
        on = SEGMENT_IDENTIFIER_COLS,
        how = "inner"
    )
    
    return gdf

### Check out speeds

In [None]:

# Run the edited function: 70 is passed as max_speed_cutoff and 0.55 as
# percent_segment_covered.
avg_test = speeds_with_segment_geom(analysis_date, 
                                    70, 
                                    dictionary,
                                    0.55)

In [None]:
# Averages stored under the config's "stage5" file name; used as the
# baseline that `avg_test` is compared against.
STG5_FILE = dictionary['stage5']
og_avg = gpd.read_parquet(f"{SEGMENT_GCS}{STG5_FILE}_{analysis_date}.parquet")

In [None]:
# Count rows per shape / feed / stop sequence and show the largest counts.
# They should have no more than 2 (one all_day row and one peak row).
(
    avg_test
    .groupby(['shape_array_key','schedule_gtfs_dataset_key','stop_sequence'])
    .agg({'district':'count'})
    .sort_values('district', ascending = False)
    .head()
)

* There are 3000 more rows

In [None]:
# (rows, columns) of the baseline averages.
og_avg.shape

In [None]:
# (rows, columns) of the new averages, for comparison with the baseline.
avg_test.shape

## Checks

### Checking one sequence that was deemed "missing"

In [None]:
# Rows whose time_of_day is the 'None' placeholder are the segments that
# speeds_with_segment_geom added back after they had no stats of their own.
# (An earlier note attributes these rows to "add_back_missing_stops".)
# Show three at random, without the geometry column.
avg_test.loc[avg_test.time_of_day == 'None'].sample(3).drop(columns = ['geometry'])

In [None]:
# First example of a shape / stop sequence that was flagged as missing.
missing_shape1  = "609c3fe2d34fe2f239c12f998ae0fdb6"
missing_stop_seq1 = 33

In [None]:
def look_at_missing_row(shape, sequence):
    """
    Display one segment (shape_array_key + stop_sequence) in three tables:
    the baseline averages, the per-trip coverage/speed rows, and the new
    averages.

    Reads the notebook-level `og_avg`, `merge1` and `avg_test`.
    """
    def _pick(frame):
        # Rows of `frame` for the requested segment
        return frame.loc[(frame.shape_array_key == shape) & (frame.stop_sequence == sequence)]
    
    display(_pick(og_avg).drop(columns = ['geometry']))
    display(_pick(merge1)[['percent', 'speed_mph']])
    display(_pick(avg_test).drop(columns = ['geometry']))

In [None]:
# Baseline, per-trip and new rows for the first missing example.
look_at_missing_row(missing_shape1, missing_stop_seq1)

In [None]:
# Second example of a shape / stop sequence that was flagged as missing.
missing_shape2  = "4d006755475d7c2a03c525995311ec16"
missing_stop_seq2 = 29

In [None]:
# Baseline, per-trip and new rows for the second missing example.
look_at_missing_row(missing_shape2, missing_stop_seq2)

### Check using Big Blue Bus

In [None]:
def compare_average(avg_test:pd.DataFrame,
                    og_avg:pd.DataFrame, 
                    shape_array_key:str,
                    stop_sequence:int):
    """
    Show the baseline and new averages for one segment, one after the other.

    Prints the segment being compared, displays its rows from `og_avg` and
    from `avg_test` (identifier and geometry columns removed), and finishes
    with a small interactive map of the new rows.
    """
    def _segment_rows(frame):
        # Rows for the requested segment, minus the two identifier columns
        keep = (frame.shape_array_key == shape_array_key) & (frame.stop_sequence == stop_sequence)
        return frame[keep].drop(columns = ['shape_array_key','stop_sequence'])
    
    print(f"route {shape_array_key}, sequence {stop_sequence}")
    
    print('Original')
    display(_segment_rows(og_avg).drop(columns = ['geometry']))
    
    print('New')
    new = _segment_rows(avg_test)
    display(new.drop(columns = ['geometry']))
    display(new.explore(width = 300, height = 300))

In [None]:
# Distribution of p50_mph in the baseline, excluding rows where it is exactly 0.
og_avg.loc[og_avg.p50_mph != 0].p50_mph.describe()

In [None]:
# Same distribution for the new averages. Excluding 0 also excludes the
# added-back segments, whose float columns are filled with 0.0.
avg_test.loc[avg_test.p50_mph != 0].p50_mph.describe()

In [None]:
# compare_average(avg_test, og_avg, shape_array1, stop_seq1)

In [None]:
# compare_average(avg_test, og_avg, shape_array2, stop_seq2)

* Strangely enough, the 50th percentile speed became lower
* Sometimes the lower the length, the higher the speed?

In [None]:
# compare_average(avg_test, og_avg, shape_array3, stop_seq3)

In [None]:
# merge1.loc[(merge1.shape_array_key == shape_array3) & (merge1.stop_sequence == stop_seq3)][merge1_preview_cols].sort_values(['percent'])

* This is missing because the only row that was non-zero only covered 27% of the length
* Maybe up the threshold?

In [None]:
# Baseline vs. new averages for the "Light orange" sample segment.
compare_average(avg_test, og_avg, shape_array4, stop_seq4)

In [None]:
# Columns worth eyeballing when checking per-trip coverage for one segment.
# Defined here so the cell runs on a fresh kernel.
merge1_preview_cols = ['trip_id', 'meters_elapsed', 'meters_length', 'percent', 'speed_mph']

# Per-trip rows for the "Light orange" sample segment.
merge1.loc[
    (merge1.shape_array_key == shape_array4) & (merge1.stop_sequence == stop_seq4),
    merge1_preview_cols
]

In [None]:
# Baseline vs. new averages for the "Green" sample segment.
compare_average(avg_test, og_avg, shape_array5, stop_seq5)

## Comparison
* I still want a left join for segments, but only for segments that have ever had RT trips (and that go missing after the 45% rule).

* I don't want a left join that shows every segment, because segments are cut from scheduled shapes and more operators have schedule data than RT data. A left join off the schedule would leave a mess of segments with no RT data at the end, so the left join should be limited to "RT shapes".

In [None]:
# Results from original script: number of distinct stop sequences per shape
# and feed, with every column prefixed 'og_'.
rt_segs_agg = (
    og_avg
    .groupby(['shape_array_key','schedule_gtfs_dataset_key'])['stop_sequence']
    .nunique()
    .reset_index(name = 'total_stops')
    .add_prefix('og_')
)

In [None]:
# Peek at one random row to check the prefixed column names.
rt_segs_agg.sample()

In [None]:
# My test after filtering but adding back in cut rows: number of distinct
# stop sequences per shape and feed, with every column prefixed 'testing_'.
avg_test_agg = (
    avg_test
    .groupby(['shape_array_key','schedule_gtfs_dataset_key'])['stop_sequence']
    .nunique()
    .reset_index(name = 'total_stops')
    .add_prefix('testing_')
)

In [None]:
# Peek at one random row to check the prefixed column names.
avg_test_agg.sample()

In [None]:
# Number of shape/feed combinations in the test vs. the baseline.
avg_test_agg.shape, rt_segs_agg.shape

In [None]:
# Outer join so shape/feed combinations found in only one of the two results
# are kept; the `_merge` column records which side each row came from.
m1 = rt_segs_agg.merge(
    avg_test_agg,
    left_on = ['og_shape_array_key', 'og_schedule_gtfs_dataset_key'],
    right_on = ['testing_shape_array_key', 'testing_schedule_gtfs_dataset_key'],
    how = 'outer',
    indicator = True,
)

In [None]:
# Stop sequences present in the baseline but not in the test. The result is
# NaN for rows that exist on only one side of the outer join.
m1['total_thrown_out_stops'] = m1.og_total_stops - m1.testing_total_stops

In [None]:
# Three random rows of the comparison table.
m1.sample(3)

In [None]:
# How many shape/feed combinations are in both results vs. only one of them.
m1._merge.value_counts()

* 28 routes are missing.

In [None]:
# Distribution of the per-route difference in stop-sequence counts.
m1.total_thrown_out_stops.describe()

#### Seeing all the sequences thrown out

In [None]:
# List the baseline columns before choosing the merge keys and subset below.
og_avg.columns

In [None]:
# Segment-level keys used to line up the baseline and test tables.
merge_cols = ['shape_array_key', 'stop_sequence', 'schedule_gtfs_dataset_key',
       'stop_id', 'loop_or_inlining']

In [None]:
# Keys plus the speed stats and time_of_day; geometry and district columns
# are left out.
subset = ['shape_array_key', 'stop_sequence', 'schedule_gtfs_dataset_key',
       'stop_id', 'loop_or_inlining', 'p50_mph', 'n_trips',
       'p20_mph', 'p80_mph', 'time_of_day']

In [None]:
# Baseline restricted to the comparison columns.
og_avg2 = og_avg[subset]

In [None]:
# (rows, columns) of the baseline subset.
og_avg2.shape

In [None]:
# New averages restricted to the same comparison columns.
avg_test2 = avg_test[subset]

In [None]:
# (rows, columns) of the test subset.
avg_test2.shape

In [None]:
# Outer join of baseline and test rows on the segment keys; `_merge` shows
# which side each row came from, and the stat columns get _x / _y suffixes.
# NOTE(review): time_of_day is not a merge key, so a segment with several
# time_of_day rows on each side is paired many-to-many — confirm that is intended.
m2 = pd.merge(og_avg2, avg_test2, on = merge_cols, how = 'outer', indicator = True)