In [1]:
import datetime
import dask.dataframe as dd
import numpy as np
import geopandas as gpd
import pandas as pd
import altair as alt
from segment_speed_utils import gtfs_schedule_wrangling, helpers, segment_calcs,sched_rt_utils
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    PROJECT_CRS,
    SEGMENT_GCS,  
    analysis_date,
)
from scripts import (A1_sjoin_vp_segments, A2_valid_vehicle_positions)
from shared_utils import calitp_color_palette as cp, rt_utils, geography_utils
import _threshold_utils as threshold_utils
import _rt_scheduled_utils as rt_scheduled_utils
# Config file path, relative to the notebook's working directory.
CONFIG_PATH = './scripts/config.yml'
# "stop_segments" section of the config; import_segments reads its 'segments_file' entry.
STOP_SEG_DICT = helpers.get_parameters(CONFIG_PATH, "stop_segments")


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [2]:
# Same config section that STOP_SEG_DICT was loaded from; the 'stage4' and
# 'stage5' file-name entries used below are read from this copy.
DICT = helpers.get_parameters(CONFIG_PATH, "stop_segments")

In [3]:
# Notebook-wide pandas display settings: show up to 100 columns, print floats
# with two decimals, and never truncate rows or cell contents.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

## % of Meters
* start with the speeds_stop_segments_{analysis_date} parquet (which is produced in B1_speeds_by_segment_trip ).
* bring in stop_segments_{analysis_date} (already in CRS 3310) to get each segment's length.
* merge with the speeds by segment-trip, which contains the meters_elapsed column
* calculate pct as meters_elapsed / segment_length, expressed as a percentage
* show me some charts around this, a couple of descriptives to make sure these are all ok
* give me rough descriptives of how many rows we're dropping if we go with keeping at least 30%, 40%, 50%

### Files

In [4]:
# File-name stem for the stage5 parquet, looked up in the stop_segments config.
STG5_FILE = DICT['stage5']
# Disabled: loading the full stage5 table and dropping its district columns.
# avg_speeds = gpd.read_parquet(f"{SEGMENT_GCS}{STG5_FILE}_{analysis_date}.parquet")
# avg_speeds = avg_speeds.drop(columns=["district", "district_name"])

In [5]:
def import_speeds_segs(analysis_date:str):
    """
    Read the stage4 parquet for one analysis date into a DataFrame.

    The path is SEGMENT_GCS + DICT['stage4'] + "_" + analysis_date, with no
    file extension appended.
    """
    stage4_path = f"{SEGMENT_GCS}{DICT['stage4']}_{analysis_date}"
    return pd.read_parquet(stage4_path)

In [6]:
def import_segments(analysis_date:str) -> gpd.GeoDataFrame:
    """
    Load every stop segment for one analysis date and attach its length.

    The segments parquet is reprojected to PROJECT_CRS, the geometry length is
    stored as `meters_length`, and the geometry and district columns are
    dropped so the result is a plain attribute table.
    """
    segments_path = f"{SEGMENT_GCS}{STOP_SEG_DICT['segments_file']}_{analysis_date}.parquet"
    segments = gpd.read_parquet(segments_path).to_crs(PROJECT_CRS)
    
    # Length is in PROJECT_CRS units (presumably meters, per the column name).
    segments["meters_length"] = segments.geometry.length
    
    unused_cols = ['geometry','geometry_arrowized','district','district_name']
    return segments.drop(columns = unused_cols)

In [7]:
def merge_segments_speeds(analysis_date:str):
    """
    Join segment lengths to the per-trip speeds and compute how much of each
    segment a trip covered.

    Args:
        analysis_date: date string used to locate both input files.

    Returns:
        The inner merge of segments and speeds on shape_array_key,
        gtfs_dataset_key and stop_sequence, with two added columns:
        `percent` (meters_elapsed / meters_length * 100, missing or
        non-finite values set to 0) and `rounded_percent` (`percent`
        truncated down to a multiple of 10, despite the name).
    """
    speeds = import_speeds_segs(analysis_date)
    segments = import_segments(analysis_date)
    merge1 = pd.merge(
        segments,
        speeds,
        on = ['shape_array_key','gtfs_dataset_key','stop_sequence'],
        how = "inner",
    )
    
    percent = merge1.meters_elapsed / merge1.meters_length * 100
    # A zero-length segment yields +/-inf here (NaN for 0/0). fillna alone
    # leaves inf in place, and astype(int) below raises on non-finite values,
    # so fold inf into the same "no coverage" value as NaN.
    percent = percent.replace([np.inf, -np.inf], np.nan).fillna(0)
    merge1['percent'] = percent
    
    # Truncate (not round) to the 10% bucket the row falls in: 37.9 -> 30.
    merge1['rounded_percent'] = ((merge1.percent/100)*10).astype(int)*10
    return merge1

In [8]:
# Segment-trip speeds joined to segment lengths; reused by the summaries and box plots below.
merge1 = merge_segments_speeds(analysis_date)

In [9]:
# test1 = row_cat(merge1)

In [10]:
def valid_trips_by_cutoff(df, percentages:list):
    """
    Summarize what remains when rows below a coverage cutoff are dropped.

    For each quantile q in `percentages`, the cutoff is the q-th quantile of
    `df.percent`; rows with `percent >= cutoff` are kept and summarized.

    Args:
        df: DataFrame with `percent`, `trip_id` and `speed_mph` columns.
        percentages: quantiles between 0 and 1, e.g. [.1, .2, .3].

    Returns:
        DataFrame with one row per cutoff that keeps at least one row, a
        0..n-1 index, and the columns
        - percentile: label carrying the cutoff truncated to a whole percent
        - n_rows: count of non-null trip_id among the kept rows
        - mean_speed_mph: mean speed_mph of the kept rows
        - percentage_kept_rows: n_rows as a percentage of len(df)
        An empty `percentages` list gives an empty frame with these columns.
    """
    summary_cols = ['percentile', 'n_rows', 'mean_speed_mph']
    og_len = len(df)
    
    # Collect one record per cutoff and build the frame once, instead of
    # growing it with pd.concat on every iteration.
    records = []
    for quantile in percentages:
        cutoff = df.percent.quantile(quantile).astype(float)
        kept = df[df.percent >= cutoff]
        
        # A NaN cutoff (e.g. no usable percent values) keeps nothing; such a
        # cutoff contributes no row to the summary.
        if kept.empty:
            continue
        
        records.append({
            'percentile': f"Min. of {cutoff.astype(int)}% of seg length covered",
            'n_rows': kept.trip_id.count(),
            'mean_speed_mph': kept.speed_mph.mean(),
        })
    
    # Without any record there is no n_rows column to divide, so return the
    # empty summary explicitly.
    if not records:
        return pd.DataFrame(columns = summary_cols + ['percentage_kept_rows'])
    
    final = pd.DataFrame(records, columns = summary_cols)
    final = final.assign(
        percentage_kept_rows = final.n_rows.divide(og_len) * 100)
    
    return final

In [11]:
# Summaries for cutoffs at the 10th through 80th percentile of `percent`.
test = valid_trips_by_cutoff(merge1, [.1,.2,.3,.4,.5,.6,.7,.8])

## Visualizing Speed
* https://nbviewer.org/github/cal-itp/data-analyses/blob/filter-speeds-avgs/rt_segment_speeds/18_speed_distribution.ipynb
* https://analysis.calitp.org/rt/district_07-los-angeles/9__speedmaps__district_07-los-angeles__itp_id_300.html

### % of rows kept

In [12]:
# Presumably pre_clean turns column names into display titles (the charts
# below refer to 'Percentage Kept Rows' and 'Percentile') -- confirm in
# _threshold_utils.
# NOTE(review): rebinding `test` makes this cell non-idempotent; re-running it
# passes already-cleaned output back into pre_clean.
test = threshold_utils.pre_clean(test)

In [13]:
# Main chart
def bar_chart(df, x_column: str, y_column:str, title:str):
    """
    Build a 400x300 bar chart of `y_column` against `x_column`.

    Bars are colored by `y_column` using the Cal-ITP bright category palette
    with the legend hidden, and the tooltip lists every column of `df`.
    """
    bar_color = alt.Color(
        y_column,
        scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
        legend=None,
    )
    bars = alt.Chart(df).mark_bar().encode(
        x=x_column,
        y=y_column,
        color=bar_color,
        tooltip=df.columns.tolist(),
    )
    titled = bars.properties(title = title)
    return threshold_utils.chart_size(titled, 400, 300)
    

In [14]:
# Share of rows that survive each percentile cutoff.
bar_chart(
    test,
    x_column='Percentage Kept Rows',
    y_column='Percentile',
    title='Rows Kept After % Segment Cutoff',
)

### % of rows kept & average speeds

In [54]:
# Mean speed of the rows that survive each percentile cutoff.
bar_chart(
    test,
    x_column='Mean Speed Mph',
    y_column='Percentile',
    title='Mean MPH by % Segment Length',
)

### Speed by percent (not percentiles as above)

In [16]:
# Aggregation spec: run describe() on speed_mph within each group.
# https://pbpython.com/groupby-agg.html
agg_func_describe = {'speed_mph': ['describe']}

In [17]:
# describe() statistics of speed_mph per 10% coverage bucket, rounded to two
# decimals; reset_index turns the bucket back into a column.
percent_speeds = merge1.groupby(['rounded_percent']).agg(agg_func_describe).round(2).reset_index()

In [18]:
# Drop the outermost column level left behind by the dict-style agg.
percent_speeds.columns = percent_speeds.columns.droplevel()

In [19]:
# Flatten to single-level names. After the bucket column, the order follows
# describe(): count, mean, std, min, 25%, 50%, 75%, max.
percent_speeds.columns = ['percentage','total_rows','speed_mph_mean', 'speed_std','speed_mph_min','speed_25','speed_50','speed_75','speed_mph_max',]

In [20]:
# Presumably pre_clean turns column names into display titles (later cells use
# 'Percentage', 'Total Rows', 'Speed 50') -- confirm in _threshold_utils.
percent_speeds = threshold_utils.pre_clean(percent_speeds)

In [21]:
# Store the bucket as text rather than a number before charting.
percent_speeds["Percentage"] = percent_speeds["Percentage"].astype('str')

In [22]:
# Number of rows falling in each 10% coverage bucket.
# NOTE(review): the title repeats the one used for the percentile-cutoff chart;
# this chart counts rows per bucket rather than rows kept after a cutoff.
bar_chart(
    percent_speeds,
    x_column='Percentage',
    y_column='Total Rows',
    title='Rows Kept After % Segment Cutoff',
)

In [23]:
# Median speed within each 10% coverage bucket.
bar_chart(
    percent_speeds,
    x_column='Percentage',
    y_column='Speed 50',
    title='% of Segment Covered - 50th Percentile Speed',
)

In [24]:
# 25th-percentile speed within each 10% coverage bucket.
bar_chart(
    percent_speeds,
    x_column='Percentage',
    y_column='Speed 25',
    title='% of Segment Covered - 25th Percentile Speed',
)

In [25]:
def speeds_one_trip_seq(shape_array_key:str, stop_sequence:int):
    """
    Read the stage4 rows for a single shape / stop_sequence pair.

    The filter is pushed into the parquet read, so only matching rows are
    loaded. Uses the notebook-level `analysis_date`.
    """
    row_filters = [[
        ("shape_array_key", "==", shape_array_key),
        ("stop_sequence", "==", stop_sequence),
    ]]
    stage4_path = f"{SEGMENT_GCS}{DICT['stage4']}_{analysis_date}"
    return pd.read_parquet(stage4_path, filters = row_filters)

In [26]:
def avg_speeds_one_trip_seq(shape_array_key:str, stop_sequence:int):
    """
    Read the all-day stage5 rows for a single shape / stop_sequence pair.

    Only rows with time_of_day == "all_day" are loaded; the district and
    geometry columns are dropped from the result.
    """
    stage5_path = f"{SEGMENT_GCS}{DICT['stage5']}_{analysis_date}.parquet"
    all_day_filters = [[
        ("shape_array_key", "==", shape_array_key),
        ("stop_sequence", "==", stop_sequence),
        ("time_of_day", "==", "all_day"),
    ]]
    avg = pd.read_parquet(stage5_path, filters = all_day_filters)
    return avg.drop(columns=["district", "district_name", "geometry"])

In [27]:
def merge_avg_all_speeds(shape_array_key:str, stop_sequence:int):
    """
    Inner-join the stage4 rows for one shape / stop_sequence with the matching
    all-day stage5 rows, on shape_array_key, stop_sequence and gtfs_dataset_key.
    """
    join_cols = ['shape_array_key','stop_sequence','gtfs_dataset_key']
    trip_speeds = speeds_one_trip_seq(shape_array_key, stop_sequence)
    segment_avgs = avg_speeds_one_trip_seq(shape_array_key, stop_sequence)
    return trip_speeds.merge(segment_avgs, on = join_cols, how = "inner")

### Sample segments and routes for Big Blue Bus

In [28]:
# Sample feed / organization for the charts below. test_org filters pub_df by agency.
# NOTE(review): test_operator and test_key are not referenced by any later
# cell visible here.
test_operator = "Big Blue Bus VehiclePositions"
test_org = "City of Santa Monica"
test_key = "6c2d7daaf979779fa2089c6395baf98b"

In [29]:
# Tabular export of average stop-segment speeds, restricted at read time to
# rows whose agency equals test_org; used to look up shape_array_key by shape_id.
pub_df = pd.read_parquet(
    f"{SEGMENT_GCS}export/avg_speeds_stop_segments_{analysis_date}_tabular.parquet", 
    filters = [[("agency", "==", test_org)]]
)

In [30]:
# Dark orange
shape_id1  = "26375"
stop_seq1 = 7
# First shape_array_key listed in pub_df for this shape_id.
shape_array1 = pub_df.loc[pub_df.shape_id == shape_id1, "shape_array_key"].iloc[0]

In [31]:
# Light yellow 
shape_id2 = "26342"
stop_seq2 = 23
# First shape_array_key listed in pub_df for this shape_id.
shape_array2 = pub_df.loc[pub_df.shape_id == shape_id2, "shape_array_key"].iloc[0]

In [32]:
# Dark Red
shape_id3 = "26393"
stop_seq3 = 32
# First shape_array_key listed in pub_df for this shape_id.
shape_array3 = pub_df.loc[pub_df.shape_id == shape_id3, "shape_array_key"].iloc[0]

In [33]:
# Light orange
shape_id4 = "26372"
stop_seq4 = 14
# First shape_array_key listed in pub_df for this shape_id.
shape_array4 = pub_df.loc[pub_df.shape_id == shape_id4, "shape_array_key"].iloc[0]

In [34]:
# Green
shape_id5 = "26400"
stop_seq5= 8
# First shape_array_key listed in pub_df for this shape_id.
shape_array5 = pub_df.loc[pub_df.shape_id == shape_id5, "shape_array_key"].iloc[0]

In [35]:
def one_route_map(avg_speeds:gpd.GeoDataFrame, shape_array_key:str):
    """
    Display an interactive map of one shape's segments, colored by p50_mph
    on the zero-to-thirty color scale.
    """
    one_shape = avg_speeds[avg_speeds.shape_array_key==shape_array_key]
    route_map = one_shape.explore(
        "p50_mph",
        tiles = "CartoDB Positron",
        cmap = rt_utils.ZERO_THIRTY_COLORSCALE,
        style_kwds = {'weight':5},
    )
    display(route_map)
    

### Speeds for 1 Sequence

In [36]:
def display_speeds(shape_array_key:str, stop_sequence:int):
    """
    Display each trip's speed on one segment as a bar chart, overlaid with
    dashed horizontal rules at p20_mph (red), p80_mph (blue) and p50_mph (green).

    The chart is 600x300 and interactive. Nothing is returned.
    """
    # Missing values anywhere in the merged table are shown as 0.
    speeds = merge_avg_all_speeds(shape_array_key,stop_sequence).fillna(0)
    
    seq_label = speeds.stop_sequence.iloc[0]
    shape_label = speeds.shape_array_key.iloc[0]
    chart_title = f"Speed Distribution for Seq {seq_label}/Shape {shape_label}"
    
    # Bars first, then the rules layered on top in a fixed order.
    layered = (alt.Chart(speeds)
         .mark_bar()
         .encode(x='trip_id', y='speed_mph')
         .properties(title = chart_title))
    
    rule_specs = [('red', 'p20_mph'), ('blue', 'p80_mph'), ('green', 'p50_mph')]
    for rule_color, speed_col in rule_specs:
        layered = layered + (alt.Chart(speeds)
                             .mark_rule(color=rule_color, strokeDash=[10, 7])
                             .encode(y=speed_col))
    
    sized = threshold_utils.chart_size(layered, 600,300)
    display(sized.interactive())

In [37]:
# Light-yellow sample segment.
display_speeds(shape_array_key=shape_array2, stop_sequence=stop_seq2)

In [38]:
# Dark-red sample segment.
display_speeds(shape_array_key=shape_array3, stop_sequence=stop_seq3)

### Speeds for 1 Shape

In [39]:
def box_whisker(df:pd.DataFrame, route:str):
    """
    Box plot of speed at every stop sequence along one shape.

    Args:
        df: table with `shape_array_key` and `_gtfs_dataset_name` columns,
            plus the columns that pre_clean presumably exposes as
            'Stop Sequence', 'Speed Mph' and 'Shape Array Key' (confirm in
            _threshold_utils).
        route: the shape_array_key to plot.

    Returns:
        A 600x300 altair box plot with min-max whiskers, one box per stop
        sequence.

    Raises:
        IndexError: if `df` has no rows for `route`.
    """
    one_shape = df[df.shape_array_key == route]
    
    # Fail with a usable message, keeping the exception type that the bare
    # .iloc[0] below would otherwise raise on an empty selection.
    if one_shape.empty:
        raise IndexError(f"No rows found for shape_array_key {route!r}")
    
    # The first value in the title is the feed name, so it is not labelled
    # "Seq" (that label belongs to the single-segment chart's stop sequence).
    chart_title = f"Speed Distribution for {one_shape._gtfs_dataset_name.iloc[0]}/Shape {one_shape.shape_array_key.iloc[0]}"
    
    one_shape = threshold_utils.pre_clean(one_shape)
    
    chart = (alt.Chart(one_shape)
    .mark_boxplot(extent='min-max').encode(
    x='Stop Sequence:O',
    y='Speed Mph:Q', color=alt.Color('Shape Array Key',
    scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS),
    legend=None))
    .properties(title = chart_title)
            )
    
    chart = threshold_utils.chart_size((chart), 600,300)
    
    return chart

In [40]:
# Dark-red sample shape.
box_whisker(df=merge1, route=shape_array3)

In [41]:
# Green sample shape.
box_whisker(df=merge1, route=shape_array5)

In [42]:
def speeds_one_route(shape_array_key:str):
    """
    Read every stage4 row for one shape, across all stop sequences.

    The filter is pushed into the parquet read. Uses the notebook-level
    `analysis_date`.
    """
    shape_filter = [[("shape_array_key", "==", shape_array_key)]]
    stage4_path = f"{SEGMENT_GCS}{DICT['stage4']}_{analysis_date}"
    return pd.read_parquet(stage4_path, filters = shape_filter)

In [43]:
def avg_speeds_one_route(shape_array_key:str):
    """
    Read the all-day stage5 rows for one shape, across all stop sequences,
    and drop the district and geometry columns.
    """
    stage5_path = f"{SEGMENT_GCS}{DICT['stage5']}_{analysis_date}.parquet"
    all_day_filters = [[
        ("shape_array_key", "==", shape_array_key),
        ("time_of_day", "==", "all_day"),
    ]]
    avg = pd.read_parquet(stage5_path, filters = all_day_filters)
    return avg.drop(columns=["district", "district_name", "geometry"])

In [44]:
# Per-trip and average speeds for the light-yellow sample segment.
m2 = merge_avg_all_speeds(shape_array_key=shape_array2, stop_sequence=stop_seq2)

In [45]:
# Dashed red horizontal rule at p20_mph.
# NOTE(review): rule1 is not added to any chart in the cells visible after this one.
rule1 = alt.Chart(m2).mark_rule(color='red', strokeDash=[10, 7]).encode(y='p20_mph')

In [46]:
# Treat a missing speed as 0 mph before binning.
m2["speed_mph"] = m2["speed_mph"].fillna(0)

In [47]:
# Speed bin edges in mph, every 5 from 0 through 65. Generated rather than
# typed out: the hand-written list skipped 20, which silently merged
# (15, 20] and (20, 25] into one double-width bin.
bins = list(range(0, 70, 5))

In [48]:

# Bin speeds using the edges in `bins`, plus an open-ended top bin so speeds
# above the last edge get their own "(last, inf]" label instead of coming back
# as NaN (which would then be relabelled as the lowest bin).
# include_lowest=True places a speed equal to the first edge (0 mph after the
# fillna above) in the first bin; labels keep the "(lo, hi]" text form.
bin_edges = bins + [float("inf")]
bin_labels = [f"({lo}, {hi}]" for lo, hi in zip(bin_edges[:-1], bin_edges[1:])]
m2["binned"] = pd.cut(
    m2.speed_mph, bin_edges, labels=bin_labels, include_lowest=True
).astype(str)

In [49]:
# Any speed pd.cut could not place in a bin is the text 'nan' at this point;
# relabel it as the lowest bin.
m2.binned = m2.binned.str.replace('nan','(0, 5]')

In [50]:
# Count trips per speed bin (non-null trip_id values).
# NOTE(review): this rebinds m2 from the row-level table to the per-bin
# summary, so the binning cells above cannot be re-run after it.
m2 = (m2
      .groupby(['binned'])[['trip_id']]
      .count()
      .reset_index()
      .rename(columns = {'trip_id':'number_of_trips'})
     )

In [51]:
# Interactive bar chart of trip counts per speed bin.
# NOTE(review): 'binned' holds text labels, so the x axis presumably sorts
# them alphabetically ('(10, 15]' before '(5, 10]') -- confirm and pass an
# explicit sort if bin order matters. The title 'Test' looks like a placeholder.
chart2 = (alt.Chart(m2)
    .mark_bar(size=40)
    .encode(
        x=alt.X('binned'),
        y=alt.Y('number_of_trips'),
        tooltip=m2.columns.tolist(),
    )
    .properties(title='Test')
    .interactive())

In [52]:
# Display the binned-speed chart at the same 400x300 size bar_chart uses.
threshold_utils.chart_size(chart2, 400, 300)