In [None]:
import datetime
import _speed_utils as speed_utils
import dask.dataframe as dd
import numpy as np
import geopandas as gpd
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers, segment_calcs,sched_rt_utils
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    PROJECT_CRS,
    SEGMENT_GCS,
)
from scripts import (A1_sjoin_vp_segments, A2_valid_vehicle_positions)
from shared_utils import calitp_color_palette as cp

# Path to the project's YAML config file.
CONFIG_PATH = './scripts/config.yml'
# Parameters stored under the "stop_segments" key. Indexed later in this
# notebook by 'stage0', 'stage1', 'stage3' and 'segments_file' to build file paths.
STOP_SEG_DICT = helpers.get_parameters(CONFIG_PATH, "stop_segments")

In [None]:
# Notebook display settings: up to 100 columns, floats with two decimals,
# no row limit and no truncation of cell contents.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [None]:
analysis_date = '2023-07-12'

### All operators

In [None]:
# speed_utils.flagging_stage returns four dataframes for the analysis date:
#   flagged: all the rows in the df, each one flagged
#   divide_by_zero: only the rows that have 0 for meters and sec elapsed
#   trips_count: for each route, the % of trips with 1+ division-by-0 row
#   route_most_populated_df: for each route, the trip with the smallest %
#       of rows that are divided by 0
flagged, divide_by_zero, trips_count, route_most_populated_df = speed_utils.flagging_stage(analysis_date)

In [None]:
# all_ops = list(flagged._gtfs_dataset_name.sort_values().unique())

### Filtering for Big Blue Bus only

In [None]:
big_blue_bus = "Big Blue Bus VehiclePositions"

In [None]:
bbb_only = flagged[flagged._gtfs_dataset_name == big_blue_bus].reset_index(drop = True)

In [None]:
# gtfs_dataset_key for Big Blue Bus; passed below as the `gtfs_key` filter
# when reading parquet files.
# NOTE(review): hard-coded hash -- presumably tied to this analysis_date; confirm.
bbb_key = "6c2d7daaf979779fa2089c6395baf98b"

In [None]:
bbb_routes = list(bbb_only.shape_array_key.unique())

In [None]:
len(bbb_routes)

In [None]:
# Rows that are divided by 0
bbb_only_zero = divide_by_zero[divide_by_zero._gtfs_dataset_name == big_blue_bus].reset_index(drop = True)

In [None]:
# Share of Big Blue Bus rows that are divided by 0. The message names the
# unit ("rows") and formats the share as a percentage with one decimal,
# so the sentence reads unambiguously.
f"{len(bbb_only_zero)} rows ({len(bbb_only_zero) / len(bbb_only):.1%}) are divided by 0."

In [None]:
f"{bbb_only_zero.shape_array_key.nunique()} routes have 1+ row that is divided by 0"

In [None]:
bbb_trips_count = trips_count[trips_count.shape_array_key.isin(bbb_routes)].reset_index(drop = True)

#### 78% of Big Blue Bus trips have 1+ row that is divided by 0 across its routes, compared with the overall mean of 52% across all operators.

In [None]:
trips_count.percent_of_trips_with_problematic_rows.describe()

In [None]:
bbb_trips_count.percent_of_trips_with_problematic_rows.describe()

#### Does BBB have more "not ok" rows compared to other operators?  
* Yes, Big Blue Bus ranks the highest. 
* 42% of their total rows are divided by 0 compared to the mean of 12%.
* 49% of their total rows are ok, compared to the mean of 72%.
* 5% of their total rows record "meters too low", compared to the mean of 4%.
* However, Big Blue Bus has a lower than average % of rows flagged as "seconds too high" - 3% compared to 13%. 

In [None]:
# How is value counts giving me a different answer than grouping by trip_id
bbb_only.flag.value_counts()

In [None]:
# Number of rows (non-null trip_id) for every operator / flag combination.
agg1 = (
    flagged
    .groupby(['_gtfs_dataset_name', 'flag'])['trip_id']
    .count()
    .rename('count_of_rows')
    .reset_index()
)

In [None]:
# Total number of rows (non-null trip_id) for every operator.
agg2 = (
    flagged
    .groupby(['_gtfs_dataset_name'])['trip_id']
    .count()
    .rename('total_rows')
    .reset_index()
)

In [None]:
agg3 = pd.merge(agg1, agg2, on = ['_gtfs_dataset_name'], how = 'inner') 

In [None]:
agg3['percent_of_rows'] = agg3.count_of_rows/agg3.total_rows * 100

In [None]:
agg3[agg3.flag == "division by 0"].percent_of_rows.describe()

In [None]:
agg3[agg3.flag == "meters too low"].percent_of_rows.describe()

In [None]:
agg3[agg3.flag == "seconds too high"].percent_of_rows.describe()

In [None]:
agg3[agg3.flag == "ok"].percent_of_rows.describe()

In [None]:
agg3[agg3._gtfs_dataset_name == big_blue_bus]

In [None]:
agg3[agg3._gtfs_dataset_name == big_blue_bus][['count_of_rows']].sum()

In [None]:
bbb_only.shape_array_key.nunique()

#### Which operators have the highest % of ok rows to compare against?
* Using LA DOT since it is also a big operator.
* LA DOT has 74k first and last positions; Big Blue Bus only has 35k.
* However, LA DOT has 110 routes, while Big Blue Bus has 62.


In [None]:
agg3[agg3.flag == "ok"].sort_values(['percent_of_rows'], ascending = False).head(10)

In [None]:
agg3[agg3._gtfs_dataset_name == "LA DOT VehiclePositions"][['count_of_rows']].sum()

In [None]:
agg3[agg3._gtfs_dataset_name == "LA DOT VehiclePositions"]

In [None]:
ladot_only = flagged[flagged._gtfs_dataset_name == "LA DOT VehiclePositions"]

In [None]:
ladot_routes = list(ladot_only.shape_array_key.unique())

In [None]:
len(ladot_routes)

In [None]:
# gtfs_dataset_key for LA DOT; passed below as the `gtfs_key` filter
# when reading the segments file.
# NOTE(review): hard-coded hash -- presumably tied to this analysis_date; confirm.
ladot_key = "5222fe2cf728fd3f16b2ff51e133fe8c"

In [None]:
# Ratio of Big Blue Bus shapes to LA DOT shapes, computed from the data
# rather than typed-in counts so it stays correct when the data changes.
len(bbb_routes) / len(ladot_routes)

In [None]:
# Back-of-envelope scaling: 0.56 is the rounded Big Blue Bus / LA DOT route
# ratio (62/110); 74601 is presumably LA DOT's total row count from the
# agg3 sum above -- TODO confirm and replace with computed values.
.56*74601

#### Find shapes for Big Blue Bus in which all the trips have 1+ division-by-0 row to observe
* Filter down so we can map the most problematic routes, instead of all of them?

In [None]:
bbb_trips_count.sort_values(by = ['percent_of_trips_with_problematic_rows'], ascending = False).head()

In [None]:
# Keep the shapes whose percent_of_trips_with_problematic_rows is non-zero,
# sorted by `all_trips` in descending order.
bbb_shapes_with_zeroes = (
    bbb_trips_count
    .loc[lambda df: df.percent_of_trips_with_problematic_rows != 0]
    .sort_values(by = 'all_trips', ascending = False)
    .reset_index(drop = True)
)

In [None]:
bbb_shapes_with_zeroes = list(bbb_shapes_with_zeroes.shape_array_key.unique())

In [None]:
len(bbb_shapes_with_zeroes)

In [None]:
# Grab the trip from routes that are the most populated.
bbb_route_most_populated_df = route_most_populated_df[route_most_populated_df.shape_array_key.isin(bbb_routes)].reset_index(drop = True)

In [None]:

bbb_route_most_populated_df.sort_values(by = ['percent_of_ok_rows', 'total_rows'], ascending = False)

#### Count vehicle positions collected for the route

### Stage 3 

In [None]:
stage3 = speed_utils.flag_stage3(divide_by_zero, analysis_date)

In [None]:
sort_by_cols = ['_gtfs_dataset_name','shape_array_key','trip_id','stop_sequence']

In [None]:
preview_cols = sort_by_cols + ['stop_id','gtfs_dataset_key','location_timestamp_local','pair','stage3_flag','flag']

In [None]:
stage3 = stage3.sort_values(by = sort_by_cols).reset_index(drop = True)

In [None]:
# Looking at a "normal" route that need to be divided by 0 
# stage3[(stage3.shape_array_key == "b7ed7b6ed70a7553118c8e065b79cb8d") & (stage3.trip_id == "1101539")][preview_cols]

In [None]:
preview_cols2 =['shape_array_key', 'stop_sequence', 'gtfs_dataset_key', 'stop_id',
 '_gtfs_dataset_name', 'trip_id', 'flag']

In [None]:
bbb_stage3 = stage3[stage3._gtfs_dataset_name == big_blue_bus].reset_index(drop = True)

### Map BBB
* Only mapping shapes where there is 1+ trip that has 1+ row that is divided by 0. 
* Only mapping the trip that has the highest % of ok rows.
* As of 7/24, there are 50 routes and corresponding 50  trips.
* Doing this to avoid visual overload.

In [None]:
# Number of BBB shapes with a non-zero percent_of_trips_with_problematic_rows,
# i.e. shapes where at least some trips have a row that is divided by 0.
len(bbb_shapes_with_zeroes)

In [None]:
# Grabbing the trip ids to sample
bbb_trips_zero_tripids = bbb_route_most_populated_df[bbb_route_most_populated_df.shape_array_key.isin(bbb_shapes_with_zeroes)]

In [None]:
bbb_trips_zero_tripids = list(bbb_trips_zero_tripids.trip_id.unique())

In [None]:
len(bbb_trips_zero_tripids)

In [None]:
def import_unique_trips(gtfs_key:str, routes:list, analysis_date:str):
    """
    Read the stage1 vehicle-position file for the analysis date through
    A1_sjoin_vp_segments.add_grouping_col_to_vp, with `shape_array_key`
    passed as the grouping column, and return the result as is.

    NOTE(review): `gtfs_key` and `routes` are accepted but not used;
    no operator/route filter is applied, so all rows come back.
    """
    stage1_file = f"{STOP_SEG_DICT['stage1']}_{analysis_date}"
    return A1_sjoin_vp_segments.add_grouping_col_to_vp(
        stage1_file,
        analysis_date,
        ["shape_array_key"],
    )

In [None]:
# Pass the list of BBB shapes that have division-by-0 rows
# (`bbb_shapes_with_zeroes`, built earlier in this notebook).
unique_trips = import_unique_trips(bbb_key, bbb_shapes_with_zeroes, analysis_date)

In [None]:
len(unique_trips)

In [None]:
def import_vehicle_positions(unique_trips:pd.DataFrame, 
                             gtfs_key:str,
                             analysis_date:str,
                             trips:list)-> gpd.GeoDataFrame:
    """
    Load ALL vehicle positions for one operator, attach the columns of
    `unique_trips`, and keep only the requested trips.

    Args:
        unique_trips: df from import_unique_trips()
        gtfs_key: gtfs_dataset_key of the operator to read
        analysis_date: date suffix of the stage0 file
        trips: trip_ids to keep in the result
    """
    stage0_folder = f"{STOP_SEG_DICT['stage0']}_{analysis_date}/"
    operator_filter = [[("gtfs_dataset_key", "==", gtfs_key)]]

    # The helper's result is materialized with .compute() before merging.
    operator_vp = helpers.import_vehicle_positions(
        SEGMENT_GCS,
        stage0_folder,
        "gdf",
        filters = operator_filter,
        columns = ["gtfs_dataset_key", "trip_id","geometry"],
        partitioned = False,
    ).compute()

    merged = operator_vp.merge(
        unique_trips,
        on = ["gtfs_dataset_key", "trip_id"],
        how = "inner",
    )
    return merged[merged.trip_id.isin(trips)]

In [None]:
vps = import_vehicle_positions(unique_trips, bbb_key, analysis_date,bbb_trips_zero_tripids)

In [None]:
len(vps)

In [None]:
vps.trip_id.nunique(), vps.shape_array_key.nunique()

In [None]:
def import_segments(flagged_df: pd.DataFrame, 
                    gtfs_key:str,
                    shapes:list,
                    buffer_dist: float = 35) -> gpd.GeoDataFrame:
    """
    Import the cut segments for one operator and a list of shapes,
    reproject them to PROJECT_CRS and add a buffered geometry.

    Args:
        flagged_df: result df from categorize_meters_speeds_pandas().
            Currently unused; kept so existing calls keep working.
        gtfs_key: gtfs_dataset_key of the operator to read.
        shapes: shape_array_keys to read.
        buffer_dist: buffer distance in PROJECT_CRS units
            (presumably meters -- confirm). Defaults to 35.

    Returns:
        GeoDataFrame whose active geometry is the new `geometry_buffered`
        column; the unbuffered `geometry` column is kept alongside it.

    Note:
        The file path is built from the notebook-level `analysis_date`
        variable, which must be defined before this function is called.
    """
    FILE = STOP_SEG_DICT['segments_file']
    # Only the rows for the requested shapes and operator are read.
    gdf = gpd.read_parquet(
        f"{SEGMENT_GCS}{FILE}_{analysis_date}.parquet",
        filters = [[("shape_array_key", "in", shapes),
                    ("gtfs_dataset_key", "==", gtfs_key),
                   ]],
    ).to_crs(PROJECT_CRS)

    gdf["geometry_buffered"] = gdf.geometry.buffer(buffer_dist)
    return gdf.set_geometry('geometry_buffered')

In [None]:
segs = import_segments(bbb_only, bbb_key, bbb_shapes_with_zeroes)

In [None]:
segs_ladot = import_segments(ladot_only, ladot_key, ladot_routes)

In [None]:
segs.shape_array_key.nunique()

#### A lot of routes overlap for Big Blue Bus
* Is LA DOT similar? It also has a lot of overlapping routes.

In [None]:
segs.explore('shape_array_key',cmap= 'tab10', height = 400, width = 600, name = 'segments', legend = False)

In [None]:
# segs_ladot.explore('shape_array_key',cmap= 'tab10', height = 400, width = 600, name = 'segments', legend = False)

In [None]:
def find_first_last_points(gtfs_key:str, analysis_date:str)-> gpd.GeoDataFrame:
    """
    Read the stage3 file (only the first and last points) for one
    operator and return it as a GeoDataFrame in PROJECT_CRS that holds
    just the `geometry` and `stop_sequence` columns.
    """
    stage3_path = f"{SEGMENT_GCS}{STOP_SEG_DICT['stage3']}_{analysis_date}"
    operator_rows = pd.read_parquet(
        stage3_path,
        filters = [[('gtfs_dataset_key', "==", gtfs_key)]],
    )

    # x / y are turned into point geometries declared as EPSG:4326.
    point_geoms = gpd.points_from_xy(
        operator_rows.x, operator_rows.y, crs = "EPSG:4326"
    )
    projected = (
        gpd.GeoDataFrame(operator_rows, geometry = point_geoms)
        .to_crs(PROJECT_CRS)
        .drop(columns = ["x", "y"])
    )
    return projected[['geometry','stop_sequence']]

In [None]:
first_last = find_first_last_points(bbb_key, analysis_date)

In [None]:
# Sjoin first and last points to the segments, then make 'geometry_left'
# the active geometry of that same result.
sjoined_firstslast = (
    speed_utils.sjoin_vp_segments(segs, first_last)
    .set_geometry('geometry_left')
)

In [None]:
# Sjoin all vehicle positions
sjoined_results = speed_utils.sjoin_vp_segments(segs, vps)
sjoined_results = sjoined_results.set_geometry('geometry_left')

In [None]:
len(sjoined_firstslast), len(first_last)

In [None]:
base1 = segs.explore('shape_array_key',cmap= 'tab10', height = 400, width = 600, name = 'segments', legend = False)
all_points_map = vps.explore(m = base1, color = 'red',style_kwds = {'weight':5}, name= 'points')

In [None]:
# Look at the trips chosen for the mapped routes (the shapes in
# `bbb_shapes_with_zeroes`) to investigate.
(bbb_route_most_populated_df[bbb_route_most_populated_df
  .shape_array_key.isin(bbb_shapes_with_zeroes)]
 .sort_values(['shape_array_key'])
 .drop(columns = ['gtfs_dataset_key'])
      )

In [None]:
# all_points_map

In [None]:
base2 = segs.explore('shape_array_key',cmap= 'tab10', height = 400, width = 600, name = 'segments', legend = False)
first_last_map = sjoined_firstslast.explore(m = base2, color = 'blue',style_kwds = {'weight':5}, name= 'points')

In [None]:
# first_last_map

#### Look at routes with a high % of ok rows

In [None]:
def one_route_firstlast(segments:gpd.GeoDataFrame, points:gpd.GeoDataFrame, shape_key:str):
    """
    Display an interactive map for one shape: its segments as the base
    layer and, in blue, the points whose `shape_array_key_left` matches.

    Args:
        segments: segments with a `shape_array_key` column
        points: spatially joined points with a `shape_array_key_left` column
        shape_key: shape_array_key of the route to map
    """
    route_segments = segments[segments.shape_array_key == shape_key]
    route_points = points[points.shape_array_key_left == shape_key]

    segment_layer = route_segments.explore(
        'shape_array_key',
        cmap= 'tab10',
        height = 400,
        width = 600,
        name = 'segments',
        legend = False,
    )
    display(
        route_points.explore(
            m = segment_layer,
            color = 'blue',
            style_kwds = {'weight':5},
            name= 'points',
        )
    )

In [None]:
def one_route_allpts(segments:gpd.GeoDataFrame, points:gpd.GeoDataFrame, shape_key:str):
    """
    Display an interactive map for one shape: its segments as the base
    layer and, in red, the points whose `shape_array_key` matches.

    Args:
        segments: segments with a `shape_array_key` column
        points: vehicle positions with a `shape_array_key` column
        shape_key: shape_array_key of the route to map
    """
    route_segments = segments[segments.shape_array_key == shape_key]
    route_points = points[points.shape_array_key == shape_key]

    segment_layer = route_segments.explore(
        'shape_array_key',
        cmap= 'tab10',
        height = 400,
        width = 600,
        name = 'segments',
        legend = False,
    )
    display(
        route_points.explore(
            m = segment_layer,
            color = 'red',
            style_kwds = {'weight':5},
            name= 'points',
        )
    )

In [None]:
highest_key1 = "ec24d2f734c25d27c5206af1960c7c20"

In [None]:
# Count of non-null meters_elapsed rows for each trip on this shape.
(
    bbb_only
    .loc[bbb_only.shape_array_key == highest_key1]
    .groupby(['trip_id'])
    .agg({'meters_elapsed':'count'})
)

In [None]:
# Look at each row from the flagged dataframe.
speed_utils.original_df_rows(bbb_only, 
                             "904961",
                             highest_key1,
                             )

In [None]:
(bbb_route_most_populated_df[bbb_route_most_populated_df
                             .shape_array_key == highest_key1])

In [None]:
one_route_firstlast(segs, sjoined_firstslast, highest_key1)

In [None]:
one_route_allpts(segs, vps, highest_key1)

#### Look at routes with a low % of ok rows

##### Ex 1

In [None]:
low_key1= "de70089f186a809de6685c056377f892"

In [None]:
(bbb_route_most_populated_df[bbb_route_most_populated_df
                             .shape_array_key == low_key1])

In [None]:
# Count of non-null meters_elapsed rows for each trip on this shape.
(
    bbb_only
    .loc[bbb_only.shape_array_key == low_key1]
    .groupby(['trip_id'])
    .agg({'meters_elapsed':'count'})
)

In [None]:
speed_utils.original_df_rows(bbb_only, 
                             "903024",
                             low_key1,
                             )

In [None]:
one_route_firstlast(segs, sjoined_firstslast, low_key1)

In [None]:
one_route_allpts(segs, vps, low_key1)

##### Ex 2

In [None]:
low_key2 = "7e229c2580722238014a0d6c2c7910ed"

In [None]:
(bbb_route_most_populated_df[bbb_route_most_populated_df
                             .shape_array_key == low_key2])

In [None]:
# Count of non-null meters_elapsed rows for each trip on this shape.
(
    bbb_only
    .loc[bbb_only.shape_array_key == low_key2]
    .groupby(['trip_id'])
    .agg({'meters_elapsed':'count'})
)

In [None]:
speed_utils.original_df_rows(bbb_only, 
                             "903732",
                             low_key2,
                             )

In [None]:
one_route_firstlast(segs, sjoined_firstslast, low_key2)

In [None]:
one_route_allpts(segs, vps, low_key2)

#### Look at overlapping routes

In [None]:
overlap_key1 = "26cf9105aaf5efa345fe565befc7b67d"

In [None]:
(bbb_route_most_populated_df[bbb_route_most_populated_df
                             .shape_array_key == overlap_key1])

In [None]:
one_route_firstlast(segs, sjoined_firstslast, overlap_key1)

In [None]:
overlap_key2 = "dc0b97c39be8b2f825f345f600742f81"

In [None]:
(bbb_route_most_populated_df[bbb_route_most_populated_df
                             .shape_array_key == overlap_key2])

In [None]:
# Count of non-null meters_elapsed rows for each trip on this shape.
(
    bbb_only
    .loc[bbb_only.shape_array_key == overlap_key2]
    .groupby(['trip_id'])
    .agg({'meters_elapsed':'count'})
)

In [None]:
(bbb_only[(bbb_only.shape_array_key == overlap_key2) & (bbb_only.trip_id == "905371")]).groupby(['flag']).agg({'meters_cat':'count'})


In [None]:
one_route_firstlast(segs, sjoined_firstslast, overlap_key2)

In [None]:
one_route_allpts(segs, vps, overlap_key2)

### Error categories