In [None]:
import datetime
import _speed_utils as speed_utils
import _threshold_utils as threshold_utils
import altair as alt
import dask.dataframe as dd
import geopandas as gpd
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers, segment_calcs
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    PROJECT_CRS,
    SEGMENT_GCS,
    analysis_date,
    CONFIG_PATH
)
from scripts import A1_sjoin_vp_segments
from shared_utils import calitp_color_palette as cp

In [None]:
# Notebook-wide pandas display settings.
# Show up to 100 columns and render floats with 2 decimals.
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
# None removes the limit: every row and the full width of each cell is rendered,
# so prefer .head()/.sample() when displaying large frames.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [None]:
# alt.data_transformers.disable_max_rows()

### Merging

In [None]:
def merge_all_speeds(analysis_date:str) -> pd.DataFrame:
    """
    Combine the all-day average speeds with the stop-segment speeds.

    Reads `avg_speeds_stop_segments_{analysis_date}.parquet` and
    `speeds_stop_segments_{analysis_date}` from SEGMENT_GCS and inner-merges
    them on gtfs_dataset_key / shape_array_key / stop_sequence.

    Args:
        analysis_date: date string used in both parquet names

    Returns:
        The merged rows with exact duplicates removed and a fresh index.
    """
    avg_path = f"{SEGMENT_GCS}avg_speeds_stop_segments_{analysis_date}.parquet"
    speeds_path = f"{SEGMENT_GCS}speeds_stop_segments_{analysis_date}"

    # Average speeds: drop geometry/district columns, keep only the all_day rows
    all_day_avg = pd.read_parquet(avg_path).drop(
        columns=["geometry", "district", "district_name"]
    )
    all_day_avg = all_day_avg.loc[
        all_day_avg.time_of_day == 'all_day'
    ].reset_index(drop=True)

    # Speeds for each stop segment
    segment_speeds = pd.read_parquet(speeds_path)

    merged = all_day_avg.merge(
        segment_speeds,
        on=['gtfs_dataset_key','shape_array_key', 'stop_sequence'],
        how='inner',
    )

    return merged.drop_duplicates().reset_index(drop=True)

In [None]:
# All merged speed rows for the analysis date (all shapes, not only the sample)
m1 = merge_all_speeds(analysis_date)

In [None]:
# m1.shape

In [None]:

# shape_array_key values of 4 arbitrarily picked routes.
# Used further down to cut the merged speeds down to a small test subset.
sample_0_keys = [
    "0fb4f3627996269dc7075276d3b69e36",
    "07c9a47264a43d8d0d16ef7109e8fd68",
    "106d979b9a9e6338827a8e1c145e69fd",
    "000624bd8453dbe4f2eb2765b04bcb98",
]

### Categorize

In [None]:
def categorize_by_percentile_pandas(
    df: pd.DataFrame, column_percentile: str, column_str: str
) -> pd.DataFrame:
    """
    Label every row as low / avg / high based on the 5th and 95th
    percentile of one numeric column.

    Args:
        df: dataframe to categorize. It is modified in place: the new
            column is added to this same object.
        column_percentile: name of the numeric column to rank.
        column_str: prefix for the new column and its labels, e.g. "meters_".
            The new column is named f"{column_str}cat"; underscores are
            removed from the label text ("meters_" -> "meters is low").

    Returns:
        The same dataframe with the category column added.
    """
    values = df[column_percentile]

    # Find percentiles (NaN when the column has no values; then every row is "avg")
    p5 = float(values.quantile(0.05))
    p95 = float(values.quantile(0.95))

    # The labels never carried an underscore, so strip it once up front
    label = column_str.replace("_", "")

    # Same precedence as an if/elif chain: "low" wins over "high".
    # Comparisons against NaN are False, so missing values end up as "avg".
    is_low = (values >= 0) & (values <= p5)
    is_high = values >= p95

    # Vectorized on purpose: a row-wise apply is slow and breaks on empty frames
    category = pd.Series(f"{label} is avg", index=df.index)
    category = category.mask(is_high, f"{label} is high")
    category = category.mask(is_low, f"{label} is low")

    # Apply flags
    df[f"{column_str}cat"] = category

    print(f"Done with {column_str}")

    return df

In [None]:
# df1 = categorize_by_percentile_pandas(subset, "meters_elapsed", "meters_")

In [None]:
# df1.head()

In [None]:
# df2 = categorize_by_percentile_pandas(df1, "sec_elapsed", "sec_")

In [None]:
# df2.head()

In [None]:
def categorize_meters_speeds_pandas(df: pd.DataFrame)-> pd.DataFrame:
    """
    Find the rows whose speed calculation is a division by 0.

    Steps:
      1. Categorize meters_elapsed and sec_elapsed as low/avg/high
         (adds the meters_cat and sec_cat columns to `df` itself).
      2. Keep rows where meters are low OR seconds are high.
      3. Flag those rows and keep only the "division by 0" ones,
         i.e. meters_elapsed and sec_elapsed are both 0.

    Progress, category sizes, flag counts and the elapsed time are printed.

    Args:
        df: merged speeds with meters_elapsed and sec_elapsed columns.

    Returns:
        Rows flagged as "division by 0", with a fresh index.
    """
    start = datetime.datetime.now()
    print(start)

    # Categorize
    df1 = categorize_by_percentile_pandas(df, "meters_elapsed", "meters_")
    df2 = categorize_by_percentile_pandas(df1, "sec_elapsed", "sec_")

    # Find size of categories
    print(df2.groupby(['sec_cat','meters_cat']).size())

    # Filter out for only meters that are low or seconds that are high
    meters_low = df2.meters_cat == 'meters is low'
    sec_high = df2.sec_cat == 'sec is high'
    df2 = df2[meters_low | sec_high].reset_index(drop = True)
    print(f"{len(df2)} rows left after filtering for rows with either high seconds OR low meters")

    # Flag each row. Masks are applied from lowest to highest priority so the
    # last one wins: division by 0 > meters too low > seconds too high > ok.
    # Vectorized so that a frame with zero rows does not raise.
    flag = pd.Series("ok", index=df2.index)
    flag = flag.mask(df2.sec_cat == "sec is high", "seconds too high")
    flag = flag.mask(df2.meters_cat == "meters is low", "meters too low")
    flag = flag.mask(
        (df2.meters_elapsed == 0) & (df2.sec_elapsed == 0), "division by 0"
    )
    df2["flag"] = flag
    print(df2.flag.value_counts())

    # Filter out for only division by 0
    df3 = df2[(df2.flag == 'division by 0')].reset_index(drop = True)

    end = datetime.datetime.now()
    print(f"Took {end-start}")
    return df3

In [None]:
# Keep only the rows of the sample shapes.
# reset_index() without drop=True keeps the old index as an `index` column.
subset = m1[m1.shape_array_key.isin(sample_0_keys)].reset_index()

In [None]:
# Rows of the sample subset that are flagged as "division by 0"
m2 = categorize_meters_speeds_pandas(subset)

In [None]:
m2.flag.value_counts()

In [None]:
# NOTE(review): m2 comes from `subset` (sample shapes only) while m1 holds every shape,
# so the m1-vs-m2 comparisons in the cells below compare the sample against the full table.
len(m1)-len(m2)

In [None]:
len(m2)

In [None]:
# Unique trips: flagged rows vs. full table
m2.trip_id.nunique(), m1.trip_id.nunique()

In [None]:
# Unique shapes: flagged rows vs. full table
m2.shape_array_key.nunique(), m1.shape_array_key.nunique()

In [None]:
# Unique operators: flagged rows vs. full table
m2._gtfs_dataset_name.nunique(), m1._gtfs_dataset_name.nunique()

In [None]:
# Number of flagged shapes for each loop_or_inlining value
m2.groupby(["loop_or_inlining"]).agg({"shape_array_key": "nunique"})

#### See how many trips for a shape ID have problematic rows


In [None]:
# Number of trips that have at least one row that was divided by 0
# for this shape array key
df1 = m2.groupby(['shape_array_key']).agg({'trip_id':'nunique'}).rename(columns = {'trip_id':'trips_with_zero'}).reset_index()

In [None]:
# Original number of trips for each shape array key (from the full table m1)
df2 = m1.groupby(['shape_array_key']).agg({'trip_id':'nunique'}).rename(columns = {'trip_id':'all_trips'}).reset_index()

In [None]:
# Inner merge: only shapes that have at least one flagged trip are kept
df3 = pd.merge(df1, df2, how = "inner", on = 'shape_array_key')

In [None]:
# Share (in %) of a shape's trips that contain at least one division-by-0 row
df3['percent_of_trips_with_problematic_rows'] = df3.trips_with_zero/df3.all_trips * 100

In [None]:
df3['percent_of_trips_with_problematic_rows'].describe()

In [None]:
# df3.sample(5)

### Investigate 
#### Stage3: "vp_pared_stops"/A3_loop_inlining
* Rewrite this part so that `read_parquet` is filtered by `shape_array_key` and the other key columns (`stop_sequence`, `trip_id`, `gtfs_dataset_key`).

In [None]:
def load_vp_stage3(flagged_df:pd.DataFrame, date:str) -> pd.DataFrame:
    """
    Load the `vp_pared_stops_{date}` rows that belong to the flagged rows.

    The parquet read is pre-filtered with the unique values of each key
    column found in `flagged_df`. Those filters are independent of each
    other, so the inner merge afterwards keeps only the exact
    key combinations that appear in `flagged_df`.

    Args:
        flagged_df: rows of interest; needs the columns shape_array_key,
            stop_sequence, trip_id and gtfs_dataset_key.
        date: analysis date used in the parquet name.

    Returns:
        `flagged_df` inner-merged with the matching vehicle position rows.
    """
    filter_cols = ['shape_array_key', 'stop_sequence', 'trip_id', 'gtfs_dataset_key']

    # One "in" condition per key column, all combined with AND
    conditions = [
        (col, "in", flagged_df[col].unique().tolist())
        for col in filter_cols
    ]

    vp_rows = pd.read_parquet(
        f"{SEGMENT_GCS}vp_pared_stops_{date}",
        filters=[conditions],
    )

    # Merge to filter down to the exact key combinations
    return flagged_df.merge(
        vp_rows,
        how="inner",
        on=['gtfs_dataset_key', 'trip_id','stop_sequence','shape_array_key'],
    )

In [None]:
# Vehicle positions for the whole sample subset (not only the flagged rows in m2)
vp2 = load_vp_stage3(subset, analysis_date)

In [None]:
# vp = pd.read_parquet(f"{SEGMENT_GCS}vp_pared_stops_{analysis_date}")

In [None]:
# Check out stop sequences for the trip below that have division by 0
# subset[subset.trip_id == "1088383"].stop_sequence.unique()

In [None]:
# Stop sequences that were flagged as division by 0
# vp2[vp2.trip_id == "1088383"].sort_values(['trip_id', 'stop_sequence','location_timestamp_local'])

In [None]:
# All the stop sequences for this trip, even those that are ok
# vp_pared[vp_pared.trip_id == "1088383"].sort_values(['trip_id', 'stop_sequence','location_timestamp_local'])

In [None]:
# All the stop sequences for this trip, even those that are ok
# vp_pared[vp_pared.trip_id == "1088383"].sort_values(['location_timestamp_local','stop_sequence',])

In [None]:
def stage3_repeated_timestamps(stage3_df:pd.DataFrame)-> pd.DataFrame:
    """
    Find timestamps that are shared by more than one stop sequence.

    For every shape_array_key / trip_id / location_timestamp_local combo,
    count the number of distinct stop sequences. A timestamp that shows up
    for several stop sequences of the same trip is kept in the result.

    Returns:
        One row per repeated timestamp with the count in
        `number_of_repeated_timestamps` (always > 1).
    """
    group_keys = ['shape_array_key','trip_id', 'location_timestamp_local']

    stops_per_timestamp = (
        stage3_df
        .groupby(group_keys)['stop_sequence']
        .nunique()
        .reset_index(name='number_of_repeated_timestamps')
    )

    # Only keep timestamps that are repeated more than once
    is_repeated = stops_per_timestamp['number_of_repeated_timestamps'] > 1

    return stops_per_timestamp.loc[is_repeated].reset_index(drop=True)

In [None]:
def stage3_repeated_locations(stage3_df:pd.DataFrame) -> pd.DataFrame:
    """
    Find locations that are shared by more than one stop sequence.

    For every shape_array_key / trip_id / location combo, count the number
    of distinct stop sequences. A location that shows up for several stop
    sequences of the same trip means the vehicle is not changing locations.

    Side effect: a `pair` column ("x/y" as a string) is added to
    `stage3_df` itself, so the caller can merge the result back on it.

    Returns:
        One row per repeated location with the count in
        `number_of_repeated_locs` (always > 1), highest counts first.
    """
    # Concat x and y into a string.
    # Both coordinates must come from the dataframe passed in, not from a notebook global.
    stage3_df['pair'] = stage3_df.x.astype(str) + '/' + stage3_df.y.astype(str)

    # Count number of different stops that reference the same location
    agg = (stage3_df
     .groupby(['shape_array_key','trip_id','pair'])
     .agg({'stop_sequence':'nunique'})
     .reset_index()
     .sort_values('stop_sequence', ascending = False)
     .rename(columns = {'stop_sequence':'number_of_repeated_locs'})
    )

    # Only keep locations that are repeated more than once
    agg = agg[agg.number_of_repeated_locs > 1].reset_index(drop = True)

    return agg

In [None]:
def flag_stage3(flagged_df:pd.DataFrame, date:str) -> pd.DataFrame:
    """
    Flag the errors in stage3.

    Loads the vehicle positions that belong to `flagged_df` and marks each
    row as having repeated timestamps, repeated locations, both, or
    neither. Rows with neither are labelled "check in stage 2".
    Flag counts, the share of rows left for stage 2 and the elapsed time
    are printed.

    Args:
        flagged_df: rows flagged earlier (e.g. division by 0).
        date: analysis date used to locate the stage 3 parquet.

    Returns:
        The vehicle position rows with the two repeat counts and a
        `stage3_flag` column; helper columns (vp_idx, x, y, hour,
        activity_date) are dropped.
    """
    start = datetime.datetime.now()
    print(start)

    # Relevant rows from Vehicle Positions
    vp = load_vp_stage3(flagged_df, date)

    # Find repeated timestamps.
    multi_timestamps = stage3_repeated_timestamps(vp)

    # Find repeated locations (this also adds the `pair` column to vp,
    # which the second merge below relies on)
    multi_locs = stage3_repeated_locations(vp)

    # Merge
    timestamps_merge_cols = ['shape_array_key','trip_id','location_timestamp_local']
    loc_merge_cols =  ['shape_array_key','trip_id','pair']

    # Want everything found in vehicle positions, so do left merges
    flagged_vp = (vp
          .merge(multi_timestamps, how="left", on= timestamps_merge_cols)
          .merge(multi_locs, how="left", on=loc_merge_cols)
         )

    drop_cols = ['vp_idx','x','y','hour','activity_date',]
    flagged_vp = flagged_vp.drop(columns = drop_cols)

    # Flag. Rows without a match have NaN counts, and NaN > 1 is False.
    repeated_time = flagged_vp["number_of_repeated_timestamps"] > 1
    repeated_loc = flagged_vp["number_of_repeated_locs"] > 1

    # Masks go from lowest to highest priority so the last one wins.
    # Vectorized so that a frame with zero rows does not raise.
    stage3_flag = pd.Series("check in stage 2", index=flagged_vp.index)
    stage3_flag = stage3_flag.mask(repeated_loc, "repeated locations")
    stage3_flag = stage3_flag.mask(repeated_time, "repeated timestamps")
    stage3_flag = stage3_flag.mask(
        repeated_time & repeated_loc, "repeated timestamps & locations"
    )
    flagged_vp["stage3_flag"] = stage3_flag

    print(flagged_vp.stage3_flag.value_counts())

    check_in_stage2 = flagged_vp[flagged_vp.stage3_flag == "check in stage 2"]
    # Guard against an empty result so the summary does not divide by 0
    if len(flagged_vp) > 0:
        pct_stage2 = len(check_in_stage2)/len(flagged_vp) * 100
    else:
        pct_stage2 = 0.0
    print(f"Have to check {pct_stage2} % of rows in stage 2")

    end = datetime.datetime.now()
    print(f"Took {end-start}")
    return flagged_vp

In [None]:
# Stage 3 flags for the division-by-0 rows
m3 = flag_stage3(m2, analysis_date)

In [None]:
m3.shape

In [None]:
# Keep only the rows stage 3 could not explain.
# m3 is overwritten here, so the unfiltered result is only available by re-running the cell above.
m3 = m3[m3.stage3_flag == "check in stage 2"]

In [None]:
m3.shape

In [None]:
# NOTE(review): sort_cols is not used by any other cell visible in this notebook
sort_cols = ['trip_id', 'shape_array_key', 'stop_sequence']

#### Stage2: "vp_stop_segment"/A1_sjoin_vp_segments


In [None]:
# Select one route to look at (a shape_array_key; it is one of sample_0_keys)
test_route = "106d979b9a9e6338827a8e1c145e69fd"

In [None]:
# Stop sequence to inspect on that route
test_sequence = 39

In [None]:
# gtfs_dataset_key used together with test_route in the stage 2 filters
test_gtfs_key = "db56b50ab86b5f7a4ae2fc2dd9889bbe"

In [None]:
# trip_id to inspect
test_trip = '1088405'

#### Look at export file

In [None]:
def import_stage_2(date:str, route:str, stop_sequence:str):
    """
    Read the stage 2 export (`vp_sjoin/vp_stop_segment_{date}`) for a
    single shape_array_key and stop sequence.

    Args:
        date: analysis date used in the parquet name.
        route: shape_array_key to keep.
        stop_sequence: stop sequence to keep. The calls in this notebook
            pass an integer here even though the hint says str.

    Returns:
        Dataframe with the rows matching both filters.
    """
    # Both conditions must hold (AND)
    route_and_stop = [
        ('shape_array_key', "==", route),
        ('stop_sequence', "==", stop_sequence),
    ]

    return pd.read_parquet(
        f"{SEGMENT_GCS}vp_sjoin/vp_stop_segment_{date}",
        filters=[route_and_stop],
    )

In [None]:
# stg2 = import_stage_2(analysis_date, test_route, test_sequence)

#### Look at vp trips -> import unique trips

In [None]:
def import_unique_trips(gtfs_key:str, trip: str, route:str):
    """
    Get the vp trips (grouped by shape_array_key) for one operator,
    shape and trip.

    Uses the notebook-level `analysis_date` to pick the `vp_usable` file.

    Args:
        gtfs_key: gtfs_dataset_key to keep.
        trip: trip_id to keep.
        route: shape_array_key to keep.

    Returns:
        The matching rows with a fresh index.
    """
    all_vp_trips = A1_sjoin_vp_segments.add_grouping_col_to_vp(
        f"vp_usable_{analysis_date}",
        analysis_date,
        ["shape_array_key"],
    )

    # All three identifiers have to match
    is_wanted = (
        (all_vp_trips.gtfs_dataset_key == gtfs_key)
        & (all_vp_trips.shape_array_key == route)
        & (all_vp_trips.trip_id == trip)
    )

    return all_vp_trips[is_wanted].reset_index(drop=True)


In [None]:
# vp trip rows for the test operator / trip / route
unique_trips = import_unique_trips(test_gtfs_key, test_trip, test_route)

#### Look at vehicle positions

In [None]:
def import_vehicle_positions(unique_trips:pd.DataFrame, gtfs_key:str, trip_id:str)-> gpd.GeoDataFrame:
    """
    Load every usable vehicle position of one trip as point geometries.

    Reads `vp_usable_{analysis_date}/` (notebook-level `analysis_date`)
    filtered to the operator and trip, inner-merges it with `unique_trips`
    on gtfs_dataset_key / trip_id and turns x/y into points.

    Args:
        unique_trips: output of import_unique_trips for the same trip.
        gtfs_key: gtfs_dataset_key to keep.
        trip_id: trip_id to keep.

    Returns:
        GeoDataFrame in PROJECT_CRS without the x and y columns.
    """
    trip_filter = [
        ("gtfs_dataset_key", "==", gtfs_key),
        ('trip_id', '==', trip_id),
    ]
    keep_cols = ["gtfs_dataset_key", "trip_id", "vp_idx", "x", "y"]

    vp_raw = helpers.import_vehicle_positions(
        SEGMENT_GCS,
        f"vp_usable_{analysis_date}/",
        filters = [trip_filter],
        columns = keep_cols,
        partitioned = True,
    )

    # Materialize the result, then attach the trip-level columns
    positions = vp_raw.compute().merge(
        unique_trips,
        on = ["gtfs_dataset_key", "trip_id"],
        how = "inner",
    )

    # x/y are read as EPSG:4326 coordinates and projected afterwards
    points = gpd.points_from_xy(positions.x, positions.y, crs = "EPSG:4326")
    positions_gdf = gpd.GeoDataFrame(positions, geometry = points)

    return positions_gdf.to_crs(PROJECT_CRS).drop(columns = ["x", "y"])

In [None]:
# All usable vehicle positions of the test trip, as points
vehicle_positions = import_vehicle_positions(unique_trips, test_gtfs_key, test_trip)

In [None]:
# Number of vehicle position points for the test trip
len(vehicle_positions)

#### Look at segments

In [None]:
def import_segments(flagged_df: pd.DataFrame, route:str, gtfs_key:str) -> gpd.GeoDataFrame:
    """
    Load the stop segments of one shape and mark each one as
    "correct" or "incorrect".

    A segment is "incorrect" when its stop sequence appears in
    `flagged_df` for this shape and operator (one flagged row is enough).
    Every other segment of the shape is "correct".

    Uses the notebook-level `analysis_date` to pick the segments file.

    Args:
        flagged_df: flagged rows with shape_array_key, gtfs_dataset_key
            and stop_sequence columns.
        route: shape_array_key to load.
        gtfs_key: gtfs_dataset_key to load.

    Returns:
        GeoDataFrame of the segments (correct ones first) with a `flag`
        column. The active geometry is `geometry_buffered`, the segment
        buffered by 35 (in PROJECT_CRS units).
    """
    gdf = gpd.read_parquet(f"{SEGMENT_GCS}stop_segments_{analysis_date}.parquet",
                           filters = [[("shape_array_key", "==", route),
                                      ("gtfs_dataset_key", "==", gtfs_key),
                                     ]]).to_crs(PROJECT_CRS)

    gdf["geometry_buffered"] = gdf.geometry.buffer(35)
    gdf = gdf.set_geometry('geometry_buffered')

    # Distinguish between "correct" and "incorrect" seq
    # A sequence can be incorrect even if just one row is "divided by 0"
    flagged_for_route = flagged_df[
        (flagged_df.shape_array_key == route)
        & (flagged_df.gtfs_dataset_key == gtfs_key)
    ]
    incorrect_segments_list = flagged_for_route.stop_sequence.unique().tolist()
    is_incorrect = gdf.stop_sequence.isin(incorrect_segments_list)

    incorrect_segments_filtered = gdf[is_incorrect].reset_index(drop = True)
    incorrect_segments_filtered['flag'] = 'incorrect'

    # Correct segments are the rest of this shape's segments.
    # They are taken from gdf itself: looking them up in flagged_df would
    # pull in stop sequences flagged for other shapes and drop every
    # segment of this shape that was never flagged anywhere.
    correct_segments_filtered = gdf[~is_incorrect].reset_index(drop = True)
    correct_segments_filtered['flag'] = 'correct'

    final = pd.concat([correct_segments_filtered, incorrect_segments_filtered])

    return final

In [None]:
# flagged_segments = import_segments(m3, test_route, test_gtfs_key)

In [None]:
#segments = A1_sjoin_vp_segments.import_segments_and_buffer(
 #   f"stop_segments_{analysis_date}",
#    35,
   # ["shape_array_key", "stop_sequence"]+ ["seg_idx", "geometry"]
#)

In [None]:
# segments = segments.compute()

#### Stops kept: last and first

In [None]:
def find_first_last_points(route:str, trip:str, gtfs_key:str):
    """
    Load the points kept in `vp_pared_stops_{analysis_date}` for one
    shape, trip and operator (notebook-level `analysis_date`).

    Args:
        route: shape_array_key to keep.
        trip: trip_id to keep.
        gtfs_key: gtfs_dataset_key to keep.

    Returns:
        GeoDataFrame in PROJECT_CRS with only the geometry and
        stop_sequence columns.
    """
    trip_filter = [
        ('shape_array_key', "==", route),
        ('trip_id', "==", trip),
        ('gtfs_dataset_key', '==', gtfs_key),
    ]

    pared = pd.read_parquet(
        f"{SEGMENT_GCS}vp_pared_stops_{analysis_date}",
        filters = [trip_filter],
    )

    # x/y are read as EPSG:4326 coordinates and projected afterwards
    points = gpd.points_from_xy(pared.x, pared.y, crs = "EPSG:4326")
    pared_gdf = (
        gpd.GeoDataFrame(pared, geometry = points)
        .to_crs(PROJECT_CRS)
        .drop(columns = ["x", "y"])
    )

    return pared_gdf[['geometry','stop_sequence']]

In [None]:
# Points kept in vp_pared_stops for the test trip
first_last = find_first_last_points(test_route, test_trip, test_gtfs_key)

In [None]:
# Number of kept points for the test trip
len(first_last)

#### Mapping

In [None]:
def display_maps(all_points: gpd.GeoDataFrame, first_last_points: gpd.GeoDataFrame, segments: gpd.GeoDataFrame):
    """
    Show two interactive maps for one trip.

    Map 1: the segments colored by their `flag` column, with all
    vehicle positions drawn on top in red.
    Map 2: the kept points colored by stop_sequence.

    Args:
        all_points: every vehicle position of the trip.
        first_last_points: the points kept for the trip.
        segments: segments with a `flag` column.
    """
    # Map 1: segments as the base layer, points added onto the same map
    segment_layer = segments.explore(
        'flag',
        cmap = 'tab10',
        height = 400,
        width = 600,
        name = 'segments',
    )
    points_on_segments = all_points.explore(
        m = segment_layer,
        color = 'red',
        style_kwds = {'weight':5},
        legend_kwds = {'caption': 'all_points'},
        name = 'points',
    )
    display(points_on_segments)

    # Map 2: kept points on their own map
    kept_points_map = first_last_points.explore(
        'stop_sequence',
        cmap = 'tab10',
        style_kwds = {'weight':5},
        height = 400,
        width = 600,
    )
    display(kept_points_map)

In [None]:
# display_maps(vehicle_positions,first_last,flagged_segments)

#### Function

In [None]:
def stage2_trouble_shooting(flagged_df:pd.DataFrame,
                            date:str, route:str,
                            stop_sequence:str,
                            trip:str, gtfs_key:str):
    """
    Run all the stage 2 checks for one trip and display the maps.

    Args:
        flagged_df: rows still unexplained after stage 3.
        date: analysis date for the stage 2 export.
        route: shape_array_key to inspect.
        stop_sequence: stop sequence used to filter the stage 2 export.
        trip: trip_id to inspect.
        gtfs_key: gtfs_dataset_key to inspect.

    Returns:
        Tuple of (all vehicle positions, kept points, flagged segments).
    """
    # NOTE(review): the stage 2 export is read but not used afterwards
    stage2_rows = import_stage_2(date, route, stop_sequence)

    trips_for_route = import_unique_trips(gtfs_key, trip, route)

    all_pts = import_vehicle_positions(trips_for_route, gtfs_key, trip)
    segments_with_flag = import_segments(flagged_df, route, gtfs_key)
    kept_pts = find_first_last_points(route, trip, gtfs_key)

    display_maps(all_pts, kept_pts, segments_with_flag)

    return all_pts, kept_pts, segments_with_flag
    

In [None]:
# Test 1: troubleshoot the first test route/trip (identifiers defined above)
test1_allpts, test1_firstlast_pts, test1_flagged = stage2_trouble_shooting(flagged_df= m3,
                        date = analysis_date,
                        route = test_route,
                        stop_sequence = test_sequence,
                        trip = test_trip,
                        gtfs_key = test_gtfs_key)

In [None]:
# Compare how many points exist vs. how many were kept
test1_allpts.shape, test1_firstlast_pts.shape

In [None]:
# Test 2: identifiers of a second route/trip to troubleshoot
test_route2 = "0fb4f3627996269dc7075276d3b69e36"
test_stop = 13
test_gtfs_key2 = "a4f6fd5552107e05fe9743ac7cce2c55"
test_trip2 = "16939095"

In [None]:
test2_allpts, test2_firstlast_pts, test2_flagged = stage2_trouble_shooting(flagged_df= m3,
                        date = analysis_date,
                        route = test_route2,
                        stop_sequence = test_stop,
                        trip = test_trip2,
                        gtfs_key = test_gtfs_key2)

In [None]:
# Compare how many points exist vs. how many were kept
test2_allpts.shape, test2_firstlast_pts.shape

In [None]:
# test2_base = test2_flagged.explore('flag', cmap= 'tab10', height = 400, width = 600, name = 'segments')

In [None]:
# test2_all_pts_map = test2_allpts.explore(m = test2_base, color = 'red',style_kwds = {'weight':5}, legend_kwds = {'caption': 'all_points'}, name= 'points')

In [None]:
# test2_all_pts_map

In [None]:
# test2_firstlast_pts.explore( color = 'red',style_kwds = {'weight':5}, height = 400, width = 600, )

##### Sjoin

In [None]:
def sjoin_vp_segments(segments: gpd.GeoDataFrame, vp_gdf: gpd.GeoDataFrame):
    """
    Spatially join vehicle positions to segments.

    Keeps only the vehicle positions that fall within a segment
    geometry (inner join, "within" predicate).

    Args:
        segments: segment geometries (right side of the join).
        vp_gdf: vehicle position points (left side of the join).

    Returns:
        The joined GeoDataFrame, one row per point/segment match.
    """
    return gpd.sjoin(vp_gdf, segments, how = "inner", predicate = "within")



### Stage1: "vp_usable"

In [None]:
# What's the diff between stop segments normal/special/and without any notation?
# Reads the whole vp_usable dataset for the analysis date (no filters applied)
usable = pd.read_parquet(f"{SEGMENT_GCS}vp_usable_{analysis_date}")

In [None]:
# One random row, just to see the columns
usable.sample()

In [None]:
# NOTE(review): `subset_for_merge` is not defined in any cell visible in this notebook,
# so this cell raises NameError on a fresh kernel — confirm which dataframe was meant.
subset_for_merge2 = subset_for_merge.drop(columns = ['stop_sequence','stop_id','meters_elapsed','sec_elapsed'])

In [None]:
# Trip-level merge keys
m_cols2 = ['gtfs_dataset_key',
 'trip_id']

In [None]:
subset_for_merge2.head()

In [None]:
# m2[m2.trip_id == '1350']