In [1]:
import datetime
import _speed_utils as speed_utils
import _threshold_utils as threshold_utils
import altair as alt
import dask.dataframe as dd
import geopandas as gpd
import pandas as pd
from segment_speed_utils import gtfs_schedule_wrangling, helpers, segment_calcs
from segment_speed_utils.project_vars import (
    COMPILED_CACHED_VIEWS,
    PROJECT_CRS,
    SEGMENT_GCS,
    analysis_date,
    CONFIG_PATH
)
from shared_utils import calitp_color_palette as cp


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
# alt.data_transformers.disable_max_rows()

### Merging

In [4]:
def merge_all_speeds(analysis_date:str) -> pd.DataFrame:
    """
    Merge avg_speeds_stop_segments and
    speed_stops parquets.
    
    Args:
        date: analysis date
    """
    # Open up avg speeds
    avg_speeds = pd.read_parquet(f"{speed_utils.GCS_PATH}avg_speeds_stop_segments_{analysis_date}.parquet")
    avg_speeds = avg_speeds.drop(columns=["geometry", "geometry_arrowized", "district", "district_name"])
    # Filter  for all day flags
    avg_speeds = avg_speeds[avg_speeds.time_of_day == 'all_day'].reset_index(drop = True)
    
    # Open up speeds
    speeds = pd.read_parquet(f"{speed_utils.GCS_PATH}speeds_stop_segments_{analysis_date}")
    
    merge_cols = ['gtfs_dataset_key','shape_array_key', 'stop_sequence']
    m1 = pd.merge(avg_speeds, speeds, on = merge_cols, how = 'inner')
    
    m1 = m1.drop_duplicates().reset_index(drop = True)
    
    return m1

In [5]:
m1 = merge_all_speeds(analysis_date)

In [6]:
# m1.shape

In [7]:

# Picked 4 random routes
sample_0_keys = [
    "0fb4f3627996269dc7075276d3b69e36",
    "07c9a47264a43d8d0d16ef7109e8fd68",
    "106d979b9a9e6338827a8e1c145e69fd",
    "000624bd8453dbe4f2eb2765b04bcb98",
]

### Categorize

In [8]:
def categorize_by_percentile_pandas(
    df: pd.DataFrame, column_percentile: str, column_str: str
) -> pd.DataFrame:

    # Find percentiles
    p5 = df[column_percentile].quantile(0.05).astype(float)
    p95 = df[column_percentile].quantile(0.95).astype(float)
    
    def rate(row):
        if ((row[column_percentile] >= 0) and (row[column_percentile] <= p5)):
            return f"{column_str} is low"
        elif (row[column_percentile] >= p95):
               return f"{column_str} is high"
        else:
            return f"{column_str} is avg"
    
    # Apply flags
    df[f"{column_str}cat"] = df.apply(lambda x: rate(x), axis=1)
    
    # Clean
    df[f"{column_str}cat"] = df[f"{column_str}cat"].str.replace("_", "")

    print(f"Done with {column_str}")
    
    return df  

In [9]:
# df1 = categorize_by_percentile_pandas(subset, "meters_elapsed", "meters_")

In [10]:
# df1.head()

In [11]:
# df2 = categorize_by_percentile_pandas(df1, "sec_elapsed", "sec_")

In [12]:
# df2.head()

In [13]:
def categorize_meters_speeds_pandas(df)-> pd.DataFrame:
    start = datetime.datetime.now()
    print(start)
    
    #df = merge_all_speeds(analysis_date)
    
    # Categorize
    df1 = categorize_by_percentile_pandas(df, "meters_elapsed", "meters_")
    df2 = categorize_by_percentile_pandas(df1, "sec_elapsed", "sec_")
  
    # Find size of categories
    print(df2.groupby(['sec_cat','meters_cat']).size())

    # Filter out for only meters that are low or seconds that are high
    df2 = df2[(df2.meters_cat == 'meters is low') | (df2.sec_cat == 'sec is high')].reset_index(drop = True)
    print(f"{len(df2)} rows left after filtering for rows with either high seconds OR low meters") 
    
    def flag_round(row):
        if (row["meters_elapsed"] == 0) & (row["sec_elapsed"] == 0):
            return "division by 0"
        elif row["meters_cat"] == "meters is low":
            return "meters too low"
        elif row["sec_cat"] == "sec is high":
            return "seconds too high"
        else:
            return "ok"
        
    df2["flag"] = df2.apply(lambda x: flag_round(x), axis=1)
    print(df2.flag.value_counts())
    
    # Filter out for only division by 0 
    df3 = df2[(df2.flag == 'division by 0')].reset_index(drop = True)
    
    end = datetime.datetime.now()
    print(f"Took {end-start}")
    return df3

In [14]:
subset = m1[m1.shape_array_key.isin(sample_0_keys)].reset_index()

In [15]:
m2 = categorize_meters_speeds_pandas(m1)

2023-06-26 09:14:38.408932
Done with meters_
Done with sec_
sec_cat      meters_cat    
sec is avg   meters is avg     2415102
             meters is high      70745
             meters is low      139528
sec is high  meters is avg       57245
             meters is high      83074
             meters is low       13695
sec is low   meters is low      296973
dtype: int64
590515 rows left after filtering for rows with either high seconds OR low meters
division by 0       296973
meters too low      153223
seconds too high    140319
Name: flag, dtype: int64
Took 0:02:17.630093


In [16]:
m2.flag.value_counts()

division by 0    296973
Name: flag, dtype: int64

In [17]:
len(m1)-len(m2)

2779389

In [18]:
len(m2)

296973

In [19]:
m2.trip_id.nunique(), m1.trip_id.nunique()

(45357, 72067)

In [20]:
m2.shape_array_key.nunique(), m1.shape_array_key.nunique()

(2682, 4837)

In [21]:
m2._gtfs_dataset_name.nunique(), m1._gtfs_dataset_name.nunique()

(63, 76)

In [22]:
m2.groupby(["loop_or_inlining"]).agg({"shape_array_key": "nunique"})

Unnamed: 0_level_0,shape_array_key
loop_or_inlining,Unnamed: 1_level_1
0,2682


#### See how many trips for a shape ID have problematic rows


In [23]:
# Number of trips that have at least one row that was divided by 0 
# for this shape array key
df1 = m2.groupby(['shape_array_key']).agg({'trip_id':'nunique'}).rename(columns = {'trip_id':'trips_with_zero'}).reset_index()

In [40]:
# Original number of trips
df2 = m1.groupby(['shape_array_key']).agg({'trip_id':'nunique'}).rename(columns = {'trip_id':'all_trips'}).reset_index()

In [41]:
df3 = pd.merge(df1, df2, how = "inner", on = 'shape_array_key')

In [42]:
df3['percent_of_trips_with_problematic_rows'] = df3.trips_with_zero/df3.all_trips * 100

In [44]:
df3['percent_of_trips_with_problematic_rows'].describe()

count   2682.00
mean      82.86
std       26.65
min        1.52
25%       75.00
50%      100.00
75%      100.00
max      100.00
Name: percent_of_trips_with_problematic_rows, dtype: float64

In [43]:
df3.sample(5)

Unnamed: 0,shape_array_key,trips_with_zero,all_trips,percent_of_trips_with_problematic_rows
1397,82f0e3379d90a630b9e42e5ec79e0279,6,7,85.71
333,1e469c778efe30b55db3dd93ee1d9946,19,20,95.0
2060,c750d9ce7a9e659d5d443f0925e175e7,6,7,85.71
908,59626d7e12b3fec5d917b3e052e87d70,17,17,100.0
47,0485a3b83c38283730ce3e9372baf031,2,3,66.67


### Investigate 
#### Stage3: "vp_pared_stops"
* Keeps only first and last point of a segment.

In [28]:
def load_vp_stage3(flagged_df:pd.DataFrame, date:str) -> pd.DataFrame:
    
    # Subset the dataframe and use it to filter out for only the values of interest
    flagged_df = flagged_df[['gtfs_dataset_key', 'trip_id','stop_sequence','shape_array_key']]
    vp = pd.read_parquet(f"{speed_utils.GCS_PATH}vp_pared_stops_{date}")
    
    # Merge to filter
    vp2 = pd.merge(flagged_df, vp, how = "inner", on = ['gtfs_dataset_key', 'trip_id','stop_sequence','shape_array_key'])
    
    return vp2

In [29]:
vp2 = load_vp_stage3(subset, analysis_date)

In [30]:
# Check out stop sequences for the trip below that have division by 0
# subset[subset.trip_id == "1088383"].stop_sequence.unique()

In [31]:
# Stop sequences that were flagged as division by 0
# vp2[vp2.trip_id == "1088383"].sort_values(['trip_id', 'stop_sequence','location_timestamp_local'])

In [32]:
# All the stop sequences for this trip, even those that are ok
# vp_pared[vp_pared.trip_id == "1088383"].sort_values(['trip_id', 'stop_sequence','location_timestamp_local'])

In [33]:
# All the stop sequences for this trip, even those that are ok
# vp_pared[vp_pared.trip_id == "1088383"].sort_values(['location_timestamp_local','stop_sequence',])

In [34]:
def stage3_repeated_timestamps(stage3_df:pd.DataFrame)-> pd.DataFrame:
    """
    Look at how many times a time stamp is repeated a route-trip-location.
    Each of these 3 combos should have a different time for each 
    stop sequence or else the vehicle is not changing locations.
    """
    agg = (stage3_df
     .groupby(['shape_array_key','trip_id', 'location_timestamp_local'])
     .agg({'stop_sequence':'nunique'})
     .reset_index()
     .rename(columns = {'stop_sequence':'number_of_repeated_timestamps'})
    )
    
    # Only keep timestamps that are repeated more than once
    agg = (agg[agg.number_of_repeated_timestamps > 1]).reset_index(drop = True)

    return agg

In [35]:
def stage3_repeated_locations(stage3_df:pd.DataFrame):
    """
    Look at how many times a time stamp is repeated for a stop-trip-route combo.
    Each of these 3 combos should have a different location for each 
    stop sequence or else the vehicle is not changing locations.
    """
    # Concat x and y into a string
    stage3_df['pair'] = stage3_df.x.astype(str) + '/' + vp2.y.astype(str)
    
    # Count number of different stops that reference the same location
    agg = (stage3_df
     .groupby(['shape_array_key','trip_id','pair'])
     .agg({'stop_sequence':'nunique'})
     .reset_index()
     .sort_values('stop_sequence', ascending = False)
     .rename(columns = {'stop_sequence':'number_of_repeated_locs'})               
    )

    # Only keep locations that are repeated more than once
    agg = agg[agg.number_of_repeated_locs != 1].reset_index(drop = True)
    
    return agg

In [36]:
def flag_stage3(flagged_df:pd.DataFrame, date:str) -> pd.DataFrame:
    """
    Flag the errors in stage3
    """
    start = datetime.datetime.now()
    print(start)
    
    # Relevant rows from Vehicle Positions
    vp = load_vp_stage3(flagged_df, date)
    
    # Find repeated timestamps.
    multi_timestamps = stage3_repeated_timestamps(vp)
    
    # Find repeated locations
    multi_locs = stage3_repeated_locations(vp)
    
    # Merge
    timestamps_merge_cols = ['shape_array_key','trip_id','location_timestamp_local']
    loc_merge_cols =  ['shape_array_key','trip_id','pair']
    
    # Want everything found in vehicle positions, so do left merges
    m1 = (vp
          .merge(multi_timestamps, how="left", on= timestamps_merge_cols)
          .merge(multi_locs, how="left", on=loc_merge_cols)
         )
    
    drop_cols = ['vp_idx','x','y','hour','activity_date',]
    m1 = m1.drop(columns = drop_cols)
    
    # Flag
    def flag(row):
        if (row["number_of_repeated_timestamps"] > 1) & (row["number_of_repeated_locs"] > 1):
            return "repeated timestamps & locations"
        elif (row["number_of_repeated_timestamps"] > 1):
            return "repeated timestamps"
        elif (row["number_of_repeated_locs"] > 1):
            return "repeated locations"
        else:
            return "check in stage 2"
        
    m1["stage3_flag"] = m1.apply(lambda x: flag(x), axis=1)
    
    print(m1.stage3_flag.value_counts())
    
    check_in_stage2 = m1[m1.stage3_flag == "check in stage 2"]
    print(f"Have to check {len(check_in_stage2)/len(m1) * 100} % of rows in stage 2")
    
    end = datetime.datetime.now()
    print(f"Took {end-start}")
    return m1

In [37]:
m3 = flag_stage3(m2, analysis_date)

2023-06-26 09:17:07.320679
check in stage 2                   538914
repeated timestamps                 54883
repeated timestamps & locations       107
repeated locations                     42
Name: stage3_flag, dtype: int64
Have to check 90.73451121819154 % of rows in stage 2
Took 0:00:27.798047


In [38]:
m3.shape

(593946, 11)

In [39]:
sort_cols = ['trip_id', 'shape_array_key', 'stop_sequence']

### Stage2: "vp_stop_segment"
* Were the right points kept?

In [None]:
subset_cols = ['gtfs_dataset_key', 'trip_id','stop_sequence','shape_array_key', 'loop_or_inlining', 'stop_id', 'meters_elapsed','sec_elapsed']

In [None]:
m_cols  = ['gtfs_dataset_key', 'stop_sequence','shape_array_key', 'loop_or_inlining','stop_id']

In [None]:
# Find rows that need to be tagged in stage 2
stage2_rows = m3[m3.stage3_flag == "check in stage 2"].reset_index()

In [None]:
stage2_routes = stage2_rows.shape_array_key.unique().tolist() 

In [None]:
# Use flagged df
stage2_rows = m2[m2.shape_array_key.isin(stage2_routes)].reset_index(drop = True)

In [None]:
# Subset df to filter the vp 
subset_for_merge = stage2_rows[subset_cols].drop_duplicates().reset_index(drop = True)

In [None]:
# What's the diff between stop segments normal/special/and without any notation?
stg2 = gpd.read_parquet(f"{speed_utils.GCS_PATH}stop_segments_{analysis_date}.parquet")

In [None]:
# Merge
stg2_m = pd.merge(stg2,
                  subset_for_merge, 
                  how = "inner",
                  on = m_cols
                 )

In [None]:
stage2_rows.shape

In [None]:
for i in ['geometry','geometry_arrowized']:
    print(f"{i}: {stg2_m[i].is_valid.sum()/len(stg2_m)}")
    print(f"{i}: {len(stg2_m[stg2_m[i].is_empty])}")

In [None]:
# Delete out empty geo 
filtered = stg2[~stg2.geometry_arrowized.is_empty]

In [None]:
subset_for_merge.sample()

In [None]:
# Delete out empty geometry arrowized
geo_arrowized = stg2_m[~stg2_m.geometry_arrowized.is_empty]

#### Look at the original routes

In [None]:
# This is the original dataframe loaded in from merging stage
# It's not even flagged. 
original = pd.merge(filtered, 
                    subset[subset_cols],
                    how = "inner", 
                    on = m_cols)

In [None]:
len(geo_arrowized)

In [None]:
# Find number of messed up sequences...are the same sequences being hit?
"""
(subset_for_merge
 .groupby(['shape_array_key','stop_sequence'])
 .agg({'trip_id':'nunique'})
 .rename(columns = {'trip_id':'number_of_trips_with_problematic_stop_seq'})
 .sort_values(['shape_array_key','number_of_trips_with_problematic_stop_seq']
              , ascending = False)
)"""

In [None]:
# subset_for_merge.groupby(['shape_array_key','trip_id']).agg({'stop_sequence':'nunique'})

#### Look at all the stop sequences vs the ones flagged as 0 for each trip

##### 106d979b9a9e6338827a8e1c145e69fd
* 1088383
* 1088403

In [None]:
# SEgments that show up have something wrong with them
geo_arrowized[geo_arrowized.trip_id == '1088383'].set_geometry("geometry_arrowized").explore('stop_sequence',  style_kwds = {'weight':10})

In [None]:
# SEgments that show up have something wrong with them
geo_arrowized[geo_arrowized.trip_id == '1088403'].set_geometry("geometry_arrowized").explore('stop_sequence',  style_kwds = {'weight':10})

In [None]:
original[original.trip_id == '1088383'].set_geometry("geometry_arrowized").explore('stop_sequence', style_kwds = {'weight':10})

#### 0fb4f3627996269dc7075276d3b69e36 
* 16939089

In [None]:
geo_arrowized[geo_arrowized.trip_id == '16939089'].set_geometry("geometry_arrowized").explore('stop_sequence', style_kwds = {'weight':10})

In [None]:
original[original.trip_id == '16939089'].set_geometry("geometry_arrowized").explore('stop_sequence', style_kwds = {'weight':10})

#### 000624bd8453dbe4f2eb2765b04bcb98 
* 1350

In [None]:
geo_arrowized[geo_arrowized.trip_id == '1359'].set_geometry("geometry_arrowized").explore('stop_sequence', style_kwds = {'weight':10})

In [None]:
geo_arrowized[geo_arrowized.trip_id == '1350'].set_geometry("geometry_arrowized").explore('stop_sequence', style_kwds = {'weight':10})

In [None]:
m2[m2.trip_id == '1350']

In [None]:
original[original.trip_id == '1350'].set_geometry("geometry_arrowized").explore('stop_sequence', style_kwds = {'weight':10})

In [None]:
summarize(few_routes_cat, high_low_zero)

#### Draft
* Show which stops are excluded from flags
* Show how many stops are dropped
* Show % of stops that were flagged compared to total stops.

In [None]:
def read_back_gcs():
    # Read back all the partitioned stuff - grab the file number
    # part0.parquet, part1.parquet
    start = datetime.datetime.now()
    print(f"Begin: {start}")
    gcs_file_path1 = f"{speed_utils.GCS_PATH}partitioned_flags"
    file_names_dask = extract_number(gcs_file_path1, "part")

    # https://www.geeksforgeeks.org/read-multiple-csv-files-into-separate-dataframes-in-python/
    # create empty list
    all_df = []

    # append datasets into the list
    for i in range(len(file_names_dask)):
        gcs_file_path2 = f"{gcs_file_path1}/part."
        temp_df = dd.read_parquet(f"{gcs_file_path2}{file_names_dask[i]}.parquet")
        all_df.append(temp_df)

    final_df = dd.concat(all_df, axis=0).reset_index(drop=True)
    print("Begin computing")
    final_df = final_df.compute()
    print("Done computing")
    end = datetime.datetime.now()
    print(f"Finish: {end-start}")
    return final_df

In [None]:
def categorize_by_percentile_pandas(
    df: pd.DataFrame, column_percentile: str, column_str: str
) -> pd.DataFrame:

    # Find percentiles
    agg1 = (
        df.groupby(["shape_array_key", "stop_sequence"])[column_percentile]
        .describe(percentiles=[0.05, 0.95])
        .reset_index()
        .add_prefix(column_str)
    )
    
    # Merge 
    m1 = dd.merge(
        df,
        agg1,
        how="inner",
        left_on=["shape_array_key", "stop_sequence"],
        right_on=[
            f"{column_str}shape_array_key",
            f"{column_str}stop_sequence",
        ],
    )
    
    def percentile(row):

        if row[column_percentile] == row[f"{column_str}mean"]:
            return f"{column_str} elapsed avg"
        elif row[f"{column_str}5%"] < row[column_percentile] <= row[f"{column_str}95%"]:
            return f"{column_str} elapsed avg"
        elif row[column_percentile] <= row[f"{column_str}5%"]:
            return f"{column_str} elapsed low"
        elif row[column_percentile] > row[f"{column_str}95%"]:
            return f"{column_str} elapsed high"

        else:
            return f"{column_str} elapsed avg"
    
    
    # Apply flags
    m1[f"{column_str}cat"] = m1.apply(lambda x: percentile(x), axis=1)
    
    # Delete out any average columns
    m1 = m1.loc[m1[f"{column_str}cat"] != f"{column_str} elapsed avg"].reset_index(drop = True)
    
    # Clean
    m1[f"{column_str}cat"] = m1[f"{column_str}cat"].str.replace("_", "")
    
    columns_to_keep = [
        "shape_array_key",
        "gtfs_dataset_key",
        "_gtfs_dataset_name",
        "speed_mph",
        "loop_or_inlining",
        "stop_sequence",
        "stop_id",
        "trip_id",
        "n_trips",
        "p20_mph",
        "p80_mph",
        "p50_mph",
        "time_of_day",
        "meters_elapsed",
        "sec_elapsed",
        f"{column_str}5%",
        f"{column_str}95%",
        f"{column_str}cat",
    ]
    m1 = m1[columns_to_keep]
    print(f"Done with {column_str}")
    
    return m1  

In [None]:
def extract_number(folder: str, phrase_to_find: str) -> list:
    """
    Extract the numeric portion of a file path.
    """
    files = find_files(folder, phrase_to_find)
    all_file_numbers = []
    for file in files:
        # https://stackoverflow.com/questions/11339210/how-to-get-integer-values-from-a-string-in-python
        file_number = "".join(i for i in file if i.isdigit())
        all_file_numbers.append(file_number)
    return all_file_numbers

In [None]:
# Find all the parquets again
def find_files(folder: str, phrase_to_find: str) -> list:
    """
    Grab a list of files that contain the
    phrase inputted.
    """
    # Create a list of all the files in my folder
    all_files_in_folder = fs.ls(folder)
    my_files = [i for i in all_files_in_folder if phrase_to_find in i]

    # String to add to read the files
    my_string = "gs://"
    my_files = [my_string + i for i in my_files]

    # Extract digit of parquet
    return my_files

In [None]:
def categorize_by_percentile(
    df: pd.DataFrame, column_percentile: str, column_str: str
) -> dd.DataFrame:
    
    # Find percentiles
    agg1 = (
        df.groupby(["shape_array_key", "stop_sequence"])[column_percentile]
        .describe(percentiles=[0.05, 0.95])
        .reset_index()
        .add_prefix(column_str)
    )
    
    
    # Convert to dask because it takes a very long time
    agg1_dask = dd.from_pandas(agg1, npartitions=1)
    df_dask = dd.from_pandas(df, npartitions=1)

    # Merge using dask
    merge1_dask = dd.merge(
        df_dask,
        agg1_dask,
        how="inner",
        left_on=["shape_array_key", "stop_sequence"],
        right_on=[
            f"{column_str}shape_array_key",
            f"{column_str}stop_sequence",
        ],
    )

    def percentile(row):

        if row[column_percentile] == row[f"{column_str}mean"]:
            return f"{column_str} elapsed avg"
        elif row[column_percentile] <= row[f"{column_str}5%"]:
            return f"{column_str} elapsed low"
        elif row[column_percentile] == 0:
            return f"{column_str} elapsed is 0"
        elif row[f"{column_str}5%"] < row[column_percentile] <= row[f"{column_str}95%"]:
            return f"{column_str} elapsed avg"
        elif row[column_percentile] > row[f"{column_str}95%"]:
            return f"{column_str} elapsed high"

        else:
            return "other"

    merge1_dask[f"{column_str}cat"] = merge1_dask.apply(
        lambda x: percentile(x), axis=1, meta=(f"{column_str}cat", "string")
    )
    
    # Filter for only unsually high and low stuff
    merge1_dask = merge1_dask[merge1_dask[f"{column_str}cat"].isin([f"{column_str} elapsed high", f"{column_str} elapsed low"]).reset_index(drop = True)
                              
    # Clean
    merge1_dask[f"{column_str}cat"] = merge1_dask[f"{column_str}cat"].str.replace(
        "_", ""
    )

    columns_to_keep = [
        "shape_array_key",
        "gtfs_dataset_key",
        "_gtfs_dataset_name",
        "speed_mph",
        "loop_or_inlining",
        "stop_sequence",
        "stop_id",
        "trip_id",
        "n_trips",
        "p20_mph",
        "p80_mph",
        "p50_mph",
        "time_of_day",
        "meters_elapsed",
        "sec_elapsed",
        f"{column_str}5%",
        f"{column_str}95%",
        f"{column_str}cat",
    ]
    merge1_dask = merge1_dask[columns_to_keep]
    print(f"Done with {column_str}")
    return merge1_dask

In [None]:
def flag_round1(row):
    if (row["meters_elapsed"] == 0) & (row["sec_elapsed"] == 0):
        return "division by 0"
    elif row["meters_cat"] == "meters elapsed low":
        return "meters too low"
    elif row["seconds_cat"] == "seconds elapsed high":
        return "seconds too high"
    else:
        return "ok"
    
#def flag_round2(row):
#    if (row["meters_elapsed"] == 0) & (row["sec_elapsed"] == 0):
#        return "division by 0"
#    else:
#        return "meters/seconds are filled but flagged"

def categorize_meters_speeds_dask(df):
    start = datetime.datetime.now()
    print(f"Begin: {start}")

    # Find percentiles
    df.speed_mph = df.speed_mph.fillna(0)

    # These are now dask dataframes
    ddf_meters = categorize_by_percentile(df, "meters_elapsed", "meters_")
    ddf_seconds = categorize_by_percentile(df, "sec_elapsed", "seconds_")

    merge_cols = [
        "shape_array_key",
        "gtfs_dataset_key",
        "_gtfs_dataset_name",
        "speed_mph",
        "loop_or_inlining",
        "stop_sequence",
        "stop_id",
        "n_trips",
        "p20_mph",
        "p80_mph",
        "p50_mph",
        "meters_elapsed",
        "sec_elapsed",
        "trip_id",
        "time_of_day",
    ]

    # Merge using dask
    m1 = dd.merge(ddf_meters, ddf_seconds, how="inner", on=merge_cols)

    # Apply flags
    m1["flag"] = m1.apply(lambda x: flag_round1(x), axis=1, meta=("flag", "string"))
    print("Apply first round of flags")

    # Filter out for projects that are ok, retag for zeroes
    m2 = m1[m1.flag != "ok"].reset_index()

    # Apply flag for zeroes
    m2["flag_division_0"] = m2.apply(
        lambda x: flag_round2(x), axis=1, meta=("flag", "string")
    )
    print("Apply second round of flags")

    # Replace values in the original flag
    # https://stackoverflow.com/questions/54302694/updating-the-values-of-a-column-in-a-dask-dataframe-based-on-some-condition-on-s
    condition = m2.flag_division_0 == "division by 0"
    m2["flag"] = m2["flag"].mask(condition, m2.flag_division_0)
    print("Done flagging")

    # Print value counts
    # print(f"breakout of rows after separating out for 0: \n {m2.flag.value_counts().compute()}")

    # Filter for only projects that are divided by 0
    # m2 = m2[m2.flag == "division by 0"].reset_index()
    # Delete older column
    m2 = m2.drop(columns=["flag_division_0", "level_0", "index"])
    print("Drop columns")

    # Save
    # m2 =  m2.repartition(partition_size="5MB")
    # m2.to_parquet(f"{speed_utils.GCS_PATH}partitioned_flags", overwrite = True)
    print("Saved")

    end = datetime.datetime.now()
    print(f"Finish: {end-start}")

    return m2

In [None]:
trips = list(equal_sampling.trip_id.unique())

In [None]:
stops = list(equal_sampling.stop_id.unique())

In [None]:
# Plot some of the trips
sample_data = few_routes_cat[few_routes_cat.trip_id.isin(trips)].reset_index()

In [None]:
sample_data.shape

In [None]:
# sample_data2 = sample_data[['shape_array_key','gtfs_dataset_key','trip_id']]

In [None]:
plotting = sample_data.melt(
    id_vars=[
        "_gtfs_dataset_name",
        "shape_array_key",
        "trip_id",
        "stop_sequence",
        "gtfs_dataset_key",
        "loop_or_inlining",
        "n_trips",
        "meters_elapsed",
        "meters_cat",
        "seconds_cat",
        "sec_elapsed",
        "flag",
        "p20_speed_mph",
        "p80_speed_mph",
        "median_speed_mph",
    ],
    value_vars=["speed_mph"],
)

In [None]:
# Clean
plotting = threshold_utils.pre_clean(plotting)

In [None]:
plotting["Dropdown Menu"] = plotting["Gtfs Dataset Name"] + " " + plotting["Trip Id"]

In [None]:
def alt_dropdown(df, col_for_dropdown: str, dropdown_menu_title: str):
    # Create dropdown menu
    # Exclude "none" operators which are only scheduled data
    df = df.loc[df[col_for_dropdown] != "None"][[col_for_dropdown]]
    dropdown_list = df[col_for_dropdown].unique().tolist()

    # Show only first operator by default
    initialize_first_op = sorted(dropdown_list)[0]
    input_dropdown = alt.binding_select(
        options=sorted(dropdown_list), name=dropdown_menu_title
    )

    selection = alt.selection_single(
        name=dropdown_menu_title,
        fields=[col_for_dropdown],
        bind=input_dropdown,
        init={col_for_dropdown: initialize_first_op},
    )

    return selection

In [None]:
selection_test = alt_dropdown(plotting, "Dropdown Menu", "Route")

In [None]:
(
    threshold_utils.chart_size(
        alt.Chart(plotting)
        .mark_tick(
            size=15,
            thickness=5,
        )
        .encode(
            x="Stop Sequence:N",
            y="Value:Q",
            color=alt.Color(
                "Flag:N", scale=alt.Scale(range=cp.CALITP_CATEGORY_BOLD_COLORS)
            ),
            tooltip=plotting.columns.tolist(),
        )
        .interactive(),
        1100,
        400,
    )
    .add_selection(selection_test)
    .transform_filter(selection_test)
)

In [None]:
stop

In [None]:
type(vehicle_positions)

In [None]:
vehicle_positions2 = vehicle_positions[
    vehicle_positions.trip_id.isin(trips)
].reset_index()

In [None]:
vehicle_positions2.shape

In [None]:
gdf1 = pd.merge(
    vehicle_positions2,
    sample_data,
    how="inner",
    on=["gtfs_dataset_key", "_gtfs_dataset_name", "trip_id"],
)

In [None]:
gdf1.shape

In [None]:
gdf1 = gdf1[gdf1.stop_id.isin(stops)]

In [None]:
gdf1.shape

In [None]:
gdf1[
    [
        "geometry",
        "stop_id",
        "stop_sequence",
        "_gtfs_dataset_name",
        "shape_array_key",
        "speed_mph",
        "flag",
    ]
].explore("flag")

In [None]:
stop

In [None]:
high_low_zero2 = high_low_zero.melt(
    id_vars=[
        "_gtfs_dataset_name",
        "shape_array_key",
        "trip_id",
        "stop_sequence",
        "gtfs_dataset_key",
        "loop_or_inlining",
        "n_trips",
        "meters_cat",
        "seconds_cat",
        "unusual_flag",
        "time_of_day",
    ],
    value_vars=["median_speed_mph", "speed_mph", "p20_speed_mph", "p80_speed_mph"],
)

In [None]:
high_low_zero2 = high_low_zero2.drop_duplicates(
    subset=[
        "loop_or_inlining",
        "shape_array_key",
        "stop_sequence",
        "time_of_day",
        "variable",
        "value",
    ]
).reset_index(drop=True)

In [None]:
high_low_zero2.shape

In [None]:
merge1.shape_array_key.nunique(), high_low_zero.shape_array_key.nunique()

In [None]:
# Clean
high_low_zero2 = threshold_utils.pre_clean(high_low_zero2)

In [None]:
# Add dropdown menu
high_low_zero2["Dropdown Menu"] = (
    high_low_zero2["Gtfs Dataset Name"] + " " + high_low_zero2["Shape Array Key"]
)

In [None]:
high_low_zero2["Route Type"] = "Route Type: " + high_low_zero2[
    "Loop Or Inlining"
].astype(str)

In [None]:
selection_test = alt_dropdown(high_low_zero2, "Dropdown Menu", "Route")

In [None]:
# https://github.com/altair-viz/altair/issues/1168
title = (
    alt.Chart(high_low_zero2)
    .mark_text(dy=-40, size=15, fontWeight="normal")
    .encode(
        text="Route Type:N",
    )
    .add_selection(selection_test)
    .transform_filter(selection_test)
)

In [None]:
"""total_stops_altair = (
    alt.Chart(stop_info)
    .mark_text(dy=-40, size=15, fontWeight="normal")
    .encode(
        text="Percentage Of Unusual Stops:N",
    )
    .add_selection(selection_test)
    .transform_filter(selection_test)
)"""

In [None]:
main_chart = (
    threshold_utils.chart_size(
        alt.Chart(high_low_zero2)
        .mark_tick(
            size=15,
            thickness=5,
        )
        .encode(
            x="Stop Sequence:N",
            y="Value:Q",
            color=alt.Color(
                "Variable:N", scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS)
            ),
            tooltip=high_low_zero2.columns.tolist(),
        )
        .interactive(),
        1100,
        400,
    )
    .add_selection(selection_test)
    .transform_filter(selection_test)
)

In [None]:
high_low_zero2.shape

In [None]:
(title & total_stops_altair | main_chart)

In [None]:
high_low_zero.shape_array_key.unique()

In [None]:
chart2 = threshold_utils.chart_size(chart2, 75, 200)

In [None]:
chart2 = chart2.add_selection(selection_test).transform_filter(selection_test)

In [None]:
title = threshold_utils.chart_size(title, 20, 20)

In [None]:
alt.data_transformers.enable("default", max_rows=None)

In [None]:
title & (chart1.interactive() & chart2.interactive())