In [1]:
import datetime
import dask.dataframe as dd
import dask_geopandas as dg
import dask
import geopandas as gpd
import pandas as pd
from scripts import vp_spatial_accuracy
from segment_speed_utils import helpers
from calitp_data_analysis.geography_utils import WGS84
from segment_speed_utils.project_vars import (
    PROJECT_CRS,
    SEGMENT_GCS,
    analysis_date,
    GCS_FILE_PATH,
    COMPILED_CACHED_VIEWS,
    RT_SCHED_GCS,
    CONFIG_PATH
)

from typing import Literal
import numpy as np

from shared_utils.rt_utils import MPH_PER_MPS
from calitp_data_analysis import utils

# cd rt_segment_speeds && pip install -r requirements.txt && cd
from shared_utils import portfolio_utils, schedule_rt_utils
from segment_speed_utils import helpers, sched_rt_utils, wrangle_shapes, segment_calcs

In [None]:
# Notebook display settings: allow up to 100 columns, never truncate rows
# or cell contents, and render floats with two decimals.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

### All operators

#### Links
* https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/scripts/shapely_project_vp.py#L61

#### Map partitions
* Must specify data type
* Exclude any cols I don't want in meta.

### Spatial Accuracy
* Should this be one big function?

In [2]:
# 14,514,960 rows
# Read lazily as a dask DataFrame.
vp_usable = dd.read_parquet(f"{SEGMENT_GCS}vp_usable_{analysis_date}")

In [None]:
# calitp-analytics-data/data-analyses/rt_segment_speeds/vp_usable_2023-10-11
operator = "Bay Area 511 Muni VehiclePositions"
gtfs_key = "7cc0cb1871dfd558f11a2885c145d144"

In [None]:
"""
vp_usable= dd.read_parquet(
    f"{SEGMENT_GCS}vp_usable_{analysis_date}",
    filters=[
        [
            ("gtfs_dataset_name", "==", operator),
            ("schedule_gtfs_dataset_key", "==", gtfs_key),
        ]
    ],
)
"""

In [None]:
# vp_usable[['gtfs_dataset_name']].compute().nunique()

In [None]:
def grab_shape_keys_in_vp(vp_usable: dd.DataFrame, analysis_date: str) -> pd.DataFrame:
    """
    Build a crosswalk from trip_instance_key to shape_array_key for the
    trips present in the vehicle positions.

    Only trips whose shape_array_key also appears in the routelines
    file are kept, so every returned trip has a shape to attach later.

    Args:
        vp_usable: vehicle positions; only `trip_instance_key` is read.
        analysis_date: date string used to locate the routelines parquet
            and the scheduled trips table.

    Returns:
        De-duplicated frame with columns `trip_instance_key` and
        `shape_array_key`.
    """
    # Unique trips observed in the vehicle positions.
    vp_trip_keys = (
        vp_usable[["trip_instance_key"]].drop_duplicates().reset_index(drop=True)
    )

    # Make sure we have a shape geometry too,
    # otherwise map_partitions will throw an error.
    shape_keys = pd.read_parquet(
        f"{COMPILED_CACHED_VIEWS}routelines_{analysis_date}.parquet",
        columns=["shape_array_key"],
    )
    shape_keys = shape_keys.dropna().drop_duplicates()

    scheduled_trips = helpers.import_scheduled_trips(
        analysis_date,
        columns=["trip_instance_key", "shape_array_key"],
        get_pandas=True,
    )

    # Keep scheduled trips that have a known shape AND show up in vp.
    crosswalk = scheduled_trips.merge(shape_keys, on="shape_array_key", how="inner")
    crosswalk = crosswalk.merge(vp_trip_keys, on="trip_instance_key", how="inner")
    crosswalk = crosswalk.drop_duplicates().dropna().reset_index(drop=True)

    return crosswalk[["trip_instance_key", "shape_array_key"]]

In [None]:
# Build the trip -> shape crosswalk one partition at a time.
# meta declares both output columns as object dtype.
trips_with_shape = vp_usable.map_partitions(
    grab_shape_keys_in_vp,
    analysis_date,
    meta={
        "trip_instance_key": "object",
        "shape_array_key": "object",
    },
    align_dataframes=False,
).persist()

In [None]:

len(trips_with_shape)

In [None]:
def buffer_shapes2(
    trips_with_shape_subset: pd.DataFrame,
    analysis_date: str,
    buffer_meters: int = 35,
    **kwargs
) -> gpd.GeoDataFrame:
    """
    Filter scheduled shapes down to the shapes referenced by
    `trips_with_shape_subset`, buffer them, and attach the buffered
    geometry to each row of the subset.

    Args:
        trips_with_shape_subset: crosswalk that must contain
            `shape_array_key`; its other columns are carried through
            the merge. May be a pandas frame (e.g. one partition) or a
            dask frame.
        analysis_date: date string passed to
            `helpers.import_scheduled_shapes`.
        buffer_meters: buffer distance applied to each shape geometry,
            in the units of PROJECT_CRS.
        **kwargs: accepted for call compatibility; not used.

    Returns:
        Buffered shapes inner-merged with `trips_with_shape_subset`
        on `shape_array_key`.
    """
    # Use the frame that was passed in, not a notebook-level global, so the
    # function only loads shapes for the trips it was actually given.
    shape_keys = trips_with_shape_subset.shape_array_key.unique()

    # A dask result is lazy and must be materialized; a pandas result is
    # already concrete and has no .compute().
    if hasattr(shape_keys, "compute"):
        shape_keys = shape_keys.compute()
    shape_keys = shape_keys.tolist()

    # to_crs takes a while, so filter to only the shapes we need on read.
    shapes = helpers.import_scheduled_shapes(
        analysis_date,
        columns = ["shape_array_key", "geometry"],
        filters = [[("shape_array_key", "in", shape_keys)]],
        crs = PROJECT_CRS,
        get_pandas = True
    )

    shapes = shapes.assign(
        geometry = shapes.geometry.buffer(buffer_meters)
    )

    trips_with_shape_geom = pd.merge(
        shapes,
        trips_with_shape_subset,
        on = "shape_array_key",
        how = "inner"
    )

    return trips_with_shape_geom


In [None]:
"""
buffer_shapes = trips_with_shape.map_partitions(
        buffer_shapes2,
        analysis_date,
        35,
        meta = {
               "shape_array_key": "object",
               "geometry":"geometry",
                "trip_instance_key": "object"},
        align_dataframes = False
    ).persist()
    """

In [None]:
# NOTE: `buffer_shapes` is only assigned in the map_partitions cell above,
# which is currently disabled (wrapped in a string literal). Running this
# check on a fresh kernel raises NameError, so it stays disabled until that
# cell is re-enabled.
# len(buffer_shapes)

In [None]:
def merge_vp_with_shape_and_count(
    vp: dd.DataFrame, trips_with_shape_geom: gpd.GeoDataFrame
) -> gpd.GeoDataFrame:
    """
    Count, per trip, how many vehicle positions fall inside the trip's
    buffered shape.

    Args:
        vp: vehicle positions with `x`, `y`, `trip_instance_key` and
            `location_timestamp_local`. The frame is handed straight to
            the GeoDataFrame constructor.
        trips_with_shape_geom: one buffered shape geometry per
            `trip_instance_key`.

    Returns:
        One row per trip with `total_vp`, `vp_in_shape` and
        `total_vp_in_shape_pct`.
    """
    # COMMENT BACK IN ONCE I MERGE WITH MAIN
    # vp_gdf = wrangle_shapes.vp_as_gdf(vp)

    # Points are built in WGS84 and then projected to match the shapes.
    vp_points = gpd.points_from_xy(vp.x, vp.y)
    vp_gdf = gpd.GeoDataFrame(vp, geometry=vp_points, crs=WGS84).to_crs(PROJECT_CRS)

    # NEW
    trips_with_shape_geom = trips_with_shape_geom.set_geometry("geometry").set_crs(
        PROJECT_CRS
    )

    # If the columns arrive labelled by integer position (0-12), restore
    # their names; labels that are absent are left untouched by rename.
    positional_labels = [
        "gtfs_dataset_name",
        "schedule_gtfs_dataset_key",
        "trip_id",
        "trip_instance_key",
        "location_timestamp",
        "location_timestamp_local",
        "x",
        "y",
        "vp_idx",
        "gtfs_dataset_key",
        "vp_dir_xnorm",
        "vp_dir_ynorm",
        "vp_primary_direction",
    ]
    vp_gdf = vp_gdf.rename(columns=dict(enumerate(positional_labels)))

    # Both frames carry a `geometry` column, so the merge suffixes them:
    # geometry_x is the vp point, geometry_y is the buffered shape.
    vp_with_shape = pd.merge(
        vp_gdf, trips_with_shape_geom, on="trip_instance_key", how="inner"
    ).reset_index(drop=True)

    # Totals are taken before filtering to the points inside the shape.
    total_vp = vp_spatial_accuracy.total_vp_counts_by_trip(vp_with_shape)

    is_within = vp_with_shape.geometry_x.within(vp_with_shape.geometry_y)
    vp_inside = vp_with_shape.assign(is_within=is_within).query("is_within==True")

    vps_in_shape = vp_inside.groupby(
        "trip_instance_key", observed=True, group_keys=False
    ).agg({"location_timestamp_local": "count"})
    vps_in_shape = vps_in_shape.reset_index().rename(
        columns={"location_timestamp_local": "vp_in_shape"}
    )

    # Left merge: trips with no points inside the shape get NaN, filled with 0.
    count_df = pd.merge(total_vp, vps_in_shape, on="trip_instance_key", how="left")
    for count_col in ("vp_in_shape", "total_vp"):
        count_df[count_col] = count_df[count_col].fillna(0).astype("int32")

    count_df["total_vp_in_shape_pct"] = (
        count_df.vp_in_shape.divide(count_df.total_vp) * 100
    )

    return count_df

In [None]:
# The kernel for data-analyses/rt_scheduled_v_ran/03_metrics_all_ops.ipynb appears to have died. It will restart automatically.
"""
spatial_accuracy_df = vp_usable.map_partitions(
        merge_vp_with_shape_and_count,
        buffer_shapes,
        meta = {'trip_instance_key':'object', 
                'total_vp':'int32', 
                'vp_in_shape':'int32',
                 'total_vp_in_shape_pct':'float64'},
        align_dataframes = False
    ).persist()
    """

In [None]:
def spatial_accuracy(
    vp_usable: dd.DataFrame, analysis_date: str, buffer_meters: int = 35
):
    """
    Run the spatial-accuracy steps end to end for one frame of vehicle
    positions.

    Steps: build the trip -> shape crosswalk, buffer the matching shapes,
    then count the vehicle positions that fall inside each buffered shape.

    Args:
        vp_usable: vehicle positions to score.
        analysis_date: date string forwarded to the shape/trip loaders.
        buffer_meters: buffer distance forwarded to `buffer_shapes2`.
            Defaults to 35, the value this function previously hard-coded.

    Returns:
        Whatever `merge_vp_with_shape_and_count` returns: per-trip counts
        of total vp, vp inside the shape, and the percentage.
    """
    trip_shape_crosswalk = grab_shape_keys_in_vp(vp_usable, analysis_date)

    trips_with_shape_geom = buffer_shapes2(
        trip_shape_crosswalk, analysis_date, buffer_meters
    )

    # Named so it does not shadow the function itself.
    accuracy_counts = merge_vp_with_shape_and_count(vp_usable, trips_with_shape_geom)

    return accuracy_counts

In [None]:
# Run the spatial-accuracy pipeline on every partition of vp_usable.
# meta lists the columns and dtypes declared for the per-trip result.
spatial_accuracy_df = vp_usable.map_partitions(
    spatial_accuracy,
    analysis_date,
    meta={
        "trip_instance_key": "object",
        "total_vp": "int32",
        "vp_in_shape": "int32",
        "total_vp_in_shape_pct": "float64",
    },
    align_dataframes=False,
).persist()

In [None]:
len(spatial_accuracy_df)

### Update Completeness

In [3]:
def pct_of_pings(df: pd.DataFrame):
    """
    Summarize, per trip, how many rows (minutes) had at least 2 pings.

    Args:
        df: frame with `trip_instance_key`, `location_timestamp_local`
            (datetime) and `number_of_pings_per_minute`.

    Returns:
        One row per `trip_instance_key` with:
        * `min_time` / `max_time`: first and last timestamp of the trip
        * `atleast2_trip_updates`: number of rows with 2+ pings
        * `total_trip_time`: minutes between first and last timestamp, plus 1
        * `pct_with_2_pings_per_min`: atleast2_trip_updates as a percent
          of total_trip_time
    """
    # Determine which rows have 2+ pings per minute.
    # Vectorized comparison instead of a row-wise apply.
    flagged = df.assign(
        atleast2_trip_updates=(df.number_of_pings_per_minute >= 2).astype("int64")
    )

    # First/last timestamp and number of 2+ ping rows for each trip.
    by_trip = (
        flagged.groupby(["trip_instance_key"])
        .agg(
            min_time=("location_timestamp_local", "min"),
            max_time=("location_timestamp_local", "max"),
            atleast2_trip_updates=("atleast2_trip_updates", "sum"),
        )
        .reset_index()
    )

    # Add an extra minute so the first minute of the trip is counted too.
    by_trip["total_trip_time"] = (
        by_trip.max_time - by_trip.min_time
    ) / pd.Timedelta(minutes=1) + 1

    # Find % of each trip in which one minute has 2+ pings.
    by_trip = by_trip.assign(
        pct_with_2_pings_per_min=by_trip.atleast2_trip_updates.divide(
            by_trip.total_trip_time
        )
        * 100
    )

    return by_trip

In [7]:
# filter down columns to time
vp_usable2 = vp_usable[["trip_instance_key", "vp_idx", "location_timestamp_local"]]

In [12]:
def counting(vp_usable):
    """
    Count vehicle positions per trip in 5-minute windows.

    Rows are grouped by `trip_instance_key` and by
    `location_timestamp_local` binned to 5 minutes; the non-null
    `vp_idx` values in each group are counted.

    Returns a frame with `trip_instance_key`, the bin start in
    `location_timestamp_local`, and the count in `vp_idx`.
    """
    five_minute_bins = pd.Grouper(key="location_timestamp_local", freq="5Min")
    grouped = vp_usable.groupby(
        ["trip_instance_key", five_minute_bins],
        observed=True,
        group_keys=False,
    )
    return grouped["vp_idx"].count().reset_index()

In [13]:
vp_usable2.map_partitions(
    counting,
    meta = {"trip_instance_key": "object",
           "location_timestamp_local": "datetime64[ns]",
           "vp_idx": "int"},
    align_dataframes = False
).compute()

Unnamed: 0,trip_instance_key,location_timestamp_local,vp_idx
0,01d684e5d18b56f09f2eb816241b77b1,2023-10-11 10:45:00,11
1,01d684e5d18b56f09f2eb816241b77b1,2023-10-11 10:50:00,13
2,01d684e5d18b56f09f2eb816241b77b1,2023-10-11 10:55:00,6
3,0828e4c01c7313d43981af8c2e198491,2023-10-11 11:50:00,5
4,0828e4c01c7313d43981af8c2e198491,2023-10-11 11:55:00,15
...,...,...,...
398,fe66e263bc41d1d08c807c4f9f3797be,2023-10-11 12:00:00,11
399,fe66e263bc41d1d08c807c4f9f3797be,2023-10-11 12:05:00,7
400,fe66e263bc41d1d08c807c4f9f3797be,2023-10-11 12:10:00,11
401,fe66e263bc41d1d08c807c4f9f3797be,2023-10-11 12:15:00,4


In [4]:
def density_of_pings(df: pd.DataFrame):
    """
    Measure how dense the pings are over the course of each trip.

    Ping counts are summed into 5-minute windows per trip, then the
    median window total is taken for each trip.

    Args:
        df: frame with `trip_instance_key`, `location_timestamp_local`
            (datetime) and `number_of_pings_per_minute`. Any other
            columns are ignored.

    Returns:
        One row per `trip_instance_key` with:
        * `number_of_pings_per_5_min`: median 5-minute ping total
        * `avg_pings_per_5_min`: that median divided by 5. Despite the
          name, this is a per-minute rate.
    """
    five_minute_bins = pd.Grouper(key="location_timestamp_local", freq="5Min")

    # Sum only the ping counts. Summing every column would fail on, or
    # silently mangle, any extra non-numeric column in the input.
    pings_per_window = (
        df.groupby(["trip_instance_key", five_minute_bins])[
            "number_of_pings_per_minute"
        ]
        .sum()
        .reset_index()
        .rename(columns={"number_of_pings_per_minute": "number_of_pings_per_5_min"})
    )

    per_trip = (
        pings_per_window.groupby(["trip_instance_key"])
        .agg({"number_of_pings_per_5_min": "median"})
        .reset_index()
    )

    per_trip["avg_pings_per_5_min"] = per_trip.number_of_pings_per_5_min / 5

    return per_trip

In [5]:
def total_minutes_with_gtfs(df: pd.DataFrame):
    """
    Count, for each trip, the rows with a non-null `gtfs_dataset_key`.

    When the input has one row per trip per minute, this is the total
    number of minutes of the trip that recorded any vehicle positions.

    Returns a frame with `trip_instance_key` and `total_minutes_w_gtfs`.
    """
    per_trip = df.groupby(["trip_instance_key"]).agg(
        total_minutes_w_gtfs=("gtfs_dataset_key", "count")
    )
    return per_trip.reset_index()

In [None]:
def update_completeness(df: pd.DataFrame):
    """
    Compute the update-completeness metrics for each trip.

    Vehicle positions are first counted per trip per minute, then that
    per-minute table is fed to `pct_of_pings`, `density_of_pings` and
    `total_minutes_with_gtfs`. The three results are inner-merged on
    `trip_instance_key`.

    Args:
        df: vehicle positions with `vp_idx`, `location_timestamp_local`,
            `trip_instance_key` and `gtfs_dataset_key`.

    Returns:
        One row per trip; the helper columns `min_time`, `max_time` and
        `number_of_pings_per_5_min` are dropped.
    """
    needed_cols = [
        "vp_idx",
        "location_timestamp_local",
        "trip_instance_key",
        "gtfs_dataset_key",
    ]
    vp_sorted = df[needed_cols].sort_values(["vp_idx"]).reset_index(drop=True)

    # Find number of pings per minute for each trip.
    one_minute_bins = pd.Grouper(key="location_timestamp_local", freq="1Min")
    pings_per_minute = vp_sorted.groupby(["trip_instance_key", one_minute_bins]).count()
    pings_per_minute = pings_per_minute.reset_index().rename(
        columns={"vp_idx": "number_of_pings_per_minute"}
    )

    pings_df = pct_of_pings(pings_per_minute)
    density_df = density_of_pings(pings_per_minute)
    total_minutes_df = total_minutes_with_gtfs(pings_per_minute)

    merged = pings_df.merge(density_df, on="trip_instance_key", how="inner")
    merged = merged.merge(total_minutes_df, on="trip_instance_key", how="inner")

    return merged.drop(columns=["min_time", "max_time", "number_of_pings_per_5_min"])


In [None]:
# Compute the update-completeness metrics on every partition of vp_usable.
# meta must match what update_completeness actually returns.
update_completeness_df = vp_usable.map_partitions(
    update_completeness,
    meta={
        "trip_instance_key": "object",
        "atleast2_trip_updates": "int64",
        "total_trip_time": "float64",
        "pct_with_2_pings_per_min": "float64",
        "avg_pings_per_5_min": "float64",
        # This column is a groupby count, so it is an integer, not a float.
        "total_minutes_w_gtfs": "int64",
    },
    align_dataframes=False,
).persist()

In [None]:
type(update_completeness_df)

In [None]:
len(update_completeness_df)

In [None]:
# update_completeness_df2 = update_completeness_df.compute()

In [None]:
# update_completeness_df2.head()

### Speeds

In [14]:
trip_speeds = dd.read_parquet(
    f"{SEGMENT_GCS}trip_summary/trip_speeds_{analysis_date}.parquet",
    columns=[
        "trip_instance_key",
        "speed_mph",
        "route_id",
        "time_of_day",
        "service_minutes",
    ],
)

In [None]:
# Outer joins keep every trip that appears in any of the three tables;
# trips missing from a table get NaN in that table's columns.
entire_df = dd.merge(
    trip_speeds,
    spatial_accuracy_df,
    on="trip_instance_key",
    how="outer",
).merge(
    update_completeness_df,
    on="trip_instance_key",
    how="outer",
)


In [None]:
entire_df.columns

In [None]:
type(entire_df)

In [None]:
entire_df = entire_df.compute()

### To do
* Rename pct_with_2_pings_per_min
* Rename atleast2_trip_updates to be something clearer
* Delete total_vp and vp_in_shape


Questions
* Should I bring in gtfs schedule key in `trip_speeds`
* What to do with rows with `nans` because I did outer joins?
    * 12% of the rows don't have speed_mph information

Weird trips
* ff14aa6f478c00075d8c78741cbd17cd has 12 service minutes but 55 total trip time.

In [None]:
entire_df = pd.read_parquet('./ah_test')

In [None]:
# errors="ignore" keeps this cell re-runnable: on a second execution the
# columns are already gone and a plain drop would raise KeyError.
entire_df = entire_df.drop(columns=["total_vp", "vp_in_shape"], errors="ignore")

In [None]:
entire_df.sample(5)

In [None]:
entire_df.loc[entire_df.trip_instance_key == "9323cb6c9e3babc5bc8420cd960affee"]

In [None]:
entire_df.total_vp_in_shape_pct.describe()

In [None]:
entire_df.pct_with_2_pings_per_min.describe()

### Trip Timeliness
* Using % to measure timeliness
    * A trip that is scheduled to run 30 minutes but has 34 minutes of gtfs data is about 13% slower (34 / 30 - 1) than it's "supposed" to be.
* What to do with extreme values.
    * Service minutes is 12, but trip time is 1,440 minutes.

In [None]:
entire_df["trip_timeliness_pct"] = (entire_df.total_trip_time / entire_df.service_minutes - 1) * 100

In [None]:
# entire_df.trip_timeliness_pct = (entire_df.trip_timeliness_pct / 5).fillna(0).round().astype(int) * 5

In [None]:
entire_df.trip_timeliness_pct.describe()

In [None]:
entire_df[["trip_timeliness_pct", "total_trip_time", "total_minutes_w_gtfs","service_minutes"]].sample(10)

### Metric: GTFS Coverage
#### Comparing `total_trip_time` with `total_minutes_w_gtfs`
* The trip can run from 11 am to 12 pm but skip recording data in between 11:30-11:45. 
    * Thus the total trip time would be one hour but minutes with gtfs would be 45 minutes.
* On the other hand, a trip can run for much longer (by accident)
    * As seen below, where the trip is 11,900% slower than scheduled...

In [None]:
entire_df['gtfs_coverage_v_total_trip_time_pct'] = (entire_df.total_minutes_w_gtfs/entire_df.total_trip_time) * 100

In [None]:
# Rounds up
# entire_df['gtfs_coverage_v_total_trip_time_pct'] = (entire_df.gtfs_coverage_v_total_trip_time_pct / 5).fillna(0).round().astype(int) * 5

In [None]:
entire_df['gtfs_coverage_v_total_trip_time_pct'].describe()

In [None]:
entire_df.loc[entire_df['gtfs_coverage_v_total_trip_time_pct'] < 99].shape

In [None]:
entire_df.loc[entire_df['gtfs_coverage_v_total_trip_time_pct'] < 99].gtfs_coverage_v_total_trip_time_pct.describe()

In [None]:
len(entire_df)

In [None]:
# entire_df[['gtfs_coverage_v_total_trip_time_rounded', 'gtfs_coverage_v_total_trip_time']].sample(10)

##### One trip - extreme
* First time stamp is at 7am on 10/11.
* Last timestamp is 10/12 at 7am.

In [None]:
# 14,514,960 rows
#vp_usable_pd = pd.read_parquet(
#      f"{SEGMENT_GCS}vp_usable_{analysis_date}"
#)

In [None]:
# vp_usable_pd.loc[vp_usable_pd.trip_instance_key == "a86db56c6c67f9ae4c1a4c828d5e9f91"].sort_values(['location_timestamp'])

In [None]:
entire_df.loc[entire_df.trip_instance_key == "a86db56c6c67f9ae4c1a4c828d5e9f91"]

In [None]:
# entire_df = entire_df.fillna(0)

#### How does the time recorded by Vehicle Positions  compare to scheduled service minutes?
* Too similar to trip timeliness?
* Use total trip time or total minutes with gtfs?

In [None]:
entire_df['gtfs_coverage_v_scheduled_pct'] = (entire_df.total_minutes_w_gtfs / entire_df.service_minutes * 100)

In [None]:
# Cap coverage at 100 because total trip time is often longer than scheduled.
# clip() caps values without touching NaN, so trips with missing coverage
# data are not reported as 100%.
entire_df.gtfs_coverage_v_scheduled_pct = entire_df.gtfs_coverage_v_scheduled_pct.clip(upper=100)

In [None]:
# Temporarily rounding so I can eyeball the distribution
entire_df.gtfs_coverage_v_scheduled_pct = (entire_df.gtfs_coverage_v_scheduled_pct / 5).fillna(0).round().astype(int) * 5

In [None]:
entire_df.gtfs_coverage_v_scheduled_pct.describe()

In [None]:
entire_df.gtfs_coverage_v_scheduled_pct.value_counts()

In [None]:
entire_df.loc[entire_df.trip_timeliness_pct == 11900]

#### Some trips are apparently more than 35% faster than their scheduled service minutes, which seems not very likely? 
* The example trips below
    * Faster than service minutes by 45% and 75%.
    * Every minute has more than 2 GTFS pings, so the data quality looks good

In [None]:
# Filter out
faster_than_scheduled = entire_df.loc[entire_df.trip_timeliness_pct < 0]

In [None]:
faster_than_scheduled.trip_timeliness_pct.describe()

In [None]:
preview = ['service_minutes','total_trip_time','trip_timeliness_pct']

In [None]:
faster_than_scheduled[preview].sample(5)

In [None]:
faster_than_scheduled.loc[faster_than_scheduled.trip_instance_key == "d97697de370980ad54f378ac1ee49142"][preview]

In [None]:
faster_than_scheduled.loc[faster_than_scheduled.trip_instance_key == "2d639eaffc7c93b9e020a1e7bb93edfa"][preview]

In [None]:
faster_than_scheduled.loc[faster_than_scheduled.trip_timeliness_pct == -35].sample(3)[preview]