## Improving on Script
* Feedback: https://github.com/cal-itp/data-analyses/pull/961

In [1]:
import datetime

import dask
import dask.dataframe as dd
import geopandas as gpd
import pandas as pd
from calitp_data_analysis.geography_utils import WGS84
from loguru import logger
from scripts import vp_spatial_accuracy
from segment_speed_utils import helpers, wrangle_shapes
from segment_speed_utils.project_vars import (
    GCS_FILE_PATH,
    PROJECT_CRS,
    SEGMENT_GCS,
    analysis_date,
)

In [2]:
analysis_date

'2023-11-15'

In [3]:
SEGMENT_GCS

'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/'

In [4]:
f"{GCS_FILE_PATH}rt_vs_schedule/"

'gs://calitp-analytics-data/data-analyses/rt_vs_schedule/'

In [5]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [6]:
from scripts import update_vars2

In [7]:
update_vars2.analysis_date_list

['2023-04-12', '2023-10-11', '2023-09-13']

### Filter columns 
* ['trip_instance_key', 'location_timestamp_local', 'x','y','vp_idx']

In [8]:
operator = "Bay Area 511 Muni VehiclePositions"
gtfs_key = "7cc0cb1871dfd558f11a2885c145d144"

In [9]:
def load_vp_usable(analysis_date):
    """
    Read the usable vehicle positions for one date as a dask DataFrame.

    Only the columns needed by the trip-level metrics are read from
    `{SEGMENT_GCS}vp_usable_{analysis_date}`. A second copy of
    `location_timestamp_local` is attached as `max_time` so that
    `total_trip_time` can aggregate a min and a max from two distinct
    columns.
    """
    keep_cols = [
        # Delete schedule_gtfs_dataset_key later
        "schedule_gtfs_dataset_key",
        "trip_instance_key",
        "location_timestamp_local",
        "x",
        "y",
        "vp_idx",
    ]

    vp = dd.read_parquet(
        f"{SEGMENT_GCS}vp_usable_{analysis_date}",
        columns=keep_cols,
    )

    # Duplicate the timestamp column; avoids any dtype setting later on
    return vp.assign(max_time=vp.location_timestamp_local)

In [10]:
vp_usable = load_vp_usable(analysis_date)
# Filter for now
# NOTE(review): vp_usable2 is the single-operator subset (gtfs_key), but the
# total_trip_time and map_partitions cells that follow are called on the
# unfiltered `vp_usable` - confirm which frame is intended.
vp_usable2 = vp_usable.loc[vp_usable.schedule_gtfs_dataset_key == gtfs_key].reset_index(
    drop=True
)

In [11]:
# vp_usable2 = vp_usable2.compute()

In [12]:
type(vp_usable2)

dask.dataframe.core.DataFrame

### Total Trip Time
* Addresses "<i>in this function, min_time, max_time are created on the grouped df (vp_usable grouped by trip and binned minute)...I think to be safer, it should be created on vp_usable grouped by trip.</i>"
* Open question: a pandas copy warning (presumably `SettingWithCopyWarning`) now shows up. How can it be resolved?

In [13]:
def total_trip_time(vp_usable_df: pd.DataFrame):
    """
    Compute the real-time service minutes observed for every trip.

    The earliest `location_timestamp_local` and the latest `max_time`
    (a copy of the same timestamp column, needed so both a min and a max
    can be requested in one dict-style aggregation) are taken per
    `trip_instance_key`. Their difference in minutes, plus one extra
    minute, is returned as `rt_service_min`.

    Returns one row per trip with columns
    `trip_instance_key` and `rt_service_min`.
    """
    time_aggs = {"location_timestamp_local": "min", "max_time": "max"}

    trip_bounds = (
        vp_usable_df[["location_timestamp_local", "trip_instance_key", "max_time"]]
        .groupby(["trip_instance_key"])
        .agg(time_aggs)
        .reset_index()
        .rename(columns={"location_timestamp_local": "min_time"})
    )

    # Elapsed time between first and last ping, expressed in minutes,
    # with one extra minute added on top
    one_minute = pd.Timedelta(minutes=1)
    elapsed = trip_bounds.max_time - trip_bounds.min_time
    trip_bounds = trip_bounds.assign(rt_service_min=elapsed / one_minute + 1)

    # Only the trip key and its total service minutes are kept
    return trip_bounds.drop(columns=["max_time", "min_time"])

#### Change in script: remove map partitions

In [14]:
# NOTE(review): vp_usable comes from dd.read_parquet, so this call presumably
# only builds a lazy dask graph; the logged execution time would then not
# include the actual aggregation (that happens on .compute()) - confirm.
start = datetime.datetime.now()
print(start)
test = total_trip_time(vp_usable)

end = datetime.datetime.now()
logger.info(f"execution time: {end-start}")

2023-12-13 15:34:20.113 | INFO     | __main__:<module>:6 - execution time: 0:00:00.089104


2023-12-13 15:34:20.024656


In [15]:
# total_trip_time_df = total_trip_time_df.compute()

In [32]:
test.head()

Unnamed: 0,trip_instance_key,rt_service_min
0,a3647253d4cc8f847e972ed8c83d1b9b,22.62
1,7029f592047be84e5bb1d28d299be35d,16.93
2,1040196034fd380818a2cbcf1eafd9b8,40.95
3,5c6d43026fe5f02e5b31c18fcb8c0bf5,62.95
4,ee2f1fd83d87e85119f66014da5d74d5,14.07


### Update Completeness
#### Break apart?

In [16]:
def trips_by_one_min(vp_usable_df: pd.DataFrame):
    """
    For each trip: count how many rows are associated with each minute
    then tag whether or not a minute has 2+ pings.

    Parameters
    ----------
    vp_usable_df:
        Vehicle positions with at least `location_timestamp_local`,
        `trip_instance_key` and `vp_idx`.

    Returns
    -------
    One row per trip and one-minute bin that contains at least one ping,
    with columns `trip_instance_key`, `number_of_pings_per_minute` and
    `min_w_atleast2_trip_updates` (1 if the minute has 2+ pings, else 0).
    The minute bin itself is dropped from the result.
    """
    subset = ["location_timestamp_local", "trip_instance_key", "vp_idx"]

    # Find number of pings each minute
    pings_per_min = (
        vp_usable_df[subset]
        .groupby(
            [
                "trip_instance_key",
                pd.Grouper(key="location_timestamp_local", freq="1Min"),
            ]
        )
        .vp_idx.count()
        .reset_index()
        .rename(columns={"vp_idx": "number_of_pings_per_minute"})
    )

    # Determine which rows have 2+ pings per minute.
    # A vectorized comparison replaces the row-wise apply: it avoids one
    # Python call per row and still yields a Series when the frame is empty.
    pings_per_min = pings_per_min.assign(
        min_w_atleast2_trip_updates=(
            pings_per_min.number_of_pings_per_minute >= 2
        ).astype("int64")
    )

    return pings_per_min.drop(columns=["location_timestamp_local"])

In [17]:
# map partitions here 
# test = trips_by_min(vp_usable2)

In [18]:
# NOTE(review): map_partitions runs trips_by_one_min on each partition
# separately, so a trip-minute whose pings are split across two partitions
# would show up as two rows here - confirm partitions are aligned to trips.
start = datetime.datetime.now()
print(start)
one_min_ping_df = vp_usable.map_partitions(
    trips_by_one_min,
    meta={
        "trip_instance_key": "object",
        "number_of_pings_per_minute": "int64",
        "min_w_atleast2_trip_updates":"int64"
    },
    align_dataframes=False,
).persist()

end = datetime.datetime.now()
logger.info(f"execution time: {end-start}")

2023-12-13 15:34:20.179837


2023-12-13 15:36:18.805 | INFO     | __main__:<module>:14 - execution time: 0:01:58.625921


In [19]:
len(one_min_ping_df)

5554717

In [20]:
type(one_min_ping_df)

dask.dataframe.core.DataFrame

In [22]:
def update_completeness(df: pd.DataFrame):
    """
    For each trip: find the number of minutes with at least 2 pings,
    the total number of pings, and the number of minutes with at least
    1 ping.

    Parameters
    ----------
    df:
        One row per trip and minute with columns `trip_instance_key`,
        `number_of_pings_per_minute` and `min_w_atleast2_trip_updates`
        (the notebook passes the output of `trips_by_one_min`).
        The input is not modified.

    Returns
    -------
    One row per trip with columns `trip_instance_key`,
    `min_w_atleast2_trip_updates` (sum of the per-minute flag),
    `total_pings_for_trip` (sum of pings) and `total_min_w_gtfs`
    (count of minute rows).
    """
    # A second copy of the ping count is needed so it can be both summed and
    # counted in one dict-style aggregation. `assign` returns a new frame, so
    # the caller's frame does not silently gain a column.
    with_copy = df.assign(total_min_w_gtfs=df.number_of_pings_per_minute)

    per_trip = (
        with_copy.groupby(["trip_instance_key"])
        .agg(
            {
                "min_w_atleast2_trip_updates": "sum",
                "number_of_pings_per_minute": "sum",
                "total_min_w_gtfs": "count",
            }
        )
        .reset_index()
        .rename(
            columns={
                "number_of_pings_per_minute": "total_pings_for_trip",
            }
        )
    )

    return per_trip

In [23]:
start = datetime.datetime.now()
print(start)
update_df = update_completeness(one_min_ping_df)
end = datetime.datetime.now()
logger.info(f"execution time: {end-start}")

2023-12-13 15:36:19.060 | INFO     | __main__:<module>:5 - execution time: 0:00:00.021831


2023-12-13 15:36:19.038506


In [24]:
update_df

Unnamed: 0_level_0,trip_instance_key,min_w_atleast2_trip_updates,total_pings_for_trip,total_min_w_gtfs
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,object,int64,int64,int64
,...,...,...,...


#### Testing new way to find pings per min

In [27]:
m1 = dd.merge(update_df, test, on = ['trip_instance_key'], how = 'inner')

In [29]:
m1 = m1.compute()

In [33]:
m1['ping_per_min'] = (m1.total_pings_for_trip / m1.rt_service_min)

In [53]:
m1.columns

Index(['trip_instance_key', 'min_w_atleast2_trip_updates',
       'total_pings_for_trip', 'total_min_w_gtfs', 'rt_service_min',
       'ping_per_min'],
      dtype='object')

In [36]:
full_df = pd.read_parquet("./scripts/rt_v_schedule_trip_metrics.parquet")

In [54]:
subset = ['trip_instance_key','ping_per_min', 'rt_service_min']

In [55]:
pings_test = pd.merge(m1[subset], full_df[['trip_instance_key', 'avg_pings_per_min', 'rt_service_min']], on = 'trip_instance_key', how = "inner")

In [56]:
pings_test['difference'] = pings_test.ping_per_min - pings_test.avg_pings_per_min

In [57]:
pings_test['difference'].describe()

count   86832.00
mean       -0.08
std         0.23
min        -3.32
25%        -0.04
50%        -0.01
75%         0.01
max         0.23
Name: difference, dtype: float64

In [72]:
pings_test.loc[pings_test.difference < 0].sort_values(by = ['difference'], ascending = True).head(30)

Unnamed: 0,trip_instance_key,ping_per_min,rt_service_min_x,avg_pings_per_min,rt_service_min_y,difference
46063,639c1765029d72e2b622532c8a65f715,1.21,198.6,4.53,198.6,-3.32
40224,5d029dfa67ed16905334dde3576e30e1,0.07,1407.53,2.94,1407.53,-2.87
40090,3771d3e0e42d16ae2dd09818fec8c066,0.05,1382.93,2.92,1382.93,-2.87
40424,93b478657fd87c9021af2679f85e1624,0.12,1389.65,2.98,1389.65,-2.86
3029,7339cc9fdded9595121d2adb2001e24c,0.09,569.3,2.94,569.3,-2.85
40533,b211c326052d4a03971867acb2c6280b,0.14,1290.7,2.97,1290.7,-2.83
42934,1e8778905116b9025d1acd3b440538a3,0.18,184.65,3.0,184.65,-2.82
40655,da80c80744a7adc8a02cfe4f41ee3cb8,0.17,1427.65,2.99,1427.65,-2.82
44891,cabef7484fd0c7a20ebbb0a1ef400d8b,0.1,721.65,2.92,721.65,-2.82
40098,3a3a0a84abd55eb44bb9501e0dfd620a,0.15,1332.6,2.97,1332.6,-2.82


In [77]:
m1.loc[m1.trip_instance_key == "923ff90bea616b5ac4ec909d79503424"]

Unnamed: 0,trip_instance_key,min_w_atleast2_trip_updates,total_pings_for_trip,total_min_w_gtfs,rt_service_min,ping_per_min
244,923ff90bea616b5ac4ec909d79503424,71,213,73,928.35,0.23


In [78]:
one_trip = one_min_ping_df.loc[one_min_ping_df.trip_instance_key == "923ff90bea616b5ac4ec909d79503424"].compute()

In [79]:
one_trip_og = vp_usable.loc[vp_usable.trip_instance_key == "923ff90bea616b5ac4ec909d79503424"].compute()

In [67]:
# one_trip_og

Unnamed: 0,schedule_gtfs_dataset_key,trip_instance_key,location_timestamp_local,x,y,vp_idx,max_time
2520238,d9272b05e39a35ce5f7e774170e94ff1,639c1765029d72e2b622532c8a65f715,2023-11-15 16:57:32,-121.0,37.64,2520238,2023-11-15 16:57:32
2520239,d9272b05e39a35ce5f7e774170e94ff1,639c1765029d72e2b622532c8a65f715,2023-11-15 16:58:17,-121.0,37.64,2520239,2023-11-15 16:58:17
2520240,d9272b05e39a35ce5f7e774170e94ff1,639c1765029d72e2b622532c8a65f715,2023-11-15 16:58:32,-121.0,37.64,2520240,2023-11-15 16:58:32
2520241,d9272b05e39a35ce5f7e774170e94ff1,639c1765029d72e2b622532c8a65f715,2023-11-15 16:58:47,-121.0,37.64,2520241,2023-11-15 16:58:47
2520242,d9272b05e39a35ce5f7e774170e94ff1,639c1765029d72e2b622532c8a65f715,2023-11-15 16:59:02,-121.0,37.64,2520242,2023-11-15 16:59:02
2520243,d9272b05e39a35ce5f7e774170e94ff1,639c1765029d72e2b622532c8a65f715,2023-11-15 16:59:32,-121.0,37.64,2520243,2023-11-15 16:59:32
2520244,d9272b05e39a35ce5f7e774170e94ff1,639c1765029d72e2b622532c8a65f715,2023-11-15 16:59:48,-121.0,37.64,2520244,2023-11-15 16:59:48
2520245,d9272b05e39a35ce5f7e774170e94ff1,639c1765029d72e2b622532c8a65f715,2023-11-15 17:00:03,-121.0,37.64,2520245,2023-11-15 17:00:03
2520246,d9272b05e39a35ce5f7e774170e94ff1,639c1765029d72e2b622532c8a65f715,2023-11-15 17:00:33,-121.0,37.64,2520246,2023-11-15 17:00:33
2520247,d9272b05e39a35ce5f7e774170e94ff1,639c1765029d72e2b622532c8a65f715,2023-11-15 17:00:48,-121.0,37.64,2520247,2023-11-15 17:00:48


In [71]:
 # one_trip

In [80]:
one_trip.number_of_pings_per_minute.mean()

2.9178082191780823

In [81]:
# NOTE(review): 198.6 is the rt_service_min displayed above for trip
# 639c1765029d72e2b622532c8a65f715, while `one_trip` was filtered to trip
# 923ff90bea616b5ac4ec909d79503424 (rt_service_min 928.35 in the m1 output) -
# confirm the divisor matches the trip being inspected.
one_trip.number_of_pings_per_minute.sum()/198.6

1.0725075528700907

### Spatial Accuracy
* Addresses "<i>in next draft, work on grouping functions that belong together, such as this one. total_counts and total_counts_by_trip sound basically equivalent, and they are nearly doing the same thing, except total_counts actually creates 2 columns. work on logically grouping or absorbing functions or rewriting functions so the same function can now be used twice.
Adapt this function to be used twice
Compare it to this to find where they have stuff in common and which part should be removed from the generic function</i>"

In [None]:
def grab_shape_keys_in_vp(vp_usable: dd.DataFrame, analysis_date: str) -> pd.DataFrame:
    """
    Build a trip_instance_key -> shape_array_key crosswalk for the trips
    that appear in the vehicle positions.

    The distinct trip keys found in `vp_usable` are inner-merged against
    the scheduled trips for `analysis_date`, so the result holds only the
    two key columns and only trips present on both sides.
    """
    scheduled_trips = helpers.import_scheduled_trips(
        analysis_date,
        columns=["trip_instance_key", "shape_array_key"],
        get_pandas=True,
    )

    # Distinct trips seen in vp
    trip_keys = (
        vp_usable[["trip_instance_key"]].drop_duplicates().reset_index(drop=True)
    )

    # Only one row per trip/shape
    return dd.merge(trip_keys, scheduled_trips, on="trip_instance_key", how="inner")

### Why is September not working?


In [None]:
date2= '2023-09-13'

In [None]:
vp_usable = load_vp_usable(date2)
# Filter for now
vp_usable2 = vp_usable.loc[vp_usable.schedule_gtfs_dataset_key == gtfs_key].reset_index(
    drop=True
)

In [None]:
vp_usable2 = vp_usable2.compute()

In [None]:
vp_usable2.schedule_gtfs_dataset_key.unique()

In [None]:
test_keys = vp_usable2[['trip_instance_key']].drop_duplicates()

In [None]:
test_keys.shape

In [None]:
test_keys.head()

In [None]:
 trips_with_shape = helpers.import_scheduled_trips(
        date2,
        columns=["trip_instance_key", "shape_array_key"],
        get_pandas=True,
    )

In [None]:
m1 = pd.merge(test_keys, trips_with_shape, on="trip_instance_key", how="inner")

In [None]:
m1.shape

In [None]:
shapes_df = grab_shape_keys_in_vp(vp_usable2, date2)

In [None]:
type(shapes_df)

In [None]:
shapes_df.shape

In [None]:
subset = shapes_df.shape_array_key.unique().tolist()

In [None]:
subset

In [None]:
def buffer_shapes(
    trips_with_shape: pd.DataFrame,
    analysis_date: str,
    buffer_meters: int = 35,
):
    """
    Filter scheduled shapes down to the shapes that appear in vp.
    Buffer these.

    Attach the shape geometry for a subset of shapes or trips.

    Parameters
    ----------
    trips_with_shape:
        Crosswalk with `trip_instance_key` and `shape_array_key`.
        Either a pandas or a dask DataFrame is accepted.
    analysis_date:
        Date passed to `helpers.import_scheduled_shapes`.
    buffer_meters:
        Distance handed to `geometry.buffer`.
        # assumes PROJECT_CRS is in meters - TODO confirm

    Returns
    -------
    The buffered shapes inner-merged with `trips_with_shape` on
    `shape_array_key`, computed into memory when the merge is lazy.
    """
    # `.unique()` is lazy for a dask Series but already concrete for a
    # pandas Series, so only call `.compute()` when it exists.
    shape_keys = trips_with_shape.shape_array_key.unique()
    if hasattr(shape_keys, "compute"):
        shape_keys = shape_keys.compute()
    subset = shape_keys.tolist()

    # Only read the shapes we need; the import projects them to PROJECT_CRS
    shapes = helpers.import_scheduled_shapes(
        analysis_date,
        columns=["shape_array_key", "geometry"],
        filters=[[("shape_array_key", "in", subset)]],
        crs=PROJECT_CRS,
        get_pandas=False,
    ).pipe(helpers.remove_shapes_outside_ca)

    # Buffer each shape
    shapes = shapes.assign(geometry=shapes.geometry.buffer(buffer_meters))

    trips_with_shape_geom = dd.merge(
        shapes, trips_with_shape, on="shape_array_key", how="inner"
    )

    # Materialize only if the merge result is still lazy
    if hasattr(trips_with_shape_geom, "compute"):
        trips_with_shape_geom = trips_with_shape_geom.compute()
    return trips_with_shape_geom

In [None]:
from segment_speed_utils.project_vars import COMPILED_CACHED_VIEWS

In [None]:
COMPILED_CACHED_VIEWS

In [None]:
FILE = f"{COMPILED_CACHED_VIEWS}routelines_{date2}.parquet"

In [None]:
FILE

In [None]:
# gs://calitp-analytics-data/data-analyses/rt_delay/compiled_cached_views/routelines_2023-09-13.parquet

In [None]:
shapes = helpers.import_scheduled_shapes(
        date2,
        columns=["shape_array_key", "geometry"],
        crs=PROJECT_CRS,
        get_pandas=False,
    ).pipe(helpers.remove_shapes_outside_ca)

In [None]:
import dask_geopandas as dg

In [None]:
test = dg.read_parquet(FILE)

In [None]:
test2 = gpd.read_parquet(FILE)

In [None]:
test2.columns

In [None]:
buffer_df = buffer_shapes(shapes_df, date2, 35)

In [None]:
def vp_in_shape(
    vp_usable: dd.DataFrame, trips_with_buffered_shape: gpd.GeoDataFrame
) -> gpd.GeoDataFrame:
    """
    Flag each vehicle position as inside or outside its trip's buffered shape.

    The vp rows are converted to geometries with `wrangle_shapes.vp_as_gdf`,
    inner-merged with the buffered shapes on `trip_instance_key`, and each
    vp geometry is tested with `within` against its shape geometry.

    Returns `trip_instance_key`, `location_timestamp_local` and the
    boolean `is_within`.
    """
    vp_points = wrangle_shapes.vp_as_gdf(
        vp_usable[["trip_instance_key", "x", "y", "location_timestamp_local"]]
    )

    # After the merge the vp geometry is geometry_x and the shape is geometry_y
    paired = pd.merge(
        vp_points, trips_with_buffered_shape, on="trip_instance_key", how="inner"
    )
    paired = paired.assign(is_within=paired.geometry_x.within(paired.geometry_y))

    return paired[["trip_instance_key", "location_timestamp_local", "is_within"]]

In [None]:
start = datetime.datetime.now()
print(start)
spatial_accuracy_df1 = vp_usable.map_partitions(
    vp_in_shape,
    buffer_df,
    meta={
        "trip_instance_key": "object",
        "location_timestamp_local": "datetime64[ns]",
        "is_within": "bool",
    },
    align_dataframes=False,
).persist()

end = datetime.datetime.now()
logger.info(f"execution time: {end-start}")

#### Adapt this to be used in multiple places

In [None]:
def total_vp_counts_by_trip(vp: gpd.GeoDataFrame, new_col_title: str) -> pd.DataFrame:
    """
    Count vehicle positions per trip.

    Counts the `location_timestamp_local` values for every
    `trip_instance_key` in `vp` and returns them in a column named
    `new_col_title`, so the same helper can count all vps or only a
    pre-filtered subset.
    """
    by_trip = vp.groupby("trip_instance_key", observed=True, group_keys=False)
    counts = by_trip.agg({"location_timestamp_local": "count"})

    return counts.reset_index().rename(
        columns={"location_timestamp_local": new_col_title}
    )

In [None]:
def total_counts(result: dd.DataFrame):
    """
    For each trip: count all vps and the vps that fall within the shape.

    Uses `total_vp_counts_by_trip` twice, once on every row and once on
    the rows where `is_within` is True, then left-merges the two counts
    so trips with no vp inside the shape get 0.

    Returns `trip_instance_key`, `total_vp` and `vp_in_shape`
    (both counts as int32).
    """
    # Every vp recorded for the trip
    all_vp = total_vp_counts_by_trip(result, "total_vp")

    # Only the vps that actually fall within the buffered shape
    inside = result.loc[result.is_within.eq(True)].reset_index(drop=True)
    inside = inside[["trip_instance_key", "location_timestamp_local"]]
    inside_vp = total_vp_counts_by_trip(inside, "vp_in_shape")

    # Left merge keeps trips that have no vp inside the shape
    merged = pd.merge(all_vp, inside_vp, on="trip_instance_key", how="left")

    return merged.assign(
        vp_in_shape=merged.vp_in_shape.fillna(0).astype("int32"),
        total_vp=merged.total_vp.fillna(0).astype("int32"),
    )

In [None]:
# NOTE(review): total_counts is applied to each partition on its own, and no
# re-aggregation follows; a trip whose rows sit in more than one partition
# would end up with several rows in spatial_accuracy_df2 - confirm.
start = datetime.datetime.now()
print(start)
spatial_accuracy_df2 = spatial_accuracy_df1.map_partitions(
    total_counts,
    meta={"trip_instance_key": "object", "total_vp": "int32", "vp_in_shape": "int32"},
    align_dataframes=False,
).persist()

end = datetime.datetime.now()
logger.info(f"execution time: {end-start}")

In [None]:
spatial_accuracy_df2 = spatial_accuracy_df2.compute()

In [None]:
spatial_accuracy_df2.head()

In [None]:
# NOTE(review): `update` and `total_trip_time_df` are never assigned in the
# visible notebook (the only `total_trip_time_df` reference is commented out;
# the completeness result is named `update_df`). This cell presumably fails on
# a fresh Restart & Run All - confirm the intended variable names.
type(update)

test_m = (total_trip_time_df.merge(update, on = "trip_instance_key", how = "outer")
         .merge(spatial_accuracy_df2, on ="trip_instance_key", how = "outer")) 

### Read back in the file 12/11

In [None]:
full_df = pd.read_parquet("./scripts/rt_v_schedule_trip_metrics.parquet")

In [None]:
full_df.sample()

In [None]:
full_df.info()

In [None]:
full_df.trip_instance_key.nunique(), len(full_df)