# Migrate to VP Usable
* https://github.com/cal-itp/data-analyses/issues/936
* cd rt_segment_speeds && pip install -r requirements.txt && cd ..
    * https://github.com/cal-itp/data-analyses/blob/main/Makefile#L49C2-L49C66
    

In [1]:
import datetime
import dask.dataframe as dd
import dask_geopandas as dg
import dask
import geopandas as gpd
import pandas as pd
from scripts import vp_spatial_accuracy
from segment_speed_utils import helpers
from calitp_data_analysis.geography_utils import WGS84
from segment_speed_utils.project_vars import (
    PROJECT_CRS,
    SEGMENT_GCS,
    analysis_date,
    GCS_FILE_PATH,
    COMPILED_CACHED_VIEWS,
    RT_SCHED_GCS,
    CONFIG_PATH
)

# For speeds c2
from typing import Literal
import numpy as np

# for speeds c3
# also import numpy 
from shared_utils.rt_utils import MPH_PER_MPS
from calitp_data_analysis import utils
from shared_utils import portfolio_utils, schedule_rt_utils
from segment_speed_utils import helpers, sched_rt_utils, wrangle_shapes, segment_calcs

In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
# calitp-analytics-data/data-analyses/rt_segment_speeds/vp_usable_2023-10-11
operator = 'Bay Area 511 Muni VehiclePositions'
gtfs_key = '7cc0cb1871dfd558f11a2885c145d144'

## Spatial Accuracy
* Based on https://github.com/cal-itp/data-analyses/blob/main/rt_scheduled_v_ran/scripts/vp_spatial_accuracy.py
### Grab_shape_keys_in_vp
#### First time reading `vp_usable`

In [4]:
def grab_shape_keys_in_vp(analysis_date: str) -> pd.DataFrame:
    """
    Subset raw vp and find unique trip_instance_keys.
    Create crosswalk to link trip_instance_key to shape_array_key.
    """
    vp_trip_df  = pd.read_parquet(f"{SEGMENT_GCS}vp_usable_{analysis_date}",
        filters = [[('gtfs_dataset_name', "==", operator),
                   ('schedule_gtfs_dataset_key', '==', gtfs_key)]],
        columns = ['trip_instance_key'])
    
    vp_trip_df  = vp_trip_df.drop_duplicates().reset_index(drop = True)
    
    # Make sure we have a shape geometry too
    # otherwise map_partitions will throw error
    shapes = pd.read_parquet(
        f"{COMPILED_CACHED_VIEWS}routelines_{analysis_date}.parquet",
        columns = ["shape_array_key"],
    ).dropna().drop_duplicates()
    
    trips_with_shape = helpers.import_scheduled_trips(
        analysis_date,
        columns = ["trip_instance_key", "shape_array_key"],
        get_pandas = True
    ).merge(
        shapes,
        on = "shape_array_key",
        how = "inner"
    ).merge(
        vp_trip_df,
        on = "trip_instance_key",
        how = "inner"
    ).drop_duplicates().dropna().reset_index(drop=True)

    return trips_with_shape

In [5]:
trips_with_shape = grab_shape_keys_in_vp(analysis_date)

In [6]:
trips_with_shape.shape

(9240, 2)

In [7]:
trips_with_shape.head()

Unnamed: 0,trip_instance_key,shape_array_key
0,12de3d260e9fe09fa878cb4cdb2d6898,749b225ca6691f77914e88577dc13e68
1,934aea5748bce830ffc2fa88dc01402a,749b225ca6691f77914e88577dc13e68
2,3d06fa8e68e4f38d3ccc7adfabb5c3d9,f1a7410fae06937b7183f6a553707915
3,34109ad8cfeca83cd459d42c7d51d602,f1a7410fae06937b7183f6a553707915
4,34acd907ae9d5eb5456d683d2458bbc6,f1a7410fae06937b7183f6a553707915


### Buffer shapes

In [8]:
# This is trips_with_shape_geom
trips_with_shape_geom =  vp_spatial_accuracy.buffer_shapes(analysis_date, 
                                trips_with_shape,
                                35)

In [9]:
trips_with_shape_geom.shape

(9240, 3)

#### Second time reading in the same file, streamline

In [10]:
vp = dd.read_parquet(f"{SEGMENT_GCS}vp_usable_{analysis_date}",
        filters = [[('gtfs_dataset_name', "==", operator),
                   ('schedule_gtfs_dataset_key', '==', gtfs_key)]])

In [11]:
vp[['trip_instance_key']].compute().nunique()

trip_instance_key    9240
dtype: int64

### Full function

In [12]:
def merge_vp_with_shape_and_count(
    vp: dd.DataFrame,
    trips_with_shape_geom: gpd.GeoDataFrame
) -> gpd.GeoDataFrame:
    """
    Merge vp with crosswalk and buffered shapes.
    Get vp count totals and vp within shape.
    """
    vp_gdf = gpd.GeoDataFrame(
        vp,
        geometry = gpd.points_from_xy(vp.x, vp.y),
        crs = WGS84
    ).to_crs(PROJECT_CRS)
    
    vp_gdf = vp_gdf.rename(columns = {
    0:'gtfs_dataset_name',
    1: 'schedule_gtfs_dataset_key', 
    2:'trip_id',
    3:'trip_instance_key',
    4: 'location_timestamp', 
    5: 'location_timestamp_local',
    6:'x',
    7:'y', 
    8:'vp_idx', 
    9:'gtfs_dataset_key', 
    10:'vp_dir_xnorm', 
    11:'vp_dir_ynorm',
    12:'vp_primary_direction'})
    
    vp2 = pd.merge(
        vp_gdf,
        trips_with_shape_geom,
        on = "trip_instance_key",
        how = "inner"
    ).reset_index(drop=True)
    
    total_vp = vp_spatial_accuracy.total_vp_counts_by_trip(vp2)
    
    vp2 = vp2.assign(
        is_within = vp2.geometry_x.within(vp2.geometry_y)
    ).query('is_within==True')
    
    vps_in_shape = (vp2.groupby("trip_instance_key", 
                                observed = True, group_keys = False)
                    .agg({"location_timestamp_local": "count"})
                    .reset_index()
                    .rename(columns = {"location_timestamp_local": "vp_in_shape"})
                   )
        
    count_df = pd.merge(
        total_vp,
        vps_in_shape,
        on = "trip_instance_key",
        how = "left"
    )
    
    count_df = count_df.assign(
        vp_in_shape = count_df.vp_in_shape.fillna(0).astype("int32"),
        total_vp = count_df.total_vp.fillna(0).astype("int32")
    )
    
    return count_df

In [13]:
muni = merge_vp_with_shape_and_count(vp, trips_with_shape_geom)

In [14]:
muni.vp_in_shape.describe()

count   9240.00
mean     161.87
std       64.06
min        0.00
25%      118.00
50%      167.00
75%      207.00
max      481.00
Name: vp_in_shape, dtype: float64

In [15]:
muni.trip_instance_key.nunique()

9240

#### Add %?

In [16]:
(muni.vp_in_shape/muni.total_vp).describe()

count   9240.00
mean       0.93
std        0.14
min        0.00
25%        0.94
50%        1.00
75%        1.00
max        1.00
dtype: float64

In [17]:
muni.loc[muni.vp_in_shape == 0]

Unnamed: 0,trip_instance_key,total_vp,vp_in_shape
167,04ae6ea9655473bdcaf9b80d443558ae,111,0
351,0a1553b7a738967a422b7b8960560ded,75,0
408,0bc53a1063efae8a32eca4a9034d2a21,61,0
1128,1fe28ce8669a7f946f272faf80d80ce7,70,0
1981,37aed2e5ad125f8aede20922b38bf6b9,78,0
3022,54e54e2839a6945b1005322398a89ae9,132,0
3127,580684a7a6b43ce5130f60a441681713,46,0
3213,5a82f8c02f2036db77b1979f4ae228f5,41,0
3861,6cbdc2d321ca63df78bce01eeb06fe07,47,0
3937,6ee4a9165f47b59cb3e10b0125dd72e8,34,0


In [18]:
vp[['trip_instance_key']].compute().nunique()

trip_instance_key    9240
dtype: int64

## Update Completeness
* https://github.com/cal-itp/data-analyses/blob/main/rt_predictions/01_update_completeness.ipynb

### Translate to `vp_usable`

In [19]:
# Only use the trips with shapes.
relevant_trips = list(muni.trip_instance_key.unique())

#### Third time reading in `vp_usable`

In [20]:
completeness_cols = ['vp_idx',
      'location_timestamp_local', 'trip_instance_key',
      'gtfs_dataset_key']

In [21]:
vp_filtered = dd.read_parquet(f"{SEGMENT_GCS}vp_usable_{analysis_date}",
        columns = completeness_cols, 
        filters = [[('gtfs_dataset_name', "==", operator),
                   ('schedule_gtfs_dataset_key', '==', gtfs_key),
                   ('trip_instance_key', 'in', relevant_trips)]])

In [22]:
fresh_updates = vp_filtered.sort_values(['vp_idx']).reset_index(drop = True)

#### Can't use dask for this type of groupby 
* Also grouping only by `trip_instance_key` yields the best result..unsure if that's ok

In [23]:
two_cols = ['trip_instance_key','gtfs_dataset_key']

In [24]:
fresh_updates_df = fresh_updates.compute()

In [25]:
# fresh_updates_df = fresh_updates_df.assign(fresh = 1)

In [26]:
""" DOES NOT WORK
test_group1 = (fresh_updates_df.groupby([
        *two_cols, 
         pd.Grouper(key = 'location_timestamp_local', freq = '1Min')
    ])
    .count()).reset_index() """

" DOES NOT WORK\ntest_group1 = (fresh_updates_df.groupby([\n        *two_cols, \n         pd.Grouper(key = 'location_timestamp_local', freq = '1Min')\n    ])\n    .count()).reset_index() "

In [27]:
test_group2 = (fresh_updates_df.groupby([
        *['trip_instance_key'], 
         pd.Grouper(key = 'location_timestamp_local', freq = '1Min')
    ])
    .count()
    .reset_index()
    .rename(columns = {'vp_idx':'number_of_pings_per_minute'})
        )

### Checks

In [28]:
key1 = "38247cbee93b6f85d58bf1812ae553b9"

In [29]:
key2 = "fac53ed1db7d914cc4c1857e967344f4"

In [30]:
key3 = "9e291393bf06763fb6c6fe950d6e8097"

In [31]:
# test_group2.loc[test_group2.trip_instance_key == key1]

In [32]:
# fresh_updates_df.loc[fresh_updates_df.trip_instance_key == key1]

In [33]:
# test_group2.loc[test_group2.trip_instance_key == key2]

In [34]:
# fresh_updates_df.loc[fresh_updates_df.trip_instance_key == key2]

In [35]:
# test_group2.loc[test_group2.trip_instance_key == key3]

In [36]:
# fresh_updates_df.loc[fresh_updates_df.trip_instance_key == key3]

In [37]:
# (test_group2.fresh - test_group2.number_of_pings_per_minute).describe()

In [38]:
# test_group2.fresh.describe()

In [39]:
test_group2 = test_group2.assign(
        atleast2_trip_updates = test_group2.apply(
            lambda x: 1 if x["number_of_pings_per_minute"] >= 2
            else 0, axis=1)
    )    
    

In [40]:
test_group2.atleast2_trip_updates.value_counts()

1    534886
0      6795
Name: atleast2_trip_updates, dtype: int64

### Trip minutes is wrong
* Check w/ 38247cbee93b6f85d58bf1812ae553b9
* Began at 16:01:00, end at 17:00:000
* Skips having data: jumps from 4:38 to 4:54

In [41]:
""" DOES NOT WORK 
test_group2_grouped = (test_group2.groupby(two_cols)
           .agg({
               "location_timestamp_local": "size",
               "atleast2_trip_updates": "sum"})
           .reset_index()
          ).rename(columns = {
            "location_timestamp_local": "trip_min_elapsed"
    })
  """  

' DOES NOT WORK \ntest_group2_grouped = (test_group2.groupby(two_cols)\n           .agg({\n               "location_timestamp_local": "size",\n               "atleast2_trip_updates": "sum"})\n           .reset_index()\n          ).rename(columns = {\n            "location_timestamp_local": "trip_min_elapsed"\n    })\n  '

In [42]:
# test_group2_grouped.loc[test_group2_grouped.trip_instance_key == key1]

In [43]:
test_group2['max_time'] = test_group2.location_timestamp_local

In [44]:
test_group3 = (test_group2
                        .groupby(['trip_instance_key'])
                        .agg({'location_timestamp_local':'min','max_time':'max', 
                             'atleast2_trip_updates':'sum'})
                        .reset_index()
                        .rename(columns = {'location_timestamp_local':'min_time'})
                       )

In [45]:
test_group3['trip_min_elapsed'] = (test_group3.max_time - test_group3.min_time) / pd.Timedelta(minutes=1)

In [46]:
test_group3.loc[test_group3.trip_instance_key == key1]

Unnamed: 0,trip_instance_key,min_time,max_time,atleast2_trip_updates,trip_min_elapsed
2002,38247cbee93b6f85d58bf1812ae553b9,2023-10-11 16:01:00,2023-10-11 17:00:00,45,59.0


In [47]:
# test_group2.loc[test_group2.trip_instance_key == key1]

In [48]:

# fresh_updates_df.loc[fresh_updates_df.trip_instance_key == key1]

In [49]:

fresh_updates_df.loc[fresh_updates_df.trip_instance_key == key1].shape

(133, 4)

In [50]:
test_group3 = test_group3.assign(
        pct_update_complete = test_group3.atleast2_trip_updates.divide(
            test_group3.trip_min_elapsed)
    ) 

### Some trips experiencing pct-update-complete greater than 100%
* They have more rows than trip_min_elapsed

In [51]:
len(test_group3.loc[test_group3.pct_update_complete > 1])

3701

In [52]:
len(test_group3)

9240

In [53]:
test_group3.pct_update_complete.describe()

count   9240.00
mean       1.00
std        0.06
min        0.02
25%        1.00
50%        1.00
75%        1.02
max        1.10
Name: pct_update_complete, dtype: float64

In [54]:
test_group3.loc[test_group3.pct_update_complete == 1.1].sample()

Unnamed: 0,trip_instance_key,min_time,max_time,atleast2_trip_updates,trip_min_elapsed,pct_update_complete
1972,37622040815e89d063272bb6e37acc65,2023-10-11 05:31:00,2023-10-11 05:41:00,11,10.0,1.1


#### How many cols have larger atleast2_trip_updates compared to trip-min_elapsed

In [55]:
test_group3['larger'] =test_group3.trip_min_elapsed -  test_group3.atleast2_trip_updates 

In [56]:
test_group3.larger.describe()

count   9240.00
mean       0.29
std        3.82
min       -1.00
25%       -1.00
50%        0.00
75%        0.00
max      105.00
Name: larger, dtype: float64

#### One trip only recorded 2+ pings per minute in 36% of its duration

In [57]:
test_group3.loc[test_group3.larger == 105].sample()

Unnamed: 0,trip_instance_key,min_time,max_time,atleast2_trip_updates,trip_min_elapsed,pct_update_complete,larger
3942,6ef4805f0104b95614b86a2b1c374d23,2023-10-11 17:48:00,2023-10-11 20:31:00,58,163.0,0.36,105.0


In [58]:
len(fresh_updates_df.loc[fresh_updates_df.trip_instance_key == "6ef4805f0104b95614b86a2b1c374d23"])

176

* Minutes skipped: 6:12 to 6:18
* 6:19-6:26
* 6:28-7:33 etc etc
* Trip started at 5:48, ended at 8:31 

In [59]:
# test_group2.loc[test_group2.trip_instance_key == "6ef4805f0104b95614b86a2b1c374d23"]

In [60]:
# fresh_updates_df.loc[fresh_updates_df.trip_instance_key == "6ef4805f0104b95614b86a2b1c374d23"]

In [61]:
def mask_overly_large_min(row):
    if row.atleast2_trip_updates > row.trip_min_elapsed:
        return row.trip_min_elapsed
    else:
        return row.atleast2_trip_updates

In [62]:
# Apply the function: if the atleast2 trip updates is larger than trip_min_elapsed, mask it
# Or maybe just mask the pct-update-complete? 
test_group3["test_mask"] = test_group3.apply(mask_overly_large_min, axis=1)

In [63]:
test_group3 = test_group3.assign(
        pct_update_complete2 = test_group3.test_mask.divide(
            test_group3.trip_min_elapsed)
    ) 

In [64]:
test_group3.loc[test_group3.larger == -1].sample(2)

Unnamed: 0,trip_instance_key,min_time,max_time,atleast2_trip_updates,trip_min_elapsed,pct_update_complete,larger,test_mask,pct_update_complete2
3595,655e1bd4e33d6856c42f1e9277cbe24a,2023-10-11 15:04:00,2023-10-11 16:24:00,81,80.0,1.01,-1.0,80.0,1.0
2903,51cba90b3f79130b0928ec44ca05df56,2023-10-11 06:20:00,2023-10-11 07:31:00,72,71.0,1.01,-1.0,71.0,1.0


#### 37622040815e89d063272bb6e37acc65
* The trip is ten minutes but there are eleven rows.

In [65]:
test_group3.loc[test_group3.trip_instance_key == "37622040815e89d063272bb6e37acc65"]

Unnamed: 0,trip_instance_key,min_time,max_time,atleast2_trip_updates,trip_min_elapsed,pct_update_complete,larger,test_mask,pct_update_complete2
1972,37622040815e89d063272bb6e37acc65,2023-10-11 05:31:00,2023-10-11 05:41:00,11,10.0,1.1,-1.0,10.0,1.0


In [66]:
len(test_group3.loc[test_group3.trip_instance_key == "37622040815e89d063272bb6e37acc65"])

1

In [67]:
test_group3.loc[test_group3.trip_instance_key == "37622040815e89d063272bb6e37acc65"]

Unnamed: 0,trip_instance_key,min_time,max_time,atleast2_trip_updates,trip_min_elapsed,pct_update_complete,larger,test_mask,pct_update_complete2
1972,37622040815e89d063272bb6e37acc65,2023-10-11 05:31:00,2023-10-11 05:41:00,11,10.0,1.1,-1.0,10.0,1.0


#### 568caf4acf76125fb5db063f8737e5a8
* Trip is 73 minutes

In [68]:
test_group3.loc[test_group3.trip_instance_key == "568caf4acf76125fb5db063f8737e5a8"]

Unnamed: 0,trip_instance_key,min_time,max_time,atleast2_trip_updates,trip_min_elapsed,pct_update_complete,larger,test_mask,pct_update_complete2
3077,568caf4acf76125fb5db063f8737e5a8,2023-10-11 08:32:00,2023-10-11 09:45:00,74,73.0,1.01,-1.0,73.0,1.0


In [69]:
len(test_group2.loc[test_group2.trip_instance_key == "568caf4acf76125fb5db063f8737e5a8"])

74

In [70]:
len(fresh_updates_df.loc[fresh_updates_df.trip_instance_key == "568caf4acf76125fb5db063f8737e5a8"])

221

In [71]:
len(test_group3)

9240

In [72]:
test_group3.sample(3)

Unnamed: 0,trip_instance_key,min_time,max_time,atleast2_trip_updates,trip_min_elapsed,pct_update_complete,larger,test_mask,pct_update_complete2
2363,420d3fc4c87a397016e3559828ff2ed8,2023-10-11 21:03:00,2023-10-11 22:19:00,77,76.0,1.01,-1.0,76.0,1.0
8065,e0845beb78a4bed27356dfdc24b4ff0d,2023-10-11 05:30:00,2023-10-11 06:27:00,58,57.0,1.02,-1.0,57.0,1.0
7348,cce9e1f6d83123dc73b090e86abb417f,2023-10-11 16:41:00,2023-10-11 17:36:00,55,55.0,1.0,0.0,55.0,1.0


## How many minutes a trip took and the average speeds?
* Temporary place until I move the spatial stuff out

In [73]:
relevant_trips = list(test_group3.trip_instance_key.unique())

### C2 
* https://github.com/cal-itp/data-analyses/blob/metrics_rt/rt_segment_speeds/scripts/C2_triangulate_vp.py
* Break it apart and check it out later to undersatnd what's happening
* No need to `subset_usable_vp` because already filtered above.

In [74]:
def triangulate_vp(
    ddf: dd.DataFrame, 
    group_cols: list = ["trip_instance_key"]
) -> np.ndarray:
    """
    Grab a sample of vehicle positions for each trip to triangulate distance.
    These vp already sjoined onto the shape.
    Roughly pick vp at equally spaced intervals.
    
    Dask aggregation can't group and use lambda to create list of possible 
    vp_idx.
    """        
    grouped_ddf = ddf.groupby(group_cols, observed=True, group_keys=False)

    min_df = (grouped_ddf
              .agg({"vp_idx": "min"})
              .rename(columns = {"vp_idx": "min_vp_idx"})
             )

    max_df = (grouped_ddf
              .agg({"vp_idx": "max"})
              .rename(columns = {"vp_idx": "max_vp_idx"})
             )
    
    vp_range = dd.merge(
        min_df,
        max_df,
        left_index = True,
        right_index = True,
        how = "inner"
    )

    vp_range = vp_range.persist()
    
    vp_range["range_diff"] = vp_range.max_vp_idx - vp_range.min_vp_idx
    
    vp_range = vp_range.assign(
        p25_vp_idx = (vp_range.range_diff * 0.25 + vp_range.min_vp_idx
                     ).round(0).astype("int64"),
        p50_vp_idx = (vp_range.range_diff * 0.5 + vp_range.min_vp_idx
                     ).round(0).astype("int64"),
        p75_vp_idx = (vp_range.range_diff * 0.75 + vp_range.min_vp_idx
                     ).round(0).astype("int64"),
    )
    
    vp_idx_cols = [
        "min_vp_idx", 
        "p25_vp_idx",
        "p50_vp_idx", 
        "p75_vp_idx",
        "max_vp_idx"
    ]

    results = vp_range[vp_idx_cols].compute().to_numpy().flatten()    
    
    results = list(results)
    return results

#### Help: which df should I use to triangulate?
* Trips in which 0 of the points fall into the shapes should be excluded?

In [75]:
len(vp_filtered)

1604702

In [76]:
triangulate_muni = triangulate_vp(vp_filtered, 'trip_instance_key')

In [77]:
type(triangulate_muni)

list

In [78]:
vp_filtered[['trip_instance_key']].compute().nunique()

trip_instance_key    9240
dtype: int64

#### 4th time loading `vp_usable`

In [79]:
vp_results = dd.read_parquet(f"{SEGMENT_GCS}vp_usable_{analysis_date}",
        columns = [
            "gtfs_dataset_key", "trip_instance_key",
            "location_timestamp_local",
            "x", "y", "vp_idx"],
        filters = [[('gtfs_dataset_name', "==", operator),
                   ('schedule_gtfs_dataset_key', '==', gtfs_key),
                    ('vp_idx', 'in', triangulate_muni)]]).compute()

In [80]:
len(vp_results)

46198

In [81]:
vp_results.trip_instance_key.nunique()

9240

In [193]:
vp_results.sample()

Unnamed: 0,gtfs_dataset_key,trip_instance_key,location_timestamp_local,x,y,vp_idx
12346358,c0e3039da063db95ebabd3fe4ee611a4,82ffe12e667c8afee620b19d9166dc2b,2023-10-11 19:14:36,-122.45,37.72,12346358


In [82]:
def merge_rt_scheduled_trips(
    rt_trips: dd.DataFrame,
    analysis_date: str,
    group_cols: list = ["trip_instance_key"]) -> dd.DataFrame:
    """
    Merge RT trips (vehicle positions) to scheduled trips 
    to get the shape_array_key.
    Don't pull other scheduled trip columns now, wait until
    after aggregation is done.
    """
    trips = helpers.import_scheduled_trips(
        analysis_date,
        columns = group_cols + ["shape_array_key"],
        get_pandas = True
    )
        
    df = dd.merge(
        rt_trips,
        trips,
        on = group_cols,
        how = "left",
    )
    
    return df

In [83]:
# https://github.com/cal-itp/data-analyses/blob/metrics_rt/rt_segment_speeds/scripts/C2_triangulate_vp.py#L170-L180
vp_with_sched = (
        merge_rt_scheduled_trips(
            vp_results, 
            analysis_date, 
            group_cols = ["trip_instance_key"]
        ).sort_values("vp_idx")
        .reset_index(drop=True)
    )

In [84]:
type(vp_with_sched)

pandas.core.frame.DataFrame

In [85]:
# Check triangualte worked 
triangulate_check = vp_with_sched.groupby(['trip_instance_key']).agg({'location_timestamp_local':'nunique'}).reset_index()

In [86]:
triangulate_check.loc[triangulate_check.location_timestamp_local == 3]

Unnamed: 0,trip_instance_key,location_timestamp_local
6824,be7a580802c3ec183904cac37e6c0afd,3


In [87]:
triangulate_check.location_timestamp_local.describe()

count   9240.00
mean       5.00
std        0.02
min        3.00
25%        5.00
50%        5.00
75%        5.00
max        5.00
Name: location_timestamp_local, dtype: float64

In [88]:
triangulate_check.location_timestamp_local.value_counts()

5    9239
3       1
Name: location_timestamp_local, dtype: int64

In [89]:
vp_with_sched.loc[vp_with_sched.trip_instance_key == "be7a580802c3ec183904cac37e6c0afd"]

Unnamed: 0,gtfs_dataset_key,trip_instance_key,location_timestamp_local,x,y,vp_idx,shape_array_key
31720,c0e3039da063db95ebabd3fe4ee611a4,be7a580802c3ec183904cac37e6c0afd,2023-10-11 16:13:08,-122.45,37.72,12506123,a18dde89fa6a4c568d23bda75c68f6cf
31721,c0e3039da063db95ebabd3fe4ee611a4,be7a580802c3ec183904cac37e6c0afd,2023-10-11 17:01:19,-122.47,37.74,12506124,a18dde89fa6a4c568d23bda75c68f6cf
31722,c0e3039da063db95ebabd3fe4ee611a4,be7a580802c3ec183904cac37e6c0afd,2023-10-11 17:01:35,-122.47,37.74,12506125,a18dde89fa6a4c568d23bda75c68f6cf


### C3
* https://github.com/cal-itp/data-analyses/blob/metrics_rt/rt_segment_speeds/scripts/C3_trip_route_speed.py

In [90]:
# https://github.com/cal-itp/data-analyses/blob/metrics_rt/rt_segment_speeds/scripts/C3_trip_route_speed.py#L262-L265
vp_c3 = gpd.GeoDataFrame(
        vp_with_sched,
        geometry = gpd.points_from_xy(vp_with_sched.x, vp_with_sched.y, crs=WGS84)
    ).to_crs(PROJECT_CRS).drop(columns = ["x", "y"])

In [91]:
vp_c3.trip_instance_key.nunique()

9240

In [92]:
shapes_list = vp_c3.shape_array_key.unique().tolist()

In [93]:
shapes = helpers.import_scheduled_shapes(
        analysis_date,
        columns = ["shape_array_key","geometry"],
        filters = [[("shape_array_key", "in", shapes_list)]],
        get_pandas = True,
        crs = PROJECT_CRS
    )

In [94]:
shapes.shape

(255, 2)

In [95]:
type(shapes)

geopandas.geodataframe.GeoDataFrame

In [96]:
# shapes.explore('shape_array_key')

In [97]:
# https://github.com/cal-itp/data-analyses/blob/metrics_rt/rt_segment_speeds/scripts/C3_trip_route_speed.py#L280-L287
c3_m1 = pd.merge(
        vp_c3,
        shapes,
        on = "shape_array_key",
        how = "inner"
    ).rename(columns = {"geometry_x": "vp_geometry", 
                        "geometry_y": "shape_geometry"}
            ).set_geometry("vp_geometry")

In [98]:
# https://github.com/cal-itp/data-analyses/blob/metrics_rt/rt_segment_speeds/scripts/C3_trip_route_speed.py#L290-L293
shape_meters_geoseries = wrangle_shapes.project_point_geom_onto_linestring(
        c3_m1,
        "shape_geometry",
        "vp_geometry",
    )


In [99]:
c3_m1["shape_meters"] = shape_meters_geoseries

#### distance_and_seconds_elapsed
* Breaking apart to understand
* https://github.com/cal-itp/data-analyses/blob/metrics_rt/rt_segment_speeds/scripts/C3_trip_route_speed.py#L290-L293

In [100]:
dist_col = "shape_meters"
time_col = "location_timestamp_local"
group_cols = ["gtfs_dataset_key", "trip_instance_key"]
sort_cols = group_cols + ["vp_idx"]
 

In [101]:
type(c3_m1)

geopandas.geodataframe.GeoDataFrame

In [102]:
c3_m1 = c3_m1.assign(
        prior_dist = (c3_m1.sort_values(sort_cols)
                      .groupby(group_cols, 
                               observed=True, group_keys=False)
                      [dist_col]
                      .apply(lambda x: x.shift(1))
                     ),
        prior_time = (c3_m1.sort_values(sort_cols)
                      .groupby(group_cols, 
                               observed=True, group_keys=False)
                      [time_col]
                      .apply(lambda x: x.shift(1))
                     )   
    )
  

In [103]:
# distance should be positive, but sometimes it's not, 
# so use absolute value
c3_m1 = c3_m1.assign(
        change_meters = abs(c3_m1[dist_col] - c3_m1.prior_dist),
        change_sec = (c3_m1[time_col] - c3_m1.prior_time).divide(
                       np.timedelta64(1, 's'))
    )

In [104]:
one_trip = c3_m1.loc[c3_m1.trip_instance_key == "ec2ef3dc047b844d7abf2d035728e202"]

In [105]:
(one_trip[time_col] - one_trip.prior_time).head()

28840               NaT
28841   0 days 00:05:21
28842   0 days 00:05:21
28843   0 days 00:05:06
28844   0 days 00:05:21
dtype: timedelta64[ns]

In [106]:
one_trip.drop(columns = ['shape_geometry'])

Unnamed: 0,gtfs_dataset_key,trip_instance_key,location_timestamp_local,vp_idx,shape_array_key,vp_geometry,shape_meters,prior_dist,prior_time,change_meters,change_sec
28840,c0e3039da063db95ebabd3fe4ee611a4,ec2ef3dc047b844d7abf2d035728e202,2023-10-11 21:21:08,12415722,652d492c9725307ab5f725bb616ee4a0,POINT (-212402.440 -28559.730),161.5,,NaT,,
28841,c0e3039da063db95ebabd3fe4ee611a4,ec2ef3dc047b844d7abf2d035728e202,2023-10-11 21:26:29,12415738,652d492c9725307ab5f725bb616ee4a0,POINT (-212088.927 -27551.761),1454.71,161.5,2023-10-11 21:21:08,1293.21,321.0
28842,c0e3039da063db95ebabd3fe4ee611a4,ec2ef3dc047b844d7abf2d035728e202,2023-10-11 21:31:50,12415754,652d492c9725307ab5f725bb616ee4a0,POINT (-212560.266 -26634.904),2819.51,1454.71,2023-10-11 21:26:29,1364.8,321.0
28843,c0e3039da063db95ebabd3fe4ee611a4,ec2ef3dc047b844d7abf2d035728e202,2023-10-11 21:36:56,12415769,652d492c9725307ab5f725bb616ee4a0,POINT (-212711.278 -26637.392),2970.41,2819.51,2023-10-11 21:31:50,150.9,306.0
28844,c0e3039da063db95ebabd3fe4ee611a4,ec2ef3dc047b844d7abf2d035728e202,2023-10-11 21:42:17,12415785,652d492c9725307ab5f725bb616ee4a0,POINT (-212752.276 -26640.128),3016.19,2970.41,2023-10-11 21:36:56,45.78,321.0


In [107]:
# Test with one trip
one_trip2 = (one_trip.groupby(group_cols, 
                     observed=True, group_keys=False)
           .agg({"change_meters": "sum", 
                 "change_sec": "sum"})
           .reset_index()
          )
    

In [108]:
one_trip2

Unnamed: 0,gtfs_dataset_key,trip_instance_key,change_meters,change_sec
0,c0e3039da063db95ebabd3fe4ee611a4,ec2ef3dc047b844d7abf2d035728e202,2854.69,1269.0


In [109]:
MPH_PER_MPS

2.237

In [110]:
one_trip2.assign(
        speed_mph = (one_trip2.change_meters.divide(one_trip2.change_sec) * 
                     MPH_PER_MPS)
    )
    


Unnamed: 0,gtfs_dataset_key,trip_instance_key,change_meters,change_sec,speed_mph
0,c0e3039da063db95ebabd3fe4ee611a4,ec2ef3dc047b844d7abf2d035728e202,2854.69,1269.0,5.03


In [111]:
c3_m2 = (c3_m1.groupby(group_cols, 
                     observed=True, group_keys=False)
           .agg({"change_meters": "sum", 
                 "change_sec": "sum"})
           .reset_index()
          )
    

In [112]:
c3_m2 = c3_m2.assign(
        speed_mph = (c3_m2.change_meters.divide(c3_m1.change_sec) * 
                     MPH_PER_MPS)
    )
    


In [113]:
c3_m2.speed_mph.describe()

count   7392.00
mean      29.82
std       21.56
min        0.00
25%       17.66
50%       25.23
75%       35.86
max      340.24
Name: speed_mph, dtype: float64

In [114]:
# muni.loc[muni.vp_in_shape == 0]

In [146]:
c3_m1.loc[c3_m1.trip_instance_key == "04ae6ea9655473bdcaf9b80d443558ae"].drop(columns = ['shape_geometry'])

Unnamed: 0,gtfs_dataset_key,trip_instance_key,location_timestamp_local,vp_idx,shape_array_key,vp_geometry,shape_meters,prior_dist,prior_time,change_meters,change_sec
35703,c0e3039da063db95ebabd3fe4ee611a4,04ae6ea9655473bdcaf9b80d443558ae,2023-10-11 18:34:47,12628774,f82328b9817126227af1aa52033d8847,POINT (-212105.135 -21893.355),1411.65,,NaT,,
35704,c0e3039da063db95ebabd3fe4ee611a4,04ae6ea9655473bdcaf9b80d443558ae,2023-10-11 18:44:18,12628802,f82328b9817126227af1aa52033d8847,POINT (-212100.059 -21901.268),1411.65,1411.65,2023-10-11 18:34:47,0.0,571.0
35705,c0e3039da063db95ebabd3fe4ee611a4,04ae6ea9655473bdcaf9b80d443558ae,2023-10-11 18:53:17,12628829,f82328b9817126227af1aa52033d8847,POINT (-212100.087 -21902.380),1411.65,1411.65,2023-10-11 18:44:18,0.0,539.0
35706,c0e3039da063db95ebabd3fe4ee611a4,04ae6ea9655473bdcaf9b80d443558ae,2023-10-11 19:02:17,12628856,f82328b9817126227af1aa52033d8847,POINT (-212100.228 -21907.936),1411.65,1411.65,2023-10-11 18:53:17,0.0,540.0
35707,c0e3039da063db95ebabd3fe4ee611a4,04ae6ea9655473bdcaf9b80d443558ae,2023-10-11 19:11:31,12628884,f82328b9817126227af1aa52033d8847,POINT (-212098.611 -21913.537),1411.65,1411.65,2023-10-11 19:02:17,0.0,554.0


In [115]:
c3_m2.loc[c3_m2.speed_mph == 0]

Unnamed: 0,gtfs_dataset_key,trip_instance_key,change_meters,change_sec,speed_mph
167,c0e3039da063db95ebabd3fe4ee611a4,04ae6ea9655473bdcaf9b80d443558ae,0.0,2204.0,0.0
863,c0e3039da063db95ebabd3fe4ee611a4,183990ffbd08d7d7df0308b35227a058,0.0,1530.0,0.0
1128,c0e3039da063db95ebabd3fe4ee611a4,1fe28ce8669a7f946f272faf80d80ce7,0.0,1379.0,0.0
2854,c0e3039da063db95ebabd3fe4ee611a4,5056febd44cda31254459111e1fc79d0,0.0,875.0,0.0
3213,c0e3039da063db95ebabd3fe4ee611a4,5a82f8c02f2036db77b1979f4ae228f5,0.0,803.0,0.0
3232,c0e3039da063db95ebabd3fe4ee611a4,5b07e2bf68a16c54d798ef4028e02026,0.0,1004.0,0.0
3937,c0e3039da063db95ebabd3fe4ee611a4,6ee4a9165f47b59cb3e10b0125dd72e8,0.0,649.0,0.0
4562,c0e3039da063db95ebabd3fe4ee611a4,7ff345e91303290951ef0665a57dc90e,0.0,1403.0,0.0
6758,c0e3039da063db95ebabd3fe4ee611a4,bcb78da06a56761d2a3b8b16a3436f3d,0.0,2000.0,0.0
7043,c0e3039da063db95ebabd3fe4ee611a4,c3fc63ec2ff7a35c0f821659075be708,0.0,1173.0,0.0


#### add_scheduled_trip_columns

In [116]:
# Attach scheduled trip columns, like route, direction, time_of_day
group_cols = ["trip_instance_key"]

In [117]:
keep_cols = [
        "gtfs_dataset_key",
        "direction_id", 
        "route_id", "route_short_name", "route_long_name", "route_desc",
    ] + group_cols

In [118]:
crosswalk = helpers.import_scheduled_trips(
        analysis_date, 
        columns = keep_cols, 
        get_pandas = True
    )

In [119]:
crosswalk.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,direction_id,route_id,route_short_name,route_long_name,route_desc,trip_instance_key
0,1770249a5a2e770ca90628434d4934b1,1.0,3402,Route 11,Route 11,PACIFIC VIEW MALL via TELEPHONE RD,1b3cc71bbb3c3166e8c5540ca26a97ba
1,1770249a5a2e770ca90628434d4934b1,1.0,3402,Route 11,Route 11,PACIFIC VIEW MALL via TELEPHONE RD,f780ba65965e61d394635fd80fc81232


In [120]:
common_shape = sched_rt_utils.most_common_shape_by_route_direction(analysis_date)

In [121]:
common_shape.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,route_id,direction_id,common_shape_id,shape_array_key
0,014d0998350083249a9eb310635548c2,10866826,1.0,10866826:1,80d84c820ca200c0b3d1791185c72b56
1,014d0998350083249a9eb310635548c2,10866849,1.0,10866849:1,99faaf65ccc65b3ec0e6704765b60195


In [122]:
crosswalk2 = pd.merge(
        crosswalk,
        common_shape,
        on = ["schedule_gtfs_dataset_key", "route_id", "direction_id"],
        how = "inner"
    ).astype({"direction_id": "Int64"})
    

In [123]:
crosswalk2.sample()

Unnamed: 0,schedule_gtfs_dataset_key,direction_id,route_id,route_short_name,route_long_name,route_desc,trip_instance_key,common_shape_id,shape_array_key
81833,baeeb157e85a901e47b828ef9fe75091,1,530,Green,Santee - 12th & Imperial,,a87002c1aa2aaef6b0b64d21795cdf2b,530_3_331,c8ea2abb50b0d4210a36a1f1fb07d8ae


In [124]:
time_of_day = sched_rt_utils.get_trip_time_buckets(analysis_date)

In [147]:
time_of_day.shape, time_of_day.trip_instance_key.nunique()

((101973, 5), 101973)

In [125]:
time_of_day.head(2)

Unnamed: 0,trip_instance_key,service_hours,trip_first_departure_datetime_pacific,time_of_day,service_minutes
0,1b3cc71bbb3c3166e8c5540ca26a97ba,0.6,2023-10-11 10:40:00,Midday,36.0
1,f780ba65965e61d394635fd80fc81232,0.55,2023-10-11 17:40:00,PM Peak,33.0


In [126]:
crosswalk2 = portfolio_utils.add_route_name(
        crosswalk2
    ).drop(columns = ["route_short_name", "route_long_name", "route_desc"])


In [127]:
crosswalk2.head(2)

Unnamed: 0,schedule_gtfs_dataset_key,direction_id,route_id,trip_instance_key,common_shape_id,shape_array_key,route_name_used
0,1770249a5a2e770ca90628434d4934b1,1,3402,1b3cc71bbb3c3166e8c5540ca26a97ba,8254,a669792e07a16b260139c6950011b759,PACIFIC VIEW MALL via TELEPHONE RD
1,1770249a5a2e770ca90628434d4934b1,1,3402,f780ba65965e61d394635fd80fc81232,8254,a669792e07a16b260139c6950011b759,PACIFIC VIEW MALL via TELEPHONE RD


In [128]:
type(c3_m2)

pandas.core.frame.DataFrame

In [129]:
# https://github.com/cal-itp/data-analyses/blob/metrics_rt/rt_segment_speeds/scripts/C3_trip_route_speed.py#L116-L124
c3_m3 = dd.merge(
        c3_m2,
        crosswalk2,
        on = group_cols,
        how = "left",
    ).merge(
        time_of_day,
        on = group_cols,
        how = "left"
    )

In [130]:
type(c3_m3)

pandas.core.frame.DataFrame

In [131]:
c3_m3.columns

Index(['gtfs_dataset_key', 'trip_instance_key', 'change_meters', 'change_sec',
       'speed_mph', 'schedule_gtfs_dataset_key', 'direction_id', 'route_id',
       'common_shape_id', 'shape_array_key', 'route_name_used',
       'service_hours', 'trip_first_departure_datetime_pacific', 'time_of_day',
       'service_minutes'],
      dtype='object')

#### avg_route_speeds_by_time_of_day

In [132]:
# https://github.com/cal-itp/data-analyses/blob/metrics_rt/rt_segment_speeds/scripts/C3_trip_route_speed.py#L166
def drop_extremely_low_and_high_speeds(
    df: pd.DataFrame, 
    speed_range: tuple
) -> pd.DataFrame:
    """
    Descriptives show the 5th percentile is around 5 mph, 
    and 95th percentile is around 25 mph.
    
    There are some weird calculations for <3 mph, and even
    some negative values, so let's exclude those...maybe
    the vp is not traveling across the entirety of the shape.
    
    Exclude unusually high speeds, over 70 mph.
    """
    low, high = speed_range
    
    df2 = df[(df.speed_mph >= low) & 
             (df.speed_mph <= high)
            ].reset_index(drop=True)
    
    return df2

In [133]:
c3_m3 = drop_extremely_low_and_high_speeds(c3_m3, speed_range = (3, 70))

In [134]:
c3_m3.shape, c3_m1.shape

((6993, 15), (46198, 12))

In [149]:
c3_m3.trip_instance_key.nunique()

6993

In [150]:
c3_m1.trip_instance_key.nunique()

9240

* Is this common to have 25% of trips dropped?

In [151]:
6993/9240

0.7568181818181818

In [136]:
group_cols = [
            "gtfs_dataset_key", "time_of_day",
            "route_id", "direction_id",
            "route_name_used",
            "common_shape_id", "shape_array_key"
        ]

In [137]:
# test with one route
one_route = c3_m3.loc[c3_m3.route_id == "14R"]

In [152]:
one_route.shape, one_route.trip_instance_key.nunique()

((193, 15), 193)

In [139]:
one_route.head()

Unnamed: 0,gtfs_dataset_key,trip_instance_key,change_meters,change_sec,speed_mph,schedule_gtfs_dataset_key,direction_id,route_id,common_shape_id,shape_array_key,route_name_used,service_hours,trip_first_departure_datetime_pacific,time_of_day,service_minutes
73,c0e3039da063db95ebabd3fe4ee611a4,0317ba64ff87ddd8b3dd626368a341a8,13487.57,5379.0,40.83,7cc0cb1871dfd558f11a2885c145d144,1,14R,11452,defb027e7468735f91300a9851f3e1d7,5am-10pm daily,0.9,2023-10-11 18:25:00,PM Peak,54.0
144,c0e3039da063db95ebabd3fe4ee611a4,059a49f356f6b8bdb7f12458dafc68e4,20787.34,5069.0,59.09,7cc0cb1871dfd558f11a2885c145d144,1,14R,11452,defb027e7468735f91300a9851f3e1d7,5am-10pm daily,0.75,2023-10-11 08:30:00,AM Peak,45.0
173,c0e3039da063db95ebabd3fe4ee611a4,06dc64f00c021e66d0c7bd52df8fed9c,12924.9,2594.0,32.27,7cc0cb1871dfd558f11a2885c145d144,0,14R,11403,453493134b4598d0dbdddb92e825ae24,5am-10pm daily,0.85,2023-10-11 19:36:00,PM Peak,51.0
202,c0e3039da063db95ebabd3fe4ee611a4,07ab77edf2b69295d643794e3cd497af,13682.26,4427.0,32.49,7cc0cb1871dfd558f11a2885c145d144,1,14R,11452,defb027e7468735f91300a9851f3e1d7,5am-10pm daily,0.77,2023-10-11 05:58:00,Early AM,46.0
230,c0e3039da063db95ebabd3fe4ee611a4,08adb7403f95343c6c9579cb137c8c15,13624.98,5094.0,33.46,7cc0cb1871dfd558f11a2885c145d144,1,14R,11452,defb027e7468735f91300a9851f3e1d7,5am-10pm daily,0.9,2023-10-11 12:49:00,Midday,54.0


In [140]:
group_cols

['gtfs_dataset_key',
 'time_of_day',
 'route_id',
 'direction_id',
 'route_name_used',
 'common_shape_id',
 'shape_array_key']

#### One route test

In [141]:
one_route2 = (one_route.groupby(group_cols, 
                       observed = True, group_keys = False)
           .agg({
               "speed_mph": "mean",
               "service_minutes": "mean",
               "change_sec": "mean",
               "trip_instance_key": "count"
           }).reset_index()
          )

In [142]:
one_route2.shape

(10, 11)

In [143]:
one_route2

Unnamed: 0,gtfs_dataset_key,time_of_day,route_id,direction_id,route_name_used,common_shape_id,shape_array_key,speed_mph,service_minutes,change_sec,trip_instance_key
0,c0e3039da063db95ebabd3fe4ee611a4,AM Peak,14R,0,5am-10pm daily,11403,453493134b4598d0dbdddb92e825ae24,29.38,43.19,3139.44,16
1,c0e3039da063db95ebabd3fe4ee611a4,AM Peak,14R,1,5am-10pm daily,11452,defb027e7468735f91300a9851f3e1d7,39.8,50.36,4804.36,22
2,c0e3039da063db95ebabd3fe4ee611a4,Early AM,14R,0,5am-10pm daily,11403,453493134b4598d0dbdddb92e825ae24,37.89,42.0,3555.8,5
3,c0e3039da063db95ebabd3fe4ee611a4,Early AM,14R,1,5am-10pm daily,11452,defb027e7468735f91300a9851f3e1d7,34.42,44.45,4602.45,11
4,c0e3039da063db95ebabd3fe4ee611a4,Evening,14R,0,5am-10pm daily,11403,453493134b4598d0dbdddb92e825ae24,33.69,51.0,3014.44,9
5,c0e3039da063db95ebabd3fe4ee611a4,Evening,14R,1,5am-10pm daily,11452,defb027e7468735f91300a9851f3e1d7,32.25,53.0,5149.0,5
6,c0e3039da063db95ebabd3fe4ee611a4,Midday,14R,0,5am-10pm daily,11403,453493134b4598d0dbdddb92e825ae24,32.46,46.94,3371.86,35
7,c0e3039da063db95ebabd3fe4ee611a4,Midday,14R,1,5am-10pm daily,11452,defb027e7468735f91300a9851f3e1d7,31.3,50.61,4478.18,33
8,c0e3039da063db95ebabd3fe4ee611a4,PM Peak,14R,0,5am-10pm daily,11403,453493134b4598d0dbdddb92e825ae24,38.41,49.78,3893.81,36
9,c0e3039da063db95ebabd3fe4ee611a4,PM Peak,14R,1,5am-10pm daily,11452,defb027e7468735f91300a9851f3e1d7,31.35,53.71,4823.19,21


In [144]:
one_route2.assign(
        avg_rt_trip_min = one_route2.change_sec.divide(60).round(1),
        service_minutes = one_route2.service_minutes.round(1),
        speed_mph = one_route2.speed_mph.round(1),
    ).rename(columns = {
        "service_minutes": "avg_sched_trip_min",
        "trip_instance_key": "n_trips",
        "route_name_used": "route_name",
    }).drop(columns = "change_sec")

Unnamed: 0,gtfs_dataset_key,time_of_day,route_id,direction_id,route_name,common_shape_id,shape_array_key,speed_mph,avg_sched_trip_min,n_trips,avg_rt_trip_min
0,c0e3039da063db95ebabd3fe4ee611a4,AM Peak,14R,0,5am-10pm daily,11403,453493134b4598d0dbdddb92e825ae24,29.4,43.2,16,52.3
1,c0e3039da063db95ebabd3fe4ee611a4,AM Peak,14R,1,5am-10pm daily,11452,defb027e7468735f91300a9851f3e1d7,39.8,50.4,22,80.1
2,c0e3039da063db95ebabd3fe4ee611a4,Early AM,14R,0,5am-10pm daily,11403,453493134b4598d0dbdddb92e825ae24,37.9,42.0,5,59.3
3,c0e3039da063db95ebabd3fe4ee611a4,Early AM,14R,1,5am-10pm daily,11452,defb027e7468735f91300a9851f3e1d7,34.4,44.5,11,76.7
4,c0e3039da063db95ebabd3fe4ee611a4,Evening,14R,0,5am-10pm daily,11403,453493134b4598d0dbdddb92e825ae24,33.7,51.0,9,50.2
5,c0e3039da063db95ebabd3fe4ee611a4,Evening,14R,1,5am-10pm daily,11452,defb027e7468735f91300a9851f3e1d7,32.3,53.0,5,85.8
6,c0e3039da063db95ebabd3fe4ee611a4,Midday,14R,0,5am-10pm daily,11403,453493134b4598d0dbdddb92e825ae24,32.5,46.9,35,56.2
7,c0e3039da063db95ebabd3fe4ee611a4,Midday,14R,1,5am-10pm daily,11452,defb027e7468735f91300a9851f3e1d7,31.3,50.6,33,74.6
8,c0e3039da063db95ebabd3fe4ee611a4,PM Peak,14R,0,5am-10pm daily,11403,453493134b4598d0dbdddb92e825ae24,38.4,49.8,36,64.9
9,c0e3039da063db95ebabd3fe4ee611a4,PM Peak,14R,1,5am-10pm daily,11452,defb027e7468735f91300a9851f3e1d7,31.4,53.7,21,80.4


In [153]:
# https://github.com/cal-itp/data-analyses/blob/metrics_rt/rt_segment_speeds/scripts/C3_trip_route_speed.py#L168-L177
c3_m4 = (c3_m3.groupby(group_cols, 
                       observed = True, group_keys = False)
           .agg({
               "speed_mph": "mean",
               "service_minutes": "mean",
               "change_sec": "mean",
               "trip_instance_key": "count"
           }).reset_index()
          )

In [154]:
# https://github.com/cal-itp/data-analyses/blob/metrics_rt/rt_segment_speeds/scripts/C3_trip_route_speed.py#L178-L188
c3_m4 = c3_m4.assign(
        avg_rt_trip_min = c3_m4.change_sec.divide(60).round(1),
        service_minutes = c3_m4.service_minutes.round(1),
        speed_mph = c3_m4.speed_mph.round(1),
    ).rename(columns = {
        "service_minutes": "avg_sched_trip_min",
        "trip_instance_key": "n_trips",
        "route_name_used": "route_name",
    }).drop(columns = "change_sec")

#### Checks

##### Shape 1

In [172]:
shape_array_1 = "0055ea6cd09cc68606d37851a6c91366"
shape1_time = "AM Peak"

In [173]:
def check_shapes(shape_array, time_of_day):
    display(c3_m4.loc[(c3_m4.shape_array_key == shape_array) & (c3_m4.time_of_day == time_of_day)])
    display(c3_m3.loc[(c3_m3.shape_array_key == shape_array) & (c3_m3.time_of_day == time_of_day)])

In [174]:
check_shapes(shape_array_1, shape1_time)

Unnamed: 0,gtfs_dataset_key,time_of_day,route_id,direction_id,route_name,common_shape_id,shape_array_key,speed_mph,avg_sched_trip_min,n_trips,avg_rt_trip_min
69,c0e3039da063db95ebabd3fe4ee611a4,AM Peak,55,0,5am-10pm daily,260,0055ea6cd09cc68606d37851a6c91366,10.1,16.0,4,31.7


Unnamed: 0,gtfs_dataset_key,trip_instance_key,change_meters,change_sec,speed_mph,schedule_gtfs_dataset_key,direction_id,route_id,common_shape_id,shape_array_key,route_name_used,service_hours,trip_first_departure_datetime_pacific,time_of_day,service_minutes
4963,c0e3039da063db95ebabd3fe4ee611a4,b68e89ab04e684839d2cde80be127219,3832.26,2365.0,6.52,7cc0cb1871dfd558f11a2885c145d144,0,55,260,0055ea6cd09cc68606d37851a6c91366,5am-10pm daily,0.27,2023-10-11 07:59:00,AM Peak,16.0
5783,c0e3039da063db95ebabd3fe4ee611a4,d32ca8727a4d9c0ee126664507d42c08,3856.82,2255.0,6.99,7cc0cb1871dfd558f11a2885c145d144,0,55,260,0055ea6cd09cc68606d37851a6c91366,5am-10pm daily,0.27,2023-10-11 09:20:00,AM Peak,16.0
6308,c0e3039da063db95ebabd3fe4ee611a4,e59e81c33cab600d4009318f354fbf9a,1662.33,881.0,3.59,7cc0cb1871dfd558f11a2885c145d144,0,55,260,0055ea6cd09cc68606d37851a6c91366,5am-10pm daily,0.27,2023-10-11 08:20:00,AM Peak,16.0
6559,c0e3039da063db95ebabd3fe4ee611a4,ef66bcc7ebab13dc4cd4e2b0c6085ee7,3843.8,2099.0,23.18,7cc0cb1871dfd558f11a2885c145d144,0,55,260,0055ea6cd09cc68606d37851a6c91366,5am-10pm daily,0.27,2023-10-11 09:41:00,AM Peak,16.0


In [177]:
(2365+2255+881+2099)/4/60

31.666666666666668

##### Shape 2

In [181]:
shape_key2 = "e52c291259f04c198a0c1f245ec26be9"
time2 = "AM Peak"

In [182]:
check_shapes(shape_key2, time2)

Unnamed: 0,gtfs_dataset_key,time_of_day,route_id,direction_id,route_name,common_shape_id,shape_array_key,speed_mph,avg_sched_trip_min,n_trips,avg_rt_trip_min
65,c0e3039da063db95ebabd3fe4ee611a4,AM Peak,52,0,Weekdays 6am-10pm Weekends 8am-10pm,5200,e52c291259f04c198a0c1f245ec26be9,15.9,26.0,6,47.5


Unnamed: 0,gtfs_dataset_key,trip_instance_key,change_meters,change_sec,speed_mph,schedule_gtfs_dataset_key,direction_id,route_id,common_shape_id,shape_array_key,route_name_used,service_hours,trip_first_departure_datetime_pacific,time_of_day,service_minutes
57,c0e3039da063db95ebabd3fe4ee611a4,027ffc2ecdce130df8e8b2f2d81b17d6,6505.34,2703.0,18.19,7cc0cb1871dfd558f11a2885c145d144,0,52,5200,e52c291259f04c198a0c1f245ec26be9,Weekdays 6am-10pm Weekends 8am-10pm,0.4,2023-10-11 09:34:00,AM Peak,24.0
91,c0e3039da063db95ebabd3fe4ee611a4,03b5bb249b7188cc3935b355ad008a82,6625.95,2548.0,18.46,7cc0cb1871dfd558f11a2885c145d144,0,52,5200,e52c291259f04c198a0c1f245ec26be9,Weekdays 6am-10pm Weekends 8am-10pm,0.48,2023-10-11 07:56:00,AM Peak,29.0
2101,c0e3039da063db95ebabd3fe4ee611a4,50848f3ca9a5b8db9149cf4d4c0ce008,6660.49,3261.0,12.02,7cc0cb1871dfd558f11a2885c145d144,0,52,5200,e52c291259f04c198a0c1f245ec26be9,Weekdays 6am-10pm Weekends 8am-10pm,0.42,2023-10-11 08:34:00,AM Peak,25.0
4013,c0e3039da063db95ebabd3fe4ee611a4,94f8ca1587768bca12471d4dfd63cace,6786.87,2735.0,14.68,7cc0cb1871dfd558f11a2885c145d144,0,52,5200,e52c291259f04c198a0c1f245ec26be9,Weekdays 6am-10pm Weekends 8am-10pm,0.42,2023-10-11 08:54:00,AM Peak,25.0
4699,c0e3039da063db95ebabd3fe4ee611a4,acf1b602311569dd97867a45b5c16e4c,9837.34,3691.0,17.8,7cc0cb1871dfd558f11a2885c145d144,0,52,5200,e52c291259f04c198a0c1f245ec26be9,Weekdays 6am-10pm Weekends 8am-10pm,0.48,2023-10-11 08:15:00,AM Peak,29.0
6700,c0e3039da063db95ebabd3fe4ee611a4,f5058b9f8fe6df4e55644a6cc0896faf,6657.79,2162.0,14.39,7cc0cb1871dfd558f11a2885c145d144,0,52,5200,e52c291259f04c198a0c1f245ec26be9,Weekdays 6am-10pm Weekends 8am-10pm,0.4,2023-10-11 09:55:00,AM Peak,24.0


## Outer Join
* HELP: how to connect back to trip instance key?

In [186]:
# Spatial Accuracy
type(muni)

pandas.core.frame.DataFrame

In [187]:
muni.columns

Index(['trip_instance_key', 'total_vp', 'vp_in_shape'], dtype='object')

In [190]:
muni.trip_instance_key.nunique()

9240

In [188]:
# Update Completeness
test_group3.columns

Index(['trip_instance_key', 'min_time', 'max_time', 'atleast2_trip_updates',
       'trip_min_elapsed', 'pct_update_complete', 'larger', 'test_mask',
       'pct_update_complete2'],
      dtype='object')

In [191]:
test_group3.trip_instance_key.nunique()

9240

In [189]:
outer_merge1 = pd.merge(test_group3, muni, 
                        on = "trip_instance_key",
                        how = "outer")

In [199]:
# Averages
crosswalk_back_to_trips = c3_m3[['shape_array_key','gtfs_dataset_key','time_of_day','trip_instance_key']].drop_duplicates().reset_index(drop = True)

In [201]:
crosswalk_back_to_trips.trip_instance_key.nunique()

6993

In [203]:
outer_merge2 = pd.merge(outer_merge1, crosswalk_back_to_trips, 
                        on = "trip_instance_key",
                        how = "outer")

In [205]:
outer_merge2.columns

Index(['trip_instance_key', 'min_time', 'max_time', 'atleast2_trip_updates',
       'trip_min_elapsed', 'pct_update_complete', 'larger', 'test_mask',
       'pct_update_complete2', 'total_vp', 'vp_in_shape', 'shape_array_key',
       'gtfs_dataset_key', 'time_of_day'],
      dtype='object')

In [204]:
c3_m4.columns

Index(['gtfs_dataset_key', 'time_of_day', 'route_id', 'direction_id',
       'route_name', 'common_shape_id', 'shape_array_key', 'speed_mph',
       'avg_sched_trip_min', 'n_trips', 'avg_rt_trip_min'],
      dtype='object')

In [208]:
outer_merge3 = pd.merge(outer_merge2, c3_m4, 
                        on = ["gtfs_dataset_key", "shape_array_key", "time_of_day"],
                        how = "outer")

In [210]:
outer_merge3 = outer_merge3.drop(columns = ['min_time','max_time','trip_min_elapsed', 'pct_update_complete','larger','test_mask'])

In [211]:
outer_merge3.sample(3)

Unnamed: 0,trip_instance_key,atleast2_trip_updates,pct_update_complete2,total_vp,vp_in_shape,shape_array_key,gtfs_dataset_key,time_of_day,route_id,direction_id,route_name,common_shape_id,speed_mph,avg_sched_trip_min,n_trips,avg_rt_trip_min
5376,095d0ee5794fd628f94a65ddb1c22575,48,1.0,143,140,872020ed4598b30bc0b9981791fb0b1d,c0e3039da063db95ebabd3fe4ee611a4,PM Peak,38R,1,Weekdays 5am-10pm Weekends 6am-9pm,13852,25.1,45.2,33.0,48.5
6357,5979e575563c15e17d2fadc68d9f0f4c,45,1.0,134,134,dfd50ed85c60540a1f9b9c2d1afa93ff,c0e3039da063db95ebabd3fe4ee611a4,Evening,30,1,Weedays 5am-12 midnight Weekends 6am-12 midnight,3052,17.9,33.1,11.0,41.7
4032,29a5d0b73daef0dda923a762242373c7,75,1.0,224,224,7cc77a4cdfa2a4fa0694addee4665b51,c0e3039da063db95ebabd3fe4ee611a4,PM Peak,38R,0,Weekdays 5am-10pm Weekends 6am-9pm,13800,26.8,47.2,36.0,67.3


In [214]:
# check_shapes("dfd50ed85c60540a1f9b9c2d1afa93ff", "Evening")

## Final cleaning
* How come you export it twice? 
* https://github.com/cal-itp/data-analyses/blob/metrics_rt/rt_segment_speeds/scripts/C3_trip_route_speed.py#L178-L188