# Migrate to VP Usable
* https://github.com/cal-itp/data-analyses/issues/936
* cd rt_segment_speeds && pip install -r requirements.txt && cd ..
    * https://github.com/cal-itp/data-analyses/blob/main/Makefile#L49C2-L49C66
    

In [1]:
import datetime
import dask.dataframe as dd
import dask_geopandas as dg
import dask
import geopandas as gpd
import pandas as pd
from scripts import vp_spatial_accuracy
from segment_speed_utils import helpers
from calitp_data_analysis.geography_utils import WGS84
from segment_speed_utils.project_vars import (
    PROJECT_CRS,
    SEGMENT_GCS,
    analysis_date,
    GCS_FILE_PATH,
    COMPILED_CACHED_VIEWS,
    RT_SCHED_GCS,
    CONFIG_PATH
)

# For speeds c2
from typing import Literal
import numpy as np

# for speeds c3
# also import numpy 
from shared_utils.rt_utils import MPH_PER_MPS
from calitp_data_analysis import utils
from shared_utils import portfolio_utils, schedule_rt_utils
from segment_speed_utils import helpers, sched_rt_utils, wrangle_shapes, segment_calcs

In [2]:
# Notebook display settings: show up to 100 columns, every row, untruncated
# cell contents, and floats rounded to two decimals.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [3]:
# calitp-analytics-data/data-analyses/rt_segment_speeds/vp_usable_2023-10-11
# Single operator used throughout this notebook to keep reads small:
# `operator` is matched against `gtfs_dataset_name` and `gtfs_key` against
# `schedule_gtfs_dataset_key` in the parquet filters below.
operator = 'Bay Area 511 Muni VehiclePositions'
gtfs_key = '7cc0cb1871dfd558f11a2885c145d144'

## Spatial Accuracy
* Based on https://github.com/cal-itp/data-analyses/blob/main/rt_scheduled_v_ran/scripts/vp_spatial_accuracy.py
### Grab_shape_keys_in_vp
#### First time reading `vp_usable`

In [4]:
def grab_shape_keys_in_vp(analysis_date: str) -> pd.DataFrame:
    """
    Build a trip_instance_key -> shape_array_key crosswalk for one operator.

    Reads only the `trip_instance_key` column of `vp_usable` for the
    notebook-level `operator` / `gtfs_key`, then keeps the scheduled trips
    that (a) appear in those vehicle positions and (b) have a
    `shape_array_key` present in the routelines file.

    Args:
        analysis_date: service date used to build the file names.

    Returns:
        De-duplicated, null-free DataFrame with the columns
        `trip_instance_key` and `shape_array_key`.
    """
    operator_filter = [[
        ('gtfs_dataset_name', "==", operator),
        ('schedule_gtfs_dataset_key', '==', gtfs_key),
    ]]

    # Unique trips that actually reported vehicle positions.
    vp_trip_keys = pd.read_parquet(
        f"{SEGMENT_GCS}vp_usable_{analysis_date}",
        filters = operator_filter,
        columns = ['trip_instance_key'],
    )
    vp_trip_keys = vp_trip_keys.drop_duplicates().reset_index(drop = True)

    # Shapes known to routelines; a trip must match one of these so a shape
    # geometry exists for it (per the original note, map_partitions
    # downstream errors otherwise).
    shape_keys = pd.read_parquet(
        f"{COMPILED_CACHED_VIEWS}routelines_{analysis_date}.parquet",
        columns = ["shape_array_key"],
    )
    shape_keys = shape_keys.dropna().drop_duplicates()

    scheduled_trips = helpers.import_scheduled_trips(
        analysis_date,
        columns = ["trip_instance_key", "shape_array_key"],
        get_pandas = True
    )

    # Inner joins: keep only trips present in all three sources.
    crosswalk = scheduled_trips.merge(
        shape_keys, on = "shape_array_key", how = "inner"
    )
    crosswalk = crosswalk.merge(
        vp_trip_keys, on = "trip_instance_key", how = "inner"
    )
    crosswalk = crosswalk.drop_duplicates().dropna().reset_index(drop=True)

    return crosswalk

In [5]:
trips_with_shape = grab_shape_keys_in_vp(analysis_date)

In [6]:
trips_with_shape.shape

(9240, 2)

In [7]:
trips_with_shape.head()

Unnamed: 0,trip_instance_key,shape_array_key
0,12de3d260e9fe09fa878cb4cdb2d6898,749b225ca6691f77914e88577dc13e68
1,934aea5748bce830ffc2fa88dc01402a,749b225ca6691f77914e88577dc13e68
2,3d06fa8e68e4f38d3ccc7adfabb5c3d9,f1a7410fae06937b7183f6a553707915
3,34109ad8cfeca83cd459d42c7d51d602,f1a7410fae06937b7183f6a553707915
4,34acd907ae9d5eb5456d683d2458bbc6,f1a7410fae06937b7183f6a553707915


### Buffer shapes

In [8]:
# This is trips_with_shape_geom
# NOTE(review): 35 is presumably the buffer distance (in PROJECT_CRS units)
# drawn around each shape -- confirm against vp_spatial_accuracy.buffer_shapes.
trips_with_shape_geom =  vp_spatial_accuracy.buffer_shapes(analysis_date, 
                                trips_with_shape,
                                35)

In [9]:
trips_with_shape_geom.shape

(9240, 3)

#### Second time reading in the same file, streamline

In [10]:
# Read every column of vp_usable for the single operator as a dask dataframe,
# using the same operator/feed filters as grab_shape_keys_in_vp.
vp = dd.read_parquet(f"{SEGMENT_GCS}vp_usable_{analysis_date}",
        filters = [[('gtfs_dataset_name', "==", operator),
                   ('schedule_gtfs_dataset_key', '==', gtfs_key)]])

### Full function

In [11]:
def merge_vp_with_shape_and_count(
    vp: dd.DataFrame,
    trips_with_shape_geom: gpd.GeoDataFrame
) -> pd.DataFrame:
    """
    Merge vp with crosswalk and buffered shapes.
    Get vp count totals and vp within shape.

    Args:
        vp: vehicle positions with at least the columns `x`, `y`,
            `trip_instance_key` and `location_timestamp_local`. A dask
            dataframe is materialized first; a pandas dataframe also works.
        trips_with_shape_geom: one row per `trip_instance_key` carrying the
            buffered shape in its `geometry` column.

    Returns:
        One row per `trip_instance_key` with `total_vp` (all vehicle
        positions for the trip) and `vp_in_shape` (positions that fall
        within the trip's buffered shape), both int32.
    """
    # Materialize dask input up front. Handing a dask dataframe directly to
    # GeoDataFrame drops the column labels (they come back as 0..n), which
    # previously had to be patched with a positional rename that silently
    # mislabels columns whenever the parquet column order changes.
    if hasattr(vp, "compute"):
        vp = vp.compute()
    # Dask partitions can repeat index values; use a clean positional index.
    vp = vp.reset_index(drop = True)

    vp_gdf = gpd.GeoDataFrame(
        vp,
        geometry = gpd.points_from_xy(vp.x, vp.y),
        crs = WGS84
    ).to_crs(PROJECT_CRS)

    # Both inputs have a `geometry` column, so the merge suffixes them:
    # geometry_x = vp point, geometry_y = buffered shape.
    vp2 = pd.merge(
        vp_gdf,
        trips_with_shape_geom,
        on = "trip_instance_key",
        how = "inner"
    ).reset_index(drop=True)

    # Count all vp per trip before discarding the ones outside the shape.
    total_vp = vp_spatial_accuracy.total_vp_counts_by_trip(vp2)

    vp_inside = vp2[vp2.geometry_x.within(vp2.geometry_y)]

    vps_in_shape = (vp_inside.groupby("trip_instance_key",
                                      observed = True, group_keys = False)
                    .agg({"location_timestamp_local": "count"})
                    .reset_index()
                    .rename(columns = {"location_timestamp_local": "vp_in_shape"})
                   )

    # Left join so trips with zero vp inside the shape are kept (count 0).
    count_df = pd.merge(
        total_vp,
        vps_in_shape,
        on = "trip_instance_key",
        how = "left"
    )

    count_df = count_df.assign(
        vp_in_shape = count_df.vp_in_shape.fillna(0).astype("int32"),
        total_vp = count_df.total_vp.fillna(0).astype("int32")
    )

    return count_df

In [12]:
muni = merge_vp_with_shape_and_count(vp, trips_with_shape_geom)

In [13]:
muni.vp_in_shape.describe()

count   9240.00
mean     161.87
std       64.06
min        0.00
25%      118.00
50%      167.00
75%      207.00
max      481.00
Name: vp_in_shape, dtype: float64

In [14]:
muni.trip_instance_key.nunique()

9240

#### Add %?

In [15]:
(muni.vp_in_shape/muni.total_vp).describe()

count   9240.00
mean       0.93
std        0.14
min        0.00
25%        0.94
50%        1.00
75%        1.00
max        1.00
dtype: float64

In [16]:
muni['total_vp_in_shape_percentage'] = muni.vp_in_shape/muni.total_vp

In [17]:
muni.total_vp_in_shape_percentage.describe()

count   9240.00
mean       0.93
std        0.14
min        0.00
25%        0.94
50%        1.00
75%        1.00
max        1.00
Name: total_vp_in_shape_percentage, dtype: float64

#### Question: Are we keeping rows in which 0 vps are in the shape?
* Yes
* What's happening with these routes? Where are they traveling? In a route that isn't listed in the schedule? Explore further.

In [18]:
# muni.loc[muni.vp_in_shape == 0]

In [19]:
muni.sample(5)

Unnamed: 0,trip_instance_key,total_vp,vp_in_shape,total_vp_in_shape_percentage
6387,b1ec12da4a062fa65fa3524298274bcc,221,146,0.66
7580,d312a6001a448f716e73136832d1a310,125,125,1.0
7305,cba1dfac565b284265eeeaa26da5c2d9,169,169,1.0
7329,cc5308360703a702c6beaa58aaceca1d,225,217,0.96
3097,57337c021d92235c1f7775e1dc71d402,183,147,0.8


## Update Completeness
* https://github.com/cal-itp/data-analyses/blob/main/rt_predictions/01_update_completeness.ipynb

#### Keep only relevant `trips instance keys`?
* Trips have certain attributes: name, key, gtfs key, etc in addition to trip instance key
* Start from `vp_usable`, join to `shapes` as a first pass to get the total positions in the shape, then add a column that counts the number of pings
* Read dataset in once, derive a bunch of rows. 
* I have 100 vps for one trip -> turn the trip into one row with summary statistics
* Pass the dataset through three different summaries
* Each pass will summarize vp-usable from thousands of rows to just one per trip
* Pass `vp_usable` into three different functions.
* Final step: merge all on trips.
* Each function is individual, can be pandas.

In [20]:
# Restrict later reads to the trips that made it into the spatial-accuracy
# table, i.e. the trips that have a shape.
relevant_trips = muni.trip_instance_key.unique().tolist()

#### Third time reading in `vp_usable`

In [21]:
# Columns needed from vp_usable for the update-completeness metrics.
completeness_cols = [
    "vp_idx",
    "location_timestamp_local",
    "trip_instance_key",
    "gtfs_dataset_key",
]

In [22]:
# Third read of vp_usable: only the columns needed for update completeness,
# and only rows for the operator/feed whose trip is in `relevant_trips`.
vp_filtered = dd.read_parquet(f"{SEGMENT_GCS}vp_usable_{analysis_date}",
        columns = completeness_cols, 
        filters = [[('gtfs_dataset_name', "==", operator),
                   ('schedule_gtfs_dataset_key', '==', gtfs_key),
                   ('trip_instance_key', 'in', relevant_trips)]])

In [23]:
update_completeness = vp_filtered.sort_values(['vp_idx']).reset_index(drop = True)

In [24]:
two_cols = ['trip_instance_key','gtfs_dataset_key']

In [25]:
update_completeness = update_completeness.compute()

In [26]:
# Bin each trip's vehicle positions into one-minute buckets and count the rows
# in every bucket; the `vp_idx` count becomes the pings-per-minute figure.
update_completeness2 = (
    update_completeness
    .groupby([
        "trip_instance_key",
        pd.Grouper(key = "location_timestamp_local", freq = "1Min"),
    ])
    .count()
    .reset_index()
    .rename(columns = {"vp_idx": "number_of_pings_per_minute"})
)

#### At least 2 pings per minute

In [27]:
# Flag each one-minute bin: 1 when it holds two or more pings, else 0.
# The comparison is vectorized instead of a row-wise apply(axis=1), which is
# far slower on several hundred thousand rows and breaks on an empty frame.
at_least_2_pings = update_completeness2.assign(
    atleast2_trip_updates = (
        update_completeness2["number_of_pings_per_minute"] >= 2
    ).astype("int64")
)
    

In [28]:
at_least_2_pings.atleast2_trip_updates.value_counts()

1    534886
0      6795
Name: atleast2_trip_updates, dtype: int64

In [29]:
at_least_2_pings['max_time'] = at_least_2_pings.location_timestamp_local

In [30]:
# Collapse the per-minute rows to one row per trip: first and last minute bin
# with pings, plus the number of minute bins that had at least two pings.
at_least_2_pings = (
    at_least_2_pings
    .groupby("trip_instance_key")
    .agg(
        min_time = ("location_timestamp_local", "min"),
        max_time = ("max_time", "max"),
        atleast2_trip_updates = ("atleast2_trip_updates", "sum"),
    )
    .reset_index()
)

In [31]:
# Trip length in minutes. min_time/max_time are the first and last one-minute
# bins that contain pings, so +1 makes the span inclusive of both end bins
# (e.g. 09:00 through 09:25 counts as 26 minutes).
at_least_2_pings['total_trip_time'] = (at_least_2_pings.max_time - at_least_2_pings.min_time) / pd.Timedelta(minutes=1) + 1

In [32]:
at_least_2_pings.sample()

Unnamed: 0,trip_instance_key,min_time,max_time,atleast2_trip_updates,total_trip_time
8976,f95564edd65fe4358c24f55293e158ce,2023-10-11 09:00:00,2023-10-11 09:25:00,26,26.0


In [35]:
# Share of the trip's minutes (first ping bin through last ping bin) in which
# at least two pings were received.
at_least_2_pings = at_least_2_pings.assign(
    pct_with_2_pings_per_min = lambda df: (
        df.atleast2_trip_updates / df.total_trip_time
    )
)

In [36]:
# The column is created as `pct_with_2_pings_per_min`; the previous name
# `pct_update_complete` no longer exists and raised AttributeError on a
# top-to-bottom run.
at_least_2_pings.pct_with_2_pings_per_min.describe()

count   9240.00
mean       0.98
std        0.06
min        0.02
25%        0.98
50%        0.99
75%        1.00
max        1.00
Name: pct_update_complete, dtype: float64

In [37]:
at_least_2_pings.loc[at_least_2_pings.trip_instance_key == "37622040815e89d063272bb6e37acc65"]

Unnamed: 0,trip_instance_key,min_time,max_time,atleast2_trip_updates,total_trip_time,pct_update_complete
1972,37622040815e89d063272bb6e37acc65,2023-10-11 05:31:00,2023-10-11 05:41:00,11,11.0,1.0


#### Density

6ef4805f0104b95614b86a2b1c374d23
* Minutes skipped: 6:12 to 6:18
* 6:19-6:26
* 6:28-7:33 etc etc
* Trip started at 5:48, ended at 8:31 

Can also check w/ 38247cbee93b6f85d58bf1812ae553b9
* Began at 16:01:00, ended at 17:00:00
* Skips having data: jumps from 4:38 to 4:54
    * Trip: know the min and the max 
    * Bin the minutes: everything else between 4:38 to 4:54 can't be binned. 
    * Denominator: gap between the trip's max and min timestamps
* Vp doesn't necessarily update every minute 
* TO DO
    * Correct trip duration numerator: average pings per minute or ten minutes or five minutes...Help us understand coverage.  
    * How to express time lapses: timedeltas in pandas

In [38]:
# Roll the per-minute counts up into five-minute buckets per trip by summing
# the counts that fall in each bucket.
density_5_min = (
    update_completeness2
    .groupby([
        "trip_instance_key",
        pd.Grouper(key = "location_timestamp_local", freq = "5Min"),
    ])
    .sum()
    .reset_index()
    .rename(columns = {"number_of_pings_per_minute": "number_of_pings_per_5_min"})
)

In [39]:
density_5_min2 = density_5_min.groupby(['trip_instance_key']).agg({'number_of_pings_per_5_min':'median'}).reset_index()

In [40]:
# NOTE(review): `number_of_pings_per_5_min` here is the per-trip median of the
# 5-minute counts; dividing it by 5 gives a pings-per-minute rate, so the
# `avg_pings_per_5_min` name does not match what is computed -- consider renaming.
density_5_min2['avg_pings_per_5_min'] = density_5_min2.number_of_pings_per_5_min/5

In [41]:
density_5_min2.sample(3)

Unnamed: 0,trip_instance_key,number_of_pings_per_5_min,avg_pings_per_5_min
954,1a7ccbfc37cbbdb7176bcac7c140be3e,15.0,3.0
7965,ddca3dd2ebb812be8287e9ad7c8c59c7,15.0,3.0
7322,cc09cf7d41b3868773583577f340f1ff,15.0,3.0


In [42]:
# density_5_min.loc[density_5_min.trip_instance_key == "e124441ba673de6b885c3a75cf22c7f0"]

In [43]:
density_5_min2.avg_pings_per_5_min.describe()

count   9240.00
mean       2.96
std        0.18
min        0.30
25%        3.00
50%        3.00
75%        3.00
max        3.00
Name: avg_pings_per_5_min, dtype: float64

#### Total minutes with GTFS

In [44]:
# Number of one-minute bins per trip that contain at least one ping
# (each row of update_completeness2 is one trip-minute with data).
total_minutes_df = (
    update_completeness2
    .groupby("trip_instance_key")["gtfs_dataset_key"]
    .count()
    .rename("total_minutes_w_gtfs")
    .reset_index()
)

In [None]:
total_minutes_df.head()

Unnamed: 0,trip_instance_key,total_minutes_w_gtfs
0,00068c2e2316950af50ffaa9584c7a46,43
1,000c62b70d9438b7951457a74a4c89b2,52
2,0010126e10a24f22fb36018ed0f79572,138
3,0013822d90020aa52bae3dab73d15fd0,69
4,001a66b1633388f4fe9eea5acf374481,76


In [46]:
# Combine the per-trip ping-rate table with the minutes-with-data table.
# Note: this rebinds `update_completeness`, which held the raw vp rows above.
update_completeness = at_least_2_pings.merge(
    total_minutes_df,
    on = "trip_instance_key",
    how = "inner",
)

In [61]:
update_completeness['pct_of_trip_w_gtfs'] = update_completeness.total_minutes_w_gtfs/update_completeness.total_trip_time

In [62]:
# Add the 5-minute density figures to the per-trip completeness table.
# Note: this rebinds `update_completeness2`, which held the per-minute counts.
update_completeness2 = update_completeness.merge(
    density_5_min2,
    on = "trip_instance_key",
    how = "inner",
)

In [63]:
update_completeness2 = update_completeness2.drop(columns = ['min_time','max_time','number_of_pings_per_5_min']) 

In [77]:
update_completeness2.sample(5)

Unnamed: 0,trip_instance_key,atleast2_trip_updates,total_trip_time,pct_update_complete,total_minutes_w_gtfs,pct_of_trip_with_gtfs,pct_of_trip_w_gtfs,avg_pings_per_5_min
8811,f5571c9636dac57a5891a1367323a55b,49,49.0,1.0,49,1.0,1.0,3.0
7124,c67d4cf779fcb442a04532ff36ab0808,50,51.0,0.98,51,1.0,1.0,3.0
2774,4e172af24879f967ed358d97031240de,57,57.0,1.0,57,1.0,1.0,3.0
6926,c10711bca9526b204542561daf52567e,64,64.0,1.0,64,1.0,1.0,3.0
35,00dd3fad34a4a153bc2b02645b80a78d,30,31.0,0.97,31,1.0,1.0,3.0


In [65]:
update_completeness2.pct_of_trip_w_gtfs.describe()

count   9240.00
mean       0.99
std        0.05
min        0.04
25%        1.00
50%        1.00
75%        1.00
max        1.00
Name: pct_of_trip_w_gtfs, dtype: float64

In [74]:
update_completeness2.avg_pings_per_5_min.describe()

count   9240.00
mean       2.96
std        0.18
min        0.30
25%        3.00
50%        3.00
75%        3.00
max        3.00
Name: avg_pings_per_5_min, dtype: float64

In [75]:
update_completeness2.pct_with_2_pings_per_min.describe()

count   9240.00
mean       0.98
std        0.06
min        0.02
25%        0.98
50%        0.99
75%        1.00
max        1.00
Name: pct_update_complete, dtype: float64

## How many minutes a trip took and the average speeds?

In [67]:
trip_speeds = pd.read_parquet(f"{SEGMENT_GCS}trip_summary/trip_speeds_{analysis_date}.parquet")

In [None]:
# NOTE(review): this narrower re-read replaces the full `trip_speeds` loaded in
# the previous cell. The outputs further down show columns beyond these two
# (e.g. `rt_trip_min`, `route_id`), which only exist after the full read --
# run one of the two reads, not both.
trip_speeds  = pd.read_parquet(f"{SEGMENT_GCS}trip_summary/trip_speeds_{analysis_date}.parquet",
        columns = ['trip_instance_key', 'speed_mph'])

In [85]:
trip_speeds.columns

Index(['trip_instance_key', 'meters_elapsed', 'sec_elapsed', 'speed_mph',
       'rt_trip_min', 'schedule_gtfs_dataset_key', 'direction_id', 'route_id',
       'common_shape_id', 'shape_array_key', 'route_name_used',
       'service_hours', 'trip_first_departure_datetime_pacific', 'time_of_day',
       'service_minutes'],
      dtype='object')

In [68]:
trip_speeds.trip_instance_key.nunique()

76255

## Outer Join
#### Question: how to connect back to trip instance key?

In [69]:
# Spatial Accuracy
type(muni)

pandas.core.frame.DataFrame

In [70]:
muni.columns

Index(['trip_instance_key', 'total_vp', 'vp_in_shape',
       'total_vp_in_shape_percentage'],
      dtype='object')

In [71]:
muni.trip_instance_key.nunique()

9240

In [72]:
muni.head()

Unnamed: 0,trip_instance_key,total_vp,vp_in_shape,total_vp_in_shape_percentage
0,00068c2e2316950af50ffaa9584c7a46,126,126,1.0
1,000c62b70d9438b7951457a74a4c89b2,152,140,0.92
2,0010126e10a24f22fb36018ed0f79572,413,315,0.76
3,0013822d90020aa52bae3dab73d15fd0,205,205,1.0
4,001a66b1633388f4fe9eea5acf374481,227,227,1.0


In [78]:
# Outer join of update completeness with spatial accuracy, so a trip present
# in only one of the two tables is still kept.
outer_merge1 = update_completeness2.merge(
    muni,
    on = "trip_instance_key",
    how = "outer",
)

In [79]:
outer_merge1.sample()

Unnamed: 0,trip_instance_key,atleast2_trip_updates,total_trip_time,pct_update_complete,total_minutes_w_gtfs,pct_of_trip_with_gtfs,pct_of_trip_w_gtfs,avg_pings_per_5_min,total_vp,vp_in_shape,total_vp_in_shape_percentage
2992,544f67a3c2e17f8a605cbbaddeee2bd6,109,109.0,1.0,109,1.0,1.0,3.0,328,239,0.73


In [80]:
trip_speeds.sample()

Unnamed: 0,trip_instance_key,meters_elapsed,sec_elapsed,speed_mph,rt_trip_min,schedule_gtfs_dataset_key,direction_id,route_id,common_shape_id,shape_array_key,route_name_used,service_hours,trip_first_departure_datetime_pacific,time_of_day,service_minutes
7926,1acf6e8b7f2881596d1c14aa76b5ad5f,9549.37,3622.0,5.9,60.37,9809d3f8121513057bc5cb8de7b54ce2,0,005-130,50039,258c9367410a6170b12d1309c0170ed4,Monterey - Carmel Rancho,0.43,2023-10-11 13:44:00,Midday,26.0


In [81]:
# Attach trip speeds. Despite the variable name this is an INNER join: trips
# without a row in `trip_speeds` are dropped from the result.
outer_merge2 = outer_merge1.merge(
    trip_speeds,
    on = "trip_instance_key",
    how = "inner",
)

In [84]:
outer_merge2.sample(10)

Unnamed: 0,trip_instance_key,atleast2_trip_updates,total_trip_time,pct_update_complete,total_minutes_w_gtfs,pct_of_trip_with_gtfs,pct_of_trip_w_gtfs,avg_pings_per_5_min,total_vp,vp_in_shape,total_vp_in_shape_percentage,meters_elapsed,sec_elapsed,speed_mph,rt_trip_min,schedule_gtfs_dataset_key,direction_id,route_id,common_shape_id,shape_array_key,route_name_used,service_hours,trip_first_departure_datetime_pacific,time_of_day,service_minutes
8933,fa6dd9a0b27192b3265c8a682fcab577,44,46.0,0.96,46,1.0,1.0,3.0,134,121,0.9,4666.9,2407.0,4.34,40.12,7cc0cb1871dfd558f11a2885c145d144,1,1,152,e3fe7a7916a145078a6f6bd59cb42cc9,5am-12 midnight daily,0.45,2023-10-11 12:31:00,Midday,27.0
6892,c20d99bc2d6cd8497a025103a38bac8a,60,60.0,1.0,60,1.0,1.0,3.0,178,178,1.0,10465.34,6198.0,3.78,103.3,7cc0cb1871dfd558f11a2885c145d144,0,38R,13800,7cc77a4cdfa2a4fa0694addee4665b51,Weekdays 5am-10pm Weekends 6am-9pm,0.75,2023-10-11 09:42:00,AM Peak,45.0
2680,4c17c7b6c42aa6f0f38cb2c568f90ad7,70,70.0,1.0,70,1.0,1.0,3.0,210,201,0.96,10465.34,3550.0,6.59,59.17,7cc0cb1871dfd558f11a2885c145d144,0,38R,13800,7cc77a4cdfa2a4fa0694addee4665b51,Weekdays 5am-10pm Weekends 6am-9pm,0.82,2023-10-11 17:00:00,PM Peak,49.0
510,0ed3dd6cadda74a50f8e15cb06946c48,45,45.0,1.0,45,1.0,1.0,3.0,134,134,1.0,10586.49,5234.0,4.52,87.23,7cc0cb1871dfd558f11a2885c145d144,1,38,3851,e2f53e4455f4ed7d5aee6216c14c6e9f,24 hour service daily,0.6,2023-10-12 03:46:00,Owl,36.0
8939,fa8f89174e9ff2635fbe1ab6306bb7ea,45,46.0,0.98,46,1.0,1.0,3.0,136,136,1.0,11494.87,2399.0,10.72,39.98,7cc0cb1871dfd558f11a2885c145d144,1,31,3151,54882fdb83a0a8de4e0990b14c944a17,5am-10pm daily,0.67,2023-10-11 05:40:00,Early AM,40.0
1137,2060a6aad91dddc84ced6e13c71a2a8d,27,27.0,1.0,27,1.0,1.0,3.0,78,78,1.0,4800.46,1752.0,6.13,29.2,7cc0cb1871dfd558f11a2885c145d144,0,66,6600,adff56bb2094a25dfe271b847303b474,Weekdays 6am-10pm Weekends 8am-10pm,0.23,2023-10-11 20:00:00,Evening,14.0
6208,aec607702125277555d366f3074c1ac1,24,24.0,1.0,24,1.0,1.0,3.0,71,68,0.96,8457.75,2588.0,7.31,43.13,7cc0cb1871dfd558f11a2885c145d144,0,57,5700,55264d6b96faccc77a1a95feae549aa1,5am-10pm daily,0.4,2023-10-11 07:20:00,AM Peak,24.0
5521,9af084eef4a28299f51fd2c948db527b,60,60.0,1.0,60,1.0,1.0,3.0,178,178,1.0,8778.14,7014.0,2.8,116.9,7cc0cb1871dfd558f11a2885c145d144,0,27,2701,40f1c7440e22326c3d1e407eed291c71,5am-10pm daily,0.8,2023-10-11 17:00:00,PM Peak,48.0
5565,9c43bed79c9df53a6c4721b4cfba4f9f,40,40.0,1.0,40,1.0,1.0,3.0,117,117,1.0,12239.37,2318.0,11.81,38.63,7cc0cb1871dfd558f11a2885c145d144,0,K,9436,72a01bb089970160de498991fe5e858b,5am-12 midnight daily,0.57,2023-10-11 08:01:00,AM Peak,34.0
647,12c3036fb0e69db39ef1d31ad371a3b8,70,76.0,0.92,74,0.97,0.97,3.0,216,168,0.78,14418.18,5284.0,6.1,88.07,7cc0cb1871dfd558f11a2885c145d144,1,KBUS,232,1704e0d76f00ff2b3b6a3b81d72e66f5,Weekdays 5am-6am Weekends 5am-8am,0.85,2023-10-11 05:00:00,Early AM,51.0


In [83]:
outer_merge2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9142 entries, 0 to 9141
Data columns (total 25 columns):
 #   Column                                 Non-Null Count  Dtype         
---  ------                                 --------------  -----         
 0   trip_instance_key                      9142 non-null   object        
 1   atleast2_trip_updates                  9142 non-null   int64         
 2   total_trip_time                        9142 non-null   float64       
 3   pct_update_complete                    9142 non-null   float64       
 4   total_minutes_w_gtfs                   9142 non-null   int64         
 5   pct_of_trip_with_gtfs                  9142 non-null   float64       
 6   pct_of_trip_w_gtfs                     9142 non-null   float64       
 7   avg_pings_per_5_min                    9142 non-null   float64       
 8   total_vp                               9142 non-null   int32         
 9   vp_in_shape                            9142 non-null   int32   