## Scheduled vs, RT total trip time

In [1]:
import geopandas as gpd
import numpy as np
import pandas as pd
import dask.dataframe as dd
from calitp_data_analysis.sql import to_snakecase



In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [294]:
import _threshold_utils as threshold_utils
import altair as alt
from segment_speed_utils import helpers, sched_rt_utils
from segment_speed_utils.project_vars import analysis_date, SEGMENT_GCS

In [4]:
import gcsfs
fs = gcsfs.GCSFileSystem()

In [5]:
import intake
catalog = intake.open_catalog("./catalog.yml")

### RT
#### Catalog

In [6]:
#rt_trips = catalog.rt_trip_diagnostics(analysis_date = analysis_date).read()

In [7]:
#rt_trips.shape

In [8]:
#rt_trips.head()

#### Original file from `b1_rt_trip_diagnostics`

In [9]:
f"{SEGMENT_GCS}vp_sjoin/"

'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/vp_sjoin/'

In [10]:
f"vp_route_segment_{analysis_date}/"

'vp_route_segment_2023-01-18/'

In [11]:
ddf = helpers.import_vehicle_positions(
        gcs_folder = f"{SEGMENT_GCS}vp_sjoin/",
        file_name = f"vp_route_segment_{analysis_date}/",
        file_type = "df",
        columns = ["gtfs_dataset_key", "_gtfs_dataset_name", 
                   "trip_id", "route_dir_identifier",
                   "location_timestamp"],
        partitioned = True
    ).repartition(partition_size="85MB")

In [12]:
df = ddf.compute()

In [13]:
df.shape

(10668072, 5)

In [108]:
len(df.drop_duplicates(subset = ['gtfs_dataset_key', 'trip_id','location_timestamp']))

10668072

In [109]:
df.columns

Index(['gtfs_dataset_key', '_gtfs_dataset_name', 'trip_id',
       'route_dir_identifier', 'location_timestamp'],
      dtype='object')

In [168]:
# Subset for testing
df_sub = df.head()

In [169]:
# Find max and min times
agg = (df.groupby(["_gtfs_dataset_name", "gtfs_dataset_key", "trip_id"])
        .agg({"location_timestamp": [ "min", "max"]})
        .reset_index()
        .rename(columns = {'min':'start', 'max':'end'})
      ) 

In [170]:
agg.shape

(69975, 5)

In [171]:
agg.columns = agg.columns.droplevel()

In [172]:
agg.columns.values[0] = "_gtfs_dataset_name"
agg.columns.values[1] = "gtfs_dataset_key"
agg.columns.values[2] = "trip_id"


In [173]:
agg.sample()

Unnamed: 0,_gtfs_dataset_name,gtfs_dataset_key,trip_id,start,end
1415,Bay Area 511 AC Transit Vehicle Position,4bb240cf480589e58c6e58d06d6fb72c,12661020,2023-01-18 16:43:23+00:00,2023-01-18 17:40:52+00:00


In [174]:
agg.columns

Index(['_gtfs_dataset_name', 'gtfs_dataset_key', 'trip_id', 'start', 'end'], dtype='object')

In [175]:
# Find number of times the trip appears
agg2 = df.groupby(["_gtfs_dataset_name", "gtfs_dataset_key", "trip_id"]).agg({'location_timestamp':'nunique'}).reset_index().rename(
    columns = {'location_timestamp':'trip_ping_count'})

In [176]:
agg2.shape

(69975, 4)

In [177]:
agg2.columns

Index(['_gtfs_dataset_name', 'gtfs_dataset_key', 'trip_id', 'trip_ping_count'], dtype='object')

In [178]:
agg2.sample()

Unnamed: 0,_gtfs_dataset_name,gtfs_dataset_key,trip_id,trip_ping_count
68493,VCTC Vehicle Positions,910fbc31da89056bdee9ef48b235ef44,152-25,204


In [179]:
m1 = pd.merge(
    agg, agg2, how="inner", on=['_gtfs_dataset_name', 'gtfs_dataset_key', 'trip_id'],
)

In [180]:
m1.head()

Unnamed: 0,_gtfs_dataset_name,gtfs_dataset_key,trip_id,start,end,trip_ping_count
0,Antelope Valley Transit Authority Vehicle Positions,1a4e07032b35ff5aae9438fefc374b55,t190-b1EB5-sl1C,2023-01-18 12:00:33+00:00,2023-01-18 13:59:57+00:00,184
1,Antelope Valley Transit Authority Vehicle Positions,1a4e07032b35ff5aae9438fefc374b55,t190-b1EBF-sl1C,2023-01-18 11:57:41+00:00,2023-01-18 13:49:15+00:00,178
2,Antelope Valley Transit Authority Vehicle Positions,1a4e07032b35ff5aae9438fefc374b55,t19A-b1EAC-sl1C,2023-01-18 12:10:28+00:00,2023-01-18 13:49:04+00:00,164
3,Antelope Valley Transit Authority Vehicle Positions,1a4e07032b35ff5aae9438fefc374b55,t1A9-b1EB6-sl1C,2023-01-18 12:27:46+00:00,2023-01-18 14:33:45+00:00,236
4,Antelope Valley Transit Authority Vehicle Positions,1a4e07032b35ff5aae9438fefc374b55,t1B3-b1EAD-sl1C,2023-01-18 12:36:34+00:00,2023-01-18 14:23:06+00:00,176


In [181]:
type(m1)

pandas.core.frame.DataFrame

In [182]:
m1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 69975 entries, 0 to 69974
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype              
---  ------              --------------  -----              
 0   _gtfs_dataset_name  69975 non-null  object             
 1   gtfs_dataset_key    69975 non-null  object             
 2   trip_id             69975 non-null  object             
 3   start               69975 non-null  datetime64[ns, UTC]
 4   end                 69975 non-null  datetime64[ns, UTC]
 5   trip_ping_count     69975 non-null  int64              
dtypes: datetime64[ns, UTC](2), int64(1), object(3)
memory usage: 3.7+ MB


### Timezone
* Double check

In [183]:
PACIFIC_TIMEZONE = "US/Pacific"

In [184]:
m1.end.dt.tz, m1.start.dt.tz

(<UTC>, <UTC>)

In [185]:
m1.end = m1.end.dt.tz_convert(PACIFIC_TIMEZONE).apply(lambda t: t.replace(tzinfo=None))

In [186]:
m1.start = m1.start.dt.tz_convert(PACIFIC_TIMEZONE).apply(lambda t: t.replace(tzinfo=None))

In [187]:
m1.end.dt.tz, m1.start.dt.tz

(None, None)

In [188]:
def convert_timestamp_to_hrs_mins(
    df: pd.DataFrame, 
    timestamp_col: list,
    minutes: bool = True,
) -> pd.DataFrame: 
    """
    Convert timestamp into minutes or seconds.
    """
    if minutes:
        for c in timestamp_col:
            df = df.assign(
                time_min = ((df[c].dt.hour * 60) + 
                                (df[c].dt.minute) + 
                                (df[c].dt.second/60)
                           ),
            ).rename(columns = {"time_min": f"{c}_minutes"})
    
    else:
        for c in timestamp_col:
            df = df.assign(
                time_sec = ((df[timestamp_col].dt.hour * 3_600) + 
                                (df[timestamp_col].dt.minute * 60) + 
                                (df[timestamp_col].dt.second)
                           ),
            ).rename(columns = {"time_sec": f"{timestamp_col}_sec"})

    return df

In [189]:
m2 = convert_timestamp_to_hrs_mins(m1, ['start','end'], True)

In [190]:
m2.shape

(69975, 8)

In [191]:
len(m2.drop_duplicates(subset = ['_gtfs_dataset_name','gtfs_dataset_key','trip_id','trip_ping_count','start_minutes','end_minutes']))

69975

In [192]:
m2['actual_trip_duration_minutes'] = (m2['end_minutes']-m2['start_minutes'])

In [193]:
m1.head()

Unnamed: 0,_gtfs_dataset_name,gtfs_dataset_key,trip_id,start,end,trip_ping_count
0,Antelope Valley Transit Authority Vehicle Positions,1a4e07032b35ff5aae9438fefc374b55,t190-b1EB5-sl1C,2023-01-18 04:00:33,2023-01-18 05:59:57,184
1,Antelope Valley Transit Authority Vehicle Positions,1a4e07032b35ff5aae9438fefc374b55,t190-b1EBF-sl1C,2023-01-18 03:57:41,2023-01-18 05:49:15,178
2,Antelope Valley Transit Authority Vehicle Positions,1a4e07032b35ff5aae9438fefc374b55,t19A-b1EAC-sl1C,2023-01-18 04:10:28,2023-01-18 05:49:04,164
3,Antelope Valley Transit Authority Vehicle Positions,1a4e07032b35ff5aae9438fefc374b55,t1A9-b1EB6-sl1C,2023-01-18 04:27:46,2023-01-18 06:33:45,236
4,Antelope Valley Transit Authority Vehicle Positions,1a4e07032b35ff5aae9438fefc374b55,t1B3-b1EAD-sl1C,2023-01-18 04:36:34,2023-01-18 06:23:06,176


In [194]:
m2.head()

Unnamed: 0,_gtfs_dataset_name,gtfs_dataset_key,trip_id,start,end,trip_ping_count,start_minutes,end_minutes,actual_trip_duration_minutes
0,Antelope Valley Transit Authority Vehicle Positions,1a4e07032b35ff5aae9438fefc374b55,t190-b1EB5-sl1C,2023-01-18 04:00:33,2023-01-18 05:59:57,184,240.55,359.95,119.4
1,Antelope Valley Transit Authority Vehicle Positions,1a4e07032b35ff5aae9438fefc374b55,t190-b1EBF-sl1C,2023-01-18 03:57:41,2023-01-18 05:49:15,178,237.68,349.25,111.57
2,Antelope Valley Transit Authority Vehicle Positions,1a4e07032b35ff5aae9438fefc374b55,t19A-b1EAC-sl1C,2023-01-18 04:10:28,2023-01-18 05:49:04,164,250.47,349.07,98.6
3,Antelope Valley Transit Authority Vehicle Positions,1a4e07032b35ff5aae9438fefc374b55,t1A9-b1EB6-sl1C,2023-01-18 04:27:46,2023-01-18 06:33:45,236,267.77,393.75,125.98
4,Antelope Valley Transit Authority Vehicle Positions,1a4e07032b35ff5aae9438fefc374b55,t1B3-b1EAD-sl1C,2023-01-18 04:36:34,2023-01-18 06:23:06,176,276.57,383.1,106.53


In [279]:
len(m2.loc[m2.actual_trip_duration_minutes < 0])

4869

In [281]:
m2.loc[m2.actual_trip_duration_minutes < 0].sample(4)

Unnamed: 0,_gtfs_dataset_name,gtfs_dataset_key,trip_id,start,end,trip_ping_count,start_minutes,end_minutes,actual_trip_duration_minutes
34194,LA Metro Bus Vehicle Positions,7c48cc5675c9266fb6851070489c53e9,10004003352323-DEC22,2023-01-17 22:59:30,2023-01-18 00:55:50,325,1379.5,55.83,-1323.67
37533,LA Metro Bus Vehicle Positions,7c48cc5675c9266fb6851070489c53e9,10066004191514-DEC22,2023-01-17 15:59:50,2023-01-18 15:59:28,280,959.83,959.47,-0.37
66655,Torrance Swiftly Vehicle Positions,9bd1e0098ba62c06837bf2330db605c2,401020,2023-01-17 15:59:35,2023-01-18 15:59:24,178,959.58,959.4,-0.18
28795,Fresno Vehicle Positions,651832dd0d35804ce4e30063d095e3e5,550363,2023-01-17 16:12:49,2023-01-18 15:59:28,221,972.82,959.47,-13.35


### Scheduled

In [195]:
scheduled_trips = sched_rt_utils.crosswalk_scheduled_trip_grouping_with_rt_key(
    analysis_date = analysis_date, 
    keep_trip_cols = ["feed_key", "trip_id", "service_hours"]
)

In [196]:
scheduled_trips['scheduled_service_minutes'] = scheduled_trips.service_hours * 60

In [197]:
scheduled_trips.sample(3)

Unnamed: 0,feed_key,trip_id,service_hours,gtfs_dataset_key,scheduled_service_minutes
92744,b9a0bc47bcf6590c67677f3af358f490,3286573,1.08,03cadbffef6f3cd4ce839b218bc65e82,65.0
64335,6980032e857182465ae8fccd86458940,f9f43981-e5ac-4a5f-aaba-9431513126de:29,0.2,c1f526828c8d4bba1c71bf647443f534,12.0
5770,244b96b75ab6da1462aadfe8b95d54b9,t_5325792_b_77427_tn_0,0.37,f09062d4bb1cb04b59bdc8232f0cfc90,22.0


### Outer Merge

In [198]:
m3 = pd.merge(
    m2, scheduled_trips, how="outer", on=["gtfs_dataset_key", "trip_id"], indicator=True
)

In [199]:
m3._merge.value_counts()

both          69975
right_only    27118
left_only         0
Name: _merge, dtype: int64

In [200]:
m3['rt_data_proportion'] = ((m3.actual_trip_duration_minutes/m3.scheduled_service_minutes)*100).fillna(0)

In [201]:
m3.rt_data_proportion = m3.rt_data_proportion.clip(lower=0)

In [202]:
m3['pings_per_minute'] =  m3.trip_ping_count/m3.actual_trip_duration_minutes

In [203]:
# m3 = m3.round(1)

In [204]:
m3 = m3.drop(columns = ['_merge'])

In [205]:
m3.shape

(97093, 14)

In [206]:
m3.columns

Index(['_gtfs_dataset_name', 'gtfs_dataset_key', 'trip_id', 'start', 'end',
       'trip_ping_count', 'start_minutes', 'end_minutes',
       'actual_trip_duration_minutes', 'feed_key', 'service_hours',
       'scheduled_service_minutes', 'rt_data_proportion', 'pings_per_minute'],
      dtype='object')

In [207]:
m3 = m3.fillna(
    m3.dtypes.replace({"float64": 0.0, "object": "None"})
)

In [208]:
# Any proportion above 100, mask as 100
m3["rt_data_proportion"] = m3["rt_data_proportion"].mask(m3["rt_data_proportion"] > 100, 100)

In [209]:
m3.actual_trip_duration_minutes = m3.actual_trip_duration_minutes.clip(lower=0)

In [210]:
# Round certain columns
m3['rounded_rt_data_proportion'] = ((m3.rt_data_proportion/100)*10).astype(int)*10

In [211]:
m3['rounded_actual_trip_duration_minutes'] = ((m3.actual_trip_duration_minutes/100)*10).astype(int)*10

In [212]:
m3.pings_per_minute = m3.pings_per_minute.fillna(0).round(1)

In [213]:
# Find total trips by operator 
trips_operator = m3.groupby(['gtfs_dataset_key']).agg({'trip_id':'nunique'}).reset_index().rename(columns = {'trip_id':'total_trips'})

In [214]:
m4 = pd.merge(
    m3, trips_operator, how="left", on=["gtfs_dataset_key"]
)

In [314]:
m4.rounded_rt_data_proportion.unique()

array([ 90, 100,  80,  70,  60,  30,  50,  40,   0,  10,  20])

In [None]:
m4.rounded_actual_trip_duration_minutes.unique()

In [286]:
m4._gtfs_dataset_name.nunique()

71

### Chart

In [329]:
def summary_valid_trips_by_cutoff(df, time_col:str, phrase:str, time_cutoffs: list, ping_cutoffs: list):
    """
    Find percentage & number of trips that meet trip time elapsed
    and pings. 
    """
    final = pd.DataFrame()

    for t in time_cutoffs:
        for s in ping_cutoffs:
            valid = (
                df[(df[time_col] >= t) & (df.pings_per_minute >= s)]
                .groupby(["gtfs_dataset_key", "_gtfs_dataset_name", "total_trips"])
                .trip_id.nunique()
                .reset_index()
                .rename(columns={"trip_id": "n_trips"})
            )

            valid = valid.assign(
                trip_cutoff=t, pings=s, cutoff=f"{t} {phrase} & {s} ping(s)+ per minute"
            )

            final = pd.concat([final, valid], axis=0)

    final = final.assign(percentage_usable_trips=final.n_trips.divide(final.total_trips) * 100)

    return final

In [316]:
valid1 = summary_valid_trips_by_cutoff(m4, "actual_trip_duration_minutes", "+ min", [15,30,60,90,120], [1,1.5,2,2.5,3])

In [317]:
valid1.shape

(1276, 8)

In [318]:
valid1.columns

Index(['gtfs_dataset_key', '_gtfs_dataset_name', 'total_trips', 'n_trips',
       'trip_cutoff', 'pings', 'cutoff', 'percentage_usable_trips'],
      dtype='object')

In [319]:
dropdown_list = valid1["_gtfs_dataset_name"].sort_values().unique().tolist()

In [320]:
dropdown = alt.binding_select(options=[None] + dropdown_list, labels = ['All'] + dropdown_list, name = "Operator")    
selection = alt.selection_single(fields=["_gtfs_dataset_name"], bind=dropdown)

In [321]:
threshold_utils.bar_chart(valid1, "percentage_usable_trips", "cutoff",
['_gtfs_dataset_name','gtfs_dataset_key','n_trips',
 'cutoff', 'percentage_usable_trips'], "Pings and RT Trip Time").add_selection(selection).transform_filter(selection).interactive()

In [333]:
valid2 = summary_valid_trips_by_cutoff(m4, "rt_data_proportion", "% +", [10,25,50,75,100], [1,1.5,2,2.5,3])

In [331]:
valid2.columns

Index(['gtfs_dataset_key', '_gtfs_dataset_name', 'total_trips', 'n_trips',
       'trip_cutoff', 'pings', 'cutoff', 'percentage_usable_trips'],
      dtype='object')

In [335]:
threshold_utils.bar_chart(valid2, "percentage_usable_trips", "cutoff",
['_gtfs_dataset_name','gtfs_dataset_key','n_trips',
 'cutoff', 'percentage_usable_trips'], "Pings and Real Time vs. Scheduled Data Percentage").add_selection(selection).transform_filter(selection).interactive()

In [322]:
statewide = (valid1
             .groupby(['cutoff',])
             .agg({'total_trips':'max','n_trips':'max','percentage_usable_trips':'max'})
             .reset_index()
             .sort_values(['percentage_usable_trips'])
            )

In [323]:
statewide

Unnamed: 0,cutoff,total_trips,n_trips,percentage_usable_trips
4,120+ min & 3 ping(s)+ per minute,14196,501,14.42
24,90+ min & 3 ping(s)+ per minute,14196,2023,25.96
3,120+ min & 2.5 ping(s)+ per minute,14196,1244,30.77
0,120+ min & 1 ping(s)+ per minute,14196,1377,31.73
1,120+ min & 1.5 ping(s)+ per minute,14196,1364,31.73
2,120+ min & 2 ping(s)+ per minute,14196,1349,31.73
23,90+ min & 2.5 ping(s)+ per minute,14196,4330,55.77
22,90+ min & 2 ping(s)+ per minute,14196,4692,56.73
21,90+ min & 1.5 ping(s)+ per minute,14196,4734,56.73
20,90+ min & 1 ping(s)+ per minute,14196,4774,56.73
