## Scheduled vs, RT total trip time

In [1]:
import geopandas as gpd
import numpy as np
import pandas as pd
import dask.dataframe as dd
import datetime 
from calitp_data_analysis.sql import to_snakecase



In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
import _threshold_utils as threshold_utils
import altair as alt
from segment_speed_utils import helpers, sched_rt_utils
from segment_speed_utils.project_vars import analysis_date, SEGMENT_GCS

In [4]:
import gcsfs
fs = gcsfs.GCSFileSystem()

In [5]:
import intake
catalog = intake.open_catalog("./catalog.yml")

### RT
#### Catalog

In [6]:
#rt_trips = catalog.rt_trip_diagnostics(analysis_date = analysis_date).read()

#### Original file from `b1_rt_trip_diagnostics`

In [7]:
ddf = helpers.import_vehicle_positions(
        gcs_folder = f"{SEGMENT_GCS}vp_sjoin/",
        file_name = f"vp_route_segment_{analysis_date}/",
        file_type = "df",
        columns = ["gtfs_dataset_key", "_gtfs_dataset_name", 
                   "trip_id", "route_dir_identifier",
                   "location_timestamp"],
        partitioned = True
    ).repartition(partition_size="85MB")

In [8]:
vp = ddf.compute()

In [9]:
vp.shape

(10668072, 5)

In [10]:
# Checkout duplicates - 9,500 less rows.
len(vp.drop_duplicates(subset = ['trip_id','location_timestamp']))

10658575

In [11]:
10668072-10658575

9497

In [12]:
len(vp.drop_duplicates(subset = ["gtfs_dataset_key",'trip_id','location_timestamp']))

10668072

In [13]:
vp.sample()

Unnamed: 0,gtfs_dataset_key,_gtfs_dataset_name,trip_id,route_dir_identifier,location_timestamp
8431925,7c48cc5675c9266fb6851070489c53e9,LA Metro Bus Vehicle Positions,10761000720834-DEC22,3880740724,2023-01-18 18:05:27+00:00


In [14]:
def start_end_pings(df):
    group_cols = ["_gtfs_dataset_name", "gtfs_dataset_key", "trip_id"]
    
    # Find the max and min location stamp 
    # for each gtfs dataset-trip_id to determine
    # start and end date
    start_end = (df.groupby(group_cols)
        .agg({"location_timestamp": [ "min", "max"]})
        .reset_index()
        .rename(columns = {'min':'start', 'max':'end'})
      ) 

    start_end.columns = start_end.columns.droplevel()
    start_end.columns.values[0] = "_gtfs_dataset_name"
    start_end.columns.values[1] = "gtfs_dataset_key"
    start_end.columns.values[2] = "trip_id"
    
    # Find number of pings by 
    # counting nunique location_timestamps
    pings = (df.groupby(group_cols)
          .agg({'location_timestamp':'nunique'})
          .reset_index()
          .rename(columns = {'location_timestamp':'trip_ping_count'})
         )
                 
    # Merge these 2 
    m1 = start_end.merge(pings, how="inner", on = group_cols)
    return m1

In [15]:
vp2 = start_end_pings(vp)

In [16]:
vp2.sample()

Unnamed: 0,_gtfs_dataset_name,gtfs_dataset_key,trip_id,start,end,trip_ping_count
57789,SBMTD VehiclePositions,c5b2bf3815a12d5f7242923ed399d1ea,2057040,2023-01-18 21:50:30+00:00,2023-01-18 22:19:01+00:00,34


In [17]:
# No duplicates. 
len(vp2.drop_duplicates(subset = ['trip_id','start','end']))

69975

In [18]:
vp2.end.dt.tz, vp2.start.dt.tz

(<UTC>, <UTC>)

### Timezone
* Convert from UTC to Pacific to Naive
* https://stackoverflow.com/questions/24281525/what-is-the-point-of-a-naive-datetimedf.select_dtypes(include=['datetime64'])

In [19]:
def naive_timezone(df):
    """
    Convert UTC to PST to Naive. 
    """
    PACIFIC_TIMEZONE = "US/Pacific"
    
    # Grab datetime columns
    dt_cols = [col for col in df.columns if df[col].dtype == 'datetime64[ns, UTC]']
    
    for col in dt_cols:
        df[col] = df[col].dt.tz_convert(PACIFIC_TIMEZONE).apply(lambda t: t.replace(tzinfo=None))
        
    return df 

In [20]:
vp2 = naive_timezone(vp2)

In [21]:
vp2.end.dt.tz, vp2.start.dt.tz

(None, None)

In [22]:
def convert_timestamp_to_hrs_mins(
    df: pd.DataFrame, 
    timestamp_col: list,
    minutes: bool = True,
) -> pd.DataFrame: 
    """
    Convert timestamp into minutes or seconds.
    """
    if minutes:
        for c in timestamp_col:
            df = df.assign(
                time_min = ((df[c].dt.hour * 60) + 
                                (df[c].dt.minute) + 
                                (df[c].dt.second/60)
                           ),
            ).rename(columns = {"time_min": f"{c}_minutes"})
    
    else:
        for c in timestamp_col:
            df = df.assign(
                time_sec = ((df[timestamp_col].dt.hour * 3_600) + 
                                (df[timestamp_col].dt.minute * 60) + 
                                (df[timestamp_col].dt.second)
                           ),
            ).rename(columns = {"time_sec": f"{timestamp_col}_sec"})

    return df

In [23]:
vp2 = convert_timestamp_to_hrs_mins(vp2, ['start','end'])

In [24]:
len(vp2.drop_duplicates(subset = ['_gtfs_dataset_name','gtfs_dataset_key','trip_id','trip_ping_count','start_minutes','end_minutes']))

69975

In [25]:
len(vp2.drop_duplicates(subset = ['trip_id','start_minutes','end_minutes']))

69975

In [26]:
vp2.sample()

Unnamed: 0,_gtfs_dataset_name,gtfs_dataset_key,trip_id,start,end,trip_ping_count,start_minutes,end_minutes
15895,Bay Area 511 Muni VehiclePositions,644cf39285755e8ba2dc68597faa48aa,11107856_M11,2023-01-17 21:42:10,2023-01-17 21:55:37,41,1302.17,1315.62


In [27]:
vp2._gtfs_dataset_name.nunique(), vp2.gtfs_dataset_key.nunique()

(70, 70)

### Subset trips to 6am to 8pm to test
* Only have trips that appear on January 18. 
* Trips that start after 6AM
* Trips that end at 8PM.

In [28]:
def filter_trips(df, date:str):
    
    # Filter by day
    df = df[(df['start'].dt.strftime('%Y-%m-%d') == date) & (df['end'].dt.strftime('%Y-%m-%d') == date)].reset_index()
    
    # Filter by time
    # https://stackoverflow.com/questions/65510529/how-to-filter-pandas-dataframe-by-time
    time1 = pd.to_datetime('2023-01-18  06:00:00').strftime('%H:%M')
    time2 = pd.to_datetime('2023-01-18  20:00:00').strftime('%H:%M')
    df =df.loc[(df['start'].dt.strftime('%H:%M') >= time1 ) & (df['end'].dt.strftime('%H:%M') <= time2)].reset_index()
    
    # Find actual trip times
    df['actual_trip_duration_minutes'] = (df['end_minutes']-df['start_minutes'])
    
    df = df.drop(columns = ['level_0', 'index'])
    return df 

In [29]:
subset_vp = filter_trips(vp2, analysis_date)

In [30]:
# 5,000 less rows?? 
# len(m3) - len(m3.loc[(m3['start'].dt.strftime('%H:%M') >= time1 ) & (m3['end'].dt.strftime('%H:%M') <= time2)])

In [31]:
# No more negative trips
len(subset_vp.loc[subset_vp.actual_trip_duration_minutes < 0])

0

In [68]:
# subset_vp.loc[subset_vp._gtfs_dataset_name == "Unitrans Vehicle Positions"].sort_values('actual_trip_duration_minutes')

### Outer Merge
* Use an outer join in case there are trip_ids that appear in RT but not in schedule or vice versa
* More trips in scheduled than RT.

In [33]:
def merge_schedule_vp(vp_df, date: str):
    
    # Load scheduled trips
    scheduled_trips = sched_rt_utils.crosswalk_scheduled_trip_grouping_with_rt_key(analysis_date = date, 
    keep_trip_cols = ["feed_key", "trip_id", "service_hours"])
    
    # Convert trip hours to minutes
    scheduled_trips['scheduled_service_minutes'] = scheduled_trips.service_hours * 60
    
    # Merge
    merge1 = pd.merge(vp_df, scheduled_trips, how="outer", on=["gtfs_dataset_key", "trip_id"], indicator=True)
    
    # Metrics
    merge1 = merge1.assign(
    rt_data_proportion = ((merge1.actual_trip_duration_minutes/merge1.scheduled_service_minutes)*100).fillna(0),
    pings_per_minute = (merge1.trip_ping_count/merge1.actual_trip_duration_minutes).fillna(0))
    
    # Any proportion above 100, mask as 100
    merge1["rt_data_proportion"] = merge1["rt_data_proportion"].mask(merge1["rt_data_proportion"] > 100, 100)
    
        
    # Find number of trips for each operator
    trips_ops = (merge1
                 .groupby("gtfs_dataset_key")
                 .agg({'trip_id':'nunique'})
                 .reset_index()
                 .rename(columns = {'trip_id':'trips_by_operator'})
         )
    
    # Merge trips that appear for the operator in 
    # both scheduled and RT.
    merge2 = trips_ops.merge(merge1, how="inner", on = "gtfs_dataset_key")
    return merge2

In [34]:
merge1 = merge_schedule_vp(subset_vp, analysis_date)

In [35]:
merge1._merge.value_counts()

right_only    56259
both          40834
left_only         0
Name: _merge, dtype: int64

In [36]:
def final_df(vp_df, date: str):
    df = merge_schedule_vp(vp_df, date)
    
    df = df.drop(columns = ['_merge'])
    
    # Fill in 
    df = df.fillna(
    df.dtypes.replace({"float64": 0.0, "object": "None"}))
    
    # Round certain columns to eyeball
    for i in ['rt_data_proportion','actual_trip_duration_minutes']:
        df[f"rounded_{i}"] = ((df[i]/100)*10).astype(int)*10
    
    return df

In [38]:
final = final_df(subset_vp, analysis_date)

In [39]:
final.shape

(97093, 17)

### Charts

In [41]:
def summary_valid_trips_by_cutoff(df, time_col:str, phrase:str,
                                  time_cutoffs: list, ping_cutoffs: list,
                                  pings: bool = True):
    """
    Find percentage & number of trips that meet trip time elapsed
    and pings. 
    """
    final = pd.DataFrame()

    if pings:
        for t in time_cutoffs:
            for s in ping_cutoffs:
                valid = (
                df[(df[time_col] >= t) & (df.pings_per_minute >= s)]
                .groupby(["gtfs_dataset_key", "_gtfs_dataset_name", "trips_by_operator"])
                .trip_id.nunique()
                .reset_index()
                .rename(columns={"trip_id": "n_trips"})
                )
                
                valid = valid.assign(
                trip_cutoff=t, pings=s, cutoff=f"{t} {phrase} & {s} ping(s)+ per minute")

                final = pd.concat([final, valid], axis=0)
    else:
        for t in time_cutoffs:
            valid = (
                df[(df[time_col] >= t)]
                .groupby(["gtfs_dataset_key", "_gtfs_dataset_name", "trips_by_operator"])
                .trip_id.nunique()
                .reset_index()
                .rename(columns={"trip_id": "n_trips"})
                )

            valid = valid.assign(
                 trip_cutoff=t,cutoff=f"{t}{phrase}"
                )

            final = pd.concat([final, valid], axis=0)

    final = final.assign(percentage_usable_trips=final.n_trips.divide(final.trips_by_operator) * 100)

    return final

In [42]:
pings_trip_duration_ops = summary_valid_trips_by_cutoff(final, "actual_trip_duration_minutes", "+ min", [15,30,60,90,120], [1,2,3])

In [43]:
pings_trip_duration_ops.shape

(682, 8)

In [44]:
dropdown_list = pings_trip_duration_ops["_gtfs_dataset_name"].sort_values().unique().tolist()

In [45]:
dropdown = alt.binding_select(options=[None] + dropdown_list, labels = ['All'] + dropdown_list, name = "Operator")    
selection = alt.selection_single(fields=['Gtfs Dataset Name'], bind=dropdown)

In [46]:
pings_trip_duration_ops = threshold_utils.pre_clean(pings_trip_duration_ops)

In [47]:
threshold_utils.chart_size(
    threshold_utils.bar_chart(pings_trip_duration_ops, 'Percentage Usable Trips', "Cutoff",
['Gtfs Dataset Key', 'Gtfs Dataset Name','N Trips',
 'Cutoff', 'Percentage Usable Trips'], "Pings and RT Trip Time Elapsed").add_selection(selection).transform_filter(selection).interactive(), 500, 400)

In [57]:
# Check with just Unitrans
# len(pings_trip_duration_ops[(pings_trip_duration_ops["_gtfs_dataset_name"] == "Unitrans Vehicle Positions") & (pings_trip_duration_ops["pings_per_minute"] > 1) & (pings_trip_duration_ops["actual_trip_duration_minutes"] > 15)])

In [59]:
pings_trip_duration_ops[(pings_trip_duration_ops["Gtfs Dataset Name"] == "Unitrans Vehicle Positions")]

Unnamed: 0,Gtfs Dataset Key,Gtfs Dataset Name,Trips By Operator,N Trips,Trip Cutoff,Pings,Cutoff,Percentage Usable Trips
41,73ebf252af88c41aaf4426c6d3dc586c,Unitrans Vehicle Positions,860,254,15,1,15 + min & 1 ping(s)+ per minute,29.5
41,73ebf252af88c41aaf4426c6d3dc586c,Unitrans Vehicle Positions,860,247,15,2,15 + min & 2 ping(s)+ per minute,28.7
30,73ebf252af88c41aaf4426c6d3dc586c,Unitrans Vehicle Positions,860,6,15,3,15 + min & 3 ping(s)+ per minute,0.7
41,73ebf252af88c41aaf4426c6d3dc586c,Unitrans Vehicle Positions,860,89,30,1,30 + min & 1 ping(s)+ per minute,10.3
40,73ebf252af88c41aaf4426c6d3dc586c,Unitrans Vehicle Positions,860,83,30,2,30 + min & 2 ping(s)+ per minute,9.7
38,73ebf252af88c41aaf4426c6d3dc586c,Unitrans Vehicle Positions,860,24,60,1,60 + min & 1 ping(s)+ per minute,2.8
34,73ebf252af88c41aaf4426c6d3dc586c,Unitrans Vehicle Positions,860,22,60,2,60 + min & 2 ping(s)+ per minute,2.6


In [69]:
# final[(final["_gtfs_dataset_name"] == "Unitrans Vehicle Positions")][['_gtfs_dataset_name','actual_trip_duration_minutes','pings_per_minute']].sort_values('actual_trip_duration_minutes')          

In [51]:
rt_data_available = summary_valid_trips_by_cutoff(final,"rt_data_proportion", "%+", [20,40,60,80,100], [], False)

In [52]:
rt_data_available.shape

(349, 7)

In [53]:
rt_data_available = threshold_utils.pre_clean(rt_data_available)

In [54]:
threshold_utils.chart_size(threshold_utils.bar_chart(rt_data_available, 'Percentage Usable Trips', "Cutoff",
['Gtfs Dataset Key', 'Gtfs Dataset Name','Trip Cutoff', 'Percentage Usable Trips', 'N Trips'],
                          "Real Time vs. Scheduled Data Percentage").add_selection(selection).transform_filter(selection).interactive(), 500,400)

In [75]:
statewide = (pings_trip_duration_ops
             .groupby(['Cutoff',])
             .agg({'Trips By Operator':'max','N Trips':'max'})
             .reset_index()
            )

In [76]:
statewide['Percentage of Usable Trips'] = statewide['N Trips']/statewide['Trips By Operator']*100

In [77]:
statewide


Unnamed: 0,Cutoff,Trips By Operator,N Trips,Percentage of Usable Trips
0,120 + min & 1 ping(s)+ per minute,14196,907,6.39
1,120 + min & 2 ping(s)+ per minute,14196,883,6.22
2,120 + min & 3 ping(s)+ per minute,14196,107,0.75
3,15 + min & 1 ping(s)+ per minute,14196,6046,42.59
4,15 + min & 2 ping(s)+ per minute,14196,5876,41.39
5,15 + min & 3 ping(s)+ per minute,14196,1724,12.14
6,30 + min & 1 ping(s)+ per minute,14196,5893,41.51
7,30 + min & 2 ping(s)+ per minute,14196,5733,40.38
8,30 + min & 3 ping(s)+ per minute,14196,1515,10.67
9,60 + min & 1 ping(s)+ per minute,14196,5134,36.17
