## Scheduled vs. RT total trip time

In [1]:
import geopandas as gpd
import numpy as np
import pandas as pd
import dask.dataframe as dd
import datetime 
from calitp_data_analysis.sql import to_snakecase



In [2]:
# Notebook-wide pandas display settings: show up to 100 columns, every row,
# untruncated cell text, and floats formatted with two decimals.
pd.set_option("display.max_columns", 100)
pd.set_option("display.float_format", "{:.2f}".format)
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [3]:
import _threshold_utils as threshold_utils
import altair as alt
from segment_speed_utils import helpers, sched_rt_utils
from segment_speed_utils.project_vars import analysis_date, SEGMENT_GCS

In [4]:
import gcsfs
fs = gcsfs.GCSFileSystem()

In [5]:
import intake
catalog = intake.open_catalog("./catalog.yml")

### RT
#### Catalog

In [6]:
#rt_trips = catalog.rt_trip_diagnostics(analysis_date = analysis_date).read()

#### Original file from `b1_rt_trip_diagnostics`

In [7]:
# Load the partitioned `vp_route_segment_{analysis_date}/` dataset from the
# `vp_sjoin/` folder, requesting only the columns listed here, then
# repartition the result with a partition size of "85MB".
vp_columns = [
    "gtfs_dataset_key",
    "_gtfs_dataset_name",
    "trip_id",
    "route_dir_identifier",
    "location_timestamp",
]

ddf = helpers.import_vehicle_positions(
    gcs_folder=f"{SEGMENT_GCS}vp_sjoin/",
    file_name=f"vp_route_segment_{analysis_date}/",
    file_type="df",
    columns=vp_columns,
    partitioned=True,
).repartition(partition_size="85MB")

In [8]:
vp = ddf.compute()

In [9]:
vp.shape

(10668072, 5)

In [10]:
# Check for duplicates: de-duplicating on (trip_id, location_timestamp) alone
# leaves roughly 9,500 fewer rows than the full table.
len(vp.drop_duplicates(subset = ['trip_id','location_timestamp']))

10658575

In [11]:
# Number of rows that repeat an earlier (trip_id, location_timestamp) pair.
# Computed from `vp` directly rather than by subtracting row counts copied
# from earlier outputs, so the figure stays correct when the data changes.
vp.duplicated(subset=['trip_id', 'location_timestamp']).sum()

9497

In [12]:
# Same check with gtfs_dataset_key added to the key. In the recorded run this
# returns the full row count, i.e. every
# (gtfs_dataset_key, trip_id, location_timestamp) combination is unique and the
# repeats found above come from different datasets sharing a trip_id/timestamp.
len(vp.drop_duplicates(subset = ["gtfs_dataset_key",'trip_id','location_timestamp']))

10668072

In [13]:
vp.sample()

Unnamed: 0,gtfs_dataset_key,_gtfs_dataset_name,trip_id,route_dir_identifier,location_timestamp
9615417,4594028c6c1ebbb6dbfb4284d7510b4e,Foothill Vehicle Positions,t4D8-b27E6-sl5,3732307419,2023-01-18 21:46:12+00:00


In [29]:
def start_end_pings(df):
    """
    Summarize vehicle positions to one row per dataset-trip.

    Parameters
    ----------
    df : pd.DataFrame
        Vehicle positions with the columns `_gtfs_dataset_name`,
        `gtfs_dataset_key`, `trip_id` and `location_timestamp`.

    Returns
    -------
    pd.DataFrame
        One row per (`_gtfs_dataset_name`, `gtfs_dataset_key`, `trip_id`)
        with these columns, in this order:

        * `start` / `end`: earliest and latest `location_timestamp`
        * `trip_ping_count`: number of distinct `location_timestamp` values
        * `trips_by_operator`: number of distinct `trip_id` values for the
          row's (`_gtfs_dataset_name`, `gtfs_dataset_key`) pair
    """
    operator_cols = ["_gtfs_dataset_name", "gtfs_dataset_key"]
    group_cols = operator_cols + ["trip_id"]

    # Named aggregation yields flat, correctly labelled columns in one pass.
    # This avoids building a MultiIndex header and then patching the labels
    # by writing into `columns.values`, which mutates an Index in place.
    trip_summary = (
        df.groupby(group_cols)
        .agg(
            start=("location_timestamp", "min"),
            end=("location_timestamp", "max"),
            trip_ping_count=("location_timestamp", "nunique"),
        )
        .reset_index()
    )

    # Number of distinct trips for each operator (dataset name + key).
    trips_ops = (
        df.groupby(operator_cols)
        .agg(trips_by_operator=("trip_id", "nunique"))
        .reset_index()
    )

    # Attach the operator-level count to every trip row.
    return trip_summary.merge(trips_ops, how="inner", on=operator_cols)

In [30]:
vp2 = start_end_pings(vp)

In [31]:
vp2.sample()

Unnamed: 0,_gtfs_dataset_name,gtfs_dataset_key,trip_id,start,end,trip_ping_count,trips_by_operator
56658,Riverside VehiclePositions,f1497340b654f3ce925f8b8c2f83e1c2,1468020,2023-01-18 18:07:31+00:00,2023-01-18 19:39:42+00:00,183,1094


In [18]:
# No duplicates. 
len(vp2.drop_duplicates(subset = ['trip_id','start','end']))

69975

In [43]:
vp2.end.dt.tz, vp2.start.dt.tz

(<UTC>, <UTC>)

### Timezone
* Convert from UTC to Pacific, then drop the timezone so the timestamps are naive.
* Background: https://stackoverflow.com/questions/24281525/what-is-the-point-of-a-naive-datetime
* Related snippet: `df.select_dtypes(include=['datetime64'])`

In [45]:
def naive_timezone(df):
    """
    Convert UTC datetime columns to naive US/Pacific wall-clock time.

    Every column whose dtype is timezone-aware with a UTC timezone (at any
    resolution) is converted to "US/Pacific" and then stripped of its
    timezone. All other columns are left untouched.

    Parameters
    ----------
    df : pd.DataFrame
        Frame that may contain UTC-aware datetime columns.

    Returns
    -------
    pd.DataFrame
        A new frame; the input frame is not modified.
    """
    PACIFIC_TIMEZONE = "US/Pacific"

    # Match on the dtype class and its timezone instead of the exact string
    # 'datetime64[ns, UTC]', so UTC columns stored at other resolutions are
    # converted too rather than silently skipped.
    utc_cols = [
        col
        for col in df.columns
        if isinstance(df[col].dtype, pd.DatetimeTZDtype)
        and str(df[col].dtype.tz) == "UTC"
    ]

    converted = df.copy()
    for col in utc_cols:
        # tz_localize(None) drops the timezone while keeping the local
        # wall-clock time; it is the vectorised equivalent of calling
        # `.replace(tzinfo=None)` on each element.
        converted[col] = (
            converted[col].dt.tz_convert(PACIFIC_TIMEZONE).dt.tz_localize(None)
        )

    return converted

In [46]:
vp2 = naive_timezone(vp2)

In [47]:
vp2.end.dt.tz, vp2.start.dt.tz

(None, None)

In [48]:
def convert_timestamp_to_hrs_mins(
    df: pd.DataFrame, 
    timestamp_col: list,
    minutes: bool = True,
) -> pd.DataFrame: 
    """
    Add time-of-day columns derived from datetime columns.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the datetime columns.
    timestamp_col : list
        Names of the datetime columns to convert. A single column name
        given as a string is also accepted.
    minutes : bool, default True
        If True, add `<col>_minutes` (minutes since midnight, with the
        seconds as a fraction). If False, add `<col>_sec` (whole seconds
        since midnight).

    Returns
    -------
    pd.DataFrame
        A new frame with one extra column per entry in `timestamp_col`.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(timestamp_col, str):
        timestamp_col = [timestamp_col]

    for c in timestamp_col:
        hours = df[c].dt.hour
        mins = df[c].dt.minute
        secs = df[c].dt.second

        if minutes:
            df = df.assign(**{f"{c}_minutes": (hours * 60) + mins + (secs / 60)})
        else:
            # Index by the loop variable `c`; indexing by the whole
            # `timestamp_col` list returns a DataFrame, which has no `.dt`.
            df = df.assign(**{f"{c}_sec": (hours * 3_600) + (mins * 60) + secs})

    return df

In [49]:
vp2 = convert_timestamp_to_hrs_mins(vp2, ['start','end'])

In [50]:
len(vp2.drop_duplicates(subset = ['_gtfs_dataset_name','gtfs_dataset_key','trip_id','trip_ping_count','start_minutes','end_minutes']))

69975

In [51]:
len(vp2.drop_duplicates(subset = ['trip_id','start_minutes','end_minutes']))

69975

In [52]:
vp2.sample()

Unnamed: 0,_gtfs_dataset_name,gtfs_dataset_key,trip_id,start,end,trip_ping_count,trips_by_operator,start_minutes,end_minutes
34493,LA Metro Bus Vehicle Positions,7c48cc5675c9266fb6851070489c53e9,10014005020744-DEC22,2023-01-18 07:31:47,2023-01-18 09:32:09,362,13812,451.78,572.15


In [53]:
vp2._gtfs_dataset_name.nunique(), vp2.gtfs_dataset_key.nunique()

(70, 70)

### Subset trips to 6am to 8pm to test
* Keep only trips that start and end on the analysis date (January 18).
* Trips must start at or after 6AM.
* Trips must end by 8PM.

In [102]:
def filter_trips(df, date: str, start_time: str = "06:00", end_time: str = "20:00"):
    """
    Keep trips that run entirely on one date and inside a time-of-day window,
    and add each trip's observed duration.

    Parameters
    ----------
    df : pd.DataFrame
        One row per trip with naive datetime columns `start` and `end` and
        numeric columns `start_minutes` and `end_minutes`.
    date : str
        Date to keep, e.g. "2023-01-18". Anything `pd.Timestamp` can parse
        is accepted. Both `start` and `end` must fall on this date.
    start_time : str, default "06:00"
        Earliest allowed trip start, as a time of day.
    end_time : str, default "20:00"
        Latest allowed trip end, as a time of day.

    Returns
    -------
    pd.DataFrame
        The matching rows with a fresh 0..n-1 index and a new column
        `actual_trip_duration_minutes` (`end_minutes - start_minutes`).

    Notes
    -----
    The window is compared on "%H:%M" strings, i.e. at minute resolution,
    so a trip ending at 20:00:59 still passes the default `end_time`.
    """
    # Normalize inputs to the exact string formats used in the comparisons.
    date_str = pd.Timestamp(date).strftime("%Y-%m-%d")
    earliest = pd.to_datetime(start_time).strftime("%H:%M")
    latest = pd.to_datetime(end_time).strftime("%H:%M")

    # Filter by day
    on_date = (
        (df["start"].dt.strftime("%Y-%m-%d") == date_str)
        & (df["end"].dt.strftime("%Y-%m-%d") == date_str)
    )

    # Filter by time
    # https://stackoverflow.com/questions/65510529/how-to-filter-pandas-dataframe-by-time
    in_window = (
        (df["start"].dt.strftime("%H:%M") >= earliest)
        & (df["end"].dt.strftime("%H:%M") <= latest)
    )

    # drop=True discards the old index instead of materializing it as
    # 'index' / 'level_0' columns that then have to be dropped by name,
    # which fails when the index is named or an 'index' column exists.
    subset = df[on_date & in_window].reset_index(drop=True)

    # Find actual trip times
    return subset.assign(
        actual_trip_duration_minutes=subset["end_minutes"] - subset["start_minutes"]
    )

In [103]:
subset_vp = filter_trips(vp2, analysis_date)

In [57]:
# 5,000 less rows?? 
# len(m3) - len(m3.loc[(m3['start'].dt.strftime('%H:%M') >= time1 ) & (m3['end'].dt.strftime('%H:%M') <= time2)])

In [60]:
# No more negative trips
len(subset_vp.loc[subset_vp.actual_trip_duration_minutes < 0])

0

In [104]:
subset_vp.sample()

Unnamed: 0,_gtfs_dataset_name,gtfs_dataset_key,trip_id,start,end,trip_ping_count,trips_by_operator,start_minutes,end_minutes,actual_trip_duration_minutes
27023,LA Metro Bus Vehicle Positions,7c48cc5675c9266fb6851070489c53e9,10901000571457-DEC22,2023-01-18 14:37:26,2023-01-18 15:34:45,165,13812,877.43,934.75,57.32


### Outer Merge
* Use an outer join in case there are trip_ids that appear in RT but not in schedule or vice versa
* More trips in scheduled than RT.

In [105]:
def m_schedule_vp(vp_df, date: str):
    """
    Outer-merge RT trip summaries with scheduled trips for one date.

    Scheduled trips are loaded through
    `sched_rt_utils.crosswalk_scheduled_trip_grouping_with_rt_key`, keeping
    `feed_key`, `trip_id` and `service_hours`, and gain a
    `scheduled_service_minutes` column (`service_hours * 60`).

    The merge is on `gtfs_dataset_key` and `trip_id`, and the result carries
    a `_merge` indicator column showing which side each row came from.
    No derived metrics (e.g. RT-vs-schedule proportion, pings per minute)
    are computed here.
    """
    sched = sched_rt_utils.crosswalk_scheduled_trip_grouping_with_rt_key(
        analysis_date=date,
        keep_trip_cols=["feed_key", "trip_id", "service_hours"],
    )

    # Express scheduled service time in minutes.
    sched = sched.assign(scheduled_service_minutes=sched.service_hours * 60)

    # Outer join keeps trips present on only one side.
    return vp_df.merge(
        sched,
        how="outer",
        on=["gtfs_dataset_key", "trip_id"],
        indicator=True,
    )

In [106]:
merge1 = m_schedule_vp(subset_vp, analysis_date)

In [96]:
merge1._merge.value_counts()

right_only    56259
both          40834
left_only         0
Name: _merge, dtype: int64

In [97]:
merge1.sample()

Unnamed: 0,level_0,index,_gtfs_dataset_name,gtfs_dataset_key,trip_id,start,end,trip_ping_count,trips_by_operator,start_minutes,end_minutes,actual_trip_duration_minutes,feed_key,service_hours,scheduled_service_minutes,_merge
95562,,,,03cadbffef6f3cd4ce839b218bc65e82,3283532,NaT,NaT,,,,,,b9a0bc47bcf6590c67677f3af358f490,0.7,42.0,right_only


In [None]:
# Compare actual minutes recorded vs scheduled minutes
# NOTE(review): `m5` is not assigned in any visible cell, so this cell fails on
# a fresh kernel. Presumably it is an earlier name for the schedule/RT merge
# (`merge1`) — confirm and rename before running.
m5['rt_data_proportion'] = ((m5.actual_trip_duration_minutes/m5.scheduled_service_minutes)*100).fillna(0)

In [None]:
# Any proportion above 100, mask as 100
m5["rt_data_proportion"] = m5["rt_data_proportion"].mask(m5["rt_data_proportion"] > 100, 100)

In [None]:
m5.rt_data_proportion.value_counts().head(10)

In [None]:
# Calculate the number of pings per minute 
m5['pings_per_minute'] =  (m5.trip_ping_count/m5.actual_trip_duration_minutes).fillna(0)

In [None]:
m5 = m5.drop(columns = ['_merge'])

In [None]:
m5 = m5.fillna(
    m5.dtypes.replace({"float64": 0.0, "object": "None"})
)

In [None]:
# Any figures below 0, replace it with 0 
# m3.actual_trip_duration_minutes = m3.actual_trip_duration_minutes.clip(lower=0)

In [None]:
# Round certain cols for plotting
m5['rounded_rt_data_proportion'] = ((m5.rt_data_proportion/100)*10).astype(int)*10

In [None]:
m5['rounded_actual_trip_duration_minutes'] = ((m5.actual_trip_duration_minutes/100)*10).astype(int)*10

In [None]:
m5.gtfs_dataset_key.nunique(), m5._gtfs_dataset_name.nunique()

In [None]:
# NOTE(review): `m6` is not assigned in any visible cell. Later cells also
# expect it to carry a `total_trips` column, which no visible cell creates
# (`start_end_pings` produces `trips_by_operator`) — confirm how `m6` is built.
m6.rounded_rt_data_proportion.unique()

In [None]:
m6.drop_duplicates(subset = ['gtfs_dataset_key', '_gtfs_dataset_name'])[[ '_gtfs_dataset_name','total_trips']]

In [None]:
# A lot of trips have only scheduled data 
m6.rounded_actual_trip_duration_minutes.value_counts()

In [None]:
m6_names = set(m6._gtfs_dataset_name.unique().tolist())

In [None]:
# NOTE(review): `m1` is not assigned at notebook level in any visible cell
# (the name only appears as a local inside `start_end_pings`) — confirm which
# frame this comparison is meant to use.
m1_names = set(m1._gtfs_dataset_name.unique().tolist())

In [None]:
m6_names - m1_names

### Chart

In [None]:
def summary_valid_trips_by_cutoff(df, time_col:str, phrase:str,
                                  time_cutoffs: list, ping_cutoffs: list,
                                  pings: bool = True):
    """
    Find percentage & number of trips that meet trip time elapsed
    and pings.

    Parameters
    ----------
    df : pd.DataFrame
        One row per trip with `gtfs_dataset_key`, `_gtfs_dataset_name`,
        `total_trips`, `trip_id`, the column named by `time_col` and, when
        `pings` is True, `pings_per_minute`.
    time_col : str
        Column compared against each value in `time_cutoffs` (`>=`).
    phrase : str
        Text inserted after the time cutoff in the `cutoff` label.
    time_cutoffs : list
        Minimum values of `time_col` to evaluate.
    ping_cutoffs : list
        Minimum values of `pings_per_minute` to evaluate. Ignored when
        `pings` is False.
    pings : bool, default True
        If True, evaluate every (time cutoff, ping cutoff) combination;
        otherwise evaluate the time cutoffs alone.

    Returns
    -------
    pd.DataFrame
        One row per (`gtfs_dataset_key`, `_gtfs_dataset_name`, `total_trips`)
        group and cutoff combination that has at least one qualifying trip,
        with `n_trips` (distinct qualifying `trip_id`s), `trip_cutoff`,
        `pings` (only when `pings` is True), `cutoff` (label) and
        `percentage_usable_trips` (`n_trips / total_trips * 100`).
        An empty frame with these columns is returned when there are no
        cutoff combinations to evaluate.
    """
    group_cols = ["gtfs_dataset_key", "_gtfs_dataset_name", "total_trips"]

    def count_valid_trips(mask):
        # Distinct trips per operator among the rows selected by `mask`.
        return (
            df[mask]
            .groupby(group_cols)
            .trip_id.nunique()
            .reset_index()
            .rename(columns={"trip_id": "n_trips"})
        )

    # Collect one frame per cutoff combination and concatenate once at the
    # end. Appending inside the innermost loop ensures every ping cutoff is
    # kept, not only the last one for each time cutoff.
    frames = []
    for t in time_cutoffs:
        long_enough = df[time_col] >= t

        if pings:
            for s in ping_cutoffs:
                valid = count_valid_trips(long_enough & (df.pings_per_minute >= s))
                frames.append(
                    valid.assign(
                        trip_cutoff=t,
                        pings=s,
                        cutoff=f"{t} {phrase} & {s} ping(s)+ per minute",
                    )
                )
        else:
            valid = count_valid_trips(long_enough)
            frames.append(valid.assign(trip_cutoff=t, cutoff=f"{t}{phrase}"))

    # Nothing to evaluate (e.g. an empty cutoff list): return an empty frame
    # with the expected columns instead of failing on a missing `n_trips`.
    if not frames:
        label_cols = ["trip_cutoff", "pings", "cutoff"] if pings else ["trip_cutoff", "cutoff"]
        return pd.DataFrame(
            columns=group_cols + ["n_trips"] + label_cols + ["percentage_usable_trips"]
        )

    final = pd.concat(frames, axis=0)
    final = final.assign(percentage_usable_trips=final.n_trips.divide(final.total_trips) * 100)

    return final

In [None]:
valid1 = summary_valid_trips_by_cutoff(m6, "actual_trip_duration_minutes", "+ min", [15,30,60,90,120], [1,2,3])

In [None]:
valid1.shape

In [None]:
valid1.columns

In [None]:
dropdown_list = valid1["_gtfs_dataset_name"].sort_values().unique().tolist()

In [None]:
dropdown = alt.binding_select(options=[None] + dropdown_list, labels = ['All'] + dropdown_list, name = "Operator")    
selection = alt.selection_single(fields=['Gtfs Dataset Name'], bind=dropdown)

In [None]:
valid1 = threshold_utils.pre_clean(valid1)

In [None]:
threshold_utils.bar_chart(valid1, 'Percentage Usable Trips', "Cutoff",
['Gtfs Dataset Key', 'Gtfs Dataset Name','N Trips',
 'Cutoff', 'Percentage Usable Trips'], "Pings and RT Trip Time Elapsed").add_selection(selection).transform_filter(selection).interactive()

In [None]:
# Check with just Unitrans
len(m6[(m6["_gtfs_dataset_name"] == "Unitrans Vehicle Positions") & (m6["pings_per_minute"] > 1) & (m6["actual_trip_duration_minutes"] > 15)])

In [None]:
# m1[(m1["_gtfs_dataset_name"] == "Unitrans Vehicle Positions")]

In [None]:
# m6[(m6["_gtfs_dataset_name"] == "Unitrans Vehicle Positions")][['_gtfs_dataset_name','actual_trip_duration_minutes','pings_per_minute']].sort_values('actual_trip_duration_minutes')                                  

In [None]:
(m5[(m5["_gtfs_dataset_name"] == "Unitrans Vehicle Positions")])[['trip_id']].nunique()

In [None]:
valid1.loc[valid1['Gtfs Dataset Name'] == "Unitrans Vehicle Positions"]

In [None]:
valid2 = summary_valid_trips_by_cutoff(m6,"rt_data_proportion", "%+", [20,40,60,80,100], [], False)

In [None]:
valid2.shape

In [None]:
valid2 = threshold_utils.pre_clean(valid2)

In [None]:
threshold_utils.bar_chart(valid2, 'Percentage Usable Trips', "Cutoff",
['Gtfs Dataset Key', 'Gtfs Dataset Name','Trip Cutoff', 'Percentage Usable Trips', 'N Trips'],
                          "Real Time vs. Scheduled Data Percentage").add_selection(selection).transform_filter(selection).interactive()

In [None]:
# For each 'Cutoff' label, keep the largest 'Total Trips' value and the largest
# 'N Trips' value found among that cutoff's rows in `valid1`.
# NOTE(review): `max` selects a single largest value per cutoff rather than
# adding rows together; if a statewide total across operators is intended,
# confirm whether `sum` is the aggregation that is wanted here.
statewide = (valid1
             .groupby(['Cutoff',])
             .agg({'Total Trips':'max','N Trips':'max'})
             .reset_index()
            )

In [None]:
statewide['Percentage of Usable Trips'] = statewide['N Trips']/statewide['Total Trips']*100

In [None]:
statewide
