## Scheduled vs, RT total trip time

In [1]:
import geopandas as gpd
import numpy as np
import pandas as pd
import dask.dataframe as dd
import datetime 

from calitp_data_analysis.sql import to_snakecase
import _threshold_utils as threshold_utils
import altair as alt
from segment_speed_utils import helpers, sched_rt_utils
from segment_speed_utils.project_vars import analysis_date, SEGMENT_GCS



In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
import gcsfs
fs = gcsfs.GCSFileSystem()

In [4]:
import intake
catalog = intake.open_catalog("./catalog.yml")

### RT

#### Original file from `b1_rt_trip_diagnostics`
* vp._gtfs_dataset_name.nunique() == vp.gtfs_dataset_key.nunique() is true
* 10668072 rows 
*  len(vp.drop_duplicates(subset = ['trip_id','location_timestamp'])) produces 10658575 rows

In [5]:

ddf = helpers.import_vehicle_positions(
        gcs_folder = f"{SEGMENT_GCS}vp_sjoin/",
        file_name = f"vp_route_segment_{analysis_date}/",
        file_type = "df",
        columns = ["gtfs_dataset_key", "_gtfs_dataset_name", 
                   "trip_id", "route_dir_identifier",
                   "location_timestamp"],
        partitioned = True
    ).repartition(partition_size="85MB")


In [6]:
vp = ddf.compute()

In [7]:
# vp.shape

In [8]:
# Checkout duplicates - 9,500 less rows.
# len(vp.drop_duplicates(subset = ['trip_id','location_timestamp']))

In [9]:
# 10668072-10658575

In [10]:
# len(vp.drop_duplicates(subset = ["gtfs_dataset_key",'trip_id','location_timestamp']))

In [11]:
# vp.sample()

In [12]:
test1 = vp.groupby(['gtfs_dataset_key','_gtfs_dataset_name']).agg({'trip_id':'nunique'}).reset_index().rename(columns = {'trip_id':'rt_trips'})

In [13]:
# vp._gtfs_dataset_name.nunique() == vp.gtfs_dataset_key.nunique()

In [14]:
# vp._gtfs_dataset_name.nunique()

In [15]:
#vp[['_gtfs_dataset_name', 'gtfs_dataset_key']].drop_duplicates().sort_values('_gtfs_dataset_name').reset_index(drop = True)

In [17]:
def start_end_pings(gcs:str, date:str):
    """
    Find number of pings and 
    start and end time for a trip.
    
    gcs (str): file path
    date (str): the analysis date
    """
    # Load in file.
    ddf = helpers.import_vehicle_positions(
        gcs_folder = f"{gcs}vp_sjoin/",
        file_name = f"vp_route_segment_{date}/",
        file_type = "df",
        columns = ["gtfs_dataset_key", "_gtfs_dataset_name", 
                   "trip_id", "route_dir_identifier",
                   "location_timestamp"],
        partitioned = True
    ).repartition(partition_size="85MB")
    
    df = ddf.compute()
    
    group_cols = ["_gtfs_dataset_name", "gtfs_dataset_key", "trip_id"]
    
    # Find the max and min location stamp 
    # to determine start and end date
    start_end = (df.groupby(group_cols)
        .agg({"location_timestamp": [ "min", "max"]})
        .reset_index()
        .rename(columns = {'min':'start', 'max':'end'})
      ) 
    
    # Clean up columns
    start_end.columns = start_end.columns.droplevel()
    start_end.columns.values[0] = "_gtfs_dataset_name"
    start_end.columns.values[1] = "gtfs_dataset_key"
    start_end.columns.values[2] = "trip_id"
    
    # Find number of pings by 
    # counting nunique location_timestamps
    pings = (df.groupby(group_cols)
          .agg({'location_timestamp':'nunique'})
          .reset_index()
          .rename(columns = {'location_timestamp':'trip_ping_count'})
         )
    
    # Find number of trips for each operator
    #trips_ops = (df
    #             .groupby(["gtfs_dataset_key", "_gtfs_dataset_name"])
    #             .agg({'trip_id':'nunique'})
    #             .reset_index()
     #            .rename(columns = {'trip_id':'rt_trip_counts_by_operator'})
    #     )
    
    # Merge
    # m1 = (start_end
    #      .merge(pings, how="inner", on = group_cols)
    #      .merge(trips_ops, how = "inner", on = ["gtfs_dataset_key", "_gtfs_dataset_name"])
    #     )
    
    m1 = (start_end.merge(pings, how="inner", on = group_cols))
          
    return m1

In [18]:
vp2 = start_end_pings(SEGMENT_GCS, analysis_date)

In [19]:
# test1

In [21]:
# No duplicates. 
len(vp2.drop_duplicates(subset = ['trip_id','start','end']))

69975

In [22]:
vp2.end.dt.tz, vp2.start.dt.tz

(<UTC>, <UTC>)

### Timezone
* Convert from UTC to Pacific to Naive
* https://stackoverflow.com/questions/24281525/what-is-the-point-of-a-naive-datetimedf.select_dtypes(include=['datetime64'])

In [23]:
def naive_timezone(df):
    """
    Convert UTC to PST to Naive. 
    """
    PACIFIC_TIMEZONE = "US/Pacific"
    
    # Grab datetime columns
    dt_cols = [col for col in df.columns if df[col].dtype == 'datetime64[ns, UTC]']
    
    for col in dt_cols:
        df[col] = df[col].dt.tz_convert(PACIFIC_TIMEZONE).apply(lambda t: t.replace(tzinfo=None))
        
    return df 

In [24]:
vp2 = naive_timezone(vp2)

In [25]:
vp2.end.dt.tz, vp2.start.dt.tz

(None, None)

In [26]:
def convert_timestamp_to_hrs_mins(
    df: pd.DataFrame, 
    timestamp_col: list,
    minutes: bool = True,
) -> pd.DataFrame: 
    """
    Convert datetime col into minutes or seconds.
    """
    if minutes:
        for c in timestamp_col:
            df = df.assign(
                time_min = ((df[c].dt.hour * 60) + 
                                (df[c].dt.minute) + 
                                (df[c].dt.second/60)
                           ),
            ).rename(columns = {"time_min": f"{c}_minutes"})
    
    else:
        for c in timestamp_col:
            df = df.assign(
                time_sec = ((df[timestamp_col].dt.hour * 3_600) + 
                                (df[timestamp_col].dt.minute * 60) + 
                                (df[timestamp_col].dt.second)
                           ),
            ).rename(columns = {"time_sec": f"{timestamp_col}_sec"})

    return df

In [27]:
vp2 = convert_timestamp_to_hrs_mins(vp2, ['start','end'])

In [28]:
len(vp2.drop_duplicates(subset = ['_gtfs_dataset_name','gtfs_dataset_key','trip_id','trip_ping_count','start_minutes','end_minutes']))

69975

In [29]:
len(vp2.drop_duplicates(subset = ['trip_id','start_minutes','end_minutes']))

69975

In [30]:
vp2.sample(2)

Unnamed: 0,_gtfs_dataset_name,gtfs_dataset_key,trip_id,start,end,trip_ping_count,start_minutes,end_minutes
65052,San Diego Vehicle Positions,318c6bc87acccb67fb00aa9180aa5a12,16747571,2023-01-18 10:14:12,2023-01-18 10:58:34,123,614.2,658.57
13659,Bay Area 511 Muni VehiclePositions,644cf39285755e8ba2dc68597faa48aa,11084312_M11,2023-01-18 12:56:57,2023-01-18 13:20:27,68,776.95,800.45


In [31]:
vp2._gtfs_dataset_name.nunique(), vp2.gtfs_dataset_key.nunique()

(70, 70)

### Subset trips to 6am to 8pm to test - TEMP
* <b>Move actual trip duration & total operator trips portion away</b>
* Only have trips that appear on January 18. 
* Trips that start after 6AM
* Trips that end at 8PM.

In [39]:
def filter_trips(df, date:str):
    
    # Filter by day
    df = df[(df['start'].dt.strftime('%Y-%m-%d') == date) & (df['end'].dt.strftime('%Y-%m-%d') == date)].reset_index()
    
    # Filter by time. Keep trips that begin at  6am and end before 8pm.
    # https://stackoverflow.com/questions/65510529/how-to-filter-pandas-dataframe-by-time
    time1 = pd.to_datetime(f'{date}  06:00:00').strftime('%H:%M')
    time2 = pd.to_datetime(f'{date} 20:00:00').strftime('%H:%M')
    df = df.loc[(df['start'].dt.strftime('%H:%M') >= time1 ) & (df['end'].dt.strftime('%H:%M') <= time2)].reset_index()
    
    # Find actual trip times
    df['actual_trip_duration_minutes'] = (df['end_minutes']-df['start_minutes'])
    
    df = df.drop(columns = ['level_0', 'index'])
    
    # Find trip by operators
    trips_ops = (df
                .groupby(["gtfs_dataset_key", "_gtfs_dataset_name"])
                .agg({'trip_id':'nunique'})
                .reset_index()
                .rename(columns = {'trip_id':'rt_trip_counts_by_operator'})
        )
    m1 = (df.merge(trips_ops, how="inner", on =  ["gtfs_dataset_key", "_gtfs_dataset_name"]))
    
    return m1 

In [41]:
subset_vp = filter_trips(vp2, analysis_date)

In [42]:
# 5,000 less rows?? 
# len(m3) - len(m3.loc[(m3['start'].dt.strftime('%H:%M') >= time1 ) & (m3['end'].dt.strftime('%H:%M') <= time2)])

In [43]:
# No more negative trips
len(subset_vp.loc[subset_vp.actual_trip_duration_minutes < 0])

0

In [44]:
# subset_vp.loc[subset_vp._gtfs_dataset_name == "Unitrans Vehicle Positions"].sort_values('actual_trip_duration_minutes')

### Deliverable 1
* A script to generate a parquet table within GCS bucket rt_vs_schedule that holds the counts and proportions for each gtfs_dataset_key-feed_key-trip_id (use an outer join in case there are trip_ids that appear in RT but not in schedule or vice versa
* Use an outer join in case there are trip_ids that appear in RT but not in schedule or vice versa (https://github.com/cal-itp/data-analyses/issues/668).
* More trips in scheduled than RT.

In [45]:
 scheduled_trips = sched_rt_utils.crosswalk_scheduled_trip_grouping_with_rt_key(analysis_date = analysis_date, 
    keep_trip_cols = ["feed_key", "trip_id", "service_hours"])

In [46]:
scheduled_trips['scheduled_service_minutes'] = scheduled_trips.service_hours * 60

In [47]:
test2 = scheduled_trips.groupby(['gtfs_dataset_key','feed_key']).agg({'trip_id':'nunique'}).reset_index().rename(columns = {'trip_id':'scheduled_trips'})

In [48]:
# pd.merge(test1,test2, on = "gtfs_dataset_key", how = "outer", indicator = True).sort_values('_gtfs_dataset_name')

In [49]:
def merge_schedule_vp(vp_df, date: str):

    # Load scheduled trips
    scheduled_trips = sched_rt_utils.crosswalk_scheduled_trip_grouping_with_rt_key(analysis_date = date, 
    keep_trip_cols = ["feed_key", "trip_id", "service_hours"])
    
    # Convert trip hours to minutes
    scheduled_trips['scheduled_service_minutes'] = scheduled_trips.service_hours * 60
    
    # Merge scheduled with RT. 
    merge1 = pd.merge(vp_df, scheduled_trips, how="outer", on=["gtfs_dataset_key", "trip_id"], indicator=True)
    
    # Metrics
    # Find RT trip time versus scheduled trip time.
    # Find pings per minute.
    merge1 = merge1.assign(
        rt_data_proportion = ((merge1.actual_trip_duration_minutes/merge1.scheduled_service_minutes)*100).fillna(0),
        pings_per_minute = (merge1.trip_ping_count/merge1.actual_trip_duration_minutes).fillna(0))
    
    # Any proportion above 100, mask as 100
    merge1["rt_data_proportion"] = merge1["rt_data_proportion"].mask(merge1["rt_data_proportion"] > 100, 100)
    
    return merge1

* Another function to return a final dataframe because it would be useful to do a `value_counts()` on the `_merge` column

In [50]:
def final_df(vp_df, date: str):
    df = merge_schedule_vp(vp_df, date)
    
    df = df.drop(columns = ['_merge'])
    
    # Fill in 
    df = df.fillna(
    df.dtypes.replace({"float64": 0.0, "object": "None"}))
    
    # Round certain columns to eyeball
    for i in ['rt_data_proportion','actual_trip_duration_minutes']:
        df[f"rounded_{i}"] = ((df[i]/100)*10).astype(int)*10
    
    return df

In [51]:
final = final_df(subset_vp, analysis_date)

In [52]:
final.shape

(97093, 17)

In [53]:
final.sample(10)

Unnamed: 0,_gtfs_dataset_name,gtfs_dataset_key,trip_id,start,end,trip_ping_count,start_minutes,end_minutes,actual_trip_duration_minutes,rt_trip_counts_by_operator,feed_key,service_hours,scheduled_service_minutes,rt_data_proportion,pings_per_minute,rounded_rt_data_proportion,rounded_actual_trip_duration_minutes
6621,Bay Area 511 Muni VehiclePositions,644cf39285755e8ba2dc68597faa48aa,11069585_M11,2023-01-18 12:17:19,2023-01-18 13:41:14,233.0,737.32,821.23,83.92,4794.0,57d7a160e4588225238b330da8453912,0.9,54.0,100.0,2.78,100,80
38221,StanRTA Vehicle Positions,e70b24e4d4931a9ada95fc76a2b53e9b,t4DD-b208-sl4,2023-01-18 12:34:13,2023-01-18 13:08:46,100.0,754.22,788.77,34.55,667.0,263ac2199429f2a891344436af8426ac,0.42,25.0,100.0,2.89,100,30
41687,,8f0b9daf627c4f17cbe7f4460208a1af,1336,datetime64[ns],datetime64[ns],0.0,0.0,0.0,0.0,0.0,4a182b5062ec22cfe557a219817e9c7e,0.97,58.0,0.0,0.0,0,0
32711,Riverside VehiclePositions,f1497340b654f3ce925f8b8c2f83e1c2,618020,2023-01-18 06:50:34,2023-01-18 08:30:19,199.0,410.57,510.32,99.75,614.0,9520692b76037537fb6c14d126542292,1.27,76.0,100.0,1.99,100,90
30406,OCTA VehiclePositions,0f68ec4d97d179570b494c74b6fca55d,10684513,2023-01-18 10:54:55,2023-01-18 12:04:51,207.0,654.92,724.85,69.93,1663.0,c2678c0ae3a6821a47f3cab62911c33d,1.03,62.0,100.0,2.96,100,60
32922,SBMTD VehiclePositions,c5b2bf3815a12d5f7242923ed399d1ea,1266040,2023-01-18 09:32:39,2023-01-18 09:43:54,18.0,572.65,583.9,11.25,645.0,a82c3d9bef4918ba063956ed0e65dfc5,0.28,17.0,66.18,1.6,60,10
30606,OCTA VehiclePositions,0f68ec4d97d179570b494c74b6fca55d,10692493,2023-01-18 12:30:17,2023-01-18 13:35:12,182.0,750.28,815.2,64.92,1663.0,c2678c0ae3a6821a47f3cab62911c33d,0.98,59.0,100.0,2.8,100,60
59829,,927430a256c4d23e37cf417362df9e7f,1082290,datetime64[ns],datetime64[ns],0.0,0.0,0.0,0.0,0.0,008d5112a7e531d0562d26e34d77869d,1.1,66.0,0.0,0.0,0,0
53125,,73ebf252af88c41aaf4426c6d3dc586c,V_3_outbound_2040,datetime64[ns],datetime64[ns],0.0,0.0,0.0,0.0,0.0,af4596a68387e4782b16160441ccde23,0.12,7.0,0.0,0.0,0,0
73281,,c1f526828c8d4bba1c71bf647443f534,b0b08cf9-d7b0-47ee-9f7f-9d061f662503:26,datetime64[ns],datetime64[ns],0.0,0.0,0.0,0.0,0.0,6980032e857182465ae8fccd86458940,0.2,12.0,0.0,0.0,0,0


### Deliverable 2 - Charts

In [54]:
def summary_valid_trips_by_cutoff(df, time_col:str, phrase:str,
                                  time_cutoffs: list, ping_cutoffs: list,
                                  pings: bool = True):
    """
    Find percentage & number of trips that meet trip time elapsed
    and pings. 
    """
    group_by_cols = ["gtfs_dataset_key", "_gtfs_dataset_name", "rt_trip_counts_by_operator"]
    final = pd.DataFrame()

    if pings:
        for t in time_cutoffs:
            for s in ping_cutoffs:
                valid = (
                df[(df[time_col] >= t) & (df.pings_per_minute >= s)]
                .groupby(group_by_cols)
                .trip_id.nunique()
                .reset_index()
                .rename(columns={"trip_id": "n_trips"})
                )
                
                valid = valid.assign(
                trip_cutoff=t, pings=s, cutoff=f"{t} {phrase} & {s} ping(s)+ per minute")

                final = pd.concat([final, valid], axis=0)
    else:
        for t in time_cutoffs:
            valid = (
                df[(df[time_col] >= t)]
                .groupby(group_by_cols)
                .trip_id.nunique()
                .reset_index()
                .rename(columns={"trip_id": "n_trips"})
                )

            valid = valid.assign(
                 trip_cutoff=t,cutoff=f"{t}{phrase}"
                )

            final = pd.concat([final, valid], axis=0)

    final = final.assign(percentage_usable_trips=final.n_trips.divide(final.rt_trip_counts_by_operator) * 100)
    final = final.sort_values('_gtfs_dataset_name')
    return final

In [82]:
def create_valid_stats(df):
    # Prepare valid stats
    pings_trip_duration_ops = summary_valid_trips_by_cutoff(df, "actual_trip_duration_minutes", "+ min", [15,30,60], [1,2,3])
    rt_data_available = summary_valid_trips_by_cutoff(df,"rt_data_proportion", "%+", [20,40,60,80,100], [1,2,3], True)
    
     # Clean df
    pings_trip_duration_ops = threshold_utils.pre_clean(pings_trip_duration_ops)
    rt_data_available = threshold_utils.pre_clean(rt_data_available)
    
    return pings_trip_duration_ops,rt_data_available

In [83]:
def operator_visuals(df):
    """
    Return one chart for trip time & # of pings
    and another chart for RT vs Scheduled
    data
    """
    pings_trip_duration_ops,rt_data_available = create_valid_stats(df)
    
    # Prepare dropdown menu
    dropdown_list = pings_trip_duration_ops["Gtfs Dataset Name"].sort_values().unique().tolist()
    dropdown = alt.binding_select(options=[None] + dropdown_list, labels = ['All'] + dropdown_list, name = "Operator")    
    selection = alt.selection_single(fields=['Gtfs Dataset Name'], bind=dropdown)
    
    # Create charts
    pings_chart = threshold_utils.bar_chart(pings_trip_duration_ops, 'Percentage Usable Trips', "Cutoff",
    ['Gtfs Dataset Key', 'Gtfs Dataset Name','N Trips',
    'Cutoff', 'Percentage Usable Trips'], "Pings and RT Trip Time Elapsed") 
    
    rt_chart = threshold_utils.bar_chart(rt_data_available, 'Percentage Usable Trips', "Cutoff", 
                                         ['Gtfs Dataset Key', 'Gtfs Dataset Name','Trip Cutoff', 'Percentage Usable Trips', 'N Trips'],
                          "Real Time vs. Scheduled Data Percentage")
    
    # Clean Charts
    pings_chart = threshold_utils.chart_size(pings_chart.add_selection(selection).transform_filter(selection).interactive(), 500, 400)
    rt_chart = threshold_utils.chart_size(rt_chart.add_selection(selection).transform_filter(selection).interactive(), 500, 400)
    
    return pings_chart & rt_chart

In [84]:
operator_visuals(final)

* Double check with Unitrans

In [85]:
len(final[(final["_gtfs_dataset_name"] == "Unitrans Vehicle Positions") & (final["pings_per_minute"] > 1) & (final["actual_trip_duration_minutes"] > 15)])

254

In [86]:
len(final.loc[final._gtfs_dataset_name == "Unitrans Vehicle Positions"])

423

In [87]:
# final.loc[final._gtfs_dataset_name == "Unitrans Vehicle Positions"][['actual_trip_duration_minutes', 'rt_data_proportion','pings_per_minute']].sort_values('actual_trip_duration_minutes')

In [88]:
# final[(final["_gtfs_dataset_name"] == "Unitrans Vehicle Positions") & (final["pings_per_minute"] > 1) & (final["actual_trip_duration_minutes"] > 15)]

* Check with Redding

In [89]:
# final[(final["_gtfs_dataset_name"] == "Redding Vehicle Positions")].shape

In [90]:
# final[(final["_gtfs_dataset_name"] == "Redding Vehicle Positions") & (final["rt_data_proportion"] < 20)]

In [91]:
# len(final[(final["_gtfs_dataset_name"] == "Redding Vehicle Positions") & (final["rt_data_proportion"] > 20)])

#### Statewide

In [92]:
def statewide_visuals(df):
    # Prepare valid stats
    pings_trip_duration_ops,rt_data_available = create_valid_stats(df)
    
    # Summarize to statewide level
    def statewide(df): 
        summary = (df
             .groupby(['Cutoff',])
             .agg({'Rt Trip Counts By Operator':'max','N Trips':'max'})
             .reset_index()
             .rename(columns = {'Rt Trip Counts By Operator':'All Trips'})
            )
        
        summary['Percentage of Usable Trips'] = summary['N Trips']/summary['All Trips']*100
        return summary
    
    # Aggregate valid stats to statewide level
    statewide_pings = statewide(pings_trip_duration_ops)
    statewide_rt = statewide(rt_data_available)
    
    # Charts
    pings_chart = threshold_utils.bar_chart(statewide_pings,  'Percentage of Usable Trips', "Cutoff", statewide_pings.columns.tolist(), "Statewide Pings and RT Trip Time Elapsed") 
    rt_chart = threshold_utils.bar_chart(statewide_rt,  'Percentage of Usable Trips', "Cutoff",statewide_rt.columns.tolist(), "Statewide RT vs. Scheduled Data") 
    
     # Clean Charts
    pings_chart = threshold_utils.chart_size(pings_chart.interactive(), 500, 400)
    rt_chart = threshold_utils.chart_size(rt_chart.interactive(), 500, 400)
    
    return pings_chart & rt_chart

In [93]:
statewide_visuals(final)