## Scheduled vs, RT total trip time

In [1]:
import geopandas as gpd
import numpy as np
import pandas as pd
import dask.dataframe as dd
import datetime 

from calitp_data_analysis.sql import to_snakecase
import _threshold_utils as threshold_utils
import altair as alt
from segment_speed_utils import helpers, sched_rt_utils
from segment_speed_utils.project_vars import SEGMENT_GCS, analysis_date



In [2]:
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
import gcsfs
fs = gcsfs.GCSFileSystem()

In [4]:
import intake
catalog = intake.open_catalog("./catalog.yml")

In [5]:
# analysis_date = "2023-01-18"

### RT

#### Original file from `b1_rt_trip_diagnostics`
* vp._gtfs_dataset_name.nunique() == vp.gtfs_dataset_key.nunique() is true
* 10668072 rows 
*  len(vp.drop_duplicates(subset = ['trip_id','location_timestamp'])) produces 10658575 rows

In [6]:
"""
ddf = helpers.import_vehicle_positions(
        gcs_folder = f"{SEGMENT_GCS}vp_sjoin/",
        file_name = f"vp_route_segment_{analysis_date}/",
        file_type = "df",
        columns = ["gtfs_dataset_key", "_gtfs_dataset_name", 
                   "trip_id", "route_dir_identifier",
                   "location_timestamp"],
        partitioned = True
    ).repartition(partition_size="85MB")
"""

'\nddf = helpers.import_vehicle_positions(\n        gcs_folder = f"{SEGMENT_GCS}vp_sjoin/",\n        file_name = f"vp_route_segment_{analysis_date}/",\n        file_type = "df",\n        columns = ["gtfs_dataset_key", "_gtfs_dataset_name", \n                   "trip_id", "route_dir_identifier",\n                   "location_timestamp"],\n        partitioned = True\n    ).repartition(partition_size="85MB")\n'

In [7]:
# vp = ddf.compute()

In [8]:
# vp["location_timestamp"] = pd.to_datetime(vp["location_timestamp"])

In [9]:
# vp.info()

In [10]:
def count_rt_min(df):
    # Extract hour and minute
    df['hour'] = df.location_timestamp.dt.hour
    df['minute'] = df.location_timestamp.dt.minute
    
    # Drop duplicates
    df = df.drop_duplicates(subset = ['gtfs_dataset_key', 'trip_id','hour', 'minute']).reset_index(drop = True)
    
    # Count # of rows
    df = (df
          .groupby(['gtfs_dataset_key','trip_id'])
          .agg({'hour':'count'})
          .reset_index()
          .rename(columns = {'hour':'total_rt_min_coverage'})
         )
    return df

In [11]:
# Extract hour-minute
#vp['hour'] = vp.location_timestamp.dt.hour
# vp['minute'] = vp.location_timestamp.dt.minute

In [12]:
# vp.sample(3)

In [13]:
# test = vp.drop_duplicates(subset = ['gtfs_dataset_key', 'trip_id','hour', 'minute']).reset_index(drop = True)

In [14]:
# len(vp) - len(test)

In [15]:
# agg = test.groupby(['gtfs_dataset_key','trip_id']).agg({'hour':'count'}).reset_index()

In [16]:
def find_start_end_times(df, groupby_cols:list):
    """
    Find the max and min location stamp 
    to determine start and end date. 
    """
    start_end = (df.groupby(groupby_cols)
        .agg({"location_timestamp": [ "min", "max"]})
        .reset_index()
        .rename(columns = {'min':'start', 'max':'end'})
      ) 
    
    # Clean up columns
    start_end.columns = start_end.columns.droplevel()
    start_end.columns.values[0] = "_gtfs_dataset_name"
    start_end.columns.values[1] = "gtfs_dataset_key"
    start_end.columns.values[2] = "trip_id"
    
    return start_end

In [17]:
# test.shape

In [18]:
def start_end_pings(gcs:str, date:str):
    """
    Find number of pings and 
    start and end time for a trip.
    
    gcs (str): file path
    date (str): the analysis date
    """
    group_cols = ["_gtfs_dataset_name", "gtfs_dataset_key", "trip_id"]
    
    # Load in file.
    ddf = helpers.import_vehicle_positions(
        gcs_folder = f"{gcs}vp_sjoin/",
        file_name = f"vp_route_segment_{date}/",
        file_type = "df",
        columns = ["gtfs_dataset_key", "_gtfs_dataset_name", 
                   "trip_id", "route_dir_identifier",
                   "location_timestamp"],
        partitioned = True
    ).repartition(partition_size="85MB")
    
    df = ddf.compute()
    
    # Find number of RT minutes
    rt_min_avail = count_rt_min(df)
    
    # Find start and end time for a trip
    start_end = find_start_end_times(df, group_cols)
    
    # Find number of pings by 
    # counting nunique location_timestamps
    pings = (df.groupby(group_cols)
          .agg({'location_timestamp':'nunique'})
          .reset_index()
          .rename(columns = {'location_timestamp':'trip_ping_count'})
         )
    
    # Find number of trips for each operator
    #trips_ops = (df
    #             .groupby(["gtfs_dataset_key", "_gtfs_dataset_name"])
    #             .agg({'trip_id':'nunique'})
    #             .reset_index()
     #            .rename(columns = {'trip_id':'rt_trip_counts_by_operator'})
    #     )
    
    # Merge
    # m1 = (start_end
    #      .merge(pings, how="inner", on = group_cols)
    #      .merge(trips_ops, how = "inner", on = ["gtfs_dataset_key", "_gtfs_dataset_name"])
    #     )
    
    m1 = (start_end.merge(pings, how="inner", on = group_cols)
                   .merge(rt_min_avail, how = "inner", on = ['gtfs_dataset_key','trip_id'])
         )
          
    return m1

In [19]:
vp2 = start_end_pings(SEGMENT_GCS, analysis_date)

In [20]:
vp2.end.dt.tz, vp2.start.dt.tz

(<UTC>, <UTC>)

In [21]:
vp2.shape

(72302, 7)

In [22]:
vp2.sample()

Unnamed: 0,_gtfs_dataset_name,gtfs_dataset_key,trip_id,start,end,trip_ping_count,total_rt_min_coverage
22135,Bay Area 511 Santa Clara Transit VehiclePositions,03cadbffef6f3cd4ce839b218bc65e82,3293567,2023-02-16 03:11:06+00:00,2023-02-16 03:52:38+00:00,125,42


In [23]:
# Extract hour-minute
vp2['start_hour'] = vp2.start.dt.hour
vp2['start_minute'] = vp2.start.dt.minute

In [24]:
vp2['combo'] = vp2.gtfs_dataset_key + vp2.trip_id

In [25]:
vp2.shape

(72302, 10)

In [26]:
vp2.sample()

Unnamed: 0,_gtfs_dataset_name,gtfs_dataset_key,trip_id,start,end,trip_ping_count,total_rt_min_coverage,start_hour,start_minute,combo
61034,San Diego Vehicle Positions,318c6bc87acccb67fb00aa9180aa5a12,16875333,2023-02-15 16:03:04+00:00,2023-02-15 16:43:01+00:00,74,40,16,3,318c6bc87acccb67fb00aa9180aa5a1216875333


In [27]:
test = vp2.drop_duplicates(subset = ['gtfs_dataset_key', 'trip_id','start_hour','start_minute']).reset_index(drop = True)

In [28]:
test.loc[test.trip_id == '339040']

Unnamed: 0,_gtfs_dataset_name,gtfs_dataset_key,trip_id,start,end,trip_ping_count,total_rt_min_coverage,start_hour,start_minute,combo


In [29]:
agg = test.groupby(['gtfs_dataset_key','trip_id']).agg({'start_hour':'count'})

In [30]:
# test2 = vp2[['gtfs_dataset_key', 'trip_id','start_hour','start_minute']].drop_duplicates(subset = ['gtfs_dataset_key', 'trip_id','start_hour','start_minute']).reset_index(drop = True)

In [31]:
# test2.shape

In [32]:
# test3 = vp2.drop_duplicates(subset = ['combo','start_hour','start_minute']).reset_index(drop = True)

In [33]:
# test4 = vp2[['combo','start_hour','start_minute']].drop_duplicates(subset = ['combo','start_hour','start_minute']).reset_index(drop = True)

In [34]:
# test4.shape

In [35]:
test3.shape

NameError: name 'test3' is not defined

### Timezone
* Convert from UTC to Pacific to Naive
* https://stackoverflow.com/questions/24281525/what-is-the-point-of-a-naive-datetimedf.select_dtypes(include=['datetime64'])

In [36]:
def naive_timezone(df):
    """
    Convert UTC to PST to Naive. 
    """
    PACIFIC_TIMEZONE = "US/Pacific"
    
    # Grab datetime columns
    dt_cols = [col for col in df.columns if df[col].dtype == 'datetime64[ns, UTC]']
    
    for col in dt_cols:
        df[col] = df[col].dt.tz_convert(PACIFIC_TIMEZONE).apply(lambda t: t.replace(tzinfo=None))
        
    return df 

In [37]:
vp2 = naive_timezone(vp2)

In [38]:
vp2.end.dt.tz, vp2.start.dt.tz

(None, None)

In [39]:
vp2.sample()

Unnamed: 0,_gtfs_dataset_name,gtfs_dataset_key,trip_id,start,end,trip_ping_count,total_rt_min_coverage,start_hour,start_minute,combo
21721,Bay Area 511 Santa Clara Transit VehiclePositions,03cadbffef6f3cd4ce839b218bc65e82,3288351,2023-02-15 11:10:24,2023-02-15 13:18:44,383,128,19,10,03cadbffef6f3cd4ce839b218bc65e823288351


In [40]:
def convert_timestamp_to_hrs_mins(
    df: pd.DataFrame, 
    timestamp_col: list,
    minutes: bool = True,
) -> pd.DataFrame: 
    """
    Convert datetime col into minutes or seconds.
    """
    if minutes:
        for c in timestamp_col:
            df = df.assign(
                time_min = ((df[c].dt.hour * 60) + 
                                (df[c].dt.minute) + 
                                (df[c].dt.second/60)
                           ),
            ).rename(columns = {"time_min": f"{c}_minutes"})
    
    else:
        for c in timestamp_col:
            df = df.assign(
                time_sec = ((df[timestamp_col].dt.hour * 3_600) + 
                                (df[timestamp_col].dt.minute * 60) + 
                                (df[timestamp_col].dt.second)
                           ),
            ).rename(columns = {"time_sec": f"{timestamp_col}_sec"})

    return df

In [41]:
vp2 = convert_timestamp_to_hrs_mins(vp2, ['start','end'])

In [42]:
vp2._gtfs_dataset_name.nunique(), vp2.gtfs_dataset_key.nunique()

(72, 72)

### Subset trips to 6am to 8pm to test - TEMP
* Should subsetting occur before changing to the naive time zone?
* <b>Move actual trip duration & total operator trips portion away</b>
* Only have trips that appear on January 18. 
* Trips that start after 6AM
* Trips that end at 8PM.

In [43]:
def filter_trips(df, date:str):
    
    # Filter by day
    df = df[(df['start'].dt.strftime('%Y-%m-%d') == date) & (df['end'].dt.strftime('%Y-%m-%d') == date)].reset_index()
    
    # Filter by time. Keep trips that begin at  6am and end before 8pm.
    # https://stackoverflow.com/questions/65510529/how-to-filter-pandas-dataframe-by-time
    time1 = pd.to_datetime(f'{date}  06:00:00').strftime('%H:%M')
    time2 = pd.to_datetime(f'{date} 20:00:00').strftime('%H:%M')
    df = df.loc[(df['start'].dt.strftime('%H:%M') >= time1 ) & (df['end'].dt.strftime('%H:%M') <= time2)].reset_index()
    
    # Find actual trip times
    df['actual_trip_duration_minutes'] = (df['end_minutes']-df['start_minutes'])
    
    df = df.drop(columns = ['level_0', 'index'])
    
    # Find trip by operators
    trips_ops = (df
                .groupby(["gtfs_dataset_key", "_gtfs_dataset_name"])
                .agg({'trip_id':'nunique'})
                .reset_index()
                .rename(columns = {'trip_id':'rt_trip_counts_by_operator'})
        )
    m1 = (df.merge(trips_ops, how="inner", on =  ["gtfs_dataset_key", "_gtfs_dataset_name"]))
    
    return m1 

In [44]:
subset_vp = filter_trips(vp2, analysis_date)

In [45]:
# No more negative trips
len(subset_vp.loc[subset_vp.actual_trip_duration_minutes < 0])

0

### Deliverable 1
* A script to generate a parquet table within GCS bucket rt_vs_schedule that holds the counts and proportions for each gtfs_dataset_key-feed_key-trip_id (use an outer join in case there are trip_ids that appear in RT but not in schedule or vice versa
* Use an outer join in case there are trip_ids that appear in RT but not in schedule or vice versa (https://github.com/cal-itp/data-analyses/issues/668).
* More trips in scheduled than RT.

In [46]:
def merge_schedule_vp(vp_df, date: str):

    # Load scheduled trips
    scheduled_trips = sched_rt_utils.crosswalk_scheduled_trip_grouping_with_rt_key(analysis_date = date, 
    keep_trip_cols = ["feed_key", "trip_id", "service_hours"])
    
    # Convert trip hours to minutes
    scheduled_trips['scheduled_service_minutes'] = scheduled_trips.service_hours * 60
    
    # Merge scheduled with RT. 
    merge1 = pd.merge(vp_df, scheduled_trips, how="outer", on=["gtfs_dataset_key", "trip_id"], indicator=True)
    
    return merge1

* Another function to return a final dataframe because it would be useful to do a `value_counts()` on the `_merge` column

In [47]:
def final_df(vp_df, date: str):
    df = merge_schedule_vp(vp_df, date).drop(columns = ['_merge'])
    
    # Find RT trip time versus scheduled trip time.
    # Find pings per minute.
    df = df.assign(
        rt_data_proportion_percentage = ((df.total_rt_min_coverage/df.scheduled_service_minutes)*100).fillna(0),
        pings_per_minute = (df.trip_ping_count/df.total_rt_min_coverage).fillna(0))
    
    # Any proportion above 100, mask as 100
    df["rt_data_proportion_percentage"] = df["rt_data_proportion_percentage"].mask(df["rt_data_proportion_percentage"] > 100, 100)
  
    # Fill in 
    df = df.fillna(
    df.dtypes.replace({"float64": 0.0, "object": "None"}))
    
    # Round certain columns to eyeball
    for i in ['rt_data_proportion_percentage','actual_trip_duration_minutes']:
        df[f"rounded_{i}"] = ((df[i]/100)*10).astype(int)*10
    
    df = df.drop(columns = ['start','end', 'service_hours'])
    return df

In [48]:
final = final_df(subset_vp, analysis_date)

In [49]:
final.shape

(98105, 18)

In [50]:
# final.rounded_rt_data_proportion.value_counts()

### Deliverable 2 - Charts 

In [51]:
def summary_valid_trips_by_cutoff(df, time_col:str, phrase:str,
                                  time_cutoffs: list, ping_cutoffs: list,
                                  pings: bool = True):
    """
    Find percentage & number of trips that meet trip time elapsed
    and pings. 
    """
    group_by_cols = ["gtfs_dataset_key", "_gtfs_dataset_name", "rt_trip_counts_by_operator"]
    final = pd.DataFrame()

    if pings:
        for t in time_cutoffs:
            for s in ping_cutoffs:
                valid = (
                df[(df[time_col] >= t) & (df.pings_per_minute >= s)]
                .groupby(group_by_cols)
                .trip_id.nunique()
                .reset_index()
                .rename(columns={"trip_id": "n_trips"})
                )
                
                valid = valid.assign(
                trip_cutoff=t, pings=s, cutoff=f"{t} {phrase} & {s} ping(s)+ per minute")

                final = pd.concat([final, valid], axis=0)
    else:
        for t in time_cutoffs:
            valid = (
                df[(df[time_col] >= t)]
                .groupby(group_by_cols)
                .trip_id.nunique()
                .reset_index()
                .rename(columns={"trip_id": "n_trips"})
                )

            valid = valid.assign(
                 trip_cutoff=t,cutoff=f"{t}{phrase}"
                )

            final = pd.concat([final, valid], axis=0)

    final = final.assign(percentage_usable_trips=final.n_trips.divide(final.rt_trip_counts_by_operator) * 100)
    final = final.sort_values('_gtfs_dataset_name')
    return final

In [52]:
rt_scheduled = (final
                .groupby(['_gtfs_dataset_name','rounded_rt_data_proportion_percentage'])
                .agg({'rt_trip_counts_by_operator':'max','trip_id':'nunique'})
                .reset_index()
                .rename(columns = {'trip_id':'total_trips'})
               )

In [53]:
rt_scheduled['percentage_of_trips'] = rt_scheduled.total_trips/rt_scheduled.rt_trip_counts_by_operator * 100

In [54]:
dropdown_list = rt_scheduled["_gtfs_dataset_name"].sort_values().unique().tolist()
dropdown = alt.binding_select(options=[None] + dropdown_list, labels = ['All'] + dropdown_list, name = "Operator")    
selection = alt.selection_single(fields=['_gtfs_dataset_name'], bind=dropdown)

In [55]:
threshold_utils.bar_chart(rt_scheduled,  "percentage_of_trips",'rounded_rt_data_proportion_percentage',
    rt_scheduled.columns.tolist(), "% of RT Minutes vs Scheduled Trip").add_selection(selection).transform_filter(selection) 

In [56]:
def trip_duration_categories(row):
    if row.actual_trip_duration_minutes < 30:
        return "Trip < 30 minutes"
    if 31 < row.actual_trip_duration_minutes < 61:
        return "Trip < 60 minutes"
    if 61 < row.actual_trip_duration_minutes < 91:
        return "Trip < 1.5 hours"
    else:
        return "Trip > 1.5 hours"

In [57]:
final["trip_duration"] = final.apply(trip_duration_categories, axis=1)

In [58]:
final.columns

Index(['_gtfs_dataset_name', 'gtfs_dataset_key', 'trip_id', 'trip_ping_count',
       'total_rt_min_coverage', 'start_hour', 'start_minute', 'combo',
       'start_minutes', 'end_minutes', 'actual_trip_duration_minutes',
       'rt_trip_counts_by_operator', 'feed_key', 'scheduled_service_minutes',
       'rt_data_proportion_percentage', 'pings_per_minute',
       'rounded_rt_data_proportion_percentage',
       'rounded_actual_trip_duration_minutes', 'trip_duration'],
      dtype='object')

In [60]:
rt_trip_time = (final
                .groupby(['_gtfs_dataset_name','trip_duration','rounded_rt_data_proportion_percentage'])
                .agg({'rt_trip_counts_by_operator':'max','trip_id':'nunique'})
                .reset_index()
                .rename(columns = {'trip_id':'total_trips'})
               )

In [61]:
rt_trip_time.shape

(1329, 5)

In [62]:
rt_trip_time.head()

Unnamed: 0,_gtfs_dataset_name,trip_duration,rounded_rt_data_proportion_percentage,rt_trip_counts_by_operator,total_trips
0,Anaheim Resort VehiclePositions,Trip < 1.5 hours,100,336.0,22
1,Anaheim Resort VehiclePositions,Trip < 30 minutes,0,336.0,2
2,Anaheim Resort VehiclePositions,Trip < 30 minutes,10,336.0,1
3,Anaheim Resort VehiclePositions,Trip < 30 minutes,20,336.0,1
4,Anaheim Resort VehiclePositions,Trip < 30 minutes,30,336.0,4


In [64]:
rt_trip_time['categories'] = rt_trip_time.trip_duration + ' ' + rt_trip_time.rounded_rt_data_proportion_percentage.astype(str) + '% of RT vs. Scheduled'

* Percentages are wrong.

In [65]:
rt_trip_time['percentage_of_trips'] = rt_trip_time.total_trips/rt_scheduled.rt_trip_counts_by_operator * 100

In [68]:
rt_trip_time.head(10)

Unnamed: 0,_gtfs_dataset_name,trip_duration,rounded_rt_data_proportion_percentage,rt_trip_counts_by_operator,total_trips,categories,percentage_of_trips
0,Anaheim Resort VehiclePositions,Trip < 1.5 hours,100,336.0,22,Trip < 1.5 hours 100% of RT vs. Scheduled,6.55
1,Anaheim Resort VehiclePositions,Trip < 30 minutes,0,336.0,2,Trip < 30 minutes 0% of RT vs. Scheduled,0.6
2,Anaheim Resort VehiclePositions,Trip < 30 minutes,10,336.0,1,Trip < 30 minutes 10% of RT vs. Scheduled,0.3
3,Anaheim Resort VehiclePositions,Trip < 30 minutes,20,336.0,1,Trip < 30 minutes 20% of RT vs. Scheduled,0.3
4,Anaheim Resort VehiclePositions,Trip < 30 minutes,30,336.0,4,Trip < 30 minutes 30% of RT vs. Scheduled,1.19
5,Anaheim Resort VehiclePositions,Trip < 30 minutes,40,336.0,3,Trip < 30 minutes 40% of RT vs. Scheduled,0.89
6,Anaheim Resort VehiclePositions,Trip < 30 minutes,50,336.0,1,Trip < 30 minutes 50% of RT vs. Scheduled,0.3
7,Anaheim Resort VehiclePositions,Trip < 30 minutes,60,336.0,1,Trip < 30 minutes 60% of RT vs. Scheduled,0.3
8,Anaheim Resort VehiclePositions,Trip < 30 minutes,70,336.0,1,Trip < 30 minutes 70% of RT vs. Scheduled,0.3
9,Anaheim Resort VehiclePositions,Trip < 30 minutes,80,336.0,7,Trip < 30 minutes 80% of RT vs. Scheduled,2.08


In [None]:
threshold_utils.bar_chart(rt_trip_time,  "percentage_of_trips",'categories',
    rt_trip_time.columns.tolist(), "% of RT Minutes vs Scheduled Trip").add_selection(selection).transform_filter(selection) 

#### Scratch

In [None]:
def create_valid_stats(df):
    # Prepare valid stats
    pings_trip_duration_ops = summary_valid_trips_by_cutoff(df, "actual_trip_duration_minutes", "+ min", [15,30,60], [1,2,3])
    rt_data_available = summary_valid_trips_by_cutoff(df,"rt_data_proportion", "%+", [20,40,60,80,100], [1,2,3], True)
    
     # Clean df
    pings_trip_duration_ops = threshold_utils.pre_clean(pings_trip_duration_ops)
    rt_data_available = threshold_utils.pre_clean(rt_data_available)
    
    return pings_trip_duration_ops,rt_data_available

In [None]:
def operator_visuals(df):
    """
    Return one chart for trip time & # of pings
    and another chart for RT vs Scheduled
    data
    """
    pings_trip_duration_ops,rt_data_available = create_valid_stats(df)
    
    # Prepare dropdown menu
    dropdown_list = pings_trip_duration_ops["Gtfs Dataset Name"].sort_values().unique().tolist()
    dropdown = alt.binding_select(options=[None] + dropdown_list, labels = ['All'] + dropdown_list, name = "Operator")    
    selection = alt.selection_single(fields=['Gtfs Dataset Name'], bind=dropdown)
    
    # Create charts
    pings_chart = threshold_utils.bar_chart(pings_trip_duration_ops, 'Percentage Usable Trips', "Cutoff",
    ['Gtfs Dataset Key', 'Gtfs Dataset Name','N Trips',
    'Cutoff', 'Percentage Usable Trips'], "Pings and RT Trip Time Elapsed") 
    
    rt_chart = threshold_utils.bar_chart(rt_data_available, 'Percentage Usable Trips', "Cutoff", 
                                         ['Gtfs Dataset Key', 'Gtfs Dataset Name','Trip Cutoff', 'Percentage Usable Trips', 'N Trips'],
                          "Real Time vs. Scheduled Data Percentage")
    
    # Clean Charts
    pings_chart = threshold_utils.chart_size(pings_chart.add_selection(selection).transform_filter(selection).interactive(), 500, 400)
    rt_chart = threshold_utils.chart_size(rt_chart.add_selection(selection).transform_filter(selection).interactive(), 500, 400)
    
    return pings_chart & rt_chart

In [None]:
# operator_visuals(final)

* Double check with Unitrans

In [None]:
# len(final[(final["_gtfs_dataset_name"] == "Unitrans Vehicle Positions") & (final["pings_per_minute"] > 1) & (final["actual_trip_duration_minutes"] > 15)])

In [None]:
# len(final.loc[final._gtfs_dataset_name == "Unitrans Vehicle Positions"])

In [None]:
# final.loc[final._gtfs_dataset_name == "Unitrans Vehicle Positions"][['actual_trip_duration_minutes', 'rt_data_proportion','pings_per_minute']].sort_values('actual_trip_duration_minutes')

In [None]:
# final[(final["_gtfs_dataset_name"] == "Unitrans Vehicle Positions") & (final["pings_per_minute"] > 1) & (final["actual_trip_duration_minutes"] > 15)]

* Check with Redding

In [None]:
# final[(final["_gtfs_dataset_name"] == "Redding Vehicle Positions")].shape

In [None]:
# final[(final["_gtfs_dataset_name"] == "Redding Vehicle Positions") & (final["rt_data_proportion"] < 20)]

In [None]:
# len(final[(final["_gtfs_dataset_name"] == "Redding Vehicle Positions") & (final["rt_data_proportion"] > 20)])

#### Statewide

In [None]:
def statewide_visuals(df):
    # Prepare valid stats
    pings_trip_duration_ops,rt_data_available = create_valid_stats(df)
    
    # Summarize to statewide level
    def statewide(df): 
        summary = (df
             .groupby(['Cutoff',])
             .agg({'Rt Trip Counts By Operator':'max','N Trips':'max'})
             .reset_index()
             .rename(columns = {'Rt Trip Counts By Operator':'All Trips'})
            )
        
        summary['Percentage of Usable Trips'] = summary['N Trips']/summary['All Trips']*100
        return summary
    
    # Aggregate valid stats to statewide level
    statewide_pings = statewide(pings_trip_duration_ops)
    statewide_rt = statewide(rt_data_available)
    
    # Charts
    pings_chart = threshold_utils.bar_chart(statewide_pings,  'Percentage of Usable Trips', "Cutoff", statewide_pings.columns.tolist(), "Statewide Pings and RT Trip Time Elapsed") 
    rt_chart = threshold_utils.bar_chart(statewide_rt,  'Percentage of Usable Trips', "Cutoff",statewide_rt.columns.tolist(), "Statewide RT vs. Scheduled Data") 
    
     # Clean Charts
    pings_chart = threshold_utils.chart_size(pings_chart.interactive(), 500, 400)
    rt_chart = threshold_utils.chart_size(rt_chart.interactive(), 500, 400)
    
    return pings_chart & rt_chart

In [None]:
# statewide_visuals(final)