## Scheduled vs. RT total trip time

In [1]:
import geopandas as gpd
import numpy as np
import pandas as pd
import dask.dataframe as dd
import datetime 

from calitp_data_analysis.sql import to_snakecase
import _threshold_utils as threshold_utils
import altair as alt
from shared_utils import calitp_color_palette as cp
from segment_speed_utils import helpers, sched_rt_utils
from segment_speed_utils.project_vars import SEGMENT_GCS, analysis_date



In [2]:
# Notebook display settings: show up to 100 columns, format floats with
# two decimals, and do not truncate rows or column contents.
pd.options.display.max_columns = 100
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [3]:
import gcsfs
fs = gcsfs.GCSFileSystem()

In [4]:
import intake
catalog = intake.open_catalog("./catalog.yml")

### RT

#### Original file from `b1_rt_trip_diagnostics`

In [5]:
"""
ddf = helpers.import_vehicle_positions(
        gcs_folder = f"{SEGMENT_GCS}vp_sjoin/",
        file_name = f"vp_route_segment_{analysis_date}/",
        file_type = "df",
        columns = ["gtfs_dataset_key", "_gtfs_dataset_name", 
                   "trip_id", "route_dir_identifier",
                   "location_timestamp"],
        partitioned = True
    ).repartition(partition_size="85MB")
"""

'\nddf = helpers.import_vehicle_positions(\n        gcs_folder = f"{SEGMENT_GCS}vp_sjoin/",\n        file_name = f"vp_route_segment_{analysis_date}/",\n        file_type = "df",\n        columns = ["gtfs_dataset_key", "_gtfs_dataset_name", \n                   "trip_id", "route_dir_identifier",\n                   "location_timestamp"],\n        partitioned = True\n    ).repartition(partition_size="85MB")\n'

In [6]:
# vp = ddf.compute()

In [7]:
# vp["location_timestamp"] = pd.to_datetime(vp["location_timestamp"])

In [8]:
# vp.info()

In [9]:
def count_rt_min(df):
    """
    Find total RT minute coverage
    for each trip-operator.

    Coverage is the number of distinct (hour, minute) pairs in which
    a trip has at least one vehicle position.

    Args:
        df: vehicle positions with `gtfs_dataset_key`, `trip_id` and a
            datetime-typed `location_timestamp` column. It is not modified.

    Returns:
        DataFrame with one row per gtfs_dataset_key-trip_id and the
        column `total_rt_min_coverage`.
    """
    trip_cols = ['gtfs_dataset_key', 'trip_id']
    timestamps = df.location_timestamp

    # Build hour/minute on a new frame so the caller's dataframe
    # does not gain helper columns as a side effect.
    minutes_seen = (df[trip_cols]
                    .assign(hour = timestamps.dt.hour,
                            minute = timestamps.dt.minute)
                    .drop_duplicates()
                   )

    # Each remaining row is one distinct minute with RT data,
    # so counting rows per trip gives the minutes of coverage.
    coverage = (minutes_seen
                .groupby(trip_cols)
                .agg({'hour':'count'})
                .reset_index()
                .rename(columns = {'hour':'total_rt_min_coverage'})
               )
    return coverage

In [10]:
# Extract hour-minute
#vp['hour'] = vp.location_timestamp.dt.hour
# vp['minute'] = vp.location_timestamp.dt.minute

In [11]:
# vp.sample(3)

In [12]:
# test = vp.drop_duplicates(subset = ['gtfs_dataset_key', 'trip_id','hour', 'minute']).reset_index(drop = True)

In [13]:
# len(vp) - len(test)

In [14]:
# agg = test.groupby(['gtfs_dataset_key','trip_id']).agg({'hour':'count'}).reset_index()

In [15]:
def find_start_end_times(df, groupby_cols:list):
    """
    Find the min and max location timestamp
    to determine the start and end time of each group.

    Args:
        df: vehicle positions with a `location_timestamp` column
            plus every column named in `groupby_cols`.
        groupby_cols: columns that identify one group (e.g. one trip).

    Returns:
        DataFrame with `groupby_cols` followed by `start` (earliest
        timestamp) and `end` (latest timestamp).
    """
    # Named aggregation yields flat, correctly labelled columns for
    # whatever keys are passed in, so no positional renaming is needed.
    start_end = (df.groupby(groupby_cols)
        .agg(start = ("location_timestamp", "min"),
             end = ("location_timestamp", "max"))
        .reset_index()
      )

    return start_end

In [16]:
# test.shape

### Timezone
* Convert from UTC to Pacific to Naive
* https://stackoverflow.com/questions/24281525/what-is-the-point-of-a-naive-datetime
* Related snippet: `df.select_dtypes(include=['datetime64'])`

In [17]:
def naive_timezone(df):
    """
    Convert UTC datetime columns to Pacific wall-clock time,
    then drop the timezone so the values are naive.

    Every timezone-aware column whose timezone is UTC is converted;
    all other columns are left alone. The columns are replaced on
    `df` itself, and `df` is also returned.
    """
    PACIFIC_TIMEZONE = "US/Pacific"

    # Grab UTC datetime columns, whatever their resolution
    dt_cols = [
        col for col in df.columns
        if isinstance(df[col].dtype, pd.DatetimeTZDtype)
        and str(df[col].dtype.tz) == "UTC"
    ]

    for col in dt_cols:
        # tz_localize(None) keeps the local wall time and removes the
        # timezone for the whole column at once.
        df[col] = df[col].dt.tz_convert(PACIFIC_TIMEZONE).dt.tz_localize(None)

    return df

In [18]:
# vp2 = naive_timezone(vp2)

In [19]:
# vp2.end.dt.tz, vp2.start.dt.tz

In [20]:
# vp2.sample()

In [21]:
def convert_timestamp_to_hrs_mins(
    df: pd.DataFrame, 
    timestamp_col: list,
    minutes: bool = True,
) -> pd.DataFrame: 
    """
    Convert datetime col(s) into minutes or seconds since midnight.

    Args:
        df: dataframe holding the datetime columns.
        timestamp_col: list of datetime column names; a single
            column name given as a string is also accepted.
        minutes: if True add `<col>_minutes` (fractional minutes),
            otherwise add `<col>_sec` (whole seconds).

    Returns:
        A new dataframe with one added column per timestamp column.
    """
    # Allow a bare column name as well as a list of names
    if isinstance(timestamp_col, str):
        cols = [timestamp_col]
    else:
        cols = list(timestamp_col)

    for c in cols:
        parts = df[c].dt
        if minutes:
            converted = {
                f"{c}_minutes": (parts.hour * 60) + parts.minute + (parts.second / 60)
            }
        else:
            converted = {
                f"{c}_sec": (parts.hour * 3_600) + (parts.minute * 60) + parts.second
            }
        df = df.assign(**converted)

    return df

In [22]:
# vp2 = convert_timestamp_to_hrs_mins(vp2, ['start','end'])

In [23]:
# vp2._gtfs_dataset_name.nunique(), vp2.gtfs_dataset_key.nunique()

In [24]:
def find_metrics(gcs:str, date:str):
    """
    Build one row of RT metrics per trip: first and last timestamp,
    number of pings, and minutes of RT coverage.

    gcs (str): file path
    date (str): the analysis date

    Returns a dataframe keyed by operator name, gtfs_dataset_key and
    trip_id, with naive start/end timestamps and their
    minutes-since-midnight equivalents.
    """
    trip_keys = ["_gtfs_dataset_name", "gtfs_dataset_key", "trip_id"]
    vp_columns = ["gtfs_dataset_key", "_gtfs_dataset_name", 
                  "trip_id", "route_dir_identifier",
                  "location_timestamp"]

    # Read the partitioned vehicle positions and bring them into memory
    partitioned_vp = helpers.import_vehicle_positions(
        gcs_folder = f"{gcs}vp_sjoin/",
        file_name = f"vp_route_segment_{date}/",
        file_type = "df",
        columns = vp_columns,
        partitioned = True
    )
    vp = partitioned_vp.repartition(partition_size="85MB").compute()

    # Minutes of RT data available per trip
    coverage = count_rt_min(vp)

    # First and last timestamp per trip
    trip_span = find_start_end_times(vp, trip_keys)

    # Pings per trip = number of distinct location timestamps
    ping_counts = (vp.groupby(trip_keys)["location_timestamp"]
                   .nunique()
                   .reset_index(name = "trip_ping_count")
                  )

    # NOTE: trip counts per operator are computed later, in filter_trips.

    # Combine the three per-trip tables
    with_pings = trip_span.merge(ping_counts, how = "inner", on = trip_keys)
    metrics = with_pings.merge(coverage, how = "inner",
                               on = ['gtfs_dataset_key','trip_id'])

    # Drop the timezone, then express start/end as minutes
    metrics = naive_timezone(metrics)
    metrics = convert_timestamp_to_hrs_mins(metrics, ['start','end'])

    return metrics

In [25]:
vp2 = find_metrics(SEGMENT_GCS, analysis_date)

In [26]:
vp2.end.dt.tz, vp2.start.dt.tz

(None, None)

In [27]:
vp2.shape

(72302, 9)

### Subset trips to 6am to 8pm to test - TEMP
* Should subsetting occur before changing to the naive time zone?
* <b>Move the actual trip duration & total operator trips portions into `find_metrics` once the time zone question is answered.</b>
* Only keep trips that both start and end on the analysis date (January 18).
* Only keep trips that start at or after 6 AM.
* Only keep trips that end at or before 8 PM.

In [28]:
def filter_trips(df, date:str, start_time:str = "06:00", end_time:str = "20:00"):
    """
    Keep trips that start and end on `date` within a time window,
    then add the actual trip duration and the number of trips
    per operator.

    Args:
        df: one row per trip with `start`/`end` datetimes,
            `start_minutes`/`end_minutes`, `gtfs_dataset_key`,
            `_gtfs_dataset_name` and `trip_id`.
        date: day to keep, formatted as YYYY-MM-DD.
        start_time: earliest allowed trip start (default 6am).
        end_time: latest allowed trip end (default 8pm).

    Returns:
        The filtered trips with `actual_trip_duration_minutes` and
        `rt_trip_counts_by_operator` added.
    """
    operator_cols = ["gtfs_dataset_key", "_gtfs_dataset_name"]

    # Normalize the bounds to zero-padded HH:MM so that comparing
    # them as strings orders them correctly.
    earliest = pd.to_datetime(f'{date} {start_time}').strftime('%H:%M')
    latest = pd.to_datetime(f'{date} {end_time}').strftime('%H:%M')

    # Filter by day: the trip must both start and end on `date`
    on_date = ((df['start'].dt.strftime('%Y-%m-%d') == date)
               & (df['end'].dt.strftime('%Y-%m-%d') == date))
    same_day = df[on_date]

    # Filter by time. Comparison is at minute precision, so a trip
    # ending at 20:00:59 still counts as ending at 20:00.
    # https://stackoverflow.com/questions/65510529/how-to-filter-pandas-dataframe-by-time
    in_window = ((same_day['start'].dt.strftime('%H:%M') >= earliest)
                 & (same_day['end'].dt.strftime('%H:%M') <= latest))
    trips = same_day[in_window].reset_index(drop = True)

    # Find actual trip times
    trips = trips.assign(
        actual_trip_duration_minutes = trips['end_minutes'] - trips['start_minutes']
    )

    # Find trips by operator
    trips_ops = (trips
                .groupby(operator_cols)
                .agg({'trip_id':'nunique'})
                .reset_index()
                .rename(columns = {'trip_id':'rt_trip_counts_by_operator'})
        )
    m1 = trips.merge(trips_ops, how = "inner", on = operator_cols)

    return m1 

In [29]:
subset_vp = filter_trips(vp2, analysis_date)

In [30]:
# No more negative trips
len(subset_vp.loc[subset_vp.actual_trip_duration_minutes < 0])

0

### Deliverable 1
* A script to generate a parquet table within GCS bucket rt_vs_schedule that holds the counts and proportions for each gtfs_dataset_key-feed_key-trip_id.
* Use an outer join in case there are trip_ids that appear in RT but not in schedule or vice versa (https://github.com/cal-itp/data-analyses/issues/668).
* More trips in scheduled than RT.

In [31]:
def merge_schedule_vp(vp_df, date: str):
    """
    Outer-join the RT trip metrics with the scheduled trips for `date`.

    Scheduled service hours are also expressed in minutes
    (`scheduled_service_minutes`). The result keeps the `_merge`
    indicator column so callers can see which side each trip came from.
    """
    join_keys = ["gtfs_dataset_key", "trip_id"]

    # Scheduled trips, already tagged with the RT gtfs_dataset_key
    schedule = sched_rt_utils.crosswalk_scheduled_trip_grouping_with_rt_key(
        analysis_date = date, 
        keep_trip_cols = ["feed_key", "trip_id", "service_hours"],
    )

    # Hours -> minutes, to match the units of the RT metrics
    schedule = schedule.assign(
        scheduled_service_minutes = schedule.service_hours * 60
    )

    # Outer join keeps trips that appear on only one side
    return vp_df.merge(schedule, how="outer", on=join_keys, indicator=True)

* Another function to return a final dataframe because it could be useful to do a `value_counts()` on the `_merge` column

In [32]:
def trip_duration_categories(row):
    """
    Label a trip by its actual duration.

    The bins are contiguous and follow whole minutes, so the
    fractional part of the duration is ignored: 30.9 falls in
    "0 - 30 minutes" and 60.6 in "31-60 minutes". Anything from
    91 minutes up (and a missing duration, which fails every
    comparison) is labelled "90 + minutes".

    row: a record exposing `actual_trip_duration_minutes`.
    """
    duration = row.actual_trip_duration_minutes

    # Upper bounds are exclusive and one minute past each label's
    # end, which leaves no gap between neighbouring bins.
    if duration < 31:
        return "0 - 30 minutes"
    elif duration < 61:
        return "31-60 minutes"
    elif duration < 91:
        return "61-90 minutes"
    else:
        return "90 + minutes"

In [33]:
def rt_data_proportion(row):
    """
    Label a trip by the share of its scheduled time covered by RT data.

    The bins are contiguous and follow whole percentage points, so
    the fractional part is ignored: 20.5 falls in "0-20%" and 40.9
    in "21-40%". Anything from 81 up (and a missing value, which
    fails every comparison) is labelled "81-100%".

    row: a record exposing `rt_data_proportion_percentage`.
    """
    pct = row.rt_data_proportion_percentage

    # Upper bounds are exclusive and one point past each label's
    # end, which leaves no gap between neighbouring bins.
    if pct < 21:
        return "0-20%"
    elif pct < 41:
        return "21-40%"
    elif pct < 61:
        return "41-60%"
    elif pct < 81:
        return "61-80%"
    else:
        return "81-100%"

In [34]:
def final_df(vp_df, date: str):
    """
    Combine RT trip metrics with the schedule and derive the
    columns used by the charts.

    vp_df: RT trip metrics (the output of `filter_trips`).
    date (str): the analysis date, passed on to `merge_schedule_vp`.

    Adds `rt_data_proportion_percentage` (capped at 100),
    `pings_per_minute`, `rounded_*` versions of the proportion and
    the actual trip duration, plus the `trip_category` and
    `rt_category` labels. `start`, `end` and `service_hours`
    are dropped from the result.
    """
    # The merge indicator is only useful for inspection, so drop it here.
    df = merge_schedule_vp(vp_df, date).drop(columns = ['_merge'])
    
    # Find RT trip time versus scheduled trip time.
    # Find pings per minute.
    # Rows missing either side of a ratio produce NaN, which becomes 0.
    df = df.assign(
        rt_data_proportion_percentage = ((df.total_rt_min_coverage/df.scheduled_service_minutes)*100).fillna(0),
        pings_per_minute = (df.trip_ping_count/df.total_rt_min_coverage).fillna(0))
    
    # Any proportion above 100, mask as 100
    df["rt_data_proportion_percentage"] = df["rt_data_proportion_percentage"].mask(df["rt_data_proportion_percentage"] > 100, 100)
  
    # Fill in missing values column by column: 0.0 for float64 columns
    # and the string "None" for object columns.
    # NOTE(review): columns of any other dtype receive their dtype object
    # as the fill value - confirm this behaves as intended on the pandas
    # version in use.
    df = df.fillna(
    df.dtypes.replace({"float64": 0.0, "object": "None"}))
    
    # Round certain columns to eyeball.
    # This truncates each value down to a multiple of 10 (74.33 -> 70).
    for i in ['rt_data_proportion_percentage','actual_trip_duration_minutes']:
        df[f"rounded_{i}"] = (((df[i]/100)*10).astype(int)*10)
        # .astype(str) + '%'
        
    # Categorize actual trip duration
    df["trip_category"] = df.apply(trip_duration_categories, axis=1)
    
    # Categorize RT vs. scheduled coverage
    df["rt_category"] = df.apply(rt_data_proportion, axis=1)
    
    # Clean df
    df = df.drop(columns = ['start','end', 'service_hours'])
    
    return df

In [35]:
final = final_df(subset_vp, analysis_date)

In [36]:
final.shape

(98105, 17)

In [37]:
final.sample(10)

Unnamed: 0,_gtfs_dataset_name,gtfs_dataset_key,trip_id,trip_ping_count,total_rt_min_coverage,start_minutes,end_minutes,actual_trip_duration_minutes,rt_trip_counts_by_operator,feed_key,scheduled_service_minutes,rt_data_proportion_percentage,pings_per_minute,rounded_rt_data_proportion_percentage,rounded_actual_trip_duration_minutes,trip_category,rt_category
23177,Foothill Vehicle Positions,4594028c6c1ebbb6dbfb4284d7510b4e,t3F2-b15FA9-sl5,134.0,73.0,613.07,687.4,74.33,1482.0,ac6628ccaa0137f5fe3663210b87c4e8,79.0,92.41,1.84,90,70,61-90 minutes,81-100%
68390,,318c6bc87acccb67fb00aa9180aa5a12,16877334,0.0,0.0,0.0,0.0,0.0,0.0,a7ba6f075198e9bf9152fab6c7faf0f6,50.0,0.0,0.0,0,0,0 - 30 minutes,0-20%
59491,,7c9fed045db42e3243ed76d228978321,t348-b70-sl2_merged_1853,0.0,0.0,0.0,0.0,0.0,0.0,25172f15a0229eca251fdeb5ab37e827,48.0,0.0,0.0,0,0,0 - 30 minutes,0-20%
47103,Riverside VehiclePositions,f1497340b654f3ce925f8b8c2f83e1c2,1467020,85.0,44.0,779.93,826.8,46.87,917.0,9520692b76037537fb6c14d126542292,45.0,97.78,1.93,90,40,31-60 minutes,81-100%
57919,Victor Valley Vehicle Position,d2f0224819832ba7d0039ff5e84c5bdd,03003,175.0,62.0,466.98,527.62,60.63,721.0,17511975e027808131693786ab09493b,48.0,100.0,2.82,100,60,31-60 minutes,81-100%
30774,LA Metro Bus Vehicle Positions,7c48cc5675c9266fb6851070489c53e9,10051003300653-DEC22,222.0,75.0,395.17,471.15,75.98,10485.0,6adf6cd9b6d24ab4ee8ee220e3697a73,60.0,100.0,2.96,100,70,61-90 minutes,81-100%
95777,,bfe6e539e689a9f4f49941a1cf6f2387,1299,0.0,0.0,0.0,0.0,0.0,0.0,d1b694a25d2e172e9ea98abe1829a0fd,23.68,0.0,0.0,0,0,0 - 30 minutes,0-20%
84243,,c1f526828c8d4bba1c71bf647443f534,232977b3-f1c0-45b0-b70f-c98e57f4cd34:12,0.0,0.0,0.0,0.0,0.0,0.0,9bc1318305865dedb7d5a7049de0cb01,12.0,0.0,0.0,0,0,0 - 30 minutes,0-20%
19462,Bay Area 511 Tri Delta VehiclePositions,6ef1ef238c89b672380a82b091df9a2b,250,103.0,53.0,600.0,652.0,52.0,423.0,e5ca47feef0faf28ac77c8323489000a,41.0,100.0,1.94,100,50,31-60 minutes,81-100%
43890,OCTA VehiclePositions,0f68ec4d97d179570b494c74b6fca55d,10815774,354.0,123.0,984.63,1134.27,149.63,2499.0,8a47f5aa51f481e9ddc7c497bd72d264,122.0,100.0,2.88,100,140,90 + minutes,81-100%


In [38]:
# final.rounded_rt_data_proportion.value_counts()

### Deliverable 2 - Charts 

#### Operator

In [39]:
# Exclude "none" operators which are only scheduled data
# for the operator charts
operator_wo_none = final.loc[final._gtfs_dataset_name != "None"][['_gtfs_dataset_name']]

In [40]:
# Dropdown of operator names for the operator charts. The extra None option
# is labelled 'All'. The selection is bound to `_gtfs_dataset_name`, so
# charts that filter on it show only the chosen operator.
dropdown_list = operator_wo_none["_gtfs_dataset_name"].unique().tolist()
dropdown = alt.binding_select(options=[None] + dropdown_list, labels = ['All'] + dropdown_list, name = "Operator")    
selection = alt.selection_single(fields=['_gtfs_dataset_name'], bind=dropdown)

In [41]:
# For each operator and rounded RT-coverage bucket: the operator's RT trip
# count (max of a per-operator value) and the number of distinct trips in
# that bucket (`total_trips`).
rt_scheduled = (final
                .groupby(['_gtfs_dataset_name','rounded_rt_data_proportion_percentage'])
                .agg({'rt_trip_counts_by_operator':'max','trip_id':'nunique'})
                .reset_index()
                .rename(columns = {'trip_id':'total_trips'})
               )

In [42]:
rt_scheduled['percentage_of_trips'] = rt_scheduled.total_trips/rt_scheduled.rt_trip_counts_by_operator * 100

In [43]:
# Bar chart: x = rounded RT-coverage bucket (0-100 in steps of 10),
# y = percentage of the operator's trips in that bucket.
# The operator dropdown selection filters the bars.
rt_scheduled_chart = alt.Chart(rt_scheduled).mark_bar(size=30).encode(
    x=alt.X('rounded_rt_data_proportion_percentage:N',
        scale=alt.Scale(domain = [0,10,20,30,40,50,60,70,80,90,100])),
    y=alt.Y('percentage_of_trips',),
    color=alt.Color("rounded_rt_data_proportion_percentage",
            scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS), legend=None),
    tooltip = rt_scheduled.columns.tolist()
).properties(title="% of RT minutes vs. Scheduled").add_selection(selection).transform_filter(selection) 

In [44]:
threshold_utils.chart_size(rt_scheduled_chart,500,400)

In [45]:
# For each operator, trip-duration category and RT-coverage category:
# the operator's RT trip count and the number of distinct trips
# in that combination (`total_trips`).
rt_trip_time = (final
                .groupby(['_gtfs_dataset_name','trip_category','rt_category'])
                .agg({'rt_trip_counts_by_operator':'max','trip_id':'nunique'})
                .reset_index()
                .rename(columns = {'trip_id':'total_trips'})
               )

In [46]:
rt_trip_time.total_trips = (rt_trip_time.total_trips).astype(int)

In [47]:
rt_trip_time['percentage_of_trips'] = (rt_trip_time.total_trips).divide(rt_trip_time.rt_trip_counts_by_operator)*100

In [102]:
# Bar chart: x = trip-duration category, y = percentage of the operator's
# trips (axis fixed to 0-100, bars clipped to it), colored by RT-coverage
# category. The operator dropdown selection filters the bars.
rt_trip_time_chart = (alt.Chart(rt_trip_time).mark_bar(clip = True).encode(
    x=alt.X('trip_category', axis=alt.Axis(labelAngle=-45)),
    y=alt.Y('percentage_of_trips', scale=alt.Scale(domain=(0, 100))),
    color=alt.Color("rt_category", scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS)),
    tooltip = rt_trip_time.columns.tolist())
    .properties(title="RT Trip Duration and % of RT vs. Scheduled")).add_selection(selection).transform_filter(selection)

In [103]:
threshold_utils.chart_size(rt_trip_time_chart,500,400)

#### Statewide
* Why is `len(final.loc[final.rounded_rt_data_proportion_percentage == 0])` yielding different results than my graph?
* Because the graph aggregates with `trip_id: nunique` instead of `trip_id: count`, so a `trip_id` repeated within a group is counted only once.

In [50]:
all_trips = final.trip_id.nunique()

In [107]:
all_trips

95184

In [77]:
for i in [0,10,20,30,40,50,60,70,80,90,100]:
    print(len(final.loc[final.rounded_rt_data_proportion_percentage == i]))

39442
205
302
634
572
448
649
1277
2975
5499
46102


In [53]:
total_sw_rt = rt_scheduled.groupby(['rounded_rt_data_proportion_percentage']).agg({'total_trips':'sum'}).reset_index()

In [54]:
total_sw_rt['Percentage of Usable Trips'] = total_sw_rt['total_trips'].div(all_trips)*100

In [109]:
39442-38806

636

In [106]:
total_sw_rt

Unnamed: 0,rounded_rt_data_proportion_percentage,total_trips,Percentage of Usable Trips
0,0,38806,40.77
1,10,205,0.22
2,20,302,0.32
3,30,634,0.67
4,40,572,0.6
5,50,448,0.47
6,60,649,0.68
7,70,1277,1.34
8,80,2975,3.13
9,90,5499,5.78


In [55]:
# Statewide bar chart: x = rounded RT-coverage bucket,
# y = that bucket's share of all trips; rendered at 500 x 400.
threshold_utils.chart_size(alt.Chart(total_sw_rt).mark_bar(size=30).encode(
    x=alt.X('rounded_rt_data_proportion_percentage:N',
        scale=alt.Scale(domain = [0,10,20,30,40,50,60,70,80,90,100])),
    y=alt.Y('Percentage of Usable Trips',),
    color=alt.Color("rounded_rt_data_proportion_percentage",
            scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS), legend=None),
    tooltip = total_sw_rt.columns.tolist()
),500,400)

In [57]:
sw_rt_trips = rt_trip_time.groupby(['rt_category','trip_category']).agg({'total_trips':'sum'}).reset_index()

In [58]:
sw_rt_trips['Percentage of Usable Trips'] = sw_rt_trips['total_trips'].div(all_trips)*100


In [59]:
# Statewide bar chart: x = trip-duration category, y = share of all trips,
# colored by RT-coverage category; rendered at 500 x 400.
threshold_utils.chart_size(alt.Chart(sw_rt_trips).mark_bar().encode(
    x=alt.X('trip_category', axis=alt.Axis(labelAngle=-45)),
    y='Percentage of Usable Trips',
    color=alt.Color("rt_category", scale=alt.Scale(range=cp.CALITP_CATEGORY_BRIGHT_COLORS)),
    tooltip = sw_rt_trips.columns.tolist()
),500,400)

#### Scratch

In [60]:
def summary_valid_trips_by_cutoff(df, time_col:str, phrase:str,
                                  time_cutoffs: list, ping_cutoffs: list,
                                  pings: bool = True):
    """
    Find percentage & number of trips that meet trip time elapsed
    and pings. 

    Args:
        df: one row per trip with `trip_id`, `pings_per_minute`,
            `time_col` and the operator columns `gtfs_dataset_key`,
            `_gtfs_dataset_name`, `rt_trip_counts_by_operator`.
        time_col: column compared against each time cutoff.
        phrase: text placed after the cutoff value in the `cutoff` label.
        time_cutoffs: minimum values of `time_col` to evaluate.
        ping_cutoffs: minimum pings per minute to evaluate; only
            used when `pings` is True.
        pings: if True combine every time cutoff with every ping
            cutoff, otherwise apply the time cutoffs alone.

    Returns:
        One row per operator and cutoff combination with `n_trips`
        and `percentage_usable_trips`, sorted by operator name.
        The result is empty (columns only) if there is nothing
        to evaluate.
    """
    group_by_cols = ["gtfs_dataset_key", "_gtfs_dataset_name", "rt_trip_counts_by_operator"]

    def _count_valid_trips(is_valid, **labels):
        # Distinct trips per operator among the rows passing the cutoff,
        # tagged with the cutoff that produced them.
        counts = (
            df[is_valid]
            .groupby(group_by_cols)
            .trip_id.nunique()
            .reset_index()
            .rename(columns={"trip_id": "n_trips"})
        )
        return counts.assign(**labels)

    # Collect the pieces and concatenate once instead of growing
    # a dataframe inside the loop.
    pieces = []
    for t in time_cutoffs:
        meets_time = df[time_col] >= t
        if pings:
            for s in ping_cutoffs:
                pieces.append(_count_valid_trips(
                    meets_time & (df.pings_per_minute >= s),
                    trip_cutoff=t, pings=s,
                    cutoff=f"{t} {phrase} & {s} ping(s)+ per minute",
                ))
        else:
            pieces.append(_count_valid_trips(
                meets_time, trip_cutoff=t, cutoff=f"{t}{phrase}",
            ))

    if not pieces:
        # No cutoffs supplied: return the expected columns with no rows
        label_cols = ["trip_cutoff", "pings", "cutoff"] if pings else ["trip_cutoff", "cutoff"]
        return pd.DataFrame(
            columns = group_by_cols + ["n_trips"] + label_cols + ["percentage_usable_trips"]
        )

    final = pd.concat(pieces, axis=0)
    final = final.assign(percentage_usable_trips=final.n_trips.divide(final.rt_trip_counts_by_operator) * 100)
    final = final.sort_values('_gtfs_dataset_name')
    return final

In [61]:
def create_valid_stats(df):
    """
    Build the two cutoff summaries used by the charts.

    Returns a tuple:
      * trips meeting trip-duration (15/30/60 min) and
        pings-per-minute (1/2/3) cutoffs
      * trips meeting RT-vs-scheduled coverage (20-100%) and
        pings-per-minute (1/2/3) cutoffs
    Both are passed through `threshold_utils.pre_clean`.
    """
    # Prefer the coverage column produced by final_df(); fall back to
    # `rt_data_proportion`, the name this function originally hard-coded,
    # for dataframes that carry that column instead.
    if "rt_data_proportion_percentage" in df.columns:
        rt_col = "rt_data_proportion_percentage"
    else:
        rt_col = "rt_data_proportion"

    # Prepare valid stats
    pings_trip_duration_ops = summary_valid_trips_by_cutoff(df, "actual_trip_duration_minutes", "+ min", [15,30,60], [1,2,3])
    rt_data_available = summary_valid_trips_by_cutoff(df, rt_col, "%+", [20,40,60,80,100], [1,2,3], True)
    
    # Clean df
    pings_trip_duration_ops = threshold_utils.pre_clean(pings_trip_duration_ops)
    rt_data_available = threshold_utils.pre_clean(rt_data_available)
    
    return pings_trip_duration_ops,rt_data_available

In [62]:
def operator_visuals(df):
    """
    Build two stacked operator-level bar charts sharing one
    operator dropdown: trip time & pings per minute on top,
    RT vs. scheduled coverage below.
    """
    duration_stats, coverage_stats = create_valid_stats(df)

    # Operator dropdown; the None option is shown as 'All'
    operators = duration_stats["Gtfs Dataset Name"].sort_values().unique().tolist()
    operator_menu = alt.binding_select(
        options=[None] + operators,
        labels = ['All'] + operators,
        name = "Operator",
    )
    operator_pick = alt.selection_single(fields=['Gtfs Dataset Name'], bind=operator_menu)

    def _finish(chart):
        # Attach the dropdown filter, make the chart interactive, size it
        filtered = chart.add_selection(operator_pick).transform_filter(operator_pick)
        return threshold_utils.chart_size(filtered.interactive(), 500, 400)

    # Trip duration & pings chart
    duration_tooltip = ['Gtfs Dataset Key', 'Gtfs Dataset Name','N Trips',
                        'Cutoff', 'Percentage Usable Trips']
    duration_chart = threshold_utils.bar_chart(
        duration_stats, 'Percentage Usable Trips', "Cutoff",
        duration_tooltip, "Pings and RT Trip Time Elapsed",
    )

    # RT vs. scheduled chart
    coverage_tooltip = ['Gtfs Dataset Key', 'Gtfs Dataset Name','Trip Cutoff',
                        'Percentage Usable Trips', 'N Trips']
    coverage_chart = threshold_utils.bar_chart(
        coverage_stats, 'Percentage Usable Trips', "Cutoff",
        coverage_tooltip, "Real Time vs. Scheduled Data Percentage",
    )

    return _finish(duration_chart) & _finish(coverage_chart)

In [63]:
# operator_visuals(final)

* Double check with Unitrans

In [64]:
# len(final[(final["_gtfs_dataset_name"] == "Unitrans Vehicle Positions") & (final["pings_per_minute"] > 1) & (final["actual_trip_duration_minutes"] > 15)])

In [65]:
# len(final.loc[final._gtfs_dataset_name == "Unitrans Vehicle Positions"])

In [66]:
# final.loc[final._gtfs_dataset_name == "Unitrans Vehicle Positions"][['actual_trip_duration_minutes', 'rt_data_proportion','pings_per_minute']].sort_values('actual_trip_duration_minutes')

In [67]:
# final[(final["_gtfs_dataset_name"] == "Unitrans Vehicle Positions") & (final["pings_per_minute"] > 1) & (final["actual_trip_duration_minutes"] > 15)]

* Check with Redding

In [68]:
# final[(final["_gtfs_dataset_name"] == "Redding Vehicle Positions")].shape

In [69]:
# final[(final["_gtfs_dataset_name"] == "Redding Vehicle Positions") & (final["rt_data_proportion"] < 20)]

In [70]:
# len(final[(final["_gtfs_dataset_name"] == "Redding Vehicle Positions") & (final["rt_data_proportion"] > 20)])

#### Statewide

In [71]:
def statewide_visuals(df):
    """
    Build two stacked statewide bar charts: trip time & pings per
    minute on top, RT vs. scheduled coverage below.
    """
    y_col = 'Percentage of Usable Trips'

    def _to_statewide(stats):
        # One row per cutoff.
        # NOTE(review): both columns are aggregated with 'max', i.e. they
        # take the largest single-operator value rather than a total
        # across operators - confirm this is the intended statewide figure.
        per_cutoff = (stats
             .groupby(['Cutoff',])
             .agg({'Rt Trip Counts By Operator':'max','N Trips':'max'})
             .reset_index()
             .rename(columns = {'Rt Trip Counts By Operator':'All Trips'})
            )
        share = per_cutoff['N Trips']/per_cutoff['All Trips']*100
        return per_cutoff.assign(**{y_col: share})

    def _chart(summary, title):
        # Interactive 500 x 400 bar chart with every column in the tooltip
        bars = threshold_utils.bar_chart(summary, y_col, "Cutoff",
                                         summary.columns.tolist(), title)
        return threshold_utils.chart_size(bars.interactive(), 500, 400)

    # Operator-level stats, then rolled up per cutoff
    duration_stats, coverage_stats = create_valid_stats(df)
    statewide_pings = _to_statewide(duration_stats)
    statewide_rt = _to_statewide(coverage_stats)

    pings_chart = _chart(statewide_pings, "Statewide Pings and RT Trip Time Elapsed")
    rt_chart = _chart(statewide_rt, "Statewide RT vs. Scheduled Data")

    return pings_chart & rt_chart

In [72]:
# statewide_visuals(final)