In [1]:
import os
# Set before `import calitp` further down in this cell.
# NOTE(review): presumably this raises the BigQuery bytes-per-query cap that
# calitp enforces (here 100 GB) — confirm against the calitp documentation.
os.environ["CALITP_BQ_MAX_BYTES"] = str(100_000_000_000)

import pandas as pd
import numpy as np
import geopandas as gpd
import fiona
import datetime as dt

import utils

import calitp
from calitp.tables import tbl
from siuba import *

# Service dates are set explicitly here; this replaces get_recent_dates().
# Keys are the day labels used for the three service days analysed.

dates = {
    'thurs': dt.date(2021, 10, 7),
    'sat': dt.date(2021, 10, 9),
    'sun': dt.date(2021, 10, 10)
}

# Earliest and latest of the explicitly chosen service dates.
min_date = min(dates.values())
max_date = max(dates.values())



In [2]:
# Temporary storage location for the intermediate trip tables.
DATA_PATH = "gs://calitp-analytics-data/data-analyses/bus_service_increase/test/"
# Kept for reference: how the combined file read below was built
# (per-day trip tables concatenated and written back out).
#thurs = pd.read_parquet(f"{DATA_PATH}trips_thurs.parquet")
#sat = pd.read_parquet(f"{DATA_PATH}trips_sat.parquet")
#sun = pd.read_parquet(f"{DATA_PATH}trips_sun.parquet")

#df = pd.concat([thurs, sat, sun], axis=0, ignore_index=True)
#df.to_parquet(f"{DATA_PATH}all_days_st2.parquet")

# Load the combined table for all three service days.
df = pd.read_parquet(f"{DATA_PATH}all_days_st2.parquet")

In [3]:
def get_time_calculations(df):
    """
    Parse the service date and the stop departure times, and derive the departure hour.

    Non-null `departure_time` values are passed through `utils.fix_gtfs_time`
    and then parsed with `pd.to_datetime`. Rows that still have no departure
    time after parsing are removed and the index is rebuilt.

    Returns a new dataframe in which `date` and `departure_time` are datetimes
    and a `departure_hour` column has been added.
    """
    # Nulls are skipped when fixing the time; because assign() aligns on the
    # index, the skipped rows come back as missing values.
    fixed_times = df["departure_time"].dropna().apply(utils.fix_gtfs_time)
    df = df.assign(
        date = pd.to_datetime(df["date"]),
        departure_time = fixed_times,
    )

    # The date part of the parsed timestamp is not the service date (the
    # original author observed today's date from pd.to_datetime and year 1900
    # from datetime.strptime). The service date lives in `date`, and the later
    # 5am-9pm subsetting only needs the time of day.
    parsed_times = pd.to_datetime(df["departure_time"])
    df = df.assign(
        departure_time = parsed_times,
        departure_hour = parsed_times.dt.hour,
    )

    # Rows without a departure time would break the later grouping on
    # departure hour, so they are dropped here.
    has_departure = df["departure_time"].notna()
    return df[has_departure].reset_index(drop=True)


def calculate_runtime_hourlytrips(df):
    """
    Aggregate stop-level trip rows into hourly trip counts and mean run times per shape.

    Steps:
    1. The run time of each trip (`trip_key`, `day_name`) is the span between
       its earliest and latest `departure_time`; trips without a computable
       run time are dropped.
    2. Each trip is represented by its middle stop, and that stop's
       `departure_hour` decides which hour the trip is counted in.
    3. Trips are counted and their run times averaged per operator / route /
       shape / departure hour / day.
    4. Routes that share a shape are combined per operator / shape / hour / day
       (`route_id`: max, `trips_per_hour`: sum, `mean_runtime_min`: mean).
    5. Rows for `calitp_itp_id` 200 are removed so the individual operator
       feeds are used instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Stop-level rows with the columns `trip_key`, `day_name`,
        `departure_time` (datetime), `stop_sequence`, `calitp_itp_id`,
        `route_id`, `shape_id` and `departure_hour`.

    Returns
    -------
    pandas.DataFrame
        One row per operator / shape / departure hour / day with the columns
        `calitp_itp_id`, `shape_id`, `departure_hour`, `day_name`, `route_id`,
        `trips_per_hour`, `mean_runtime_min`.
    """
    trip_cols = ["trip_key", "day_name"]
    route_cols = ["calitp_itp_id", "route_id", "shape_id",
                  "departure_hour", "day_name"]
    shape_cols = ["calitp_itp_id", "shape_id", "departure_hour", "day_name"]

    # Run time of a trip = last departure minus first departure.
    # total_seconds() is used rather than .seconds, which silently discards
    # the day component of a timedelta.
    departures = df.groupby(trip_cols)["departure_time"]
    trip_span = departures.transform("max") - departures.transform("min")
    df = df.assign(runtime_seconds = trip_span.dt.total_seconds())

    # Drop any trips with runtime of NaN calculated
    df = df[df.runtime_seconds.notna()].reset_index(drop=True)

    # Pick the middle stop by position among the trip's stops (the lower of
    # the two middle stops when the count is even). Matching on the truncated
    # median of stop_sequence loses the whole trip whenever the sequences are
    # not contiguous (e.g. 1, 5, 10, 20 -> median 7), and matches several rows
    # when a sequence number is repeated. method="first" breaks ties so that
    # exactly one row per trip is kept.
    sequences = df.groupby(trip_cols)["stop_sequence"]
    stop_rank = sequences.rank(method="first")
    stop_count = sequences.transform("count")
    middle_stops = df[stop_rank == (stop_count + 1) // 2]

    # A trip missing any route-level key (e.g. no shape_id) cannot be assigned
    # to a route / shape / hour. groupby would skip such rows anyway; dropping
    # them explicitly keeps that visible.
    middle_stops = middle_stops.dropna(subset = route_cols)

    # Trips per hour and mean run time across the trips of each
    # operator / route / shape / departure hour / day.
    route_frequency = (
        middle_stops.groupby(route_cols)
        .agg(trips_per_hour = ("trip_key", "size"),
             mean_runtime_seconds = ("runtime_seconds", "mean"))
        .reset_index()
    )

    # Same arithmetic as the previous int(round(x) / 60): round the mean to
    # whole seconds, then truncate to whole minutes.
    # NOTE(review): this truncates (119 s -> 1 min). If rounding to the nearest
    # minute was intended, it should be (x / 60).round() — confirm.
    route_frequency = route_frequency.assign(
        mean_runtime_min = route_frequency.mean_runtime_seconds.round() // 60
    )

    # There's an aggregation to deal with multiple route_ids that share same shape_id
    # If there are still multiple route_ids, then aggregate and sum / mean
    shape_frequency = (
        route_frequency.groupby(shape_cols)
        .agg({"route_id": "max",
              "trips_per_hour": "sum",
              "mean_runtime_min": "mean"
             }).reset_index()
    )

    # Now, drop ITP_ID==200 to use individual operator feeds
    return shape_frequency[shape_frequency.calitp_itp_id != 200]


def attach_funding(all_operators_df):
    """
    Attach funding columns from the transitstacks view to the operator table.

    The view is queried, collected into pandas and right-joined onto
    `all_operators_df` by `calitp_itp_id`, so every row of `all_operators_df`
    is kept. The three funding columns are then converted from currency
    strings to integers.

    Parameters
    ----------
    all_operators_df : pandas.DataFrame
        Table with a `calitp_itp_id` column.

    Returns
    -------
    pandas.DataFrame
        `all_operators_df` rows plus `ntd_id`, `transit_provider`,
        `_5307_funds`, `_5311_funds` and `operating_expenses_total_2019`.
    """
    def fix_funds(value):
        """Turn a currency string such as '$1,234' into an int; anything unparseable gives None."""
        # isinstance (not an exact type comparison) so str subclasses are
        # parsed as well; non-strings such as missing values give None.
        if not isinstance(value, str):
            return None
        digits = value.replace('$', '').replace(',', '').strip()
        # A blank entry has no amount; int('') would raise ValueError.
        if not digits:
            return None
        return int(digits)

    # This is a small query, can leave it here
    # NOTE(review): `_.calitp_itp_id == _.itp_id` inside select() is presumably
    # siuba's rename-while-selecting form (itp_id -> calitp_itp_id) — confirm.
    with_funding = (tbl.views.transitstacks()
                    >> select(_.calitp_itp_id == _.itp_id, _.ntd_id,
                              _.transit_provider, _._5307_funds, _._5311_funds,
                              _.operating_expenses_total_2019)
                    >> collect()
                    >> right_join(_, all_operators_df, on = 'calitp_itp_id')
                   )

    funding_cols = ["_5307_funds", "_5311_funds", "operating_expenses_total_2019"]
    for c in funding_cols:
        with_funding[c] = with_funding[c].apply(fix_funds)

    return with_funding

In [4]:
## Debug and check some of the cases
# Just look at LA Metro for now
# See if the 30 trips per hour comes up again
# Note: this re-reads the same parquet file that was already loaded into `df`
# above, narrows it to calitp_itp_id 182, then runs the time calculations.
df = pd.read_parquet(f"{DATA_PATH}all_days_st2.parquet")
df = df[df.calitp_itp_id==182]

df = get_time_calculations(df)

In [75]:
# df

In [6]:
def to_debug(df):
    """
    Trip-level debugging helper.

    For every trip (`trip_key`, `day_name`) this computes the run time in
    seconds and the median `stop_sequence` (cast to int), keeps the rows whose
    `stop_sequence` equals that value, adds the mean run time in minutes per
    operator / route / shape / departure hour / day, and joins on the number
    of such rows per group as `trips_per_hour`.

    Unlike a final summary, the result is still one row per trip: the
    de-duplication and shape-level aggregation are intentionally left out
    (see the disabled block at the end).
    """
    trip_cols = ['trip_key', 'day_name']
    by_trip = df.groupby(trip_cols)
    first_departure = by_trip["departure_time"].transform("min")
    last_departure = by_trip["departure_time"].transform("max")
    median_sequence = by_trip["stop_sequence"].transform("median")

    df = df.assign(
        mindt = first_departure,
        maxdt = last_departure,
        middle_stop = median_sequence,
    ).astype({"middle_stop": "int64"})

    # Run time = span between first and last departure of the trip
    trip_span = df.maxdt - df.mindt
    df = (df.assign(runtime_seconds = trip_span.dt.seconds)
          .drop(columns = ["mindt", "maxdt"]))

    # Still want to use this to merge on the mean runtime info
    middle_stops = df >> filter(_.stop_sequence == _.middle_stop)

    route_hour_cols = ["calitp_itp_id", "route_id", "shape_id",
                       "departure_hour", "day_name"]
    mean_seconds = (middle_stops.groupby(route_hour_cols)["runtime_seconds"]
                    .transform("mean"))
    middle_stops = middle_stops.assign(
        mean_runtime_min = mean_seconds.apply(lambda x: int(round(x) / 60))
    )

    # Add trips per hour column
    hourly_counts = (
        middle_stops
        >> count(_.calitp_itp_id, _.route_id,
                 _.shape_id, _.departure_hour, _.day_name, sort = True)
        >> rename(trips_per_hour = "n")
    )
    trips_with_frequency = hourly_counts >> inner_join(
        _, middle_stops,
        on = ["calitp_itp_id", "day_name",
              "shape_id", "departure_hour", "route_id"])

    # The remaining steps are deliberately disabled so the trip-level rows can
    # be inspected; the block is kept as an inert string.
    '''
    # Now, data is at the trip-level (trip_key) still present
    # Drop duplicates, but no aggregation because trips_per_hour and mean_runtime 
    # are already correctly generated at the route-level, across trips in that departure hour
    shape_frequency = shape_frequency.drop_duplicates(subset=[
        "calitp_itp_id", "shape_id", "departure_hour",
        "day_name", "route_id"])
    
    # There's an aggregation to deal with multiple route_ids that share same shape_id
    # If there are still multiple route_ids, then aggregate and sum / mean
    # Modify this to include itp_id into the groupby
    shape_frequency2 = (shape_frequency.groupby(
        ["calitp_itp_id", "shape_id", "departure_hour", "day_name"])
                        .agg({"route_id": "max", 
                              "trips_per_hour": "sum", 
                              "mean_runtime_min": "mean"
                             }).reset_index()
                       )
    '''
    return trips_with_frequency

In [7]:
df2 = to_debug(df)

In [8]:
df2 >> filter(_.trip_key == -3688991411338276459)

Unnamed: 0,calitp_itp_id,route_id,shape_id,departure_hour,day_name,trips_per_hour,date,trip_key,trip_id,is_in_service,stop_sequence,departure_time,stop_id,middle_stop,runtime_seconds,mean_runtime_min
0,182,SOFI,6990001_SoFi,16,Sunday,19,2021-10-10,-3688991411338276459,54133711-SoFi_Stadium_Express_1305,True,1,2022-02-04 16:26:00,30022,1,540,9


In [9]:
# Cross-check trips_per_hour by counting rows directly.
# obs: running number (starting at 1) of each row within its
# operator / shape / departure hour / day / route group.
group_cols = ["calitp_itp_id", "shape_id", "departure_hour", "day_name", "route_id"]
df2 = df2.assign(
    obs = df2.groupby(group_cols)["trip_key"].cumcount() + 1,
)

# max_obs: the largest running number in the group, i.e. the group's row count.
df2 = df2.assign(
    max_obs = df2.groupby(group_cols)["obs"].transform("max")
)

In [10]:
df2.max_obs.max() ##reflects a max of 19 trips/hour, which looks OK (=trips_per_hour)

19

In [11]:
df2[df2.max_obs==19]

Unnamed: 0,calitp_itp_id,route_id,shape_id,departure_hour,day_name,trips_per_hour,date,trip_key,trip_id,is_in_service,stop_sequence,departure_time,stop_id,middle_stop,runtime_seconds,mean_runtime_min,obs,max_obs
0,182,SOFI,6990001_SoFi,16,Sunday,19,2021-10-10,-3688991411338276459,54133711-SoFi_Stadium_Express_1305,True,1,2022-02-04 16:26:00,30022,1,540,9,1,19
1,182,SOFI,6990001_SoFi,16,Sunday,19,2021-10-10,8297285008995166099,54133713-SoFi_Stadium_Express_1305,True,1,2022-02-04 16:32:00,30022,1,540,9,2,19
2,182,SOFI,6990001_SoFi,16,Sunday,19,2021-10-10,-6494341673185312199,54133717-SoFi_Stadium_Express_1305,True,1,2022-02-04 16:44:00,30022,1,540,9,3,19
3,182,SOFI,6990001_SoFi,16,Sunday,19,2021-10-10,6820499949529949431,54133722-SoFi_Stadium_Express_1305,True,1,2022-02-04 16:59:00,30022,1,540,9,4,19
4,182,SOFI,6990001_SoFi,16,Sunday,19,2021-10-10,51706130366599061,54133719-SoFi_Stadium_Express_1305,True,1,2022-02-04 16:50:00,30022,1,540,9,5,19
5,182,SOFI,6990001_SoFi,16,Sunday,19,2021-10-10,4406007602970273768,54133712-SoFi_Stadium_Express_1305,True,1,2022-02-04 16:29:00,30022,1,540,9,6,19
6,182,SOFI,6990001_SoFi,16,Sunday,19,2021-10-10,-7005119982581110102,54133718-SoFi_Stadium_Express_1305,True,1,2022-02-04 16:47:00,30022,1,540,9,7,19
7,182,SOFI,6990001_SoFi,16,Sunday,19,2021-10-10,-2414876668720152120,54133710-SoFi_Stadium_Express_1305,True,1,2022-02-04 16:23:00,30022,1,540,9,8,19
8,182,SOFI,6990001_SoFi,16,Sunday,19,2021-10-10,5742717980696682034,54133705-SoFi_Stadium_Express_1305,True,1,2022-02-04 16:08:00,30022,1,540,9,9,19
9,182,SOFI,6990001_SoFi,16,Sunday,19,2021-10-10,-2997845282772078049,54133707-SoFi_Stadium_Express_1305,True,1,2022-02-04 16:14:00,30022,1,540,9,10,19


In [12]:
# The frame still has one row per trip, so a route / shape / departure hour /
# day combination is repeated once for every trip in that hour.
# trips_per_hour was already computed for the whole group, and the mean runtime
# was already derived across all of those trips, so just keep one row per
# group; trips_per_hour must not be summed again.
df3 = df2.drop_duplicates(subset=["calitp_itp_id", "shape_id", "departure_hour",
                                  "day_name", "route_id"])

In [13]:
df3 >> filter(_.shape_id == '6990001_SoFi', _.departure_hour == 16) ##OK

Unnamed: 0,calitp_itp_id,route_id,shape_id,departure_hour,day_name,trips_per_hour,date,trip_key,trip_id,is_in_service,stop_sequence,departure_time,stop_id,middle_stop,runtime_seconds,mean_runtime_min,obs,max_obs
0,182,SOFI,6990001_SoFi,16,Sunday,19,2021-10-10,-3688991411338276459,54133711-SoFi_Stadium_Express_1305,True,1,2022-02-04 16:26:00,30022,1,540,9,1,19


In [14]:
## Aggregation to deal with multiple route_ids that share same shape_id
# Original note: "It makes 3 trips per hour, same"
# The result is empty: df was narrowed to calitp_itp_id 182 earlier, while this
# shape_id shows up under calitp_itp_id 279 in the all-operators file further down.
df3[df3.shape_id=="964395_shp"] ## not present?

Unnamed: 0,calitp_itp_id,route_id,shape_id,departure_hour,day_name,trips_per_hour,date,trip_key,trip_id,is_in_service,stop_sequence,departure_time,stop_id,middle_stop,runtime_seconds,mean_runtime_min,obs,max_obs


In [15]:
# Two route 910 trips to trace by hand.
keep_trips = [-7711476650844868921,
              -6084928573786923571] ## not present (departure_hour == 10 here)
# Only rows with departure_hour == 9 pass this filter, so the second trip
# (departure_hour 10) is excluded from `silver`.
silver = df2[(df2.route_id.str.contains("910")) & 
            (df2.departure_hour == 9) & (df2.trip_key.isin(keep_trips))] 

In [16]:
silver.runtime_seconds.value_counts()

3540    1
Name: runtime_seconds, dtype: int64

In [17]:
silver

Unnamed: 0,calitp_itp_id,route_id,shape_id,departure_hour,day_name,trips_per_hour,date,trip_key,trip_id,is_in_service,stop_sequence,departure_time,stop_id,middle_stop,runtime_seconds,mean_runtime_min,obs,max_obs
6480,182,910-13149,9100211_SEPT21,9,Thursday,5,2021-10-07,-7711476650844868921,53936837-SEPT21-D18CAR-1_Weekday,True,11,2022-02-04 09:57:00,5040,11,3540,59,2,5


In [18]:
# Hand-computed average of two runtime_seconds values.
# NOTE(review): the two keep_trips rows shown further down have runtime_seconds
# of 3540 and 5100; 3450 looks like a transposition of 3540 — confirm before
# relying on this number.
(3450 +5100)/2 ## unsure meaning

4275.0

In [19]:
# Still want to use this to merge on the mean runtime info
middle_stops = df2 >> filter(_.stop_sequence == _.middle_stop)

In [20]:
middle_stops[middle_stops.trip_key.isin(keep_trips)]

Unnamed: 0,calitp_itp_id,route_id,shape_id,departure_hour,day_name,trips_per_hour,date,trip_key,trip_id,is_in_service,stop_sequence,departure_time,stop_id,middle_stop,runtime_seconds,mean_runtime_min,obs,max_obs
6480,182,910-13149,9100211_SEPT21,9,Thursday,5,2021-10-07,-7711476650844868921,53936837-SEPT21-D18CAR-1_Weekday,True,11,2022-02-04 09:57:00,5040,11,3540,59,2,5
22956,182,910-13149,9100209_SEPT21,10,Thursday,2,2021-10-07,-6084928573786923571,53936848-SEPT21-D18CAR-1_Weekday,True,17,2022-02-04 10:15:00,10994,17,5100,85,2,2


In [21]:
# Recompute the mean runtime per operator / route / shape / departure hour / day
# from runtime_seconds, replacing the mean_runtime_min column that middle_stops
# inherited from to_debug().
# int(round(x) / 60): the mean is rounded to whole seconds, then truncated to
# whole minutes.
middle_stops = middle_stops.assign(
    mean_runtime_min = (middle_stops.groupby(["calitp_itp_id", "route_id", "shape_id", 
                          "departure_hour", "day_name"])
                        ["runtime_seconds"].transform("mean")
                        .apply(lambda x: int(round(x) / 60)))
)

In [22]:
# These 2 trips end up in different hours: the starting stop has
# departure hour = 9, but by the middle stop the departure hour is 10.
# Open question: allow the departure hour to differ between trip_keys, since
# we want to stick with the middle stop? (original note: "middle stop +1 implement?")
middle_stops[middle_stops.trip_key.isin(keep_trips)]

Unnamed: 0,calitp_itp_id,route_id,shape_id,departure_hour,day_name,trips_per_hour,date,trip_key,trip_id,is_in_service,stop_sequence,departure_time,stop_id,middle_stop,runtime_seconds,mean_runtime_min,obs,max_obs
6480,182,910-13149,9100211_SEPT21,9,Thursday,5,2021-10-07,-7711476650844868921,53936837-SEPT21-D18CAR-1_Weekday,True,11,2022-02-04 09:57:00,5040,11,3540,59,2,5
22956,182,910-13149,9100209_SEPT21,10,Thursday,2,2021-10-07,-6084928573786923571,53936848-SEPT21-D18CAR-1_Weekday,True,17,2022-02-04 10:15:00,10994,17,5100,85,2,2


In [23]:
# Count rows per operator / route / shape / departure hour / day, name the
# count trips_per_hour, and join it back onto the trip-level rows.
# middle_stops already carries a trips_per_hour column (it came from
# to_debug()), so the joined frame ends up with trips_per_hour_x and
# trips_per_hour_y.
shape_frequency = (
    middle_stops
    >> count(_.calitp_itp_id, _.route_id,
             _.shape_id, _.departure_hour, _.day_name, sort = True)
    >> rename(trips_per_hour = "n")
    >> inner_join(_, middle_stops, 
                  on = ["calitp_itp_id", "day_name", 
                        "shape_id", "departure_hour", "route_id"])
)
    

In [24]:
shape_frequency[shape_frequency.trip_key.isin(keep_trips)]

Unnamed: 0,calitp_itp_id,route_id,shape_id,departure_hour,day_name,trips_per_hour_x,trips_per_hour_y,date,trip_key,trip_id,is_in_service,stop_sequence,departure_time,stop_id,middle_stop,runtime_seconds,mean_runtime_min,obs,max_obs
6480,182,910-13149,9100211_SEPT21,9,Thursday,5,5,2021-10-07,-7711476650844868921,53936837-SEPT21-D18CAR-1_Weekday,True,11,2022-02-04 09:57:00,5040,11,3540,59,2,5
22956,182,910-13149,9100209_SEPT21,10,Thursday,2,2,2021-10-07,-6084928573786923571,53936848-SEPT21-D18CAR-1_Weekday,True,17,2022-02-04 10:15:00,10994,17,5100,85,2,2


In [25]:
# Number of distinct route_ids per shape / day / departure hour, to spot
# shapes that are shared by more than one route.
# NOTE(review): calitp_itp_id is not part of this grouping.
multiple_shapes = (shape_frequency.groupby(["shape_id", "day_name", "departure_hour"])
                   .agg({"route_id": "nunique"})
                   .reset_index()
                  )
                  

In [26]:
all_operators_shape_frequency = pd.read_parquet(f"{utils.GCS_FILE_PATH}shape_frequency.parquet")

In [27]:
single = all_operators_shape_frequency[all_operators_shape_frequency.calitp_itp_id==279]

In [28]:
(single >> filter(_.shape_id == '964395_shp')).dropna()

Unnamed: 0,shape_id,day_name,departure_hour,calitp_itp_id,route_id,trips_per_hour,mean_runtime_min
797,964395_shp,Thursday,5,279,4,4,24.5
822,964395_shp,Saturday,6,279,4,1,24.0
848,964395_shp,Sunday,8,279,4,1,24.0


In [29]:
# Thursday table from the test folder, narrowed below to a few problem trips.
debug = pd.read_parquet(f"{utils.GCS_FILE_PATH}test/timecalc_thurs.parquet") ## assuming these are results of get_time_calculations

# trip_keys to inspect; the Saturday ones are disabled because only the
# Thursday file is loaded here.
error_trips = [
    #thurs
    4561616254186924304,
    -3573784532994111184,
    -5890052631007121734,
    # sat
    #8542179829811914215,
    #4330243390808932084,
    #-5417826296843621461
]

# Keep only the rows of those trips and display them.
debug = debug[debug.trip_key.isin(error_trips)]
debug

Unnamed: 0,calitp_itp_id,date,trip_key,trip_id,is_in_service,day_name,stop_sequence,departure_time,stop_id,shape_id,route_id,departure_hour
6830,120,2021-10-07,4561616254186924304,DX03S0319339,True,Thursday,79,NaT,233,3sb,3,
6849,120,2021-10-07,4561616254186924304,DX03S0319339,True,Thursday,84,2022-01-29 16:05:00,250,3sb,3,16.0
6855,120,2021-10-07,4561616254186924304,DX03S0319339,True,Thursday,71,NaT,200,3sb,3,
18158,323,2021-10-07,-3573784532994111184,210000031,True,Thursday,1,2022-01-29 20:00:00,107,,LAX FlyAway Bus,20.0
18166,323,2021-10-07,-5890052631007121734,210000022,True,Thursday,1,2022-01-29 15:30:00,107,,LAX FlyAway Bus,15.0
22502,323,2021-10-07,-5890052631007121734,210000022,True,Thursday,2,2022-01-29 16:00:00,180,,LAX FlyAway Bus,16.0
172857,120,2021-10-07,4561616254186924304,DX03S0319339,True,Thursday,66,NaT,192,3sb,3,
338706,120,2021-10-07,4561616254186924304,DX03S0319339,True,Thursday,63,NaT,189,3sb,3,
338736,120,2021-10-07,4561616254186924304,DX03S0319339,True,Thursday,59,NaT,262,3sb,3,
338744,120,2021-10-07,4561616254186924304,DX03S0319339,True,Thursday,97,NaT,271,3sb,3,


In [30]:
## 120 = Glendale Beeline; 323 = Metrolink (?). Open questions: is the LAX FlyAway Bus supposed to be in the Metrolink feed (maybe it is)? Its rows above have no shape_id.

In [31]:
# Reload the all-operators output (same file as loaded earlier) and show the
# rows with a missing mean runtime for the operators and routes seen in `debug`.
# Note: operator and route are matched independently, not as pairs.
all_operators_shape_frequency = pd.read_parquet(f"{utils.GCS_FILE_PATH}shape_frequency.parquet")
all_operators_shape_frequency[
    (all_operators_shape_frequency.calitp_itp_id.isin(debug.calitp_itp_id)) & 
    (all_operators_shape_frequency.route_id.isin(debug.route_id)) & 
    (all_operators_shape_frequency.mean_runtime_min.isna())
]

Unnamed: 0,shape_id,day_name,departure_hour,calitp_itp_id,route_id,trips_per_hour,mean_runtime_min
792,3nb,Thursday,0,120,3,0,
793,3nb,Thursday,1,120,3,0,
794,3nb,Thursday,2,120,3,0,
795,3nb,Thursday,3,120,3,0,
796,3nb,Thursday,4,120,3,0,
...,...,...,...,...,...,...,...
1003,3sb,Sunday,19,120,3,0,
1004,3sb,Sunday,20,120,3,0,
1005,3sb,Sunday,21,120,3,0,
1006,3sb,Sunday,22,120,3,0,


In [32]:
my_shape_frequency = pd.read_parquet(f"{utils.GCS_FILE_PATH}test/shape_frequency.parquet")

In [33]:
shapes_frequency_funding = attach_funding(my_shape_frequency)

  return self.connectable.execute(*args, **kwargs)


In [34]:
shapes_frequency_funding.to_parquet(f"{utils.GCS_FILE_PATH}test/shape_frequency_funding.parquet")

In [35]:
# Same check on the output from the test folder: rows with a missing mean
# runtime for the operators and routes seen in `debug` (none are returned).
my_shape_frequency[(my_shape_frequency.calitp_itp_id.isin(debug.calitp_itp_id)) &
                   (my_shape_frequency.route_id.isin(debug.route_id) & 
                    (my_shape_frequency.mean_runtime_min.isna())
                   )]

Unnamed: 0,calitp_itp_id,shape_id,departure_hour,day_name,route_id,trips_per_hour,mean_runtime_min


In [36]:

my_shape_frequency.trips_per_hour.describe()

count    95933.000000
mean         1.944013
std          1.504917
min          1.000000
25%          1.000000
50%          1.000000
75%          2.000000
max         19.000000
Name: trips_per_hour, dtype: float64

#### Eric Tries Stuff

In [37]:
all_operators_shape_frequency.shape ## original calculations, right?

(423216, 7)

In [38]:
my_shape_frequency.shape ## much smaller row count for new scripts?

(95933, 7)

In [39]:
# Sum of trips_per_hour per operator in the test-folder output, largest first.
my_grouped = my_shape_frequency >> group_by(_.calitp_itp_id) >> summarize(total_trips = _.trips_per_hour.sum()) >> arrange(-_.total_trips)
my_grouped >> head(3)

Unnamed: 0,calitp_itp_id,total_trips
73,182,36755
116,282,24482
112,278,15491


In [40]:
# Same per-operator total for the all-operators output, for comparison.
orig_grouped = all_operators_shape_frequency >> group_by(_.calitp_itp_id) >> summarize(total_trips = _.trips_per_hour.sum()) >> arrange(-_.total_trips)
orig_grouped >> head(3)

Unnamed: 0,calitp_itp_id,total_trips
69,182,36309
111,282,24482
0,4,13094


In [41]:
my_grouped.total_trips.sum()

186495

In [42]:
orig_grouped.total_trips.sum()

172136

In [43]:
all_operators_shape_frequency >> filter(_.calitp_itp_id == 278) ##sdmts

Unnamed: 0,shape_id,day_name,departure_hour,calitp_itp_id,route_id,trips_per_hour,mean_runtime_min


In [44]:
my_shape_frequency >> filter(_.calitp_itp_id == 278) ##sdmts

Unnamed: 0,calitp_itp_id,shape_id,departure_hour,day_name,route_id,trips_per_hour,mean_runtime_min
27033,278,105_0_95,5.0,Thursday,105,2,30.0
27034,278,105_0_95,6.0,Thursday,105,1,34.0
27035,278,105_0_95,7.0,Thursday,105,2,37.0
27036,278,105_0_95,8.0,Thursday,105,2,39.0
27037,278,105_0_95,9.0,Thursday,105,3,36.0
...,...,...,...,...,...,...,...
89537,278,AIR_9_1,19.0,Sunday,AIR,4,10.0
89538,278,AIR_9_1,20.0,Sunday,AIR,4,10.0
89539,278,AIR_9_1,21.0,Sunday,AIR,4,10.0
89540,278,AIR_9_1,22.0,Sunday,AIR,4,10.0


In [45]:
# Operator 278 has no rows in the all-operators output but does in the
# test-folder output; list the distinct calitp_extracted_at values that the
# daily trips view holds for it, in ascending order.
tbl.views.gtfs_schedule_fact_daily_trips() >> filter(_.calitp_itp_id == 278) >> distinct(_.calitp_extracted_at) >> arrange(_.calitp_extracted_at)

Unnamed: 0,calitp_extracted_at
0,2021-04-15
1,2021-05-21
2,2021-06-06
3,2021-06-28
4,2021-08-23


In [73]:
all_operators_shape_frequency >> filter(_.trips_per_hour == 0) >> head(5)

Unnamed: 0,shape_id,day_name,departure_hour,calitp_itp_id,route_id,trips_per_hour,mean_runtime_min
0,13737,Thursday,0,257,66,0,
1,13737,Thursday,1,257,66,0,
2,13737,Thursday,2,257,66,0,
3,13737,Thursday,3,257,66,0,
4,13737,Thursday,4,257,66,0,


In [47]:
my_shape_frequency >> filter(_.trips_per_hour == 0)

Unnamed: 0,calitp_itp_id,shape_id,departure_hour,day_name,route_id,trips_per_hour,mean_runtime_min


In [72]:
# all_operators_shape_frequency

In [74]:
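## It's the zeros! The all-operators output keeps rows with trips_per_hour == 0 for shape x hour
## combinations without service, while the test-folder output has no such rows.
## Per Hunter (and I concur), zero trips for a shape x hour is very important data for calculating service increases!
## Added zero generation to create_analysis_data.py.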