# Use existing warehouse queries?

* Grab trips for selected date.
* Let's test with a parquet already stored in GCS for `bus-service-increase` exercise.
* Filter down to 1 trip for `route_id`...pick for relatively free-flowing traffic

In [1]:
import datetime as dt
import geopandas as gpd
import os
import pandas as pd

os.environ["CALITP_BQ_MAX_BYTES"] = str(130_000_000_000)

from calitp.tables import tbl
from calitp import query_sql
from siuba import *

import utils
import shared_utils

DATA_PATH = f"{utils.GCS_FILE_PATH}2022_Jan/"

E0321 17:04:44.615438058    1205 fork_posix.cc:70]           Fork support is only compatible with the epoll1 and poll polling strategies
E0321 17:04:48.446742575    1205 fork_posix.cc:70]           Fork support is only compatible with the epoll1 and poll polling strategies


In [2]:
trips = pd.read_parquet(f"{DATA_PATH}trips_joined_thurs.parquet")

In [38]:
trips.columns

Index(['calitp_itp_id', 'date', 'trip_key', 'trip_id', 'is_in_service',
       'day_name', 'stop_sequence', 'stop_id', 'departure_time', 'shape_id',
       'route_id'],
      dtype='object')

In [3]:
SELECTED_DATE = '2022-1-6'
#SELECTED_DATE = warehouse_queries.dates['thurs']

In [4]:
def grab_service_hours(df, SELECTED_DATE):
    daily_trip_info = (
        tbl.views.gtfs_schedule_fact_daily_trips()
        >> filter(_.service_date == SELECTED_DATE)
        >> select(_.calitp_itp_id, 
               _.trip_key, _.service_hours, 
               _.trip_first_departure_ts, _.trip_last_arrival_ts
              ) 
        >> collect()
    )
    
    df2 = pd.merge(df, 
                   daily_trip_info,
                   on = ["calitp_itp_id", "trip_key"],
                   how = "inner",
                   # m:1 because trips has stop_level data by trips
                   # 1 is on the right beause service_hours is trip-level
                   validate = "m:1"
                  )
    
    return df2

    
def grab_stop_geom(df):
    stop_info = (tbl.views.gtfs_schedule_dim_stops()
                 >> select(_.calitp_itp_id,
                       _.stop_id, _.stop_lon, _.stop_lat,
                      )
             >> distinct()
             >> collect()
    )
    
    df2 = pd.merge(
        df,
        (stop_info.sort_values(["calitp_itp_id", "stop_id", "stop_lon"])
         .drop_duplicates(subset=["calitp_itp_id", "stop_id"])
        ),
        on = ["calitp_itp_id", "stop_id"],
        how = "inner", 
        validate = "m:1"
    )
    
    return df2

In [5]:
def subset_to_parallel_routes(df):
    #parallel_routes = shared_utils.utils.download_geoparquet(utils.GCS_FILE_PATH, 
    #                                         "parallel_or_intersecting")
    parallel_routes = gpd.read_parquet("./data/parallel_or_intersecting.parquet")
    
    keep_cols = ["calitp_itp_id", "shape_id", "route_id", "geometry"]

    parallel_routes2 = (parallel_routes[parallel_routes.parallel==1]
           .reset_index(drop=True)
           .rename(columns = {"itp_id": "calitp_itp_id"})
           [keep_cols]
           .drop_duplicates()
           .reset_index(drop=True)
          )
    
    # Put parallel routes on the left because it has line geometry already
    gdf = pd.merge(
        parallel_routes2,
        df,
        on = ["calitp_itp_id", "shape_id", "route_id"],
        how = "inner",
        validate = "1:m",
    )
    
    return gdf

In [6]:
trips2 = grab_service_hours(trips, SELECTED_DATE)
trips3 = grab_stop_geom(trips2)



Decide if stop's point geometry should be used or route's line geometry.

If going with every 3rd or 5th bus stop, it's easier to take point geometry and have the car travel from point to point. The line geometry contains much more points in between stops and is not exactly the bus stop, but on the road.

For this calculation, distance traveled, speed, etc, it probably won't matter too much.

In [7]:
trips4 = trips3[trips3.calitp_itp_id==182].reset_index(drop=True)

In [8]:
trips5 = subset_to_parallel_routes(trips4)

In [25]:
def select_one_trip(df):
    drop_cols = ["stop_sequence", "stop_id", "departure_time", 
                 "trip_first_departure_ts", "trip_last_arrival_ts", 
                 "stop_lon", "stop_lat"]
    

    df = df.assign(
        departure_hr = pd.to_datetime(df.trip_first_departure_ts, unit='s').dt.hour                                        
    ).drop(columns = drop_cols).drop_duplicates().reset_index(drop=True)
    
    # Not sure why across trip_ids, 
    # for the same route_id, there are differing max_stop_sequence
    # Use longest route (max stop sequence)?
    # Use median or mean service hours or miles traveled?
    group_cols = ["calitp_itp_id", "route_id"]
    
    
    # Should there be a check that there are mid-day trips for that route_id?
    # Select trip by departure_hr
    hour_order = [
        12, 11, 13, 10, 14, # ideally we want mid-day
        15, 7, 20, 6, 21, # but, can move into earlier PM or early AM
        0, 1, 2, 3, 4, 5, 22, 23, # owl service
        8, 9, # AM peak 
        16, 17, 18, 19, # PM peak
    ]
    for i in range(0, 24):
        if i == 0:
            df['selection_rank'] = df.apply(
                lambda x: hour_order[i] if x.departure_hr == i 
                else 0, axis=1) 
        else:
            df['selection_rank'] = df.apply(
                lambda x: hour_order[i] if x.departure_hr == i 
                else x.selection_rank, axis=1) 
    
    # Select a trip that is somewhere in 20th-25th percentile (lower means faster!)
    for i in [20, 25]:
        quantile = (df.groupby(group_cols)["service_hours"]
                    .quantile(i/100., ).reset_index()
                    .rename(columns = {"service_hours": f"p{i}"})
                   )
    
        df = pd.merge(df, quantile, 
                      on = group_cols,
                      how = "inner",
                      validate = "m:1"
                )
    
    # If there's still no faster trip identified, if service_hours doesn't fall
    # within that range, then select the one closest
    df["difference"] = df.service_hours - df.p20
    df["min_diff"] = df.groupby(group_cols)["difference"].transform("min")

    df = df.assign(
        faster_trip = df.apply(lambda x: 
                               1 if (
                               ((x.service_hours <= x.p25) and (x.service_hours >= x.p20)) 
                                   or (x.difference == x.min_diff))
                               else 0, axis=1),
    )
    
    
    # Now select the first trip
    df2 = (df[df.faster_trip==1].sort_values(group_cols + ["service_hours"])
           .drop_duplicates(subset=group_cols)
           .drop(columns = ["faster_trip", "difference", "min_diff", "p20", "p25"])
          )
    
    return df

In [26]:
trips6 = select_one_trip(trips5)

Faster trips are happening in early AM or late night (owl service).

* If the 20th-25th percentile service hours trips are selected, they hit mostly owl service periods. 
* If we sort by departure hour, vast majority will be mid-day, and where mid-day trip cannot be found, we end up with very few in owl service periods.

In [37]:
import altair as alt

group_cols = ["calitp_itp_id", "route_id"]
plot_me = (trips6[trips6.faster_trip==1]
           .sort_values(group_cols + ["difference"])
           .drop_duplicates(subset=group_cols)
           .drop(columns = "date")
           .reset_index(drop=True)
          )

print(f"# rows: {len(plot_me)}")
print(f"# trips: {plot_me.route_id.nunique()}")

hist1 = (alt.Chart(plot_me)
 .mark_bar()
 .encode(
     x=alt.X("departure_hr:Q", bin=alt.Bin(step=1)),
     y="count()",
 )
)

#hist1

# rows: 72
# trips: 72


In [36]:
trips7=trips6.assign(
    min_rank = trips6.groupby(["calitp_itp_id", "route_id"])["selection_rank"].transform("min")
)

plot_me = (trips7[trips7.selection_rank==trips7.min_rank]
           .sort_values(group_cols + ["difference"])
           .drop_duplicates(subset=group_cols)
           .drop(columns = "date")
           .reset_index(drop=True)
          )

print(f"# rows: {len(plot_me)}")
print(f"# trips: {plot_me.route_id.nunique()}")

hist2 = (alt.Chart(plot_me)
 .mark_bar()
 .encode(
     x=alt.X("departure_hr:Q", bin=alt.Bin(step=1)),
     y="count()",
 )
)

#hist2

# rows: 72
# trips: 72


In [13]:
# Usually, the mid-day trips are not the faster ones
trips6[trips6.selection_rank==0].faster_trip.value_counts()

0    158
1      5
Name: faster_trip, dtype: int64

In [14]:
trips6[trips6.faster_trip==1].service_hours.describe()

count    412.000000
mean       1.040008
std        0.366129
min        0.233333
25%        0.800000
50%        1.000000
75%        1.250000
max        2.050000
Name: service_hours, dtype: float64

In [15]:
trips6[trips6.faster_trip==0].service_hours.describe()

count    2309.000000
mean        1.375285
std         0.393532
min         0.350000
25%         1.083333
50%         1.333333
75%         1.616667
max         2.383333
Name: service_hours, dtype: float64