# Use existing warehouse queries?

* Grab trips for selected date.
* Let's test with a parquet already stored in GCS for `bus-service-increase` exercise.
* Filter down to 1 trip for `route_id`...pick for relatively free-flowing traffic

In [1]:
import datetime as dt
import geopandas as gpd
import os
import pandas as pd

os.environ["CALITP_BQ_MAX_BYTES"] = str(130_000_000_000)

from calitp.tables import tbl
from calitp import query_sql
from siuba import *

import utils
import shared_utils

DATA_PATH = f"{utils.GCS_FILE_PATH}2022_Jan/"

E0321 20:16:36.142038405     972 fork_posix.cc:70]           Fork support is only compatible with the epoll1 and poll polling strategies
E0321 20:16:45.778818294     972 fork_posix.cc:70]           Fork support is only compatible with the epoll1 and poll polling strategies


In [2]:
trips = pd.read_parquet(f"{DATA_PATH}trips_joined_thurs.parquet")

In [3]:
SELECTED_DATE = '2022-1-6'
#SELECTED_DATE = warehouse_queries.dates['thurs']

In [4]:
def grab_service_hours(df, SELECTED_DATE):
    daily_trip_info = (
        tbl.views.gtfs_schedule_fact_daily_trips()
        >> filter(_.service_date == SELECTED_DATE)
        >> select(_.calitp_itp_id, 
               _.trip_key, _.service_hours, 
               _.trip_first_departure_ts, _.trip_last_arrival_ts
              ) 
        >> collect()
    )
    
    df2 = pd.merge(df, 
                   daily_trip_info,
                   on = ["calitp_itp_id", "trip_key"],
                   how = "inner",
                   # m:1 because trips has stop_level data by trips
                   # 1 is on the right beause service_hours is trip-level
                   validate = "m:1"
                  )
    
    return df2

In [5]:
def select_one_trip(df):
    drop_cols = ["stop_sequence", "stop_id", "departure_time", 
                 "trip_first_departure_ts", "trip_last_arrival_ts", 
                ]
    

    df = df.assign(
        departure_hr = pd.to_datetime(df.trip_first_departure_ts, unit='s').dt.hour                                        
    ).drop(columns = drop_cols).drop_duplicates().reset_index(drop=True)
    
    # Across trip_ids, for the same route_id, there are differing max_stop_sequence
    # Can't use max_stop_sequence to find longest route_length
    # Use service hours instead to find faster trip during free-flowing traffic
    group_cols = ["calitp_itp_id", "route_id"]
    
    # Should there be a check that there are mid-day trips for that route_id?
    # Select trip by departure_hr
    hour_order = [
        12, 11, 13, 10, 14, # ideally we want mid-day
        15, 7, 20, 6, 21, # but, can move into earlier PM or early AM
        0, 1, 2, 3, 4, 5, 22, 23, # owl service
        8, 9, # AM peak 
        16, 17, 18, 19, # PM peak
    ]
    for i in range(0, 24):
        if i == 0:
            df['selection_rank'] = df.apply(
                lambda x: hour_order[i] if x.departure_hr == i 
                else 0, axis=1) 
        else:
            df['selection_rank'] = df.apply(
                lambda x: hour_order[i] if x.departure_hr == i 
                else x.selection_rank, axis=1) 
    
    # Select a trip that closest to 25th percentile (lower means faster!)
    # This groupby ruins the index, throws an error, so just merge in as separate df
    quantile = (df.groupby(group_cols)["service_hours"]
                .quantile(0.25).reset_index()
                .rename(columns = {"service_hours": "p25"})
               )
    
    df = pd.merge(df, quantile, 
                  on = group_cols,
                  how = "inner",
                  validate = "m:1"
            )
    
    # Select trip that is closest to 25th percentile (min_diff)
    df["difference"] = df.service_hours - df.p25
    df["min_diff"] = df.groupby(group_cols)["difference"].transform("min")

    df['faster_trip'] = df.apply(lambda x: 
                                 1 if x.difference == x.min_diff else 0, axis=1)
    
    # If there are multiple trips selected for a route, do a sort/drop duplicates
    # This df is trip-level (no stop_id, becuase that was dropped at beginning)
    df2 = (df[df.faster_trip==1]
           .sort_values(group_cols + ["departure_hr"], 
                        # If there are multiple trips with same service hours, 
                        # pick one with later departure hr (closer to mid-day)
                        ascending=[True, True, False])
           .drop_duplicates(subset=group_cols)
           .drop(columns = ["faster_trip", "difference", "min_diff", 
                            "p25", "selection_rank"])
           .reset_index(drop=True)
          )

    return df2

In [6]:
trips2 = grab_service_hours(trips, SELECTED_DATE)
trips2.to_parquet("./data/trips_with_hours.parquet")



In [7]:
trips2 = pd.read_parquet("./data/trips_with_hours.parquet")

trips3 = select_one_trip(trips2)

In [8]:
def subset_to_parallel_routes(df):
    # Just use route_id to flag parallel, not shape_id
    # It won't matter anyway, because we will use stop's point geom
    #parallel_routes = shared_utils.utils.download_geoparquet(utils.GCS_FILE_PATH, 
    #                                         "parallel_or_intersecting")
    parallel_routes = gpd.read_parquet("./data/parallel_or_intersecting.parquet")
    
    keep_cols = ["calitp_itp_id", "route_id", "geometry"]

    parallel_routes2 = (parallel_routes[parallel_routes.parallel==1]
           .reset_index(drop=True)
           .rename(columns = {"itp_id": "calitp_itp_id"})
           [keep_cols]
           .drop_duplicates()
           .reset_index(drop=True)
          )
    
    # Put parallel routes on the left because it has line geometry already
    gdf = pd.merge(
        parallel_routes2,
        df,
        on = ["calitp_itp_id", "route_id"],
        how = "inner",
        validate = "1:m",
    )
    
    return gdf

In [9]:
trips4 = subset_to_parallel_routes(trips3)

In [10]:
print(f"# rows in trips3: {len(trips3)}")
print(f"# rows in trips4: {len(trips4)}")
print(f"# LA Metro rows in trips3: {len(trips3[trips3.calitp_itp_id==182])}")
print(f"# LA Metro rows in trips4: {len(trips4[trips4.calitp_itp_id==182])}")

# rows in trips3: 3314
# rows in trips4: 2242
# LA Metro rows in trips3: 113
# LA Metro rows in trips4: 73


Decide if stop's point geometry should be used or route's line geometry.

If going with every 3rd or 5th bus stop, it's easier to take point geometry and have the car travel from point to point. The line geometry contains much more points in between stops and is not exactly the bus stop, but on the road.

For this calculation, distance traveled, speed, etc, it probably won't matter too much.

In [34]:
def grab_stop_geom(df):
    stop_info = (tbl.views.gtfs_schedule_dim_stops()
                 >> select(_.calitp_itp_id,
                       _.stop_id, _.stop_lon, _.stop_lat,
                      )
             >> distinct()
             >> collect()
    )
    
    df2 = pd.merge(
        df,
        (stop_info.sort_values(["calitp_itp_id", "stop_id", "stop_lon"])
         .drop_duplicates(subset=["calitp_itp_id", "stop_id"])
        ),
        on = ["calitp_itp_id", "stop_id"],
        how = "inner", 
        validate = "m:1"
    )
    
    gdf = (shared_utils.geography_utils.create_point_geometry(df2)
           .sort_values(["calitp_itp_id", "route_id", 
                         "trip_id", "stop_sequence"])
           .reset_index(drop=True)
           .drop(columns = ["stop_lon", "stop_lat", 
                            "trip_first_departure_ts", "trip_last_arrival_ts"])
          )

    return gdf

In [13]:
# Recall: trips2 contains stop_id
# trips4 is trip-level
# Pare down trips2 with isin
trips5 = trips2[trips2.trip_key.isin(trips4.trip_key)].reset_index(drop=True)

In [35]:
trips6 = grab_stop_geom(trips5)
trips6.to_parquet("./data/cleaned_trips_with_stops.parquet")

In [38]:
# Can use trips6 to select every 3rd or every 5th stop, calculate distance traveled, etc



This metadata specification does not yet make stability promises.  We do not yet recommend using this in a production setting unless you are able to rewrite your Parquet/Feather files.



In [36]:
trips6.columns

Index(['calitp_itp_id', 'date', 'trip_key', 'trip_id', 'is_in_service',
       'day_name', 'stop_sequence', 'stop_id', 'departure_time', 'shape_id',
       'route_id', 'service_hours', 'geometry'],
      dtype='object')

In [37]:
trips6[(trips6.calitp_itp_id==182) & (trips6.route_id=="901-13153")]

Unnamed: 0,calitp_itp_id,date,trip_key,trip_id,is_in_service,day_name,stop_sequence,stop_id,departure_time,shape_id,route_id,service_hours,geometry
20313,182,2022-01-06,-4877570458680265374,10901000570527-DEC21,True,Thursday,1,15684,05:27:00,9010057_DEC21,901-13153,0.6,POINT (-118.37799 34.16846)
20314,182,2022-01-06,-4877570458680265374,10901000570527-DEC21,True,Thursday,2,15611,05:30:00,9010057_DEC21,901-13153,0.6,POINT (-118.39730 34.16857)
20315,182,2022-01-06,-4877570458680265374,10901000570527-DEC21,True,Thursday,3,15661,05:35:00,9010057_DEC21,901-13153,0.6,POINT (-118.42292 34.17288)
20316,182,2022-01-06,-4877570458680265374,10901000570527-DEC21,True,Thursday,4,15583,05:37:00,9010057_DEC21,901-13153,0.6,POINT (-118.42960 34.17916)
20317,182,2022-01-06,-4877570458680265374,10901000570527-DEC21,True,Thursday,5,15546,05:40:00,9010057_DEC21,901-13153,0.6,POINT (-118.44826 34.18064)
20318,182,2022-01-06,-4877570458680265374,10901000570527-DEC21,True,Thursday,6,15535,05:43:00,9010057_DEC21,901-13153,0.6,POINT (-118.46903 34.18090)
20319,182,2022-01-06,-4877570458680265374,10901000570527-DEC21,True,Thursday,7,15588,05:46:00,9010057_DEC21,901-13153,0.6,POINT (-118.48443 34.18629)
20320,182,2022-01-06,-4877570458680265374,10901000570527-DEC21,True,Thursday,8,15515,05:48:00,9010057_DEC21,901-13153,0.6,POINT (-118.50072 34.18597)
20321,182,2022-01-06,-4877570458680265374,10901000570527-DEC21,True,Thursday,9,15415,05:54:00,9010057_DEC21,901-13153,0.6,POINT (-118.53680 34.18044)
20322,182,2022-01-06,-4877570458680265374,10901000570527-DEC21,True,Thursday,10,15435,05:56:00,9010057_DEC21,901-13153,0.6,POINT (-118.55419 34.18156)
