In [1]:
#https://stackoverflow.com/questions/55162077/how-to-get-the-driving-distance-between-two-geographical-coordinates-using-pytho
import geopandas as gpd
import os
import pandas as pd

os.environ["CALITP_BQ_MAX_BYTES"] = str(130_000_000_000)

from calitp.tables import tbl
from calitp import query_sql
from siuba import *

import shared_utils
import utils



In [None]:
''' Do we need this anymore?
daily_trips = (
    tbl.views.gtfs_schedule_dim_trips()
    >> select(_.itp_id==_.calitp_itp_id, _.trip_id, _.route_id)
    >> distinct()
)


routes_with_stops = (
    daily_stop_times 
    >> inner_join(_, daily_trips, on = ["itp_id", "trip_id"])
    >> select(_.itp_id, _.route_id, 
              _.stop_id, _.stop_lon, _.stop_lat,
              _.stop_sequence, _.shape_dist_traveled
             )
    >> collect()
)
'''

In [None]:
'''KEEP THIS

SELECTED_DATE = "2022-1-6"

tbl_stop_times = (
    tbl.views.gtfs_schedule_dim_stop_times()
    >> filter(_.calitp_extracted_at <= SELECTED_DATE, 
              _.calitp_deleted_at > SELECTED_DATE, 
             )
)


daily_stop_times = (
    tbl.views.gtfs_schedule_fact_daily_trips()
    >> filter(_.service_date == SELECTED_DATE, 
          _.is_in_service == True)
    >> filter(_.calitp_itp_id==182)
    >> left_join(_, tbl_stop_times,
              # also added url number to the join keys ----
             ["calitp_itp_id", "calitp_url_number", "trip_id"])
    >> select(_.calitp_itp_id,
           _.trip_id, _.route_id, _.stop_id, _.stop_sequence, 
          )    
    >> inner_join(_, 
                  (tbl.views.gtfs_schedule_dim_stops()
                   >> select(_.calitp_itp_id,
                            _.stop_id, _.stop_lon, _.stop_lat,
                            )
                  ), on = ["calitp_itp_id", "stop_id"]
    )
    >> arrange(_.calitp_itp_id, _.route_id, _.trip_id, _.stop_sequence)
    >> distinct()
    >> collect()
)
'''

In [None]:
#daily_stop_times.to_parquet("./data/metro_routes.parquet")

In [9]:
def select_longest_route(df):
    # Not sure why across trip_ids, 
    # for the same route_id, there are differing max_stop_sequence
    # Can't identify which stop sequence is linked to what trip_id
    # once we drop trip_id, and hard to select just 1 trip for that route
    # Go with the longest trip (max stops) and use that to stand-in for route
    group_cols = ["calitp_itp_id", "route_id"]
    df = df.assign(
        max_stop = (df.groupby(group_cols + ["trip_id"])
                    ["stop_sequence"].transform("max")
                   ),
    )
    
    df = df.assign(
        longest_trip = (df.groupby(group_cols)
                        ["max_stop"].transform("max")
                       )
    )
    
    df2 = (df[df.max_stop == df.longest_trip]
           .reset_index(drop=True)
           .drop(columns = ["max_stop", "longest_trip"])
           .rename(columns = {"calitp_itp_id": "itp_id"})
          )
    
    return df2

In [None]:
#routes_with_stops.to_parquet("./data/metro_routes.parquet")

In [10]:
routes_with_stops = pd.read_parquet("./data/metro_routes.parquet")
routes_with_stops = select_longest_route(routes_with_stops)

In [6]:
gdf = shared_utils.utils.download_geoparquet(utils.GCS_FILE_PATH, 
                                             "parallel_or_intersecting")

gdf = gdf[gdf.parallel==1].reset_index(drop=True)

# Start with LA Metro
gdf = gdf[gdf.itp_id==182].reset_index(drop=True)

In [11]:
def select_parallel_routes(df, parallel_info):
    gdf = (df[df.route_id.isin(parallel_info.route_id)]
            .sort_values(["itp_id", "route_id", "stop_sequence"])
            .drop_duplicates(subset=["itp_id", "route_id", "stop_sequence"])
            .reset_index(drop=True)
           )
    
    gdf = shared_utils.geography_utils.create_point_geometry(
        gdf, longitude_col = "stop_lon", latitude_col = "stop_lat",
    )
    
    return gdf

parallel = select_parallel_routes(routes_with_stops, gdf)

In [None]:
#https://stackoverflow.com/questions/25055712/pandas-every-nth-row
# Maybe not use every bus stop, since bus stops are spaced fairly closely
# Maybe every other, every 3rd? want to mimic the bus route, do not want
# to stray too far
#df = df.iloc[::3]

Don't like how `osmx` is returning the same nodes for bus stops, even at every 5th bus stop.

`osrm` doesn't install bc of some `GDAL` dependencies.

Can Google API be used? But need to check terms and conditions if we can make requests to calculate travel time or even grab speed limits through the
[Python package](https://github.com/googlemaps/google-maps-services-python)

At minimum, can calculate distance between stops, sum it up, and for cars, set an assumption of 30 mph or 45 mph. If we can't use Google API to grab speed limit, then we will hard code it.

In [12]:
def calculate_distance_traveled(df):
    group_cols = ["itp_id", "route_id"]
    sort_cols = group_cols + ["stop_sequence"]
    
    df = df.to_crs(shared_utils.geography_utils.CA_StatePlane)
    
    # Distance traveled
    df = df.assign(
        # Previous geometry
        start = (df.sort_values(sort_cols)
                 .groupby(group_cols)["geometry"]
                 .apply(lambda x: x.shift(1))),
        end = (df.sort_values(sort_cols)
               .groupby(group_cols)["geometry"]
               .apply(lambda x: x.shift(0))
              )
    )
    
    df = df.assign(
        feet_traveled = df.end.distance(df.start) 
    ).drop(columns = ["start", "end"])
        
    return df
            

In [29]:
def calculate_time_traveled(df):
    # Use a set of assumptions
    
    AVG_SPEED = 40
    
    df2 = shared_utils.geography_utils.aggregate_by_geography(
        df,
        group_cols = ["itp_id", "route_id"],
        sum_cols = ["feet_traveled"]
    )
    
    df2 = df2.assign(
        miles_traveled = df2.feet_traveled.divide(
            shared_utils.geography_utils.FEET_PER_MI)
    
    )
    
    # speed = distance / time
    # time = distance / speed
    df2 = df2.assign(
        car_trip_time_hr = df2.miles_traveled.divide(AVG_SPEED)
    ).drop(columns = "feet_traveled")
    
    return df2

In [32]:
df = calculate_distance_traveled(parallel)
df2 = calculate_time_traveled(df)
df2.head()

Unnamed: 0,itp_id,route_id,miles_traveled,car_trip_time_hr
0,182,10-13153,19.505628,0.487641
1,182,106-13153,70.819636,1.770491
2,182,117-13153,17.439304,0.435983
3,182,120-13153,28.118439,0.702961
4,182,125-13153,19.722584,0.493065


Comparison should be against bus's travel time along that route.

Can we pick one that is midday, one of the faster trips? Should be probably around 75th or 80th percentile.

Then see how long it takes for the bus to make that trip.

Actually, that travel time is in the data warehouse. Do another query, grab all the travel times, see if one can be selected for 75th or 80th percentile and if it's still less than 2x car trip time, then it can be selected as "viable parallel" route.

`views.gtfs_schedule_fact_daily_trips` has the `service_hours` column...should grab that in original query because later I drop a bunch of trips to get down to unique route, and select longest trip.

Unnamed: 0,itp_id,route_id,miles_traveled,car_trip_time_hr
0,182,10-13153,19.505628,0.487641
1,182,106-13153,70.819636,1.770491
2,182,117-13153,17.439304,0.435983
3,182,120-13153,28.118439,0.702961
4,182,125-13153,19.722584,0.493065
...,...,...,...,...
73,182,901-13153,62.055422,1.551386
74,182,910-13153,292.252678,7.306317
75,182,92-13153,26.032409,0.650810
76,182,94-13153,17.611668,0.440292
