# Estimate car vs bus travel time

* Pull out parallel routes. Run `setup_parallel_trips_with_stops.py`
* Make car travel down same route as the bus.
* `osmnx` snaps points to network nodes, but even when using only every 5th bus stop, several stops snap to the same node.
* `osrm` could not be installed in the Hub.
* `valhalla`? Kuan Butt's blog?

#### Quick and Dirty Approach
* Based on distance traveled, estimate car travel time with some assumptions (35, 40 mph?)
* For now, estimate car travel with a lower mph assumption so that some viable routes can be pulled. We don't want the bus to look worse than it is: the bus trips are mid-day (free-flowing), and they are being compared to car travel, which is probably estimated under free-flowing conditions too.

Later, swap out car travel time estimation with other approaches. Maybe use Google API to do requests.

In [1]:
#https://stackoverflow.com/questions/55162077/how-to-get-the-driving-distance-between-two-geographical-coordinates-using-pytho
import dotenv
import geopandas as gpd
import os
import pandas as pd

from siuba import *

from shared_utils import geography_utils

dotenv.load_dotenv("_env")

interfere with sqlalchemy_bigquery.
pybigquery should be uninstalled.
  return _bootstrap._gcd_import(name[level:], package, level)
E0406 23:35:10.078006158    2246 fork_posix.cc:70]           Fork support is only compatible with the epoll1 and poll polling strategies
E0406 23:35:12.416892764    2246 fork_posix.cc:70]           Fork support is only compatible with the epoll1 and poll polling strategies


True

In [2]:
# Google API key, read from the environment (the `_env` file is loaded with dotenv above).
# Indexing os.environ raises KeyError if the variable is not set.
GOOGLE_API_KEY = os.environ["GOOGLE_API_KEY"]

In [3]:
# Load the stop-level table for the parallel trips as a GeoDataFrame.
# NOTE(review): presumably the output of `setup_parallel_trips_with_stops.py` -- confirm.
df = gpd.read_parquet("./data/parallel_trips_with_stops.parquet")

In [4]:
# Grouping columns used by the helpers below to treat the rows of one trip together.
trip_group = ["calitp_itp_id", "route_id", "trip_id", "shape_id"]


def subset_stops(df, n):
    """Keep every n-th stop of each trip.

    Bus stops are spaced fairly closely, so instead of using every stop we
    take every other / every 3rd / ... one: enough to mimic the bus route
    without straying too far from it.
    See https://stackoverflow.com/questions/25055712/pandas-every-nth-row

    `stop_rank` is the 1-based position of a row within its `trip_group`,
    following the current row order. It is written onto `df` itself, so the
    caller's frame gains that column as well.

    Returns the rows of `df` whose `stop_rank` is a multiple of `n`.
    """
    df["stop_rank"] = df.groupby(trip_group).cumcount().add(1)

    is_nth_stop = df["stop_rank"].mod(n).eq(0)

    return df.loc[is_nth_stop]
    

In [5]:
# Work with just 2 trips, identified by trip_key
keep_trips = [
    -7505741281882708052,
    -8806955513757008482,
]

# Narrow df to those trips (fresh 0..n-1 index), then take every 3rd stop
df = df.loc[df["trip_key"].isin(keep_trips)].reset_index(drop=True)
subset = subset_stops(df, 3)

In [6]:
def select_origin_destination(df):
    """Reduce each trip to its first and last stop, with plain coordinates.

    For every `trip_group`, keeps the rows whose `stop_sequence` equals the
    group's minimum or maximum, then replaces the `geometry` column with
    `longitude` (geometry x) and `latitude` (geometry y) columns.
    """
    # transform() yields one value per input row; assign() lines those values
    # back up with `df` by index.
    seq_by_trip = (
        df.sort_values(trip_group + ["stop_sequence"])
        .groupby(trip_group)["stop_sequence"]
    )
    flagged = df.assign(
        origin=seq_by_trip.transform("min"),
        destination=seq_by_trip.transform("max"),
    )

    at_origin = flagged.stop_sequence.eq(flagged.origin)
    at_destination = flagged.stop_sequence.eq(flagged.destination)
    endpoints = (
        flagged.loc[at_origin | at_destination]
        .reset_index(drop=True)
        .drop(columns=["origin", "destination"])
    )

    # Swap the point geometry for separate coordinate columns
    points = endpoints.geometry
    return endpoints.assign(
        longitude=points.x,
        latitude=points.y,
    ).drop(columns="geometry")

In [22]:
# Wrangle it so there are columns with previous point and current point in the same row
def add_previous_location(df):
    """Pair each stop with the stop before it on the same trip.

    Expects `longitude`, `latitude` and `stop_sequence` columns in addition to
    the `trip_group` columns. Within each trip (ordered by `stop_sequence`)
    the previous row's coordinates are attached to the current row.

    Returns a frame with one row per consecutive pair of stops, where `start`
    and `end` hold (latitude, longitude) tuples. The first stop of each trip
    has no predecessor and is dropped; the raw coordinate columns are removed.
    """
    by_trip = (df.sort_values(trip_group + ["stop_sequence"])
               .groupby(trip_group)
              )

    # GroupBy.shift keeps the original row index, so `assign` can align the
    # shifted values back onto `df`. `.apply(lambda x: x.shift(1))` prepends
    # the group keys to the index on pandas >= 2.0, which no longer lines up
    # with `df`.
    df = df.assign(
        prev_longitude = by_trip["longitude"].shift(1),
        prev_latitude = by_trip["latitude"].shift(1),
    )

    # Only keep rows that have a previous point (drops the first obs for each trip grouping)
    df2 = df[df.prev_longitude.notna()].reset_index(drop=True)

    # Make tuples instead of just floats, in (latitude, longitude) order.
    # zip also copes with an empty frame, where a row-wise apply would not
    # produce a column.
    df3 = df2.assign(
        start = list(zip(df2.prev_latitude, df2.prev_longitude)),
        end = list(zip(df2.latitude, df2.longitude)),
    ).drop(columns = ["latitude", "longitude", "prev_latitude", "prev_longitude"])

    return df3

In [23]:
# First and last stop of each selected trip, with longitude / latitude columns
od = select_origin_destination(df)
od

Unnamed: 0,calitp_itp_id,date,trip_key,trip_id,is_in_service,day_name,stop_sequence,stop_id,departure_time,shape_id,route_id,service_hours,stop_rank,longitude,latitude
0,4,2022-01-06,-7505741281882708052,13277020,True,Thursday,1,4370,06:41:00,shp-10-10,10,0.45,1,-122.160201,37.721266
1,4,2022-01-06,-7505741281882708052,13277020,True,Thursday,36,1582,07:08:00,shp-10-10,10,0.45,36,-122.08718,37.670239
2,182,2022-01-06,-8806955513757008482,10910002100513-DEC21,True,Thursday,1,30019,05:13:00,9100210_DEC21,910-13153,0.816667,1,-118.045132,34.072191
3,182,2022-01-06,-8806955513757008482,10910002100513-DEC21,True,Thursday,21,30005,06:02:00,9100210_DEC21,910-13153,0.816667,21,-118.287306,33.869338


In [24]:
# Put each trip's previous point and current point on the same row as `start` / `end`
od2 = add_previous_location(od)
od2

Unnamed: 0,calitp_itp_id,date,trip_key,trip_id,is_in_service,day_name,stop_sequence,stop_id,departure_time,shape_id,route_id,service_hours,stop_rank,start,end
0,4,2022-01-06,-7505741281882708052,13277020,True,Thursday,36,1582,07:08:00,shp-10-10,10,0.45,36,"(37.721266, -122.160201)","(37.670239, -122.08718)"
1,182,2022-01-06,-8806955513757008482,10910002100513-DEC21,True,Thursday,21,30005,06:02:00,9100210_DEC21,910-13153,0.816667,21,"(34.072191, -118.045132)","(33.869338, -118.287306)"


In [25]:
#https://www.geeksforgeeks.org/python-calculate-distance-duration-two-places-using-google-distance-matrix-api/

In [None]:
!pip install googlemaps

In [10]:
import googlemaps

In [11]:
# Google Maps client created with the API key read from the environment
gmaps = googlemaps.Client(key=GOOGLE_API_KEY)

In [26]:
# Plain Python lists of the start / end values, one entry per row of od2
origin = od2.start.tolist()
dest = od2.end.tolist()

In [27]:
# Sanity check: type of the container and of its first element
print(type(origin))
print(type(origin[0]))

<class 'list'>
<class 'tuple'>


In [28]:
# Print each origin / destination pair side by side
for o, d in zip(origin, dest):
    print(o, d)

(37.721266, -122.160201) (37.670239, -122.08718)
(34.072191, -118.045132) (33.869338, -118.287306)


In [31]:
#https://www.linkedin.com/pulse/calculating-distances-using-python-google-maps-r%C3%A9gis-nisengwe?articleId=6625061973447053312

# Driving time, in hours, for each origin / destination pair
actual_duration = []

for o, d in zip(origin, dest):
    # One Distance Matrix request per pair; without this call `result` is
    # undefined (or stale from an earlier run) when it is converted below.
    # NOTE(review): each iteration hits the Google API -- check quota / billing
    # before running this on more than a handful of trips.
    result = gmaps.distance_matrix(o, d, mode='driving')["rows"][0]["elements"][0]["duration"]["value"]

    # The API reports duration in seconds; convert to hours
    result = result/3600
    actual_duration.append(result)

od2["duration (Hours)"] = actual_duration
od2


Unnamed: 0,calitp_itp_id,date,trip_key,trip_id,is_in_service,day_name,stop_sequence,stop_id,departure_time,shape_id,route_id,service_hours,stop_rank,start,end,duration (Hours)
0,4,2022-01-06,-7505741281882708052,13277020,True,Thursday,36,1582,07:08:00,shp-10-10,10,0.45,36,"(37.721266, -122.160201)","(37.670239, -122.08718)",0.256944
1,182,2022-01-06,-8806955513757008482,10910002100513-DEC21,True,Thursday,21,30005,06:02:00,9100210_DEC21,910-13153,0.816667,21,"(34.072191, -118.045132)","(33.869338, -118.287306)",0.556389


In [32]:
# Write the origin / destination table, including the duration column, to a local parquet file
od2.to_parquet("test_od.parquet")

In [None]:
#https://faun.pub/using-google-maps-distance-matrix-api-to-create-a-distance-table-93419c409d4f

In [None]:
# Back-of-envelope counts built from the number of unique routes and the size of `subset`.
# NOTE(review): presumably sizing how many routing requests each scenario would need -- confirm.
unique_routes = df[["calitp_itp_id", "route_id"]].drop_duplicates()
num_routes = len(unique_routes)

print(f"# unique routes: {num_routes}")

print(f"1st pass + 25% of stops in subset: {num_routes + 0.25*len(subset)}")
print(f"1st pass + 50% of stops in subset: {num_routes + 0.5*len(subset)}")
print(f"1st pass + 75% of stops in subset: {num_routes + 0.75*len(subset)}")
print(f"Upper bound: do not get rid of any routes, take every 3rd stop: {len(subset)}")

In [None]:
# Look at a single trip: all of its rows in df (note this rebinds keep_trips)
keep_trips = [-7505741281882708052]
df[df.trip_key.isin(keep_trips)]

In [None]:
# Same trip(s), but only the rows kept in `subset`
subset[subset.trip_key.isin(keep_trips)]

I don't like that `osmnx` returns the same nodes for different bus stops, even when using only every 5th bus stop.

`osrm` doesn't install bc of some `GDAL` dependencies.

Can the Google API be used? We need to check the terms and conditions to see whether we can make requests to calculate travel time, or even grab speed limits, through the
[Python package](https://github.com/googlemaps/google-maps-services-python)

At minimum, can calculate distance between stops, sum it up, and for cars, set an assumption of 30 mph or 45 mph. If we can't use Google API to grab speed limit, then we will hard code it.

In [None]:
def calculate_distance_traveled(df):
    """Add `feet_traveled`: the distance from the previous stop to each stop.

    `df` is reprojected to `geography_utils.CA_StatePlane`, rows are ordered
    by `stop_sequence` within each (`calitp_itp_id`, `route_id`) group, and
    each row's geometry is measured against the previous row's geometry in
    that group. The first row of each group has no previous stop.

    Returns the reprojected frame with a `feet_traveled` column added.
    NOTE(review): the distance is in the units of CA_StatePlane -- presumably
    feet, per the column name; confirm.
    """
    group_cols = ["calitp_itp_id", "route_id"]
    sort_cols = group_cols + ["stop_sequence"]

    # The file binds `geography_utils` directly (`from shared_utils import
    # geography_utils`); the bare name `shared_utils` is not defined.
    df = df.to_crs(geography_utils.CA_StatePlane)

    # Distance traveled
    # group_keys=False keeps the shifted result indexed like the input rows,
    # so `assign` can align it back onto `df` (pandas >= 2.0 would otherwise
    # prepend the group keys to the index).
    # NOTE(review): grouping is by route, not by trip -- if a route has several
    # trips, the "previous stop" can come from a different trip. Confirm this
    # is intended.
    df = df.assign(
        # Previous geometry
        start = (df.sort_values(sort_cols)
                 .groupby(group_cols, group_keys=False)["geometry"]
                 .apply(lambda x: x.shift(1))),
    )

    # The current stop is the row's own geometry, so no shifted `end` column
    # is needed.
    df = df.assign(
        feet_traveled = df["geometry"].distance(df.start)
    ).drop(columns = ["start"])

    return df
            

In [None]:
# NOTE(review): `parallel` is not defined in the cells shown in this notebook -- confirm where it comes from.
df = calculate_distance_traveled(parallel)

In [None]:
def calculate_time_traveled(df, avg_speed_mph=40):
    """Estimate per-trip car travel time from distance and an assumed speed.

    Parameters
    ----------
    df : stop-level frame with `feet_traveled`, `stop_sequence`,
        `service_hours` and the trip identifier / timestamp columns used
        for grouping below.
    avg_speed_mph : assumed average car speed (a positive scalar), used as
        time = distance / speed. Defaults to 40.

    Returns
    -------
    The frame produced by `geography_utils.aggregate_by_geography` (grouped
    on the trip columns, summing `feet_traveled`, averaging `service_hours`
    and `max_stop`), with `miles_traveled` and `car_trip_time_hr` added and
    `feet_traveled` dropped.

    Raises
    ------
    ValueError : if `avg_speed_mph` is not positive.

    NOTE(review): this groups on `itp_id`, while `calculate_distance_traveled`
    groups on `calitp_itp_id` -- confirm which column the input carries.
    """
    if avg_speed_mph <= 0:
        raise ValueError(f"avg_speed_mph must be positive, got {avg_speed_mph}")

    # Highest stop_sequence of each trip, repeated on every row of the trip
    df = df.assign(
        max_stop = (df.groupby(["itp_id", "route_id", "trip_id"])
                    ["stop_sequence"].transform("max"))
    )

    # The file binds `geography_utils` directly (`from shared_utils import
    # geography_utils`); the bare name `shared_utils` is not defined.
    df2 = geography_utils.aggregate_by_geography(
        df,
        group_cols = ["itp_id", "route_id", "trip_id",
                     "trip_first_departure_ts", "trip_last_arrival_ts"],
        sum_cols = ["feet_traveled"],
        mean_cols = ["service_hours", "max_stop"]
    )

    df2 = df2.assign(
        miles_traveled = df2.feet_traveled.divide(geography_utils.FEET_PER_MI)
    )

    # speed = distance / time
    # time = distance / speed
    df2 = df2.assign(
        car_trip_time_hr = df2.miles_traveled.divide(avg_speed_mph),
    ).drop(columns = "feet_traveled")

    return df2

In [None]:
# Per-trip car travel time estimate
df2 = calculate_time_traveled(df)