# Estimate car vs bus travel time

* Pull out parallel routes. Run `setup_parallel_trips_with_stops.py`
* Make car travel down same route as the bus. Use `googlemaps`, and set the departure hour to be the same time as bus route's.
    * `osmnx` snaps to nodes, but even when using only every 5th bus stop, it snaps several stops to the same node.
    * `osrm` could not be installed on the Hub.

In [None]:
#!pip install -r requirements.txt

In [1]:
#https://stackoverflow.com/questions/55162077/how-to-get-the-driving-distance-between-two-geographical-coordinates-using-pytho
import dotenv
import geopandas as gpd
import googlemaps
import intake
import os
import pandas as pd

from datetime import datetime
from siuba import *

# Load environment variables from the local `_env` file before reading them below.
dotenv.load_dotenv("_env")

# Open every intake catalog YAML file in the current directory.
catalog = intake.open_catalog("./*.yml")

# Raises KeyError if GOOGLE_API_KEY is not set in the environment
# (presumably it is defined in `_env` -- confirm).
GOOGLE_API_KEY = os.environ["GOOGLE_API_KEY"]



In [2]:
def grab_latlon_make_tuple(df, point_geom_col):
    """Add a column of (latitude, longitude) tuples built from a point column.

    Parameters
    ----------
    df : DataFrame-like
        Table holding the point column. It is not modified.
    point_geom_col : str
        Name of the column whose values expose `.x` and `.y` accessors
        (e.g. a geopandas GeoSeries of points).

    Returns
    -------
    A copy of `df` with one extra column named `f"{point_geom_col}_tuple"`,
    where each value is `(y, x)`, i.e. (latitude, longitude).
    """
    points = df[point_geom_col]

    # Pair the coordinates column-wise instead of calling a Python function on
    # every row. This also works on an empty frame and needs no temporary
    # "latitude" / "longitude" / "new" columns, so columns with those names
    # that already exist in `df` are left untouched.
    latlon_pairs = list(zip(points.y, points.x))

    return df.assign(**{f"{point_geom_col}_tuple": latlon_pairs})

In [3]:
def data_wrangling(df):
    """Reshape the stops table into the inputs that googlemaps needs.

    * `departure_time` is parsed as a datetime (unparseable values are
      coerced to missing) and reduced to its time-of-day, because
      googlemaps doesn't want a past date.
    * `geometry` is replaced by the (latitude, longitude) tuple produced by
      `grab_latlon_make_tuple`.

    Returns a new table; `df` itself is not reassigned in the caller.
    """
    # Move this step into script later?
    # Or just leave in this notebook, wrangle it into what goes into googlemaps
    parsed_departure = pd.to_datetime(df.departure_time, errors="coerce")
    with_time_only = df.assign(departure_time=parsed_departure.dt.time)

    with_tuples = grab_latlon_make_tuple(with_time_only, "geometry")

    # Swap the point column for its tuple version, keeping the name "geometry"
    without_points = with_tuples.drop(columns="geometry")
    wrangled = without_points.rename(columns={"geometry_tuple": "geometry"})

    return wrangled

In [4]:
# Read the `parallel_trips_with_stops` entry from the intake catalog.
df = catalog.parallel_trips_with_stops.read()

# Convert departure_time to a time-of-day and geometry to (lat, lon) tuples.
df = data_wrangling(df)

# Selecting 2 trips
keep_trips = [
    -7505741281882708052,
    -8806955513757008482 
]

# Keep only the rows whose trip_key is one of the selected trips.
df = df[df.trip_key.isin(keep_trips)].reset_index(drop=True)

In [5]:
# Columns used together as the grouping key for one trip.
trip_group = ["calitp_itp_id", "route_id", "trip_id", "shape_id"]

def subset_stops(df, n):
    """Keep every n-th stop of each trip.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per stop, containing the `trip_group` columns. It is not
        modified (the `stop_rank` column is added to a copy).
    n : int
        Sampling interval: rows whose 1-based rank within the trip is a
        multiple of `n` are kept (n=3 keeps stops 3, 6, 9, ...).

    Returns
    -------
    pandas.DataFrame
        The selected rows, with their original index and an extra
        `stop_rank` column.

    Notes
    -----
    Rank follows the current row order within each trip, not
    `stop_sequence`. NOTE(review): this assumes rows arrive already ordered
    by stop sequence -- confirm against the upstream script.
    """
    # https://stackoverflow.com/questions/25055712/pandas-every-nth-row
    # Maybe not use every bus stop, since bus stops are spaced fairly closely
    # Maybe every other, every 3rd? want to mimic the bus route, do not want
    # to stray too far
    stop_rank = df.groupby(trip_group).cumcount() + 1

    # assign() works on a copy, so the caller's frame does not silently gain
    # a `stop_rank` column as a side effect of calling this function.
    ranked = df.assign(stop_rank=stop_rank)

    return ranked[stop_rank % n == 0]

In [6]:
# For now, take every 3rd stop (stops 3, 6, 9, ...) of each trip.
# The 1st and last stops are handled separately as origin/destination.
# Potentially increase this to every 5th?
subset = subset_stops(df, 3)

In [7]:
def select_origin_destination(df):
    """Build one origin/destination row per trip from its first and last stop.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per stop with the `trip_group` columns, `stop_sequence`,
        `departure_time` and `geometry`.

    Returns
    -------
    pandas.DataFrame
        Columns `trip_group + ["departure_time", "destination", "origin"]`.
        `origin` / `destination` are the `geometry` values of the stops with
        the smallest / largest `stop_sequence` in the trip, and
        `departure_time` is the value on the origin stop's row. Trips with a
        single stop have no previous stop and are dropped.
    """
    sort_cols = trip_group + ["stop_sequence"]

    # Flag the rows holding each trip's smallest or largest stop_sequence.
    # transform() is order-independent, so no sort is needed here.
    stop_seq_by_trip = df.groupby(trip_group)["stop_sequence"]
    is_endpoint = (
        df.stop_sequence.eq(stop_seq_by_trip.transform("min"))
        | df.stop_sequence.eq(stop_seq_by_trip.transform("max"))
    )
    endpoints = df[is_endpoint].reset_index(drop=True)

    # Bring the previous (origin) stop's values onto the destination row.
    # GroupBy.shift keeps the original row index, so assign() aligns by index;
    # groupby.apply(lambda x: x.shift(1)) can prepend the group keys to the
    # index depending on the pandas version, which breaks that alignment.
    previous = (
        endpoints.sort_values(sort_cols)
        .groupby(trip_group)[["geometry", "departure_time"]]
        .shift(1)
    )
    endpoints = endpoints.assign(
        origin=previous["geometry"],
        origin_departure=previous["departure_time"],
    )

    # Only the destination row of each trip has an origin filled in.
    # Report the departure time of the origin stop, since that is when the
    # car trip should start.
    # NOTE(review): if departure_time is a trip-level value repeated on every
    # stop row, this is identical to the destination row's value -- confirm.
    od = (
        endpoints[endpoints.origin.notna()]
        .drop(columns="departure_time")
        .rename(columns={"origin_departure": "departure_time",
                         "geometry": "destination"})
        [trip_group + ["departure_time", "destination", "origin"]]
        .reset_index(drop=True)
    )

    return od

In [8]:
# One row per trip: first stop as origin, last stop as destination.
od = select_origin_destination(df)
od

Unnamed: 0,calitp_itp_id,route_id,trip_id,shape_id,departure_time,destination,origin
0,4,10,13277020,shp-10-10,07:08:00,"(37.670239, -122.08718)","(37.721266, -122.160201)"
1,182,910-13153,10910002100513-DEC21,9100210_DEC21,06:02:00,"(33.869338, -118.287306)","(34.072191, -118.045132)"


In [9]:
# Try googlemaps.directions(), which allows for waypoints put in as an array
def assemble_waypoints(df):
    """Collect each trip's stop points into a single list.

    Rows are ordered by the `trip_group` columns and `stop_sequence`, then
    grouped by `trip_group`; the `geometry` values of every group are
    gathered into a list stored in a `waypoints` column.

    Returns one row per trip with the `trip_group` columns and `waypoints`.
    """
    # Take all the stops that were passed in, put tuples into a list
    #https://stackoverflow.com/questions/22219004/how-to-group-dataframe-rows-into-list-in-pandas-groupby
    ordered_stops = df.sort_values([*trip_group, "stop_sequence"])
    stops_by_trip = ordered_stops.groupby(trip_group)

    points_per_trip = stops_by_trip.agg({"geometry": lambda pts: list(pts)})

    waypoint_df = points_per_trip.reset_index()
    waypoint_df = waypoint_df.rename(columns={"geometry": "waypoints"})

    return waypoint_df

In [10]:
# Waypoints come from the every-3rd-stop subset, not from the full stop list.
waypoints_by_trip = assemble_waypoints(subset)
waypoints_by_trip

Unnamed: 0,calitp_itp_id,route_id,trip_id,shape_id,waypoints
0,4,10,13277020,shp-10-10,"[(37.724727, -122.157549), (37.722488, -122.15..."
1,182,910-13153,10910002100513-DEC21,9100210_DEC21,"[(34.055644, -118.21099), (34.054253, -118.243..."


In [11]:
# Attach each trip's waypoint list to its origin/destination row.
# validate="1:1" makes the merge raise if either side has duplicate trip keys.
final = pd.merge(od, waypoints_by_trip, 
                 on = trip_group, how = "inner", validate = "1:1"
                )

final.head()

Unnamed: 0,calitp_itp_id,route_id,trip_id,shape_id,departure_time,destination,origin,waypoints
0,4,10,13277020,shp-10-10,07:08:00,"(37.670239, -122.08718)","(37.721266, -122.160201)","[(37.724727, -122.157549), (37.722488, -122.15..."
1,182,910-13153,10910002100513-DEC21,9100210_DEC21,06:02:00,"(33.869338, -118.287306)","(34.072191, -118.045132)","[(34.055644, -118.21099), (34.054253, -118.243..."


In [None]:
#https://www.geeksforgeeks.org/python-calculate-distance-duration-two-places-using-google-distance-matrix-api/

In [12]:
# Google Maps API client, authenticated with the key read from the environment.
gmaps = googlemaps.Client(key=GOOGLE_API_KEY)

In [13]:
# Pull each googlemaps input out of `final` as a plain Python list
origin, dest, departures, waypts = (
    final[column].tolist()
    for column in ["origin", "destination", "departure_time", "waypoints"]
)

print(f"type of origin input: {type(origin)}")
print(f"type of first row: {type(origin[0])}")

# Show what the first trip's inputs look like
first_pair = next(zip(origin, dest))
print(0, first_pair)
print(first_pair[0])
print(departures[0])
print(waypts[0])
print(type(waypts[0]))

type of origin input: <class 'list'>
type of first row: <class 'tuple'>
0 ((37.721266, -122.160201), (37.670239, -122.08718))
(37.721266, -122.160201)
07:08:00
[(37.724727, -122.157549), (37.722488, -122.152545), (37.717496, -122.145962), (37.71253, -122.139188), (37.705447, -122.12906), (37.699342, -122.121197), (37.698266, -122.124512), (37.695795, -122.11529), (37.690661, -122.107995), (37.683028, -122.097653), (37.67769, -122.090881), (37.670239, -122.08718)]
<class 'list'>


In [14]:
# Just do 1 query
# Origin/destination are hard-coded (lat, lon) tuples; they match the first
# trip's origin/destination printed above.
beginning = (37.721266, -122.160201)
ending = (37.670239, -122.08718)
# Departure time and waypoint list of the first trip in `final`.
departure1 = departures[0]
waypts1 = waypts[0]

In [15]:
# `departure_time` must be an int (seconds since epoch) or a
# datetime.datetime for googlemaps; `departure1` is only a time-of-day
# (datetime.time). Attach tomorrow's date so the request carries a full,
# future timestamp.
# NOTE(review): passing the bare time is the likely cause of the HTTP 400
# from this call -- confirm. The datetime is naive, so it is interpreted in
# the machine's local time zone.
tomorrow = (pd.Timestamp.today() + pd.Timedelta(days=1)).date()
departure_dt = datetime.combine(tomorrow, departure1)

gmaps.directions(beginning, ending, mode='driving', 
                 departure_time=departure_dt, waypoints=waypts1)

HTTPError: HTTP Error: 400

### Using distance_matrix - works, but API key now restricted to `directions`

* https://traveltime.com/blog/google-distance-matrix-api-traveltime-search-api - there is a 25-request limit for `distance_matrix`

In [None]:
#https://www.linkedin.com/pulse/calculating-distances-using-python-google-maps-r%C3%A9gis-nisengwe?articleId=6625061973447053312
'''
actual_duration = []

for i, od_tup in enumerate(zip(origin, dest)):
    result = gmaps.distance_matrix(od_tup[0], od_tup[1], mode='driving', departure_time=departures[i])["rows"][0]["elements"][0]["duration"]["value"]  
   
    result = result/3600
    actual_duration.append(result)
    
final["duration (Hours)"] = actual_duration
final
#final.to_parquet("test_od.parquet")
'''

In [None]:
#https://faun.pub/using-google-maps-distance-matrix-api-to-create-a-distance-table-93419c409d4f

### Estimate how many requests

In [None]:
# Rough estimate of how many API requests different sampling choices need.
# One row per distinct (calitp_itp_id, route_id) pair.
unique_routes = df[["calitp_itp_id", "route_id"]].drop_duplicates()
num_routes = len(unique_routes)

print(f"# unique routes: {num_routes}")

# Scenarios: one request per route plus a share of the stops in `subset`.
print(f"1st pass + 25% of stops in subset: {num_routes + 0.25*len(subset)}")
print(f"1st pass + 50% of stops in subset: {num_routes + 0.5*len(subset)}")
print(f"1st pass + 75% of stops in subset: {num_routes + 0.75*len(subset)}")
print(f"Upper bound: do not get rid of any routes, take every 3rd stop: {len(subset)}")

In [None]:
# Look at every stop of a single trip (this rebinds `keep_trips` to one key).
keep_trips = [-7505741281882708052]
df[df.trip_key.isin(keep_trips)]

In [None]:
# Same trip, but only the stops kept in the every-3rd-stop subset.
subset[subset.trip_key.isin(keep_trips)]

I don't like that `osmnx` returns the same nodes for different bus stops, even when only every 5th bus stop is used.

`osrm` doesn't install because of some `GDAL` dependencies.

Can Google API be used? But need to check terms and conditions if we can make requests to calculate travel time or even grab speed limits through the
[Python package](https://github.com/googlemaps/google-maps-services-python)

At minimum, can calculate distance between stops, sum it up, and for cars, set an assumption of 30 mph or 45 mph. If we can't use Google API to grab speed limit, then we will hard code it.

In [None]:
def calculate_distance_traveled(df):
    """Add a `feet_traveled` column: distance from the previous stop's geometry.

    The frame is reprojected to `shared_utils.geography_utils.CA_StatePlane`,
    then each row's geometry is compared with the geometry of the preceding
    row (ordered by `stop_sequence`) within the same
    (`calitp_itp_id`, `route_id`) group. The first row of each group has no
    previous geometry.

    NOTE(review): `shared_utils` is not imported in the cells visible in this
    notebook -- confirm it is in scope before running.
    NOTE(review): grouping is by route, not by trip, so on a frame holding
    several trips of one route the "previous" row may belong to another
    trip -- confirm this is intended.
    NOTE(review): the column name implies the projected CRS is in feet --
    confirm against `CA_StatePlane`.
    """
    group_cols = ["calitp_itp_id", "route_id"]
    sort_cols = group_cols + ["stop_sequence"]
    
    # Reproject so that distances are measured in the projected CRS's units.
    df = df.to_crs(shared_utils.geography_utils.CA_StatePlane)
    
    # Distance traveled
    df = df.assign(
        # Previous geometry
        start = (df.sort_values(sort_cols)
                 .groupby(group_cols)["geometry"]
                 .apply(lambda x: x.shift(1))),
        # Current geometry: shift(0) leaves the values where they are
        end = (df.sort_values(sort_cols)
               .groupby(group_cols)["geometry"]
               .apply(lambda x: x.shift(0))
              )
    )
    
    # Distance between current and previous geometry; the helper columns
    # are dropped afterwards.
    df = df.assign(
        feet_traveled = df.end.distance(df.start) 
    ).drop(columns = ["start", "end"])
        
    return df
            

In [None]:
# NOTE(review): `parallel` is not defined in any cell visible in this
# notebook -- confirm where it is supposed to come from.
df = calculate_distance_traveled(parallel)

In [None]:
def calculate_time_traveled(df):
    """Estimate car travel time per trip from distance and an assumed speed.

    Steps:
    1. Record each trip's largest `stop_sequence` as `max_stop`.
    2. Aggregate with `shared_utils.geography_utils.aggregate_by_geography`:
       sum `feet_traveled`, average `service_hours` and `max_stop`.
    3. Convert feet to miles and divide by the assumed average speed
       (time = distance / speed) to get `car_trip_time_hr`.

    Returns the aggregated table without the `feet_traveled` column.

    NOTE(review): this groups on `itp_id`, while other cells in this
    notebook use `calitp_itp_id` -- confirm the expected column name.
    """
    # Use a set of assumptions
    AVG_SPEED = 40

    geography_utils = shared_utils.geography_utils
    trip_cols = ["itp_id", "route_id", "trip_id"]

    last_stop = df.groupby(trip_cols)["stop_sequence"].transform("max")
    stops = df.assign(max_stop=last_stop)

    per_trip = geography_utils.aggregate_by_geography(
        stops,
        group_cols=trip_cols + ["trip_first_departure_ts", "trip_last_arrival_ts"],
        sum_cols=["feet_traveled"],
        mean_cols=["service_hours", "max_stop"],
    )

    miles = per_trip.feet_traveled.divide(geography_utils.FEET_PER_MI)
    per_trip = per_trip.assign(miles_traveled=miles)

    # speed = distance / time  =>  time = distance / speed
    hours = per_trip.miles_traveled.divide(AVG_SPEED)
    per_trip = per_trip.assign(car_trip_time_hr=hours)

    return per_trip.drop(columns="feet_traveled")

In [None]:
# Expects `df` to already carry `feet_traveled` (added by
# calculate_distance_traveled) plus the trip-level columns the function
# groups on.
df2 = calculate_time_traveled(df)