# Estimate car vs bus travel time

* Pull out parallel routes. Run `setup_parallel_trips_with_stops.py`
* Make car travel down same route as the bus. Use `googlemaps`, and set the departure hour to be the same time as bus route's.
    * `osmx` snaps to nodes, but even for every 5th bus stop, it's snapping to same node.
    * `osrm` wasn't able to be installed in Hub

In [None]:
#!pip install -r requirements.txt

In [1]:
#https://stackoverflow.com/questions/55162077/how-to-get-the-driving-distance-between-two-geographical-coordinates-using-pytho
import dotenv
import geopandas as gpd
import googlemaps
import intake
import os
import pandas as pd

from datetime import datetime, timedelta
from siuba import *

dotenv.load_dotenv("_env")

catalog = intake.open_catalog("./*.yml")

GOOGLE_API_KEY = os.environ["GOOGLE_API_KEY"]



In [2]:
def grab_latlon_make_tuple(df, point_geom_col):
    # Change this from shapely point to floats
    df = df.assign(
        longitude = df[point_geom_col].x,
        latitude = df[point_geom_col].y,
    )
    
    # Make tuples instead of just floats
    df2 = (df.assign(
            new = df.apply(lambda x: (x.latitude, x.longitude), axis=1)        
        ).rename(columns = {"new": f"{point_geom_col}_tuple"})
          .drop(columns = ["latitude", "longitude"])
    )
    
    return df2

In [3]:
def data_wrangling(df):
    # Move this step into script later? 
    # Or just leave in this notebook, wrangle it into what goes into googlemaps
    
    df = df.assign(
        # Just grab time because googlemaps doesn't want past date 
        departure_date = pd.to_datetime(df.departure_time, errors="coerce").dt.date,
        departure_time2 = pd.to_datetime
    )
    
    
    df2 = (grab_latlon_make_tuple(df, "geometry")
           .drop(columns = "geometry")
           .rename(columns = {"geometry_tuple": "geometry"})
          )
    
    return df2

In [86]:
df = catalog.parallel_trips_with_stops.read()


In [80]:
df = df.head()

In [81]:
import warehouse_queries
SELECTED_DATE = warehouse_queries.dates["thurs"]


from datetime import date

year = int(SELECTED_DATE.split('-')[0])
month = int(SELECTED_DATE.split('-')[1])
day = int(SELECTED_DATE.split('-')[2])

In [98]:
df[df.departure_time.str.contains("24:")]

ValueError: Cannot mask with non-boolean array containing NA / NaN values

In [97]:
df = df.assign(
    departure_time = df.apply(lambda x: 
                              datetime.strptime(x.departure_time, "%H:%M:%S").time(), 
                              axis=1)
)

ValueError: time data '24:00:33' does not match format '%H:%M:%S'

In [78]:
df

Unnamed: 0,calitp_itp_id,date,trip_key,trip_id,is_in_service,day_name,stop_sequence,stop_id,departure_time,shape_id,route_id,service_hours,geometry
0,4,2022-01-06,-7505741281882708052,13277020,True,Thursday,1,4370,NaT,shp-10-10,10,0.45,POINT (-122.16020 37.72127)
1,4,2022-01-06,-7505741281882708052,13277020,True,Thursday,2,4318,NaT,shp-10-10,10,0.45,POINT (-122.15635 37.72287)
2,4,2022-01-06,-7505741281882708052,13277020,True,Thursday,3,4316,NaT,shp-10-10,10,0.45,POINT (-122.15755 37.72473)
3,4,2022-01-06,-7505741281882708052,13277020,True,Thursday,4,4281,NaT,shp-10-10,10,0.45,POINT (-122.15637 37.72521)
4,4,2022-01-06,-7505741281882708052,13277020,True,Thursday,5,4446,NaT,shp-10-10,10,0.45,POINT (-122.15411 37.72361)


In [76]:
df.assign(
    trip_datetime = df.apply(lambda x: 
                             datetime.combine(date(year, month, day), x.departure_time)
                            ), axis=1)


AttributeError: 'Series' object has no attribute 'departure_time'

In [60]:
datetime.combine(date(year, month, day), df.trip_departure[0])

datetime.datetime(2022, 1, 6, 6, 41)

In [39]:
date.strftime(SELECTED_DATE, format="%Y %b %d")

TypeError: descriptor 'strftime' for 'datetime.date' objects doesn't apply to a 'str' object

In [37]:
from datetime import date

In [4]:
df = catalog.parallel_trips_with_stops.read()

df = data_wrangling(df)

every 3rd 

* origin = 1
* destination >= 77 & destination <= 79
* divisible by 3, remainder 1
* 4, 7, 10, 13, 16, 
* 19, 22, 25, 28, 31
* 34, 37, 40, 43, 46,
* 49, 52, 55, 58, 61,
* 64, 67, 70, 73, 76,

every 4th
* origin = 1
* destination >= 79 & destination <= 105
* divisible by 4, remainder 1
* 5, 9, 13, 17, 21,
* 25, 29, 33, 37, 41, 
* 45, 49, 53, 57, 61
* 65, 69, 73, 77, 81,
* 85, 89, 93, 97, 101

every 5th
* origin = 1
* destination >= 126 & destination <= 131
* divisible by 5, remainder 1
* 6, 11, 16, 21, 26,
* 31, 36, 41, 46, 51,
* 56, 61, 66, 71, 76,
* 81, 86, 91, 96, 101,
* 106, 111, 116, 121, 126

In [5]:
# (x - first_waypoint_rank) / divisible_by) + 1 = 25
# 25 waypoints in the middle, then allow origin and destination to bookend it
# origin = stop1
# destination is >= last_waypoint_rank & <= last_waypoint_rank + divisible_by
# can allow destination to be last_waypoint_rank

def last_waypoint_rank(divisible_by):
    first = 1
    first_waypoint = first + divisible_by
    # (x- first_waypoint) / divisible_by) + 1 = 25
    # Solve for x
    x = (divisible_by * 24) + first_waypoint
    return x, x+divisible_by

In [6]:
trip_group = ["calitp_itp_id", "route_id", "trip_id", "shape_id"]

def subset_stops(df):
    # https://stackoverflow.com/questions/25055712/pandas-every-nth-row
    # df = df.iloc[::3]
    df["stop_rank"] = df.groupby(trip_group).cumcount() + 1
    df["max_stop"] = df.groupby(trip_group)["stop_rank"].transform("max")
    
    # every 3rd for shorter routes
    # every 4th, 5th for longer ones...stay under 25 waypoints
    every_third, dest3_max = last_waypoint_rank(divisible_by=3)
    every_fourth, dest4_max = last_waypoint_rank(divisible_by=4)
    every_fifth, dest5_max = last_waypoint_rank(divisible_by=5)
    # Do we want to go beyond every 5th? Maybe need to investigate 
    
    def tag_waypoints(row):
        flag = 0
        if row.max_stop <= dest3_max:
            # Want remainder of 1, because if we are keeping stop 1, then 3rd stop is stop 4.
            if row.stop_rank % 3 == 1:
                flag=1
        elif (row.max_stop > dest3_max) and (row.max_stop <= dest4_max):
            if row.stop_rank % 4 == 1:
                flag = 1
        elif (row.max_stop > dest4_max) and (row.max_stop <= dest5_max):
            if row.stop_rank % 5 == 1:
                flag = 1
        return flag

    df["is_waypoint"] = df.apply(tag_waypoints, axis=1)
    df["is_od"] = df.apply(lambda x: 
                           1 if ((x.stop_rank==1) or (x.stop_rank==x.max_stop))
                           else 0, axis=1)
                           
    # Can have up to 25 waypoints to not be charged
    # we also have origin and destination
    subset = (df[(df.is_waypoint==1) | 
                (df.is_od==1) 
               ].drop(columns = ["stop_rank", "max_stop"])
              .reset_index(drop=True)
             )
    
    return subset

In [7]:
subset = subset_stops(df)

In [8]:
# Selecting 2 trips
keep_trips = [
    -7505741281882708052,
    -8806955513757008482 
]

subset = subset[subset.trip_key.isin(keep_trips)].reset_index(drop=True)

In [None]:
# if there are any keys that don't have duration_in_traffic, throw an error and it's not getting what we expect


In [None]:
# if it feels too slow, parallelize this in some way
# async io it!
# functools cache?
# write a function, take beginning, end, waypoints, departure time, and have that
# function reference the map's call

In [13]:
def select_origin_destination(df):
    df = df[df.is_od==1].reset_index(drop=True)
    
    # Wrangle it so there are columns with previous point and current point in the same row
    df = df.assign(
        previous = (df.sort_values(trip_group + ["stop_sequence"])
                        .groupby(trip_group)["geometry"]
                        .apply(lambda x: x.shift(1))
                       ),
    )
    
    # Only keep the observation that has start_geom (drop the first obs for each trip grouping)
    df2 = (df[df.previous.notna()]
           [trip_group + ["departure_time", "geometry", "previous"]]
           .reset_index(drop=True)
           .rename(columns = {"geometry": "destination", 
                              "previous": "origin"})
          )
    
    return df2

In [14]:
od = select_origin_destination(subset)
od

Unnamed: 0,calitp_itp_id,route_id,trip_id,shape_id,departure_time,destination,origin
0,4,10,13277020,shp-10-10,07:08:00,"(37.670239, -122.08718)","(37.721266, -122.160201)"
1,182,910-13153,10910002100513-DEC21,9100210_DEC21,06:02:00,"(33.869338, -118.287306)","(34.072191, -118.045132)"


In [15]:
# Try googlemaps.directions(), which allows for waypoints put in as an array
def assemble_waypoints(df):
    df = df[(df.is_waypoint==1) & (df.is_od==0)].reset_index(drop=True)
    
    # Take all the stops in between origin/destination, put tuples into a list
    #https://stackoverflow.com/questions/22219004/how-to-group-dataframe-rows-into-list-in-pandas-groupby
    waypoint_df = (
        df.sort_values(trip_group + ["stop_sequence"])
        .groupby(trip_group)
        .agg({"geometry": lambda x: list(x)})
        .reset_index()
        .rename(columns = {"geometry": "waypoints"})
    )
    
    return waypoint_df

In [16]:
waypoints_by_trip = assemble_waypoints(subset)
waypoints_by_trip

Unnamed: 0,calitp_itp_id,route_id,trip_id,shape_id,waypoints
0,4,10,13277020,shp-10-10,"[(37.72521, -122.156367), (37.720677, -122.150..."
1,182,910-13153,10910002100513-DEC21,9100210_DEC21,"[(34.053943, -118.232454), (34.054622, -118.24..."


In [17]:
final = pd.merge(od, waypoints_by_trip, 
                 on = trip_group, how = "inner", validate = "1:1"
                )

final.head()

Unnamed: 0,calitp_itp_id,route_id,trip_id,shape_id,departure_time,destination,origin,waypoints
0,4,10,13277020,shp-10-10,07:08:00,"(37.670239, -122.08718)","(37.721266, -122.160201)","[(37.72521, -122.156367), (37.720677, -122.150..."
1,182,910-13153,10910002100513-DEC21,9100210_DEC21,06:02:00,"(33.869338, -118.287306)","(34.072191, -118.045132)","[(34.053943, -118.232454), (34.054622, -118.24..."


In [None]:
#https://www.geeksforgeeks.org/python-calculate-distance-duration-two-places-using-google-distance-matrix-api/

In [None]:
gmaps = googlemaps.Client(key=GOOGLE_API_KEY)

In [None]:
one_year_later = (datetime.now() + timedelta(weeks=52))

In [20]:
final

Unnamed: 0,calitp_itp_id,route_id,trip_id,shape_id,departure_time,destination,origin,waypoints
0,4,10,13277020,shp-10-10,07:08:00,"(37.670239, -122.08718)","(37.721266, -122.160201)","[(37.72521, -122.156367), (37.720677, -122.150..."
1,182,910-13153,10910002100513-DEC21,9100210_DEC21,06:02:00,"(33.869338, -118.287306)","(34.072191, -118.045132)","[(34.053943, -118.232454), (34.054622, -118.24..."


In [18]:
timedelta?

[0;31mInit signature:[0m [0mtimedelta[0m[0;34m([0m[0mself[0m[0;34m,[0m [0;34m/[0m[0;34m,[0m [0;34m*[0m[0margs[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Difference between two datetime values.

timedelta(days=0, seconds=0, microseconds=0, milliseconds=0, minutes=0, hours=0, weeks=0)

All arguments are optional and default to 0.
Arguments may be integers or floats, and may be positive or negative.
[0;31mFile:[0m           /opt/conda/lib/python3.9/datetime.py
[0;31mType:[0m           type
[0;31mSubclasses:[0m     _Timedelta


In [None]:
# maxsize=None, cache 

In [None]:
origin = final.origin.tolist()
dest = final.destination.tolist()
#departures = final.departure_time.tolist()
departures = [one_hr_from_now]
waypts = final.waypoints.tolist()

print(f"type of origin input: {type(origin)}")
print(f"type of first row: {type(origin[0])}")

for i, tup in enumerate(zip(origin, dest)):
    if i == 0:
        print(i, tup)
        print(tup[0])
        print(departures[i])
        print(waypts[i])
        print(type(waypts[i]))

In [None]:
# Just do 1 query
beginning = (37.721266, -122.160201)
ending = (37.670239, -122.08718)
departure1 = departures[0]
waypts1 = waypts[0]

In [None]:
# save as dictionary, save as pickle. request/response, save as pickle to be "cache"

In [None]:
gmaps.directions(beginning, ending, 
                 mode='driving', 
                 #departure_time=departure1, 
                 #waypoints=waypts1
                )

In [None]:
test_result = gmaps.directions(beginning, ending, 
                 mode='driving', 
                 #departure_time=departure1, 
                 #waypoints=waypts1
                )

In [None]:
my_dict = test_result[0]

In [None]:
for key in my_dict.keys():
    print(key)

In [None]:
my_dict["summary"]

In [None]:
my_dict["legs"]

In [None]:
journey = my_dict["legs"][0]

In [None]:
for key, value in journey.items():
    print(key)
    print(value)

In [None]:
# add 7 days in the future
# test a week, 6 weeks from now, google maps probs does day of week/time of day, not seasonal
# timedelta as number of weeks (integer), add 1, adjust that forward
# <25 waypoints, <10 waypoints, or else pay extra, 
# if you have more than 25, you have to break it into 2 requests

In [None]:
gmaps.directions(beginning, ending, 
                 mode='driving', 
                 departure_time=departure1, 
                 #waypoints=waypts1
                )

In [None]:
via = gmaps.directions(beginning, ending, 
                 mode='driving', 
                 departure_time=datetime.now() + timedelta(hours=4), 
                 waypoints=[f"via:{lat},{lon}" for lat, lon in waypts1]
                #waypoints=waypts1
                )
waypoints = gmaps.directions(beginning, ending, 
                 mode='driving', 
                 departure_time=datetime.now() + timedelta(hours=4), 
                 #waypoints=[f"via:{lat},{lon}" for lat, lon in waypts1]
                waypoints=waypts1
                )
# Try using via, not using stopovers, which will return duration_in_traffic


In [None]:
via[0].keys(), waypoints[0].keys()

In [None]:
via[0]["legs"][0]["duration_in_traffic"], via[0]["legs"][0]["duration"],sum(leg["duration"]["value"] for leg in waypoints[0]["legs"])



In [None]:
waypoints[0]["legs"][0]

In [None]:
set(type(leg["duration"]) for leg in waypoints[0]["legs"])


In [None]:
sum(leg["duration"] for leg in waypoints[0]["legs"])


In [None]:
via[0]["legs"][0].keys(), waypoints[0]["legs"][0].keys()

In [None]:
via2[0]["legs"][0]

In [None]:
departure1

In [None]:
tomorrow = datetime.now() + timedelta(days=1)

In [None]:
with_waypoints = gmaps.directions(beginning, ending, 
                 mode='driving', 
                 departure_time=departure1, 
                 waypoints=waypts1
                )

In [None]:
with_waypoints = gmaps.directions(beginning, ending, 
                 mode='driving', 
                 departure_time=tomorrow, 
                 #waypoints=waypts1
                )

In [None]:
gmaps.directions(beginning, ending, 
                 mode='driving', 
                 departure_time=datetime.now() + timedelta(hours=-1), 
                 #waypoints=waypts1
                )

In [None]:
w1 = with_waypoints[0]

In [None]:
#https://stackoverflow.com/questions/15380712/how-to-decode-polylines-from-google-maps-direction-api-in-php
def decode_polyline(polyline_str):
    index, lat, lng = 0, 0, 0
    coordinates = []
    changes = {'latitude': 0, 'longitude': 0}

    # Coordinates have variable length when encoded, so just keep
    # track of whether we've hit the end of the string. In each
    # while loop iteration, a single coordinate is decoded.
    while index < len(polyline_str):
        # Gather lat/lon changes, store them in a dictionary to apply them later
        for unit in ['latitude', 'longitude']: 
            shift, result = 0, 0

            while True:
                byte = ord(polyline_str[index]) - 63
                index+=1
                result |= (byte & 0x1f) << shift
                shift += 5
                if not byte >= 0x20:
                    break

            if (result & 1):
                changes[unit] = ~(result >> 1)
            else:
                changes[unit] = (result >> 1)

        lat += changes['latitude']
        lng += changes['longitude']

        coordinates.append((lat / 100000.0, lng / 100000.0))

    return coordinates

In [None]:
w1

In [None]:
step_by_step = w1['legs'][0]['steps']

In [None]:
step_by_step

In [None]:
polyline = w1["overview_polyline"]

In [None]:
polyline

In [None]:
#https://github.com/geodav-tech/decode-google-maps-polyline
polyline_decoded = decode_polyline(polyline["points"])

In [None]:
polyline_gdf = (pd.DataFrame(polyline_decoded, )
                .rename(columns = {0: "longitude", 1: "latitude"})
               )

polyline_gdf = gpd.GeoDataFrame(polyline_gdf, 
                                geometry=gpd.points_from_xy(polyline_gdf.longitude, 
                                                           polyline_gdf.latitude), 
                                crs="EPSG:4326")

In [None]:
polyline_gdf.columns

In [None]:
import shapely

polyline_gdf['group'] = 1

polyline_gdf2 = polyline_gdf.groupby("group")["geometry"].apply(
    lambda x: shapely.geometry.LineString(x.tolist())).reset_index()


In [None]:
polyline_gdf2.plot()

### Using distance_matrix - works, but API key now restricted to `directions`

* https://traveltime.com/blog/google-distance-matrix-api-traveltime-search-api - there's 25 requests limit for `distance_matrix`

In [None]:
#https://www.linkedin.com/pulse/calculating-distances-using-python-google-maps-r%C3%A9gis-nisengwe?articleId=6625061973447053312
'''
actual_duration = []

for i, od_tup in enumerate(zip(origin, dest)):
    result = gmaps.distance_matrix(od_tup[0], od_tup[1], mode='driving', departure_time=departures[i])["rows"][0]["elements"][0]["duration"]["value"]  
   
    result = result/3600
    actual_duration.append(result)
    
final["duration (Hours)"] = actual_duration
final
#final.to_parquet("test_od.parquet")
'''

In [None]:
#https://faun.pub/using-google-maps-distance-matrix-api-to-create-a-distance-table-93419c409d4f

### Estimate how many requests

In [None]:
unique_routes = df[["calitp_itp_id", "route_id"]].drop_duplicates()
num_routes = len(unique_routes)

print(f"# unique routes: {num_routes}")

print(f"1st pass + 25% of stops in subset: {num_routes + 0.25*len(subset)}")
print(f"1st pass + 50% of stops in subset: {num_routes + 0.5*len(subset)}")
print(f"1st pass + 75% of stops in subset: {num_routes + 0.75*len(subset)}")
print(f"Upper bound: do not get rid of any routes, take every 3rd stop: {len(subset)}")

In [None]:
full_df = catalog.parallel_trips_with_stops.read()

In [None]:
unique_routes = full_df[["calitp_itp_id", "route_id"]].drop_duplicates()
len(unique_routes)

In [None]:
full_df[["calitp_itp_id", "route_id"]].value_counts()

In [None]:
ful

In [None]:
keep_trips = [-7505741281882708052]
df[df.trip_key.isin(keep_trips)]

In [None]:
subset[subset.trip_key.isin(keep_trips)]

Don't like how `osmx` is returning the same nodes for bus stops, even at every 5th bus stop.

`osrm` doesn't install bc of some `GDAL` dependencies.

Can Google API be used? But need to check terms and conditions if we can make requests to calculate travel time or even grab speed limits through the
[Python package](https://github.com/googlemaps/google-maps-services-python)

At minimum, can calculate distance between stops, sum it up, and for cars, set an assumption of 30 mph or 45 mph. If we can't use Google API to grab speed limit, then we will hard code it.

In [None]:
def calculate_distance_traveled(df):
    group_cols = ["calitp_itp_id", "route_id"]
    sort_cols = group_cols + ["stop_sequence"]
    
    df = df.to_crs(shared_utils.geography_utils.CA_StatePlane)
    
    # Distance traveled
    df = df.assign(
        # Previous geometry
        start = (df.sort_values(sort_cols)
                 .groupby(group_cols)["geometry"]
                 .apply(lambda x: x.shift(1))),
        end = (df.sort_values(sort_cols)
               .groupby(group_cols)["geometry"]
               .apply(lambda x: x.shift(0))
              )
    )
    
    df = df.assign(
        feet_traveled = df.end.distance(df.start) 
    ).drop(columns = ["start", "end"])
        
    return df
            

In [None]:
df = calculate_distance_traveled(parallel)

In [None]:
def calculate_time_traveled(df):
    # Use a set of assumptions
    
    AVG_SPEED = 40
    
    df = df.assign(
        max_stop = (df.groupby(["itp_id", "route_id", "trip_id"])
                    ["stop_sequence"].transform("max"))
    )
    
    df2 = shared_utils.geography_utils.aggregate_by_geography(
        df,
        group_cols = ["itp_id", "route_id", "trip_id", 
                     "trip_first_departure_ts", "trip_last_arrival_ts"],
        sum_cols = ["feet_traveled"], 
        mean_cols = ["service_hours", "max_stop"]
    )
    
    df2 = df2.assign(
        miles_traveled = df2.feet_traveled.divide(
            shared_utils.geography_utils.FEET_PER_MI)
    
    )
    
    # speed = distance / time
    # time = distance / speed
    df2 = df2.assign(
        car_trip_time_hr = df2.miles_traveled.divide(AVG_SPEED),
    ).drop(columns = "feet_traveled")
        
    return df2

In [None]:
df2 = calculate_time_traveled(df)