In [33]:
#Import libraries
from pyhive import presto
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import h3
from shapely.geometry import Polygon, Point
import time
from datetime import datetime
from functools import reduce
import calendar
from haversine import haversine, Unit
from scipy.optimize import linear_sum_assignment
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 500)

In [34]:
#Define presto credentials
presto_host = 'presto.processing.yoda.run'
presto_port = '80'
username = 'aditya.bhattar@rapido.bike'

#Create connection to presto host
connection = presto.connect(presto_host,presto_port,username = username)

In [35]:
#Load the csv with hyderabad area1 hexes
hyderabad_zone1 = pd.read_csv('/Users/rapido/Desktop/batching_demand/hyderabad/hyderabad_location_data_level1_demand_zone1.csv')
hyderabad_zone1_hexes = list(hyderabad_zone1['customer_location_hex_8'].unique())
hyderabad_zone2 = pd.read_csv('/Users/rapido/Desktop/batching_demand/hyderabad/hyderabad_location_data_level1_demand_zone2.csv')
hyderabad_zone2_hexes = list(hyderabad_zone2['customer_location_hex_8'].unique())
print(hyderabad_zone1_hexes)
print(hyderabad_zone2_hexes)

['8860a24a61fffff', '8860a24a65fffff', '8860a24a67fffff', '8860a24a6dfffff', '8860a24b51fffff', '8860a25995fffff', '8860a25997fffff', '8860a259b1fffff', '8860a259b9fffff', '8860a259bbfffff', '8860a259bdfffff', '8860a24a29fffff', '8860a24a2dfffff', '8860a24a63fffff', '8860a24a69fffff', '8860a24a6bfffff', '8860a24b53fffff', '8860a24b55fffff', '8860a24b57fffff', '8860a24b59fffff', '8860a24b5bfffff', '8860a24b5dfffff', '8860a25983fffff', '8860a25987fffff', '8860a25991fffff', '8860a25993fffff', '8860a2599dfffff', '8860a259a3fffff', '8860a259abfffff', '8860a259b3fffff', '8860a259b5fffff', '8860a259b7fffff']
['8860a25903fffff', '8860a25915fffff', '8860a2591dfffff', '8860a25939fffff', '8860a25957fffff', '8860a25901fffff', '8860a25907fffff', '8860a2590bfffff', '8860a25911fffff', '8860a25917fffff', '8860a25919fffff', '8860a25931fffff', '8860a2593bfffff', '8860a25951fffff', '8860a25953fffff', '8860a25955fffff']


In [4]:
#Combine both lists
hyderabad_zone_hexes = hyderabad_zone1_hexes + hyderabad_zone2_hexes

In [37]:
#Query data from order_logs_snapshot

q = """
select order_id, accept_to_pickup_distance, captain_location_hex_8, captain_location_latitude,
captain_location_longitude, city_name, customer_location_hex_8, drop_location_hex_8, drop_location_latitude, drop_location_longitude, customer_location_latitude, 
customer_location_longitude, hhmmss, hour, quarter_hour, service_obj_service_name, time_bucket, weekday, yyyymmdd,
order_status, cancel_reason, updated_epoch, event_type, captain_id, spd_fraud_flag
from orders.order_logs_immutable
where yyyymmdd = '20221103'
and service_obj_service_name = 'Link'
and customer_location_hex_8 in ('8860a24a61fffff', '8860a24a65fffff', '8860a24a67fffff', '8860a24a6dfffff', '8860a24b51fffff', '8860a25995fffff',
 '8860a25997fffff', '8860a259b1fffff', '8860a259b9fffff', '8860a259bbfffff', '8860a259bdfffff', '8860a24a29fffff', '8860a24a2dfffff', '8860a24a63fffff',
  '8860a24a69fffff', '8860a24a6bfffff', '8860a24b53fffff', '8860a24b55fffff', '8860a24b57fffff', '8860a24b59fffff', '8860a24b5bfffff', '8860a24b5dfffff',
   '8860a25983fffff', '8860a25987fffff', '8860a25991fffff', '8860a25993fffff', '8860a2599dfffff', '8860a259a3fffff', '8860a259abfffff', '8860a259b3fffff',
    '8860a259b5fffff', '8860a259b7fffff', '8860a25903fffff', '8860a25915fffff', '8860a2591dfffff', '8860a25939fffff', '8860a25957fffff', '8860a25901fffff',
     '8860a25907fffff', '8860a2590bfffff', '8860a25911fffff', '8860a25917fffff', '8860a25919fffff', '8860a25931fffff', '8860a2593bfffff', '8860a25951fffff',
      '8860a25953fffff', '8860a25955fffff')
"""

#Load data into pandas table
df_copy = pd.read_sql(q, connection)

In [38]:
#Load a copy of the df
df = df_copy.copy()
df.head()

Unnamed: 0,order_id,accept_to_pickup_distance,captain_location_hex_8,captain_location_latitude,captain_location_longitude,city_name,customer_location_hex_8,drop_location_hex_8,drop_location_latitude,drop_location_longitude,customer_location_latitude,customer_location_longitude,hhmmss,hour,quarter_hour,service_obj_service_name,time_bucket,weekday,yyyymmdd,order_status,cancel_reason,updated_epoch,event_type,captain_id,spd_fraud_flag
0,6362ba1152c77607a2297e01,0.529,8860a259b9fffff,17.443369,78.391136,Hyderabad,8860a25995fffff,8860a259b9fffff,17.443369,78.391136,17.437603,78.382309,1225,0,0,Link,00:00,4,20221103,dropped,Wrong Pickup Location,1667415621119,rider_rated,61fa7690ed2eba60a67f6068,False
1,6362bb49affd8608e7faf071,,,,,Hyderabad,8860a25997fffff,8860a259b5fffff,17.456866,78.386731,17.434292,78.375549,1737,0,15,Link,00:00,4,20221103,requested,,1667414857977,order_requested,,
2,6362bd1552c77607a2297ffe,,8860a24a63fffff,17.446578,78.35509,Hyderabad,8860a24a63fffff,8860a24a61fffff,17.44543,78.359689,17.441141,78.356644,2517,0,15,Link,00:00,4,20221103,new,,1667415374569,rider_busy,62c9b1a63159e35d3605ae3b,
3,6362c29d52c77607a229832d,2.733,8860a25903fffff,17.439617,78.456336,Hyderabad,8860a25903fffff,8860a25965fffff,17.45564,78.488839,17.439617,78.456336,4853,0,45,Link,00:59,4,20221103,new,Taking longer than expected,1667416754786,order_updated,616d26574c6ba1799bd85cd9,
4,6362b96e8143791dd512b47b,0.439,8860a259bdfffff,17.450441,78.391747,Hyderabad,8860a259bdfffff,8860a259bdfffff,17.450441,78.391747,17.450525,78.392166,942,0,0,Link,00:00,4,20221103,dropped,Asked to pay extra,1667415619615,order_updated,61c18c413beb3e5813c936c8,False


In [39]:
df['hour'].value_counts()

09    31357
08    22344
07     7783
06     3902
00     2991
05     1968
01     1590
02     1259
04     1248
03     1229
Name: hour, dtype: int64

In [8]:
#Make columns for second and minute and date
df['second'] = df['hhmmss'].apply(lambda x: x[4:])
df['minute'] = df['hhmmss'].apply(lambda x: x[2:4])
df['date'] = df['yyyymmdd'].apply(lambda x: x[6:])
df['weekday_name'] = df['weekday'].apply(lambda x: calendar.day_name[(x-1)])

#Create a column to get the time of the day
df['time_period'] = np.where(df['hour'].isin(['08', '09', '10', '11']), 'morning_peak', np.where(df['hour'].isin(['17', '18', '19', '20', '21']), 'evening_peak',
    np.where(df['hour'].isin(['12, 13, 14, 15, 16']), 'afternoon', np.where(df['hour'].isin(['00', '01', '02', '03', '04', '05', '06', '07']),
     'rest_morning', 'rest_evening'))))

#Make a column to combine cust_lat_long and cap_lat_long
df['cust_lat_long'] = df[['customer_location_latitude', 'customer_location_longitude']].values.tolist()
df['cap_lat_long'] = df[['captain_location_latitude', 'captain_location_longitude']].values.tolist()

In [9]:
#Create 20-sec batches
df['second'] = df['second'].astype(float)
df['minute'] = df['minute'].astype(float)
df['20_sec_batch'] = pd.cut(df['second'], bins = [-1, 20, 40, 61], labels = ['0-20', '20-40', '40-60'])
df['count'] = 1
df.head()

Unnamed: 0,order_id,accept_to_pickup_distance,captain_location_hex_8,captain_location_latitude,captain_location_longitude,city_name,customer_location_hex_8,drop_location_hex_8,drop_location_latitude,drop_location_longitude,customer_location_latitude,customer_location_longitude,hhmmss,hour,quarter_hour,service_obj_service_name,time_bucket,weekday,yyyymmdd,order_status,cancel_reason,updated_epoch,event_type,captain_id,spd_fraud_flag,second,minute,date,weekday_name,time_period,cust_lat_long,cap_lat_long,20_sec_batch,count
0,63481b3280bdba25b5bee82d,0.695,8860a25915fffff,17.434954,78.445152,Hyderabad,8860a25915fffff,8860a25915fffff,17.435685,78.444598,17.434954,78.445152,193538,19,1930,Link,19:59,4,20221013,new,Asked to cancel & take offline ride,1665670713528,order_updated,603a3dc735f8d486ddd50e09,,38.0,35.0,13,Thursday,evening_peak,"[17.434953689575195, 78.44515228271484]","[17.434953689575195, 78.44515228271484]",20-40,1
1,634814af03d82e099a9d60e8,2.533,8860a25955fffff,17.423778,78.456983,Hyderabad,8860a2590bfffff,8860a259e9fffff,17.43405,78.426835,17.425371,78.457306,190751,19,1900,Link,19:00,4,20221013,new,Heavy Traffic,1665668299479,rider_cancelled,5d7b36f5a24c96105e4abd0a,,51.0,7.0,13,Thursday,evening_peak,"[17.425371170043945, 78.45730590820312]","[17.4237781, 78.4569829]",40-60,1
2,634814d8b82ff53f57e4c3b5,1.303,8860a24a65fffff,17.448734,78.364761,Hyderabad,8860a25995fffff,8860a24a65fffff,17.448734,78.364761,17.434431,78.382133,190832,19,1900,Link,19:00,4,20221013,dropped,,1665669662114,rider_rated,62e94bd9a00164e7bbd646ac,False,32.0,8.0,13,Thursday,evening_peak,"[17.434431076049805, 78.38213348388672]","[17.448734283447266, 78.36476135253906]",20-40,1
3,63481ba906757d0c08786ddf,0.291,8860a25987fffff,17.439788,78.39599,Hyderabad,8860a25987fffff,8860a259a9fffff,17.44756,78.409639,17.440163,78.395866,193737,19,1930,Link,19:59,4,20221013,started,,1665670323966,started,5c359ca04a267149c76a49f9,,37.0,37.0,13,Thursday,evening_peak,"[17.440162658691406, 78.39586639404297]","[17.4397875, 78.3959898]",20-40,1
4,63481503b82ff53f57e4c3ff,,8860a259b9fffff,17.443739,78.387792,Hyderabad,8860a25995fffff,8860a25987fffff,17.440281,78.395312,17.434612,78.38649,190915,19,1900,Link,19:00,4,20221013,new,,1665668598154,rider_busy,5fa68698f1caf9e6ed4db49e,,15.0,9.0,13,Thursday,evening_peak,"[17.434612274169922, 78.38648986816406]","[17.443739369353857, 78.38779181241989]",0-20,1


In [10]:
#Filter the df on pinged captains
df_filter = df[(df['event_type'] == 'accepted') | (df['event_type'] == 'rider_reject') | (df['event_type'] == 'rider_busy')]

#Create a column to identify the zone
df_filter['zone_name'] = np.where(df_filter['customer_location_hex_8'].isin(hyderabad_zone1_hexes), 'zone1', 'zone2')

#Convert epoch time to timestamp
df_filter['timestamp'] = df_filter['updated_epoch'].apply(lambda x: datetime.utcfromtimestamp(x/1000))

#Sort the df by order_id and timestamp to get the first captain pinged for each order
df_filter = df_filter.sort_values(by = ['order_id', 'timestamp'], ascending=True)

#Keep only the minimum timestamp for each order it
df_min_timestamp = df_filter.groupby(['order_id'])['timestamp'].min().reset_index()
df_min_timestamp['min_time'] = 'Yes'
df_filter = df_filter.merge(df_min_timestamp, on = ['order_id', 'timestamp'], how = 'left')
df_filter['min_time'] = df_filter['min_time'].fillna('No')
df_old_FM = df_filter[df_filter['min_time'] == 'Yes']

In [11]:
#Query data from driving_distance_and_time
q_dist = """
select * from experiments.driving_distance_and_time
where city_name in ('Hyderabad')
and hex_resolution = 8
and day_name in ('Thursday', 'Friday')
and time_period in ('evening_peak')
"""

#Load the data into pandas table
df_distance_copy = pd.read_sql(q_dist, connection)

In [12]:
#View the distance dataset
df_distance = df_distance_copy.copy()
df_distance.head()

Unnamed: 0,day_name,destination_hex,distance_in_km,hex_resolution,num_observations,ridetime_in_minutes,run_date,source_hex,time_period,city_name
0,Friday,8860a25a01fffff,4.867,8,239,15.348962,20221101,8860a25b53fffff,evening_peak,Hyderabad
1,Thursday,8860b1964dfffff,4.7195,8,43,14.673308,20221101,8860b52cbbfffff,evening_peak,Hyderabad
2,Thursday,8860b52eabfffff,6.196,8,35,15.099967,20221101,8860b52cc7fffff,evening_peak,Hyderabad
3,Friday,8860b52eb9fffff,7.1056,8,48,19.125,20221101,8860b52ecdfffff,evening_peak,Hyderabad
4,Thursday,8860a25b0dfffff,6.5123,8,349,22.509042,20221101,8860a25861fffff,evening_peak,Hyderabad


In [13]:
#Filter df_distance only on columns and hexes required
df_distance_req = df_distance[['day_name', 'destination_hex', 'distance_in_km', 'source_hex', 'time_period']]

df_distance_req = df_distance_req[df_distance_req['destination_hex'].isin(hyderabad_zone_hexes)]
df_distance_req['zone_name'] = np.where(df_distance_req['destination_hex'].isin(hyderabad_zone1_hexes), 'zone1', 'zone2')
df_distance_req_zone1 = df_distance_req[df_distance_req['zone_name'] == 'zone1']
df_distance_req_zone2 = df_distance_req[df_distance_req['zone_name'] == 'zone2']

In [14]:
#Create a column to combine customer anc captain hex
df_old_FM['captain_customer_hex'] = df_old_FM['captain_location_hex_8'] + df_old_FM['customer_location_hex_8']

#Rename columns in df_distance to match the column name in df_old_FM
df_distance = df_distance.rename(columns={'day_name':'weekday_name'})
df_distance['captain_customer_hex'] = df_distance['source_hex'] + df_distance['destination_hex']

#Create a time_period column in the df
df_old_FM['time_period'] = np.where(df_old_FM['hour'].isin(['08', '09', '10', '11']), 'morning_peak',
 np.where(df_old_FM['hour'].isin(['17', '18', '19', '20', '21']), 'evening_peak',
 np.where(df_old_FM['hour'].isin(['12, 13, 14, 15, 16']), 'afternoon',
 np.where(df_old_FM['hour'].isin(['00', '01', '02', '03', '04', '05', '06', '07']), 'rest_morning', 'rest_evening'))))

#Merge the dfs to get the distance
df_old_FM = df_old_FM.merge(df_distance[['captain_customer_hex', 'time_period', 'weekday_name', 'distance_in_km']],
 on = ['captain_customer_hex', 'time_period', 'weekday_name'], how = 'left')
df_old_FM['distance_in_km'] = df_old_FM['distance_in_km'].fillna('NA')
df_old_FM.head()

Unnamed: 0,order_id,accept_to_pickup_distance,captain_location_hex_8,captain_location_latitude,captain_location_longitude,city_name,customer_location_hex_8,drop_location_hex_8,drop_location_latitude,drop_location_longitude,customer_location_latitude,customer_location_longitude,hhmmss,hour,quarter_hour,service_obj_service_name,time_bucket,weekday,yyyymmdd,order_status,cancel_reason,updated_epoch,event_type,captain_id,spd_fraud_flag,second,minute,date,weekday_name,time_period,cust_lat_long,cap_lat_long,20_sec_batch,count,zone_name,timestamp,min_time,captain_customer_hex,distance_in_km
0,632c6358853c020ac79c52b3,,8860a25983fffff,17.440459,78.39182,Hyderabad,8860a259b9fffff,8860a24a67fffff,17.446842,78.358806,17.440592,78.390579,190000,19,1900,Link,19:00,4,20220922,new,,1663853433221,rider_busy,61bb40a51bd8ae4298b0a001,,0.0,0.0,22,Thursday,evening_peak,"[17.44059181213379, 78.39057922363281]","[17.440459440173676, 78.39182049036026]",0-20,1,zone1,2022-09-22 13:30:33.221,Yes,8860a25983fffff8860a259b9fffff,1.45415
1,632c6358fb0939303ec2197e,,8860a259bbfffff,17.44122,78.37749,Hyderabad,8860a259bbfffff,8860a24a67fffff,17.450502,78.360756,17.442455,78.377113,190000,19,1900,Link,19:00,4,20220922,new,,1663853418063,rider_reject,622a09c1abd7a60c0d7cea0f,,0.0,0.0,22,Thursday,evening_peak,"[17.442455291748047, 78.37711334228516]","[17.44122, 78.3774899]",0-20,1,zone1,2022-09-22 13:30:18.063,Yes,8860a259bbfffff8860a259bbfffff,0.9339
2,632c635975037c499a37e32b,,8860a259b9fffff,17.445556,78.391166,Hyderabad,8860a259b9fffff,8860a25995fffff,17.438841,78.387243,17.446436,78.391006,190001,19,1900,Link,19:00,4,20220922,onTheWay,,1663853415539,accepted,5fe5fc0e4319a75193b5e879,,1.0,0.0,22,Thursday,evening_peak,"[17.446435928344727, 78.39100646972656]","[17.445556, 78.3911665]",0-20,1,zone1,2022-09-22 13:30:15.539,Yes,8860a259b9fffff8860a259b9fffff,1.47325
3,632c6359a7e427527b099b23,,8860a25915fffff,17.435429,78.444204,Hyderabad,8860a25915fffff,8860a25939fffff,17.442631,78.446161,17.436409,78.444016,190001,19,1900,Link,19:00,4,20220922,onTheWay,,1663853426837,accepted,6138c1fe8709d2b536976892,,1.0,0.0,22,Thursday,evening_peak,"[17.43640899658203, 78.44401550292969]","[17.4354285, 78.4442041]",0-20,1,zone2,2022-09-22 13:30:26.837,Yes,8860a25915fffff8860a25915fffff,1.445675
4,632c635a49a7d319a06dff21,,8860a24a6dfffff,17.442518,78.369961,Hyderabad,8860a24a6dfffff,8860a24b51fffff,17.465565,78.367022,17.441399,78.370155,190002,19,1900,Link,19:00,4,20220922,onTheWay,,1663853411934,accepted,61bc2393907accd595c41110,,2.0,0.0,22,Thursday,evening_peak,"[17.44139862060547, 78.37015533447266]","[17.4425177704524, 78.36996115744114]",0-20,1,zone1,2022-09-22 13:30:11.934,Yes,8860a24a6dfffff8860a24a6dfffff,1.178


In [15]:
#Calculate the haversine distance
cap_lat_long = df_old_FM['cap_lat_long'].values.tolist()
cust_lat_long = df_old_FM['cust_lat_long'].values.tolist()

distances = []
for x in range(0, len(cap_lat_long)):
    individual_distance = round(haversine(cap_lat_long[x], cust_lat_long[x]), 3)
    distances.append(individual_distance)

df_old_FM['haversine_distance'] = distances
df_old_FM['old_FM'] = np.where(df_old_FM['distance_in_km'] == 'NA', df_old_FM['haversine_distance'], df_old_FM['distance_in_km'])

#Get the average FM by date and hour
df_old_FM_average = df_old_FM.groupby(['zone_name', 'yyyymmdd', 'hour'])['old_FM'].median().reset_index()
df_old_FM_average.head(10)

Unnamed: 0,zone_name,yyyymmdd,hour,old_FM
0,zone1,20220922,19,1.47325
1,zone1,20220922,20,1.47325
2,zone1,20220923,19,1.4976
3,zone1,20220923,20,1.4975
4,zone1,20220929,19,1.856612
5,zone1,20220929,20,1.66475
6,zone1,20220930,19,1.72315
7,zone1,20220930,20,1.72315
8,zone1,20221006,19,2.1995
9,zone1,20221006,20,2.237


In [16]:
#Calculate the FM as per batching

#Create seperate dfs for zone1 and zone2
df_filter_zone1 = df_filter[df_filter['zone_name'] == 'zone1']
df_filter_zone2 = df_filter[df_filter['zone_name'] == 'zone2']

In [17]:
df_filter_zone1['date'].unique()

array(['22', '23', '29', '30', '06', '07', '13', '14'], dtype=object)

In [19]:
df_sample = df_filter_zone1[df_filter_zone1['date'] == '22']
df_sample = df_sample[df_sample['hour'] == '19']
df_distance_day = df_distance_req_zone1[df_distance_req_zone1['day_name'] == 'Thursday']
captains_available = df_sample.groupby(['hour', 'minute', '20_sec_batch'])['captain_location_hex_8'].apply(list).reset_index()
captains_available2 = df_sample.groupby(['hour', 'minute', '20_sec_batch'])['cap_lat_long'].apply(list).reset_index()
captains_available = captains_available.rename(columns = {'captain_location_hex_8':'cap_location_hexes'})
captains_available2 = captains_available2.rename(columns = {'cap_lat_long':'cap_location_lat_long'})

captains_available = captains_available.merge(captains_available2, on = ['hour', 'minute', '20_sec_batch'], how = 'left')

#Merge the captain locations with our df
df_sample = df_sample.merge(captains_available, on = ['hour', 'minute', '20_sec_batch'], how = 'left')
#Combine the source and destination hex and make a dictionary to get distance between each hex combo
df_distance_day['source_destination'] = df_distance_day['source_hex'] + df_distance_day['destination_hex']
distance_dict = dict(zip(df_distance_day['source_destination'], df_distance_day['distance_in_km']))

#Get the distance between customer and each captain available
customer_hexes = df_sample['customer_location_hex_8'].values.tolist()
captain_hexes = df_sample['cap_location_hexes'].values.tolist()
cap_lat_long = df_sample['cap_location_lat_long'].values.tolist()
cust_lat_long = df_sample['cust_lat_long'].values.tolist()

i = 0
captain_distances = []
for x in captain_hexes:
    customer_distance = []
    for hex in range(0, len(x)):
        key_hex = x[hex] + customer_hexes[i]
        if key_hex in distance_dict:
            individual_distance = distance_dict[key_hex]
        else:
            individual_distance = round(haversine(cap_lat_long[i][hex], cust_lat_long[i]), 3)
        customer_distance.append(individual_distance)
    captain_distances.append(customer_distance)
    i += 1

#Add columns for captain distances and etas to each customer
df_sample['captain_distances'] = captain_distances
df_sample.head()

Unnamed: 0,order_id,accept_to_pickup_distance,captain_location_hex_8,captain_location_latitude,captain_location_longitude,city_name,customer_location_hex_8,drop_location_hex_8,drop_location_latitude,drop_location_longitude,customer_location_latitude,customer_location_longitude,hhmmss,hour,quarter_hour,service_obj_service_name,time_bucket,weekday,yyyymmdd,order_status,cancel_reason,updated_epoch,event_type,captain_id,spd_fraud_flag,second,minute,date,weekday_name,time_period,cust_lat_long,cap_lat_long,20_sec_batch,count,zone_name,timestamp,min_time,cap_location_hexes,cap_location_lat_long,captain_distances
0,632c6358853c020ac79c52b3,,8860a25983fffff,17.440459,78.39182,Hyderabad,8860a259b9fffff,8860a24a67fffff,17.446842,78.358806,17.440592,78.390579,190000,19,1900,Link,19:00,4,20220922,new,,1663853433221,rider_busy,61bb40a51bd8ae4298b0a001,,0.0,0.0,22,Thursday,evening_peak,"[17.44059181213379, 78.39057922363281]","[17.440459440173676, 78.39182049036026]",0-20,1,zone1,2022-09-22 13:30:33.221,Yes,"[8860a25983fffff, 8860a25987fffff, 8860a259b9f...","[[17.440459440173676, 78.39182049036026], [17....","[1.4541499999999998, 1.98191875, 1.47324999999..."
1,632c6358853c020ac79c52b3,,8860a25987fffff,17.442689,78.392619,Hyderabad,8860a259b9fffff,8860a24a67fffff,17.446842,78.358806,17.440592,78.390579,190000,19,1900,Link,19:00,4,20220922,new,,1663853442427,rider_reject,61d541f9ac13993022b43e5f,,0.0,0.0,22,Thursday,evening_peak,"[17.44059181213379, 78.39057922363281]","[17.4426893, 78.3926188]",0-20,1,zone1,2022-09-22 13:30:42.427,No,"[8860a25983fffff, 8860a25987fffff, 8860a259b9f...","[[17.440459440173676, 78.39182049036026], [17....","[1.4541499999999998, 1.98191875, 1.47324999999..."
2,632c6358853c020ac79c52b3,,8860a259b9fffff,17.441377,78.390454,Hyderabad,8860a259b9fffff,8860a24a67fffff,17.446842,78.358806,17.440592,78.390579,190000,19,1900,Link,19:00,4,20220922,new,,1663853449463,rider_reject,602f3e65aa7697b969b87b2f,,0.0,0.0,22,Thursday,evening_peak,"[17.44059181213379, 78.39057922363281]","[17.4413768, 78.3904543]",0-20,1,zone1,2022-09-22 13:30:49.463,No,"[8860a25983fffff, 8860a25987fffff, 8860a259b9f...","[[17.440459440173676, 78.39182049036026], [17....","[1.4541499999999998, 1.98191875, 1.47324999999..."
3,632c6358853c020ac79c52b3,,8860a25987fffff,17.442185,78.392642,Hyderabad,8860a259b9fffff,8860a24a67fffff,17.446842,78.358806,17.440592,78.390579,190000,19,1900,Link,19:00,4,20220922,onTheWay,,1663853455397,accepted,5b5408d27be3fd0fcd84056a,,0.0,0.0,22,Thursday,evening_peak,"[17.44059181213379, 78.39057922363281]","[17.4421846, 78.392642]",0-20,1,zone1,2022-09-22 13:30:55.397,No,"[8860a25983fffff, 8860a25987fffff, 8860a259b9f...","[[17.440459440173676, 78.39182049036026], [17....","[1.4541499999999998, 1.98191875, 1.47324999999..."
4,632c6358853c020ac79c52b3,2.385,8860a259b9fffff,17.440984,78.391127,Hyderabad,8860a259b9fffff,8860a24a67fffff,17.446842,78.358806,17.440592,78.390579,190000,19,1900,Link,19:00,4,20220922,onTheWay,Long Pickup,1663855146585,accepted,629c6b231fdbdecc5de12fa4,,0.0,0.0,22,Thursday,evening_peak,"[17.44059181213379, 78.39057922363281]","[17.4409839, 78.3911273]",0-20,1,zone1,2022-09-22 13:59:06.585,No,"[8860a25983fffff, 8860a25987fffff, 8860a259b9f...","[[17.440459440173676, 78.39182049036026], [17....","[1.4541499999999998, 1.98191875, 1.47324999999..."


In [24]:
df_sample['captain_max'] = df_sample['captain_distances'].apply(lambda x: np.mean(x))
df_sample['captain_max'].mean()

3.1862509004475434

In [32]:
df_batches = df_sample.groupby(['minute', '20_sec_batch'])['captain_distances'].apply(list).reset_index()
df_batches = df_batches.dropna()
df_batches['shape_0'] = df_batches['captain_distances'].apply(lambda x: np.shape(x)[0])
df_batches['shape_1'] = df_batches['captain_distances'].apply(lambda x: np.shape(x)[1])
df_batches['diff'] = df_batches['shape_0'] - df_batches['shape_1']
df_batches['diff'].sum()

0

In [25]:
#Filter the df by hour to get hourly reduction in fm
hours_zone1 = {}
dates_zone1 = {}
for date in df_filter_zone1['date'].unique().tolist():
    df_date = df_filter_zone1[df_filter_zone1['date'] == date]
    for hour in df_date['hour'].unique().tolist():
        df_hourly = df_date[df_date['hour'] == hour]
        df_distance_day = df_distance_req_zone1[df_distance_req_zone1['day_name'] == df_hourly['weekday_name'].values[0]]

        #Get the captains available for every 20_sec_batch
        captains_available = df_hourly.groupby(['hour', 'minute', '20_sec_batch'])['captain_location_hex_8'].apply(list).reset_index()
        captains_available2 = df_hourly.groupby(['hour', 'minute', '20_sec_batch'])['cap_lat_long'].apply(list).reset_index()
        captains_available = captains_available.rename(columns = {'captain_location_hex_8':'cap_location_hexes'})
        captains_available2 = captains_available2.rename(columns = {'cap_lat_long':'cap_location_lat_long'})
        #captains_available['number_of_captains'] = captains_available['cap_location_hexes'].apply(lambda x: len(x))
        captains_available = captains_available.merge(captains_available2, on = ['hour', 'minute', '20_sec_batch'], how = 'left')

        #Merge the captain locations with our df
        df_hourly = df_hourly.merge(captains_available, on = ['hour', 'minute', '20_sec_batch'], how = 'left')
        
        #Filter the distance df based on the hour
        if hour in ['08', '09', '10', '11']:
            df_distance_filter = df_distance_day[df_distance_day['time_period'] == 'morning_peak']
        elif hour in ['17', '18', '19', '20', '21']:
            df_distance_filter = df_distance_day[df_distance_day['time_period'] == 'evening_peak']
        elif hour in ['12, 13, 14, 15, 16']:
            df_distance_filter = df_distance_day[df_distance_day['time_period'] == 'afternoon']
        elif hour in ['00', '01', '02', '03', '04', '05', '06', '07']:
            df_distance_filter = df_distance_day[df_distance_day['time_period'] == 'rest_morning']
        else:
            df_distance_filter = df_distance_day[df_distance_day['time_period'] == 'rest_evening']

        #Combine the source and destination hex and make a dictionary to get distance between each hex combo
        df_distance_filter['source_destination'] = df_distance_filter['source_hex'] + df_distance_filter['destination_hex']
        distance_dict = dict(zip(df_distance_filter['source_destination'], df_distance_filter['distance_in_km']))

        #Get the distance between customer and each captain available
        customer_hexes = df_hourly['customer_location_hex_8'].values.tolist()
        captain_hexes = df_hourly['cap_location_hexes'].values.tolist()
        cap_lat_long = df_hourly['cap_location_lat_long'].values.tolist()
        cust_lat_long = df_hourly['cust_lat_long'].values.tolist()

        i = 0
        captain_distances = []
        for x in captain_hexes:
            customer_distance = []
            for hex in range(0, len(x)):
                key_hex = x[hex] + customer_hexes[i]
                if key_hex in distance_dict:
                    individual_distance = distance_dict[key_hex]
                else:
                    individual_distance = round(haversine(cap_lat_long[i][hex], cust_lat_long[i]), 3)
                customer_distance.append(individual_distance)
            captain_distances.append(customer_distance)
            i += 1

        #Add columns for captain distances and etas to each customer
        df_hourly['captain_distances'] = captain_distances

        #Get the nxn matrix for each 20_sec_batch
        df_batches = df_hourly.groupby(['minute', '20_sec_batch'])['captain_distances'].apply(list).reset_index()
        df_batches = df_batches.dropna()

        #Get the row_index and column index suitable for each customer
        optimal_distance = []
        for x in df_batches['captain_distances'].values.tolist():
            try:
                row_ind, col_ind = linear_sum_assignment(x)
            except Exception as e:
                continue
            optimal_distance.append(col_ind)

        #Get the optimal distance for each customer
        captain_customer_dist = df_batches['captain_distances'].values.tolist()
        i = 0
        min_distances = []
        for x in captain_customer_dist:
            j = 0
            for y in x:
                if type(y) == float:
                    continue
                min_distance = y[optimal_distance[i][j]]
                j += 1
                min_distances.append(min_distance)
            i += 1

        #Add the new_dist to the df
        df_hourly['new_FM'] = min_distances

        #Get the mean FM for each 20_sec_batch (old and new)
        fm_min = df_hourly.groupby(['minute', '20_sec_batch'])['new_FM'].median().reset_index()
        fm_min_value = fm_min['new_FM'].median()
        hours_zone1[hour] = fm_min_value
    dates_zone1[date] = hours_zone1
    print('done for date {}.'.format(date))

done for date 22.
done for date 23.
done for date 29.
done for date 30.
done for date 06.
done for date 07.
done for date 13.
done for date 14.


In [57]:
#Filter the df by hour to get hourly reduction in fm
hours_zone2 = {}
dates_zone2 = {}
for date in df_filter_zone2['date'].unique().tolist():
    df_date = df_filter_zone2[df_filter_zone2['date'] == date]
    for hour in df_date['hour'].unique().tolist():
        df_hourly = df_date[df_date['hour'] == hour]
        df_distance_day = df_distance_req_zone2[df_distance_req_zone2['day_name'] == df_hourly['weekday_name'].values[0]]

        #Get the captains available for every 20_sec_batch
        captains_available = df_hourly.groupby(['hour', 'minute', '20_sec_batch'])['captain_location_hex_8'].apply(list).reset_index()
        captains_available2 = df_hourly.groupby(['hour', 'minute', '20_sec_batch'])['cap_lat_long'].apply(list).reset_index()
        captains_available = captains_available.rename(columns = {'captain_location_hex_8':'cap_location_hexes'})
        captains_available2 = captains_available2.rename(columns = {'cap_lat_long':'cap_location_lat_long'})
        #captains_available['number_of_captains'] = captains_available['cap_location_hexes'].apply(lambda x: len(x))
        captains_available = captains_available.merge(captains_available2, on = ['hour', 'minute', '20_sec_batch'], how = 'left')

        #Merge the captain locations with our df
        df_hourly = df_hourly.merge(captains_available, on = ['hour', 'minute', '20_sec_batch'], how = 'left')
        
        #Filter the distance df based on the hour
        if hour in ['08', '09', '10', '11']:
            df_distance_filter = df_distance_day[df_distance_day['time_period'] == 'morning_peak']
        elif hour in ['17', '18', '19', '20', '21']:
            df_distance_filter = df_distance_day[df_distance_day['time_period'] == 'evening_peak']
        elif hour in ['12, 13, 14, 15, 16']:
            df_distance_filter = df_distance_day[df_distance_day['time_period'] == 'afternoon']
        elif hour in ['00', '01', '02', '03', '04', '05', '06', '07']:
            df_distance_filter = df_distance_day[df_distance_day['time_period'] == 'rest_morning']
        else:
            df_distance_filter = df_distance_day[df_distance_day['time_period'] == 'rest_evening']

        #Combine the source and destination hex and make a dictionary to get distance between each hex combo
        df_distance_filter['source_destination'] = df_distance_filter['source_hex'] + df_distance_filter['destination_hex']
        distance_dict = dict(zip(df_distance_filter['source_destination'], df_distance_filter['distance_in_km']))

        #Get the distance between customer and each captain available
        customer_hexes = df_hourly['customer_location_hex_8'].values.tolist()
        captain_hexes = df_hourly['cap_location_hexes'].values.tolist()
        cap_lat_long = df_hourly['cap_location_lat_long'].values.tolist()
        cust_lat_long = df_hourly['cust_lat_long'].values.tolist()

        i = 0
        captain_distances = []
        for x in captain_hexes:
            customer_distance = []
            for hex in range(0, len(x)):
                key_hex = x[hex] + customer_hexes[i]
                if key_hex in distance_dict:
                    individual_distance = distance_dict[key_hex]
                else:
                    individual_distance = round(haversine(cap_lat_long[i][hex], cust_lat_long[i]), 3)
                customer_distance.append(individual_distance)
            captain_distances.append(customer_distance)
            i += 1

        #Add columns for captain distances and etas to each customer
        df_hourly['captain_distances'] = captain_distances

        #Get the nxn matrix for each 20_sec_batch
        df_batches = df_hourly.groupby(['minute', '20_sec_batch'])['captain_distances'].apply(list).reset_index()
        df_batches = df_batches.dropna()

        #Get the row_index and column index suitable for each customer
        optimal_distance = []
        for x in df_batches['captain_distances'].values.tolist():
            try:
                row_ind, col_ind = linear_sum_assignment(x)
            except Exception as e:
                continue
            optimal_distance.append(col_ind)

        #Get the optimal distance for each customer
        captain_customer_dist = df_batches['captain_distances'].values.tolist()
        i = 0
        min_distances = []
        for x in captain_customer_dist:
            j = 0
            for y in x:
                if type(y) == float:
                    continue
                min_distance = y[optimal_distance[i][j]]
                j += 1
                min_distances.append(min_distance)
            i += 1

        #Add the new_dist to the df
        df_hourly['new_FM'] = min_distances

        #Get the mean FM for each 20_sec_batch (old and new)
        fm_min = df_hourly.groupby(['minute', '20_sec_batch'])['new_FM'].median().reset_index()
        fm_min_value = fm_min['new_FM'].median()
        hours_zone2[hour] = fm_min_value
    dates_zone2[date] = hours_zone2
    print('done for date {}.'.format(date))

done for date 19.
done for date 20.
done for date 21.
done for date 22.
done for date 23.
done for date 24.
done for date 25.
done for date 26.
done for date 27.
done for date 28.
done for date 29.
done for date 30.
done for date 01.
done for date 02.
done for date 03.
done for date 04.
done for date 05.
done for date 06.
done for date 07.
done for date 08.
done for date 09.
done for date 10.
done for date 11.
done for date 12.
done for date 13.
done for date 14.
done for date 15.
done for date 16.


In [58]:
#Create a df from the dates dict
new_FM_zone1 = pd.DataFrame(dates_zone1).T
new_FM_zone1 = new_FM_zone1.reset_index()
new_FM_zone1 = pd.melt(new_FM_zone1, id_vars='index')
new_FM_zone1.columns = ['date', 'hour', 'new_FM']
new_FM_zone1 = new_FM_zone1.sort_values(by = ['date', 'hour'], ascending=True)
new_FM_zone1['zone_name'] = 'zone1'

new_FM_zone2 = pd.DataFrame(dates_zone2).T
new_FM_zone2 = new_FM_zone2.reset_index()
new_FM_zone2 = pd.melt(new_FM_zone2, id_vars='index')
new_FM_zone2.columns = ['date', 'hour', 'new_FM']
new_FM_zone2 = new_FM_zone2.sort_values(by = ['date', 'hour'], ascending=True)
new_FM_zone2['zone_name'] = 'zone2'

df_new_FM = pd.concat([new_FM_zone1, new_FM_zone2], axis=0)

#Merge old and new_FM
df_old_FM_average['date'] = df_old_FM_average['yyyymmdd'].apply(lambda x: x[6:])
hourly_decrease = df_old_FM_average.merge(df_new_FM, on = ['zone_name', 'date', 'hour'], how = 'left')
hourly_decrease['change_in_FM_%'] = round((hourly_decrease['new_FM']/hourly_decrease['old_FM'] - 1), 4)
hourly_decrease

Unnamed: 0,zone_name,yyyymmdd,hour,old_FM,date,new_FM,change_in_FM_%
0,zone1,20220919,00,1.52350,19,1.434669,-0.0583
1,zone1,20220919,01,1.64550,19,1.730000,0.0514
2,zone1,20220919,02,1.64550,19,1.976544,0.2012
3,zone1,20220919,03,1.71800,19,1.941250,0.1299
4,zone1,20220919,04,1.75450,19,1.997875,0.1387
...,...,...,...,...,...,...,...
1339,zone2,20221016,19,1.66475,16,1.779600,0.0690
1340,zone2,20221016,20,1.65375,16,1.734563,0.0489
1341,zone2,20221016,21,1.66050,16,1.752525,0.0554
1342,zone2,20221016,22,1.73400,16,1.814500,0.0464


In [59]:
hourly_decrease.groupby(['zone_name'])['change_in_FM_%'].median()

zone_name
zone1    0.07380
zone2    0.06425
Name: change_in_FM_%, dtype: float64

In [60]:
hourly_decrease.to_csv('change_in_FM_Batching.csv', index = False)