In [1]:
#Import libraries
from pyhive import presto
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import h3
from shapely.geometry import Polygon, Point
from haversine import haversine, Unit
from scipy.optimize import linear_sum_assignment
from datetime import datetime
import calendar
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 500)

In [2]:
#Define presto credentials
presto_host = 'presto.processing.yoda.run'
presto_port = '80'
username = 'aditya.bhattar@rapido.bike'

#Create connection to presto host
connection = presto.connect(presto_host,presto_port,username = username)

In [4]:
#Query data from order_logs_snapshot

q = """
select order_id, accept_to_pickup_distance, city_name, customer_location_hex_8, customer_location_latitude, 
customer_location_longitude, hhmmss, hour, quarter_hour, service_obj_service_name, time_bucket, weekday, yyyymmdd,
order_status, cancel_reason, eta, ride_time, amount, actual_price, captain_location_latitude, captain_location_longitude, captain_location_hex_8
from orders.order_logs_snapshot
where yyyymmdd >= '20220919'
and yyyymmdd <= '20221016'
and city_name in ('Hyderabad')
and service_obj_service_name = 'Link'
and customer_location_hex_8 in ('8860a24a61fffff', '8860a24a65fffff', '8860a24a67fffff', '8860a24a6dfffff', '8860a24b51fffff', '8860a25995fffff',
 '8860a25997fffff', '8860a259b1fffff', '8860a259b9fffff', '8860a259bbfffff', '8860a259bdfffff', '8860a24a05fffff', '8860a24a0dfffff', '8860a24a21fffff',
  '8860a24a25fffff', '8860a24a29fffff', '8860a24a2dfffff', '8860a24a45fffff', '8860a24a4dfffff', '8860a24a63fffff', '8860a24a69fffff', '8860a24a6bfffff',
   '8860a24b09fffff', '8860a24b0bfffff', '8860a24b19fffff', '8860a24b1dfffff', '8860a24b43fffff', '8860a24b49fffff', '8860a24b4bfffff', '8860a24b53fffff',
    '8860a24b55fffff', '8860a24b57fffff', '8860a24b59fffff', '8860a24b5bfffff', '8860a24b5dfffff', '8860a25981fffff', '8860a25983fffff', '8860a25985fffff',
     '8860a25987fffff', '8860a2598bfffff', '8860a25991fffff', '8860a25993fffff', '8860a25999fffff', '8860a2599bfffff', '8860a2599dfffff', '8860a259a1fffff',
      '8860a259a3fffff', '8860a259a7fffff', '8860a259a9fffff', '8860a259abfffff', '8860a259b3fffff', '8860a259b5fffff', '8860a259b7fffff', '8860a259d7fffff')
"""

#Load data into pandas table
df_copy = pd.read_sql(q, connection)

In [5]:
#Query data from driving_distance_and_time
q_dist = """
select * from experiments.driving_distance_and_time
where city_name in ('Hyderabad')
and hex_resolution = 8
and destination_hex in ('8860a24a61fffff', '8860a24a65fffff', '8860a24a67fffff', '8860a24a6dfffff', '8860a24b51fffff', '8860a25995fffff',
 '8860a25997fffff', '8860a259b1fffff', '8860a259b9fffff', '8860a259bbfffff', '8860a259bdfffff', '8860a24a05fffff', '8860a24a0dfffff', '8860a24a21fffff',
  '8860a24a25fffff', '8860a24a29fffff', '8860a24a2dfffff', '8860a24a45fffff', '8860a24a4dfffff', '8860a24a63fffff', '8860a24a69fffff', '8860a24a6bfffff',
   '8860a24b09fffff', '8860a24b0bfffff', '8860a24b19fffff', '8860a24b1dfffff', '8860a24b43fffff', '8860a24b49fffff', '8860a24b4bfffff', '8860a24b53fffff',
    '8860a24b55fffff', '8860a24b57fffff', '8860a24b59fffff', '8860a24b5bfffff', '8860a24b5dfffff', '8860a25981fffff', '8860a25983fffff', '8860a25985fffff',
     '8860a25987fffff', '8860a2598bfffff', '8860a25991fffff', '8860a25993fffff', '8860a25999fffff', '8860a2599bfffff', '8860a2599dfffff', '8860a259a1fffff',
      '8860a259a3fffff', '8860a259a7fffff', '8860a259a9fffff', '8860a259abfffff', '8860a259b3fffff', '8860a259b5fffff', '8860a259b7fffff', '8860a259d7fffff')
"""

#Load the data into pandas table
df_distance_copy = pd.read_sql(q_dist, connection)

In [6]:
#Create a copy of the df retrived from presto
df = df_copy.copy()
df.head()

Unnamed: 0,order_id,accept_to_pickup_distance,city_name,customer_location_hex_8,customer_location_latitude,customer_location_longitude,hhmmss,hour,quarter_hour,service_obj_service_name,time_bucket,weekday,yyyymmdd,order_status,cancel_reason,eta,ride_time,amount,actual_price,captain_location_latitude,captain_location_longitude,captain_location_hex_8
0,632cd2377d42681b1c4b6f44,0.637,Hyderabad,8860a259b3fffff,17.450648,78.379647,25303,2,245,Link,02:59,5,20220923,dropped,,3.0,6.564883,45.0,45.0,17.463287,78.365051,8860a24b51fffff
1,632cc72d853c020ac79cf471,0.518,Hyderabad,8860a24a65fffff,17.453394,78.370651,20557,2,200,Link,02:00,5,20220923,dropped,,2.0,8.49245,50.0,68.0,17.484137,78.388374,8860a24b6bfffff
2,632ccae449a7d319a06ea312,0.957,Hyderabad,8860a259a3fffff,17.459871,78.395187,22148,2,215,Link,02:00,5,20220923,dropped,,1.0,26.09555,59.0,59.0,17.451332,78.407204,8860a259a9fffff
3,632cd208853c020ac79cf749,2.221,Hyderabad,8860a24b51fffff,17.465565,78.368118,25216,2,245,Link,02:59,5,20220923,customerCancelled,Change of plans,3.0,33.533333,209.0,,17.467451,78.385201,8860a24b4bfffff
4,632cc818853c020ac79cf4b7,,Hyderabad,8860a24b55fffff,17.476186,78.369955,20952,2,200,Link,02:00,5,20220923,customerCancelled,order cancelled before rider accepted,6.0,11.133333,57.0,,17.476183,78.369956,8860a24b55fffff


In [7]:
#View the distance dataset
df_distance = df_distance_copy.copy()
df_distance.head()

Unnamed: 0,day_name,destination_hex,distance_in_km,hex_resolution,num_observations,ridetime_in_minutes,run_date,source_hex,time_period,city_name
0,Wednesday,8860a24b51fffff,23.851,8,1,40.016667,20221108,8860a24021fffff,afternoon,Hyderabad
1,Tuesday,8860a25993fffff,21.974,8,3,35.75,20221108,8860a24029fffff,evening_peak,Hyderabad
2,Monday,8860a259bdfffff,25.084,8,2,40.5,20221108,8860a24061fffff,evening_peak,Hyderabad
3,Tuesday,8860a24b57fffff,22.0621,8,1,20.696667,20221108,8860a24065fffff,morning_peak,Hyderabad
4,Sunday,8860a259b1fffff,23.586,8,1,22.695,20221108,8860a24065fffff,evening_peak,Hyderabad


In [8]:
df_distance = df_distance[['day_name', 'destination_hex', 'distance_in_km', 'source_hex', 'time_period']]

In [9]:
#Make columns for second and minute and date
df['second'] = df['hhmmss'].apply(lambda x: x[4:])
df['minute'] = df['hhmmss'].apply(lambda x: x[2:4])
df['date'] = df['yyyymmdd'].apply(lambda x: x[6:])
df['weekday_name'] = df['weekday'].apply(lambda x: calendar.day_name[(x-1)])

#Create a column to get the time of the day
df['time_period'] = np.where(df['hour'].isin(['08', '09', '10', '11']), 'morning_peak', np.where(df['hour'].isin(['17', '18', '19', '20', '21']), 'evening_peak',
    np.where(df['hour'].isin(['12, 13, 14, 15, 16']), 'afternoon', np.where(df['hour'].isin(['00', '01', '02', '03', '04', '05', '06', '07']),
     'rest_morning', 'rest_evening'))))


#Make a column to combine cust_lat_long and cap_lat_long
df['cust_lat_long'] = df[['customer_location_latitude', 'customer_location_longitude']].values.tolist()
df['cap_lat_long'] = df[['captain_location_latitude', 'captain_location_longitude']].values.tolist()
df.head()

Unnamed: 0,order_id,accept_to_pickup_distance,city_name,customer_location_hex_8,customer_location_latitude,customer_location_longitude,hhmmss,hour,quarter_hour,service_obj_service_name,time_bucket,weekday,yyyymmdd,order_status,cancel_reason,eta,ride_time,amount,actual_price,captain_location_latitude,captain_location_longitude,captain_location_hex_8,second,minute,date,weekday_name,time_period,cust_lat_long,cap_lat_long
0,632cd2377d42681b1c4b6f44,0.637,Hyderabad,8860a259b3fffff,17.450648,78.379647,25303,2,245,Link,02:59,5,20220923,dropped,,3.0,6.564883,45.0,45.0,17.463287,78.365051,8860a24b51fffff,3,53,23,Friday,rest_morning,"[17.450648412344258, 78.3796465129517]","[17.463287353515625, 78.36505126953125]"
1,632cc72d853c020ac79cf471,0.518,Hyderabad,8860a24a65fffff,17.453394,78.370651,20557,2,200,Link,02:00,5,20220923,dropped,,2.0,8.49245,50.0,68.0,17.484137,78.388374,8860a24b6bfffff,57,5,23,Friday,rest_morning,"[17.453393936157227, 78.37065124511719]","[17.4841365814209, 78.38837432861328]"
2,632ccae449a7d319a06ea312,0.957,Hyderabad,8860a259a3fffff,17.459871,78.395187,22148,2,215,Link,02:00,5,20220923,dropped,,1.0,26.09555,59.0,59.0,17.451332,78.407204,8860a259a9fffff,48,21,23,Friday,rest_morning,"[17.459871292114258, 78.39518737792969]","[17.451332092285156, 78.4072036743164]"
3,632cd208853c020ac79cf749,2.221,Hyderabad,8860a24b51fffff,17.465565,78.368118,25216,2,245,Link,02:59,5,20220923,customerCancelled,Change of plans,3.0,33.533333,209.0,,17.467451,78.385201,8860a24b4bfffff,16,52,23,Friday,rest_morning,"[17.465564727783203, 78.36811828613281]","[17.467451095581055, 78.38520050048828]"
4,632cc818853c020ac79cf4b7,,Hyderabad,8860a24b55fffff,17.476186,78.369955,20952,2,200,Link,02:00,5,20220923,customerCancelled,order cancelled before rider accepted,6.0,11.133333,57.0,,17.476183,78.369956,8860a24b55fffff,52,9,23,Friday,rest_morning,"[17.47618597002465, 78.36995503872481]","[17.47618324590442, 78.3699560445532]"


In [10]:
#Create 20-sec batches
df['second'] = df['second'].astype(float)
df['minute'] = df['minute'].astype(float)
df['20_sec_batch'] = pd.cut(df['second'], bins = [-1, 20, 40, 61], labels = ['0-20', '20-40', '40-60'])
df['count'] = 1
df.head()

Unnamed: 0,order_id,accept_to_pickup_distance,city_name,customer_location_hex_8,customer_location_latitude,customer_location_longitude,hhmmss,hour,quarter_hour,service_obj_service_name,time_bucket,weekday,yyyymmdd,order_status,cancel_reason,eta,ride_time,amount,actual_price,captain_location_latitude,captain_location_longitude,captain_location_hex_8,second,minute,date,weekday_name,time_period,cust_lat_long,cap_lat_long,20_sec_batch,count
0,632cd2377d42681b1c4b6f44,0.637,Hyderabad,8860a259b3fffff,17.450648,78.379647,25303,2,245,Link,02:59,5,20220923,dropped,,3.0,6.564883,45.0,45.0,17.463287,78.365051,8860a24b51fffff,3.0,53.0,23,Friday,rest_morning,"[17.450648412344258, 78.3796465129517]","[17.463287353515625, 78.36505126953125]",0-20,1
1,632cc72d853c020ac79cf471,0.518,Hyderabad,8860a24a65fffff,17.453394,78.370651,20557,2,200,Link,02:00,5,20220923,dropped,,2.0,8.49245,50.0,68.0,17.484137,78.388374,8860a24b6bfffff,57.0,5.0,23,Friday,rest_morning,"[17.453393936157227, 78.37065124511719]","[17.4841365814209, 78.38837432861328]",40-60,1
2,632ccae449a7d319a06ea312,0.957,Hyderabad,8860a259a3fffff,17.459871,78.395187,22148,2,215,Link,02:00,5,20220923,dropped,,1.0,26.09555,59.0,59.0,17.451332,78.407204,8860a259a9fffff,48.0,21.0,23,Friday,rest_morning,"[17.459871292114258, 78.39518737792969]","[17.451332092285156, 78.4072036743164]",40-60,1
3,632cd208853c020ac79cf749,2.221,Hyderabad,8860a24b51fffff,17.465565,78.368118,25216,2,245,Link,02:59,5,20220923,customerCancelled,Change of plans,3.0,33.533333,209.0,,17.467451,78.385201,8860a24b4bfffff,16.0,52.0,23,Friday,rest_morning,"[17.465564727783203, 78.36811828613281]","[17.467451095581055, 78.38520050048828]",0-20,1
4,632cc818853c020ac79cf4b7,,Hyderabad,8860a24b55fffff,17.476186,78.369955,20952,2,200,Link,02:00,5,20220923,customerCancelled,order cancelled before rider accepted,6.0,11.133333,57.0,,17.476183,78.369956,8860a24b55fffff,52.0,9.0,23,Friday,rest_morning,"[17.47618597002465, 78.36995503872481]","[17.47618324590442, 78.3699560445532]",40-60,1


In [11]:
#Load csv file of identified hexes
df_level_1 = pd.read_csv('hyderabad_location_data_level1_demand_zone1.csv')

#Create list of hexes
level1_hexes = list(df_level_1['customer_location_hex_8'].unique())

#Create a column to identify cancellation reason
df['new_cancel_reason'] = np.where(df['order_status'] == 'customerCancelled', np.where(df['cancel_reason'] == 'order cancelled before rider accepted', 'COBRA',
 np.where(df['cancel_reason'] == 'Order cancelled before rider was mapped', 'COBRM', 'OCARA')), 'noCancellation')

#Filter only on the required hexesnand morning 9 AM
df_filter = df
df_filter = df_filter.sort_values(by = ['minute', 'second'], ascending = True).reset_index(drop = True)
df_filter = df_filter[(df_filter['order_status'] == 'dropped') | (df_filter['new_cancel_reason'] == 'OCARA')]
df_filter.head()

Unnamed: 0,order_id,accept_to_pickup_distance,city_name,customer_location_hex_8,customer_location_latitude,customer_location_longitude,hhmmss,hour,quarter_hour,service_obj_service_name,time_bucket,weekday,yyyymmdd,order_status,cancel_reason,eta,ride_time,amount,actual_price,captain_location_latitude,captain_location_longitude,captain_location_hex_8,second,minute,date,weekday_name,time_period,cust_lat_long,cap_lat_long,20_sec_batch,count,new_cancel_reason
1,633d4088142b3340a75951df,0.032,Hyderabad,8860a25985fffff,17.44776,78.407043,140000,14,1400,Link,14:00,3,20221005,customerCancelled,Customer asked to cancel,2.0,8.433504,37.0,,17.448044,78.407006,8860a25985fffff,0.0,0.0,5,Wednesday,rest_evening,"[17.4477596282959, 78.40704345703125]","[17.4480442, 78.4070057]",0-20,1,OCARA
2,634901e81ccf816c21e13112,2.274,Hyderabad,8860a25985fffff,17.448074,78.404861,120000,12,1200,Link,12:00,5,20221014,dropped,Other,4.0,47.115033,149.0,149.0,17.399302,78.483719,8860a25b3dfffff,0.0,0.0,14,Friday,rest_evening,"[17.448074340820312, 78.40486145019531]","[17.399301528930664, 78.48371887207031]",0-20,1,noCancellation
3,632996287ff9294e27578787,0.888,Hyderabad,8860a259b3fffff,17.454187,78.377304,160000,16,1600,Link,16:00,2,20220920,dropped,,4.0,17.747567,56.0,56.0,17.480566,78.350952,8860a24b11fffff,0.0,0.0,20,Tuesday,rest_evening,"[17.454187393188477, 78.37730407714844]","[17.480566024780273, 78.3509521484375]",0-20,1,noCancellation
4,634ac3e8ce096e5e8d57c499,1.136,Hyderabad,8860a24a6bfffff,17.435829,78.360077,200000,20,2000,Link,20:00,6,20221015,dropped,Heavy Traffic,6.0,14.423733,63.0,63.0,17.462366,78.35688,8860a24b53fffff,0.0,0.0,15,Saturday,evening_peak,"[17.435829162597656, 78.36007690429688]","[17.462366104125977, 78.35688018798828]",0-20,1,noCancellation
5,6348e5c8d0f58d5e6e48b570,2.167,Hyderabad,8860a24a63fffff,17.441532,78.357361,100000,10,1000,Link,10:00,5,20221014,dropped,,1.0,6.258167,29.0,29.0,17.446686,78.351234,8860a24a29fffff,0.0,0.0,14,Friday,morning_peak,"[17.441532135009766, 78.35736083984375]","[17.446685791015625, 78.35123443603516]",0-20,1,noCancellation


In [12]:
df_distance = df_distance

In [13]:
#Filter the df by hour to get hourly reduction in fm
hours = {}
dates = {}
for date in df_filter['yyyymmdd'].unique().tolist():
    df_date = df_filter[df_filter['yyyymmdd'] == date]
    for hour in df_date['hour'].unique().tolist():
        df_hourly = df_date[df_date['hour'] == hour]
        df_distance_day = df_distance[df_distance['day_name'] == df_hourly['weekday_name'].values[0]]

        #Get the captains available for every 20_sec_batch
        captains_available = df_hourly.groupby(['hour', 'minute', '20_sec_batch'])['captain_location_hex_8'].apply(list).reset_index()
        captains_available2 = df_hourly.groupby(['hour', 'minute', '20_sec_batch'])['cap_lat_long'].apply(list).reset_index()
        captains_available = captains_available.rename(columns = {'captain_location_hex_8':'cap_location_hexes'})
        captains_available2 = captains_available2.rename(columns = {'cap_lat_long':'cap_location_lat_long'})
        #captains_available['number_of_captains'] = captains_available['cap_location_hexes'].apply(lambda x: len(x))
        captains_available = captains_available.merge(captains_available2, on = ['hour', 'minute', '20_sec_batch'], how = 'left')

        #Merge the captain locations with our df
        df_hourly = df_hourly.merge(captains_available, on = ['hour', 'minute', '20_sec_batch'], how = 'left')
        
        #Filter the distance df based on the hour
        if hour in ['08', '09', '10', '11']:
            df_distance_filter = df_distance_day[df_distance_day['time_period'] == 'morning_peak']
        elif hour in ['17', '18', '19', '20', '21']:
            df_distance_filter = df_distance_day[df_distance_day['time_period'] == 'evening_peak']
        elif hour in ['12, 13, 14, 15, 16']:
            df_distance_filter = df_distance_day[df_distance_day['time_period'] == 'afternoon']
        elif hour in ['00', '01', '02', '03', '04', '05', '06', '07']:
            df_distance_filter = df_distance_day[df_distance_day['time_period'] == 'rest_morning']
        else:
            df_distance_filter = df_distance_day[df_distance_day['time_period'] == 'rest_evening']

        #Combine the source and destination hex and make a dictionary to get distance between each hex combo
        df_distance_filter['source_destination'] = df_distance_filter['source_hex'] + df_distance_filter['destination_hex']
        distance_dict = dict(zip(df_distance_filter['source_destination'], df_distance_filter['distance_in_km']))

        #Get the distance between customer and each captain available
        customer_hexes = df_hourly['customer_location_hex_8'].values.tolist()
        captain_hexes = df_hourly['cap_location_hexes'].values.tolist()
        cap_lat_long = df_hourly['cap_location_lat_long'].values.tolist()
        cust_lat_long = df_hourly['cust_lat_long'].values.tolist()

        i = 0
        captain_distances = []
        for x in captain_hexes:
            customer_distance = []
            for hex in range(0, len(x)):
                key_hex = x[hex] + customer_hexes[i]
                if key_hex in distance_dict:
                    individual_distance = distance_dict[key_hex]
                else:
                    individual_distance = round(haversine(cap_lat_long[i][hex], cust_lat_long[i]), 3)
                customer_distance.append(individual_distance)
            captain_distances.append(customer_distance)
            i += 1

        #Add columns for captain distances and etas to each customer
        df_hourly['captain_distances'] = captain_distances

        #Get the nxn matrix for each 20_sec_batch
        df_batches = df_hourly.groupby(['minute', '20_sec_batch'])['captain_distances'].apply(list).reset_index()
        df_batches = df_batches.dropna()

        #Get the row_index and column index suitable for each customer
        optimal_distance = []
        for x in df_batches['captain_distances'].values.tolist():
            try:
                row_ind, col_ind = linear_sum_assignment(x)
            except Exception as e:
                continue
            optimal_distance.append(col_ind)

        #Get the optimal distance for each customer
        captain_customer_dist = df_batches['captain_distances'].values.tolist()
        i = 0
        min_distances = []
        for x in captain_customer_dist:
            j = 0
            for y in x:
                if type(y) == float:
                    continue
                min_distance = y[optimal_distance[i][j]]
                j += 1
                min_distances.append(min_distance)
            i += 1

        #Add the new_dist to the df
        df_hourly['new_FM'] = min_distances

        #Get the mean FM for each 20_sec_batch (old and new)
        fm_min = df_hourly.groupby(['minute', '20_sec_batch'])['new_FM'].mean().reset_index()
        hours[hour] = fm_min['new_FM'].median()
    dates[date] = hours
    print('Done for date {}'.format(date))

Done for date 20221005
Done for date 20221014
Done for date 20220920
Done for date 20221015
Done for date 20220930
Done for date 20220929
Done for date 20221011
Done for date 20221008
Done for date 20220922
Done for date 20221010
Done for date 20221007
Done for date 20220921
Done for date 20221013
Done for date 20220928
Done for date 20221012
Done for date 20220919
Done for date 20220927
Done for date 20221004
Done for date 20221001
Done for date 20221016
Done for date 20221009
Done for date 20220924
Done for date 20220926
Done for date 20221006
Done for date 20220923
Done for date 20221003
Done for date 20220925
Done for date 20221002


In [20]:
#Create a df from the dates dict
hourly_decrease = pd.DataFrame(dates).T
hourly_decrease = hourly_decrease.reset_index()
hourly_decrease = pd.melt(hourly_decrease, id_vars='index')
hourly_decrease.columns = ['yyyymmdd', 'hour', 'new_FM']
hourly_decrease = hourly_decrease.sort_values(by = ['date', 'hour'], ascending=True)
old_FM = pd.read_csv('/Users/rapido/Desktop/batching_demand/APR_and_ME/hyderabad_old_FM.csv')
final_df = old_FM.merge(hourly_decrease, on = ['yyyymmdd', 'hour'], how = 'left')

In [21]:
hourly_decrease

Unnamed: 0,date,hour,FM_decrease
594,01,00,-0.0767
370,01,01,-0.0470
6,01,02,0.0378
566,01,03,-0.0172
650,01,04,-0.0111
...,...,...,...
109,30,19,-0.0421
137,30,20,-0.0628
333,30,21,-0.0342
473,30,22,-0.0669


In [114]:
#Get all the captains lat_long in a particular batch
captains_available = df_filter.groupby(['hour', 'minute', '20_sec_batch'])['captain_location_hex_8'].apply(list).reset_index()
captains_available2 = df_filter.groupby(['hour', 'minute', '20_sec_batch'])['cap_lat_long'].apply(list).reset_index()
captains_available = captains_available.rename(columns = {'captain_location_hex_8':'cap_location_hexes'})
captains_available2 = captains_available2.rename(columns = {'cap_lat_long':'cap_location_lat_long'})
captains_available['number_of_captains'] = captains_available['cap_location_hexes'].apply(lambda x: len(x))
captains_available = captains_available.merge(captains_available2, on = ['hour', 'minute', '20_sec_batch'], how = 'left')
captains_available.head()

Unnamed: 0,hour,minute,20_sec_batch,cap_location_hexes,number_of_captains,cap_location_lat_long
0,9,0.0,0-20,"[88618925ebfffff, 886189255bfffff, 8861892513f...",8,"[[12.939874649047852, 77.616455078125], [12.90..."
1,9,0.0,20-40,"[88618925ddfffff, 88618925e9fffff, 8861892437f...",8,"[[12.917550086975098, 77.60970306396484], [12...."
2,9,0.0,40-60,"[8861892513fffff, 88618924edfffff, 8861892eadf...",11,"[[12.930061340332031, 77.63341522216797], [12...."
3,9,1.0,0-20,"[88618925d1fffff, 88618925ebfffff, 8861892589f...",10,"[[12.9166304, 77.599671], [12.943676948547363,..."
4,9,1.0,20-40,"[88618925e9fffff, 8861892437fffff, 886189251bf...",9,"[[12.938621520996094, 77.63064575195312], [12...."


In [115]:
#Merge the captain locations with our df
df_filter = df_filter.merge(captains_available, on = ['hour', 'minute', '20_sec_batch'], how = 'left')
df_filter.head()

Unnamed: 0,order_id,accept_to_pickup_distance,city_name,customer_location_hex_8,customer_location_latitude,customer_location_longitude,hhmmss,hour,quarter_hour,service_obj_service_name,time_bucket,weekday,yyyymmdd,order_status,cancel_reason,eta,ride_time,amount,actual_price,captain_location_latitude,captain_location_longitude,captain_location_hex_8,second,minute,date,cust_lat_long,cap_lat_long,20_sec_batch,count,cap_location_hexes,number_of_captains,cap_location_lat_long
0,632d283ba01239177d03c2d8,1.591,Bangalore,88618925ebfffff,12.939281,77.616455,90003,9,900,Link,09:00,5,20220923,customerCancelled,order cancelled before rider accepted,3.0,12.75,62.0,,12.939875,77.616455,88618925ebfffff,3.0,0.0,23,"[12.93928050994873, 77.616455078125]","[12.939874649047852, 77.616455078125]",0-20,1,"[88618925ebfffff, 886189255bfffff, 8861892513f...",8,"[[12.939874649047852, 77.616455078125], [12.90..."
1,632d283b35fcca18e8c58ba8,2.102,Bangalore,88618925cbfffff,12.916869,77.612197,90003,9,900,Link,09:00,5,20220923,dropped,,4.0,21.34315,83.0,83.0,12.908797,77.65197,886189255bfffff,3.0,0.0,23,"[12.91686891775971, 77.61219724685861]","[12.908797264099121, 77.65196990966797]",0-20,1,"[88618925ebfffff, 886189255bfffff, 8861892513f...",8,"[[12.939874649047852, 77.616455078125], [12.90..."
2,632d283fd5acdf01c6241e88,,Bangalore,8861892513fffff,12.934423,77.629784,90007,9,900,Link,09:00,5,20220923,customerCancelled,order cancelled before rider accepted,2.0,20.666667,96.0,,12.934423,77.629784,8861892513fffff,7.0,0.0,23,"[12.934423446655273, 77.6297836303711]","[12.934423446655273, 77.6297836303711]",0-20,1,"[88618925ebfffff, 886189255bfffff, 8861892513f...",8,"[[12.939874649047852, 77.616455078125], [12.90..."
3,632d2841a01239177d03c2e0,,Bangalore,88618925c3fffff,12.925748,77.608879,90009,9,900,Link,09:00,5,20220923,customerCancelled,Cancellation because of nudged order booking,2.0,29.016667,149.0,,12.925817,77.608795,88618925c3fffff,9.0,0.0,23,"[12.925747871398926, 77.60887908935547]","[12.925817489624023, 77.60879516601562]",0-20,1,"[88618925ebfffff, 886189255bfffff, 8861892513f...",8,"[[12.939874649047852, 77.616455078125], [12.90..."
4,632d2845349a2d74ba4f3673,3.699,Bangalore,88618925cbfffff,12.917367,77.613884,90013,9,900,Link,09:00,5,20220923,dropped,,6.0,9.516033,54.0,54.0,12.905402,77.602966,8861892433fffff,13.0,0.0,23,"[12.917367, 77.613884]","[12.905402183532715, 77.60296630859375]",0-20,1,"[88618925ebfffff, 886189255bfffff, 8861892513f...",8,"[[12.939874649047852, 77.616455078125], [12.90..."


In [116]:
#Create a dictionary out of the df_distance table for each destination, source combo
df_distance['source_destination'] = df_distance['source_hex'] + df_distance['destination_hex']
distance_dict = dict(zip(df_distance['source_destination'], df_distance['distance_in_km']))
eta_dict = dict(zip(df_distance['source_destination'], df_distance['ridetime_in_minutes']))

In [117]:
#Get the distance between customer and each captain available
customer_hexes = df_filter['customer_location_hex_8'].values.tolist()
captain_hexes = df_filter['cap_location_hexes'].values.tolist()
cap_lat_long = df_filter['cap_location_lat_long'].values.tolist()
cust_lat_long = df_filter['cust_lat_long'].values.tolist()

i = 0
captain_distances = []
etas = []
for x in captain_hexes:
    customer_distance = []
    customer_eta = []
    for hex in range(0, len(x)):
        key_hex = x[hex] + customer_hexes[i]
        if key_hex in distance_dict:
            individual_distance = distance_dict[key_hex]
            individual_eta = eta_dict[key_hex]
        else:
            individual_distance = round(haversine(cap_lat_long[i][hex], cust_lat_long[i]), 3)
            individual_eta = 'NA'
        customer_distance.append(individual_distance)
        customer_eta.append(individual_eta)
    captain_distances.append(customer_distance)
    etas.append(customer_eta)
    i += 1
len(captain_distances)

1605

In [118]:
#Add columns for captain distances and etas to each customer
df_filter['captain_distances'] = captain_distances
df_filter['captain_etas'] = etas
df_filter.head()

Unnamed: 0,order_id,accept_to_pickup_distance,city_name,customer_location_hex_8,customer_location_latitude,customer_location_longitude,hhmmss,hour,quarter_hour,service_obj_service_name,time_bucket,weekday,yyyymmdd,order_status,cancel_reason,eta,ride_time,amount,actual_price,captain_location_latitude,captain_location_longitude,captain_location_hex_8,second,minute,date,cust_lat_long,cap_lat_long,20_sec_batch,count,cap_location_hexes,number_of_captains,cap_location_lat_long,captain_distances,captain_etas
0,632d283ba01239177d03c2d8,1.591,Bangalore,88618925ebfffff,12.939281,77.616455,90003,9,900,Link,09:00,5,20220923,customerCancelled,order cancelled before rider accepted,3.0,12.75,62.0,,12.939875,77.616455,88618925ebfffff,3.0,0.0,23,"[12.93928050994873, 77.616455078125]","[12.939874649047852, 77.616455078125]",0-20,1,"[88618925ebfffff, 886189255bfffff, 8861892513f...",8,"[[12.939874649047852, 77.616455078125], [12.90...","[0.8529, 6.552, 2.044, 2.9290000000000003, 5.8...","[2.4475625, 19.583333333333336, 6.471616666666..."
1,632d283b35fcca18e8c58ba8,2.102,Bangalore,88618925cbfffff,12.916869,77.612197,90003,9,900,Link,09:00,5,20220923,dropped,,4.0,21.34315,83.0,83.0,12.908797,77.65197,886189255bfffff,3.0,0.0,23,"[12.91686891775971, 77.61219724685861]","[12.908797264099121, 77.65196990966797]",0-20,1,"[88618925ebfffff, 886189255bfffff, 8861892513f...",8,"[[12.939874649047852, 77.616455078125], [12.90...","[4.12995, 4.678, 3.8225, 1.689, 2.424, 3.88305...","[12.142620833333334, 16.05, 11.3537375, 4.8256..."
2,632d283fd5acdf01c6241e88,,Bangalore,8861892513fffff,12.934423,77.629784,90007,9,900,Link,09:00,5,20220923,customerCancelled,order cancelled before rider accepted,2.0,20.666667,96.0,,12.934423,77.629784,8861892513fffff,7.0,0.0,23,"[12.934423446655273, 77.6297836303711]","[12.934423446655273, 77.6297836303711]",0-20,1,"[88618925ebfffff, 886189255bfffff, 8861892513f...",8,"[[12.939874649047852, 77.616455078125], [12.90...","[2.1305, 4.968, 1.0899999999999999, 3.79925, 6...","[7.273083333333334, 13.908333333333333, 2.9711..."
3,632d2841a01239177d03c2e0,,Bangalore,88618925c3fffff,12.925748,77.608879,90009,9,900,Link,09:00,5,20220923,customerCancelled,Cancellation because of nudged order booking,2.0,29.016667,149.0,,12.925817,77.608795,88618925c3fffff,9.0,0.0,23,"[12.925747871398926, 77.60887908935547]","[12.925817489624023, 77.60879516601562]",0-20,1,"[88618925ebfffff, 886189255bfffff, 8861892513f...",8,"[[12.939874649047852, 77.616455078125], [12.90...","[3.19095, 6.218500000000001, 3.756999999999999...","[9.816804166666667, 17.816666666666666, 10.439..."
4,632d2845349a2d74ba4f3673,3.699,Bangalore,88618925cbfffff,12.917367,77.613884,90013,9,900,Link,09:00,5,20220923,dropped,,6.0,9.516033,54.0,54.0,12.905402,77.602966,8861892433fffff,13.0,0.0,23,"[12.917367, 77.613884]","[12.905402183532715, 77.60296630859375]",0-20,1,"[88618925ebfffff, 886189255bfffff, 8861892513f...",8,"[[12.939874649047852, 77.616455078125], [12.90...","[4.12995, 4.678, 3.8225, 1.689, 2.424, 3.88305...","[12.142620833333334, 16.05, 11.3537375, 4.8256..."


In [119]:
#Get the nxn matrix for each 20_sec_batch
df_batches = df_filter.groupby(['minute', '20_sec_batch'])['captain_distances'].apply(list).reset_index()

#Get the row_index and column index suitable for each customer
optimal_distance = []
for x in df_batches['captain_distances'].values.tolist():
    row_ind, col_ind = linear_sum_assignment(x)
    optimal_distance.append(col_ind)

In [120]:
#Get the optimal distance for each customer
captain_customer_dist = df_batches['captain_distances'].values.tolist()
i = 0
min_distances = []
for x in captain_customer_dist:
    j = 0
    for y in x:
        min_distance = y[optimal_distance[i][j]]
        j += 1
        min_distances.append(min_distance)
    i += 1

#Add the new_dist to the df
df_filter['new_FM'] = min_distances
df_filter.head()


Unnamed: 0,order_id,accept_to_pickup_distance,city_name,customer_location_hex_8,customer_location_latitude,customer_location_longitude,hhmmss,hour,quarter_hour,service_obj_service_name,time_bucket,weekday,yyyymmdd,order_status,cancel_reason,eta,ride_time,amount,actual_price,captain_location_latitude,captain_location_longitude,captain_location_hex_8,second,minute,date,cust_lat_long,cap_lat_long,20_sec_batch,count,cap_location_hexes,number_of_captains,cap_location_lat_long,captain_distances,captain_etas,new_FM
0,632d283ba01239177d03c2d8,1.591,Bangalore,88618925ebfffff,12.939281,77.616455,90003,9,900,Link,09:00,5,20220923,customerCancelled,order cancelled before rider accepted,3.0,12.75,62.0,,12.939875,77.616455,88618925ebfffff,3.0,0.0,23,"[12.93928050994873, 77.616455078125]","[12.939874649047852, 77.616455078125]",0-20,1,"[88618925ebfffff, 886189255bfffff, 8861892513f...",8,"[[12.939874649047852, 77.616455078125], [12.90...","[0.8529, 6.552, 2.044, 2.9290000000000003, 5.8...","[2.4475625, 19.583333333333336, 6.471616666666...",0.8529
1,632d283b35fcca18e8c58ba8,2.102,Bangalore,88618925cbfffff,12.916869,77.612197,90003,9,900,Link,09:00,5,20220923,dropped,,4.0,21.34315,83.0,83.0,12.908797,77.65197,886189255bfffff,3.0,0.0,23,"[12.91686891775971, 77.61219724685861]","[12.908797264099121, 77.65196990966797]",0-20,1,"[88618925ebfffff, 886189255bfffff, 8861892513f...",8,"[[12.939874649047852, 77.616455078125], [12.90...","[4.12995, 4.678, 3.8225, 1.689, 2.424, 3.88305...","[12.142620833333334, 16.05, 11.3537375, 4.8256...",4.678
2,632d283fd5acdf01c6241e88,,Bangalore,8861892513fffff,12.934423,77.629784,90007,9,900,Link,09:00,5,20220923,customerCancelled,order cancelled before rider accepted,2.0,20.666667,96.0,,12.934423,77.629784,8861892513fffff,7.0,0.0,23,"[12.934423446655273, 77.6297836303711]","[12.934423446655273, 77.6297836303711]",0-20,1,"[88618925ebfffff, 886189255bfffff, 8861892513f...",8,"[[12.939874649047852, 77.616455078125], [12.90...","[2.1305, 4.968, 1.0899999999999999, 3.79925, 6...","[7.273083333333334, 13.908333333333333, 2.9711...",1.09
3,632d2841a01239177d03c2e0,,Bangalore,88618925c3fffff,12.925748,77.608879,90009,9,900,Link,09:00,5,20220923,customerCancelled,Cancellation because of nudged order booking,2.0,29.016667,149.0,,12.925817,77.608795,88618925c3fffff,9.0,0.0,23,"[12.925747871398926, 77.60887908935547]","[12.925817489624023, 77.60879516601562]",0-20,1,"[88618925ebfffff, 886189255bfffff, 8861892513f...",8,"[[12.939874649047852, 77.616455078125], [12.90...","[3.19095, 6.218500000000001, 3.756999999999999...","[9.816804166666667, 17.816666666666666, 10.439...",1.556
4,632d2845349a2d74ba4f3673,3.699,Bangalore,88618925cbfffff,12.917367,77.613884,90013,9,900,Link,09:00,5,20220923,dropped,,6.0,9.516033,54.0,54.0,12.905402,77.602966,8861892433fffff,13.0,0.0,23,"[12.917367, 77.613884]","[12.905402183532715, 77.60296630859375]",0-20,1,"[88618925ebfffff, 886189255bfffff, 8861892513f...",8,"[[12.939874649047852, 77.616455078125], [12.90...","[4.12995, 4.678, 3.8225, 1.689, 2.424, 3.88305...","[12.142620833333334, 16.05, 11.3537375, 4.8256...",1.689


In [121]:
#Calculate old_FM
df_filter['cap_cust_hex'] = df_filter['customer_location_hex_8'] + df_filter['captain_location_hex_8']
captain_customer_dist = []
for x in df_filter['cap_cust_hex'].values.tolist():
    if x in distance_dict:
        distance = distance_dict[x]
    else:
        distance = 'NA'
    captain_customer_dist.append(distance)

cap_lat_long = df_filter['cap_lat_long'].values.tolist()
cust_lat_long = df_filter['cust_lat_long'].values.tolist()

haversine_cust_cap = []
for i in range(0, len(cap_lat_long)):
    haversine_distance = round(haversine(cap_lat_long[i], cust_lat_long[i]), 3)
    haversine_cust_cap.append(haversine_distance)

for i in range(0, len(captain_customer_dist)):
    if captain_customer_dist[i] == 'NA':
        captain_customer_dist[i] = haversine_cust_cap[i]

df_filter['old_FM'] = captain_customer_dist

In [122]:
#Get the mean FM for each 20_sec_batch (old and new)
fm_min = df_filter.groupby(['minute', '20_sec_batch'])['old_FM', 'new_FM'].mean().reset_index()

#Add a column for percentage increase/decrease
fm_min['%_change_dist'] = (fm_min['new_FM']/fm_min['old_FM'] - 1)
fm_min.head()

Unnamed: 0,minute,20_sec_batch,old_FM,new_FM,%_change_dist
0,0.0,0-20,1.920762,1.8357,-0.044286
1,0.0,20-40,2.763603,2.696269,-0.024365
2,0.0,40-60,3.276243,2.641,-0.193894
3,1.0,0-20,2.09737,1.74647,-0.167305
4,1.0,20-40,2.761722,2.526594,-0.085138


In [123]:
fm_min['%_change_dist'].mean()

-0.14132733641868933

In [125]:
fm_min['old_FM'].mean()

2.78594652201984

In [126]:
fm_min['new_FM'].mean()


2.360244409980259

In [127]:
fm_min['new_FM'].mean()/fm_min['old_FM'].mean() - 1

-0.1528034040405566