In [11]:
#Import libraries
import warnings
from pathlib import Path

import h3
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pyhive import presto
from shapely.geometry import Polygon, Point
#Notebook-wide settings: silence every warning and let wide frames show up to 500 columns
warnings.filterwarnings(action='ignore')
pd.options.display.max_columns = 500

In [13]:
#Import the sourcing dataset for Delhi (one row per order, see the preview below)
SOURCING_DATA_PATH = 'sourcing_data_delhi.csv'
df_all = pd.read_csv(SOURCING_DATA_PATH)
df_all.head()

Unnamed: 0,order_id,accept_to_pickup_distance,captain_location_hex_8,captain_location_latitude,captain_location_longitude,city_name,customer_location_hex_8,customer_location_latitude,customer_location_longitude,hhmmss,hour,quarter_hour,service_obj_service_name,time_bucket,weekday,yyyymmdd,order_status,cancel_reason,zone_name
0,634715ffd9dc7714194dda72,0.88,883da10349fffff,28.485403,77.106049,Delhi,883da10341fffff,28.480841,77.108727,10111,1,100,Link,01:00,4,20221013,customerCancelled,Asked to pay extra,zone1
1,6347194f1b4dce7515f90579,1.382,883da1a921fffff,28.611126,77.355629,Delhi,883da1a935fffff,28.614914,77.376091,11519,1,115,Link,01:00,4,20221013,dropped,,zone2
2,63471b7f19caea40e13ad083,3.46,883da11b29fffff,28.390757,76.953621,Delhi,883da111a1fffff,28.494998,77.08772,12439,1,115,Link,01:00,4,20221013,dropped,,zone1
3,6347166619caea40e13acd77,2.399,883da11ac3fffff,28.473682,77.047981,Delhi,883da11ac3fffff,28.473665,77.048088,10254,1,100,Link,01:00,4,20221013,customerCancelled,Wrong pickup location,zone1
4,63471d7b19caea40e13ad174,1.443,883da11853fffff,28.434048,77.047363,Delhi,883da11a91fffff,28.472065,77.083122,13307,1,130,Link,01:59,4,20221013,dropped,,zone1


In [14]:
#Constant helper column so that order counts can be obtained by summing
df_all['count'] = 1

#Label every order with a cancellation bucket:
#  - not customer-cancelled               -> 'no_cancellation'
#  - cancelled before rider accepted      -> 'COBRA'
#  - cancelled before rider was mapped    -> 'COBRM'
#  - any other customer cancellation      -> 'OCARA'
#NOTE(review): the comparison is case-sensitive and the two reason strings below
#differ in capitalisation - confirm both match the raw data exactly.
is_customer_cancelled = df_all['order_status'] == 'customerCancelled'
reason = df_all['cancel_reason']
df_all['new_cancel_reason'] = np.select(
    [
        ~is_customer_cancelled,
        reason == 'order cancelled before rider accepted',
        reason == 'Order cancelled before rider was mapped',
    ],
    ['no_cancellation', 'COBRA', 'COBRM'],
    default='OCARA',
)

#Final status: the cancellation bucket for customer cancellations, the raw order_status otherwise
df_all['final_order_status'] = np.where(
    is_customer_cancelled,
    df_all['new_cancel_reason'],
    df_all['order_status'],
)
df_all.head()

Unnamed: 0,order_id,accept_to_pickup_distance,captain_location_hex_8,captain_location_latitude,captain_location_longitude,city_name,customer_location_hex_8,customer_location_latitude,customer_location_longitude,hhmmss,hour,quarter_hour,service_obj_service_name,time_bucket,weekday,yyyymmdd,order_status,cancel_reason,zone_name,count,new_cancel_reason,final_order_status
0,634715ffd9dc7714194dda72,0.88,883da10349fffff,28.485403,77.106049,Delhi,883da10341fffff,28.480841,77.108727,10111,1,100,Link,01:00,4,20221013,customerCancelled,Asked to pay extra,zone1,1,OCARA,OCARA
1,6347194f1b4dce7515f90579,1.382,883da1a921fffff,28.611126,77.355629,Delhi,883da1a935fffff,28.614914,77.376091,11519,1,115,Link,01:00,4,20221013,dropped,,zone2,1,no_cancellation,dropped
2,63471b7f19caea40e13ad083,3.46,883da11b29fffff,28.390757,76.953621,Delhi,883da111a1fffff,28.494998,77.08772,12439,1,115,Link,01:00,4,20221013,dropped,,zone1,1,no_cancellation,dropped
3,6347166619caea40e13acd77,2.399,883da11ac3fffff,28.473682,77.047981,Delhi,883da11ac3fffff,28.473665,77.048088,10254,1,100,Link,01:00,4,20221013,customerCancelled,Wrong pickup location,zone1,1,OCARA,OCARA
4,63471d7b19caea40e13ad174,1.443,883da11853fffff,28.434048,77.047363,Delhi,883da11a91fffff,28.472065,77.083122,13307,1,130,Link,01:59,4,20221013,dropped,,zone1,1,no_cancellation,dropped


In [15]:
#Collect, for every zone, the customer hexes observed in it (the zone's batching area)
hexes_by_zone = {
    zone_name: list(zone_rows['customer_location_hex_8'].unique())
    for zone_name, zone_rows in df_all.groupby('zone_name')
}

#Keep the two per-zone lists available under their original names
zone1_hexes = hexes_by_zone.get('zone1', [])
zone2_hexes = hexes_by_zone.get('zone2', [])

#Flag, for every order, whether the captain's hex belongs to the batching area of the
#order's own zone. Each zone is compared against its own hex list, so rows from a zone
#other than zone1/zone2 are no longer silently matched against zone2's hexes.
captain_in_customer_zone = pd.Series(False, index=df_all.index)
for zone_name, zone_hex_list in hexes_by_zone.items():
    captain_in_customer_zone |= (
        (df_all['zone_name'] == zone_name)
        & df_all['captain_location_hex_8'].isin(zone_hex_list)
    )
df_all['captain_customer_same_area'] = np.where(
    captain_in_customer_zone,
    'sourced_from_batching_area',
    'sourced_from_outside_batching_area',
)

#Count orders sourced from inside vs outside the batching area per zone, day, hour and final status.
#reindex guarantees both sourcing columns exist even if one category never occurs in the data.
sourcing_zone = ['sourced_from_batching_area', 'sourced_from_outside_batching_area']
sourced_zone = (
    pd.pivot_table(
        data=df_all,
        index=['zone_name', 'yyyymmdd', 'hour', 'final_order_status'],
        columns=['captain_customer_same_area'],
        values='count',
        aggfunc='sum',
        fill_value=0,
    )
    .reindex(columns=sourcing_zone, fill_value=0)
    .reset_index()
    .rename_axis(None, axis=1)
)

#Create a column for total orders in each row
sourced_zone['total'] = sourced_zone[sourcing_zone].sum(axis=1)

#Share of each sourcing category within the row, stored as a fraction in [0, 1] rounded to 4 decimals
for sourcing_col in sourcing_zone:
    sourced_zone['{}_%'.format(sourcing_col)] = (sourced_zone[sourcing_col] / sourced_zone['total']).round(4)
sourced_zone.head()

Unnamed: 0,zone_name,yyyymmdd,hour,final_order_status,sourced_from_batching_area,sourced_from_outside_batching_area,total,sourced_from_batching_area_%,sourced_from_outside_batching_area_%
0,zone1,20220919,0,COBRA,28,0,28,1.0,0.0
1,zone1,20220919,0,OCARA,62,4,66,0.9394,0.0606
2,zone1,20220919,0,dropped,53,36,89,0.5955,0.4045
3,zone1,20220919,1,COBRA,20,0,20,1.0,0.0
4,zone1,20220919,1,OCARA,26,3,29,0.8966,0.1034


In [16]:
#Flag whether the captain's hex is exactly the customer's hex
df_all['captain_customer_same_hex'] = np.where(
    df_all['captain_location_hex_8'] == df_all['customer_location_hex_8'],
    'sourced_from_same_hex',
    'sourced_from_different_hex',
)

#Count orders sourced from the same vs a different hex per zone, day, hour and final status.
#reindex guarantees both columns exist even if one category never occurs in the data;
#the column order matches what pivot_table produces (alphabetical), so the saved layout is unchanged.
sourced_hex = (
    pd.pivot_table(
        data=df_all,
        index=['zone_name', 'yyyymmdd', 'hour', 'final_order_status'],
        columns=['captain_customer_same_hex'],
        values='count',
        aggfunc='sum',
        fill_value=0,
    )
    .reindex(columns=['sourced_from_different_hex', 'sourced_from_same_hex'], fill_value=0)
    .reset_index()
    .rename_axis(None, axis=1)
)

#Create a column for total orders in each row
sourcing_hex = ['sourced_from_same_hex', 'sourced_from_different_hex']
sourced_hex['total'] = sourced_hex[sourcing_hex].sum(axis=1)

#Share of each sourcing category within the row, stored as a fraction in [0, 1] rounded to 4 decimals.
#The loop variable is deliberately not called `hex`, which would shadow the Python builtin.
for sourcing_col in sourcing_hex:
    sourced_hex['{}_%'.format(sourcing_col)] = (sourced_hex[sourcing_col] / sourced_hex['total']).round(4)
sourced_hex.head()

Unnamed: 0,zone_name,yyyymmdd,hour,final_order_status,sourced_from_different_hex,sourced_from_same_hex,total,sourced_from_same_hex_%,sourced_from_different_hex_%
0,zone1,20220919,0,COBRA,3,25,28,0.8929,0.1071
1,zone1,20220919,0,OCARA,30,36,66,0.5455,0.4545
2,zone1,20220919,0,dropped,85,4,89,0.0449,0.9551
3,zone1,20220919,1,COBRA,0,20,20,1.0,0.0
4,zone1,20220919,1,OCARA,13,16,29,0.5517,0.4483


In [17]:
#Work only with orders whose final status is 'dropped'
is_dropped = df_all['final_order_status'] == 'dropped'
df_dropped = df_all[is_dropped]

#Number of such orders for every (customer hex, captain hex) pair,
#ordered by customer hex and then by count, both descending
pair_keys = ['customer_location_hex_8', 'captain_location_hex_8']
hex_combos = (
    df_dropped.groupby(pair_keys)['count']
    .sum()
    .reset_index()
    .sort_values(by=['customer_location_hex_8', 'count'], ascending=False)
    .reset_index(drop=True)
)

#Total number of such orders per customer hex
customer_orders = (
    df_dropped.groupby(['customer_location_hex_8'])['count']
    .sum()
    .reset_index()
    .rename(columns={'count': 'total'})
)

#Attach the per-customer-hex total and express each pair as a fraction of it
hex_combos = hex_combos.merge(customer_orders, on='customer_location_hex_8', how='left')
hex_combos['%_of_orders'] = (hex_combos['count'] / hex_combos['total']).round(4)
hex_combos.head(10)

Unnamed: 0,customer_location_hex_8,captain_location_hex_8,count,total,%_of_orders
0,883da1a93dfffff,883da1a939fffff,1487,15765,0.0943
1,883da1a93dfffff,883da1a931fffff,1167,15765,0.074
2,883da1a93dfffff,883da1a93dfffff,1147,15765,0.0728
3,883da1a93dfffff,883da1a927fffff,799,15765,0.0507
4,883da1a93dfffff,883da1a935fffff,724,15765,0.0459
5,883da1a93dfffff,883da1a903fffff,690,15765,0.0438
6,883da1a93dfffff,883da1a923fffff,573,15765,0.0363
7,883da1a93dfffff,883da1a93bfffff,556,15765,0.0353
8,883da1a93dfffff,883da1a937fffff,491,15765,0.0311
9,883da1a93dfffff,883da1a907fffff,482,15765,0.0306


In [18]:
#Summary statistics (count, mean, std, min, max and the listed percentiles) of
#accept_to_pickup_distance for every zone / final status / day / hour combination.
#Missing statistics are replaced with 0 before the table is used further.
fm_group_keys = ['zone_name', 'final_order_status', 'yyyymmdd', 'hour']
fm_percentiles = [0.25, 0.5, 0.75, 0.9, 0.99]
percentile_FM = (
    df_all.groupby(fm_group_keys)['accept_to_pickup_distance']
    .describe(percentiles=fm_percentiles)
    .reset_index()
    .fillna(0)
)
percentile_FM.head()

Unnamed: 0,zone_name,final_order_status,yyyymmdd,hour,count,mean,std,min,25%,50%,75%,90%,99%,max
0,zone1,COBRA,20220919,0,14.0,0.9215,0.759827,0.008,0.155,0.9555,1.386,1.7277,2.40216,2.502
1,zone1,COBRA,20220919,1,9.0,1.049667,0.666439,0.052,0.707,1.035,1.398,1.7892,2.07432,2.106
2,zone1,COBRA,20220919,2,2.0,1.254,1.390172,0.271,0.7625,1.254,1.7455,2.0404,2.21734,2.237
3,zone1,COBRA,20220919,3,3.0,1.349,0.763905,0.788,0.914,1.04,1.6295,1.9832,2.19542,2.219
4,zone1,COBRA,20220919,4,3.0,1.584667,1.825834,0.075,0.57,1.065,2.3395,3.1042,3.56302,3.614


In [19]:
#Save all the datasets to file.
#The output directory is created first so the cell also works on a fresh checkout
#where 'delhi/' does not exist yet (to_csv does not create missing directories).
output_dir = Path('delhi')
output_dir.mkdir(parents=True, exist_ok=True)

sourced_zone.to_csv(output_dir / 'sourcing_from_batch.csv', index=False)
sourced_hex.to_csv(output_dir / 'sourcing_from_hex.csv', index=False)
hex_combos.to_csv(output_dir / 'hex_combos.csv', index=False)
percentile_FM.to_csv(output_dir / 'fm_percentile_dist.csv', index=False)