In [1]:
#Import libraries
from pyhive import presto
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import h3
from shapely.geometry import Polygon, Point
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 500)

In [2]:
#Define presto credentials
presto_host = 'presto.processing.yoda.run'
presto_port = '80'
username = 'aditya.bhattar@rapido.bike'

#Create connection to presto host
connection = presto.connect(presto_host,presto_port,username = username)

In [3]:
#Query data from order_logs_snapshot

q = """
select order_id, accept_to_pickup_distance, captain_location_hex_8, captain_location_latitude,
captain_location_longitude, city_name, customer_location_hex_8, customer_location_latitude, 
customer_location_longitude, hhmmss, hour, quarter_hour, service_obj_service_name, time_bucket, weekday, yyyymmdd,
order_status, cancel_reason
from orders.order_logs_snapshot
where yyyymmdd >= '20220919'
and yyyymmdd <= '20220925'
and city_name in ('Hyderabad')
and service_obj_service_name = 'Link'
"""

#Load data into pandas table
df_copy = pd.read_sql(q, connection)

In [4]:
#Create a copy of the df retrived from presto
df = df_copy.copy()

In [5]:
#View the dataset
df.head()

Unnamed: 0,order_id,accept_to_pickup_distance,captain_location_hex_8,captain_location_latitude,captain_location_longitude,city_name,customer_location_hex_8,customer_location_latitude,customer_location_longitude,hhmmss,hour,quarter_hour,service_obj_service_name,time_bucket,weekday,yyyymmdd,order_status,cancel_reason
0,6329de449aa7cf28bdeb8a6f,0.267,8860a24b67fffff,17.498598,78.390015,Hyderabad,8860a24b0dfffff,17.489359,78.384377,210740,21,2100,Link,21:00,2,20220920,dropped,
1,6329de67242ca34c4a39f43c,,8860b52cdbfffff,17.408791,78.508835,Hyderabad,8860b52cdbfffff,17.408791,78.508835,210815,21,2100,Link,21:00,2,20220920,customerCancelled,order cancelled before rider accepted
2,6329df096ad6d118594e06b3,0.786,8860a24b5bfffff,17.462053,78.362251,Hyderabad,8860a259b1fffff,17.449514,78.382484,211057,21,2100,Link,21:00,2,20220920,dropped,
3,6329df937d42681b1c467d9a,0.866,8860a24b51fffff,17.468519,78.366844,Hyderabad,8860a258a9fffff,17.39978,78.38221,211315,21,2100,Link,21:00,2,20220920,dropped,
4,6329e0329aa7cf28bdeb8e3b,1.117,8860b196c9fffff,17.463011,78.452927,Hyderabad,8860b196d1fffff,17.469362,78.427406,211554,21,2115,Link,21:00,2,20220920,dropped,Heavy Traffic


In [6]:
#Make columns for second and minute and date
df['second'] = df['hhmmss'].apply(lambda x: x[4:])
df['minute'] = df['hhmmss'].apply(lambda x: x[2:4])
df['date'] = df['yyyymmdd'].apply(lambda x: x[6:])
df.head()

Unnamed: 0,order_id,accept_to_pickup_distance,captain_location_hex_8,captain_location_latitude,captain_location_longitude,city_name,customer_location_hex_8,customer_location_latitude,customer_location_longitude,hhmmss,hour,quarter_hour,service_obj_service_name,time_bucket,weekday,yyyymmdd,order_status,cancel_reason,second,minute,date
0,6329de449aa7cf28bdeb8a6f,0.267,8860a24b67fffff,17.498598,78.390015,Hyderabad,8860a24b0dfffff,17.489359,78.384377,210740,21,2100,Link,21:00,2,20220920,dropped,,40,7,20
1,6329de67242ca34c4a39f43c,,8860b52cdbfffff,17.408791,78.508835,Hyderabad,8860b52cdbfffff,17.408791,78.508835,210815,21,2100,Link,21:00,2,20220920,customerCancelled,order cancelled before rider accepted,15,8,20
2,6329df096ad6d118594e06b3,0.786,8860a24b5bfffff,17.462053,78.362251,Hyderabad,8860a259b1fffff,17.449514,78.382484,211057,21,2100,Link,21:00,2,20220920,dropped,,57,10,20
3,6329df937d42681b1c467d9a,0.866,8860a24b51fffff,17.468519,78.366844,Hyderabad,8860a258a9fffff,17.39978,78.38221,211315,21,2100,Link,21:00,2,20220920,dropped,,15,13,20
4,6329e0329aa7cf28bdeb8e3b,1.117,8860b196c9fffff,17.463011,78.452927,Hyderabad,8860b196d1fffff,17.469362,78.427406,211554,21,2115,Link,21:00,2,20220920,dropped,Heavy Traffic,54,15,20


In [7]:
#Create 15-sec batches and 20-sec batches
df['second'] = df['second'].astype(float)
df['minute'] = df['minute'].astype(float)
df['20_sec_batch'] = pd.cut(df['second'], bins = [-1, 20, 40, 61], labels = ['0-20', '20-40', '40-60'])
df['count'] = 1
df.head()

Unnamed: 0,order_id,accept_to_pickup_distance,captain_location_hex_8,captain_location_latitude,captain_location_longitude,city_name,customer_location_hex_8,customer_location_latitude,customer_location_longitude,hhmmss,hour,quarter_hour,service_obj_service_name,time_bucket,weekday,yyyymmdd,order_status,cancel_reason,second,minute,date,20_sec_batch,count
0,6329de449aa7cf28bdeb8a6f,0.267,8860a24b67fffff,17.498598,78.390015,Hyderabad,8860a24b0dfffff,17.489359,78.384377,210740,21,2100,Link,21:00,2,20220920,dropped,,40.0,7.0,20,20-40,1
1,6329de67242ca34c4a39f43c,,8860b52cdbfffff,17.408791,78.508835,Hyderabad,8860b52cdbfffff,17.408791,78.508835,210815,21,2100,Link,21:00,2,20220920,customerCancelled,order cancelled before rider accepted,15.0,8.0,20,0-20,1
2,6329df096ad6d118594e06b3,0.786,8860a24b5bfffff,17.462053,78.362251,Hyderabad,8860a259b1fffff,17.449514,78.382484,211057,21,2100,Link,21:00,2,20220920,dropped,,57.0,10.0,20,40-60,1
3,6329df937d42681b1c467d9a,0.866,8860a24b51fffff,17.468519,78.366844,Hyderabad,8860a258a9fffff,17.39978,78.38221,211315,21,2100,Link,21:00,2,20220920,dropped,,15.0,13.0,20,0-20,1
4,6329e0329aa7cf28bdeb8e3b,1.117,8860b196c9fffff,17.463011,78.452927,Hyderabad,8860b196d1fffff,17.469362,78.427406,211554,21,2115,Link,21:00,2,20220920,dropped,Heavy Traffic,54.0,15.0,20,40-60,1


In [8]:
#Add column to identify the hexes
hyderabad_zone1 = pd.read_csv('/Users/rapido/Desktop/batching_demand/hyderabad/hyderabad_location_data_level1_demand_zone1.csv')
hyderabad_zone2 = pd.read_csv('/Users/rapido/Desktop/batching_demand/hyderabad/hyderabad_location_data_level1_demand_zone2.csv')

hyderabad_zone1_hexes = list(hyderabad_zone1['customer_location_hex_8'].unique())
hyderabad_zone2_hexes = list(hyderabad_zone2['customer_location_hex_8'].unique())

df['zone_name'] = np.where(df['customer_location_hex_8'].isin(hyderabad_zone1_hexes), 'zone1',
     np.where(df['customer_location_hex_8'].isin(hyderabad_zone2_hexes), 'zone2', 'rest_of_hyderabad'))
df.head()

Unnamed: 0,order_id,accept_to_pickup_distance,captain_location_hex_8,captain_location_latitude,captain_location_longitude,city_name,customer_location_hex_8,customer_location_latitude,customer_location_longitude,hhmmss,hour,quarter_hour,service_obj_service_name,time_bucket,weekday,yyyymmdd,order_status,cancel_reason,second,minute,date,20_sec_batch,count,zone_name
0,6329de449aa7cf28bdeb8a6f,0.267,8860a24b67fffff,17.498598,78.390015,Hyderabad,8860a24b0dfffff,17.489359,78.384377,210740,21,2100,Link,21:00,2,20220920,dropped,,40.0,7.0,20,20-40,1,rest_of_hyderabad
1,6329de67242ca34c4a39f43c,,8860b52cdbfffff,17.408791,78.508835,Hyderabad,8860b52cdbfffff,17.408791,78.508835,210815,21,2100,Link,21:00,2,20220920,customerCancelled,order cancelled before rider accepted,15.0,8.0,20,0-20,1,rest_of_hyderabad
2,6329df096ad6d118594e06b3,0.786,8860a24b5bfffff,17.462053,78.362251,Hyderabad,8860a259b1fffff,17.449514,78.382484,211057,21,2100,Link,21:00,2,20220920,dropped,,57.0,10.0,20,40-60,1,zone1
3,6329df937d42681b1c467d9a,0.866,8860a24b51fffff,17.468519,78.366844,Hyderabad,8860a258a9fffff,17.39978,78.38221,211315,21,2100,Link,21:00,2,20220920,dropped,,15.0,13.0,20,0-20,1,rest_of_hyderabad
4,6329e0329aa7cf28bdeb8e3b,1.117,8860b196c9fffff,17.463011,78.452927,Hyderabad,8860b196d1fffff,17.469362,78.427406,211554,21,2115,Link,21:00,2,20220920,dropped,Heavy Traffic,54.0,15.0,20,40-60,1,rest_of_hyderabad


In [9]:
#Create a column to identify OCARA, COBRA and COBRM
df['new_cancel_reason'] = np.where(df['order_status'] == 'customerCancelled', np.where(df['cancel_reason'] == 'order cancelled before rider accepted', 'COBRA',
    np.where(df['cancel_reason'] == 'Order cancelled before rider was mapped', 'COBRM', 'OCARA')), df['order_status'])
df.head()

Unnamed: 0,order_id,accept_to_pickup_distance,captain_location_hex_8,captain_location_latitude,captain_location_longitude,city_name,customer_location_hex_8,customer_location_latitude,customer_location_longitude,hhmmss,hour,quarter_hour,service_obj_service_name,time_bucket,weekday,yyyymmdd,order_status,cancel_reason,second,minute,date,20_sec_batch,count,zone_name,new_cancel_reason
0,6329de449aa7cf28bdeb8a6f,0.267,8860a24b67fffff,17.498598,78.390015,Hyderabad,8860a24b0dfffff,17.489359,78.384377,210740,21,2100,Link,21:00,2,20220920,dropped,,40.0,7.0,20,20-40,1,rest_of_hyderabad,dropped
1,6329de67242ca34c4a39f43c,,8860b52cdbfffff,17.408791,78.508835,Hyderabad,8860b52cdbfffff,17.408791,78.508835,210815,21,2100,Link,21:00,2,20220920,customerCancelled,order cancelled before rider accepted,15.0,8.0,20,0-20,1,rest_of_hyderabad,COBRA
2,6329df096ad6d118594e06b3,0.786,8860a24b5bfffff,17.462053,78.362251,Hyderabad,8860a259b1fffff,17.449514,78.382484,211057,21,2100,Link,21:00,2,20220920,dropped,,57.0,10.0,20,40-60,1,zone1,dropped
3,6329df937d42681b1c467d9a,0.866,8860a24b51fffff,17.468519,78.366844,Hyderabad,8860a258a9fffff,17.39978,78.38221,211315,21,2100,Link,21:00,2,20220920,dropped,,15.0,13.0,20,0-20,1,rest_of_hyderabad,dropped
4,6329e0329aa7cf28bdeb8e3b,1.117,8860b196c9fffff,17.463011,78.452927,Hyderabad,8860b196d1fffff,17.469362,78.427406,211554,21,2115,Link,21:00,2,20220920,dropped,Heavy Traffic,54.0,15.0,20,40-60,1,rest_of_hyderabad,dropped


In [10]:
#Filter the df on the 3 zones
df_zone1 = df[df['zone_name'] == 'zone1']
df_zone2 = df[df['zone_name'] == 'zone2']
df_zone_rest = df[df['zone_name'] == 'rest_of_hyderabad']

In [21]:
#Check the end order state for all orders occurring in a 20_sec_batch for zone1
mistiming_pivot_zone1 = pd.pivot_table(data = df_zone1, index = ['date', 'hour', 'minute', '20_sec_batch'], columns = 'new_cancel_reason', values = 'count',
aggfunc='sum', fill_value=0).reset_index().rename_axis(None, axis=1)

#Get the percentage for all order types
mistiming_pivot_zone1['total'] = mistiming_pivot_zone1[['COBRA', 'COBRM', 'OCARA', 'aborted', 'dropped', 'expired', 'new']].sum(axis = 1)
mistiming_pivot_zone1['dropped_%'] = round(mistiming_pivot_zone1['dropped']/mistiming_pivot_zone1['total'], 4)
mistiming_pivot_zone1['COBRA_%'] = round(mistiming_pivot_zone1['COBRA']/mistiming_pivot_zone1['total'], 4)
mistiming_pivot_zone1['COBRM_%'] = round(mistiming_pivot_zone1['COBRM']/mistiming_pivot_zone1['total'], 4)
mistiming_pivot_zone1['OCARA_%'] = round(mistiming_pivot_zone1['OCARA']/mistiming_pivot_zone1['total'], 4)
mistiming_pivot_zone1['expiry_%'] = round(mistiming_pivot_zone1['expired']/mistiming_pivot_zone1['total'], 4)

#Get the average of each percentage by hour
mistiming_grouped_zone1 = mistiming_pivot_zone1.groupby(['date', 'hour'])['dropped_%', 'COBRA_%', 'COBRM_%', 'OCARA_%', 'expiry_%'].mean().reset_index()
mistiming_grouped_zone1.head()

Unnamed: 0,date,hour,dropped_%,COBRA_%,COBRM_%,OCARA_%,expiry_%
0,19,0,0.637534,0.116319,0.0,0.246146,0.0
1,19,1,0.62458,0.083333,0.0,0.292087,0.0
2,19,2,0.591666,0.1,0.0,0.308334,0.0
3,19,3,0.732558,0.069767,0.0,0.197674,0.0
4,19,4,0.612245,0.102041,0.0,0.285714,0.0


In [22]:
#Check the end order state for all orders occurring in a 20_sec_batch for zone2
mistiming_pivot_zone2 = pd.pivot_table(data = df_zone2, index = ['date', 'hour', 'minute', '20_sec_batch'], columns = 'new_cancel_reason', values = 'count',
aggfunc='sum', fill_value=0).reset_index().rename_axis(None, axis=1)

#Get the percentage for all order types
mistiming_pivot_zone2['total'] = mistiming_pivot_zone2[['COBRA', 'COBRM', 'OCARA', 'dropped', 'expired', 'aborted']].sum(axis = 1)
mistiming_pivot_zone2['dropped_%'] = round(mistiming_pivot_zone2['dropped']/mistiming_pivot_zone2['total'], 4)
mistiming_pivot_zone2['COBRA_%'] = round(mistiming_pivot_zone2['COBRA']/mistiming_pivot_zone2['total'], 4)
mistiming_pivot_zone2['COBRM_%'] = round(mistiming_pivot_zone2['COBRM']/mistiming_pivot_zone2['total'], 4)
mistiming_pivot_zone2['OCARA_%'] = round(mistiming_pivot_zone2['OCARA']/mistiming_pivot_zone2['total'], 4)
mistiming_pivot_zone2['expiry_%'] = round(mistiming_pivot_zone2['expired']/mistiming_pivot_zone2['total'], 4)

#Get the average of each percentage by hour
mistiming_grouped_zone2 = mistiming_pivot_zone2.groupby(['date', 'hour'])['dropped_%', 'COBRA_%', 'COBRM_%', 'OCARA_%', 'expiry_%'].mean().reset_index()
mistiming_grouped_zone2.head()

Unnamed: 0,date,hour,dropped_%,COBRA_%,COBRM_%,OCARA_%,expiry_%
0,19,0,0.528139,0.101731,0.0,0.370129,0.0
1,19,1,0.54885,0.103448,0.0,0.347702,0.0
2,19,2,0.542857,0.171429,0.0,0.285714,0.0
3,19,3,0.428571,0.035714,0.0,0.535714,0.0
4,19,4,0.701879,0.070423,0.0,0.227699,0.0


In [23]:
#Check the end order state for all orders occurring in a 20_sec_batch for zone2
mistiming_pivot_rest = pd.pivot_table(data = df_zone_rest, index = ['date', 'hour', 'minute', '20_sec_batch'], columns = 'new_cancel_reason', values = 'count',
aggfunc='sum', fill_value=0).reset_index().rename_axis(None, axis=1)

#Get the percentage for all order types
mistiming_pivot_rest['total'] = mistiming_pivot_rest[['COBRA', 'COBRM', 'OCARA', 'aborted', 'dropped', 'expired', 'new']].sum(axis = 1)
mistiming_pivot_rest['dropped_%'] = round(mistiming_pivot_rest['dropped']/mistiming_pivot_rest['total'], 4)
mistiming_pivot_rest['COBRA_%'] = round(mistiming_pivot_rest['COBRA']/mistiming_pivot_rest['total'], 4)
mistiming_pivot_rest['COBRM_%'] = round(mistiming_pivot_rest['COBRM']/mistiming_pivot_rest['total'], 4)
mistiming_pivot_rest['OCARA_%'] = round(mistiming_pivot_rest['OCARA']/mistiming_pivot_rest['total'], 4)
mistiming_pivot_rest['expiry_%'] = round(mistiming_pivot_rest['expired']/mistiming_pivot_rest['total'], 4)

#Get the average of each percentage by hour
mistiming_grouped_rest = mistiming_pivot_rest.groupby(['date', 'hour'])['dropped_%', 'COBRA_%', 'COBRM_%', 'OCARA_%', 'expiry_%'].mean().reset_index()
mistiming_grouped_rest.head()

Unnamed: 0,date,hour,dropped_%,COBRA_%,COBRM_%,OCARA_%,expiry_%
0,19,0,0.482363,0.119944,0.01747,0.370503,0.006501
1,19,1,0.432929,0.113952,0.024112,0.414475,0.010366
2,19,2,0.442133,0.140089,0.018323,0.383804,0.015647
3,19,3,0.403131,0.140816,0.008876,0.430167,0.017007
4,19,4,0.506237,0.126953,0.026841,0.316182,0.022829


In [24]:
#Save the results
mistiming_grouped_zone1.to_csv('mistiming_hyderabad_zone1.csv', index = False)
mistiming_grouped_zone2.to_csv('mistiming_hyderabad_zone2.csv', index = False)
mistiming_grouped_rest.to_csv('mistiming_hyderabad_rest.csv', index = False)