In [1]:
#Import libraries
from pyhive import presto
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import h3
from shapely.geometry import Polygon, Point
import time
from datetime import datetime
from functools import reduce
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 500)

In [2]:
#Define presto credentials
presto_host = 'presto.processing.yoda.run'
presto_port = '80'
username = 'aditya.bhattar@rapido.bike'

#Create connection to presto host
connection = presto.connect(presto_host,presto_port,username = username)

In [4]:
#Query data from order_logs_snapshot

q = """
select order_id, accept_to_pickup_distance, captain_location_hex_8, captain_location_latitude,
captain_location_longitude, city_name, customer_location_hex_8, drop_location_hex_8, drop_location_latitude, drop_location_longitude, customer_location_latitude, 
customer_location_longitude, hhmmss, hour, quarter_hour, service_obj_service_name, time_bucket, weekday, yyyymmdd,
order_status, cancel_reason, updated_epoch, event_type
from orders.order_logs_immutable
where yyyymmdd = '20221103'
and city_name in ('Hyderabad')
and service_obj_service_name = 'Link'
and event_type in ('customer_cancelled', 'order_create', 'accepted')
"""

#Load data into pandas table
df_copy = pd.read_sql(q, connection)

In [17]:
#Load and view dataset
df = df_copy.copy()
df.head()

Unnamed: 0,order_id,accept_to_pickup_distance,captain_location_hex_8,captain_location_latitude,captain_location_longitude,city_name,customer_location_hex_8,drop_location_hex_8,drop_location_latitude,drop_location_longitude,customer_location_latitude,customer_location_longitude,hhmmss,hour,quarter_hour,service_obj_service_name,time_bucket,weekday,yyyymmdd,order_status,cancel_reason,updated_epoch,event_type
0,6362bca452c77607a2297fb3,1.058,8860b19647fffff,17.479614,78.506821,Hyderabad,8860b19641fffff,8860b52c81fffff,17.434812,78.520178,17.471205,78.508568,2324,0,15,Link,00:00,4,20221103,customerCancelled,order cancelled before rider accepted,1667416517678,customer_cancelled
1,6362bf868143791dd512b859,,8860b52e93fffff,17.378974,78.516481,Hyderabad,8860b52e93fffff,8860b52cd1fffff,17.419738,78.516554,17.381273,78.51651,3542,0,30,Link,00:59,4,20221103,onTheWay,,1667415950550,accepted
2,6362c04d52e5fb3bb7c541f0,,,,,Hyderabad,8860b19639fffff,8860b19663fffff,17.499955,78.510204,17.499146,78.477158,3901,0,30,Link,00:59,4,20221103,new,,1667416141956,order_create
3,6362b9a152c77607a2297dbf,0.157,8860b52501fffff,17.336836,78.578972,Hyderabad,8860b52501fffff,8860b5256bfffff,17.333746,78.599311,17.336836,78.578972,1033,0,0,Link,00:00,4,20221103,customerCancelled,Change of plans,1667414573701,customer_cancelled
4,6362b73b1463a17af059a4fb,0.138,8860a25987fffff,17.440216,78.395441,Hyderabad,8860a25981fffff,8860b52ea9fffff,17.401177,78.557812,17.438446,78.397964,19,0,0,Link,00:00,4,20221103,onTheWay,Wrong Pickup Location,1667414092099,accepted


In [18]:
#Create a column to identify OCARA, COBRA and COBRM
df['new_cancel_reason'] = np.where(df['event_type'] == 'customer_cancelled', np.where(df['cancel_reason'] == 'order cancelled before rider accepted', 'COBRA',
    np.where(df['cancel_reason'] == 'Order cancelled before rider was mapped', 'COBRM', 'OCARA')), 'no_cancellation')
df.head()

#Create a timestamp column from the epoch column
df['timestamp'] = df['updated_epoch'].apply(lambda x: datetime.utcfromtimestamp(x/1000))
df.head()

Unnamed: 0,order_id,accept_to_pickup_distance,captain_location_hex_8,captain_location_latitude,captain_location_longitude,city_name,customer_location_hex_8,drop_location_hex_8,drop_location_latitude,drop_location_longitude,customer_location_latitude,customer_location_longitude,hhmmss,hour,quarter_hour,service_obj_service_name,time_bucket,weekday,yyyymmdd,order_status,cancel_reason,updated_epoch,event_type,new_cancel_reason,timestamp
0,6362bca452c77607a2297fb3,1.058,8860b19647fffff,17.479614,78.506821,Hyderabad,8860b19641fffff,8860b52c81fffff,17.434812,78.520178,17.471205,78.508568,2324,0,15,Link,00:00,4,20221103,customerCancelled,order cancelled before rider accepted,1667416517678,customer_cancelled,COBRA,2022-11-02 19:15:17.678
1,6362bf868143791dd512b859,,8860b52e93fffff,17.378974,78.516481,Hyderabad,8860b52e93fffff,8860b52cd1fffff,17.419738,78.516554,17.381273,78.51651,3542,0,30,Link,00:59,4,20221103,onTheWay,,1667415950550,accepted,no_cancellation,2022-11-02 19:05:50.550
2,6362c04d52e5fb3bb7c541f0,,,,,Hyderabad,8860b19639fffff,8860b19663fffff,17.499955,78.510204,17.499146,78.477158,3901,0,30,Link,00:59,4,20221103,new,,1667416141956,order_create,no_cancellation,2022-11-02 19:09:01.956
3,6362b9a152c77607a2297dbf,0.157,8860b52501fffff,17.336836,78.578972,Hyderabad,8860b52501fffff,8860b5256bfffff,17.333746,78.599311,17.336836,78.578972,1033,0,0,Link,00:00,4,20221103,customerCancelled,Change of plans,1667414573701,customer_cancelled,OCARA,2022-11-02 18:42:53.701
4,6362b73b1463a17af059a4fb,0.138,8860a25987fffff,17.440216,78.395441,Hyderabad,8860a25981fffff,8860b52ea9fffff,17.401177,78.557812,17.438446,78.397964,19,0,0,Link,00:00,4,20221103,onTheWay,Wrong Pickup Location,1667414092099,accepted,no_cancellation,2022-11-02 18:34:52.099


In [19]:
#Add column to identify the hexes
hyderabad_zone1 = pd.read_csv('/Users/rapido/Desktop/batching_demand/hyderabad/hyderabad_location_data_level1_demand_zone1.csv')
hyderabad_zone2 = pd.read_csv('/Users/rapido/Desktop/batching_demand/hyderabad/hyderabad_location_data_level1_demand_zone2.csv')

hyderabad_zone1_hexes = list(hyderabad_zone1['customer_location_hex_8'].unique())
hyderabad_zone2_hexes = list(hyderabad_zone2['customer_location_hex_8'].unique())

df['zone_name'] = np.where(df['customer_location_hex_8'].isin(hyderabad_zone1_hexes), 'zone1',
     np.where(df['customer_location_hex_8'].isin(hyderabad_zone2_hexes), 'zone2', 'rest_of_hyderabad'))
df.head()

Unnamed: 0,order_id,accept_to_pickup_distance,captain_location_hex_8,captain_location_latitude,captain_location_longitude,city_name,customer_location_hex_8,drop_location_hex_8,drop_location_latitude,drop_location_longitude,customer_location_latitude,customer_location_longitude,hhmmss,hour,quarter_hour,service_obj_service_name,time_bucket,weekday,yyyymmdd,order_status,cancel_reason,updated_epoch,event_type,new_cancel_reason,timestamp,zone_name
0,6362bca452c77607a2297fb3,1.058,8860b19647fffff,17.479614,78.506821,Hyderabad,8860b19641fffff,8860b52c81fffff,17.434812,78.520178,17.471205,78.508568,2324,0,15,Link,00:00,4,20221103,customerCancelled,order cancelled before rider accepted,1667416517678,customer_cancelled,COBRA,2022-11-02 19:15:17.678,rest_of_hyderabad
1,6362bf868143791dd512b859,,8860b52e93fffff,17.378974,78.516481,Hyderabad,8860b52e93fffff,8860b52cd1fffff,17.419738,78.516554,17.381273,78.51651,3542,0,30,Link,00:59,4,20221103,onTheWay,,1667415950550,accepted,no_cancellation,2022-11-02 19:05:50.550,rest_of_hyderabad
2,6362c04d52e5fb3bb7c541f0,,,,,Hyderabad,8860b19639fffff,8860b19663fffff,17.499955,78.510204,17.499146,78.477158,3901,0,30,Link,00:59,4,20221103,new,,1667416141956,order_create,no_cancellation,2022-11-02 19:09:01.956,rest_of_hyderabad
3,6362b9a152c77607a2297dbf,0.157,8860b52501fffff,17.336836,78.578972,Hyderabad,8860b52501fffff,8860b5256bfffff,17.333746,78.599311,17.336836,78.578972,1033,0,0,Link,00:00,4,20221103,customerCancelled,Change of plans,1667414573701,customer_cancelled,OCARA,2022-11-02 18:42:53.701,rest_of_hyderabad
4,6362b73b1463a17af059a4fb,0.138,8860a25987fffff,17.440216,78.395441,Hyderabad,8860a25981fffff,8860b52ea9fffff,17.401177,78.557812,17.438446,78.397964,19,0,0,Link,00:00,4,20221103,onTheWay,Wrong Pickup Location,1667414092099,accepted,no_cancellation,2022-11-02 18:34:52.099,rest_of_hyderabad


In [20]:
#Filter df on the columns needed
df_filter = df[['order_id', 'customer_location_hex_8', 'hour', 'yyyymmdd', 'timestamp', 'event_type', 'new_cancel_reason', 'zone_name']]

#Filter on the respective order_status
df_cobra = df_filter[df_filter['new_cancel_reason'] == 'COBRA']
df_ocara = df_filter[df_filter['new_cancel_reason'] == 'OCARA']
df_cobrm = df_filter[df_filter['new_cancel_reason'] == 'COBRM']
df_order_created = df_filter[df_filter['event_type'] == 'order_create']
df_accepted = df_filter[df_filter['event_type'] == 'accepted']

#Sort the dfs by timestamp
df_cobra = df_cobra.sort_values(by = 'timestamp', ascending=True)
df_cobra = df_cobra.drop_duplicates(subset='order_id')

df_ocara = df_ocara.sort_values(by = 'timestamp', ascending=True)
df_ocara = df_ocara.drop_duplicates(subset='order_id')

df_cobrm = df_cobrm.sort_values(by = 'timestamp', ascending=True)
df_cobrm = df_cobrm.drop_duplicates(subset='order_id')

df_accepted = df_accepted.sort_values(by = 'timestamp', ascending=True)
df_accepted = df_accepted.drop_duplicates(subset='order_id')

#Change the column names
df_cobra = df_cobra[['order_id', 'new_cancel_reason', 'timestamp']]
df_cobra.columns = ['order_id', 'COBRA', 'timestamp_cobra']

df_cobrm = df_cobrm[['order_id', 'new_cancel_reason', 'timestamp']]
df_cobrm.columns = ['order_id', 'COBRM', 'timestamp_cobrm']

df_ocara = df_ocara[['order_id', 'new_cancel_reason', 'timestamp']]
df_ocara.columns = ['order_id', 'OCARA', 'timestamp_ocara']

df_accepted = df_accepted[['order_id', 'event_type', 'timestamp']]
df_accepted.columns = ['order_id', 'Accepted', 'timestamp_accepted']

#Merge the dfs together
dfs_to_merge = [df_order_created, df_cobra, df_cobrm, df_ocara, df_accepted]
df_final = reduce(lambda  left, right: pd.merge(left, right,on=['order_id'], how='left'), dfs_to_merge).fillna('NA')
df_final.head()

Unnamed: 0,order_id,customer_location_hex_8,hour,yyyymmdd,timestamp,event_type,new_cancel_reason,zone_name,COBRA,timestamp_cobra,COBRM,timestamp_cobrm,OCARA,timestamp_ocara,Accepted,timestamp_accepted
0,6362c04d52e5fb3bb7c541f0,8860b19639fffff,0,20221103,2022-11-02 19:09:01.956,order_create,no_cancellation,rest_of_hyderabad,COBRA,2022-11-02 19:11:21.679000,,,,,,
1,6362baac1463a17af059a74a,8860a259c9fffff,0,20221103,2022-11-02 18:45:01.360,order_create,no_cancellation,rest_of_hyderabad,,,,,OCARA,2022-11-02 19:06:35.199000,accepted,2022-11-02 18:45:05.084000
2,6362bbf752e5fb3bb7c53f0b,8860a25b4dfffff,0,20221103,2022-11-02 18:50:31.756,order_create,no_cancellation,rest_of_hyderabad,COBRA,2022-11-02 18:51:27.231000,,,,,,
3,6362c07f52c77607a2298210,8860b52cbbfffff,0,20221103,2022-11-02 19:09:51.703,order_create,no_cancellation,rest_of_hyderabad,,,,,OCARA,2022-11-02 19:14:58.597000,accepted,2022-11-02 19:09:55.390000
4,6362c1b852c77607a22982b8,8860b525a7fffff,0,20221103,2022-11-02 19:15:05.546,order_create,no_cancellation,rest_of_hyderabad,,,,,,,accepted,2022-11-02 19:15:16.282000


In [21]:
#Filter on the respective order_status
df_cobra = df_final[df_final['COBRA'] == 'COBRA']
df_ocara = df_final[df_final['OCARA'] == 'OCARA']
df_cobrm = df_final[df_final['COBRM'] == 'COBRM']
df_accepted = df_final[df_final['Accepted'] == 'accepted']

#Calculate TTC for cobrm, ocara, and cobra and tta
df_cobra['ttc_cobra'] = pd.to_datetime(df_cobra['timestamp_cobra']) - pd.to_datetime(df_cobra['timestamp'])
df_cobra['ttc_cobra'] = df_cobra['ttc_cobra'].apply(lambda x: x.total_seconds())

df_cobrm['ttc_cobrm'] = pd.to_datetime(df_cobrm['timestamp_cobrm']) - pd.to_datetime(df_cobrm['timestamp'])
df_cobrm['ttc_cobrm'] = df_cobrm['ttc_cobrm'].apply(lambda x: x.total_seconds())

df_ocara['ttc_ocara'] = pd.to_datetime(df_ocara['timestamp_ocara']) - pd.to_datetime(df_ocara['timestamp'])
df_ocara['ttc_ocara'] = df_ocara['ttc_ocara'].apply(lambda x: x.total_seconds())

df_accepted['tta'] = pd.to_datetime(df_accepted['timestamp_accepted']) - pd.to_datetime(df_accepted['timestamp'])
df_accepted['tta'] = df_accepted['tta'].apply(lambda x: x.total_seconds())

#Filter only on the new columns made
df_cobra = df_cobra[['order_id', 'ttc_cobra']]
df_cobrm = df_cobrm[['order_id', 'ttc_cobrm']]
df_ocara = df_ocara[['order_id', 'ttc_ocara']]
df_accepted = df_accepted[['order_id', 'tta']]

#Merge the dfs together
dfs_to_merge = [df_final, df_cobra, df_cobrm, df_ocara, df_accepted]
df_final = reduce(lambda  left, right: pd.merge(left, right,on=['order_id'], how='left'), dfs_to_merge)
df_final.head()

Unnamed: 0,order_id,customer_location_hex_8,hour,yyyymmdd,timestamp,event_type,new_cancel_reason,zone_name,COBRA,timestamp_cobra,COBRM,timestamp_cobrm,OCARA,timestamp_ocara,Accepted,timestamp_accepted,ttc_cobra,ttc_cobrm,ttc_ocara,tta
0,6362c04d52e5fb3bb7c541f0,8860b19639fffff,0,20221103,2022-11-02 19:09:01.956,order_create,no_cancellation,rest_of_hyderabad,COBRA,2022-11-02 19:11:21.679000,,,,,,,139.723,NaT,,
1,6362baac1463a17af059a74a,8860a259c9fffff,0,20221103,2022-11-02 18:45:01.360,order_create,no_cancellation,rest_of_hyderabad,,,,,OCARA,2022-11-02 19:06:35.199000,accepted,2022-11-02 18:45:05.084000,,NaT,1293.839,3.724
2,6362bbf752e5fb3bb7c53f0b,8860a25b4dfffff,0,20221103,2022-11-02 18:50:31.756,order_create,no_cancellation,rest_of_hyderabad,COBRA,2022-11-02 18:51:27.231000,,,,,,,55.475,NaT,,
3,6362c07f52c77607a2298210,8860b52cbbfffff,0,20221103,2022-11-02 19:09:51.703,order_create,no_cancellation,rest_of_hyderabad,,,,,OCARA,2022-11-02 19:14:58.597000,accepted,2022-11-02 19:09:55.390000,,NaT,306.894,3.687
4,6362c1b852c77607a22982b8,8860b525a7fffff,0,20221103,2022-11-02 19:15:05.546,order_create,no_cancellation,rest_of_hyderabad,,,,,,,accepted,2022-11-02 19:15:16.282000,,NaT,,10.736


In [24]:
#Create a date column
df_final['date'] = df_final['yyyymmdd'].apply(lambda x: x[6:])
df_final['count'] = 1

#Get average ttc, tta
df_cobra_grouped = df_final[df_final['COBRA'] == 'COBRA'].groupby(['zone_name', 'date', 'hour'])['ttc_cobra'].median().reset_index()
df_ocara_grouped = df_final[df_final['OCARA'] == 'OCARA'].groupby(['zone_name', 'date', 'hour'])['ttc_ocara'].median().reset_index()
#df_cobrm_grouped = df_final[df_final['COBRM'] == 'COBRM'].groupby(['zone_name', 'date', 'hour'])['ttc_cobrm'].median().reset_index()
df_accepted_grouped = df_final[df_final['Accepted'] == 'accepted'].groupby(['zone_name', 'date', 'hour'])['tta'].median().reset_index()

#Merge the dfs together
dfs_to_merge = [df_cobra_grouped, df_ocara_grouped, df_accepted_grouped]
df_metrics_final = reduce(lambda  left, right: pd.merge(left, right,on=['zone_name', 'date', 'hour'], how='left'), dfs_to_merge).fillna('NA')
df_metrics_final.head()

Unnamed: 0,zone_name,date,hour,ttc_cobra,ttc_ocara,tta
0,rest_of_hyderabad,3,0,40.3445,241.408,11.383
1,rest_of_hyderabad,3,1,41.528,269.7055,11.096
2,rest_of_hyderabad,3,2,51.342,283.515,11.1215
3,rest_of_hyderabad,3,3,76.404,344.761,11.807
4,rest_of_hyderabad,3,4,73.827,344.18,12.8115


In [25]:
df_metrics_final[df_metrics_final['zone_name'] == 'zone1']

Unnamed: 0,zone_name,date,hour,ttc_cobra,ttc_ocara,tta
15,zone1,3,0,10.06,367.913,9.7885
16,zone1,3,1,113.5745,351.165,11.0035
17,zone1,3,2,39.032,249.543,10.542
18,zone1,3,3,21.93,182.8965,12.6525
19,zone1,3,4,73.152,215.7755,13.575
20,zone1,3,5,37.336,359.4455,9.49
21,zone1,3,6,3.423,324.926,7.714
22,zone1,3,7,6.162,209.41,6.995
23,zone1,3,8,55.491,185.629,9.0295
24,zone1,3,9,51.18,164.321,9.8385


In [45]:
#Save the results
df_metrics_final.to_csv('ttc_hyderabad.csv', index = False)

In [44]:
df_final.shape

(4560489, 22)