In [1]:
#Import libraries
import os
import time
import warnings
from datetime import datetime
from functools import reduce

import h3
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pyhive import presto
from shapely.geometry import Polygon, Point
#Silence every Python warning for the rest of the session (this also hides pandas warnings)
warnings.filterwarnings('ignore')
#Let pandas display up to 500 columns before truncating wide frames
pd.set_option('display.max_columns', 500)

In [5]:
#Define presto connection settings
#Each value can be overridden through an environment variable so the notebook
#can be run by another user or against another cluster without editing code;
#the defaults are the values this analysis was originally run with.
presto_host = os.environ.get('PRESTO_HOST', 'prime-trino.serving.data.production.internal')
presto_port = os.environ.get('PRESTO_PORT', '80')
username = os.environ.get('PRESTO_USERNAME', 'aditya.bhattar@rapido.bike')

#Create connection to presto host
connection = presto.connect(presto_host, presto_port, username=username)

In [7]:
#Query order event logs from orders.order_logs_immutable
#Scope (see the WHERE clause): city 'Delhi', service 'Link', yyyymmdd from
#20221107 to 20221111 inclusive, and only the event types
#customer_cancelled, order_requested and accepted.

q = """
select order_id, accept_to_pickup_distance, captain_location_hex_8, captain_location_latitude,
captain_location_longitude, city_name, customer_location_hex_8, drop_location_hex_8, drop_location_latitude, drop_location_longitude, customer_location_latitude, 
customer_location_longitude, hhmmss, hour, quarter_hour, service_obj_service_name, time_bucket, weekday, yyyymmdd,
order_status, cancel_reason, updated_epoch, event_type
from orders.order_logs_immutable
where yyyymmdd >= '20221107'
and yyyymmdd <= '20221111'
and city_name in ('Delhi')
and service_obj_service_name = 'Link'
and event_type in ('customer_cancelled', 'order_requested', 'accepted')
"""

#Load the query result into a pandas DataFrame through the presto connection
df_copy = pd.read_sql(q, connection)

In [22]:
#Work on a copy so the raw query result held in df_copy stays untouched
df = df_copy.copy()
#Preview the first rows
df.head()

Unnamed: 0,order_id,accept_to_pickup_distance,captain_location_hex_8,captain_location_latitude,captain_location_longitude,city_name,customer_location_hex_8,drop_location_hex_8,drop_location_latitude,drop_location_longitude,customer_location_latitude,customer_location_longitude,hhmmss,hour,quarter_hour,service_obj_service_name,time_bucket,weekday,yyyymmdd,order_status,cancel_reason,updated_epoch,event_type
0,636d75ce26e7152534e21614,4.146,883da102d7fffff,28.541281,77.233696,Delhi,883da102d5fffff,883da10291fffff,28.529666,77.255893,28.541725,77.232414,33606,3,330,Link,03:59,5,20221111,customerCancelled,Drop location denied,1668118250045,customer_cancelled
1,636d78e226e7152534e216ee,,,,,Delhi,883da10647fffff,883da10601fffff,28.582844,77.332756,28.584158,77.312462,34914,3,345,Link,03:59,5,20221111,requested,,1668118754836,order_requested
2,636d76b98049e737e656671a,2.948,883da1184bfffff,28.431278,77.018631,Delhi,883da1184bfffff,883da11a83fffff,28.459147,77.072858,28.431358,77.017555,34001,3,330,Link,03:59,5,20221111,customerCancelled,Taking longer than expected,1668118466983,customer_cancelled
3,636d6f321130da36edf43709,,,,,Delhi,883da11291fffff,883da1129bfffff,28.628199,77.108661,28.625067,77.110588,30754,3,300,Link,03:00,5,20221111,requested,,1668116274065,order_requested
4,636d6fca26e7152534e21493,1.212,8842ca1105fffff,22.03932,73.122238,Delhi,8842ca1105fffff,883da13ae5fffff,28.482238,77.516628,22.03932,73.122238,31026,3,300,Link,03:00,5,20221111,customerCancelled,Change of plans,1668116602653,customer_cancelled


In [23]:
#Create a column to identify OCARA, COBRA and COBRM
#Rows that are not customer cancellations get 'no_cancellation'. A customer
#cancellation is COBRA or COBRM when cancel_reason equals the exact strings
#below, and OCARA for every other (or missing) reason.
is_not_cancellation = df['event_type'] != 'customer_cancelled'
is_cobra = df['cancel_reason'] == 'order cancelled before rider accepted'
is_cobrm = df['cancel_reason'] == 'Order cancelled before rider was mapped'
#np.select uses the first condition that matches, so the order of the lists matters
df['new_cancel_reason'] = np.select(
    [is_not_cancellation, is_cobra, is_cobrm],
    ['no_cancellation', 'COBRA', 'COBRM'],
    default='OCARA',
)

#Create a timestamp column from the epoch column
#updated_epoch is in milliseconds; the vectorized conversion yields naive UTC
#timestamps, like datetime.utcfromtimestamp(x/1000) did, without a per-row
#Python call or float division.
df['timestamp'] = pd.to_datetime(df['updated_epoch'], unit='ms')
df.head()

Unnamed: 0,order_id,accept_to_pickup_distance,captain_location_hex_8,captain_location_latitude,captain_location_longitude,city_name,customer_location_hex_8,drop_location_hex_8,drop_location_latitude,drop_location_longitude,customer_location_latitude,customer_location_longitude,hhmmss,hour,quarter_hour,service_obj_service_name,time_bucket,weekday,yyyymmdd,order_status,cancel_reason,updated_epoch,event_type,new_cancel_reason,timestamp
0,636d75ce26e7152534e21614,4.146,883da102d7fffff,28.541281,77.233696,Delhi,883da102d5fffff,883da10291fffff,28.529666,77.255893,28.541725,77.232414,33606,3,330,Link,03:59,5,20221111,customerCancelled,Drop location denied,1668118250045,customer_cancelled,OCARA,2022-11-10 22:10:50.045
1,636d78e226e7152534e216ee,,,,,Delhi,883da10647fffff,883da10601fffff,28.582844,77.332756,28.584158,77.312462,34914,3,345,Link,03:59,5,20221111,requested,,1668118754836,order_requested,no_cancellation,2022-11-10 22:19:14.836
2,636d76b98049e737e656671a,2.948,883da1184bfffff,28.431278,77.018631,Delhi,883da1184bfffff,883da11a83fffff,28.459147,77.072858,28.431358,77.017555,34001,3,330,Link,03:59,5,20221111,customerCancelled,Taking longer than expected,1668118466983,customer_cancelled,OCARA,2022-11-10 22:14:26.983
3,636d6f321130da36edf43709,,,,,Delhi,883da11291fffff,883da1129bfffff,28.628199,77.108661,28.625067,77.110588,30754,3,300,Link,03:00,5,20221111,requested,,1668116274065,order_requested,no_cancellation,2022-11-10 21:37:54.065
4,636d6fca26e7152534e21493,1.212,8842ca1105fffff,22.03932,73.122238,Delhi,8842ca1105fffff,883da13ae5fffff,28.482238,77.516628,22.03932,73.122238,31026,3,300,Link,03:00,5,20221111,customerCancelled,Change of plans,1668116602653,customer_cancelled,OCARA,2022-11-10 21:43:22.653


In [24]:
#Label every row with the demand zone of its customer location hex
delhi_zone1 = pd.read_csv('/Users/rapido/Desktop/batching_demand/delhi/delhi_location_data_level1_demand_zone1.csv')
delhi_zone2 = pd.read_csv('/Users/rapido/Desktop/batching_demand/delhi/delhi_location_data_level1_demand_zone2.csv')

#Distinct hex ids listed in each zone file
delhi_zone1_hexes = delhi_zone1['customer_location_hex_8'].unique().tolist()
delhi_zone2_hexes = delhi_zone2['customer_location_hex_8'].unique().tolist()

#zone1 wins when a hex appears in both files; hexes in neither file become 'rest_of_delhi'
customer_hex = df['customer_location_hex_8']
df['zone_name'] = np.select(
    [customer_hex.isin(delhi_zone1_hexes), customer_hex.isin(delhi_zone2_hexes)],
    ['zone1', 'zone2'],
    default='rest_of_delhi',
)
df.head()

Unnamed: 0,order_id,accept_to_pickup_distance,captain_location_hex_8,captain_location_latitude,captain_location_longitude,city_name,customer_location_hex_8,drop_location_hex_8,drop_location_latitude,drop_location_longitude,customer_location_latitude,customer_location_longitude,hhmmss,hour,quarter_hour,service_obj_service_name,time_bucket,weekday,yyyymmdd,order_status,cancel_reason,updated_epoch,event_type,new_cancel_reason,timestamp,zone_name
0,636d75ce26e7152534e21614,4.146,883da102d7fffff,28.541281,77.233696,Delhi,883da102d5fffff,883da10291fffff,28.529666,77.255893,28.541725,77.232414,33606,3,330,Link,03:59,5,20221111,customerCancelled,Drop location denied,1668118250045,customer_cancelled,OCARA,2022-11-10 22:10:50.045,rest_of_delhi
1,636d78e226e7152534e216ee,,,,,Delhi,883da10647fffff,883da10601fffff,28.582844,77.332756,28.584158,77.312462,34914,3,345,Link,03:59,5,20221111,requested,,1668118754836,order_requested,no_cancellation,2022-11-10 22:19:14.836,rest_of_delhi
2,636d76b98049e737e656671a,2.948,883da1184bfffff,28.431278,77.018631,Delhi,883da1184bfffff,883da11a83fffff,28.459147,77.072858,28.431358,77.017555,34001,3,330,Link,03:59,5,20221111,customerCancelled,Taking longer than expected,1668118466983,customer_cancelled,OCARA,2022-11-10 22:14:26.983,rest_of_delhi
3,636d6f321130da36edf43709,,,,,Delhi,883da11291fffff,883da1129bfffff,28.628199,77.108661,28.625067,77.110588,30754,3,300,Link,03:00,5,20221111,requested,,1668116274065,order_requested,no_cancellation,2022-11-10 21:37:54.065,rest_of_delhi
4,636d6fca26e7152534e21493,1.212,8842ca1105fffff,22.03932,73.122238,Delhi,8842ca1105fffff,883da13ae5fffff,28.482238,77.516628,22.03932,73.122238,31026,3,300,Link,03:00,5,20221111,customerCancelled,Change of plans,1668116602653,customer_cancelled,OCARA,2022-11-10 21:43:22.653,rest_of_delhi


In [25]:
#Keep only the columns this analysis needs
df_filter = df[['order_id', 'customer_location_hex_8', 'hour', 'yyyymmdd', 'timestamp', 'event_type', 'new_cancel_reason', 'zone_name']]


def _earliest_event_per_order(events, label_source, label_name, timestamp_name):
    """Return one row per order_id holding its earliest event in ``events``.

    events         -- frame with order_id, timestamp and ``label_source`` columns
    label_source   -- column whose value is kept as the event marker
    label_name     -- name given to the marker column in the result
    timestamp_name -- name given to the event timestamp column in the result
    """
    #Sorting by time first makes drop_duplicates keep the earliest row of each order
    earliest = events.sort_values(by = 'timestamp', ascending=True).drop_duplicates(subset='order_id')
    earliest = earliest[['order_id', label_source, 'timestamp']]
    earliest.columns = ['order_id', label_name, timestamp_name]
    return earliest


def _left_join_on_order_id(left, right):
    """Left-join ``right`` onto ``left`` using order_id as the key."""
    return pd.merge(left, right, on=['order_id'], how='left')


#Base rows: every order_requested event
#NOTE(review): unlike the frames below, this one is not de-duplicated on
#order_id - confirm an order can only have one order_requested event
df_order_created = df_filter[df_filter['event_type'] == 'order_requested']

#Earliest cancellation of each kind, and earliest acceptance, per order
df_cobra = _earliest_event_per_order(
    df_filter[df_filter['new_cancel_reason'] == 'COBRA'], 'new_cancel_reason', 'COBRA', 'timestamp_cobra')
df_ocara = _earliest_event_per_order(
    df_filter[df_filter['new_cancel_reason'] == 'OCARA'], 'new_cancel_reason', 'OCARA', 'timestamp_ocara')
df_cobrm = _earliest_event_per_order(
    df_filter[df_filter['new_cancel_reason'] == 'COBRM'], 'new_cancel_reason', 'COBRM', 'timestamp_cobrm')
df_accepted = _earliest_event_per_order(
    df_filter[df_filter['event_type'] == 'accepted'], 'event_type', 'Accepted', 'timestamp_accepted')

#Attach the event frames to the requested orders one after another;
#orders without a given event get the 'NA' placeholder in its columns
dfs_to_merge = [df_order_created, df_cobra, df_cobrm, df_ocara, df_accepted]
df_final = reduce(_left_join_on_order_id, dfs_to_merge).fillna('NA')
df_final.head()

Unnamed: 0,order_id,customer_location_hex_8,hour,yyyymmdd,timestamp,event_type,new_cancel_reason,zone_name,COBRA,timestamp_cobra,COBRM,timestamp_cobrm,OCARA,timestamp_ocara,Accepted,timestamp_accepted
0,636d78e226e7152534e216ee,883da10647fffff,3,20221111,2022-11-10 22:19:14.836,order_requested,no_cancellation,rest_of_delhi,,,,,,,accepted,2022-11-10 22:19:33.489000
1,636d6f321130da36edf43709,883da11291fffff,3,20221111,2022-11-10 21:37:54.065,order_requested,no_cancellation,rest_of_delhi,,,,,,,,
2,636d78c2b865071698672fcf,883da10647fffff,3,20221111,2022-11-10 22:18:42.174,order_requested,no_cancellation,rest_of_delhi,COBRA,2022-11-10 22:19:03.982000,,,,,,
3,636d7b4e8049e737e6566859,883da11aadfffff,3,20221111,2022-11-10 22:29:34.857,order_requested,no_cancellation,rest_of_delhi,,,,,,,accepted,2022-11-10 22:30:05.789000
4,636d756a26e7152534e215fe,883da102cdfffff,3,20221111,2022-11-10 22:04:26.073,order_requested,no_cancellation,rest_of_delhi,,,,,,,,


In [26]:
#Calculate, in seconds since the order request, the time to cancel (TTC) for
#COBRA / COBRM / OCARA and the time to accept (TTA), and add them to df_final


def _seconds_since_request(events, event_timestamp_col):
    """Seconds between each row's request ``timestamp`` and ``event_timestamp_col``."""
    #pd.to_datetime because the event timestamp columns were passed through fillna('NA') earlier
    elapsed = pd.to_datetime(events[event_timestamp_col]) - pd.to_datetime(events['timestamp'])
    return elapsed.dt.total_seconds()


#Unique positional index so the new columns can be aligned back row by row
df_final = df_final.reset_index(drop=True)

#Filter on the respective order_status
#.copy() gives independent frames, so adding a column is not a chained
#assignment on a slice of df_final
df_cobra = df_final[df_final['COBRA'] == 'COBRA'].copy()
df_ocara = df_final[df_final['OCARA'] == 'OCARA'].copy()
df_cobrm = df_final[df_final['COBRM'] == 'COBRM'].copy()
df_accepted = df_final[df_final['Accepted'] == 'accepted'].copy()

#Calculate TTC for cobrm, ocara, and cobra and tta
df_cobra['ttc_cobra'] = _seconds_since_request(df_cobra, 'timestamp_cobra')
df_cobrm['ttc_cobrm'] = _seconds_since_request(df_cobrm, 'timestamp_cobrm')
df_ocara['ttc_ocara'] = _seconds_since_request(df_ocara, 'timestamp_ocara')
df_accepted['tta'] = _seconds_since_request(df_accepted, 'timestamp_accepted')

#Filter only on the new columns made
df_cobra = df_cobra[['order_id', 'ttc_cobra']]
df_cobrm = df_cobrm[['order_id', 'ttc_cobrm']]
df_ocara = df_ocara[['order_id', 'ttc_ocara']]
df_accepted = df_accepted[['order_id', 'tta']]

#Attach the new columns by row index instead of merging on order_id:
# - a merge multiplies rows whenever an order_id appears more than once
# - re-running the cell overwrites the columns instead of creating _x/_y copies
#Rows without the event get the 'NA' placeholder.
df_final = df_final.assign(
    ttc_cobra=df_cobra['ttc_cobra'],
    ttc_cobrm=df_cobrm['ttc_cobrm'],
    ttc_ocara=df_ocara['ttc_ocara'],
    tta=df_accepted['tta'],
).fillna('NA')
df_final.head()

Unnamed: 0,order_id,customer_location_hex_8,hour,yyyymmdd,timestamp,event_type,new_cancel_reason,zone_name,COBRA,timestamp_cobra,COBRM,timestamp_cobrm,OCARA,timestamp_ocara,Accepted,timestamp_accepted,ttc_cobra,ttc_cobrm,ttc_ocara,tta
0,636d78e226e7152534e216ee,883da10647fffff,3,20221111,2022-11-10 22:19:14.836,order_requested,no_cancellation,rest_of_delhi,,,,,,,accepted,2022-11-10 22:19:33.489000,,,,18.653
1,636d6f321130da36edf43709,883da11291fffff,3,20221111,2022-11-10 21:37:54.065,order_requested,no_cancellation,rest_of_delhi,,,,,,,,,,,,
2,636d78c2b865071698672fcf,883da10647fffff,3,20221111,2022-11-10 22:18:42.174,order_requested,no_cancellation,rest_of_delhi,COBRA,2022-11-10 22:19:03.982000,,,,,,,21.808,,,
3,636d7b4e8049e737e6566859,883da11aadfffff,3,20221111,2022-11-10 22:29:34.857,order_requested,no_cancellation,rest_of_delhi,,,,,,,accepted,2022-11-10 22:30:05.789000,,,,30.932
4,636d756a26e7152534e215fe,883da102cdfffff,3,20221111,2022-11-10 22:04:26.073,order_requested,no_cancellation,rest_of_delhi,,,,,,,,,,,,


In [29]:
#Create a date column (the characters of yyyymmdd from position 6 on, i.e. the day) and a counter column
df_final['date'] = df_final['yyyymmdd'].str[6:]
df_final['count'] = 1

PERCENTILES = [0.5, 0.6, 0.7, 0.8, 0.9]
GROUP_KEYS = ['zone_name', 'yyyymmdd', 'hour']


def _orders_with_metric(flag_col, flag_value, metric_col):
    """Rows of df_final where ``flag_col == flag_value``, with ``metric_col`` cast to float."""
    #.copy() gives an independent frame, so the cast below is not a chained
    #assignment on a slice of df_final
    orders = df_final[df_final[flag_col] == flag_value].copy()
    orders[metric_col] = orders[metric_col].astype(float)
    return orders


def _metric_distribution(orders, metric_col, status_label):
    """describe() of ``metric_col`` per zone/day/hour, tagged with ``status_label``."""
    stats = orders.groupby(GROUP_KEYS)[metric_col].describe(percentiles=PERCENTILES).reset_index()
    stats['order_status'] = status_label
    return stats


df_cobra = _orders_with_metric('COBRA', 'COBRA', 'ttc_cobra')
df_cobrm = _orders_with_metric('COBRM', 'COBRM', 'ttc_cobrm')
df_ocara = _orders_with_metric('OCARA', 'OCARA', 'ttc_ocara')
df_accepted = _orders_with_metric('Accepted', 'accepted', 'tta')

#Get the distribution of ttc and tta (count, mean, std, min, 50th-90th percentiles, max)
df_cobra_grouped = _metric_distribution(df_cobra, 'ttc_cobra', 'COBRA')
df_ocara_grouped = _metric_distribution(df_ocara, 'ttc_ocara', 'OCARA')
df_cobrm_grouped = _metric_distribution(df_cobrm, 'ttc_cobrm', 'COBRM')
df_accepted_grouped = _metric_distribution(df_accepted, 'tta', 'accepted')

#Stack the four tables (long format); order_status tells the metrics apart
dfs_to_merge = [df_cobra_grouped, df_cobrm_grouped, df_ocara_grouped, df_accepted_grouped]
df_metrics_final = pd.concat(dfs_to_merge)
df_metrics_final.head()


Unnamed: 0,zone_name,yyyymmdd,hour,count,mean,std,min,50%,60%,70%,80%,90%,max,order_status
0,rest_of_delhi,20221107,0,289.0,241.566858,495.038523,2.04,118.545,149.8802,190.242,275.651,516.5566,4678.055,COBRA
1,rest_of_delhi,20221107,1,190.0,239.164395,481.491608,1.51,89.6455,120.5232,174.7261,270.3834,667.9474,3916.677,COBRA
2,rest_of_delhi,20221107,2,119.0,243.901546,724.680105,0.645,102.349,137.5408,186.266,220.344,323.0616,6731.763,COBRA
3,rest_of_delhi,20221107,3,97.0,199.852361,349.555568,2.311,111.718,148.4234,188.5968,241.738,369.1464,2445.234,COBRA
4,rest_of_delhi,20221107,4,140.0,214.677986,431.8227,0.869,99.5885,139.86,170.632,232.9148,382.7975,2802.885,COBRA


In [30]:
#Save df_metrics_final to percentile_dist_delhi.csv (relative path), without the index column
df_metrics_final.to_csv('percentile_dist_delhi.csv', index = False)

In [None]:
#Build one wide table: default describe() stats of every metric per zone/day/hour
metric_group_keys = ['zone_name', 'yyyymmdd', 'hour']


def _describe_metric(flag_col, flag_value, metric_col):
    """describe() of ``metric_col`` per zone/day/hour for rows where ``flag_col == flag_value``.

    The stat columns are returned as '<metric_col>_<stat>'.
    """
    rows = df_final.loc[df_final[flag_col] == flag_value, metric_group_keys + [metric_col]]
    #The metric columns of df_final hold 'NA' placeholders on other rows, so
    #they are object dtype; cast to float to get numeric stats from describe()
    rows = rows.astype({metric_col: float})
    stats = rows.groupby(metric_group_keys)[metric_col].describe()
    #Prefix the stat names so the four tables can sit side by side without
    #clashing column names (count, mean, ...)
    return stats.add_prefix(metric_col + '_').reset_index()


df_cobra_grouped = _describe_metric('COBRA', 'COBRA', 'ttc_cobra')
df_ocara_grouped = _describe_metric('OCARA', 'OCARA', 'ttc_ocara')
df_cobrm_grouped = _describe_metric('COBRM', 'COBRM', 'ttc_cobrm')
df_accepted_grouped = _describe_metric('Accepted', 'accepted', 'tta')

#Merge the dfs together
#The grouped frames are keyed by yyyymmdd (they have no 'date' column).
#Left joins anchored on the COBRA table: zone/day/hour groups without any
#COBRA order are not in the result; missing stats get the 'NA' placeholder.
dfs_to_merge = [df_cobra_grouped, df_cobrm_grouped, df_ocara_grouped, df_accepted_grouped]
df_metrics_final = reduce(lambda  left, right: pd.merge(left, right,on=metric_group_keys, how='left'), dfs_to_merge).fillna('NA')
df_metrics_final.head()

In [45]:
#Save the results
#Writes whatever df_metrics_final currently holds to ttc_delhi.csv (relative path), without the index column
df_metrics_final.to_csv('ttc_delhi.csv', index = False)

In [44]:
#Show the (rows, columns) size of df_final
df_final.shape

(3939199, 22)