In [1]:
#Import libraries
import os
import warnings

import h3
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pyhive import presto
from shapely.geometry import Polygon, Point
#Silence every Python warning for the rest of the session.
#NOTE(review): this also hides pandas chained-assignment and deprecation warnings
#that later cells may trigger - consider narrowing the filter.
warnings.filterwarnings('ignore')
#Allow up to 500 columns to be shown when a DataFrame is displayed
pd.set_option('display.max_columns', 500)

In [2]:
#Define presto connection settings
#Each value can be overridden with an environment variable (PRESTO_HOST, PRESTO_PORT,
#PRESTO_USERNAME) so the notebook can be run by someone else without editing this cell.
#The fallbacks are the values this cell has always used, so behaviour is unchanged
#when none of the variables is set.
presto_host = os.environ.get('PRESTO_HOST', 'presto.processing.yoda.run')
presto_port = os.environ.get('PRESTO_PORT', '80')
username = os.environ.get('PRESTO_USERNAME', 'aditya.bhattar@rapido.bike')

#Create connection to presto host
connection = presto.connect(presto_host, presto_port, username = username)

In [3]:
#Query order-level rows from orders.order_logs_snapshot
#Filters: city_name 'Bangalore', service_obj_service_name 'Link',
#and yyyymmdd between '20220919' and '20221016' (both ends included).

q = """
select order_id, accept_to_pickup_distance, city_name, customer_location_hex_8, customer_location_latitude, 
customer_location_longitude, hhmmss, hour, quarter_hour, service_obj_service_name, time_bucket, weekday, yyyymmdd,
order_status, cancel_reason, eta
from orders.order_logs_snapshot
where yyyymmdd >= '20220919'
and yyyymmdd <= '20221016'
and city_name in ('Bangalore')
and service_obj_service_name = 'Link'
"""

#Run the query and load the result into a pandas DataFrame.
#Later cells work on a .copy() of df_copy, so the query does not have to be re-run.
df_copy = pd.read_sql(q, connection)

In [16]:
#Query data for getting the gross number of captains
#The outer query counts distinct captain_id per
#(pickup_location_hex_8, yyyymmdd, order_type, time), where time is the hhmmss column
#of orders.order_logs_immutable. Rows are limited to the 16 listed pickup hexes,
#order_type 'app', and yyyymmdd between '20220919' and '20221016' (both ends included).
#NOTE(review): the inner select also computes map_riders_count, event_type_agg (a window
#aggregate) and several other columns that the outer query never reads - they look
#removable, confirm before simplifying.
q_captain = """
select pickup_location_hex_8, yyyymmdd, order_type, time, count(distinct captain_id) as gross_captains from 
(select 
        order_id, updated_epoch, yyyymmdd, pickup_location_hex_8, captain_location_hex_8, drop_location_hex_8 as hex_id,
        city_name, order_type, captain_id, event_type, order_status, hhmmss as time,
        cardinality(array_distinct(array_remove(split(replace(replace(replace(map_riders, '['), ']'), '"'), ','), ''))) as map_riders_count,
        array_agg(event_type) over (partition by order_id) as event_type_agg, updated_hhmmss
        
from 
        orders.order_logs_immutable
where 
        yyyymmdd >= '20220919'
        and yyyymmdd <= '20221016'
        and pickup_location_hex_8 IN ('8861892087fffff', '8861892097fffff', '88618920a3fffff', '88618920b1fffff', '88618920b9fffff', '88618920bbfffff',
         '88618920bdfffff', '8861892091fffff', '8861892093fffff', '8861892095fffff', '88618920a7fffff', '88618920abfffff', '88618920b3fffff', '88618920b5fffff',
          '88618920b7fffff', '8861892e49fffff') and order_type IN ('app'))
group by 1, 2, 3, 4
"""

#Run the query and load the result into a pandas DataFrame
df_captain_copy = pd.read_sql(q_captain, connection)

In [17]:
#Create a working copy of the frame retrieved from presto, leaving df_copy untouched
df = df_copy.copy()
#Preview the first rows
df.head()

Unnamed: 0,order_id,accept_to_pickup_distance,city_name,customer_location_hex_8,customer_location_latitude,customer_location_longitude,hhmmss,hour,quarter_hour,service_obj_service_name,time_bucket,weekday,yyyymmdd,order_status,cancel_reason,eta
0,634c32b609ef5036d3ece7d3,1.494,Bangalore,88618920a3fffff,12.964667,77.71627,220502,22,2200,Link,22:00,7,20221016,dropped,,6.0
1,634c32ef6a109e753ae075b0,,Bangalore,88618925d9fffff,12.907373,77.600839,220559,22,2200,Link,22:00,7,20221016,expired,,0.0
2,634c32fc1b4dce751501d59c,,Bangalore,8860145b5dfffff,12.973323,77.577431,220612,22,2200,Link,22:00,7,20221016,customerCancelled,order cancelled before rider accepted,0.0
3,634c34336a109e753ae077a9,1.231,Bangalore,8861892531fffff,12.958807,77.64312,221123,22,2200,Link,22:00,7,20221016,customerCancelled,Customer asked to cancel,5.0
4,634c348278c9d5232134da03,,Bangalore,8860145a35fffff,12.961984,77.53376,221242,22,2200,Link,22:00,7,20221016,customerCancelled,order cancelled before rider accepted,1.0


In [18]:
#Create a working copy of the captain data retrieved from presto, leaving
#df_captain_copy untouched, and preview the first rows
df_captain = df_captain_copy.copy()
df_captain.head()

Unnamed: 0,pickup_location_hex_8,yyyymmdd,order_type,time,gross_captains
0,88618920bdfffff,20220929,app,85329,22
1,88618920b1fffff,20221002,app,153408,3
2,88618920a3fffff,20221012,app,93507,12
3,8861892093fffff,20220920,app,191329,26
4,8861892093fffff,20220930,app,113841,7


In [19]:
#Load the level-1 zone file; only its customer_location_hex_8 column is used here
df_level_1 = pd.read_csv('bangalore_location_data_level1_demand_zone2.csv')

level1_hexes = list(df_level_1['customer_location_hex_8'].unique())

#Keep only the orders whose customer hex belongs to the zone.
#.copy() makes the result an independent frame, so the column assignments below (and in
#later cells) write to it directly instead of to a slice of df.
df_filter_level1 = df[df['customer_location_hex_8'].isin(level1_hexes)].copy()
#Constant 1 per order, so that summing 'count' yields a number of orders
df_filter_level1['count'] = 1
#Everything from index 6 of the yyyymmdd string onward, i.e. the 'dd' part
df_filter_level1['date'] = df_filter_level1['yyyymmdd'].str[6:]

In [20]:
#Bucket cancel_reason into three labels:
#  'order cancelled before rider accepted'   -> COBRA
#  'Order cancelled before rider was mapped' -> COBRM
#  anything else (including a missing reason) -> OCARA
cancel_reason_col = df_filter_level1['cancel_reason']
is_cobra = cancel_reason_col == 'order cancelled before rider accepted'
is_cobrm = cancel_reason_col == 'Order cancelled before rider was mapped'
cobrm_or_ocara = np.where(is_cobrm, 'COBRM', 'OCARA')
df_filter_level1['new_cancel_reason'] = np.where(is_cobra, 'COBRA', cobrm_or_ocara)

In [21]:
#Zone-level metrics, calculated per (date, hour)
#Every ratio below uses the number of orders in that (date, hour) as its denominator.

metric_columns = ['date', 'hour', 'OCARA', 'COBRA', 'COBRM', 'G2N', 'ETA', 'FM']
hourly_metrics = []
#groupby visits each (date, hour) exactly once, in sorted key order, instead of
#re-filtering the whole frame for every combination
for (date, hour), df_hourly_level1 in df_filter_level1.groupby(['date', 'hour']):
    orders_in_hour = df_hourly_level1.shape[0]
    order_status_col = df_hourly_level1['order_status']

    #Cancellation buckets of the customer-cancelled orders in this hour
    cancel_buckets = df_hourly_level1.loc[order_status_col == 'customerCancelled', 'new_cancel_reason']

    #OCARA / COBRA / COBRM: customer-cancelled orders in each bucket / all orders
    #(int() keeps the division and rounding in plain Python numbers)
    ocara_zone_level1 = round(int((cancel_buckets == 'OCARA').sum()) / orders_in_hour, 4)
    cobra_zone_level1 = round(int((cancel_buckets == 'COBRA').sum()) / orders_in_hour, 4)
    cobrm_zone_level1 = round(int((cancel_buckets == 'COBRM').sum()) / orders_in_hour, 4)

    #G2N: orders with status 'dropped' / all orders
    dropped_rides_level_1 = df_hourly_level1[order_status_col == 'dropped']
    g2n_zone_level1 = round(dropped_rides_level_1.shape[0] / orders_in_hour, 4)

    #ETA: mean eta over all orders in the hour
    eta_zone_level1 = round(df_hourly_level1['eta'].mean(), 2)

    #FM: mean accept_to_pickup_distance over dropped orders only
    fm_zone_level1 = round(dropped_rides_level_1['accept_to_pickup_distance'].mean(), 2)

    #Collect one row per (date, hour)
    hourly_metrics.append([date, hour, ocara_zone_level1, cobra_zone_level1, cobrm_zone_level1,
                           g2n_zone_level1, eta_zone_level1, fm_zone_level1])

#Passing the column names to the constructor also works when no rows were collected
metrics_df = pd.DataFrame(hourly_metrics, columns = metric_columns)
metrics_df = metrics_df.sort_values(by = ['date', 'hour'], ascending=True)
metrics_df

Unnamed: 0,date,hour,OCARA,COBRA,COBRM,G2N,ETA,FM
82,01,00,0.3057,0.4841,0.0000,0.1146,5.27,2.19
72,01,01,0.1360,0.4800,0.0240,0.1040,4.38,2.08
80,01,02,0.2750,0.5000,0.0500,0.0750,5.72,1.47
84,01,03,0.0896,0.3731,0.0448,0.0149,4.30,3.71
89,01,04,0.0845,0.3521,0.1972,0.0704,3.82,3.03
...,...,...,...,...,...,...,...,...
482,30,19,0.1342,0.4541,0.0015,0.2321,2.91,1.41
480,30,20,0.1613,0.4578,0.0000,0.2318,2.98,1.50
492,30,21,0.1934,0.4177,0.0010,0.3011,3.38,1.49
494,30,22,0.2607,0.3712,0.0000,0.3129,4.16,1.63


In [22]:
#Get the number of rides by order status for every (date, hour)
counted_statuses = ['aborted', 'customerCancelled', 'dropped', 'expired']
rides_by_order = pd.pivot_table(df_filter_level1, index = ['date', 'hour'], columns = 'order_status', values = 'count', aggfunc = 'sum',
 fill_value=0).reset_index().rename_axis(None, axis=1)

#pivot_table only creates a column for a status that actually occurs in the data;
#add any missing one as zeros so the selection below cannot raise a KeyError
for status in counted_statuses:
    if status not in rides_by_order.columns:
        rides_by_order[status] = 0

#'total' counts only the four statuses listed above.
#NOTE(review): the ratios in metrics_df divide by all orders of the hour - confirm
#that no other order_status value exists, otherwise the two denominators differ.
rides_by_order['total'] = rides_by_order[counted_statuses].sum(axis = 1)
rides_by_order['Expiry %'] = (rides_by_order['expired'] / rides_by_order['total']).round(4)

#Merge with metrics
#Columns left over from a previous run of this cell are dropped first, so re-running
#the cell does not produce 'Expiry %_x' / 'Expiry %_y' duplicates
metrics_df = metrics_df.drop(columns = ['Expiry %', 'total'], errors = 'ignore')
metrics_df = metrics_df.merge(rides_by_order[['date', 'hour', 'Expiry %', 'total']], on = ['date', 'hour'], how = 'left')
metrics_df

Unnamed: 0,date,hour,OCARA,COBRA,COBRM,G2N,ETA,FM,Expiry %,total
0,01,00,0.3057,0.4841,0.0000,0.1146,5.27,2.19,0.0892,157
1,01,01,0.1360,0.4800,0.0240,0.1040,4.38,2.08,0.2560,125
2,01,02,0.2750,0.5000,0.0500,0.0750,5.72,1.47,0.1000,40
3,01,03,0.0896,0.3731,0.0448,0.0149,4.30,3.71,0.4776,67
4,01,04,0.0845,0.3521,0.1972,0.0704,3.82,3.03,0.2958,71
...,...,...,...,...,...,...,...,...,...,...
667,30,19,0.1342,0.4541,0.0015,0.2321,2.91,1.41,0.1753,1295
668,30,20,0.1613,0.4578,0.0000,0.2318,2.98,1.50,0.1468,1376
669,30,21,0.1934,0.4177,0.0010,0.3011,3.38,1.49,0.0828,1003
670,30,22,0.2607,0.3712,0.0000,0.3129,4.16,1.63,0.0475,652


In [23]:
#Split the hhmmss string in 'time' into hour / minute / second, and take the 'dd'
#part of yyyymmdd as the date
df_captain['hour'] = df_captain['time'].str[0:2]
#minute and second are converted to float; hour and date stay strings because they
#are used as merge keys against metrics_df
df_captain['minute'] = df_captain['time'].str[2:4].astype(float)
df_captain['second'] = df_captain['time'].str[4:].astype(float)
df_captain['date'] = df_captain['yyyymmdd'].str[6:]

#Assign every row to a 20-second batch within its minute.
#right = False makes the bins [0, 20), [20, 40) and [40, 61): seconds 20 and 40 open
#the next batch, so the batches cover 20 seconds each (0-19, 20-39, 40-59). With
#right-closed bins the first batch would hold 21 seconds and the last only 19.
#The final edge is 61 so that a value of 60 would still land in '40-60'.
df_captain['20_sec_batch'] = pd.cut(df_captain['second'], bins = [0, 20, 40, 61], right = False,
                                    labels = ['0-20', '20-40', '40-60'])
df_captain['count'] = 1
df_captain.head()

Unnamed: 0,pickup_location_hex_8,yyyymmdd,order_type,time,gross_captains,hour,minute,second,date,20_sec_batch,count
0,88618920bdfffff,20220929,app,85329,22,8,53.0,29.0,29,20-40,1
1,88618920b1fffff,20221002,app,153408,3,15,34.0,8.0,2,0-20,1
2,88618920a3fffff,20221012,app,93507,12,9,35.0,7.0,12,0-20,1
3,8861892093fffff,20220920,app,191329,26,19,13.0,29.0,20,20-40,1
4,8861892093fffff,20220930,app,113841,7,11,38.0,41.0,30,40-60,1


In [24]:
#Get the number of captains coming in each hour and every 20_sec_batch
captain_hourly = df_captain.groupby(['date', 'hour'])['gross_captains'].sum().reset_index()
captain_hourly = captain_hourly.rename(columns={'gross_captains': 'hourly_gross_captains'})

#'20_sec_batch' is categorical (it comes from pd.cut). observed = False is stated
#explicitly so that batches without any row are kept with a sum of 0, independent of the
#pandas version's default. The published figures reflect this: e.g. 797 hourly captains
#against a batch average of 4.43, i.e. roughly 180 batches per hour.
captain_20_sec_batch = df_captain.groupby(['date', 'hour', 'minute', '20_sec_batch'], observed = False)['gross_captains'].sum().reset_index()

#Get the average number of captains available in each 20_sec_batch per hour
captain_20_sec_average = captain_20_sec_batch.groupby(['date', 'hour'])['gross_captains'].mean().reset_index()
captain_20_sec_average['gross_captains'] = captain_20_sec_average['gross_captains'].apply(lambda x: round(x, 2))
captain_20_sec_average = captain_20_sec_average.rename(columns= {'gross_captains':'20_sec_gross_captains'})

#Merge the hourly captains and 20_sec_captains with the metrics table
#Columns left over from a previous run of this cell are dropped first, so re-running
#the cell does not produce '_x' / '_y' duplicates
metrics_df = metrics_df.drop(columns = ['hourly_gross_captains', '20_sec_gross_captains'], errors = 'ignore')
metrics_df = metrics_df.merge(captain_hourly, on = ['date', 'hour'], how = 'left')
metrics_df = metrics_df.merge(captain_20_sec_average, on = ['date', 'hour'], how = 'left')
metrics_df

Unnamed: 0,date,hour,OCARA,COBRA,COBRM,G2N,ETA,FM,Expiry %,total,hourly_gross_captains,20_sec_gross_captains
0,01,00,0.3057,0.4841,0.0000,0.1146,5.27,2.19,0.0892,157,797,4.43
1,01,01,0.1360,0.4800,0.0240,0.1040,4.38,2.08,0.2560,125,559,3.11
2,01,02,0.2750,0.5000,0.0500,0.0750,5.72,1.47,0.1000,40,163,0.91
3,01,03,0.0896,0.3731,0.0448,0.0149,4.30,3.71,0.4776,67,246,1.37
4,01,04,0.0845,0.3521,0.1972,0.0704,3.82,3.03,0.2958,71,194,1.08
...,...,...,...,...,...,...,...,...,...,...,...,...
667,30,19,0.1342,0.4541,0.0015,0.2321,2.91,1.41,0.1753,1295,13610,75.61
668,30,20,0.1613,0.4578,0.0000,0.2318,2.98,1.50,0.1468,1376,12560,69.78
669,30,21,0.1934,0.4177,0.0010,0.3011,3.38,1.49,0.0828,1003,7730,42.94
670,30,22,0.2607,0.3712,0.0000,0.3129,4.16,1.63,0.0475,652,5365,29.81


In [25]:
#Load the demand summary and add it to the metrics
df_demand = pd.read_csv('demand_bangalore.csv')
df_demand = df_demand.sort_values(by = ['date','hour'], ascending = True).reset_index(drop = True)

#Match every metrics row to the demand row with the same (date, hour) instead of
#relying on both frames having the same rows in the same order: with positional
#assignment a single missing or extra hour in the CSV would shift every later value
#onto the wrong row without any error.
#Keys are compared as integers because metrics_df stores them as zero-padded strings.
#NOTE(review): assumes the CSV's date / hour hold day-of-month and hour-of-day values
#that convert cleanly with int - confirm against the file.
demand_by_key = df_demand.set_index([df_demand['date'].astype(int), df_demand['hour'].astype(int)])
metrics_keys = pd.MultiIndex.from_arrays([metrics_df['date'].astype(int), metrics_df['hour'].astype(int)])
for demand_column in ['rides_per_hour', 'rides_per_20_sec']:
    #reindex returns NaN for a (date, hour) absent from the CSV and raises if the CSV
    #holds the same (date, hour) twice
    metrics_df[demand_column] = demand_by_key[demand_column].reindex(metrics_keys).to_numpy()
metrics_df

Unnamed: 0,date,hour,OCARA,COBRA,COBRM,G2N,ETA,FM,Expiry %,total,hourly_gross_captains,20_sec_gross_captains,rides_per_hour,rides_per_20_sec
0,01,00,0.3057,0.4841,0.0000,0.1146,5.27,2.19,0.0892,157,797,4.43,52.0,0.29
1,01,01,0.1360,0.4800,0.0240,0.1040,4.38,2.08,0.2560,125,559,3.11,42.0,0.23
2,01,02,0.2750,0.5000,0.0500,0.0750,5.72,1.47,0.1000,40,163,0.91,13.0,0.07
3,01,03,0.0896,0.3731,0.0448,0.0149,4.30,3.71,0.4776,67,246,1.37,22.0,0.12
4,01,04,0.0845,0.3521,0.1972,0.0704,3.82,3.03,0.2958,71,194,1.08,24.0,0.13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,30,19,0.1342,0.4541,0.0015,0.2321,2.91,1.41,0.1753,1295,13610,75.61,432.0,2.40
668,30,20,0.1613,0.4578,0.0000,0.2318,2.98,1.50,0.1468,1376,12560,69.78,459.0,2.55
669,30,21,0.1934,0.4177,0.0010,0.3011,3.38,1.49,0.0828,1003,7730,42.94,334.0,1.86
670,30,22,0.2607,0.3712,0.0000,0.3129,4.16,1.63,0.0475,652,5365,29.81,217.0,1.21


In [26]:
#Load the decrease in FM due to batching and add it to the metrics
decrease_FM = pd.read_excel('decrease_FM_batching.xlsx')
decrease_FM = decrease_FM.sort_values(by = ['date', 'hour'], ascending = True).reset_index(drop = True)

#Match every metrics row to the row with the same (date, hour) instead of relying on
#both frames having the same rows in the same order: with positional assignment a
#single missing or extra hour in the workbook would shift every later value onto the
#wrong row without any error.
#Keys are compared as integers because metrics_df stores them as zero-padded strings.
#NOTE(review): assumes the workbook's date / hour hold day-of-month and hour-of-day
#values that convert cleanly with int - confirm against the file.
fm_decrease_by_key = decrease_FM.set_index([decrease_FM['date'].astype(int), decrease_FM['hour'].astype(int)])['FM_decrease']
fm_metrics_keys = pd.MultiIndex.from_arrays([metrics_df['date'].astype(int), metrics_df['hour'].astype(int)])
#reindex returns NaN for a (date, hour) absent from the workbook and raises if the
#workbook holds the same (date, hour) twice
metrics_df['FM_decrease'] = fm_decrease_by_key.reindex(fm_metrics_keys).to_numpy()
metrics_df

Unnamed: 0,date,hour,OCARA,COBRA,COBRM,G2N,ETA,FM,Expiry %,total,hourly_gross_captains,20_sec_gross_captains,rides_per_hour,rides_per_20_sec,FM_decrease
0,01,00,0.3057,0.4841,0.0000,0.1146,5.27,2.19,0.0892,157,797,4.43,52.0,0.29,0.0091
1,01,01,0.1360,0.4800,0.0240,0.1040,4.38,2.08,0.2560,125,559,3.11,42.0,0.23,0.0033
2,01,02,0.2750,0.5000,0.0500,0.0750,5.72,1.47,0.1000,40,163,0.91,13.0,0.07,-0.0530
3,01,03,0.0896,0.3731,0.0448,0.0149,4.30,3.71,0.4776,67,246,1.37,22.0,0.12,0.0433
4,01,04,0.0845,0.3521,0.1972,0.0704,3.82,3.03,0.2958,71,194,1.08,24.0,0.13,0.0706
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,30,19,0.1342,0.4541,0.0015,0.2321,2.91,1.41,0.1753,1295,13610,75.61,432.0,2.40,-0.0659
668,30,20,0.1613,0.4578,0.0000,0.2318,2.98,1.50,0.1468,1376,12560,69.78,459.0,2.55,-0.0785
669,30,21,0.1934,0.4177,0.0010,0.3011,3.38,1.49,0.0828,1003,7730,42.94,334.0,1.86,-0.0306
670,30,22,0.2607,0.3712,0.0000,0.3129,4.16,1.63,0.0475,652,5365,29.81,217.0,1.21,-0.0693


In [27]:
#Write the final metrics table to metrics_bangalore.csv, without the row index
metrics_df.to_csv('metrics_bangalore.csv', index = False)