In [1]:
#Import libraries
from pyhive import presto
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import h3
from shapely.geometry import Polygon, Point
import calendar
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 500)

In [2]:
#Define presto credentials
presto_host = 'presto.processing.yoda.run'
presto_port = '80'
username = 'aditya.bhattar@rapido.bike'

#Create connection to presto host
connection = presto.connect(presto_host,presto_port,username = username)

In [5]:
#Query data to calculate gsr
q = """
select 
        yyyymmdd, hour,
        count(distinct order_id) as Gross_Orders,
        count(case when (time_diff >=20 or time_diff is NULL) then customer_id end) as GSR,
        count(distinct case when event_type in ('dropped') and spd_fraud_flag != True then order_id end) as net_orders,
        (case when count(distinct case when event_type in ('dropped') and spd_fraud_flag != True then order_id end) = 0 then 0 else
        sum(case when event_type in ('accepted', 'rider_busy', 'rider_reject') then 1 else 0 end)
        /cast(count(distinct case when event_type in ('dropped') and spd_fraud_flag != True then order_id end) as real) end) as ME
from
(
select
        *, 
        date_diff('minute',  LAG(FROM_UNIXTIME(CAST(epoch AS double) /1000)) OVER(PARTITION BY yyyymmdd, hour, city_name, customer_id, order_type, customer_location_hex_8
        ORDER BY yyyymmdd, customer_id, city_name, customer_location_hex_8, order_type, epoch), FROM_UNIXTIME(CAST(epoch AS double) /1000)) as time_diff,
        date_diff('minute',  LAG(FROM_UNIXTIME(CAST(epoch AS double) /1000)) OVER(PARTITION BY yyyymmdd, hour, city_name, customer_id, order_type, order_status, customer_location_hex_8, spd_fraud_flag 
        ORDER BY yyyymmdd, customer_id, city_name, customer_location_hex_8, order_type, epoch) , FROM_UNIXTIME(CAST(epoch AS double) /1000)) as time_diff_drop
from
(
select 
    order_id, city_name, order_type, cancel_reason, yyyymmdd, customer_id, epoch, substr(hhmmss,1,2) as hour,
    row_number() over (partition by order_id order by updated_epoch desc) as Rank_1,
    
    event_type, order_status, captain_id, pickup_cluster, customer_location_hex_8, spd_fraud_flag, pickup_location_hex_8

from
    orders.order_logs_immutable 
where
    yyyymmdd >= '20230306'
    and yyyymmdd <= '20230310'
    and pickup_location_hex_8 IN ('883da11abbfffff', '883da11ab5fffff', '883da11ab7fffff', '883da11ad7fffff', '883da118cbfffff', '883da11ad5fffff', '883da118dbfffff',
          '883da11aa5fffff', '883da11a95fffff', '883da11ab3fffff', '883da11aa9fffff', '883da11a87fffff', '883da11a9bfffff', '883da11ab9fffff', '883da11acdfffff',
            '883da1185bfffff', '883da118c9fffff', '883da11ab1fffff', '883da11853fffff', '883da11a91fffff', '883da11a83fffff', '883da11ac3fffff', '883da11ae1fffff',
              '883da11aadfffff', '883da11abdfffff', '883da11a99fffff', '883da11aabfffff', '883da11aa3fffff', '883da11aebfffff', '883da11a8bfffff', '883da11a8dfffff',
                '883da11ae7fffff', '883da11ac1fffff', '883da11a9dfffff', '883da11a85fffff', '883da11ae9fffff', '883da11a33fffff', '883da118ddfffff',
                  '883da11a13fffff', '883da11ac5fffff', '883da11aa1fffff', '883da11851fffff', '883da1036dfffff', '883da11aa7fffff', '883da11ae3fffff',
                    '883da11a93fffff', '883da1034dfffff', '883da118d1fffff', '883da118d7fffff', '883da11ae5fffff', '883da10369fffff', '883da11a81fffff',
                      '883da11aedfffff', '883da11a89fffff', '883da11a97fffff', '883da11ac7fffff', '883da10365fffff', '883da118d3fffff', '883da10345fffff',
                        '883da118d9fffff', '883da11859fffff')
     and order_type IN ('app')
     and hour in ('08', '09', '10', '11')
))
group by 1,2
"""

df_gsr_copy = pd.read_sql(q, connection)

In [6]:
#Create a copy of the dataset and view it
df_gsr = df_gsr_copy.copy()
df_gsr.head()

Unnamed: 0,yyyymmdd,hour,Gross_Orders,GSR,net_orders,ME
0,20230310,8,1434,1221,855,4.782456
1,20230310,11,900,756,541,4.57671
2,20230310,10,1621,1344,929,5.006459
3,20230307,11,863,725,507,4.737673
4,20230307,10,1454,1210,831,4.884477


In [7]:
#Calculate GSR2Net
df_gsr['GSR2Net'] = round(df_gsr['net_orders']/df_gsr['GSR'], 4)
df_gsr = df_gsr[['yyyymmdd', 'hour', 'GSR', 'GSR2Net']]

In [9]:
#Query data from supply_login_servicelevel_kpi
q = """
select
        yyyymmdd, substr(hhmm, 1, 2) as hour,
        sum(case when cast(status AS varchar) IN ('2','3','6','7','8','10') then duration end) as login_time,
        sum(case when cast(status AS varchar) IN ('2') then duration end) as idle_time
from 
(select 
        a.*, city, cluster
        
from 
        hive.datasets.supplycursory_history a 
        left join datasets.city_cluster_hex b 
        on a.location = b.hex_id and a.resolution = cast(b.resolution as varchar)
where 
        yyyymmdd >= '20230306'
        and yyyymmdd <= '20230310'
        and city in ('Delhi')
        and a.resolution = '8' 
        and service in ('Link')
        and location in ('883da11abbfffff', '883da11ab5fffff', '883da11ab7fffff', '883da11ad7fffff', '883da118cbfffff', '883da11ad5fffff', '883da118dbfffff',
          '883da11aa5fffff', '883da11a95fffff', '883da11ab3fffff', '883da11aa9fffff', '883da11a87fffff', '883da11a9bfffff', '883da11ab9fffff', '883da11acdfffff',
            '883da1185bfffff', '883da118c9fffff', '883da11ab1fffff', '883da11853fffff', '883da11a91fffff', '883da11a83fffff', '883da11ac3fffff', '883da11ae1fffff',
              '883da11aadfffff', '883da11abdfffff', '883da11a99fffff', '883da11aabfffff', '883da11aa3fffff', '883da11aebfffff', '883da11a8bfffff', '883da11a8dfffff',
                '883da11ae7fffff', '883da11ac1fffff', '883da11a9dfffff', '883da11a85fffff', '883da11ae9fffff', '883da11a33fffff', '883da118ddfffff',
                  '883da11a13fffff', '883da11ac5fffff', '883da11aa1fffff', '883da11851fffff', '883da1036dfffff', '883da11aa7fffff', '883da11ae3fffff',
                    '883da11a93fffff', '883da1034dfffff', '883da118d1fffff', '883da118d7fffff', '883da11ae5fffff', '883da10369fffff', '883da11a81fffff',
                      '883da11aedfffff', '883da11a89fffff', '883da11a97fffff', '883da11ac7fffff', '883da10365fffff', '883da118d3fffff', '883da10345fffff',
                        '883da118d9fffff', '883da11859fffff')
        and substr(hhmm, 1, 2) in ('08', '09', '10', '11')
)
group by 1,2
"""

#Load data into pandas table
df_tse_copy = pd.read_sql(q, connection)

In [10]:
#Create a copy of the df and view it
df_tse = df_tse_copy.copy()
df_tse.head()

Unnamed: 0,yyyymmdd,hour,login_time,idle_time
0,20230306,10,1805373,833838
1,20230306,8,975437,536735
2,20230308,11,267567,67835
3,20230309,10,985720,310589
4,20230309,11,1272546,662835


In [11]:
df_tse['%time_spent_earning'] = round((df_tse['login_time'] - df_tse['idle_time'])/df_tse['login_time'], 4)

In [12]:
df_gsr = df_gsr.merge(df_tse, on = ['yyyymmdd', 'hour'], how = 'left')
df_gsr['login_hours'] = df_gsr['login_time'].apply(lambda x: x/3600)
df_gsr['stress'] = round(df_gsr['GSR']/df_gsr['login_hours'], 4)
df_gsr

Unnamed: 0,yyyymmdd,hour,GSR,GSR2Net,login_time,idle_time,%time_spent_earning,login_hours,stress
0,20230310,8,1221,0.7002,694380,332791,0.5207,192.883333,6.3303
1,20230310,11,756,0.7156,1552904,902406,0.4189,431.362222,1.7526
2,20230310,10,1344,0.6912,1324182,559222,0.5777,367.828333,3.6539
3,20230307,11,725,0.6993,1698147,1033656,0.3913,471.7075,1.537
4,20230307,10,1210,0.6868,1503991,760161,0.4946,417.775278,2.8963
5,20230307,9,1973,0.5717,1059288,307028,0.7102,294.246667,6.7053
6,20230306,11,940,0.733,2209047,1450653,0.3433,613.624167,1.5319
7,20230306,9,2438,0.6071,1356574,322089,0.7626,376.826111,6.4698
8,20230308,9,259,0.529,297023,113423,0.6181,82.506389,3.1392
9,20230309,9,2399,0.4389,723524,134550,0.814,200.978889,11.9366


In [13]:
df_gsr.sort_values(by = 'hour')

Unnamed: 0,yyyymmdd,hour,GSR,GSR2Net,login_time,idle_time,%time_spent_earning,login_hours,stress
0,20230310,8,1221,0.7002,694380,332791,0.5207,192.883333,6.3303
15,20230306,8,1352,0.7234,975437,536735,0.4497,270.954722,4.9898
13,20230308,8,241,0.5892,265210,143732,0.458,73.669444,3.2714
12,20230309,8,1419,0.5518,454180,118751,0.7385,126.161111,11.2475
10,20230307,8,1061,0.7238,776595,427308,0.4498,215.720833,4.9184
14,20230310,9,2231,0.5634,934263,198057,0.788,259.5175,8.5967
8,20230308,9,259,0.529,297023,113423,0.6181,82.506389,3.1392
9,20230309,9,2399,0.4389,723524,134550,0.814,200.978889,11.9366
5,20230307,9,1973,0.5717,1059288,307028,0.7102,294.246667,6.7053
7,20230306,9,2438,0.6071,1356574,322089,0.7626,376.826111,6.4698


In [14]:
df_gsr.groupby(['hour'])['stress'].median()

hour
08    4.9898
09    6.7053
10    3.6539
11    1.7526
Name: stress, dtype: float64

In [3]:
#Query data from order_logs_snapshot

q = """
select order_id, accept_to_pickup_distance, captain_location_hex_8, captain_location_latitude,
captain_location_longitude, city_name, customer_location_hex_8, customer_location_latitude, 
pickup_location_hex_8, pickup_location_latitude, pickup_location_longitude, drop_location_hex_8, drop_location_latitude, drop_location_longitude,
customer_location_longitude, hhmmss, hour, quarter_hour, service_obj_service_name, time_bucket, weekday, yyyymmdd,
order_status, cancel_reason
from orders.order_logs_snapshot
where yyyymmdd >= '20230206'
and yyyymmdd <= '20230210'
and city_name in ('Hyderabad')
and service_obj_service_name = 'Link'
"""

#Load data into pandas table
df_copy = pd.read_sql(q, connection)

In [4]:
#Create a copy of the df retrived from presto
df = df_copy.copy()
df.head()

Unnamed: 0,order_id,accept_to_pickup_distance,captain_location_hex_8,captain_location_latitude,captain_location_longitude,city_name,customer_location_hex_8,customer_location_latitude,pickup_location_hex_8,pickup_location_latitude,pickup_location_longitude,drop_location_hex_8,drop_location_latitude,drop_location_longitude,customer_location_longitude,hhmmss,hour,quarter_hour,service_obj_service_name,time_bucket,weekday,yyyymmdd,order_status,cancel_reason
0,63e2b54a7e350925a45f934d,0.642,8860b525b3fffff,17.350151,78.492432,Hyderabad,8860b525b1fffff,17.353106,8860b525b1fffff,17.353259,78.501838,8860b525b3fffff,17.350151,78.492432,78.501839,20210,2,200,Link,02:00,3,20230208,dropped,Unable to swipe Arrived
1,63e2b7531eb27f622fc1760b,2.672,8860a25999fffff,17.422369,78.382027,Hyderabad,8860a25999fffff,17.422356,8860a25991fffff,17.426311,78.382179,8860a24b0dfffff,17.490294,78.379847,78.38215,21051,2,200,Link,02:00,3,20230208,customerCancelled,order cancelled before rider accepted
2,63e2ba207e350925a45f94a3,0.331,8860a25b6bfffff,17.38376,78.506737,Hyderabad,8860a25b53fffff,17.361223,8860a25b53fffff,17.361225,78.476958,8860a25b6bfffff,17.38376,78.506737,78.476959,22248,2,215,Link,02:00,3,20230208,dropped,
3,63e2b6037e350925a45f937c,0.973,8860a25b11fffff,17.376757,78.467293,Hyderabad,8860b52c93fffff,17.430814,8860a2595bfffff,17.403901,78.452282,8860a25b11fffff,17.376757,78.467293,78.487615,20515,2,200,Link,02:00,3,20230208,dropped,
4,63e2bcf31eb27f622fc177a6,1.149,8860a25b5bfffff,17.358599,78.482552,Hyderabad,8860a25b5bfffff,17.358599,8860a25b5bfffff,17.358226,78.482138,8860a25b3dfffff,17.399738,78.478563,78.482559,23451,2,230,Link,02:59,3,20230208,customerCancelled,Taking longer than expected


In [5]:
#Make columns for second and minute and date
df['second'] = df['hhmmss'].apply(lambda x: x[4:])
df['minute'] = df['hhmmss'].apply(lambda x: x[2:4])
df['date'] = df['yyyymmdd'].apply(lambda x: x[6:])
df['count'] = 1
df.head()

Unnamed: 0,order_id,accept_to_pickup_distance,captain_location_hex_8,captain_location_latitude,captain_location_longitude,city_name,customer_location_hex_8,customer_location_latitude,pickup_location_hex_8,pickup_location_latitude,pickup_location_longitude,drop_location_hex_8,drop_location_latitude,drop_location_longitude,customer_location_longitude,hhmmss,hour,quarter_hour,service_obj_service_name,time_bucket,weekday,yyyymmdd,order_status,cancel_reason,second,minute,date,count
0,63e2b54a7e350925a45f934d,0.642,8860b525b3fffff,17.350151,78.492432,Hyderabad,8860b525b1fffff,17.353106,8860b525b1fffff,17.353259,78.501838,8860b525b3fffff,17.350151,78.492432,78.501839,20210,2,200,Link,02:00,3,20230208,dropped,Unable to swipe Arrived,10,2,8,1
1,63e2b7531eb27f622fc1760b,2.672,8860a25999fffff,17.422369,78.382027,Hyderabad,8860a25999fffff,17.422356,8860a25991fffff,17.426311,78.382179,8860a24b0dfffff,17.490294,78.379847,78.38215,21051,2,200,Link,02:00,3,20230208,customerCancelled,order cancelled before rider accepted,51,10,8,1
2,63e2ba207e350925a45f94a3,0.331,8860a25b6bfffff,17.38376,78.506737,Hyderabad,8860a25b53fffff,17.361223,8860a25b53fffff,17.361225,78.476958,8860a25b6bfffff,17.38376,78.506737,78.476959,22248,2,215,Link,02:00,3,20230208,dropped,,48,22,8,1
3,63e2b6037e350925a45f937c,0.973,8860a25b11fffff,17.376757,78.467293,Hyderabad,8860b52c93fffff,17.430814,8860a2595bfffff,17.403901,78.452282,8860a25b11fffff,17.376757,78.467293,78.487615,20515,2,200,Link,02:00,3,20230208,dropped,,15,5,8,1
4,63e2bcf31eb27f622fc177a6,1.149,8860a25b5bfffff,17.358599,78.482552,Hyderabad,8860a25b5bfffff,17.358599,8860a25b5bfffff,17.358226,78.482138,8860a25b3dfffff,17.399738,78.478563,78.482559,23451,2,230,Link,02:59,3,20230208,customerCancelled,Taking longer than expected,51,34,8,1


In [6]:
#Load data from cluster table
q = """
select * from datasets.city_cluster_hex
where city in ('Hyderabad')
and resolution = 8
"""
#Load data into pandas table
df_cluster_copy = pd.read_sql(q, connection)

In [7]:
cluster_mapping = df_cluster_copy.copy()
cluster_mapping = cluster_mapping.rename(columns = {'hex_id':'pickup_location_hex_8'})
df = df.merge(cluster_mapping[['pickup_location_hex_8', 'cluster']], on = 'pickup_location_hex_8', how = 'left')
cluster_mapping = cluster_mapping.rename(columns = {'pickup_location_hex_8':'drop_location_hex_8'})
df = df.rename(columns = {'cluster':'pickup_cluster'})
df = df.merge(cluster_mapping[['drop_location_hex_8', 'cluster']], on = 'drop_location_hex_8', how = 'left')
df = df.rename(columns = {'cluster':'drop_cluster'})
df.head()

Unnamed: 0,order_id,accept_to_pickup_distance,captain_location_hex_8,captain_location_latitude,captain_location_longitude,city_name,customer_location_hex_8,customer_location_latitude,pickup_location_hex_8,pickup_location_latitude,pickup_location_longitude,drop_location_hex_8,drop_location_latitude,drop_location_longitude,customer_location_longitude,hhmmss,hour,quarter_hour,service_obj_service_name,time_bucket,weekday,yyyymmdd,order_status,cancel_reason,second,minute,date,count,pickup_cluster,drop_cluster
0,63e2b54a7e350925a45f934d,0.642,8860b525b3fffff,17.350151,78.492432,Hyderabad,8860b525b1fffff,17.353106,8860b525b1fffff,17.353259,78.501838,8860b525b3fffff,17.350151,78.492432,78.501839,20210,2,200,Link,02:00,3,20230208,dropped,Unable to swipe Arrived,10,2,8,1,Santosh Nagar West,Santosh Nagar West
1,63e2b7531eb27f622fc1760b,2.672,8860a25999fffff,17.422369,78.382027,Hyderabad,8860a25999fffff,17.422356,8860a25991fffff,17.426311,78.382179,8860a24b0dfffff,17.490294,78.379847,78.38215,21051,2,200,Link,02:00,3,20230208,customerCancelled,order cancelled before rider accepted,51,10,8,1,Bio Diversity Park Gachibowli,Miyapur
2,63e2ba207e350925a45f94a3,0.331,8860a25b6bfffff,17.38376,78.506737,Hyderabad,8860a25b53fffff,17.361223,8860a25b53fffff,17.361225,78.476958,8860a25b6bfffff,17.38376,78.506737,78.476959,22248,2,215,Link,02:00,3,20230208,dropped,,48,22,8,1,Charminar,
3,63e2b6037e350925a45f937c,0.973,8860a25b11fffff,17.376757,78.467293,Hyderabad,8860b52c93fffff,17.430814,8860a2595bfffff,17.403901,78.452282,8860a25b11fffff,17.376757,78.467293,78.487615,20515,2,200,Link,02:00,3,20230208,dropped,,15,5,8,1,Banjara Hills,Goshamahal Police Quarters
4,63e2bcf31eb27f622fc177a6,1.149,8860a25b5bfffff,17.358599,78.482552,Hyderabad,8860a25b5bfffff,17.358599,8860a25b5bfffff,17.358226,78.482138,8860a25b3dfffff,17.399738,78.478563,78.482559,23451,2,230,Link,02:59,3,20230208,customerCancelled,Taking longer than expected,51,34,8,1,Charminar Murad Mahal Colony,Basheerbagh


In [8]:
#Filter the df on morning and evening peak hours
df_morning = df[df['hour'].isin(['08', '09', '10', '11'])]
df_evening = df[df['hour'].isin(['17', '18', '19', '20', '21'])]

#Groupby on hex and quarter hour to get the number of orders coming in every hex
df_morning_grouped = df_morning.groupby(['yyyymmdd', 'hour', 'pickup_cluster'])['order_id'].nunique().reset_index()
df_evening_grouped = df_evening.groupby(['yyyymmdd', 'hour', 'pickup_cluster'])['order_id'].nunique().reset_index()
df_evening_grouped.head()

Unnamed: 0,yyyymmdd,hour,pickup_cluster,order_id
0,20230206,17,,58
1,20230206,17,AOC Center,297
2,20230206,17,Abids,294
3,20230206,17,Alwal,137
4,20230206,17,Alwal 2,119


In [10]:
#Query data for online captains

q_online_captains = """
select
        yyyymmdd, substr(hhmm, 1, 2) as hour, cluster,
        count(distinct captain_id) as Online_Captains
from 
(select 
        a.*, city, cluster
        
from 
        hive.datasets.supplycursory_history a 
        left join datasets.city_cluster_hex b 
        on a.location = b.hex_id and a.resolution = cast(b.resolution as varchar)
where 
        yyyymmdd >= '20230206'
        and yyyymmdd <= '20230212'
        and city in ('Hyderabad')
        and a.resolution = '8' 
        and service in ('Link')
        and substr(hhmm, 1, 2) in ('08', '09', '10', '11')
)
group by 1,2,3
"""

#Load data into pandas table
df_online_captains_copy = pd.read_sql(q_online_captains, connection)

In [11]:
#Make a copy of the dataset and view it
df_online_captains = df_online_captains_copy.copy()
df_online_captains.head()

Unnamed: 0,yyyymmdd,hour,cluster,Online_Captains
0,20230212,11,shilparamam,161
1,20230206,10,Ashok Nagar 2,63
2,20230206,9,Dilsukhnagar,258
3,20230208,9,Balanagar,174
4,20230207,10,RTC X Road,43


In [12]:
#Rename cluster column
df_online_captains = df_online_captains.rename(columns = {'cluster':'pickup_cluster'})
df_online_captains.head()

Unnamed: 0,yyyymmdd,hour,pickup_cluster,Online_Captains
0,20230212,11,shilparamam,161
1,20230206,10,Ashok Nagar 2,63
2,20230206,9,Dilsukhnagar,258
3,20230208,9,Balanagar,174
4,20230207,10,RTC X Road,43


In [13]:
#Merge online captains with demand data
df_morning_grouped = df_morning_grouped.merge(df_online_captains, on = ['yyyymmdd', 'hour', 'pickup_cluster'], how = 'left')
df_evening_grouped = df_evening_grouped.merge(df_online_captains, on = ['yyyymmdd', 'hour', 'pickup_cluster'], how = 'left')
df_morning_grouped = df_morning_grouped.rename(columns = {'order_id':'num_of_orders'})
df_evening_grouped = df_evening_grouped.rename(columns = {'order_id':'num_of_orders'})
df_morning_grouped['demand_to_supply_ratio'] = round(df_morning_grouped['num_of_orders']/df_morning_grouped['Online_Captains'], 4)
df_evening_grouped['demand_to_supply_ratio'] = round(df_evening_grouped['num_of_orders']/df_evening_grouped['Online_Captains'], 4)

df_morning_grouped = df_morning_grouped.sort_values(by = 'demand_to_supply_ratio', ascending = False)
df_evening_grouped = df_evening_grouped.sort_values(by = 'demand_to_supply_ratio', ascending = False)
df_evening_grouped.head()

Unnamed: 0,yyyymmdd,hour,pickup_cluster,num_of_orders,Online_Captains,demand_to_supply_ratio
0,20230206,17,,58,,
1,20230206,17,AOC Center,297,,
2,20230206,17,Abids,294,,
3,20230206,17,Alwal,137,,
4,20230206,17,Alwal 2,119,,


In [14]:
df_morning = df_morning.merge(df_morning_grouped, on = ['yyyymmdd', 'pickup_cluster', 'hour'], how = 'left')
df_evening = df_evening.merge(df_evening_grouped, on = ['yyyymmdd', 'pickup_cluster', 'hour'], how = 'left')
df_morning.head()

Unnamed: 0,order_id,accept_to_pickup_distance,captain_location_hex_8,captain_location_latitude,captain_location_longitude,city_name,customer_location_hex_8,customer_location_latitude,pickup_location_hex_8,pickup_location_latitude,pickup_location_longitude,drop_location_hex_8,drop_location_latitude,drop_location_longitude,customer_location_longitude,hhmmss,hour,quarter_hour,service_obj_service_name,time_bucket,weekday,yyyymmdd,order_status,cancel_reason,second,minute,date,count,pickup_cluster,drop_cluster,num_of_orders,Online_Captains,demand_to_supply_ratio
0,63e1d4a82a61115ff57e23a7,1.839,8860b52cc5fffff,17.430492,78.540184,Hyderabad,8860b52cc5fffff,17.4305,8860b52cc3fffff,17.423353,78.528511,8860b52c89fffff,17.428469,78.528514,78.540176,100344,10,1000,Link,10:00,2,20230207,customerCancelled,Wrong pickup location,44,3,7,1,Osmania,Tarnaka,56.0,131.0,0.4275
1,63e1d4da971e0053a8169b03,2.826,8860a24b6dfffff,17.493007,78.403204,Hyderabad,8860b19693fffff,17.483559,8860b19697fffff,17.484097,78.405907,8860a24a6dfffff,17.446284,78.369092,78.404533,100434,10,1000,Link,10:00,2,20230207,expired,Wrong Pickup Location,34,4,7,1,Kukatpally 3,Hitech City,296.0,361.0,0.8199
2,63e1d4e082e527574670dbc7,1.28,8860a259bbfffff,17.441349,78.377357,Hyderabad,8860a24a61fffff,17.443495,8860a24a61fffff,17.443508,78.36264,8860a259bbfffff,17.441349,78.377357,78.362625,100440,10,1000,Link,10:00,2,20230207,dropped,,40,4,7,1,Hitech City,Hitech City,1421.0,1286.0,1.105
3,63e1d4ed29d1da2ca285e73d,,8860a24b21fffff,17.513906,78.378426,Hyderabad,8860a24b21fffff,17.513893,8860a24b21fffff,17.513927,78.37841,8860a24b15fffff,17.485598,78.358536,78.37841,100453,10,1000,Link,10:00,2,20230207,customerCancelled,order cancelled before rider accepted,53,4,7,1,Nizampet 3,Hafeezpet,91.0,76.0,1.1974
4,63e1d4f124af24679dfb857b,0.473,8860b52c13fffff,17.427802,78.551657,Hyderabad,8860b52c13fffff,17.42804,8860b52c13fffff,17.428034,78.55178,8860b1964bfffff,17.467465,78.506189,78.551764,100457,10,1000,Link,10:00,2,20230207,customerCancelled,Change of plans,57,4,7,1,Nacharam,Thirmulgerry,86.0,203.0,0.4236


In [15]:
#Create a column for weekdays
df_morning['weekday_name'] = df_morning['weekday'].apply(lambda x: calendar.day_name[(x-1)])
df_evening['weekday_name'] = df_evening['weekday'].apply(lambda x: calendar.day_name[(x-1)])
df_morning.head()

Unnamed: 0,order_id,accept_to_pickup_distance,captain_location_hex_8,captain_location_latitude,captain_location_longitude,city_name,customer_location_hex_8,customer_location_latitude,pickup_location_hex_8,pickup_location_latitude,pickup_location_longitude,drop_location_hex_8,drop_location_latitude,drop_location_longitude,customer_location_longitude,hhmmss,hour,quarter_hour,service_obj_service_name,time_bucket,weekday,yyyymmdd,order_status,cancel_reason,second,minute,date,count,pickup_cluster,drop_cluster,num_of_orders,Online_Captains,demand_to_supply_ratio,weekday_name
0,63e1d4a82a61115ff57e23a7,1.839,8860b52cc5fffff,17.430492,78.540184,Hyderabad,8860b52cc5fffff,17.4305,8860b52cc3fffff,17.423353,78.528511,8860b52c89fffff,17.428469,78.528514,78.540176,100344,10,1000,Link,10:00,2,20230207,customerCancelled,Wrong pickup location,44,3,7,1,Osmania,Tarnaka,56.0,131.0,0.4275,Tuesday
1,63e1d4da971e0053a8169b03,2.826,8860a24b6dfffff,17.493007,78.403204,Hyderabad,8860b19693fffff,17.483559,8860b19697fffff,17.484097,78.405907,8860a24a6dfffff,17.446284,78.369092,78.404533,100434,10,1000,Link,10:00,2,20230207,expired,Wrong Pickup Location,34,4,7,1,Kukatpally 3,Hitech City,296.0,361.0,0.8199,Tuesday
2,63e1d4e082e527574670dbc7,1.28,8860a259bbfffff,17.441349,78.377357,Hyderabad,8860a24a61fffff,17.443495,8860a24a61fffff,17.443508,78.36264,8860a259bbfffff,17.441349,78.377357,78.362625,100440,10,1000,Link,10:00,2,20230207,dropped,,40,4,7,1,Hitech City,Hitech City,1421.0,1286.0,1.105,Tuesday
3,63e1d4ed29d1da2ca285e73d,,8860a24b21fffff,17.513906,78.378426,Hyderabad,8860a24b21fffff,17.513893,8860a24b21fffff,17.513927,78.37841,8860a24b15fffff,17.485598,78.358536,78.37841,100453,10,1000,Link,10:00,2,20230207,customerCancelled,order cancelled before rider accepted,53,4,7,1,Nizampet 3,Hafeezpet,91.0,76.0,1.1974,Tuesday
4,63e1d4f124af24679dfb857b,0.473,8860b52c13fffff,17.427802,78.551657,Hyderabad,8860b52c13fffff,17.42804,8860b52c13fffff,17.428034,78.55178,8860b1964bfffff,17.467465,78.506189,78.551764,100457,10,1000,Link,10:00,2,20230207,customerCancelled,Change of plans,57,4,7,1,Nacharam,Thirmulgerry,86.0,203.0,0.4236,Tuesday


In [16]:
num_of_hex_morning = df_morning.groupby(['pickup_cluster'])['pickup_location_hex_8'].nunique().reset_index()
num_of_hex_evening = df_evening.groupby(['pickup_cluster'])['pickup_location_hex_8'].nunique().reset_index()
num_of_hex_morning = num_of_hex_morning.rename(columns = {'pickup_location_hex_8':'num_of_hexes'})
num_of_hex_evening = num_of_hex_evening.rename(columns = {'pickup_location_hex_8':'num_of_hexes'})

df_morning = df_morning.merge(num_of_hex_evening, on = ['pickup_cluster'], how = 'left')
df_evening = df_evening.merge(num_of_hex_evening, on = ['pickup_cluster'], how = 'left')
df_morning['online_captains_per_hex'] = round(df_morning['Online_Captains']/df_morning['num_of_hexes'], 4)
df_evening['online_captains_per_hex'] = round(df_evening['Online_Captains']/df_evening['num_of_hexes'], 4)

In [17]:
df_morning_filter = df_morning[['captain_location_hex_8', 'captain_location_latitude', 'captain_location_longitude', 'pickup_location_hex_8',
 'pickup_location_latitude', 'pickup_location_longitude', 'pickup_cluster', 'hour', 'yyyymmdd', 'drop_location_hex_8', 'drop_location_latitude',
  'drop_location_longitude', 'num_of_orders', 'drop_cluster', 'Online_Captains', 'demand_to_supply_ratio', 'weekday_name', 'online_captains_per_hex']]
df_evening_filter = df_evening[['captain_location_hex_8', 'captain_location_latitude', 'captain_location_longitude', 'pickup_location_hex_8',
 'pickup_location_latitude', 'pickup_location_longitude', 'pickup_cluster', 'hour', 'yyyymmdd', 'drop_location_hex_8', 'drop_location_latitude',
  'drop_location_longitude', 'num_of_orders', 'drop_cluster', 'Online_Captains', 'demand_to_supply_ratio', 'weekday_name', 'online_captains_per_hex']]

In [18]:
df_morning_filter.to_csv('hyderabad_demand_supply_morning.csv', index = False)