In [1]:
#Import libraries
import os
import warnings

import h3
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pyhive import presto
from shapely.geometry import Polygon, Point

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 500)

In [2]:
#Load the level-1 demand-zone tables and collect the distinct customer hexes of each zone
area1_hexes = pd.read_csv('delhi_location_data_level1_demand_zone1.csv')
list_area1_hexes = pd.unique(area1_hexes['customer_location_hex_8'])

area2_hexes = pd.read_csv('delhi_location_data_level1_demand_zone2.csv')
list_area2_hexes = pd.unique(area2_hexes['customer_location_hex_8'])

In [4]:
#Inspect the distinct customer hexes listed for demand zone 2
list_area2_hexes

array(['883da106c1fffff', '883da106c9fffff', '883da1a939fffff',
       '883da1a93dfffff', '883da106c3fffff', '883da106c5fffff',
       '883da106c7fffff', '883da106cbfffff', '883da106cdfffff',
       '883da1a903fffff', '883da1a907fffff', '883da1a915fffff',
       '883da1a923fffff', '883da1a927fffff', '883da1a92bfffff',
       '883da1a931fffff', '883da1a935fffff', '883da1a93bfffff'],
      dtype=object)

In [2]:
#Define presto connection settings.
#Each value can be overridden through an environment variable (PRESTO_HOST,
#PRESTO_PORT, PRESTO_USERNAME) so the notebook can be run by other users or
#against another cluster without editing this cell. The fallbacks are the
#values this notebook was originally written with.
presto_host = os.environ.get('PRESTO_HOST', 'presto.processing.yoda.run')
presto_port = os.environ.get('PRESTO_PORT', '80')
username = os.environ.get('PRESTO_USERNAME', 'aditya.bhattar@rapido.bike')

#Create connection to presto host
connection = presto.connect(presto_host, presto_port, username = username)

In [3]:
#Query data from order_logs_snapshot
#The query pulls order-level rows (order id, captain/customer location and hex,
#time fields, order status and cancel reason) for city 'Delhi' and service 'Link',
#restricted to yyyymmdd between '20220919' and '20220925' inclusive.

q = """
select order_id, accept_to_pickup_distance, captain_location_hex_8, captain_location_latitude,
captain_location_longitude, city_name, customer_location_hex_8, customer_location_latitude, 
customer_location_longitude, hhmmss, hour, quarter_hour, service_obj_service_name, time_bucket, weekday, yyyymmdd,
order_status, cancel_reason
from orders.order_logs_snapshot
where yyyymmdd >= '20220919'
and yyyymmdd <= '20220925'
and city_name in ('Delhi')
and service_obj_service_name = 'Link'
"""

#Load data into pandas table
#Run the query over the presto connection; the result is kept in df_copy as the raw pull
df_copy = pd.read_sql(q, connection)

In [4]:
#Create a copy of the df retrieved from presto, so df_copy keeps the raw query result
#while df is modified by the following cells
df = df_copy.copy()

In [5]:
#View the first rows of the dataset
df.head()

Unnamed: 0,order_id,accept_to_pickup_distance,captain_location_hex_8,captain_location_latitude,captain_location_longitude,city_name,customer_location_hex_8,customer_location_latitude,customer_location_longitude,hhmmss,hour,quarter_hour,service_obj_service_name,time_bucket,weekday,yyyymmdd,order_status,cancel_reason
0,632b19006d83ed15dc0045d9,0.183,883da1a915fffff,28.637859,77.378349,Delhi,883da1a937fffff,28.614538,77.386744,193032,19,1930,Link,19:00,3,20220921,dropped,
1,632b192fbafbe4293d859c82,1.021,883da1169dfffff,28.683012,77.263275,Delhi,883da116e7fffff,28.662519,77.232452,193119,19,1930,Link,19:59,3,20220921,dropped,
2,632b195bbafbe4293d859cde,,883da10417fffff,28.53924,77.395439,Delhi,883da10417fffff,28.539242,77.395439,193203,19,1930,Link,19:59,3,20220921,customerCancelled,order cancelled before rider accepted
3,632b1983bafbe4293d859d2a,0.641,883da1a941fffff,28.645582,77.335976,Delhi,883da11483fffff,28.627512,77.297806,193243,19,1930,Link,19:59,3,20220921,dropped,
4,632b19a61f57866ffc68a1c2,0.28,883da106adfffff,28.564445,77.38496,Delhi,883da106c7fffff,28.591599,77.38079,193318,19,1930,Link,19:59,3,20220921,customerCancelled,Customer asked to cancel


In [6]:
#Make columns for second, minute and day of month by slicing the hhmmss / yyyymmdd strings.
#Vectorised .str slicing replaces the per-row lambdas and propagates missing values
#instead of raising on them.
#NOTE: 'date' holds only the characters after position 6 of yyyymmdd (the day of month),
#so it identifies a day uniquely only while the queried range stays inside one calendar month.
df['second'] = df['hhmmss'].str[4:]
df['minute'] = df['hhmmss'].str[2:4]
df['date'] = df['yyyymmdd'].str[6:]
df.head()

Unnamed: 0,order_id,accept_to_pickup_distance,captain_location_hex_8,captain_location_latitude,captain_location_longitude,city_name,customer_location_hex_8,customer_location_latitude,customer_location_longitude,hhmmss,hour,quarter_hour,service_obj_service_name,time_bucket,weekday,yyyymmdd,order_status,cancel_reason,second,minute,date
0,632b19006d83ed15dc0045d9,0.183,883da1a915fffff,28.637859,77.378349,Delhi,883da1a937fffff,28.614538,77.386744,193032,19,1930,Link,19:00,3,20220921,dropped,,32,30,21
1,632b192fbafbe4293d859c82,1.021,883da1169dfffff,28.683012,77.263275,Delhi,883da116e7fffff,28.662519,77.232452,193119,19,1930,Link,19:59,3,20220921,dropped,,19,31,21
2,632b195bbafbe4293d859cde,,883da10417fffff,28.53924,77.395439,Delhi,883da10417fffff,28.539242,77.395439,193203,19,1930,Link,19:59,3,20220921,customerCancelled,order cancelled before rider accepted,3,32,21
3,632b1983bafbe4293d859d2a,0.641,883da1a941fffff,28.645582,77.335976,Delhi,883da11483fffff,28.627512,77.297806,193243,19,1930,Link,19:59,3,20220921,dropped,,43,32,21
4,632b19a61f57866ffc68a1c2,0.28,883da106adfffff,28.564445,77.38496,Delhi,883da106c7fffff,28.591599,77.38079,193318,19,1930,Link,19:59,3,20220921,customerCancelled,Customer asked to cancel,18,33,21


In [7]:
#Create 20-sec batches
df['second'] = df['second'].astype(float)
df['minute'] = df['minute'].astype(float)

#Left-closed bins give three equal 20-second windows: [0, 20), [20, 40), [40, 61).
#The previous right-closed bins (-1, 20], (20, 40], (40, 61] put second 20 into '0-20'
#and second 40 into '20-40', i.e. windows of 21 / 20 / 19 seconds.
#The upper edge stays at 61 so a value of 60 would still fall into the last window.
df['20_sec_batch'] = pd.cut(
    df['second'],
    bins = [0, 20, 40, 61],
    right = False,
    labels = ['0-20', '20-40', '40-60'],
)

#Helper column so that ride counts can be obtained by summing
df['count'] = 1
df.head()

Unnamed: 0,order_id,accept_to_pickup_distance,captain_location_hex_8,captain_location_latitude,captain_location_longitude,city_name,customer_location_hex_8,customer_location_latitude,customer_location_longitude,hhmmss,hour,quarter_hour,service_obj_service_name,time_bucket,weekday,yyyymmdd,order_status,cancel_reason,second,minute,date,20_sec_batch,count
0,632b19006d83ed15dc0045d9,0.183,883da1a915fffff,28.637859,77.378349,Delhi,883da1a937fffff,28.614538,77.386744,193032,19,1930,Link,19:00,3,20220921,dropped,,32.0,30.0,21,20-40,1
1,632b192fbafbe4293d859c82,1.021,883da1169dfffff,28.683012,77.263275,Delhi,883da116e7fffff,28.662519,77.232452,193119,19,1930,Link,19:59,3,20220921,dropped,,19.0,31.0,21,0-20,1
2,632b195bbafbe4293d859cde,,883da10417fffff,28.53924,77.395439,Delhi,883da10417fffff,28.539242,77.395439,193203,19,1930,Link,19:59,3,20220921,customerCancelled,order cancelled before rider accepted,3.0,32.0,21,0-20,1
3,632b1983bafbe4293d859d2a,0.641,883da1a941fffff,28.645582,77.335976,Delhi,883da11483fffff,28.627512,77.297806,193243,19,1930,Link,19:59,3,20220921,dropped,,43.0,32.0,21,40-60,1
4,632b19a61f57866ffc68a1c2,0.28,883da106adfffff,28.564445,77.38496,Delhi,883da106c7fffff,28.591599,77.38079,193318,19,1930,Link,19:59,3,20220921,customerCancelled,Customer asked to cancel,18.0,33.0,21,0-20,1


In [8]:
#Count the rides coming from each customer hex, keep only the hexes with at least
#100 rides in the queried period, and tag the survivors with keep_for_analysis = 'yes'
df_grouped = (
    df.groupby(['customer_location_hex_8'])['count']
    .sum()
    .reset_index()
    .loc[lambda rides: rides['count'] >= 100]
    .reset_index(drop = True)
    .assign(keep_for_analysis = 'yes')
)
df_grouped.head()

Unnamed: 0,customer_location_hex_8,count,keep_for_analysis
0,883ce0c06bfffff,185,yes
1,883da10043fffff,282,yes
2,883da10049fffff,106,yes
3,883da1004bfffff,611,yes
4,883da10059fffff,742,yes


In [9]:
#Flag every order by whether its customer hex passed the ride-count filter in df_grouped.
#A membership test yields the same 'yes'/'no' column as the previous left merge + fillna
#(df_grouped holds one row per hex, all tagged 'yes'), but the cell can now be re-run safely:
#re-running the merge collided with the existing column and produced
#keep_for_analysis_x / keep_for_analysis_y instead.
df['keep_for_analysis'] = np.where(
    df['customer_location_hex_8'].isin(df_grouped['customer_location_hex_8']),
    'yes',
    'no',
)
df.head()

Unnamed: 0,order_id,accept_to_pickup_distance,captain_location_hex_8,captain_location_latitude,captain_location_longitude,city_name,customer_location_hex_8,customer_location_latitude,customer_location_longitude,hhmmss,hour,quarter_hour,service_obj_service_name,time_bucket,weekday,yyyymmdd,order_status,cancel_reason,second,minute,date,20_sec_batch,count,keep_for_analysis
0,632b19006d83ed15dc0045d9,0.183,883da1a915fffff,28.637859,77.378349,Delhi,883da1a937fffff,28.614538,77.386744,193032,19,1930,Link,19:00,3,20220921,dropped,,32.0,30.0,21,20-40,1,yes
1,632b192fbafbe4293d859c82,1.021,883da1169dfffff,28.683012,77.263275,Delhi,883da116e7fffff,28.662519,77.232452,193119,19,1930,Link,19:59,3,20220921,dropped,,19.0,31.0,21,0-20,1,yes
2,632b195bbafbe4293d859cde,,883da10417fffff,28.53924,77.395439,Delhi,883da10417fffff,28.539242,77.395439,193203,19,1930,Link,19:59,3,20220921,customerCancelled,order cancelled before rider accepted,3.0,32.0,21,0-20,1,yes
3,632b1983bafbe4293d859d2a,0.641,883da1a941fffff,28.645582,77.335976,Delhi,883da11483fffff,28.627512,77.297806,193243,19,1930,Link,19:59,3,20220921,dropped,,43.0,32.0,21,40-60,1,yes
4,632b19a61f57866ffc68a1c2,0.28,883da106adfffff,28.564445,77.38496,Delhi,883da106c7fffff,28.591599,77.38079,193318,19,1930,Link,19:59,3,20220921,customerCancelled,Customer asked to cancel,18.0,33.0,21,0-20,1,yes


In [10]:
#Keep only the orders whose customer hex is flagged 'yes' for the analysis
df_filter = df.loc[df['keep_for_analysis'].eq('yes')]

In [11]:
#Group by hex, day, hour, minute and 20-second batch to count the orders in each batch.
#'20_sec_batch' is categorical, so observed=False is what produces the zero-count rows for
#key combinations without any order. It is passed explicitly because relying on the default
#is deprecated in newer pandas (and the warning is silenced in this notebook), so the
#zero rows would otherwise disappear silently after an upgrade.
df_hex_grouped = (
    df_filter
    .groupby(['customer_location_hex_8', 'date', 'hour', 'minute', '20_sec_batch'], observed = False)['count']
    .sum()
    .reset_index()
)
df_hex_grouped = df_hex_grouped.sort_values(by = ['customer_location_hex_8', 'date', 'hour', 'minute', '20_sec_batch'])
df_hex_grouped.head()

Unnamed: 0,customer_location_hex_8,date,hour,minute,20_sec_batch,count
0,883ce0c06bfffff,19,0,0.0,0-20,0
1,883ce0c06bfffff,19,0,0.0,20-40,0
2,883ce0c06bfffff,19,0,0.0,40-60,0
3,883ce0c06bfffff,19,0,1.0,0-20,0
4,883ce0c06bfffff,19,0,1.0,20-40,0


In [12]:
#Identify high demand hexes: count, per hex, the 20-second batches holding at least 2 orders
#and take the 20 hexes with the most such batches
df_hex_grouped_filter2 = df_hex_grouped.loc[df_hex_grouped['count'] >= 2]
count_df = (
    df_hex_grouped_filter2['customer_location_hex_8']
    .value_counts()
    .reset_index()
)
count_df.columns = ['customer_location_hex_8', 'count']
top_20_hexes = count_df['customer_location_hex_8'].iloc[:20].tolist()
top_20_hexes

['883da11a83fffff',
 '883da11abdfffff',
 '883da11a99fffff',
 '883da111a3fffff',
 '883da1a939fffff',
 '883da11a87fffff',
 '883da111a9fffff',
 '883da11ad5fffff',
 '883da1a93dfffff',
 '883da111a1fffff',
 '883da111adfffff',
 '883da106c9fffff',
 '883da11a8dfffff',
 '883da11a91fffff',
 '883da11185fffff',
 '883da111e5fffff',
 '883da1385dfffff',
 '883da11ab1fffff',
 '883da106c1fffff',
 '883da111a7fffff']

In [15]:
#Reload the zone-2 hex list and keep the zone-2 hexes that are also among the top-20 hexes
area2_hexes = pd.read_csv('delhi_location_data_level1_demand_zone2.csv')
list_area2_hexes = area2_hexes['customer_location_hex_8'].unique().tolist()

#A comprehension keeps the zone-2 order and avoids a loop variable named `hex`,
#which shadowed the builtin hex() for the rest of the notebook session
hexes_zone2 = [zone_hex for zone_hex in list_area2_hexes if zone_hex in top_20_hexes]
hexes_zone2

['883da106c1fffff', '883da106c9fffff', '883da1a939fffff', '883da1a93dfffff']

In [21]:
#Expand every selected zone-2 hex to the cells returned by h3.k_ring(hex, 2)
#and flatten the results into a single list (duplicates still included)
hexes_zone2_final = [
    ring_hex
    for seed_hex in hexes_zone2
    for ring_hex in h3.k_ring(seed_hex, 2)
]

#De-duplicate while keeping first-seen order. dict.fromkeys replaces the quadratic
#"if hex in list" scan and no longer rebinds the builtin name `hex`
hex_zone2_unique = list(dict.fromkeys(hexes_zone2_final))
hex_zone2_unique

['883da106d5fffff',
 '883da106c5fffff',
 '883da1a935fffff',
 '883da1a923fffff',
 '883da106e9fffff',
 '883da1a927fffff',
 '883da106cdfffff',
 '883da106c3fffff',
 '883da10689fffff',
 '883da106c1fffff',
 '883da1a937fffff',
 '883da10613fffff',
 '883da106ddfffff',
 '883da106cbfffff',
 '883da106c9fffff',
 '883da106ebfffff',
 '883da1061bfffff',
 '883da106c7fffff',
 '883da1068dfffff',
 '883da1a925fffff',
 '883da1a92bfffff',
 '883da1a93dfffff',
 '883da1a931fffff',
 '883da1a921fffff',
 '883da1a91dfffff',
 '883da1a933fffff',
 '883da1a907fffff',
 '883da1a917fffff',
 '883da1a939fffff',
 '883da1a901fffff',
 '883da1a90bfffff',
 '883da1a903fffff',
 '883da1a915fffff',
 '883da1a905fffff',
 '883da1a9edfffff',
 '883da1a911fffff',
 '883da1a93bfffff',
 '883da1a929fffff']

In [22]:
#Number of distinct hexes in the expanded zone-2 set
len(hex_zone2_unique)

38

In [23]:
#Restrict the batch counts to the expanded zone-2 hexes.
#.copy() makes the result an independent frame, so columns added to df_hex_grouped
#in later cells are not assignments on a slice of the unfiltered frame
df_hex_grouped = df_hex_grouped[df_hex_grouped['customer_location_hex_8'].isin(hex_zone2_unique)].copy()

Unnamed: 0,order_id,accept_to_pickup_distance,captain_location_hex_8,captain_location_latitude,captain_location_longitude,city_name,customer_location_hex_8,customer_location_latitude,customer_location_longitude,hhmmss,hour,quarter_hour,service_obj_service_name,time_bucket,weekday,yyyymmdd,order_status,cancel_reason,second,minute,date,20_sec_batch,count,keep_for_analysis
0,632b19006d83ed15dc0045d9,0.183,883da1a915fffff,28.637859,77.378349,Delhi,883da1a937fffff,28.614538,77.386744,193032,19,1930,Link,19:00,3,20220921,dropped,,32.0,30.0,21,20-40,1,yes
1,632b192fbafbe4293d859c82,1.021,883da1169dfffff,28.683012,77.263275,Delhi,883da116e7fffff,28.662519,77.232452,193119,19,1930,Link,19:59,3,20220921,dropped,,19.0,31.0,21,0-20,1,yes
2,632b195bbafbe4293d859cde,,883da10417fffff,28.539240,77.395439,Delhi,883da10417fffff,28.539242,77.395439,193203,19,1930,Link,19:59,3,20220921,customerCancelled,order cancelled before rider accepted,3.0,32.0,21,0-20,1,yes
3,632b1983bafbe4293d859d2a,0.641,883da1a941fffff,28.645582,77.335976,Delhi,883da11483fffff,28.627512,77.297806,193243,19,1930,Link,19:59,3,20220921,dropped,,43.0,32.0,21,40-60,1,yes
4,632b19a61f57866ffc68a1c2,0.280,883da106adfffff,28.564445,77.384960,Delhi,883da106c7fffff,28.591599,77.380790,193318,19,1930,Link,19:59,3,20220921,customerCancelled,Customer asked to cancel,18.0,33.0,21,0-20,1,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
940664,632d96c2d9957d0dba0fd077,2.741,883da111abfffff,28.496929,77.094139,Delhi,883da111abfffff,28.497391,77.094040,165138,16,1645,Link,16:59,5,20220923,customerCancelled,order cancelled before rider accepted,38.0,51.0,23,20-40,1,yes
940665,632d97bcd9957d0dba0fd323,1.646,883da11a07fffff,28.445469,76.999083,Delhi,883da11a01fffff,28.457563,76.989433,165548,16,1645,Link,16:59,5,20220923,customerCancelled,Drop location denied,48.0,55.0,23,40-60,1,yes
940666,632d97c5184fa93b16bf2b5e,1.322,883da106ebfffff,28.584232,77.381783,Delhi,883da1040bfffff,28.542048,77.369949,165557,16,1645,Link,16:59,5,20220923,dropped,,57.0,55.0,23,40-60,1,yes
940667,632d98344f6df037a7291d3c,,883da102c5fffff,28.528112,77.206133,Delhi,883da102c5fffff,28.528109,77.206139,165748,16,1645,Link,16:59,5,20220923,customerCancelled,order cancelled before rider accepted,48.0,57.0,23,40-60,1,yes


In [26]:
#Total order count per hex, exported to CSV
df_hex_grouped_combine = (
    df_hex_grouped
    .groupby(['customer_location_hex_8'], as_index = False)['count']
    .sum()
)
df_hex_grouped_combine.to_csv('delhi_zone2_expanded.csv', index = False)

In [14]:
#Build a time-of-day column (hour:minute) for every batch row
hour_text = df_hex_grouped['hour'].astype(str)
minute_text = df_hex_grouped['minute'].astype(int).astype(str)
df_hex_grouped['time'] = pd.to_datetime(hour_text + ':' + minute_text, format='%H:%M').dt.time
df_hex_grouped.head()

Unnamed: 0,customer_location_hex_8,date,hour,minute,20_sec_batch,count,time
0,883ce0c06bfffff,1,0,0.0,0-20,0,00:00:00
1,883ce0c06bfffff,1,0,0.0,20-40,0,00:00:00
2,883ce0c06bfffff,1,0,0.0,40-60,0,00:00:00
3,883ce0c06bfffff,1,0,1.0,0-20,0,00:01:00
4,883ce0c06bfffff,1,0,1.0,20-40,0,00:01:00


In [18]:
#Keep only the batch rows whose hex appears in the zone-2 high-demand file
df_high_demand = pd.read_csv('delhi_location_data_level1_demand_zone2.csv')
high_demand_hexes = df_high_demand['customer_location_hex_8'].unique().tolist()

is_high_demand = df_hex_grouped['customer_location_hex_8'].isin(high_demand_hexes)
df_hex_high_demand = df_hex_grouped.loc[is_high_demand].reset_index(drop=True)

In [19]:
#Total rides per (date, hour) over the high-demand hexes, plus the average per
#20-second slot (an hour holds 3600 / 20 = 180 such slots)
df_level_1_grouped = (
    df_hex_high_demand
    .groupby(['date', 'hour'])['count']
    .sum()
    .reset_index()
    .round(0)
    .rename(columns = {'count':'rides_per_hour'})
)
df_level_1_grouped['rides_per_20_sec'] = (df_level_1_grouped['rides_per_hour']/180).round(2)
df_level_1_grouped

Unnamed: 0,date,hour,rides_per_hour,rides_per_20_sec
0,01,00,54,0.30
1,01,01,33,0.18
2,01,02,31,0.17
3,01,03,58,0.32
4,01,04,88,0.49
...,...,...,...,...
667,30,19,1142,6.34
668,30,20,657,3.65
669,30,21,344,1.91
670,30,22,229,1.27


In [20]:
#Write the hourly demand table to 'demand_delhi.csv' without the index column
df_level_1_grouped.to_csv('demand_delhi.csv', index = False)