In [1]:
#Import libraries
import os
import warnings

import h3
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pyhive import presto
from shapely.geometry import Polygon, Point
# Notebook-wide settings.
# Silence every Python warning for the rest of the session.
warnings.filterwarnings(action='ignore')
# Allow up to 500 columns to be shown when a DataFrame is displayed.
pd.options.display.max_columns = 500

In [2]:
#Define presto connection settings
# Each value can be overridden with an environment variable (PRESTO_HOST, PRESTO_PORT,
# PRESTO_USERNAME) so the notebook can be run by someone else without editing personal
# details in the code. The defaults are the values the notebook was originally written with.
presto_host = os.environ.get('PRESTO_HOST', 'presto.processing.yoda.run')
presto_port = os.environ.get('PRESTO_PORT', '80')
username = os.environ.get('PRESTO_USERNAME', 'aditya.bhattar@rapido.bike')

#Create connection to presto host
connection = presto.connect(presto_host, presto_port, username = username)

In [3]:
#Query order-level rows from orders.order_logs_snapshot.
#Filters applied in SQL: yyyymmdd from '20220919' through '20221016' (both bounds inclusive),
#city_name 'Bangalore', and service_obj_service_name 'Link'.
#order_status is NOT filtered in the query, so the result contains every status the table
#returns for these filters; status filtering has to happen in pandas afterwards.

q = """
select order_id, accept_to_pickup_distance, captain_location_hex_8, captain_location_latitude,
captain_location_longitude, city_name, customer_location_hex_8, drop_location_hex_8, drop_location_latitude, drop_location_longitude, customer_location_latitude, 
customer_location_longitude, hhmmss, hour, quarter_hour, service_obj_service_name, time_bucket, weekday, yyyymmdd,
order_status, cancel_reason
from orders.order_logs_snapshot
where yyyymmdd >= '20220919'
and yyyymmdd <= '20221016'
and city_name in ('Bangalore')
and service_obj_service_name = 'Link'
"""

#Run the query through `connection` and load the full result set into a pandas DataFrame
df_copy = pd.read_sql(q, connection)

In [4]:
#Give every order a unit weight (summed into ride counts by later group-bys),
#then fork the frame into two independent working copies: one for rides dropped
#into a zone and one for rides leaving it.
df_copy['count'] = 1
df_dropped_in, df_dropped_out = (df_copy.copy() for _ in range(2))
df_dropped_in.head()

Unnamed: 0,order_id,accept_to_pickup_distance,captain_location_hex_8,captain_location_latitude,captain_location_longitude,city_name,customer_location_hex_8,drop_location_hex_8,drop_location_latitude,drop_location_longitude,customer_location_latitude,customer_location_longitude,hhmmss,hour,quarter_hour,service_obj_service_name,time_bucket,weekday,yyyymmdd,order_status,cancel_reason,count
0,6339fbe53abb71427b9f14fe,,8860169661fffff,13.104808,77.600578,Bangalore,8860169661fffff,8861892d87fffff,13.101064,77.630836,13.104167,77.600533,23021,2,230,Link,02:00,1,20221003,customerCancelled,Order cancelled before rider was mapped,1
1,6339fe07ff856f4b4474592b,0.634,88618925e9fffff,12.938844,77.625885,Bangalore,88618925e9fffff,88618925e9fffff,12.938844,77.625885,12.938918,77.626281,23927,2,230,Link,02:59,1,20221003,dropped,,1
2,6339f56b3abb71427b9f1275,1.187,88618921c7fffff,12.98301,77.761612,Bangalore,88618920b3fffff,88618921c7fffff,12.98301,77.761612,12.954065,77.69799,20243,2,200,Link,02:00,1,20221003,dropped,,1
3,6339f64082db09299f140909,1.361,88618925e9fffff,12.937722,77.627966,Bangalore,88618925c5fffff,88618920b9fffff,12.950546,77.704407,12.934267,77.619576,20616,2,200,Link,02:00,1,20221003,customerCancelled,Change of plans,1
4,6339f8f03abb71427b9f13d7,2.291,8860145a2bfffff,12.953946,77.54364,Bangalore,8860145a2dfffff,88618921ebfffff,12.992388,77.764866,12.962638,77.553467,21744,2,215,Link,02:00,1,20221003,customerCancelled,Drop location denied,1


In [5]:
#Load the hexes to filter on
# Both zone files are read from one folder. Set the BATCHING_DEMAND_DIR environment
# variable to point somewhere else; the default is the path this notebook was written against.
batching_demand_dir = os.environ.get('BATCHING_DEMAND_DIR', '/Users/rapido/Desktop/batching_demand')

area1_hexes = pd.read_csv(os.path.join(batching_demand_dir, 'bangalore_location_data_level1_demand_zone1.csv'))
area2_hexes = pd.read_csv(os.path.join(batching_demand_dir, 'bangalore_location_data_level1_demand_zone2.csv'))

# Unique hex ids per zone. Missing ids are dropped so that a blank row in a zone file
# can never match orders whose own hex is missing (isin treats NaN as equal to NaN).
area1_hexes_list = list(area1_hexes['customer_location_hex_8'].dropna().unique())
area2_hexes_list = list(area2_hexes['customer_location_hex_8'].dropna().unique())


def dropped_orders_in_zone(orders, hex_column, zone_hexes):
    """Return the rows of `orders` that were dropped and whose `hex_column` lies in the zone.

    Parameters
    ----------
    orders : pandas.DataFrame
        Order rows; must contain `hex_column` and an 'order_status' column.
    hex_column : str
        Name of the hex column to test against the zone.
    zone_hexes : list
        Hex ids that make up the zone.

    Returns
    -------
    pandas.DataFrame
        Rows with order_status == 'dropped' and `hex_column` in `zone_hexes`,
        keeping the original index labels.
    """
    in_zone = orders[hex_column].isin(zone_hexes)
    is_dropped = orders['order_status'] == 'dropped'
    return orders[in_zone & is_dropped]


#Filter the dfs on the hexes identified for bangalore
# "dropped in": completed rides whose drop location is inside the zone.
df_dropped_in_area1 = dropped_orders_in_zone(df_dropped_in, 'drop_location_hex_8', area1_hexes_list)
df_dropped_in_area2 = dropped_orders_in_zone(df_dropped_in, 'drop_location_hex_8', area2_hexes_list)

# "dropped out": completed rides whose customer (pickup-side) location is inside the zone.
df_dropped_out_area1 = dropped_orders_in_zone(df_dropped_out, 'customer_location_hex_8', area1_hexes_list)
df_dropped_out_area2 = dropped_orders_in_zone(df_dropped_out, 'customer_location_hex_8', area2_hexes_list)

In [6]:
#Count dropped rides per (date, hour) for each zone and direction
def _rides_per_hour(orders, rides_column):
    """Sum the unit 'count' column per (yyyymmdd, hour) and return a flat frame with the total under `rides_column`."""
    hourly_totals = orders.groupby(['yyyymmdd', 'hour'])['count'].sum()
    # reset_index(name=...) turns the group keys back into columns and names the totals column in one step.
    return hourly_totals.reset_index(name = rides_column)


df_dropped_in_area1_rides = _rides_per_hour(df_dropped_in_area1, 'dropped_rides_in')
df_dropped_in_area2_rides = _rides_per_hour(df_dropped_in_area2, 'dropped_rides_in')

df_dropped_out_area1_rides = _rides_per_hour(df_dropped_out_area1, 'dropped_rides_out')
df_dropped_out_area2_rides = _rides_per_hour(df_dropped_out_area2, 'dropped_rides_out')

In [7]:
#Merge dropped_in and dropped_out and calculate replenishment
def build_replenishment(rides_in, rides_out):
    """Combine hourly inflow and outflow counts and compute net replenishment.

    Parameters
    ----------
    rides_in : pandas.DataFrame
        Columns 'yyyymmdd', 'hour', 'dropped_rides_in'.
    rides_out : pandas.DataFrame
        Columns 'yyyymmdd', 'hour', 'dropped_rides_out'.

    Returns
    -------
    pandas.DataFrame
        One row per (yyyymmdd, hour) present in either input, sorted by date and hour,
        with integer columns 'dropped_rides_in', 'dropped_rides_out' and
        'replenishment' (= dropped_rides_in - dropped_rides_out).
    """
    keys = ['yyyymmdd', 'hour']
    ride_columns = ['dropped_rides_in', 'dropped_rides_out']

    # An outer merge keeps hours that appear on only one side. A left merge would drop
    # hours that have outflow but no inflow, and would leave NaN (hence a NaN
    # replenishment) for hours that have inflow but no outflow.
    merged = rides_in.merge(rides_out, on = keys, how = 'outer')

    # An hour missing from one side means zero rides on that side.
    merged[ride_columns] = merged[ride_columns].fillna(0).astype(int)

    merged = merged.sort_values(by = keys, ascending=True).reset_index(drop=True)
    merged['replenishment'] = merged['dropped_rides_in'] - merged['dropped_rides_out']
    return merged


df_replenishment_area1 = build_replenishment(df_dropped_in_area1_rides, df_dropped_out_area1_rides)
df_replenishment_area2 = build_replenishment(df_dropped_in_area2_rides, df_dropped_out_area2_rides)
df_replenishment_area1.head()

Unnamed: 0,yyyymmdd,hour,dropped_rides_in,dropped_rides_out,replenishment
0,20220919,0,93,103,-10
1,20220919,1,47,56,-9
2,20220919,2,31,34,-3
3,20220919,3,18,20,-2
4,20220919,4,16,19,-3


In [8]:
#Write each area's replenishment table to a CSV file in the working directory (row index omitted)
for replenishment_frame, csv_name in [
    (df_replenishment_area1, 'replenishment_bangalore_area1.csv'),
    (df_replenishment_area2, 'replenishment_bangalore_area2.csv'),
]:
    replenishment_frame.to_csv(csv_name, index = False)