In [1]:
#Import libraries
from pyhive import presto
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import h3
from shapely.geometry import Polygon, Point
from haversine import haversine, Unit
from scipy.optimize import linear_sum_assignment
from datetime import datetime
import calendar
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 500)

In [2]:
#Define presto credentials
presto_host = 'presto.processing.yoda.run'
presto_port = '80'
username = 'aditya.bhattar@rapido.bike'

#Create connection to presto host
connection = presto.connect(presto_host,presto_port,username = username)

In [3]:
#Query data from order_logs_snapshot

q = """
select order_id, accept_to_pickup_distance, city_name, customer_location_hex_8, customer_location_latitude, drop_location_hex_8,
customer_location_longitude, hhmmss, hour, quarter_hour, service_obj_service_name, time_bucket, weekday, yyyymmdd,
order_status, cancel_reason, eta, ride_time, amount, actual_price, captain_location_latitude, captain_location_longitude, captain_location_hex_8
from orders.order_logs_snapshot
where yyyymmdd >= '20230206'
and yyyymmdd <= '20230210'
and city_name in ('Jaipur')
and service_obj_service_name = 'Link'
and customer_location_hex_8 in ('883da20a61fffff', '883da20a65fffff', '883da20a69fffff', '883da20a6dfffff', '883da20b59fffff', '883da21837fffff', '883da218a1fffff',
 '883da218a5fffff', '883da218a7fffff', '883da218adfffff', '883da21981fffff', '883da21983fffff', '883da21987fffff', '883da21989fffff', '883da2198bfffff',
  '883da2198dfffff', '883da21991fffff', '883da21995fffff', '883da21997fffff', '883da21999fffff', '883da2199bfffff', '883da2199dfffff', '883da219b1fffff',
   '883da219b3fffff', '883da219b7fffff', '883da219b9fffff', '883da219bbfffff', '883da219bdfffff', '883da219c1fffff', '883da219c3fffff', '883da219c5fffff',
    '883da219c7fffff', '883da219cbfffff', '883da219d1fffff', '883da219d3fffff', '883da219d5fffff', '883da219d7fffff', '883da219d9fffff', '883da219dbfffff',
     '883da219ddfffff', '883da219ebfffff')
"""

#Load data into pandas table
df_copy = pd.read_sql(q, connection)

In [4]:
df = df_copy.copy()
df.head()

Unnamed: 0,order_id,accept_to_pickup_distance,city_name,customer_location_hex_8,customer_location_latitude,drop_location_hex_8,customer_location_longitude,hhmmss,hour,quarter_hour,service_obj_service_name,time_bucket,weekday,yyyymmdd,order_status,cancel_reason,eta,ride_time,amount,actual_price,captain_location_latitude,captain_location_longitude,captain_location_hex_8
0,63e2dd5db53f8921e2e34f32,1.414,Jaipur,883da218a5fffff,26.884123,883da218c7fffff,75.794777,45309,4,445,Link,04:59,3,20230208,dropped,,7.0,8.05225,60.0,60.0,26.918909,75.789268,883da218c7fffff
1,63e2d8d74c232671dbb71628,2.237,Jaipur,883da219cbfffff,26.884695,883da219b1fffff,75.755882,43351,4,430,Link,04:59,3,20230208,dropped,,6.0,13.351733,80.0,80.0,26.849974,75.794189,883da219b1fffff
2,63e2dd3b4ee16e21e9429bf6,2.65,Jaipur,883da21989fffff,26.870787,883da21991fffff,75.768242,45235,4,445,Link,04:59,3,20230208,customerCancelled,Asked to pay extra,5.0,16.58,57.0,,26.870771,75.768227,883da21989fffff
3,63e2d9aadbf56764c64fe09f,3.526,Jaipur,883da21995fffff,26.862297,883da21981fffff,75.795692,43722,4,430,Link,04:59,3,20230208,customerCancelled,order cancelled before rider accepted,8.0,5.67,36.0,,26.862179,75.777267,883da21981fffff
4,63e2ded0ea0e0679f27e8dce,1.139,Jaipur,883da2199dfffff,26.871733,883da218c7fffff,75.788002,45920,4,445,Link,04:59,3,20230208,dropped,,1.0,15.5529,69.0,69.0,26.919754,75.789467,883da218c7fffff


In [5]:
#Query data from driving_distance_and_time
q_dist = """
select * from experiments.driving_distance_and_time
where city_name in ('Jaipur')
and hex_resolution = 8
and destination_hex in ('883da20a61fffff', '883da20a65fffff', '883da20a69fffff', '883da20a6dfffff', '883da20b59fffff', '883da21837fffff', '883da218a1fffff',
 '883da218a5fffff', '883da218a7fffff', '883da218adfffff', '883da21981fffff', '883da21983fffff', '883da21987fffff', '883da21989fffff', '883da2198bfffff',
  '883da2198dfffff', '883da21991fffff', '883da21995fffff', '883da21997fffff', '883da21999fffff', '883da2199bfffff', '883da2199dfffff', '883da219b1fffff',
   '883da219b3fffff', '883da219b7fffff', '883da219b9fffff', '883da219bbfffff', '883da219bdfffff', '883da219c1fffff', '883da219c3fffff', '883da219c5fffff',
    '883da219c7fffff', '883da219cbfffff', '883da219d1fffff', '883da219d3fffff', '883da219d5fffff', '883da219d7fffff', '883da219d9fffff', '883da219dbfffff',
     '883da219ddfffff', '883da219ebfffff')
"""

#Load the data into pandas table
df_distance_copy = pd.read_sql(q_dist, connection)

In [6]:
#View the distance dataset
df_distance = df_distance_copy.copy()
df_distance.head()

Unnamed: 0,day_name,destination_hex,distance_in_km,hex_resolution,num_observations,ridetime_in_minutes,run_date,source_hex,time_period,city_name
0,Friday,883da21989fffff,10.775,8,19,24.116588,20230214,883da20a59fffff,evening_peak,Jaipur
1,Monday,883da219b7fffff,2.940375,8,159,6.050258,20230214,883da20b51fffff,evening_peak,Jaipur
2,Monday,883da219bbfffff,3.4345,8,104,9.266667,20230214,883da2198bfffff,rest_morning,Jaipur
3,Thursday,883da21989fffff,7.819,8,63,16.658333,20230214,883da20b47fffff,afternoon,Jaipur
4,Monday,883da219b9fffff,11.4065,8,8,27.375,20230214,883da21861fffff,rest_morning,Jaipur


In [7]:
#Make columns for second and minute and date
df['second'] = df['hhmmss'].apply(lambda x: x[4:])
df['minute'] = df['hhmmss'].apply(lambda x: x[2:4])
df['date'] = df['yyyymmdd'].apply(lambda x: x[6:])
df['weekday_name'] = df['weekday'].apply(lambda x: calendar.day_name[(x-1)])

#Create a column to get the time of the day
df['time_period'] = np.where(df['hour'].isin(['08', '09', '10', '11']), 'morning_peak', np.where(df['hour'].isin(['17', '18', '19', '20', '21']), 'evening_peak',
    np.where(df['hour'].isin(['12, 13, 14, 15, 16']), 'afternoon', np.where(df['hour'].isin(['00', '01', '02', '03', '04', '05', '06', '07']),
     'rest_morning', 'rest_evening'))))


#Make a column to combine cust_lat_long and cap_lat_long
df['cust_lat_long'] = df[['customer_location_latitude', 'customer_location_longitude']].values.tolist()
df['cap_lat_long'] = df[['captain_location_latitude', 'captain_location_longitude']].values.tolist()
df.head()

Unnamed: 0,order_id,accept_to_pickup_distance,city_name,customer_location_hex_8,customer_location_latitude,drop_location_hex_8,customer_location_longitude,hhmmss,hour,quarter_hour,service_obj_service_name,time_bucket,weekday,yyyymmdd,order_status,cancel_reason,eta,ride_time,amount,actual_price,captain_location_latitude,captain_location_longitude,captain_location_hex_8,second,minute,date,weekday_name,time_period,cust_lat_long,cap_lat_long
0,63e2dd5db53f8921e2e34f32,1.414,Jaipur,883da218a5fffff,26.884123,883da218c7fffff,75.794777,45309,4,445,Link,04:59,3,20230208,dropped,,7.0,8.05225,60.0,60.0,26.918909,75.789268,883da218c7fffff,9,53,8,Wednesday,rest_morning,"[26.884122848510742, 75.7947769165039]","[26.918909072875977, 75.78926849365234]"
1,63e2d8d74c232671dbb71628,2.237,Jaipur,883da219cbfffff,26.884695,883da219b1fffff,75.755882,43351,4,430,Link,04:59,3,20230208,dropped,,6.0,13.351733,80.0,80.0,26.849974,75.794189,883da219b1fffff,51,33,8,Wednesday,rest_morning,"[26.884695053100586, 75.7558822631836]","[26.849973678588867, 75.794189453125]"
2,63e2dd3b4ee16e21e9429bf6,2.65,Jaipur,883da21989fffff,26.870787,883da21991fffff,75.768242,45235,4,445,Link,04:59,3,20230208,customerCancelled,Asked to pay extra,5.0,16.58,57.0,,26.870771,75.768227,883da21989fffff,35,52,8,Wednesday,rest_morning,"[26.870786666870117, 75.76824188232422]","[26.870771408081055, 75.76822662353516]"
3,63e2d9aadbf56764c64fe09f,3.526,Jaipur,883da21995fffff,26.862297,883da21981fffff,75.795692,43722,4,430,Link,04:59,3,20230208,customerCancelled,order cancelled before rider accepted,8.0,5.67,36.0,,26.862179,75.777267,883da21981fffff,22,37,8,Wednesday,rest_morning,"[26.86229705810547, 75.79569244384766]","[26.862178802490234, 75.77726745605469]"
4,63e2ded0ea0e0679f27e8dce,1.139,Jaipur,883da2199dfffff,26.871733,883da218c7fffff,75.788002,45920,4,445,Link,04:59,3,20230208,dropped,,1.0,15.5529,69.0,69.0,26.919754,75.789467,883da218c7fffff,20,59,8,Wednesday,rest_morning,"[26.871732711791992, 75.78800201416016]","[26.919754028320312, 75.78946685791016]"


In [8]:
#Create 20-sec batches
df['second'] = df['second'].astype(float)
df['minute'] = df['minute'].astype(float)
df['20_sec_batch'] = pd.cut(df['second'], bins = [-1, 20, 40, 61], labels = ['0-20', '20-40', '40-60'])
df['count'] = 1
df.head()

Unnamed: 0,order_id,accept_to_pickup_distance,city_name,customer_location_hex_8,customer_location_latitude,drop_location_hex_8,customer_location_longitude,hhmmss,hour,quarter_hour,service_obj_service_name,time_bucket,weekday,yyyymmdd,order_status,cancel_reason,eta,ride_time,amount,actual_price,captain_location_latitude,captain_location_longitude,captain_location_hex_8,second,minute,date,weekday_name,time_period,cust_lat_long,cap_lat_long,20_sec_batch,count
0,63e2dd5db53f8921e2e34f32,1.414,Jaipur,883da218a5fffff,26.884123,883da218c7fffff,75.794777,45309,4,445,Link,04:59,3,20230208,dropped,,7.0,8.05225,60.0,60.0,26.918909,75.789268,883da218c7fffff,9.0,53.0,8,Wednesday,rest_morning,"[26.884122848510742, 75.7947769165039]","[26.918909072875977, 75.78926849365234]",0-20,1
1,63e2d8d74c232671dbb71628,2.237,Jaipur,883da219cbfffff,26.884695,883da219b1fffff,75.755882,43351,4,430,Link,04:59,3,20230208,dropped,,6.0,13.351733,80.0,80.0,26.849974,75.794189,883da219b1fffff,51.0,33.0,8,Wednesday,rest_morning,"[26.884695053100586, 75.7558822631836]","[26.849973678588867, 75.794189453125]",40-60,1
2,63e2dd3b4ee16e21e9429bf6,2.65,Jaipur,883da21989fffff,26.870787,883da21991fffff,75.768242,45235,4,445,Link,04:59,3,20230208,customerCancelled,Asked to pay extra,5.0,16.58,57.0,,26.870771,75.768227,883da21989fffff,35.0,52.0,8,Wednesday,rest_morning,"[26.870786666870117, 75.76824188232422]","[26.870771408081055, 75.76822662353516]",20-40,1
3,63e2d9aadbf56764c64fe09f,3.526,Jaipur,883da21995fffff,26.862297,883da21981fffff,75.795692,43722,4,430,Link,04:59,3,20230208,customerCancelled,order cancelled before rider accepted,8.0,5.67,36.0,,26.862179,75.777267,883da21981fffff,22.0,37.0,8,Wednesday,rest_morning,"[26.86229705810547, 75.79569244384766]","[26.862178802490234, 75.77726745605469]",20-40,1
4,63e2ded0ea0e0679f27e8dce,1.139,Jaipur,883da2199dfffff,26.871733,883da218c7fffff,75.788002,45920,4,445,Link,04:59,3,20230208,dropped,,1.0,15.5529,69.0,69.0,26.919754,75.789467,883da218c7fffff,20.0,59.0,8,Wednesday,rest_morning,"[26.871732711791992, 75.78800201416016]","[26.919754028320312, 75.78946685791016]",0-20,1


In [9]:
#Create a column to identify cancellation reason
df['new_cancel_reason'] = np.where(df['order_status'] == 'customerCancelled', np.where(df['cancel_reason'] == 'order cancelled before rider accepted', 'COBRA',
 np.where(df['cancel_reason'] == 'Order cancelled before rider was mapped', 'COBRM', 'OCARA')), 'noCancellation')

#Filter only on the required hexesnand morning 9 AM
df = df.sort_values(by = ['yyyymmdd', 'minute', 'second'], ascending = True).reset_index(drop = True)
df = df[(df['order_status'] == 'dropped') | (df['new_cancel_reason'] == 'OCARA')]
df.head()

Unnamed: 0,order_id,accept_to_pickup_distance,city_name,customer_location_hex_8,customer_location_latitude,drop_location_hex_8,customer_location_longitude,hhmmss,hour,quarter_hour,service_obj_service_name,time_bucket,weekday,yyyymmdd,order_status,cancel_reason,eta,ride_time,amount,actual_price,captain_location_latitude,captain_location_longitude,captain_location_hex_8,second,minute,date,weekday_name,time_period,cust_lat_long,cap_lat_long,20_sec_batch,count,new_cancel_reason
0,63e0f2c83110a2551cb14963,0.226,Jaipur,883da21995fffff,26.862953,883da20a65fffff,75.793503,180000,18,1800,Link,18:00,1,20230206,dropped,,2.0,13.194867,30.0,41.0,26.85532,75.813614,883da20a65fffff,0.0,0.0,6,Monday,evening_peak,"[26.862953186035156, 75.79350280761719]","[26.85531997680664, 75.81361389160156]",0-20,1,noCancellation
1,63e0f2c871b8c2564af45af9,0.047,Jaipur,883da21991fffff,26.872128,883da2199dfffff,75.799156,180000,18,1800,Link,18:00,1,20230206,dropped,,1.0,6.281283,18.0,26.0,26.871145,75.790077,883da2199dfffff,0.0,0.0,6,Monday,evening_peak,"[26.872127532958984, 75.79915618896484]","[26.871145248413086, 75.79007720947266]",0-20,1,noCancellation
2,63e100d8af067c7beff6fd79,0.669,Jaipur,883da21991fffff,26.871822,883da2194bfffff,75.797371,190000,19,1900,Link,19:00,1,20230206,customerCancelled,Asked to pay extra,1.0,32.083333,144.0,,26.872644,75.797478,883da21991fffff,0.0,0.0,6,Monday,evening_peak,"[26.871822357177734, 75.79737091064453]","[26.872644424438477, 75.79747772216797]",0-20,1,OCARA
3,63e0f2c971b8c2564af45afa,0.313,Jaipur,883da21991fffff,26.871588,883da218a5fffff,75.797089,180001,18,1800,Link,18:00,1,20230206,dropped,,1.0,6.23625,12.0,28.0,26.886189,75.798592,883da218a5fffff,1.0,0.0,6,Monday,evening_peak,"[26.8715877532959, 75.79708862304688]","[26.886188507080078, 75.79859161376953]",0-20,1,noCancellation
4,63e0ac791604d44b7e17902f,0.633,Jaipur,883da2199bfffff,26.879202,883da20a4dfffff,75.805199,130001,13,1300,Link,13:00,1,20230206,dropped,,4.0,3.415167,21.0,31.0,26.876383,75.810562,883da20a4dfffff,1.0,0.0,6,Monday,rest_evening,"[26.879201889038086, 75.8051986694336]","[26.87638282775879, 75.81056213378906]",0-20,1,noCancellation


In [10]:
#Filter the df by hour to get hourly reduction in fm
dates = {}
for date in df['yyyymmdd'].unique().tolist():
    df_date = df[df['yyyymmdd'] == date]
    hours = {}
    for hour in df_date['hour'].unique().tolist():
        df_hourly = df_date[df_date['hour'] == hour]
        df_distance_day = df_distance[df_distance['day_name'] == df_hourly['weekday_name'].values[0]]

        #Get the captains available for every 20_sec_batch
        captains_available = df_hourly.groupby(['hour', 'minute', '20_sec_batch'])['captain_location_hex_8'].apply(list).reset_index()
        captains_available2 = df_hourly.groupby(['hour', 'minute', '20_sec_batch'])['cap_lat_long'].apply(list).reset_index()
        captains_available = captains_available.rename(columns = {'captain_location_hex_8':'cap_location_hexes'})
        captains_available2 = captains_available2.rename(columns = {'cap_lat_long':'cap_location_lat_long'})
        #captains_available['number_of_captains'] = captains_available['cap_location_hexes'].apply(lambda x: len(x))
        captains_available = captains_available.merge(captains_available2, on = ['hour', 'minute', '20_sec_batch'], how = 'left')

        #Merge the captain locations with our df
        df_hourly = df_hourly.merge(captains_available, on = ['hour', 'minute', '20_sec_batch'], how = 'left')
        
        #Filter the distance df based on the hour
        if hour in ['08', '09', '10', '11']:
            df_distance_filter = df_distance_day[df_distance_day['time_period'] == 'morning_peak']
        elif hour in ['17', '18', '19', '20', '21']:
            df_distance_filter = df_distance_day[df_distance_day['time_period'] == 'evening_peak']
        elif hour in ['12, 13, 14, 15, 16']:
            df_distance_filter = df_distance_day[df_distance_day['time_period'] == 'afternoon']
        elif hour in ['00', '01', '02', '03', '04', '05', '06', '07']:
            df_distance_filter = df_distance_day[df_distance_day['time_period'] == 'rest_morning']
        else:
            df_distance_filter = df_distance_day[df_distance_day['time_period'] == 'rest_evening']

        #Combine the source and destination hex and make a dictionary to get distance between each hex combo
        df_distance_filter['source_destination'] = df_distance_filter['source_hex'] + df_distance_filter['destination_hex']
        distance_dict = dict(zip(df_distance_filter['source_destination'], df_distance_filter['distance_in_km']))

        #Get the distance between customer and each captain available
        customer_hexes = df_hourly['customer_location_hex_8'].values.tolist()
        captain_hexes = df_hourly['cap_location_hexes'].values.tolist()
        cap_lat_long = df_hourly['cap_location_lat_long'].values.tolist()
        cust_lat_long = df_hourly['cust_lat_long'].values.tolist()

        i = 0
        captain_distances = []
        for x in captain_hexes:
            customer_distance = []
            for hex in range(0, len(x)):
                key_hex = x[hex] + customer_hexes[i]
                if key_hex in distance_dict:
                    individual_distance = distance_dict[key_hex]
                else:
                    individual_distance = round(haversine(cap_lat_long[i][hex], cust_lat_long[i]), 3)
                customer_distance.append(individual_distance)
            captain_distances.append(customer_distance)
            i += 1

        #Add columns for captain distances and etas to each customer
        df_hourly['captain_distances'] = captain_distances

        #Get the nxn matrix for each 20_sec_batch
        df_batches = df_hourly.groupby(['minute', '20_sec_batch'])['captain_distances'].apply(list).reset_index()
        df_batches = df_batches.dropna()

        #Get the row_index and column index suitable for each customer
        optimal_distance = []
        for x in df_batches['captain_distances'].values.tolist():
            try:
                row_ind, col_ind = linear_sum_assignment(x)
            except Exception as e:
                continue
            optimal_distance.append(col_ind)

        #Get the optimal distance for each customer
        captain_customer_dist = df_batches['captain_distances'].values.tolist()
        i = 0
        min_distances = []
        for x in captain_customer_dist:
            j = 0
            for y in x:
                if type(y) == float:
                    continue
                min_distance = y[optimal_distance[i][j]]
                j += 1
                min_distances.append(min_distance)
            i += 1

        #Add the new_dist to the df
        df_hourly['new_FM'] = min_distances

        #Calculate old_FM
        df_hourly['cap_cust_hex'] = df_hourly['captain_location_hex_8'] + df_hourly['customer_location_hex_8']
        captain_customer_dist = []
        for x in df_hourly['cap_cust_hex'].values.tolist():
            if x in distance_dict:
                distance = distance_dict[x]
            else:
                distance = 'NA'
            captain_customer_dist.append(distance)

        cap_lat_long = df_hourly['cap_lat_long'].values.tolist()
        cust_lat_long = df_hourly['cust_lat_long'].values.tolist()

        haversine_cust_cap = []
        for i in range(0, len(cap_lat_long)):
            haversine_distance = round(haversine(cap_lat_long[i], cust_lat_long[i]), 3)
            haversine_cust_cap.append(haversine_distance)

        for i in range(0, len(captain_customer_dist)):
            if captain_customer_dist[i] == 'NA':
                captain_customer_dist[i] = haversine_cust_cap[i]

        df_hourly['old_FM'] = captain_customer_dist

        #Get the mean FM for each 20_sec_batch (old and new)
        fm_min = df_hourly.groupby(['minute', '20_sec_batch'])['old_FM', 'new_FM'].mean().reset_index()

        #Add a column for percentage increase/decrease
        fm_min['%_change_dist'] = (fm_min['new_FM']/fm_min['old_FM'] - 1)

        #Get the percentage change in FM
        fm_change = round((fm_min['new_FM'].mean()/fm_min['old_FM'].mean() - 1), 4)
        hours[hour] = fm_change
    dates[date] = hours

In [11]:
#Create a df from the dates dict
hourly_decrease = pd.DataFrame(dates).T
hourly_decrease = hourly_decrease.reset_index()
hourly_decrease = pd.melt(hourly_decrease, id_vars='index')
hourly_decrease.columns = ['yyyymmdd', 'hour', 'FM_decrease']
hourly_decrease = hourly_decrease.sort_values(by = ['yyyymmdd', 'hour'], ascending=True)
hourly_decrease

Unnamed: 0,yyyymmdd,hour,FM_decrease
95,20230206,00,-0.0271
100,20230206,01,0.0000
110,20230206,02,0.0000
115,20230206,03,0.0000
105,20230206,04,-0.0019
...,...,...,...
9,20230210,19,-0.0901
49,20230210,20,-0.0875
64,20230210,21,-0.0466
44,20230210,22,-0.0377


In [12]:
hourly_decrease.to_csv('decrease_FM_batching_jaipur.csv', index = False)