In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from datetime import datetime

In [3]:
# Raw inputs: three CSV files, addressed relative to the working directory.
# Each path is kept in its own variable next to the frame it produces.
driver_id_path = 'driver_ids.csv'
driver_ids = pd.read_csv(driver_id_path)

ride_id_path = 'ride_ids.csv'
ride_ids = pd.read_csv(ride_id_path)

timestamps_path = 'ride_timestamps.csv'
timestamps = pd.read_csv(timestamps_path)

In [4]:
# Driver/ride pairs only; the ride metrics are not needed for counting.
sep = ride_ids.loc[:, ['driver_id', 'ride_id']]

# Headline numbers: distinct drivers and total ride rows.
unique_driver_ids = np.unique(sep['driver_id'])
print("Number of unique drives", len(unique_driver_ids))
print('total number of rides', len(sep))

# Rides per driver (count of ride_id within each driver_id group).
grouped = sep.groupby('driver_id').count()
print("max number of rides by a single driver", grouped['ride_id'].max())
grouped.describe()


Number of unique drives 937
total number of rides 193502
max number of rides by a single driver 919


Unnamed: 0,ride_id
count,937.0
mean,206.512273
std,173.254063
min,3.0
25%,47.0
50%,200.0
75%,316.0
max,919.0


In [5]:
# Tally ride rows at each ride_prime_time value. count() is applied to the
# remaining driver_id column, so each figure is the number of non-null
# driver_id entries (ride rows) for that value, not a count of distinct drivers.
prime_time_columns = ["driver_id", "ride_prime_time"]
driver_prime = ride_ids[prime_time_columns]
driver_prime_count = driver_prime.groupby(by='ride_prime_time').count()
print(driver_prime_count)
driver_prime_count.describe()

                 driver_id
ride_prime_time           
0                   125412
25                   33677
50                   17712
75                    8208
100                   6216
150                   1686
200                    432
250                    101
300                     31
350                     15
400                     11
500                      1


Unnamed: 0,driver_id
count,12.0
mean,16125.166667
std,35884.693705
min,1.0
25%,27.0
50%,1059.0
75%,10584.0
max,125412.0


In [6]:
# Driver-to-ride lookup used to attach a driver to each ride's timestamp.
driver_ride_columns = ['driver_id', 'ride_id']
driv_ride = ride_ids.loc[:, driver_ride_columns]
driv_ride.head()

Unnamed: 0,driver_id,ride_id
0,002be0ffdc997bd5c50703158b7c2491,006d61cf7446e682f7bc50b0f8a5bea5
1,002be0ffdc997bd5c50703158b7c2491,01b522c5c3a756fbdb12e95e87507eda
2,002be0ffdc997bd5c50703158b7c2491,029227c4c2971ce69ff2274dc798ef43
3,002be0ffdc997bd5c50703158b7c2491,034e861343a63ac3c18a9ceb1ce0ac69
4,002be0ffdc997bd5c50703158b7c2491,034f2e614a2f9fc7f1c2f77647d1b981


In [7]:
# Keep only the 'accepted_at' event of each ride, with its ride_id and timestamp.
# NOTE(review): nothing here de-duplicates ride_id; the name assumes each ride
# has at most one 'accepted_at' row - confirm against the data.
is_accepted = timestamps['event'] == 'accepted_at'
uniq_rides = timestamps.loc[is_accepted, ['ride_id', 'timestamp']]
uniq_rides.head()

Unnamed: 0,ride_id,timestamp
1,00003037a262d9ee40e61b5c0718f7f0,2016-06-13 09:39:51
6,00005eae40882760d675da5effb89ae3,2016-05-14 05:23:25
11,000061d42cf29f73b591041d9a1b2973,2016-05-16 15:43:14
16,00006efeb0d5e3ccad7d921ddeee9900,2016-05-11 19:29:43
21,0000d9b24d8ccdd991b76258e616fa01,2016-04-26 18:12:15


In [8]:
# Attach the driver to every accepted ride. This is an inner join on ride_id,
# so rides present in only one of the two tables are dropped from the result.
driv_ride_time = uniq_rides.merge(driv_ride, how='inner', on='ride_id')
print(driv_ride_time.shape)
driv_ride_time.head()

(184819, 3)


Unnamed: 0,ride_id,timestamp,driver_id
0,00003037a262d9ee40e61b5c0718f7f0,2016-06-13 09:39:51,d967f5296732fa55266b5f1314e7447b
1,00005eae40882760d675da5effb89ae3,2016-05-14 05:23:25,0656192a402808805282e60761bda088
2,000061d42cf29f73b591041d9a1b2973,2016-05-16 15:43:14,c468a648519cd42da75e6aa9dadf733e
3,00006efeb0d5e3ccad7d921ddeee9900,2016-05-11 19:29:43,689bdf87fb2de49f98bf4946cfaa5068
4,00012759befd5d34a0609800f6a1ee59,2016-05-31 15:45:05,eece82fe623b4bb335a9b9e20eb0ca54


In [9]:
# First and last accepted-ride timestamp per driver, computed in a single
# grouped pass over the timestamp column only (ride_id is never aggregated).
# The timestamps are still 'YYYY-MM-DD HH:MM:SS' strings at this point; for
# that fixed-width format, string min/max coincide with chronological order.
retention_raw = (
    driv_ride_time
    .groupby('driver_id')['timestamp']
    .agg(['min', 'max'])
    .rename(columns={'min': 'oldest_ride', 'max': 'latest_ride'})
)

# Single-column views kept under their original names (indexed by driver_id)
# so any later cell that refers to them keeps working.
oldest_ride = retention_raw[['oldest_ride']]
latest_ride = retention_raw[['latest_ride']]

retention_raw.head()

Unnamed: 0_level_0,oldest_ride,latest_ride
driver_id,Unnamed: 1_level_1,Unnamed: 2_level_1
002be0ffdc997bd5c50703158b7c2491,2016-03-29 18:47:01,2016-06-23 10:06:30
007f0389f9c7b03ef97098422f902e62,2016-03-29 22:28:34,2016-06-22 13:17:44
011e5c5dfc5c2c92501b8b24d47509bc,2016-04-05 10:55:00,2016-06-12 20:22:27
0152a2f305e71d26cc964f8d4411add9,2016-04-25 15:59:35,2016-06-26 10:16:39
01674381af7edd264113d4e6ed55ecda,2016-04-29 07:50:47,2016-06-24 13:03:42


In [10]:
# Layout of the timestamp strings stored in retention_raw.
TIMESTAMP_FORMAT = '%Y-%m-%d %H:%M:%S'


def to_date_obj(x):
    """Parse one 'YYYY-MM-DD HH:MM:SS' string and return its calendar date.

    Kept as a named helper for element-wise use; the columns below are
    converted with the vectorised equivalent instead.
    """
    return datetime.strptime(x, TIMESTAMP_FORMAT).date()


# Vectorised parse of each column, then truncation to datetime.date objects
# (the same element type the row-wise strptime(...).date() call produces).
oldest_date_obj = pd.to_datetime(retention_raw['oldest_ride'], format=TIMESTAMP_FORMAT).dt.date
latest_date_obj = pd.to_datetime(retention_raw['latest_ride'], format=TIMESTAMP_FORMAT).dt.date

# Side-by-side frame, indexed like retention_raw, with columns
# 'oldest_ride' and 'latest_ride'.
retention = pd.concat([oldest_date_obj, latest_date_obj], axis=1)

In [11]:
# Whole days between a driver's first and last accepted ride.
# Both columns are converted to datetime64 before subtracting: the difference
# of two object-dtype columns of datetime.date values is not guaranteed to be
# a timedelta64 Series, and the .dt accessor only works on datetimelike dtypes.
retention_span = pd.to_datetime(retention['latest_ride']) - pd.to_datetime(retention['oldest_ride'])
retention['retention_period (in days)'] = retention_span.dt.days


In [12]:
# Keep just the retention length and move driver_id out of the index into a
# regular column, giving one (driver_id, retention) row per driver.
retention_column = 'retention_period (in days)'
driver_v_retention = retention[[retention_column]].reset_index(level=0)
print(driver_v_retention.shape)
np.unique(driver_v_retention['driver_id']).size
# no repeat drivers: the distinct-driver count equals the row count printed above

(844, 2)


844

In [13]:
# Number of drivers sharing each retention length (in days); the retention
# length becomes the index and driver_id holds the per-group count.
retention_groups = driver_v_retention.groupby(by='retention_period (in days)')
driver_retention_count = retention_groups.count()
driver_retention_count.head()

Unnamed: 0_level_0,driver_id
retention_period (in days),Unnamed: 1_level_1
1,3
2,1
3,2
4,5
5,3


In [14]:
# Flat version of the drivers-per-retention-length table, with the retention
# length as an ordinary column. It is rebuilt from driver_v_retention rather
# than reset in place so that re-running this cell always gives the same
# frame; an in-place reset_index would insert an extra 'index' column on a
# second run.
driver_retention_count = (
    driver_v_retention
    .groupby('retention_period (in days)')
    .count()
    .reset_index()
)

In [15]:
# driver_retention_count.to_csv('driver_ren.csv', index=False)

In [16]:
# Preview the first rows of the driver table read from driver_ids.csv.
driver_ids.head()

Unnamed: 0,driver_id,driver_onboard_date
0,002be0ffdc997bd5c50703158b7c2491,2016-03-29 00:00:00
1,007f0389f9c7b03ef97098422f902e62,2016-03-29 00:00:00
2,011e5c5dfc5c2c92501b8b24d47509bc,2016-04-05 00:00:00
3,0152a2f305e71d26cc964f8d4411add9,2016-04-23 00:00:00
4,01674381af7edd264113d4e6ed55ecda,2016-04-29 00:00:00


In [17]:
# Preview the first rows of the ride table read from ride_ids.csv.
ride_ids.head()

Unnamed: 0,driver_id,ride_id,ride_distance,ride_duration,ride_prime_time
0,002be0ffdc997bd5c50703158b7c2491,006d61cf7446e682f7bc50b0f8a5bea5,1811,327,50
1,002be0ffdc997bd5c50703158b7c2491,01b522c5c3a756fbdb12e95e87507eda,3362,809,0
2,002be0ffdc997bd5c50703158b7c2491,029227c4c2971ce69ff2274dc798ef43,3282,572,0
3,002be0ffdc997bd5c50703158b7c2491,034e861343a63ac3c18a9ceb1ce0ac69,65283,3338,25
4,002be0ffdc997bd5c50703158b7c2491,034f2e614a2f9fc7f1c2f77647d1b981,4115,823,100


In [18]:
# Preview the first rows of the ride-event table read from ride_timestamps.csv.
timestamps.head()

Unnamed: 0,ride_id,event,timestamp
0,00003037a262d9ee40e61b5c0718f7f0,requested_at,2016-06-13 09:39:19
1,00003037a262d9ee40e61b5c0718f7f0,accepted_at,2016-06-13 09:39:51
2,00003037a262d9ee40e61b5c0718f7f0,arrived_at,2016-06-13 09:44:31
3,00003037a262d9ee40e61b5c0718f7f0,picked_up_at,2016-06-13 09:44:33
4,00003037a262d9ee40e61b5c0718f7f0,dropped_off_at,2016-06-13 10:03:05


In [19]:
# Re-display the first rows of driv_ride_time (accepted-ride timestamps merged with driver ids).
driv_ride_time.head()

Unnamed: 0,ride_id,timestamp,driver_id
0,00003037a262d9ee40e61b5c0718f7f0,2016-06-13 09:39:51,d967f5296732fa55266b5f1314e7447b
1,00005eae40882760d675da5effb89ae3,2016-05-14 05:23:25,0656192a402808805282e60761bda088
2,000061d42cf29f73b591041d9a1b2973,2016-05-16 15:43:14,c468a648519cd42da75e6aa9dadf733e
3,00006efeb0d5e3ccad7d921ddeee9900,2016-05-11 19:29:43,689bdf87fb2de49f98bf4946cfaa5068
4,00012759befd5d34a0609800f6a1ee59,2016-05-31 15:45:05,eece82fe623b4bb335a9b9e20eb0ca54


In [29]:
# Per-driver averages of the numeric ride metrics.
# numeric_only=True keeps the string ride_id column out of the mean; without
# it, newer pandas raises instead of silently skipping non-numeric columns.
mean_stats_by_driver = (
    ride_ids
    .groupby("driver_id")
    .mean(numeric_only=True)
    .rename(columns={
        "ride_distance": "avg_ride_distance",
        "ride_duration": "avg_ride_duration",
        "ride_prime_time": "avg_ride_prime_time",
    })
    .reset_index(level=0)  # driver_id back to an ordinary column
)
mean_stats_by_driver.head()

Unnamed: 0,driver_id,avg_ride_distance,avg_ride_duration,avg_ride_prime_time
0,002be0ffdc997bd5c50703158b7c2491,6282.624549,798.693141,19.404332
1,007f0389f9c7b03ef97098422f902e62,3791.322581,661.193548,20.16129
2,011e5c5dfc5c2c92501b8b24d47509bc,7930.970588,858.970588,19.852941
3,0152a2f305e71d26cc964f8d4411add9,7702.82199,913.722513,10.732984
4,01674381af7edd264113d4e6ed55ecda,8329.717333,953.181333,12.533333


In [30]:
# Persist the per-driver averages next to the notebook, without the row index.
mean_stats_path = 'mean_stats_by_driver.csv'
mean_stats_by_driver.to_csv(mean_stats_path, index=False)