In [1]:
import pandas as pd
import numpy as np
import random

from tqdm.notebook import tqdm

In [26]:
# !pip install geopy
!pip install ipywidgets

from geopy.distance import geodesic

Collecting ipywidgets
  Downloading ipywidgets-8.0.4-py3-none-any.whl (137 kB)
Collecting widgetsnbextension~=4.0
  Downloading widgetsnbextension-4.0.5-py3-none-any.whl (2.0 MB)
Collecting jupyterlab-widgets~=3.0
  Downloading jupyterlab_widgets-3.0.5-py3-none-any.whl (384 kB)
Installing collected packages: widgetsnbextension, jupyterlab-widgets, ipywidgets
Successfully installed ipywidgets-8.0.4 jupyterlab-widgets-3.0.5 widgetsnbextension-4.0.5


In [27]:
import warnings
# Silence every warning category for the rest of the session. This also hides
# warnings raised by pandas and numpy, so library-reported problems will not be
# shown while this filter is active.
warnings.filterwarnings('ignore')

In [28]:
# Load the London postcode reference table from a comma-separated file.
df = pd.read_csv(
    "practicum2/data/london_postcodes.csv",
    sep=',',
)

In [29]:
# Dimensions of the postcode table as (rows, columns).
df.shape

(327525, 53)

In [30]:
# Preview the first rows of the postcode table.
df.head()

Unnamed: 0,Postcode,In Use?,Latitude,Longitude,Easting,Northing,Grid Ref,County,District,Ward,...,Police force,Water company,Plus Code,Average Income,Sewage Company,Travel To Work Area,ITL level 2,ITL level 3,UPRNs,Distance to sea
0,BR1 1AA,Yes,51.401546,0.015415,540291,168873,TQ402688,Greater London,Bromley,Bromley Town,...,Metropolitan Police,Thames Water,9F32C228+J5,63100,,London,Outer London - South,Bromley,"10070014435,10070014436,10070014437,1007001443...",28.073
1,BR1 1AB,Yes,51.406333,0.015208,540262,169405,TQ402694,Greater London,Bromley,Bromley Town,...,Metropolitan Police,Thames Water,9F32C248+G3,56100,,London,Outer London - South,Bromley,"10070008860,10070008861,10070008862,1007000886...",27.9776
2,BR1 1AD,No,51.400057,0.016715,540386,168710,TQ403687,Greater London,Bromley,Bromley Town,...,Metropolitan Police,,9F32C228+2M,63100,,London,Outer London - South,Bromley,,28.0211
3,BR1 1AE,Yes,51.404543,0.014195,540197,169204,TQ401692,Greater London,Bromley,Bromley Town,...,Metropolitan Police,Thames Water,9F32C237+RM,63100,,London,Outer London - South,Bromley,"10003640209,10070000614,10070002658,1007000265...",28.0861
4,BR1 1AF,Yes,51.401392,0.014948,540259,168855,TQ402688,Greater London,Bromley,Bromley Town,...,Metropolitan Police,Thames Water,9F32C227+HX,63100,,London,Outer London - South,Bromley,"10070014484,10070014485,10070014486,1007001448...",28.1083


In [31]:
# Per-column summary of the postcode table: non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 327525 entries, 0 to 327524
Data columns (total 53 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   Postcode                        327525 non-null  object 
 1   In Use?                         327525 non-null  object 
 2   Latitude                        327525 non-null  float64
 3   Longitude                       327525 non-null  float64
 4   Easting                         327525 non-null  int64  
 5   Northing                        327525 non-null  int64  
 6   Grid Ref                        327525 non-null  object 
 7   County                          327525 non-null  object 
 8   District                        327525 non-null  object 
 9   Ward                            327525 non-null  object 
 10  District Code                   327525 non-null  object 
 11  Ward Code                       327525 non-null  object 
 12  Country         

In [32]:
# Fixed seed for the global NumPy and Python random generators, so that draws
# made through np.random.* and random.* are repeatable when the notebook is run
# top to bottom on a fresh kernel.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# Number of synthetic rides to generate.
NUM_RIDES = 5000000

# Empty frame that fixes the column set and column order of the generated data;
# the columns are filled in by the cells that follow.
rides = pd.DataFrame(columns=[
    'driver_id', 'client_id',
    'start', 'start_latitude', 'start_longtitude',
    'finish', 'finish_latitude', 'finish_longtitude',
    'distance', 'road_time', 'start_time', 'finish_time', 'cost',
    'driver_rate', 'category_driver_feedback', 'text_driver_feedback',
    'client_rate', 'category_client_feedback', 'text_client_feedback',
])

Driver and client IDs

In [33]:
# Give every ride a random driver id in [0, 2500) and a random client id in [0, 4500).
rides['driver_id'] = np.random.randint(0, 2500, NUM_RIDES)
rides['client_id'] = np.random.randint(0, 4500, NUM_RIDES)

Start and finish points

In [34]:
# Pick-up point of every ride: a postcode drawn at random, with replacement,
# from the postcode table. reset_index(drop=True) gives the sample a fresh
# 0..NUM_RIDES-1 index before it is assigned to the three start columns.
rides[['start', 'start_latitude', 'start_longtitude']] = (
    df[['Postcode', 'Latitude', 'Longitude']]
    .sample(n=NUM_RIDES, replace=True)
    .reset_index(drop=True)
)

In [35]:
# Drop-off point of every ride: a postcode drawn at random, with replacement,
# from the postcode table. reset_index(drop=True) gives the sample a fresh
# 0..NUM_RIDES-1 index before it is assigned to the three finish columns.
rides[['finish', 'finish_latitude', 'finish_longtitude']] = (
    df[['Postcode', 'Latitude', 'Longitude']]
    .sample(n=NUM_RIDES, replace=True)
    .reset_index(drop=True)
)

Start time

In [36]:
def random_dates(start, end, n=10):
    """Draw ``n`` random timestamps with one-second resolution from ``[start, end)``.

    Parameters
    ----------
    start, end : pandas.Timestamp
        Bounds of the interval. Their ``.value`` (integer nanoseconds since the
        epoch) is floor-divided down to whole seconds; ``start`` is inclusive
        and ``end`` is exclusive.
    n : int, default 10
        Number of timestamps to draw.

    Returns
    -------
    pandas.DatetimeIndex
        ``n`` timestamps, in the order they were drawn (not sorted).

    Raises
    ------
    ValueError
        Raised by ``np.random.randint`` when the interval contains no whole
        second (``start`` is not earlier than ``end``).
    """
    start_u = start.value // 10**9
    end_u = end.value // 10**9
    # Ask for 64-bit integers explicitly: the default integer type of
    # np.random.randint is platform dependent and can be 32-bit, which cannot
    # represent epoch seconds after January 2038.
    seconds = np.random.randint(start_u, end_u, size=n, dtype=np.int64)
    return pd.to_datetime(seconds, unit='s')

# Ride start times are drawn from the ten years 2010-01-01 (inclusive) to
# 2020-01-01 (exclusive).
start, end = (pd.to_datetime(day) for day in ('2010-01-01', '2020-01-01'))
rides['start_time'] = random_dates(start, end, n=NUM_RIDES)

Distance between start and finish points

In [38]:
# Geodesic distance in kilometres between the start and finish coordinates of
# each ride, computed row by row and then rounded to two decimals.
rides['distance'] = [
    geodesic(origin, destination).km
    for origin, destination in zip(
        zip(rides['start_latitude'], rides['start_longtitude']),
        zip(rides['finish_latitude'], rides['finish_longtitude']),
    )
]
rides['distance'] = rides['distance'].round(2)

Calculate road time

In [39]:
# Road time = |N(0, 10)| random delay + distance * |N(1, 0.25)| random factor.
# The sum is truncated to an integer and stored as a timedelta in minutes.
rides['road_time'] = (
    np.abs(np.random.normal(scale=10, size=NUM_RIDES))
    + rides['distance'] * np.abs(np.random.normal(loc=1, scale=0.25, size=NUM_RIDES))
)
rides['road_time'] = pd.to_timedelta(rides['road_time'].astype('int'), unit='m')

Calculate finish time

In [40]:
# A ride finishes at its start time plus its road time.
rides['finish_time'] = rides.start_time + rides.road_time

Calculate cost of the ride

In [43]:
def count_cost(start_time, distance):
    """Return the unrounded fare for one ride.

    The base fare is ``2 + 0.5 * distance``. It is multiplied by 1.5 when the
    start hour is 8, 9, 18 or 19, and by 1.3 when the start hour is 22 or
    later, or 6 or earlier.

    Parameters
    ----------
    start_time : object with an integer ``hour`` attribute
        Moment the ride starts (e.g. ``datetime`` or ``pandas.Timestamp``).
    distance : number
        Length of the ride.

    Returns
    -------
    number
        The fare after any surcharge has been applied.
    """
    hour = start_time.hour
    fare = 2 + 0.5 * distance

    in_rush_hour = 8 <= hour <= 9 or 18 <= hour <= 19
    at_night = not (6 < hour < 22)

    if in_rush_hour:
        fare *= 1.5
    if at_night:
        fare *= 1.3
    return fare
    
# Price every ride from its start time and distance, then round to two decimals.
rides['cost'] = list(map(count_cost, rides['start_time'], rides['distance']))
rides['cost'] = rides['cost'].round(2)

Exception ignored in: <function tqdm.__del__ at 0x00000245251845E0>
Traceback (most recent call last):
  File "c:\Users\wsu\anaconda3\envs\env-main\lib\site-packages\tqdm\std.py", line 1147, in __del__
    self.close()
  File "c:\Users\wsu\anaconda3\envs\env-main\lib\site-packages\tqdm\notebook.py", line 286, in close
    self.disp(bar_style='danger', check_delay=False)
AttributeError: 'tqdm_notebook' object has no attribute 'disp'


Driver ratings

In [44]:
# Rides that receive a driver rating. Row labels are drawn with replacement, so
# the same ride can be picked more than once.
driver_rate_idx = np.random.randint(low=0, high=NUM_RIDES, size=int(NUM_RIDES*0.3))
# One multinomial trial per selected ride over the five rating values; the
# listed probabilities belong to ratings 1..5 in that order.
driver_rate_distribution_arr = np.random.multinomial(1, [0.2, 0.05, 0.1, 0.25, 0.4], size=driver_rate_idx.size)
# Each trial row is one-hot, so argmax is the position of the single 1 and the
# rating is that position + 1. Written through .loc so the assignment targets
# `rides` itself rather than a temporary column object (chained assignment).
rides.loc[driver_rate_idx, 'driver_rate'] = driver_rate_distribution_arr.argmax(axis=1) + 1

In [45]:
# Feedback labels a client can attach to a driver, split by sentiment.
driver_feedback_categories_good = [
    'great service',
    'nice car',
    'wonderful companion',
    'neat and tidy',
    'expert navigation',
    'recommend',
]
driver_feedback_categories_bad = [
    'awful service',
    'bad car',
    'unpleasant companion',
    'dirty',
    'non-expert navigation',
    'not recommend',
]

In [46]:
# Good feedback: sample (with replacement) among rides whose driver rating is
# above 3 and give each sampled ride one random "good" category.
# rides.index[mask] reads the matching labels without first building a
# filtered copy of the whole frame.
category_driver_good_feedback_idx = np.random.choice(
    rides.index[(rides.driver_rate > 3).to_numpy()], size=int(NUM_RIDES*0.3*0.2)
)
# .loc writes into `rides` directly; the chained form rides[col][idx] = ...
# assigns through a temporary column object instead.
rides.loc[category_driver_good_feedback_idx, "category_driver_feedback"] = np.random.choice(
    driver_feedback_categories_good, size=category_driver_good_feedback_idx.size
)

# Bad feedback: same procedure among rides whose driver rating is below 4.
category_driver_bad_feedback_idx = np.random.choice(
    rides.index[(rides.driver_rate < 4).to_numpy()], size=int(NUM_RIDES*0.3*0.2)
)
rides.loc[category_driver_bad_feedback_idx, "category_driver_feedback"] = np.random.choice(
    driver_feedback_categories_bad, size=category_driver_bad_feedback_idx.size
)

In [47]:
# Free-text driver feedback is modelled as a random subset of the category
# labels: anything from an empty list up to all labels, without repetition.
text_good_feedback_driver_length = np.random.randint(
    low=0, high=len(driver_feedback_categories_good) + 1, size=int(NUM_RIDES*0.3*0.2)
)
text_good_feedback_driver_sample = [random.sample(driver_feedback_categories_good, i) for i in text_good_feedback_driver_length]
# Each cell receives a Python list. The lists are packed into a 1-D object
# array so that pandas stores one list per row instead of interpreting the
# nested lists as a 2-D value, and .loc writes into `rides` directly rather
# than through a temporary column object (chained assignment).
rides.loc[category_driver_good_feedback_idx, 'text_driver_feedback'] = pd.Series(
    text_good_feedback_driver_sample, dtype=object
).to_numpy()

text_bad_feedback_driver_length = np.random.randint(
    low=0, high=len(driver_feedback_categories_bad) + 1, size=int(NUM_RIDES*0.3*0.2)
)
text_bad_feedback_driver_sample = [random.sample(driver_feedback_categories_bad, i) for i in text_bad_feedback_driver_length]
rides.loc[category_driver_bad_feedback_idx, 'text_driver_feedback'] = pd.Series(
    text_bad_feedback_driver_sample, dtype=object
).to_numpy()

Client ratings

In [48]:
# Rides that receive a client rating. Row labels are drawn with replacement, so
# the same ride can be picked more than once.
client_rate_idx = np.random.randint(low=0, high=NUM_RIDES, size=int(NUM_RIDES*0.5))
# One multinomial trial per selected ride over the five rating values; the
# listed probabilities belong to ratings 1..5 in that order.
client_rate_distribution_arr = np.random.multinomial(1, [0.2, 0.05, 0.1, 0.25, 0.4], size=client_rate_idx.size)
# Each trial row is one-hot, so argmax is the position of the single 1 and the
# rating is that position + 1. Written through .loc so the assignment targets
# `rides` itself rather than a temporary column object (chained assignment).
rides.loc[client_rate_idx, 'client_rate'] = client_rate_distribution_arr.argmax(axis=1) + 1

In [49]:
# Feedback labels a driver can attach to a client, split by sentiment.
client_feedback_categories_good = [
    'polite',
    'pleasant',
    'quiet',
    'neat and tidy',
    'recommend',
]
client_feedback_categories_bad = [
    'unpolite',
    'unpleasant',
    'loud',
    'dirty',
    'not recommend',
]

In [50]:
# Good feedback: sample (with replacement) among rides whose client rating is
# above 3 and give each sampled ride one random "good" category.
# rides.index[mask] reads the matching labels without first building a
# filtered copy of the whole frame.
category_client_good_feedback_idx = np.random.choice(
    rides.index[(rides.client_rate > 3).to_numpy()], size=int(NUM_RIDES*0.3*0.2)
)
# .loc writes into `rides` directly; the chained form rides[col][idx] = ...
# assigns through a temporary column object instead.
rides.loc[category_client_good_feedback_idx, "category_client_feedback"] = np.random.choice(
    client_feedback_categories_good, size=category_client_good_feedback_idx.size
)

# Bad feedback: same procedure among rides whose client rating is below 4.
category_client_bad_feedback_idx = np.random.choice(
    rides.index[(rides.client_rate < 4).to_numpy()], size=int(NUM_RIDES*0.3*0.2)
)
rides.loc[category_client_bad_feedback_idx, "category_client_feedback"] = np.random.choice(
    client_feedback_categories_bad, size=category_client_bad_feedback_idx.size
)

In [51]:
# Free-text client feedback is modelled as a random subset of the category
# labels: anything from an empty list up to all labels, without repetition.
text_good_feedback_client_length = np.random.randint(
    low=0, high=len(client_feedback_categories_good) + 1, size=int(NUM_RIDES*0.3*0.2)
)
text_good_feedback_client_sample = [random.sample(client_feedback_categories_good, i) for i in text_good_feedback_client_length]
# Each cell receives a Python list. The lists are packed into a 1-D object
# array so that pandas stores one list per row instead of interpreting the
# nested lists as a 2-D value, and .loc writes into `rides` directly rather
# than through a temporary column object (chained assignment).
rides.loc[category_client_good_feedback_idx, 'text_client_feedback'] = pd.Series(
    text_good_feedback_client_sample, dtype=object
).to_numpy()

# Bad texts are drawn from the *bad* labels and written to the *bad* rows.
# (Sampling the good labels and writing to the good rows here would overwrite
# the good texts assigned above and leave the bad rows without any text.)
text_bad_feedback_client_length = np.random.randint(
    low=0, high=len(client_feedback_categories_bad) + 1, size=int(NUM_RIDES*0.3*0.2)
)
text_bad_feedback_client_sample = [random.sample(client_feedback_categories_bad, i) for i in text_bad_feedback_client_length]
rides.loc[category_client_bad_feedback_idx, 'text_client_feedback'] = pd.Series(
    text_bad_feedback_client_sample, dtype=object
).to_numpy()

In [52]:
# Preview the first rows of the generated rides table.
rides.head()

Unnamed: 0,driver_id,client_id,start,start_latitude,start_longtitude,finish,finish_latitude,finish_longtitude,distance,road_time,start_time,finish_time,cost,driver_rate,category_driver_feedback,text_driver_feedback,client_rate,category_client_feedback,text_client_feedback
0,263,4164,CR8 2EQ,51.328882,-0.111292,SW14 7ES,51.466422,-0.267102,18.75,0 days 00:19:00,2012-07-02 15:19:14,2012-07-02 15:38:14,11.38,5.0,,,5.0,,
1,271,2153,SE15 6XL,51.471766,-0.070925,SW9 7SH,51.469102,-0.10663,2.5,0 days 00:14:00,2015-08-02 09:02:38,2015-08-02 09:16:38,4.88,1.0,,,,,
2,1288,2624,SW3 2ET,51.496215,-0.168982,E17 5RL,51.593946,-0.03181,14.45,0 days 00:27:00,2014-01-21 11:15:26,2014-01-21 11:42:26,9.22,,,,,,
3,821,2717,N16 5GZ,51.567659,-0.07342,CR2 3NA,51.34216,-0.130647,25.4,0 days 00:26:00,2012-11-09 16:47:47,2012-11-09 17:13:47,14.7,,,,5.0,quiet,[recommend]
4,1292,1709,IG3 8BE,51.562887,0.09585,SW1V 2WB,51.496429,-0.137437,17.8,0 days 00:26:00,2017-12-28 11:24:50,2017-12-28 11:50:50,10.9,,,,,,


In [53]:
# Persist the generated rides. No `index` argument is passed, so pandas' default
# applies and the row index is written as the first, unnamed column.
# NOTE(review): confirm that downstream readers expect that extra column.
rides.to_csv("practicum2/data/rides.csv")