### Kaggle Food Delivery Dataset 
https://www.kaggle.com/datasets/gauravmalik26/food-delivery-dataset?resource=download

---
#### What are the KPIs of delivery?
- 1) Right product, 2) Right location, 3) Reasonable delivery time
---

In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the train and test splits, then report (rows, columns) for each.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
print(train.shape, test.shape, sep='\n')

(45593, 20)
(11399, 19)


In [3]:
# Preview the first rows to see the raw format of each column.
train.head()

Unnamed: 0,ID,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Time_Orderd,Time_Order_picked,Weatherconditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken(min)
0,0x4607,INDORES13DEL02,37,4.9,22.745049,75.892471,22.765049,75.912471,19-03-2022,11:30:00,11:45:00,conditions Sunny,High,2,Snack,motorcycle,0,No,Urban,(min) 24
1,0xb379,BANGRES18DEL02,34,4.5,12.913041,77.683237,13.043041,77.813237,25-03-2022,19:45:00,19:50:00,conditions Stormy,Jam,2,Snack,scooter,1,No,Metropolitian,(min) 33
2,0x5d6d,BANGRES19DEL01,23,4.4,12.914264,77.6784,12.924264,77.6884,19-03-2022,08:30:00,08:45:00,conditions Sandstorms,Low,0,Drinks,motorcycle,1,No,Urban,(min) 26
3,0x7a6a,COIMBRES13DEL02,38,4.7,11.003669,76.976494,11.053669,77.026494,05-04-2022,18:00:00,18:10:00,conditions Sunny,Medium,0,Buffet,motorcycle,1,No,Metropolitian,(min) 21
4,0x70a2,CHENRES12DEL01,32,4.6,12.972793,80.249982,13.012793,80.289982,26-03-2022,13:30:00,13:45:00,conditions Cloudy,High,1,Snack,scooter,1,No,Metropolitian,(min) 30


In [4]:
# Column dtypes and non-null counts of the raw training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45593 entries, 0 to 45592
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   ID                           45593 non-null  object 
 1   Delivery_person_ID           45593 non-null  object 
 2   Delivery_person_Age          45593 non-null  object 
 3   Delivery_person_Ratings      45593 non-null  object 
 4   Restaurant_latitude          45593 non-null  float64
 5   Restaurant_longitude         45593 non-null  float64
 6   Delivery_location_latitude   45593 non-null  float64
 7   Delivery_location_longitude  45593 non-null  float64
 8   Order_Date                   45593 non-null  object 
 9   Time_Orderd                  45593 non-null  object 
 10  Time_Order_picked            45593 non-null  object 
 11  Weatherconditions            45593 non-null  object 
 12  Road_traffic_density         45593 non-null  object 
 13  Vehicle_conditio

----
### 1. 숨어있는 결측치 처리 
----

In [5]:
## Converting Delivery_person_Age / Delivery_person_Ratings to int / float raises an error.
## .info() reported no nulls, but in reality many missing values are stored as text ("NaN"),
## which the value counts below list as a category of its own.
print(train.loc[:, 'Delivery_person_Age'].value_counts())

35      2262
36      2260
37      2227
30      2226
38      2219
24      2210
32      2202
22      2196
29      2191
33      2187
28      2179
25      2174
34      2166
26      2159
21      2153
27      2150
39      2144
20      2136
31      2120
23      2087
NaN     1854
50        53
15        38
Name: Delivery_person_Age, dtype: int64


In [6]:
## Turn the text "NaN" markers into real float NaN, then recount missing values per column.
## The first replace is an exact-value match: only cells equal to a single space become "".
train = train.replace(" ", "")
## The pattern is intentionally unanchored, so a cell is nulled whenever its text contains
## "NaN" (presumably this also covers padded or prefixed forms such as "NaN " - TODO confirm).
train = train.replace('NaN', np.nan, regex = True)
train.isna().sum()

ID                                0
Delivery_person_ID                0
Delivery_person_Age            1854
Delivery_person_Ratings        1908
Restaurant_latitude               0
Restaurant_longitude              0
Delivery_location_latitude        0
Delivery_location_longitude       0
Order_Date                        0
Time_Orderd                    1731
Time_Order_picked                 0
Weatherconditions               616
Road_traffic_density            601
Vehicle_condition                 0
Type_of_order                     0
Type_of_vehicle                   0
multiple_deliveries             993
Festival                        228
City                           1200
Time_taken(min)                   0
dtype: int64

In [7]:
## Drop every row that still contains a missing value and renumber the index from 0,
## then confirm that no nulls remain.
train = train.dropna(axis = 0).reset_index(drop=True)
train.isna().sum()

ID                             0
Delivery_person_ID             0
Delivery_person_Age            0
Delivery_person_Ratings        0
Restaurant_latitude            0
Restaurant_longitude           0
Delivery_location_latitude     0
Delivery_location_longitude    0
Order_Date                     0
Time_Orderd                    0
Time_Order_picked              0
Weatherconditions              0
Road_traffic_density           0
Vehicle_condition              0
Type_of_order                  0
Type_of_vehicle                0
multiple_deliveries            0
Festival                       0
City                           0
Time_taken(min)                0
dtype: int64

---
#### 2. Time_Orderd, Time_Order_picked : timedelta로 타입을 변경
---

In [8]:
## Convert the clock-time columns Time_Orderd and Time_Order_picked to timedelta
train = train.assign(
    Time_Orderd=pd.to_timedelta(train['Time_Orderd']),
    Time_Order_picked=pd.to_timedelta(train['Time_Order_picked']),
)

train[['Time_Orderd', 'Time_Order_picked']]

Unnamed: 0,Time_Orderd,Time_Order_picked
0,0 days 11:30:00,0 days 11:45:00
1,0 days 19:45:00,0 days 19:50:00
2,0 days 08:30:00,0 days 08:45:00
3,0 days 18:00:00,0 days 18:10:00
4,0 days 13:30:00,0 days 13:45:00
...,...,...
41363,0 days 11:35:00,0 days 11:45:00
41364,0 days 19:55:00,0 days 20:10:00
41365,0 days 23:50:00,0 days 00:05:00
41366,0 days 13:35:00,0 days 13:40:00


---
#### 3. Weatherconditions 변수의 conditions와 Time_taken(min)의 (min)단어를 제거
---

In [9]:
## Remove the redundant "conditions" prefix from Weatherconditions
## and the "(min)" prefix from Time_taken(min).
## The prefix is matched explicitly instead of slicing at a fixed character offset:
## a value that lacks the prefix is left untouched, so re-running this cell no longer
## chops characters off already-cleaned values.
train['Weatherconditions'] = train['Weatherconditions'].str.replace(r'^conditions\s*', '', regex=True)
train['Time_taken(min)'] = train['Time_taken(min)'].str.replace(r'^\(min\)\s*', '', regex=True)
train[['Weatherconditions', 'Time_taken(min)']]

Unnamed: 0,Weatherconditions,Time_taken(min)
0,Sunny,24
1,Stormy,33
2,Sandstorms,26
3,Sunny,21
4,Cloudy,30
...,...,...
41363,Windy,32
41364,Windy,36
41365,Cloudy,16
41366,Cloudy,26


---
#### 4. 변수들을 실수형과 날짜형으로 변경 
- Delivery_person_Age, Delivery_person_Ratings, Restaurant_latitude, Restaurant_longitude, Delivery_location_latitude, Delivery_location_longitude, Vehicle_condition, multiple_deliveries, Time_taken(min) ===> float
- Order_Date ===> datetime
---

In [10]:
## Cast the numeric columns to float64 and parse Order_Date as a datetime
num_cols = [
    'Delivery_person_Age',
    'Delivery_person_Ratings',
    'Restaurant_latitude',
    'Restaurant_longitude',
    'Delivery_location_latitude',
    'Delivery_location_longitude',
    'Vehicle_condition',
    'multiple_deliveries',
    'Time_taken(min)',
]

# One astype call with a column -> dtype mapping replaces the per-column loop.
train = train.astype({name: 'float64' for name in num_cols})

# Order_Date is written day-first, e.g. 19-03-2022.
train['Order_Date'] = pd.to_datetime(train['Order_Date'], format = '%d-%m-%Y')

In [11]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() only adds a stray "None" line to the output.
train.info()
train.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41368 entries, 0 to 41367
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype          
---  ------                       --------------  -----          
 0   ID                           41368 non-null  object         
 1   Delivery_person_ID           41368 non-null  object         
 2   Delivery_person_Age          41368 non-null  float64        
 3   Delivery_person_Ratings      41368 non-null  float64        
 4   Restaurant_latitude          41368 non-null  float64        
 5   Restaurant_longitude         41368 non-null  float64        
 6   Delivery_location_latitude   41368 non-null  float64        
 7   Delivery_location_longitude  41368 non-null  float64        
 8   Order_Date                   41368 non-null  datetime64[ns] 
 9   Time_Orderd                  41368 non-null  timedelta64[ns]
 10  Time_Order_picked            41368 non-null  timedelta64[ns]
 11  Weatherconditions           

Unnamed: 0,ID,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Time_Orderd,Time_Order_picked,Weatherconditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken(min)
0,0x4607,INDORES13DEL02,37.0,4.9,22.745049,75.892471,22.765049,75.912471,2022-03-19,0 days 11:30:00,0 days 11:45:00,Sunny,High,2.0,Snack,motorcycle,0.0,No,Urban,24.0
1,0xb379,BANGRES18DEL02,34.0,4.5,12.913041,77.683237,13.043041,77.813237,2022-03-25,0 days 19:45:00,0 days 19:50:00,Stormy,Jam,2.0,Snack,scooter,1.0,No,Metropolitian,33.0
2,0x5d6d,BANGRES19DEL01,23.0,4.4,12.914264,77.6784,12.924264,77.6884,2022-03-19,0 days 08:30:00,0 days 08:45:00,Sandstorms,Low,0.0,Drinks,motorcycle,1.0,No,Urban,26.0
3,0x7a6a,COIMBRES13DEL02,38.0,4.7,11.003669,76.976494,11.053669,77.026494,2022-04-05,0 days 18:00:00,0 days 18:10:00,Sunny,Medium,0.0,Buffet,motorcycle,1.0,No,Metropolitian,21.0
4,0x70a2,CHENRES12DEL01,32.0,4.6,12.972793,80.249982,13.012793,80.289982,2022-03-26,0 days 13:30:00,0 days 13:45:00,Cloudy,High,1.0,Snack,scooter,1.0,No,Metropolitian,30.0


---
#### 5. 주문 후 라이더가 음식을 픽업하기까지 걸린 시간을 나타내는 변수 생성
- pick_time에서 order_time을 빼서 주문 시각과 픽업 시각의 차이(orpi_time_diff_m, 분 단위)를 구하기
---

In [12]:
## Build full order / pickup timestamps and compute the gap between them in minutes.
## Caution: an order placed just before midnight can be picked up on the next calendar
## day; in that case the pickup date is shifted by one day with pd.DateOffset(1).
order_day = train['Order_Date']
train['order_time'] = order_day + train['Time_Orderd']

# A pickup clock time earlier than the order clock time means the pickup crossed midnight.
picked_next_day = train['Time_Orderd'] > train['Time_Order_picked']
next_day_pick = order_day + pd.DateOffset(1) + train['Time_Order_picked']
same_day_pick = order_day + train['Time_Order_picked']
train['pick_time'] = np.where(picked_next_day, next_day_pick, same_day_pick)

elapsed = train['pick_time'] - train['order_time']
train['orpi_time_diff_m'] = (elapsed.dt.total_seconds()) / 60

train[['order_time', 'pick_time','orpi_time_diff_m']]

Unnamed: 0,order_time,pick_time,orpi_time_diff_m
0,2022-03-19 11:30:00,2022-03-19 11:45:00,15.0
1,2022-03-25 19:45:00,2022-03-25 19:50:00,5.0
2,2022-03-19 08:30:00,2022-03-19 08:45:00,15.0
3,2022-04-05 18:00:00,2022-04-05 18:10:00,10.0
4,2022-03-26 13:30:00,2022-03-26 13:45:00,15.0
...,...,...,...
41363,2022-03-24 11:35:00,2022-03-24 11:45:00,10.0
41364,2022-02-16 19:55:00,2022-02-16 20:10:00,15.0
41365,2022-03-11 23:50:00,2022-03-12 00:05:00,15.0
41366,2022-03-07 13:35:00,2022-03-07 13:40:00,5.0


In [13]:
## Time_Orderd and Time_Order_picked are no longer needed once order_time / pick_time exist
train = train.drop(columns=['Time_Orderd', 'Time_Order_picked'])
train.head(2)

Unnamed: 0,ID,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Weatherconditions,...,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken(min),order_time,pick_time,orpi_time_diff_m
0,0x4607,INDORES13DEL02,37.0,4.9,22.745049,75.892471,22.765049,75.912471,2022-03-19,Sunny,...,2.0,Snack,motorcycle,0.0,No,Urban,24.0,2022-03-19 11:30:00,2022-03-19 11:45:00,15.0
1,0xb379,BANGRES18DEL02,34.0,4.5,12.913041,77.683237,13.043041,77.813237,2022-03-25,Stormy,...,2.0,Snack,scooter,1.0,No,Metropolitian,33.0,2022-03-25 19:45:00,2022-03-25 19:50:00,5.0


---
#### 7. 레스토랑과 배달지까지의 거리와 속력 구하기
- 위경도를 기준으로 최단거리를 구하기 위해서, haversine 패키지를 사용한다. 
- haversine으로 구한 거리를 소요시간으로 나누어 속력을 구한다. 
---

In [14]:
!pip install haversine
from haversine import haversine




[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [15]:
# Haversine distance in km between each restaurant and its delivery location,
# rounded to one decimal place.
# All distances are computed first and assigned as one column. The previous
# element-wise `train['distance_km'][i] = ...` is chained assignment: pandas warns
# about it (hidden here by warnings.filterwarnings) and it no longer updates the
# frame when Copy-on-Write is enabled. Assigning a list of floats also gives the
# column a float dtype instead of the object dtype left by the '' placeholder.
restaurant_points = zip(train['Restaurant_latitude'], train['Restaurant_longitude'])
delivery_points = zip(train['Delivery_location_latitude'], train['Delivery_location_longitude'])
train['distance_km'] = [
    round(haversine(restaurant, order, unit = 'km'), 1)
    for restaurant, order in zip(restaurant_points, delivery_points)
]

train[['Restaurant_latitude', 'Restaurant_longitude', 'Delivery_location_latitude', 'Delivery_location_longitude', 'distance_km']]

Unnamed: 0,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,distance_km
0,22.745049,75.892471,22.765049,75.912471,3.0
1,12.913041,77.683237,13.043041,77.813237,20.2
2,12.914264,77.678400,12.924264,77.688400,1.6
3,11.003669,76.976494,11.053669,77.026494,7.8
4,12.972793,80.249982,13.012793,80.289982,6.2
...,...,...,...,...,...
41363,26.902328,75.794257,26.912328,75.804257,1.5
41364,0.000000,0.000000,0.070000,0.070000,11.0
41365,13.022394,80.242439,13.052394,80.272439,4.7
41366,11.001753,76.986241,11.041753,77.026241,6.2


In [16]:
# Average speed in km/h: distance (km) divided by delivery time (minutes), times 60,
# rounded to one decimal place.
# The rounding is vectorized with Series.round; the previous per-row
# `train['speed_km/h'][i] = ...` was chained assignment, which does not update the
# frame when Copy-on-Write is enabled.
# astype('float64') keeps the division numeric even if distance_km is stored as object.
train['speed_km/h'] = (train['distance_km'].astype('float64') / train['Time_taken(min)'] * 60).round(1)

train[['distance_km', 'Time_taken(min)', 'speed_km/h']]

Unnamed: 0,distance_km,Time_taken(min),speed_km/h
0,3.0,24.0,7.5
1,20.2,33.0,36.7
2,1.6,26.0,3.7
3,7.8,21.0,22.3
4,6.2,30.0,12.4
...,...,...,...
41363,1.5,32.0,2.8
41364,11.0,36.0,18.3
41365,4.7,16.0,17.6
41366,6.2,26.0,14.3


---
#### 8. 요일과 city code 구하기
- Delivery_person_ID에서 'RES' 앞부분이 city_code임 (예: INDORES13DEL02 → INDO)
---

In [17]:
# Series.dt.weekday numbers the days 0 (Monday) through 6 (Sunday); the numbers are
# turned into strings and then swapped for three-letter labels in a single replace.
weekday_labels = {
    '0': 'MON',
    '1': 'TUE',
    '2': 'WED',
    '3': 'THU',
    '4': 'FRI',
    '5': 'SAT',
    '6': 'SUN',
}
train['dayoftheweek'] = train['Order_Date'].dt.weekday.astype('str').replace(weekday_labels)
train[['Order_Date', 'dayoftheweek']]

Unnamed: 0,Order_Date,dayoftheweek
0,2022-03-19,SAT
1,2022-03-25,FRI
2,2022-03-19,SAT
3,2022-04-05,TUE
4,2022-03-26,SAT
...,...,...
41363,2022-03-24,THU
41364,2022-02-16,WED
41365,2022-03-11,FRI
41366,2022-03-07,MON


In [18]:
# The city code is the part of Delivery_person_ID before the first 'RES'
# (e.g. INDORES13DEL02 -> INDO).
id_parts = train['Delivery_person_ID'].str.split('RES')
train['city_code'] = id_parts.str[0]
train[['Delivery_person_ID', 'city_code']]

Unnamed: 0,Delivery_person_ID,city_code
0,INDORES13DEL02,INDO
1,BANGRES18DEL02,BANG
2,BANGRES19DEL01,BANG
3,COIMBRES13DEL02,COIMB
4,CHENRES12DEL01,CHEN
...,...,...
41363,JAPRES04DEL01,JAP
41364,AGRRES16DEL01,AGR
41365,CHENRES08DEL03,CHEN
41366,COIMBRES11DEL01,COIMB


In [19]:
## cf. the full output of str.split(..., expand=True): one column per split part
train['Delivery_person_ID'].str.split('RES', expand=True)

Unnamed: 0,0,1
0,INDO,13DEL02
1,BANG,18DEL02
2,BANG,19DEL01
3,COIMB,13DEL02
4,CHEN,12DEL01
...,...,...
41363,JAP,04DEL01
41364,AGR,16DEL01
41365,CHEN,08DEL03
41366,COIMB,11DEL01


In [20]:
# Review the column dtypes and non-null counts of the processed frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41368 entries, 0 to 41367
Data columns (total 25 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   ID                           41368 non-null  object        
 1   Delivery_person_ID           41368 non-null  object        
 2   Delivery_person_Age          41368 non-null  float64       
 3   Delivery_person_Ratings      41368 non-null  float64       
 4   Restaurant_latitude          41368 non-null  float64       
 5   Restaurant_longitude         41368 non-null  float64       
 6   Delivery_location_latitude   41368 non-null  float64       
 7   Delivery_location_longitude  41368 non-null  float64       
 8   Order_Date                   41368 non-null  datetime64[ns]
 9   Weatherconditions            41368 non-null  object        
 10  Road_traffic_density         41368 non-null  object        
 11  Vehicle_condition            41368 non-nu

In [21]:
# Save the processed frame to data/df.pkl in pickle format.
train.to_pickle('data/df.pkl')