# Importing libraries

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime as dt

# Importing the data

In [2]:
# Parse the timestamp columns by name rather than by position, so the load
# fails loudly (instead of silently parsing the wrong column) if the CSV
# column order ever changes.
train_data = pd.read_csv('historical_data.csv',
                         parse_dates=['created_at', 'actual_delivery_time'])
test_data = pd.read_csv('predict_data.csv', parse_dates=['created_at'])

In [3]:
train_data.head()

Unnamed: 0,market_id,created_at,actual_delivery_time,store_id,store_primary_category,order_protocol,total_items,subtotal,num_distinct_items,min_item_price,max_item_price,total_onshift_dashers,total_busy_dashers,total_outstanding_orders,estimated_order_place_duration,estimated_store_to_consumer_driving_duration
0,1.0,2015-02-06 22:24:00,2015-02-06 23:27:00,1845,american,1.0,4,3441,4,557,1239,33.0,14.0,21.0,446,861.0
1,2.0,2015-02-10 21:49:00,2015-02-10 22:56:00,5477,mexican,2.0,1,1900,1,1400,1400,1.0,2.0,2.0,446,690.0
2,3.0,2015-01-22 20:39:00,2015-01-22 21:09:00,5477,,1.0,1,1900,1,1900,1900,1.0,0.0,0.0,446,690.0
3,3.0,2015-02-03 21:21:00,2015-02-03 22:13:00,5477,,1.0,6,6900,5,600,1800,1.0,1.0,2.0,446,289.0
4,3.0,2015-02-15 02:40:00,2015-02-15 03:20:00,5477,,1.0,3,3900,3,1100,1600,6.0,6.0,9.0,446,650.0


In [4]:
test_data.head()

Unnamed: 0,market_id,created_at,store_id,store_primary_category,order_protocol,total_items,subtotal,num_distinct_items,min_item_price,max_item_price,total_onshift_dashers,total_busy_dashers,total_outstanding_orders,estimated_order_place_duration,estimated_store_to_consumer_driving_duration,delivery_id,platform
0,3.0,2015-02-25 02:22:30,5477,,1.0,5,7500,4,800,1800,4.0,4.0,4.0,446,670.0,194096,android
1,3.0,2015-02-25 01:14:19,5477,,1.0,5,7100,4,800,1500,4.0,1.0,1.0,446,446.0,236895,other
2,4.0,2015-02-22 02:27:44,5477,thai,1.0,4,4500,2,750,1500,9.0,7.0,6.0,446,504.0,190868,android
3,3.0,2015-02-25 01:32:39,5477,,1.0,1,1700,1,1400,1400,3.0,3.0,3.0,446,687.0,183076,ios
4,1.0,2015-02-24 05:20:45,2841,italian,1.0,2,3150,2,1525,1625,4.0,4.0,4.0,446,528.0,186200,android


# Data engineering

## Delivery time

In [5]:
# Delivery duration in seconds, from order creation to actual delivery.
# `.dt.seconds` only returns the seconds *component* of a timedelta (0..86399)
# and discards whole days, so e.g. a delta of 1 day + 30 minutes would look
# like a 30-minute delivery. `.dt.total_seconds()` returns the full elapsed time.
train_data['Duration'] = (
    train_data['actual_delivery_time'] - train_data['created_at']
).dt.total_seconds()

# drop the rows with delivery time more than three hours
# (negative durations and rows with a missing delivery time are dropped too:
#  `between` is inclusive on both ends and evaluates to False for NaN)
train_data = train_data[train_data['Duration'].between(0, 10800)]

## Dropping irrelevant columns

In [6]:
# Set the raw timestamps aside before removing them from the modelling frame;
# the order-creation time is reused later for the calendar features.
created_at = train_data['created_at']
actual_delivery_time = train_data['actual_delivery_time']

train_data = train_data.drop(['created_at', 'actual_delivery_time'], axis=1)

## Summary of features

In [7]:
# converting negative values to nan
# The text column is set aside first because it cannot be compared with 0;
# it is re-attached as the last column afterwards.
store_primary_category = train_data['store_primary_category']
numeric_cols = train_data.drop(columns=['store_primary_category'])

# Vectorised replacement for the element-wise `applymap` lambda:
# `where` keeps entries for which the condition holds and writes NaN elsewhere.
# `NaN >= 0` is False, so already-missing entries stay missing.
train_data = numeric_cols.where(numeric_cols >= 0)
train_data['store_primary_category'] = store_primary_category

In [8]:
def ds_summry(Mod_df_2):
    """Build a per-column overview of a DataFrame.

    Parameters
    ----------
    Mod_df_2 : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    pandas.DataFrame
        One row per summarised column (index labels cast to ``str``) with the
        columns ``% missing`` (share of null entries in percent, rounded to
        two decimals), ``No_uniq`` (number of distinct non-null values),
        ``Max_Val`` and ``Min_Val``.
    """
    n_rows = len(Mod_df_2)
    null_share = (Mod_df_2.isnull().sum() / n_rows) * 100
    pct_missing = null_share.apply(lambda pct: round(pct, 2))

    parts = [pct_missing, Mod_df_2.nunique(), Mod_df_2.max(), Mod_df_2.min()]
    # concat of unnamed Series yields integer column labels 0..3; name them.
    labels = {0: "% missing", 1: "No_uniq", 2: 'Max_Val', 3: 'Min_Val'}
    summary = pd.concat(parts, axis=1)
    return summary.rename(index=str, columns=labels)

ds_summry(train_data)


Unnamed: 0,% missing,No_uniq,Max_Val,Min_Val
market_id,0.5,6,6.0,1.0
store_id,0.0,6742,6987.0,1.0
order_protocol,0.5,7,7.0,1.0
total_items,0.0,57,411.0,1.0
subtotal,0.0,8367,27100.0,0.0
num_distinct_items,0.0,20,20.0,1.0
min_item_price,0.01,2300,14700.0,0.0
max_item_price,0.0,2652,14700.0,0.0
total_onshift_dashers,8.25,168,171.0,0.0
total_busy_dashers,8.25,154,154.0,0.0


## Handling categorical data and nan values

In [9]:
print('columns with null values:')
# True for every column that contains at least one missing entry.
train_data.isna().any()

columns with null values:


market_id                                        True
store_id                                        False
order_protocol                                   True
total_items                                     False
subtotal                                        False
num_distinct_items                              False
min_item_price                                   True
max_item_price                                  False
total_onshift_dashers                            True
total_busy_dashers                               True
total_outstanding_orders                         True
estimated_order_place_duration                  False
estimated_store_to_consumer_driving_duration     True
Duration                                        False
store_primary_category                           True
dtype: bool

### market_id

In [10]:
# market_id: adding a new class for missing values:
# (0 is used as the dedicated "unknown market" class)

train_data['market_id'] = train_data['market_id'].fillna(0)

### store_id

In [11]:
# store_id: convert to numerical classes:

from sklearn import preprocessing

# fit_transform learns the set of store ids and maps each one to an integer
# class label in a single call; the fitted encoder is kept in `le_strid`.
le_strid = preprocessing.LabelEncoder()
train_data['store_id_t'] = le_strid.fit_transform(train_data['store_id'])
train_data = train_data.drop(columns=['store_id'])

### order_protocol

In [12]:
# order_protocol: adding a new class for missing values:
# (0 is used as the dedicated "unknown protocol" class)

train_data['order_protocol'] = train_data['order_protocol'].fillna(0)

### min_item_price, total_onshift_dashers, total_busy_dashers, total_outstanding_orders, estimated_store_to_consumer_driving_duration

In [20]:
# impute with median value:
# NOTE(review): the medians are computed on the full frame, i.e. before the
# train/test split further down — confirm this is acceptable for evaluation.

median_imputed_cols = ['min_item_price', 'total_onshift_dashers', 'total_busy_dashers',
                       'total_outstanding_orders',
                       'estimated_store_to_consumer_driving_duration']

for col in median_imputed_cols:
    train_data[col] = train_data[col].fillna(train_data[col].median())

# onshift dashers < busy dashers !!
# Wherever more dashers are reported busy than on shift, raise the on-shift
# count to the busy count so that busy <= onshift holds for every row.
onshift = train_data['total_onshift_dashers']
busy = train_data['total_busy_dashers']
train_data['total_onshift_dashers'] = onshift.mask(busy > onshift, busy)


### store_primary_category

In [39]:
# store_primary_category: adding a new class for missing values and encode to numerical classes:

# Missing categories become the explicit class 'miss' before label encoding;
# the fitted encoder is kept in `le_str_ctg`.
le_str_ctg = preprocessing.LabelEncoder()
train_data['store_primary_category_t'] = le_str_ctg.fit_transform(
    train_data['store_primary_category'].fillna('miss')
)

train_data = train_data.drop(columns=['store_primary_category'])

## Adding new features:

### Day, Hour, Shift

In [41]:
# Busy hours depend on day, hour and shift of the order
# Both features are derived from the order-creation timestamp
# (day of week: Monday=0 ... Sunday=6; hour: 0..23).

train_data['Deliv_Day'] = created_at.dt.dayofweek
train_data['Deliv_hour'] = created_at.dt.hour

def shift(x):
    """Map an hour of the day to a numeric shift code.

    Returns 1 for hours strictly between 6 and 12 (morning), 2 for 12 through
    17 inclusive (lunch), 3 for hours above 17 and below 24 (dinner), and 4
    for every other value (early morning).
    """
    if 6 < x < 12:
        return 1    # 'Morning'
    if 12 <= x <= 17:
        return 2    # 'Lunch'
    if 17 < x < 24:
        return 3    # 'Dinner'
    return 4        # 'Early_Morning'
        
# Encode each order's hour as a shift code (1-4) via `shift`.
train_data['Deliver_shift'] = train_data['Deliv_hour'].map(shift)

### Percentage of free dashers

In [42]:
# % of free dashers at any given time
onshift = train_data['total_onshift_dashers']
busy = train_data['total_busy_dashers']

# Guard on the actual denominator. The previous check (busy + onshift == 0)
# only prevented a division by zero as long as busy <= onshift held for every
# row; testing `onshift == 0` directly yields 0 whenever nobody is on shift
# and never lets an infinite value through.
train_data['%_Dashers_free'] = np.where(onshift == 0, 0,
                                        (1 - (busy / onshift)) * 100)


### Ratio of dashers and orders

In [43]:
# No of free dashers to no of outstanding orders at a given time
free_dashers = train_data['total_onshift_dashers'] - train_data['total_busy_dashers']
outstanding = train_data['total_outstanding_orders']

# The ratio is defined as 0 when there are no outstanding orders.
train_data['Free_Dash/Outdng_Orders'] = np.where(outstanding == 0, 0,
                                                 free_dashers / outstanding)


## Summary of new features

In [44]:
ds_summry(train_data)

Unnamed: 0,% missing,No_uniq,Max_Val,Min_Val
market_id,0.0,7,6.0,0.0
order_protocol,0.0,8,7.0,0.0
total_items,0.0,57,411.0,1.0
subtotal,0.0,8367,27100.0,0.0
num_distinct_items,0.0,20,20.0,1.0
min_item_price,0.0,2300,14700.0,0.0
max_item_price,0.0,2652,14700.0,0.0
total_onshift_dashers,0.0,168,171.0,0.0
total_busy_dashers,0.0,154,154.0,0.0
total_outstanding_orders,0.0,275,285.0,0.0


# Pre-processing data

In [46]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Features are every column except the target; the target is kept as a
# single-column frame.
X = train_data.drop(columns=['Duration'])
y = train_data[['Duration']]

# Breaking up into train/test  set (65% / 35%, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.35, random_state=10
)

# Scaling the X variables: statistics are learned on the train set only
scaler = StandardScaler()
scaler.fit(X_train)

# Models

## Decision Tree

In [79]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Depth- and leaf-size-limited tree; `dtr` is reused further down as the
# base learner of the AdaBoost model.
dtr = DecisionTreeRegressor(max_depth=11, min_samples_leaf=12, random_state=0)
dtr.fit(scaler.transform(X_train), y_train)

# Mean squared error on the held-out split (target is in seconds).
dtr_pred = dtr.predict(scaler.transform(X_test))
error = mean_squared_error(y_test, dtr_pred)

print('Decision Regressor', round(error,2))

Decision Regressor 914554.36


## Random Forest

In [65]:
from sklearn.ensemble import RandomForestRegressor

# Seed the forest like the other models in this notebook so that the reported
# error is reproducible across runs.
rf = RandomForestRegressor(n_estimators=100, random_state=0)
rf.fit(scaler.transform(X_train), y_train)

# Mean squared error on the held-out split (target is in seconds).
error = mean_squared_error(y_test, rf.predict(scaler.transform(X_test)))

print('Random Forest Regressor', round(error,2))

Random Forest Regressor 819392.22


## Gradient Boosting

In [67]:
# Import from the public package: `sklearn.ensemble.gradient_boosting` is a
# private module path that newer scikit-learn releases no longer provide.
from sklearn.ensemble import GradientBoostingRegressor

# The loss is left at its default (least squares / squared error). The
# explicit 'ls' alias used previously is rejected by newer scikit-learn
# releases, while the default means the same thing in old and new versions.
gbr = GradientBoostingRegressor(n_estimators=100, max_depth=11, max_features=12,
                                random_state=0, learning_rate=0.01)
gbr.fit(scaler.transform(X_train), y_train)

# Mean squared error on the held-out split (target is in seconds).
error = mean_squared_error(y_test, gbr.predict(scaler.transform(X_test)))

print('Gradient Boosting Regressor', round(error,2))

Gradient Boosting Regressor 889443.05


## Extra Trees

In [69]:
from sklearn.ensemble import ExtraTreesRegressor

# Seeded ensemble of 200 extremely randomised trees.
etr = ExtraTreesRegressor(n_estimators=200, random_state=0)
etr.fit(scaler.transform(X_train), y_train)

# Mean squared error on the held-out split (target is in seconds).
etr_pred = etr.predict(scaler.transform(X_test))
error = mean_squared_error(y_test, etr_pred)

print('Extra Trees Regressor', round(error,2))

Extra Trees Regressor 799797.83


## AdaBoost

In [84]:
from sklearn.ensemble import AdaBoostRegressor

# The base learner (the decision tree configured above) is passed
# positionally: the keyword was called `base_estimator` in older scikit-learn
# releases and `estimator` in newer ones, but it is the first parameter in both.
ada = AdaBoostRegressor(dtr, n_estimators=100, random_state=0,
                        learning_rate=0.01, loss='square')
ada.fit(scaler.transform(X_train), y_train)

# Mean squared error on the held-out split (target is in seconds).
error = mean_squared_error(y_test, ada.predict(scaler.transform(X_test)))

print('Adaboost Regressor', round(error,2))

Adaboost Regressor 855084.35


## XGBoost

In [87]:
from xgboost import XGBRegressor

# Gradient-boosted trees with the library's default hyper-parameters.
xgb = XGBRegressor()
xgb.fit(scaler.transform(X_train), y_train)

# Mean squared error on the held-out split (target is in seconds).
xgb_pred = xgb.predict(scaler.transform(X_test))
error = mean_squared_error(y_test, xgb_pred)

print('XGBoost Regressor', round(error,2))

XGBoost Regressor 764773.38


## MLP

In [91]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Activation

from tensorflow import keras

def DNN(input_dim=19):
    """Build and compile a small fully connected regression network.

    Architecture: Dense(50, tanh) -> Dense(50, tanh) -> Dense(10, tanh) ->
    Dense(1) with a linear output, compiled with mean-squared-error loss and
    the Adam optimiser.

    Parameters
    ----------
    input_dim : int, default 19
        Number of input features. The default matches the previously
        hard-coded width; pass e.g. ``X_train.shape[1]`` if the feature set
        changes.

    Returns
    -------
    A compiled Keras ``Sequential`` model.
    """
    model = Sequential()
    model.add(Dense(50, input_dim=input_dim, kernel_initializer='GlorotUniform', activation='tanh'))
    model.add(Dense(50, activation='tanh'))
    model.add(Dense(10, activation='tanh'))
    # Single linear unit: the network predicts one continuous value.
    model.add(Dense(1))

    # Compile model
    model.compile(loss='mse', optimizer='adam')
    return model

# Build the network and train it on the standardised training features for
# 100 epochs with mini-batches of 256 rows.
# NOTE(review): no random seed is set for the network here, so the result is
# presumably not reproducible run-to-run — confirm.
model = DNN()
model.fit(scaler.transform(X_train), y_train, epochs=100, batch_size=256)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x1a40ee5b10>

In [92]:
# Mean squared error of the network on the held-out split (target is in seconds).
mlp_pred = model.predict(scaler.transform(X_test))
error = mean_squared_error(y_test, mlp_pred)

print('MLP', round(error,2))

MLP 6504656.46
