# Delivery Duration Prediction

In [55]:
import warnings
warnings.filterwarnings('ignore')

In [56]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Loading Dataset

In [57]:
df = pd.read_csv('Historical_Cleaned_Dataset.csv')

In [58]:
df.columns = df.columns.str.upper()
df.columns

Index(['CREATED_AT', 'ACTUAL_DELIVERY_TIME', 'STORE_PRIMARY_CATEGORY',
       'STORE_ID', 'TOTAL_ITEMS', 'SUBTOTAL', 'NUM_DISTINCT_ITEMS',
       'MIN_ITEM_PRICE', 'MAX_ITEM_PRICE', 'ESTIMATED_ORDER_PLACE_DURATION',
       'MARKET_ID', 'ORDER_PROTOCOL', 'TOTAL_ONSHIFT_DASHERS',
       'TOTAL_BUSY_DASHERS', 'TOTAL_OUTSTANDING_ORDERS',
       'ESTIMATED_STORE_TO_CONSUMER_DRIVING_DURATION',
       'DELIVERY_DURATION_SEC'],
      dtype='object')

In [59]:
df.head()

Unnamed: 0,CREATED_AT,ACTUAL_DELIVERY_TIME,STORE_PRIMARY_CATEGORY,STORE_ID,TOTAL_ITEMS,SUBTOTAL,NUM_DISTINCT_ITEMS,MIN_ITEM_PRICE,MAX_ITEM_PRICE,ESTIMATED_ORDER_PLACE_DURATION,MARKET_ID,ORDER_PROTOCOL,TOTAL_ONSHIFT_DASHERS,TOTAL_BUSY_DASHERS,TOTAL_OUTSTANDING_ORDERS,ESTIMATED_STORE_TO_CONSUMER_DRIVING_DURATION,DELIVERY_DURATION_SEC
0,0 days 22:24:17,0 days 23:27:16,american,1845,4,3441,4,557,1239,446,1.0,1.0,33.0,14.0,21.0,861.0,3779.0
1,0 days 21:49:25,0 days 22:56:29,mexican,5477,1,1900,1,1400,1400,446,2.0,2.0,1.0,2.0,2.0,690.0,4024.0
2,0 days 20:39:28,0 days 21:09:09,Unknown,5477,1,1900,1,1900,1900,446,3.0,1.0,1.0,0.0,0.0,690.0,1781.0
3,0 days 21:21:45,0 days 22:13:00,Unknown,5477,6,6900,5,600,1800,446,3.0,1.0,1.0,1.0,2.0,289.0,3075.0
4,0 days 02:40:36,0 days 03:20:26,Unknown,5477,3,3900,3,1100,1600,446,3.0,1.0,6.0,6.0,9.0,650.0,2390.0


### Data quality checks

In [60]:
df.shape

(183610, 17)

In [61]:
df['CREATED_AT'] = pd.to_timedelta(df['CREATED_AT'])
df['ACTUAL_DELIVERY_TIME'] = pd.to_timedelta(df['ACTUAL_DELIVERY_TIME'])

df['CREATED_AT'] = df['CREATED_AT'].dt.total_seconds()
df['ACTUAL_DELIVERY_TIME'] = df['ACTUAL_DELIVERY_TIME'].dt.total_seconds()

In [62]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183610 entries, 0 to 183609
Data columns (total 17 columns):
 #   Column                                        Non-Null Count   Dtype  
---  ------                                        --------------   -----  
 0   CREATED_AT                                    183610 non-null  float64
 1   ACTUAL_DELIVERY_TIME                          183610 non-null  float64
 2   STORE_PRIMARY_CATEGORY                        183610 non-null  object 
 3   STORE_ID                                      183610 non-null  int64  
 4   TOTAL_ITEMS                                   183610 non-null  int64  
 5   SUBTOTAL                                      183610 non-null  int64  
 6   NUM_DISTINCT_ITEMS                            183610 non-null  int64  
 7   MIN_ITEM_PRICE                                183610 non-null  int64  
 8   MAX_ITEM_PRICE                                183610 non-null  int64  
 9   ESTIMATED_ORDER_PLACE_DURATION                18

In [63]:
df.duplicated().sum()

np.int64(0)

In [64]:
df = df.drop_duplicates()

In [65]:
missing_counts = df.isna().sum()
missing_percentage = (missing_counts/df.shape[0]*100).round(2)

In [66]:
missing_summary = pd.DataFrame({
    "missing_counts": missing_counts,
    "missing_percentage": missing_percentage
})
missing_summary

Unnamed: 0,missing_counts,missing_percentage
CREATED_AT,0,0.0
ACTUAL_DELIVERY_TIME,0,0.0
STORE_PRIMARY_CATEGORY,0,0.0
STORE_ID,0,0.0
TOTAL_ITEMS,0,0.0
SUBTOTAL,0,0.0
NUM_DISTINCT_ITEMS,0,0.0
MIN_ITEM_PRICE,0,0.0
MAX_ITEM_PRICE,0,0.0
ESTIMATED_ORDER_PLACE_DURATION,0,0.0


In [67]:
df.shape

(183610, 17)

**There is no duplicated & missing data**

### Separating X and Y Features

In [68]:
X = df.drop(columns=['DELIVERY_DURATION_SEC'], axis=1)
Y = df['DELIVERY_DURATION_SEC']

In [69]:
X.shape

(183610, 16)

In [70]:
Y.shape

(183610,)

### Data Cleaning & Data preprocessing

In [71]:
cat  = X.select_dtypes(include=['object']).columns.tolist()
con  = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

In [72]:
print("Categorical:", cat)
print("Numerical:", con)

Categorical: ['STORE_PRIMARY_CATEGORY']
Numerical: ['CREATED_AT', 'ACTUAL_DELIVERY_TIME', 'STORE_ID', 'TOTAL_ITEMS', 'SUBTOTAL', 'NUM_DISTINCT_ITEMS', 'MIN_ITEM_PRICE', 'MAX_ITEM_PRICE', 'ESTIMATED_ORDER_PLACE_DURATION', 'MARKET_ID', 'ORDER_PROTOCOL', 'TOTAL_ONSHIFT_DASHERS', 'TOTAL_BUSY_DASHERS', 'TOTAL_OUTSTANDING_ORDERS', 'ESTIMATED_STORE_TO_CONSUMER_DRIVING_DURATION']


In [73]:
from sklearn.preprocessing import StandardScaler,OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [74]:
num_pipe = make_pipeline(SimpleImputer(strategy="constant", fill_value=0),StandardScaler())  
cat_pipe = make_pipeline(SimpleImputer(strategy="constant", fill_value="Unknown"),OrdinalEncoder())

In [75]:
# combine both the pipelines
pre = ColumnTransformer([
    ('cat',cat_pipe,cat),
    ('con',num_pipe,con)
]).set_output(transform="pandas")

In [76]:
pre

0,1,2
,transformers,"[('cat', ...), ('con', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'Unknown'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,0
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [77]:
X_pre = pre.fit_transform(X)
X_pre.head()

Unnamed: 0,cat__STORE_PRIMARY_CATEGORY,con__CREATED_AT,con__ACTUAL_DELIVERY_TIME,con__STORE_ID,con__TOTAL_ITEMS,con__SUBTOTAL,con__NUM_DISTINCT_ITEMS,con__MIN_ITEM_PRICE,con__MAX_ITEM_PRICE,con__ESTIMATED_ORDER_PLACE_DURATION,con__MARKET_ID,con__ORDER_PROTOCOL,con__TOTAL_ONSHIFT_DASHERS,con__TOTAL_BUSY_DASHERS,con__TOTAL_OUTSTANDING_ORDERS,con__ESTIMATED_STORE_TO_CONSUMER_DRIVING_DURATION
0,5.0,1.638237,1.681135,-0.820844,0.306728,0.428651,0.827406,-0.249256,0.146023,1.540413,-1.30998,-1.254474,-0.253733,-0.759386,-0.625825,1.459855
1,48.0,1.569334,1.619985,0.947989,-0.820652,-0.42768,-1.027932,1.372352,0.436362,1.540413,-0.65214,-0.589196,-1.155069,-1.122806,-0.984146,0.677557
2,0.0,1.431101,1.40677,0.947989,-0.820652,-0.42768,-1.027932,2.33416,1.338036,1.540413,0.0057,-1.254474,-1.155069,-1.183376,-1.021864,0.677557
3,0.0,1.51466,1.533606,0.947989,1.058314,2.35081,1.445852,-0.166541,1.157701,1.540413,0.0057,-1.254474,-1.155069,-1.153091,-0.984146,-1.156953
4,0.0,-0.700925,-0.716203,0.947989,-0.069066,0.683716,0.20896,0.795267,0.797032,1.540413,0.0057,-1.254474,-1.014235,-1.001666,-0.852133,0.494564


In [78]:
X_pre.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183610 entries, 0 to 183609
Data columns (total 16 columns):
 #   Column                                             Non-Null Count   Dtype  
---  ------                                             --------------   -----  
 0   cat__STORE_PRIMARY_CATEGORY                        183610 non-null  float64
 1   con__CREATED_AT                                    183610 non-null  float64
 2   con__ACTUAL_DELIVERY_TIME                          183610 non-null  float64
 3   con__STORE_ID                                      183610 non-null  float64
 4   con__TOTAL_ITEMS                                   183610 non-null  float64
 5   con__SUBTOTAL                                      183610 non-null  float64
 6   con__NUM_DISTINCT_ITEMS                            183610 non-null  float64
 7   con__MIN_ITEM_PRICE                                183610 non-null  float64
 8   con__MAX_ITEM_PRICE                                183610 non-null  float6

In [79]:
X_pre['cat__STORE_PRIMARY_CATEGORY'].unique()

array([ 5., 48.,  0., 37., 40., 60., 70., 16., 59., 57., 21., 62., 14.,
       11., 47., 41., 35., 19., 30., 24., 52., 42., 72.,  7.,  8., 29.,
       25., 63., 61., 74., 17., 67., 49., 64., 73., 55., 51., 68., 44.,
       36., 22., 15., 12., 54.,  3., 26., 56., 71., 46.,  1., 13., 33.,
       31., 18., 34., 23., 27., 32., 53., 50., 66., 65., 69., 58., 10.,
       28., 20.,  2.,  6., 43., 39., 45.,  9., 38.,  4.])

### Feature Selection: Forward Selection

In [80]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LinearRegression

In [81]:
base_model = LinearRegression()
for_sel = SequentialFeatureSelector(base_model,direction='forward',n_features_to_select=10)
for_sel.fit(X_pre,Y)

0,1,2
,estimator,LinearRegression()
,n_features_to_select,10
,tol,
,direction,'forward'
,scoring,
,cv,5
,n_jobs,

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [82]:
for_sel.get_feature_names_out()

array(['cat__STORE_PRIMARY_CATEGORY', 'con__CREATED_AT',
       'con__ACTUAL_DELIVERY_TIME', 'con__STORE_ID', 'con__TOTAL_ITEMS',
       'con__SUBTOTAL', 'con__NUM_DISTINCT_ITEMS',
       'con__TOTAL_ONSHIFT_DASHERS', 'con__TOTAL_OUTSTANDING_ORDERS',
       'con__ESTIMATED_STORE_TO_CONSUMER_DRIVING_DURATION'], dtype=object)

In [83]:
imp_cols = for_sel.get_feature_names_out()
imp_cols

array(['cat__STORE_PRIMARY_CATEGORY', 'con__CREATED_AT',
       'con__ACTUAL_DELIVERY_TIME', 'con__STORE_ID', 'con__TOTAL_ITEMS',
       'con__SUBTOTAL', 'con__NUM_DISTINCT_ITEMS',
       'con__TOTAL_ONSHIFT_DASHERS', 'con__TOTAL_OUTSTANDING_ORDERS',
       'con__ESTIMATED_STORE_TO_CONSUMER_DRIVING_DURATION'], dtype=object)

In [84]:
len(imp_cols)

10

In [85]:
sel_cols = []
for i in imp_cols:
    cols = i.split('__')[-1]
    sel_cols.append(cols)
sel_cols

['STORE_PRIMARY_CATEGORY',
 'CREATED_AT',
 'ACTUAL_DELIVERY_TIME',
 'STORE_ID',
 'TOTAL_ITEMS',
 'SUBTOTAL',
 'NUM_DISTINCT_ITEMS',
 'TOTAL_ONSHIFT_DASHERS',
 'TOTAL_OUTSTANDING_ORDERS',
 'ESTIMATED_STORE_TO_CONSUMER_DRIVING_DURATION']

In [86]:
X_sel = X[sel_cols]
X_sel.head()

Unnamed: 0,STORE_PRIMARY_CATEGORY,CREATED_AT,ACTUAL_DELIVERY_TIME,STORE_ID,TOTAL_ITEMS,SUBTOTAL,NUM_DISTINCT_ITEMS,TOTAL_ONSHIFT_DASHERS,TOTAL_OUTSTANDING_ORDERS,ESTIMATED_STORE_TO_CONSUMER_DRIVING_DURATION
0,american,80657.0,84436.0,1845,4,3441,4,33.0,21.0,861.0
1,mexican,78565.0,82589.0,5477,1,1900,1,1.0,2.0,690.0
2,Unknown,74368.0,76149.0,5477,1,1900,1,1.0,0.0,690.0
3,Unknown,76905.0,79980.0,5477,6,6900,5,1.0,2.0,289.0
4,Unknown,9636.0,12026.0,5477,3,3900,3,6.0,9.0,650.0


In [87]:
X_sel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183610 entries, 0 to 183609
Data columns (total 10 columns):
 #   Column                                        Non-Null Count   Dtype  
---  ------                                        --------------   -----  
 0   STORE_PRIMARY_CATEGORY                        183610 non-null  object 
 1   CREATED_AT                                    183610 non-null  float64
 2   ACTUAL_DELIVERY_TIME                          183610 non-null  float64
 3   STORE_ID                                      183610 non-null  int64  
 4   TOTAL_ITEMS                                   183610 non-null  int64  
 5   SUBTOTAL                                      183610 non-null  int64  
 6   NUM_DISTINCT_ITEMS                            183610 non-null  int64  
 7   TOTAL_ONSHIFT_DASHERS                         183610 non-null  float64
 8   TOTAL_OUTSTANDING_ORDERS                      183610 non-null  float64
 9   ESTIMATED_STORE_TO_CONSUMER_DRIVING_DURATION  18

## Data cleaning & Preprocessing

In [88]:
from sklearn.preprocessing import OneHotEncoder

In [89]:
X_sel_cat = list(X_sel.columns[X_sel.dtypes=='object'])
X_sel_con = list(X_sel.columns[X_sel.dtypes!='object'])

In [90]:
num_pipe2 = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler()
)

In [91]:
cat_pipe2 = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore',sparse_output=False)
)

In [92]:
pre2 = ColumnTransformer([
    ('cat',cat_pipe2,X_sel_cat),
    ('con',num_pipe2,X_sel_con)
]).set_output(transform='pandas')

In [93]:
pre2

0,1,2
,transformers,"[('cat', ...), ('con', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [94]:
X_sel_pre = pre2.fit_transform(X_sel)
X_sel_pre.head()

Unnamed: 0,cat__STORE_PRIMARY_CATEGORY_Unknown,cat__STORE_PRIMARY_CATEGORY_afghan,cat__STORE_PRIMARY_CATEGORY_african,cat__STORE_PRIMARY_CATEGORY_alcohol,cat__STORE_PRIMARY_CATEGORY_alcohol-plus-food,cat__STORE_PRIMARY_CATEGORY_american,cat__STORE_PRIMARY_CATEGORY_argentine,cat__STORE_PRIMARY_CATEGORY_asian,cat__STORE_PRIMARY_CATEGORY_barbecue,cat__STORE_PRIMARY_CATEGORY_belgian,...,cat__STORE_PRIMARY_CATEGORY_vietnamese,con__CREATED_AT,con__ACTUAL_DELIVERY_TIME,con__STORE_ID,con__TOTAL_ITEMS,con__SUBTOTAL,con__NUM_DISTINCT_ITEMS,con__TOTAL_ONSHIFT_DASHERS,con__TOTAL_OUTSTANDING_ORDERS,con__ESTIMATED_STORE_TO_CONSUMER_DRIVING_DURATION
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.638237,1.681135,-0.820844,0.306728,0.428651,0.827406,-0.253733,-0.625825,1.459855
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.569334,1.619985,0.947989,-0.820652,-0.42768,-1.027932,-1.155069,-0.984146,0.677557
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.431101,1.40677,0.947989,-0.820652,-0.42768,-1.027932,-1.155069,-1.021864,0.677557
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.51466,1.533606,0.947989,1.058314,2.35081,1.445852,-1.155069,-0.984146,-1.156953
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.700925,-0.716203,0.947989,-0.069066,0.683716,0.20896,-1.014235,-0.852133,0.494564


### Train Test Split

In [95]:
from sklearn.model_selection import train_test_split

In [96]:
xtrain,xtest,ytrain,ytest = train_test_split(X_sel_pre,Y,train_size=0.8, random_state=42)

In [97]:
xtrain.shape

(146888, 84)

### Model Building

In [98]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(xtrain,ytrain)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [99]:
model.intercept_

np.float64(2741.1420129622607)

In [100]:
model.coef_

array([ 4.17768384e-12,  1.12720500e-10,  9.37887989e-10, -1.85167437e-11,
        4.89364993e-10, -3.68061137e-12,  7.03437308e-11,  7.04858394e-12,
        2.68585154e-12,  1.58365765e-10,  5.33546540e-11, -1.12933662e-10,
       -1.80691018e-11, -5.36317657e-11,  6.19593266e-12,  3.79927201e-11,
       -6.82689461e-11,  4.09272616e-11,  3.03543857e-11, -1.18305366e-12,
       -4.03588274e-11,  2.96154212e-11,  5.37966116e-10, -4.02167188e-11,
       -5.75823833e-11, -5.54223334e-12,  7.84439180e-12,  2.20836682e-11,
       -9.00683972e-11, -3.22586402e-11, -1.86446414e-11, -1.78488335e-11,
       -8.75388650e-12,  6.75299816e-11, -1.60866875e-11,  2.38529196e-11,
       -5.34328137e-11,  3.17186277e-11,  7.81597009e-12,  3.49018592e-11,
        5.46833689e-11,  3.53850282e-11, -5.34328137e-12, -1.79170456e-10,
        9.69890834e-13,  1.00726538e-10,  5.67581537e-11, -7.11963821e-12,
        3.46744855e-12,  3.74313913e-11,  5.47402124e-11, -4.99511543e-12,
       -2.65458766e-11,  

In [101]:
ypreds = model.predict(xtrain)
ypreds

array([3094., 2881., 1165., ..., 4215., 2516., 2272.], shape=(146888,))

In [102]:
ypreds_test = model.predict(xtest)
ypreds_test

array([2951., 1998., 2856., ..., 3347., 2323., 1757.], shape=(36722,))

In [103]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
mse = mean_squared_error(ytest, ypreds_test)
mae = mean_absolute_error(ytest, ypreds_test)
rmse = mse**(1/2)
r2 = r2_score(ytest,ypreds_test)

print(f"MSE: {mse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R2 score: {r2*100:.2f}%")

MSE: 0.00
MAE: 0.00
RMSE: 0.00
R2 score: 100.00%


In [104]:
model.score(xtrain,ytrain)

1.0

In [105]:
model.score(xtest,ytest)

1.0

In [106]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Initialize Model
xgb = XGBRegressor(
    n_estimators=300,
    learning_rate=0.07,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.7,
    random_state=42
)

# Train Model
xgb.fit(xtrain, ytrain)

# Predict
y_pred_xgb = xgb.predict(xtest)

# Evaluation Metrics
mae = mean_absolute_error(ytest, y_pred_xgb)
rmse = np.sqrt(mean_squared_error(ytest, y_pred_xgb))
r2 = r2_score(ytest, y_pred_xgb)

print(f"XGBoost MSE : {mse:.2f}")
print(f"XGBoost MAE : {mae:.2f}")
print(f"XGBoost RMSE : {rmse:.2f}")
print(f"XGBoost R²   : {r2*100:.2f}")

XGBoost MSE : 0.00
XGBoost MAE : 156.79
XGBoost RMSE : 245.52
XGBoost R²   : 92.40


# Model can be considered for final model building