<a href="https://colab.research.google.com/github/aisha-partha/Delivery-Demand-Prediction/blob/main/DeliverOnTime_Food_Delivery_Duration_Predictor_Aishwarya.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Summary

![Screenshot 2023-05-29 at 10.24.05 PM.png](attachment:908cb898-7e3f-4204-a544-0eaebcb73d4f.png)

**The food delivery time prediction model plays a crucial role in the food delivery industry, where prompt and accurate delivery is of utmost importance. Delivery time directly impacts customer satisfaction and influences their overall experience.**

**To develop an effective prediction model, we began by thoroughly cleaning the dataset, ensuring it was free from errors and inconsistencies. This step was vital in ensuring the reliability and accuracy of the model's predictions.**

**Feature engineering was then employed to extract valuable insights from the dataset. By considering factors such as delivery person age, ratings, location coordinates, and time-related variables, we aimed to capture key variables that influence delivery time. These engineered features contributed to the model's predictive capabilities.**

**Using regression algorithms such as linear regression, decision tree, random forest, and XGBoost, we built the predictive model. It was trained on a subset of the dataset using techniques like cross-validation to ensure robustness. Evaluation metrics such as mean squared error (MSE) and R-squared (R2) score were used to assess the model's accuracy. The food delivery time prediction model empowers businesses to optimize their operations and improve the overall delivery experience for their customers.**

# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import statistics
from geopy.distance import geodesic

from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

# Load & Understand Data

In [None]:
df_train = pd.read_csv('train.csv')
df_train.head()

In [None]:
df_train.columns

In [None]:
print("Train Dataset :", df_train.shape)

In [None]:
df_train.info()

In [None]:
#Check statistical values for fields with numerical datatype
df_train.describe().T

In [None]:
#Check statistical values for fields with non-numerical datatypes
df_train.describe(exclude=np.number).T

In [None]:
#checking for null values

df_train.columns[df_train.isnull().any()].tolist()

**Observations:**

1. Data formatting will be required for the Weatherconditions & Time_taken(min) columns.
2. Both numerical & categorical features are present.
3. ID & Delivery_person_ID will not be used to build the model.

In [None]:
# Print every column name followed by its value frequencies and a divider
for column in df_train.columns:
    print(
        column,
        df_train[column].value_counts(),
        "------------------------------------",
        sep="\n",
    )

# Data Cleaning

In [None]:
#Update Column Names
def update_column_name(df):
    """Normalise the column labels of ``df`` in place.

    ``Weatherconditions`` and ``Time_taken(min)`` are renamed to
    ``Weather_conditions`` and ``Time_taken_in_min``, after which every
    label is lower-cased. Nothing is returned; ``df`` itself is modified.
    """
    renames = {
        'Weatherconditions': 'Weather_conditions',
        'Time_taken(min)': 'Time_taken_in_min',
    }
    df.rename(columns=renames, inplace=True)
    # Lower-case last so the freshly renamed labels are normalised too
    df.columns = [label.lower() for label in df.columns]


update_column_name(df_train)
print(df_train.columns)

In [None]:
df_train['delivery_person_id'].value_counts()

In [None]:
#Extract relevant values from column
def extract_column_value(df):
    """Reduce three raw text columns of ``df`` to their useful part, in place.

    * ``time_taken_in_min``: second space-separated token, cast to ``int``.
    * ``weather_conditions``: second space-separated token, stripped.
    * ``city_code`` (new column): the part of ``delivery_person_id`` that
      precedes the first ``"RES"``.
    """
    def second_token(text):
        # Keep only what follows the first single space in the cell
        return text.split(' ')[1].strip()

    def minutes(text):
        return int(second_token(text))

    df['time_taken_in_min'] = df['time_taken_in_min'].apply(minutes)
    df['weather_conditions'] = df['weather_conditions'].apply(second_token)

    id_parts = df['delivery_person_id'].str.split("RES", expand=True)
    df['city_code'] = id_parts[0]

extract_column_value(df_train)
df_train[['time_taken_in_min','weather_conditions','city_code']].head()

In [None]:
# Report whether any row is an exact repeat of an earlier row
has_duplicates = df_train.duplicated().any()
print(
    "There are Duplicate values present"
    if has_duplicates
    else "There is no duplicate value present"
)

In [None]:
#Update datatypes
def update_datatype(df):
    """Cast the numeric and date columns of ``df`` to proper dtypes, in place.

    ``delivery_person_age``, ``delivery_person_ratings`` and
    ``multiple_deliveries`` become ``float64``; ``order_date`` is parsed as
    a day-month-year date.
    """
    float_columns = (
        'delivery_person_age',
        'delivery_person_ratings',
        'multiple_deliveries',
    )
    for name in float_columns:
        df[name] = df[name].astype('float64')

    # The format string fixes the order as day-month-year
    df['order_date'] = pd.to_datetime(df['order_date'], format="%d-%m-%Y")

update_datatype(df_train)

In [None]:
#Convert String 'NaN' to np.nan
def convert_nan(df):
    """Replace textual ``NaN`` / ``nan`` placeholders in ``df`` with ``np.nan``, in place.

    A text cell counts as missing when ``NaN`` or ``nan`` appears in it as a
    whole whitespace-delimited token, e.g. ``'NaN'``, ``'NaN '`` or
    ``'conditions NaN'``. The whole cell is then replaced by ``np.nan``.

    Matching on a token rather than on a bare substring keeps ordinary
    values that merely contain the letters "nan" (``'Banana'``,
    ``'Finance'``) intact. Non-text cells are left untouched.
    """
    # (?:^|\s) and (?=\s|$) pin the placeholder to token boundaries
    placeholder = r'(?:^|\s)(?:NaN|nan)(?=\s|$)'
    df.replace(placeholder, np.nan, regex=True, inplace=True)
convert_nan(df_train)

In [None]:
#Check null values
df_train.isnull().sum().sort_values(ascending=False)

In [None]:
# Count plots, two per row, for each column that contains null values
cols = ['delivery_person_age','delivery_person_ratings','weather_conditions','road_traffic_density','multiple_deliveries','festival','city']
num_plots = len(cols)
num_rows = -(-num_plots // 2)  # ceiling of num_plots / 2

fig, axes = plt.subplots(num_rows, 2, figsize=(60,25))
panels = axes.ravel()  # row-major, so panel k sits at row k // 2, column k % 2

for ax, column_name in zip(panels, cols):
    # Order the bars by category value rather than by frequency
    category_order = df_train[column_name].value_counts().sort_index().index
    sns.countplot(data=df_train, x=column_name, order=category_order, ax=ax)

    ax.set(xlabel=column_name, ylabel='No. of Orders', title=column_name)
    ax.tick_params(axis='x', rotation=45)

# With an odd number of plots the last grid cell stays empty; remove it
for unused in panels[num_plots:]:
    fig.delaxes(unused)

plt.tight_layout()
plt.show()

In [None]:
#Handle null values
def handle_null_values(df):
    """Impute the missing values of ``df`` in place.

    * ``delivery_person_age`` and ``weather_conditions``: every gap in the
      column is filled with one value drawn at random from that column's
      observed (non-null) values. The draw is unseeded, so results differ
      between runs.
    * ``city``, ``festival``, ``multiple_deliveries`` and
      ``road_traffic_density``: filled with the column's mode.
    * ``delivery_person_ratings``: filled with the column's median.
    * Any NaN still left anywhere in ``df`` is replaced by one randomly
      drawn delivery-person age (see the review note in the body).

    A column with no observed value at all is left unchanged instead of
    raising.
    """
    def random_observed(column):
        # Drawing from the raw column could return NaN itself, so sample
        # only from the values that are actually present.
        observed = df[column].dropna()
        return np.random.choice(observed) if len(observed) else None

    def fill(column, value):
        # Assign the result back: ``df[column].fillna(..., inplace=True)``
        # is chained assignment and leaves ``df`` untouched under pandas
        # Copy-on-Write.
        if value is not None:
            df[column] = df[column].fillna(value)

    for column in ('delivery_person_age', 'weather_conditions'):
        fill(column, random_observed(column))

    for column in ('city', 'festival', 'multiple_deliveries', 'road_traffic_density'):
        modes = df[column].mode()
        fill(column, modes.iloc[0] if len(modes) else None)

    fill('delivery_person_ratings', df['delivery_person_ratings'].median())

    # NOTE(review): catch-all kept from the original so no NaN survives this
    # step. It writes a delivery-person age into *every* remaining gap,
    # whatever the column - confirm this is intended for columns that are
    # not handled explicitly above.
    fallback = random_observed('delivery_person_age')
    if fallback is not None:
        df.replace(np.nan, fallback, inplace=True)

handle_null_values(df_train)
df_train.isnull().sum()

In [None]:
df_train.isnull().sum()

In [None]:
#Drop columns which won't be used for building the model
def drop_columns(df):
    """Drop the identifier columns that are not used for modelling, in place.

    Removes the order ID and the delivery-person ID. The labels are matched
    case-insensitively because ``update_column_name`` lower-cases every
    column before this runs, so the raw spellings ``'ID'`` and
    ``'Delivery_person_ID'`` no longer exist by then and dropping them by
    those names raises ``KeyError``. Columns that are already gone are
    skipped, so re-running the cell is harmless.
    """
    identifier_columns = {'id', 'delivery_person_id'}
    present = [label for label in df.columns if str(label).lower() in identifier_columns]
    df.drop(columns=present, inplace=True)

print("Before No. of columns: ",df_train.shape[1])
drop_columns(df_train)
print("After No. of columns: ",df_train.shape[1])

In [None]:
#Visualizations

In [None]:
!pip install folium

In [None]:
import numpy as np
import pandas as pd

# For plotting maps
import folium

# For Regular Expressions
import re

# For working with geographical data
import geopandas

# For plotting in python
import matplotlib
import matplotlib.pyplot as plt

In [None]:
# https://github.com/covid19india/covid19india-react/blob/master/public/maps/india.json

In [None]:
india_geojson=geopandas.read_file('india.json')
india_geojson.head()

In [None]:
india_geojson.plot()

In [None]:
df_train['city'].value_counts()

In [None]:
from branca.element import Figure
fig=Figure(width=550,height=350)

In [None]:
# Creating Basemap
fig3=Figure(width=550,height=350)
m3=folium.Map(location=[22.7196, 75.8577],tiles='cartodbpositron',zoom_start=11)
fig3.add_child(m3)

In [None]:
f1=folium.FeatureGroup("Delivery 1")

In [None]:
df_train.head()

In [None]:
coords_1=[[ df_train['delivery_location_latitude'][0], df_train['delivery_location_longitude'][0]],[df_train['restaurant_latitude'][0], df_train['restaurant_longitude'][0]]]

In [None]:
line_1=folium.vector_layers.PolyLine(coords_1,popup='<b>Delivery_1</b>',tooltip='D_1',color='blue',weight=10).add_to(f1)

In [None]:
f1.add_to(m3)
folium.LayerControl().add_to(m3)
m3

In [None]:
df_train['city_code']

In [None]:
df_indore_data = df_train[df_train['city_code']=='INDO']

In [None]:
df_indore_data.info()

In [None]:
m = folium.Map([22.7196, 75.8577], zoom_start=11)

# For the first 50 rows of df_indore_data: one circle at the restaurant
# (#3db7e4 fill), one at the delivery location (red fill), and a line
# joining the two points.
for _, row in df_indore_data.head(50).iterrows():
    restaurant = [row['restaurant_latitude'], row['restaurant_longitude']]
    drop_off = [row['delivery_location_latitude'], row['delivery_location_longitude']]

    for location, fill_colour in ((restaurant, "#3db7e4"), (drop_off, "red")):
        folium.CircleMarker(location, radius=15, fill_color=fill_colour).add_to(m)

    folium.PolyLine([restaurant, drop_off]).add_to(m)
m

# Feature Engineering

In [None]:
def extract_date_features(data):
    """Add calendar features derived from ``data.order_date``, in place.

    New columns: ``day``, ``month``, ``quarter``, ``year``, ``day_of_week``,
    the six ``is_<period>_start`` / ``is_<period>_end`` flags for month,
    quarter and year (as 0/1 integers), and ``is_weekend`` (1 when
    ``day_of_week`` is 5 or 6, otherwise 0).
    """
    stamp = data.order_date.dt

    for part in ("day", "month", "quarter", "year"):
        data[part] = getattr(stamp, part)

    data['day_of_week'] = stamp.day_of_week.astype(int)

    boundary_flags = (
        "is_month_start",
        "is_month_end",
        "is_quarter_start",
        "is_quarter_end",
        "is_year_start",
        "is_year_end",
    )
    for flag in boundary_flags:
        # Stored as 0/1 integers rather than booleans
        data[flag] = getattr(stamp, flag).astype(int)

    on_weekend = data['day_of_week'].isin([5,6])
    data['is_weekend'] = np.where(on_weekend, 1, 0)

extract_date_features(df_train)
df_train.head()

In [None]:
#Calculate Time Difference
'''
def calculate_time_diff(df):
    # Find the difference between ordered time & picked time
    df['time_orderd'] = pd.to_timedelta(df['time_orderd'])
    df['time_order_picked'] = pd.to_timedelta(df['time_order_picked'])

    df['time_order_picked_formatted'] = df['order_date'] + np.where(df['time_order_picked'] < df['time_orderd'], pd.DateOffset(days=1), pd.DateOffset(days=0)) + df['time_order_picked']
    df['time_ordered_formatted'] = df['order_date'] + df['time_orderd']

    df['order_prepare_time'] = (df['time_order_picked_formatted'] - df['time_ordered_formatted']).dt.total_seconds() / 60

    # Handle null values by filling with the median
    df['order_prepare_time'].fillna(df['order_prepare_time'].median(), inplace=True)

    # Drop all the time & date related columns
    df.drop(['time_orderd', 'time_order_picked', 'time_ordered_formatted', 'time_order_picked_formatted', 'order_date'], axis=1, inplace=True)


calculate_time_diff(df_train)
df_train.head()
'''

In [None]:
#Calculate distance between restaurant location & delivery location
def calculate_distance(df):
    """Add a ``distance`` column to ``df`` in place.

    The value is the geodesic distance between the restaurant coordinates
    (``restaurant_latitude`` / ``restaurant_longitude``) and the delivery
    coordinates (``delivery_location_latitude`` /
    ``delivery_location_longitude``), in whole kilometres, stored as
    ``int64``. The fractional part is discarded.

    The kilometre figure is read from the ``.km`` attribute of the geopy
    result rather than parsed out of its string form: pulling the first run
    of digits from the text misreads any value that is printed in
    scientific notation.
    """
    restaurant_coordinates = df[['restaurant_latitude', 'restaurant_longitude']].to_numpy()
    delivery_location_coordinates = df[['delivery_location_latitude', 'delivery_location_longitude']].to_numpy()

    kilometres = [
        geodesic(restaurant, delivery).km
        for restaurant, delivery in zip(restaurant_coordinates, delivery_location_coordinates)
    ]
    # Distances are non-negative, so the int64 cast simply drops the fraction
    df['distance'] = np.asarray(kilometres, dtype='float64').astype('int64')

calculate_distance(df_train)
df_train.head()