In [1]:
import pandas as pd


In [2]:
df = pd.read_csv("Zomato_cleaned_data.csv")

In [3]:
df

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken (min),Ordered_Time_Hour,Ordered_Time_Minute,distance
0,36,4.2,Fog,Jam,2,Snack,motorcycle,3.0,No,Metropolitian,46,21,55,10.280582
1,21,4.7,Stormy,High,1,Meal,motorcycle,1.0,No,Metropolitian,23,14,55,6.242319
2,23,4.7,Sandstorms,Medium,1,Drinks,scooter,1.0,No,Metropolitian,21,17,30,13.787860
3,34,4.3,Sandstorms,Low,0,Buffet,motorcycle,0.0,No,Metropolitian,20,9,20,2.930258
4,24,4.7,Fog,Jam,1,Snack,scooter,1.0,No,Metropolitian,41,19,50,19.396618
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45044,30,4.8,Windy,High,1,Meal,motorcycle,0.0,No,Metropolitian,32,11,35,1.489846
45045,21,4.6,Windy,Jam,0,Buffet,motorcycle,1.0,No,Metropolitian,36,19,55,11.007735
45046,30,4.9,Cloudy,Low,1,Drinks,scooter,0.0,No,Metropolitian,16,23,50,4.657195
45047,20,4.7,Cloudy,High,0,Snack,motorcycle,1.0,No,Metropolitian,26,13,35,6.232393


In [4]:
## drop type of order. It has less correlation with time taken.
df.drop("Type_of_order", axis=1, inplace= True)

In [5]:
## Independent and dependent feature
X = df.drop("Time_taken (min)", axis=1)
X

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_vehicle,multiple_deliveries,Festival,City,Ordered_Time_Hour,Ordered_Time_Minute,distance
0,36,4.2,Fog,Jam,2,motorcycle,3.0,No,Metropolitian,21,55,10.280582
1,21,4.7,Stormy,High,1,motorcycle,1.0,No,Metropolitian,14,55,6.242319
2,23,4.7,Sandstorms,Medium,1,scooter,1.0,No,Metropolitian,17,30,13.787860
3,34,4.3,Sandstorms,Low,0,motorcycle,0.0,No,Metropolitian,9,20,2.930258
4,24,4.7,Fog,Jam,1,scooter,1.0,No,Metropolitian,19,50,19.396618
...,...,...,...,...,...,...,...,...,...,...,...,...
45044,30,4.8,Windy,High,1,motorcycle,0.0,No,Metropolitian,11,35,1.489846
45045,21,4.6,Windy,Jam,0,motorcycle,1.0,No,Metropolitian,19,55,11.007735
45046,30,4.9,Cloudy,Low,1,scooter,0.0,No,Metropolitian,23,50,4.657195
45047,20,4.7,Cloudy,High,0,motorcycle,1.0,No,Metropolitian,13,35,6.232393


In [6]:
y = df[["Time_taken (min)"]]
y

Unnamed: 0,Time_taken (min)
0,46
1,23
2,21
3,20
4,41
...,...
45044,32
45045,36
45046,16
45047,26


In [7]:
## categorical and numerical columns
categorical_colms = X.select_dtypes(include='object').columns
numerical_colms = X.select_dtypes(exclude='object').columns

In [8]:
categorical_colms

Index(['Weather_conditions', 'Road_traffic_density', 'Type_of_vehicle',
       'Festival', 'City'],
      dtype='object')

In [9]:
numerical_colms

Index(['Delivery_person_Age', 'Delivery_person_Ratings', 'Vehicle_condition',
       'multiple_deliveries', 'Ordered_Time_Hour', 'Ordered_Time_Minute',
       'distance'],
      dtype='object')

In [10]:
df["Weather_conditions"].unique()

array(['Fog', 'Stormy', 'Sandstorms', 'Windy', 'Cloudy', 'Sunny'],
      dtype=object)

In [11]:
df["Road_traffic_density"].unique()

array(['Jam', 'High', 'Medium', 'Low'], dtype=object)

In [12]:
df["Type_of_vehicle"].unique()

array(['motorcycle', 'scooter', 'electric_scooter', 'bicycle'],
      dtype=object)

In [13]:
df["City"].unique()

array(['Metropolitian', 'Urban', 'Semi-Urban'], dtype=object)

In [14]:
df["Festival"].unique()

array(['No', 'Yes'], dtype=object)

In [29]:
Weather_categories = ["Sunny", "Cloudy", "Sandstorms","Windy", "Stormy", "Fog" ]
Traffic_categories  = ["Low", "Medium", "High", "Jam"]
Vehicle_categories = ["electric_scooter", "motorcycle", "scooter","bicycle"]
festival_categories = ["No", "Yes"]
City_categories = ["Metropolitian", "Urban", "Semi-Urban"]


In [30]:
from sklearn.impute import SimpleImputer ## Handling Missing Values
from sklearn.preprocessing import StandardScaler # Handling Feature Scaling. 
## Standardize features by removing the mean and scaling to unit variance.
from sklearn.preprocessing import OrdinalEncoder # Ordinal Encoding. Encode categorical features as an integer array.
from sklearn.pipeline import Pipeline  ## for creating pipeline
from sklearn.compose import ColumnTransformer  ## This estimator allows different columns of the input to be transformed separately
## and the features generated by each transformer will be concatenated to form a single feature space.

In [31]:
## Numerical Pipeline
num_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())

    ]

)

# Categorigal Pipeline
cat_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('ordinalencoder',OrdinalEncoder(categories=[Weather_categories, Traffic_categories, Vehicle_categories, festival_categories, City_categories])),
    ('scaler',StandardScaler())
    ]

)

preprocessor=ColumnTransformer([
('num_pipeline',num_pipeline, numerical_colms),
('cat_pipeline',cat_pipeline, categorical_colms)
])

In [32]:
## Train test split

from sklearn.model_selection import train_test_split  ## Split arrays or matrices into random train and test subsets.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [33]:
X_train.head()

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_vehicle,multiple_deliveries,Festival,City,Ordered_Time_Hour,Ordered_Time_Minute,distance
3508,28,4.5,Sandstorms,High,2,scooter,0.0,No,Metropolitian,14,50,6.049765
19277,38,4.8,Fog,Jam,0,motorcycle,1.0,No,Metropolitian,19,30,7.763197
39044,35,4.6,Fog,Low,1,motorcycle,1.0,No,Metropolitian,22,50,4.527973
39431,28,5.0,Cloudy,Medium,0,motorcycle,2.0,No,Metropolitian,18,10,10.703215
24912,35,4.4,Sandstorms,Jam,0,motorcycle,1.0,No,Metropolitian,20,30,7.57347


In [34]:
y_train.head()

Unnamed: 0,Time_taken (min)
3508,16
19277,26
39044,15
39431,39
24912,41


In [35]:
X_train = pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test  = pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [36]:
X_train.head()

Unnamed: 0,num_pipeline__Delivery_person_Age,num_pipeline__Delivery_person_Ratings,num_pipeline__Vehicle_condition,num_pipeline__multiple_deliveries,num_pipeline__Ordered_Time_Hour,num_pipeline__Ordered_Time_Minute,num_pipeline__distance,cat_pipeline__Weather_conditions,cat_pipeline__Road_traffic_density,cat_pipeline__Type_of_vehicle,cat_pipeline__Festival,cat_pipeline__City
0,-0.309785,-0.420808,1.165871,-1.32178,-0.804536,1.243481,-0.08373,-0.321272,0.50367,1.24424,-0.142199,-0.53819
1,1.418221,0.501044,-1.222059,0.440929,0.261906,0.070244,-0.082133,1.426051,1.302238,-0.426011,-0.142199,-0.53819
2,0.899819,-0.113524,-0.028094,0.440929,0.901772,1.243481,-0.085148,1.426051,-1.093467,-0.426011,-0.142199,-0.53819
3,-0.309785,1.115612,-1.222059,2.203638,0.048618,-1.102993,-0.079393,-0.903712,-0.294899,-0.426011,-0.142199,-0.53819
4,0.899819,-0.728091,-1.222059,0.440929,0.475195,0.070244,-0.08231,-0.321272,1.302238,-0.426011,-0.142199,-0.53819


In [41]:
## Model Training

from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

In [42]:
## To check the error and accuracy of the model
import numpy as np
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, mse, rmse, r2_square

In [43]:
## Training the  multiple models

models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet(),
    'RandomForestClassifier': RandomForestClassifier(),
    'RandomForestRegressor': RandomForestRegressor()
}
model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    #Make Predictions
    y_pred=model.predict(X_test)

    mae, mse, rmse, r2_square = evaluate_model(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print("MSE:", mse)
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",r2_square*100)

    r2_list.append(r2_square)
    
    print('='*35)
    print('\n')

LinearRegression
Model Training Performance
MSE: 44.13189183370115
RMSE: 6.643183862704776
MAE: 5.253547626875059
R2 score 50.43589449528885


Lasso
Model Training Performance
MSE: 50.718408280663155
RMSE: 7.121685775198395
MAE: 5.695966644679346
R2 score 43.03864088749213


Ridge
Model Training Performance
MSE: 44.13189475923908
RMSE: 6.643184082895722
MAE: 5.253548797752595
R2 score 50.43589120964518


Elasticnet
Model Training Performance
MSE: 51.33139851836378
RMSE: 7.1645933951874605
MAE: 5.744459099287227
R2 score 42.350197416062464




  model.fit(X_train,y_train)


RandomForestClassifier
Model Training Performance
MSE: 29.492193858675545
RMSE: 5.430671584498141
MAE: 4.067998520162782
R2 score 66.87759923175271




  model.fit(X_train,y_train)


RandomForestRegressor
Model Training Performance
MSE: 17.066190604295514
RMSE: 4.131124617376667
MAE: 3.250812541620421
R2 score 80.83312460607304




In [44]:
## Conclusion: RandomForest Regressor is the best model with highest accuracy