In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from src.logger import logger
from src.exception import CustomException
from src.utils import *
import sys


In [7]:
# Extraction of Data:
try:
    conn, cursor = mysql_connection()
    sql = "select * from zomato.delivery"
    cursor.execute(sql)
    result = cursor.fetchall()
    df = pd.DataFrame(result)
except Exception as e:
    logger.error("Error extracting data from MySQL")
    raise CustomException(e, sys)
finally:
    conn.close()
    logger.info("MySQL connection closed")


2024-03-19 00:44:07 - Zomato_Time_Prediction - utils - INFO : Connecting to MySQL...
2024-03-19 00:44:07 - Zomato_Time_Prediction - utils - INFO : MySQL connection established
2024-03-19 00:44:08 - Zomato_Time_Prediction - 272529110 - INFO : MySQL connection closed


In [8]:
#Drop columns:
drop_cols = ['SerialNo', 'ID', 'Delivery_person_ID', 'Order_Date', 'Time_Orderd', 'Time_Order_picked' ]
df.drop(labels=drop_cols, axis=1, inplace=True)

#Calculating distance:
df["distance"] = df.apply(lambda row: cal_distance(row['Restaurant_latitude'], row['Restaurant_longitude'], row['Delivery_location_latitude'], row['Delivery_location_longitude']), axis=1)
df.drop(labels=[ 'Restaurant_latitude', 'Restaurant_longitude', 'Delivery_location_latitude', 'Delivery_location_longitude'], axis=1, inplace=True)


In [9]:
cat_cols = []
num_cols = []

for i in df.columns:
    if df[i].dtype == 'object':
        cat_cols.append(i)
    else:
        num_cols.append(i)

In [10]:
df = fill_empty_with_mode(df,cat_cols)

In [11]:
#Overall mean
mean = df["Time_taken (min)"].mean()
thereshold_percentage = 0.1
threshold_value = mean * thereshold_percentage
threshold_value

2.63658376810238

In [12]:
df_weather = df[["Weather_conditions", "Time_taken (min)"]].groupby("Weather_conditions").mean()
df_weather["Time_taken (min)"].std()

2.5996474791257365

In [13]:
df_road = df[["Road_traffic_density", "Time_taken (min)"]].groupby("Road_traffic_density").mean()
df_road["Time_taken (min)"].std()

4.054263656149913

In [14]:
df_road = df[["Type_of_order", "Time_taken (min)"]].groupby("Type_of_order").mean()
df_road["Time_taken (min)"].std()

0.08766091041580491

In [15]:
df_road = df[["Type_of_vehicle", "Time_taken (min)"]].groupby("Type_of_vehicle").mean()
df_road

Unnamed: 0_level_0,Time_taken (min)
Type_of_vehicle,Unnamed: 1_level_1
bicycle,25.4
electric_scooter,24.440586
motorcycle,27.676678
scooter,24.543232


In [16]:
df_road = df[["Festival", "Time_taken (min)"]].groupby("Festival").mean()
df_road["Time_taken (min)"].std()

13.79609904522959

In [17]:
df_road = df[["City", "Time_taken (min)"]].groupby("City").mean()
df_road["Time_taken (min)"].std()

14.329871133322866

In [18]:
df["Vehicle_condition"] = df["Vehicle_condition"].replace(0,1)
df["multiple_deliveries"] = df["multiple_deliveries"].replace(0.0,1.0)

# Dividing ordinal and onehot encoder:

1. Low Standard Deviation: If the standard deviation of the mean time taken across categories is low (e.g., less than 10% of the overall mean of the response variable), it suggests that the mean time taken doesn't vary much between categories. In such cases, OneHotEncoder might be suitable, especially if the categorical variable is nominal.

2. High Standard Deviation: If the standard deviation is high (e.g., greater than 10% of the overall mean), it indicates significant variability in the mean time taken between categories. For ordinal variables or when you want to capture this variability without increasing dimensionality too much, OrdinalEncoder might be more appropriate.

In [19]:
# class FeatureClassifier:
#     def __init__(self,df, target_column):
#         self.df = df
#         self.target_column = target_column
    
#     def get_ordinal_columns_mapping(self,columns):
#         """
#         This function is used to get the mapping of ordinal columns.
#         Each key is named as 'ColumnName_Map' and contains the unique values for that column.
#         """
#         ordinal_columns_mapping = {}
#         for col in columns:
#             sorted_groups = self.df.groupby(col)[self.target_column].mean().sort_values().index.tolist()
#             key_name = f"{col}_Map"
#             ordinal_columns_mapping[key_name] = sorted_groups
        
#         return ordinal_columns_mapping
        

        
#     def ordinal_onehot_numerical_divide(self):
#         """
#         This function is used to divide the categorical into ordinal and one-hot columns and numerical columns.
#         """
#         one_hot_cols = []
#         ordinal_cols = []
#         num_cols = []
#         #Overall mean
#         mean = df[self.target_column].mean()
#         thereshold_percentage = 0.1
#         threshold_value = mean * thereshold_percentage
#         try:
#             for column in self.df.columns:
#                 if column != self.target_column and self.df[column].dtype == 'object':
#                     df_column = self.df[[column, self.target_column]].groupby(column).mean().reset_index()
#                     standard_dev = df_column[self.target_column].std()
#                     if standard_dev > threshold_value:
#                         ordinal_cols.append(column)
#                     else:
#                         one_hot_cols.append(column)
#                 else:
#                     num_cols.append(column)

#             #Get Mappingsd for ordinal columns:
#             ordinal_columns_mapping = self.get_ordinal_columns_mapping(ordinal_cols)
#             return (one_hot_cols, ordinal_cols, num_cols, ordinal_columns_mapping)
                 

#         except Exception as e:
#             print(e)
#             raise CustomException("Error in feature_classifier.ordinal_onehot_numerical_divide: {}".format(e, sys))


In [43]:
from src.utils import *
target_column = "Time_taken (min)"
feature_classifier_obj = FeatureClassifier(df, target_column)

In [44]:
one_hot_cols, ordinal_cols, num_cols, ordinal_columns_mapping = feature_classifier_obj.ordinal_onehot_numerical_divide()

In [46]:
one_hot_cols

['Weather_conditions', 'Type_of_order', 'Type_of_vehicle']

In [22]:
one_hot_cols, "\n", ordinal_cols, "\n", num_cols

(['Weather_conditions', 'Type_of_order', 'Type_of_vehicle'],
 '\n',
 ['Road_traffic_density', 'Festival', 'City'],
 '\n',
 ['Delivery_person_Age',
  'Delivery_person_Ratings',
  'Vehicle_condition',
  'multiple_deliveries',
  'Time_taken (min)',
  'distance'])

In [23]:
ordinal_columns_mapping

{'Road_traffic_density_Map': ['Low', 'Medium', 'High', 'Jam'],
 'Festival_Map': ['No', 'Yes'],
 'City_Map': ['Urban', 'Metropolitian', 'Semi-Urban']}

In [47]:
df["Type_of_vehicle"].value_counts()

Type_of_vehicle
motorcycle          24972
scooter             14272
electric_scooter     3484
bicycle                15
Name: count, dtype: int64

In [25]:
df

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken (min),distance
0,36.0,4.2,Fog,Jam,2,Snack,motorcycle,3.0,No,Metropolitian,46,10.280582
1,21.0,4.7,Stormy,High,1,Meal,motorcycle,1.0,No,Metropolitian,23,6.242319
2,23.0,4.7,Sandstorms,Medium,1,Drinks,scooter,1.0,No,Metropolitian,21,13.787860
3,34.0,4.3,Sandstorms,Low,1,Buffet,motorcycle,1.0,No,Metropolitian,20,2.930258
4,24.0,4.7,Fog,Jam,1,Snack,scooter,1.0,No,Metropolitian,41,19.396618
...,...,...,...,...,...,...,...,...,...,...,...,...
42738,30.0,4.8,Windy,High,1,Meal,motorcycle,1.0,No,Metropolitian,32,1.489846
42739,21.0,4.6,Windy,Jam,1,Buffet,motorcycle,1.0,No,Metropolitian,36,11.007735
42740,30.0,4.9,Cloudy,Low,1,Drinks,scooter,1.0,No,Metropolitian,16,4.657195
42741,20.0,4.7,Cloudy,High,1,Snack,motorcycle,1.0,No,Metropolitian,26,6.232393


In [30]:
#outlier Removal:
df_filter = outlier_removal(df, num_cols)

In [31]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(df_filter, test_size=0.2, random_state=42)

input_feature_train_df = train_set.drop(labels=target_column,axis=1)
input_feature_test_df = test_set.drop(labels=target_column,axis=1)
target_training_df = train_set[target_column]
target_testing_df = test_set[target_column]

In [32]:
train_set["Type_of_vehicle"].unique()

array(['electric_scooter', 'scooter', 'motorcycle'], dtype=object)

In [33]:
categories = []
for key, value in ordinal_columns_mapping.items():
    categories.append(value)
categories


[['Low', 'Medium', 'High', 'Jam'],
 ['No', 'Yes'],
 ['Urban', 'Metropolitian', 'Semi-Urban']]

In [34]:
ordinal_columns_mapping

{'Road_traffic_density_Map': ['Low', 'Medium', 'High', 'Jam'],
 'Festival_Map': ['No', 'Yes'],
 'City_Map': ['Urban', 'Metropolitian', 'Semi-Urban']}

In [35]:
from sklearn.pipeline import Pipeline
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


num_cols_pipe = [col for col in num_cols if col != "Time_taken (min)"]

# Define pipelines for categorical and numeric data
categorical_onehot_pipeline = Pipeline([
    
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False)),
    ('scaler', StandardScaler())
])

categorical_ordinal_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OrdinalEncoder(categories=categories)),
    ('scaler', StandardScaler())
])

numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Combine pipelines in a ColumnTransformer
preprocessor = ColumnTransformer(transformers=[
    ('cat_one_hot', categorical_onehot_pipeline, one_hot_cols),
    ('cat_ordinal', categorical_ordinal_pipeline, ordinal_cols),
    ('num', numerical_pipeline, num_cols_pipe)
])



In [29]:
input_feature_train_df.columns

NameError: name 'input_feature_train_df' is not defined

In [36]:
input_feature_train_df["Type_of_vehicle"].unique()

array(['electric_scooter', 'scooter', 'motorcycle'], dtype=object)

In [36]:
input_training_arr = preprocessor.fit_transform(input_feature_train_df)
input_testing_arr = preprocessor.transform(input_feature_test_df)

In [41]:
input_training_arr[0]

array([-0.43136094, -0.43726641, -0.45054967,  2.1872357 , -0.45160441,
       -0.45509609,  1.75208452, -0.57910708, -0.57468843, -0.58485694,
        3.279649  , -1.15008913, -0.72650841, -1.04488653, -0.05493065,
        0.5569542 , -1.59895577,  0.93667413,  1.37172379,  0.        ,
       -1.11468622])

In [42]:
input_testing_arr[0]

array([-0.43136094, -0.43726641, -0.45054967,  2.1872357 , -0.45160441,
       -0.45509609, -0.57074872, -0.57910708,  1.74007332, -0.58485694,
        3.279649  , -1.15008913, -0.72650841, -1.04488653, -0.05493065,
        0.5569542 ,  1.00533579, -0.43761514,  1.37172379,  0.        ,
       -1.40440731])

In [24]:
train_arr = np.c_[input_training_arr, np.array(target_training_df)]
test_arr = np.c_[input_testing_arr, np.array(target_testing_df)]

In [25]:
X_train, y_train, X_test, y_test = (
    train_arr[:, :-1],
    train_arr[:, -1],
    test_arr[:, :-1],
    test_arr[:, -1]
)

In [26]:
from sklearn.linear_model import LinearRegression, Ridge,Lasso,ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, GradientBoostingRegressor

models = {
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet(),
    'SVR': SVR(),
    'DecisionTree':DecisionTreeRegressor(random_state=42),
    'RandomForest':RandomForestRegressor(random_state=42),
    'GradientBoostingRegressor':GradientBoostingRegressor(random_state=42),
    'BaggingRegressor' : BaggingRegressor(random_state=42)
}

In [27]:
import yaml
config_path = "../params.yaml"

#Load yaml file:
with open(config_path, 'r') as file:
    config = yaml.safe_load(file)

In [28]:
from sklearn.metrics import r2_score


for model_name, model_instance in models.items():
    if model_name != "LinearRegression":
        model_params = config[model_name]
        random_cv, best_params_, best_score_ = random_search_cv(model_instance, X_train, y_train, model_params)
        print("#"*80, "\n")
        print(f"{model_name}:\n")
        print(f"Training Score: {best_score_}\n")
        print(f"Best Params: {best_params_}\n")
        y_pred = random_cv.predict(X_test)
        r2_score_value = r2_score(y_test, y_pred)
        print(f"Testing Score: {r2_score_value}\n\n\n\n")
    else:
        model = model_instance
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        r2_score_value = r2_score(y_test, y_pred)
        print("#"*80, "\n")
        print(f"{model_name}:\n")
        print(f"Testing Score: {r2_score_value}\n\n\n\n")

################################################################################ 

LinearRegression:

Testing Score: 0.42620103767580153








################################################################################ 

Lasso:

Training Score: 0.41168085251279757

Best Params: {'alpha': 0.2}

Testing Score: 0.41787819375237745








################################################################################ 

Ridge:

Training Score: 0.41815279394769805

Best Params: {'alpha': 2}

Testing Score: 0.4261816569676332




################################################################################ 

Elasticnet:

Training Score: 0.4111130316805121

Best Params: {'l1_ratio': 0.6, 'alpha': 0.2}

Testing Score: 0.4177061508079507








################################################################################ 

SVR:

Training Score: 0.5659891311393979

Best Params: {'kernel': 'rbf'}

Testing Score: 0.5767816831772832




################################################################################ 

DecisionTree:

Training Score: 0.6961566317727843

Best Params: {'min_samples_split': 7, 'min_samples_leaf': 4, 'max_depth': 10}

Testing Score: 0.7082324160807151




################################################################################ 

RandomForest:

Training Score: 0.708707813986199

Best Params: {'n_estimators': 200, 'min_samples_split': 7, 'min_samples_leaf': 7, 'max_depth': 13}

Testing Score: 0.7180853658872189




################################################################################ 

GradientBoostingRegressor:

Training Score: 0.6962382510538351

Best Params: {'subsample': 1.0, 'n_estimators': 300, 'min_samples_split': 3, 'min_samples_leaf': 3, 'max_features': 'log2', 'max_depth':

In [29]:
RandomForest_params = {
  "n_estimators": [50, 100, 150, 200],
  "max_depth": [2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,28,29,20],
  "min_samples_split": [2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,28,29,20],
  "min_samples_leaf": [2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,28,29,20]
}

random_cv, best_params_, best_score_ = random_search_cv(RandomForestRegressor(), X_train, y_train, RandomForest_params)


In [30]:
print(random_cv, best_params_, best_score_)

RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(),
                   param_distributions={'max_depth': [2, 3, 4, 5, 6, 7, 8, 9,
                                                      10, 11, 12, 13, 14, 15,
                                                      16, 17, 28, 29, 20],
                                        'min_samples_leaf': [2, 3, 4, 5, 6, 7,
                                                             8, 9, 10, 11, 12,
                                                             13, 14, 15, 16, 17,
                                                             28, 29, 20],
                                        'min_samples_split': [2, 3, 4, 5, 6, 7,
                                                              8, 9, 10, 11, 12,
                                                              13, 14, 15, 16,
                                                              17, 28, 29, 20],
                                        'n_estimators': [50, 100, 150, 200]},
 

In [31]:
y_pred = random_cv.predict(X_test)
r2_score_value = r2_score(y_test, y_pred)
print(f"Testing Score: {r2_score_value}\n\n\n\n")

Testing Score: 0.7206733112113541






In [27]:

model = load_obj("/Users/abhishek/Desktop/iNeuron/Machine Learning/EndToEnd/ML/Zomato Time Prediction/artifacts/model.pkl")
preprocessor = load_obj("/Users/abhishek/Desktop/iNeuron/Machine Learning/EndToEnd/ML/Zomato Time Prediction/artifacts/preprocessor_obj.pkl")



2024-03-18 23:58:24 - Zomato_Time_Prediction - utils - INFO : Object loaded successfully
2024-03-18 23:58:24 - Zomato_Time_Prediction - utils - INFO : Object loaded successfully


In [33]:
data = {
    "Delivery_person_Age": 30,
    "Delivery_person_Ratings": 4.5,
    "Weather_conditions": 'Sunny',
    "Road_traffic_density": 'Low',
    "Vehicle_condition": 3,
    "Type_of_vehicle": 'motorcycle',
    "Type_of_order": 'Meal',
    "multiple_deliveries": 1,
    "Festival": "No",
    "City": 'Urban',
    "distance": 5.2  # Assuming distance is in km or miles
}

df_new = pd.DataFrame(data=data, index=[0])
df_new



Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_vehicle,Type_of_order,multiple_deliveries,Festival,City,distance
0,30,4.5,Sunny,Low,3,motorcycle,Meal,1,No,Urban,5.2


In [34]:
data_scaled = preprocessor.transform(df_new)


In [35]:
pred = model.predict(data_scaled)
pred

array([13.27])