In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from src.logger import logger
from src.exception import CustomException
from src.utils import *
import sys


# Connection to DB and Data Extraction

In [2]:
# Extraction of Data:
try:
    conn, cursor = mysql_connection()
    sql = "select * from zomato.delivery"
    cursor.execute(sql)
    result = cursor.fetchall()
    df = pd.DataFrame(result)
except Exception as e:
    logger.error("Error extracting data from MySQL")
    raise CustomException(e, sys)
finally:
    conn.close()
    logger.info("MySQL connection closed")


2024-03-16 04:50:27 - Zomato_Time_Prediction - utils - INFO : Connecting to MySQL...
2024-03-16 04:50:28 - Zomato_Time_Prediction - utils - INFO : MySQL connection established
2024-03-16 04:50:28 - Zomato_Time_Prediction - 272529110 - INFO : MySQL connection closed


In [3]:
#Drop columns:
drop_cols = ['SerialNo', 'ID', 'Delivery_person_ID', 'Order_Date', 'Time_Orderd', 'Time_Order_picked' ]
df.drop(labels=drop_cols, axis=1, inplace=True)

In [4]:
df["distance"] = df.apply(lambda row: cal_distance(row['Restaurant_latitude'], row['Restaurant_longitude'], row['Delivery_location_latitude'], row['Delivery_location_longitude']), axis=1)
df.drop(labels=[ 'Restaurant_latitude', 'Restaurant_longitude', 'Delivery_location_latitude', 'Delivery_location_longitude'], axis=1, inplace=True)


In [5]:
cat_cols = []
num_cols = []

for i in df.columns:
    if df[i].dtype == 'object':
        cat_cols.append(i)
    else:
        num_cols.append(i)

In [6]:
# from sklearn.base import BaseEstimator, TransformerMixin
# from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
# from sklearn.impute import SimpleImputer
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline

# class FillEmptyWithMode(BaseEstimator, TransformerMixin):
#     def __init__(self, cat_cols):
#         self.cat_cols = cat_cols

#     def fit(self, X, y=None):
#         self.fill_values_ = {}
#         for column in self.cat_cols:
#             if (X[column] == '').any():
#                 self.fill_values_[column] = X[column][X[column] != ""].mode().iloc[0]
#         return self

#     def transform(self, X, y=None):
#         # Check if X is a DataFrame or not
#         if isinstance(X, pd.DataFrame):
#             X_copy = X.copy()
#         else:
#             # Convert X to a DataFrame if it's a numpy array
#             # This requires knowing the column names beforehand
#             X_copy = pd.DataFrame(X, columns=self.cat_cols)  # 'self.cat_cols' should be the column names

#         # Your transformation logic here, using X_copy
#         for column, fill_value in self.fill_values_.items():
#             X_copy[column] = X_copy[column].replace('', fill_value)
        
#         return X_copy



In [7]:
# for column in cat_cols:
#     has_empty_string = (df2[column] == '').any()
#     has_nan = pd.isnull(df2[column]).any()
#     print(f"{column}: Empty Strings? {has_empty_string}, NaNs? {has_nan}")

In [8]:
# Categorical columns for one-hot encoding
one_hot_cols = ['Weather_conditions', 'Type_of_order', 'Type_of_vehicle']

# Categorical columns for ordinal encoding
ordinal_cols = ['Road_traffic_density', 'Festival', 'City']
Road_Traffic_Map=["Low","Medium","High","Jam"]
Festival_Map=["No","Yes"]
City_Map=["Urban","Metropolitian","Semi-Urban"]

In [9]:
# class OutlierRemover(BaseEstimator, TransformerMixin):
#     def __init__(self, num_cols_indices=None, n_std=2):
#         # Expect indices (integers) instead of column names
#         self.num_cols_indices = num_cols_indices  
#         self.n_std = n_std

#     def fit(self, X, y=None):
#         print(X.shape)
#         self.limits_ = {}
#         for idx in self.num_cols_indices:
#             col_data = X[:, idx] if not isinstance(X, pd.DataFrame) else X.iloc[:, idx]
#             mean = col_data.mean()
#             std_dev = col_data.std()
#             self.limits_[idx] = (mean - self.n_std * std_dev, mean + self.n_std * std_dev)
#         return self

#     def transform(self, X, y=None):
#         print(X.shape)
#         # Ensure X is a numpy array for uniform processing
#         X_copy = X if not isinstance(X, pd.DataFrame) else X.to_numpy()
#         for idx, (lower, upper) in self.limits_.items():
#             # Apply limits; consider handling for pandas separately if needed
#             X_copy[:, idx] = np.where(
#                 (X_copy[:, idx] > upper) | (X_copy[:, idx] < lower),
#                 np.nan,  # Replace outliers with NaN
#                 X_copy[:, idx]
#             )
#         return X_copy


In [10]:
df["Vehicle_condition"] = df["Vehicle_condition"].replace(0,1)
df["multiple_deliveries"] = df["multiple_deliveries"].replace(0.0,1.0)

In [11]:
#outlier Removal:
df_filter = outlier_removal(df, num_cols)
df_filter.shape

(36151, 12)

In [12]:
df_filter = fill_empty_with_mode(df_filter, cat_cols=cat_cols)


In [13]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(df_filter, test_size=0.2, random_state=42)


In [14]:
train_set.columns

Index(['Delivery_person_Age', 'Delivery_person_Ratings', 'Weather_conditions',
       'Road_traffic_density', 'Vehicle_condition', 'Type_of_order',
       'Type_of_vehicle', 'multiple_deliveries', 'Festival', 'City',
       'Time_taken (min)', 'distance'],
      dtype='object')

In [15]:
#Segregating input and output Feature:

target_column = "Time_taken (min)"

input_feature_train_df = train_set.drop(labels=target_column,axis=1)
input_feature_test_df = test_set.drop(labels=target_column,axis=1)


In [16]:
target_training_df = train_set[target_column]
target_testing_df = test_set[target_column]

In [17]:
from sklearn.pipeline import Pipeline
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# # Assuming df is your DataFrame and you've defined cat_cols, num_cols, one_hot_cols, etc

# # Instantiate custom transformers
# fill_empty_with_mode = FillEmptyWithMode(cat_cols=cat_cols)

num_cols_pipe = [col for col in num_cols if col != "Time_taken (min)"]
# Define pipelines for categorical and numeric data
categorical_onehot_pipeline = Pipeline([
    # ('fill_empty', fill_empty_with_mode),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False)),
    ('scaler', StandardScaler())
])

categorical_ordinal_pipeline = Pipeline([
    # ('fill_empty', fill_empty_with_mode),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OrdinalEncoder(sparse_output = False, categories=[Road_Traffic_Map, Festival_Map, City_Map])),
    ('scaler', StandardScaler())
])

numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    # ('outlier_removal', outlier_remover),
    ('scaler', StandardScaler())
])

# Combine pipelines in a ColumnTransformer
preprocessor = ColumnTransformer(transformers=[
    ('cat_one_hot', categorical_onehot_pipeline, one_hot_cols),
    ('cat_ordinal', categorical_ordinal_pipeline, ordinal_cols),
    ('num', numerical_pipeline, num_cols_pipe)
])



In [18]:
input_training_arr = preprocessor.fit_transform(input_feature_train_df)

In [19]:
input_testing_arr = preprocessor.transform(input_feature_test_df)

In [20]:
train_arr = np.c_[input_training_arr, np.array(target_training_df)]
test_arr = np.c_[input_testing_arr, np.array(target_testing_df)]

In [21]:
train_arr[:, -1]

array([13., 11., 24., ..., 23., 37., 27.])

In [22]:
X_train, y_train, X_test, y_test = (
    train_arr[:, :-1],
    train_arr[:, -1],
    test_arr[:, :-1],
    test_arr[:, -1]
)

In [23]:
from sklearn.linear_model import LinearRegression, Ridge,Lasso,ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
models = {
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet(),
    'SVR': SVR(),
    'DecisionTree':DecisionTreeRegressor(),
    'RandomForest':RandomForestRegressor()
}

In [30]:
import yaml
config_path = "../params.yaml"

#Load yaml file:
with open(config_path, 'r') as file:
    config = yaml.safe_load(file)



In [31]:
random_forest_params = config["SVR"]
random_forest_params

{'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}

In [28]:
from sklearn.metrics import r2_score

for model_name, model_instance in models.items():
    if model_name != "LinearRegression":
        model_params = config[model_name]
        random_cv, best_params_, best_score_ = random_search_cv(model_instance, X_train, y_train, model_params)
        print("#"*20, "\n")
        print(f"{model_name}:\n")
        print(f"Training Score: {best_score_}\n")
        print(f"Best Params: {best_params_}\n")
        y_pred = random_cv.predict(X_test)
        r2_score_value = r2_score(y_test, y_pred)
        print(f"Testing Score: {r2_score_value}\n\n\n\n")
    else:
        model = model_instance
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        r2_score_value = r2_score(y_test, y_pred)
        print("#"*20, "\n")
        print(f"{model_name}:\n")
        print(f"Testing Score: {r2_score_value}\n\n\n\n")

#################### 

LinearRegression:

Testing Score: 0.4411931564967969








#################### 

Lasso:

Training Score: 0.4257620856641628

Best Params: {'alpha': 0.2}

Testing Score: 0.4320840331644533








#################### 

Ridge:

Training Score: 0.4336620976107401

Best Params: {'alpha': 2}

Testing Score: 0.4411830667653901




#################### 

Elasticnet:

Training Score: 0.37924318830620285

Best Params: {'l1_ratio': 0.7, 'alpha': 0.6}

Testing Score: 0.3844129750096714








ValueError: X should be a square kernel matrix

In [32]:
from sklearn.metrics import r2_score

for model_name, model_instance in models.items():
    if model_name != "LinearRegression":
        model_params = config[model_name]
        random_cv, best_params_, best_score_ = random_search_cv(model_instance, X_train, y_train, model_params)
        print("#"*20, "\n")
        print(f"{model_name}:\n")
        print(f"Training Score: {best_score_}\n")
        print(f"Best Params: {best_params_}\n")
        y_pred = random_cv.predict(X_test)
        r2_score_value = r2_score(y_test, y_pred)
        print(f"Testing Score: {r2_score_value}\n\n\n\n")
    else:
        model = model_instance
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        r2_score_value = r2_score(y_test, y_pred)
        print("#"*20, "\n")
        print(f"{model_name}:\n")
        print(f"Testing Score: {r2_score_value}\n\n\n\n")

#################### 

LinearRegression:

Testing Score: 0.4411931564967969








#################### 

Lasso:

Training Score: 0.4257620856641628

Best Params: {'alpha': 0.2}

Testing Score: 0.4320840331644533








#################### 

Ridge:

Training Score: 0.4336620976107401

Best Params: {'alpha': 2}

Testing Score: 0.4411830667653901




#################### 

Elasticnet:

Training Score: 0.3696173776983672

Best Params: {'l1_ratio': 1, 'alpha': 0.6}

Testing Score: 0.37455862534321416








#################### 

SVR:

Training Score: 0.5907932169447208

Best Params: {'kernel': 'rbf'}

Testing Score: 0.6032467579577171




#################### 

DecisionTree:

Training Score: 0.6482111103282142

Best Params: {'min_samples_split': 4, 'min_samples_leaf': 3, 'max_depth': 7}

Testing Score: 0.6677740000438843






ValueError: 
All the 50 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/abhishek/Desktop/iNeuron/Machine Learning/EndToEnd/ML/Zomato Time Prediction/venv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/abhishek/Desktop/iNeuron/Machine Learning/EndToEnd/ML/Zomato Time Prediction/venv/lib/python3.11/site-packages/sklearn/base.py", line 1467, in wrapper
    estimator._validate_params()
  File "/Users/abhishek/Desktop/iNeuron/Machine Learning/EndToEnd/ML/Zomato Time Prediction/venv/lib/python3.11/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/Users/abhishek/Desktop/iNeuron/Machine Learning/EndToEnd/ML/Zomato Time Prediction/venv/lib/python3.11/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'criterion' parameter of RandomForestRegressor must be a str among {'friedman_mse', 'poisson', 'absolute_error', 'squared_error'}. Got 'gini' instead.

--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/abhishek/Desktop/iNeuron/Machine Learning/EndToEnd/ML/Zomato Time Prediction/venv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/abhishek/Desktop/iNeuron/Machine Learning/EndToEnd/ML/Zomato Time Prediction/venv/lib/python3.11/site-packages/sklearn/base.py", line 1467, in wrapper
    estimator._validate_params()
  File "/Users/abhishek/Desktop/iNeuron/Machine Learning/EndToEnd/ML/Zomato Time Prediction/venv/lib/python3.11/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/Users/abhishek/Desktop/iNeuron/Machine Learning/EndToEnd/ML/Zomato Time Prediction/venv/lib/python3.11/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'criterion' parameter of RandomForestRegressor must be a str among {'friedman_mse', 'poisson', 'absolute_error', 'squared_error'}. Got 'entropy' instead.


In [36]:
model_params = {
  "n_estimators": [50, 100],
  "max_depth": [num for num in range(2,20)],
  "min_samples_split": [num for num in range(2,20)],
  "min_samples_leaf": [num for num in range(2,20)]
}
random_cv, best_params_, best_score_ = random_search_cv(RandomForestRegressor(), X_train, y_train, model_params)
print("#"*20, "\n")
print(f"{model_name}:\n")
print(f"Training Score: {best_score_}\n")
print(f"Best Params: {best_params_}\n")
y_pred = random_cv.predict(X_test)
r2_score_value = r2_score(y_test, y_pred)
print(f"Testing Score: {r2_score_value}\n\n\n\n")

#################### 

RandomForest:

Training Score: 0.7111607889329821

Best Params: {'n_estimators': 100, 'min_samples_split': 3, 'min_samples_leaf': 7, 'max_depth': 10}

Testing Score: 0.721230318199608






In [40]:
from sklearn.ensemble import BaggingRegressor


model = BaggingRegressor(estimator=RandomForestRegressor(), random_state=42)

# Define a distribution of parameters to sample from
param_distributions = {
    'n_estimators': [10, 50, 100],
    'max_samples': [0.5, 0.7, 1.0],  # Fraction of samples to draw from X to train each base estimator
    'max_features': [0.5, 0.7, 1.0],  # Fraction of features to draw from X to train each base estimator
    'bootstrap': [True, False],
    'bootstrap_features': [True, False]
}

random_cv, best_params_, best_score_ = random_search_cv(BaggingRegressor(), X_train, y_train, param_distributions)
print("#"*20, "\n")
print(f"{model_name}:\n")
print(f"Training Score: {best_score_}\n")
print(f"Best Params: {best_params_}\n")
y_pred = random_cv.predict(X_test)
r2_score_value = r2_score(y_test, y_pred)
print(f"Testing Score: {r2_score_value}\n\n\n\n")

#################### 

RandomForest:

Training Score: 0.6933693632320084

Best Params: {'n_estimators': 50, 'max_samples': 0.5, 'max_features': 1.0, 'bootstrap_features': False, 'bootstrap': True}

Testing Score: 0.7027865810848773






In [43]:
from sklearn.ensemble import GradientBoostingRegressor

# Initialize the GradientBoostingRegressor
model_name = GradientBoostingRegressor(random_state=42)

# Define a distribution of parameters to sample from
param_distributions = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3],
    'max_features': [None, 'sqrt', 'log2'],
    'subsample': [0.8, 0.9, 1.0]  # Fraction of samples to use for fitting the individual base learners
}


random_cv, best_params_, best_score_ = random_search_cv(model_name, X_train, y_train, param_distributions)
print("#"*20, "\n")
print(f"{model_name}:\n")
print(f"Training Score: {best_score_}\n")
print(f"Best Params: {best_params_}\n")
y_pred = random_cv.predict(X_test)
r2_score_value = r2_score(y_test, y_pred)
print(f"Testing Score: {r2_score_value}\n\n\n\n")

#################### 

GradientBoostingRegressor(random_state=42):

Training Score: 0.6953225811782637

Best Params: {'subsample': 1.0, 'n_estimators': 300, 'min_samples_split': 3, 'min_samples_leaf': 3, 'max_features': 'log2', 'max_depth': 5, 'learning_rate': 0.1}

Testing Score: 0.7082738124939238




