In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from src.logger import logger
from src.exception import CustomException
from src.utils import *
import sys


In [2]:
# Extraction of Data:
# Read the whole zomato.delivery table into a DataFrame.
# conn is pre-bound to None so the finally block can tell whether a connection
# was ever opened; otherwise a failure inside mysql_connection() would surface
# as a NameError on conn.close() and hide the real error.
conn = None
try:
    conn, cursor = mysql_connection()
    sql = "select * from zomato.delivery"
    cursor.execute(sql)
    result = cursor.fetchall()
    df = pd.DataFrame(result)
except Exception as e:
    logger.error("Error extracting data from MySQL")
    raise CustomException(e, sys) from e
finally:
    # Only close (and only report closing) a connection that was actually opened.
    if conn is not None:
        conn.close()
        logger.info("MySQL connection closed")


2024-03-18 16:06:58 - Zomato_Time_Prediction - utils - INFO : Connecting to MySQL...
2024-03-18 16:06:58 - Zomato_Time_Prediction - utils - INFO : MySQL connection established
2024-03-18 16:06:59 - Zomato_Time_Prediction - 272529110 - INFO : MySQL connection closed


In [3]:
# Remove the columns listed in drop_cols from df (in place).
drop_cols = ['SerialNo', 'ID', 'Delivery_person_ID', 'Order_Date', 'Time_Orderd', 'Time_Order_picked']
df.drop(columns=drop_cols, inplace=True)

# Collapse the four coordinate columns into one "distance" column.
# cal_distance comes from src.utils (not shown here); presumably it returns the
# restaurant-to-delivery-location distance - TODO confirm units there.
df["distance"] = df.apply(
    lambda order: cal_distance(
        order['Restaurant_latitude'],
        order['Restaurant_longitude'],
        order['Delivery_location_latitude'],
        order['Delivery_location_longitude'],
    ),
    axis=1,
)
# The raw coordinates are dropped once the distance has been derived.
df.drop(
    columns=[
        'Restaurant_latitude',
        'Restaurant_longitude',
        'Delivery_location_latitude',
        'Delivery_location_longitude',
    ],
    inplace=True,
)


In [4]:
# Partition the column names by dtype: columns stored as 'object' are treated
# as categorical, every other column as numerical.
cat_cols = [name for name in df.columns if df[name].dtype == 'object']
num_cols = [name for name in df.columns if not (df[name].dtype == 'object')]

In [5]:
# fill_empty_with_mode is imported from src.utils (not shown here); presumably it
# fills missing values in the listed categorical columns with each column's
# mode - TODO confirm against the helper.
df = fill_empty_with_mode(df,cat_cols)

In [6]:
# Threshold used in the cells below: 10% of the overall mean of the target.
thereshold_percentage = 0.1
mean = df["Time_taken (min)"].mean()
threshold_value = thereshold_percentage * mean
threshold_value

2.63658376810238

In [11]:
# Standard deviation of the per-category mean target for Weather_conditions.
df_weather = df.groupby("Weather_conditions")[["Time_taken (min)"]].mean()
df_weather["Time_taken (min)"].std()

2.5996474791257365

In [13]:
# Standard deviation of the per-category mean target for Road_traffic_density.
df_road = df.groupby("Road_traffic_density")[["Time_taken (min)"]].mean()
df_road["Time_taken (min)"].std()

4.054263656149913

In [14]:
# Standard deviation of the per-category mean target for Type_of_order.
# NOTE(review): the result is stored in df_road although it is grouped by
# Type_of_order; the name is kept so the notebook state stays unchanged.
df_road = df.groupby("Type_of_order")[["Time_taken (min)"]].mean()
df_road["Time_taken (min)"].std()

0.08766091041580491

In [15]:
# Standard deviation of the per-category mean target for Type_of_vehicle.
# NOTE(review): the result is stored in df_road although it is grouped by
# Type_of_vehicle; the name is kept so the notebook state stays unchanged.
df_road = df.groupby("Type_of_vehicle")[["Time_taken (min)"]].mean()
df_road["Time_taken (min)"].std()

1.5038591080461499

In [16]:
# Standard deviation of the per-category mean target for Festival.
# NOTE(review): the result is stored in df_road although it is grouped by
# Festival; the name is kept so the notebook state stays unchanged.
df_road = df.groupby("Festival")[["Time_taken (min)"]].mean()
df_road["Time_taken (min)"].std()

13.79609904522959

In [17]:
# Standard deviation of the per-category mean target for City.
# NOTE(review): the result is stored in df_road although it is grouped by
# City; the name is kept so the notebook state stays unchanged.
df_road = df.groupby("City")[["Time_taken (min)"]].mean()
df_road["Time_taken (min)"].std()

14.329871133322866

In [7]:
# Recode zeros as ones in both columns.
# NOTE(review): the notebook does not state why 0 is mapped to 1 - confirm intent.
df["Vehicle_condition"] = df["Vehicle_condition"].replace({0: 1})
df["multiple_deliveries"] = df["multiple_deliveries"].replace({0.0: 1.0})

# Dividing categorical columns between ordinal and one-hot encoding:

1. Low Standard Deviation: If the standard deviation of the per-category mean time taken is low (e.g., no more than 10% of the overall mean of the response variable), the mean time taken barely varies between categories. In such cases, OneHotEncoder is a suitable choice, especially if the categorical variable is nominal.

2. High Standard Deviation: If the standard deviation of the per-category mean time taken is high (e.g., greater than 10% of the overall mean of the response variable), the mean time taken varies significantly between categories. For ordinal variables, or when you want to capture this variability without increasing dimensionality too much, OrdinalEncoder might be more appropriate.

In [8]:
class FeatureClassifier:
    """Split the columns of a DataFrame into one-hot, ordinal and numerical groups.

    The split is driven by how much the mean of ``target_column`` varies across
    the categories of each ``object``-dtype column.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the feature columns and the target column.
    target_column : str
        Name of the response column in ``df``.
    """

    def __init__(self,df, target_column):
        self.df = df
        self.target_column = target_column

    def get_ordinal_columns_mapping(self,columns):
        """Return the category ordering for each of the given columns.

        For every column the categories are ordered by ascending mean of the
        target column. The result maps ``'<ColumnName>_Map'`` to that ordered
        list of categories, in the same order as ``columns``.
        """
        ordinal_columns_mapping = {}
        for col in columns:
            sorted_groups = self.df.groupby(col)[self.target_column].mean().sort_values().index.tolist()
            ordinal_columns_mapping[f"{col}_Map"] = sorted_groups

        return ordinal_columns_mapping

    def ordinal_onehot_numerical_divide(self, threshold_percentage=0.1):
        """Divide the columns into one-hot, ordinal and numerical lists.

        An ``object``-dtype column (other than the target) is ordinal when the
        standard deviation of its per-category target means exceeds
        ``threshold_percentage`` times the overall target mean; otherwise it is
        one-hot. Every other column, including the target column itself, is
        placed in the numerical list.

        Parameters
        ----------
        threshold_percentage : float, default 0.1
            Fraction of the overall target mean used as the cut-off.

        Returns
        -------
        tuple
            ``(one_hot_cols, ordinal_cols, num_cols, ordinal_columns_mapping)``.

        Raises
        ------
        CustomException
            If any step of the classification fails.
        """
        one_hot_cols = []
        ordinal_cols = []
        num_cols = []
        try:
            # Use the frame held by this instance, not a notebook-level `df`,
            # so the result never depends on hidden global state.
            threshold_value = self.df[self.target_column].mean() * threshold_percentage

            for column in self.df.columns:
                if column != self.target_column and self.df[column].dtype == 'object':
                    category_means = self.df.groupby(column)[self.target_column].mean()
                    # A NaN std (e.g. a single category) compares False and
                    # therefore falls through to one-hot.
                    if category_means.std() > threshold_value:
                        ordinal_cols.append(column)
                    else:
                        one_hot_cols.append(column)
                else:
                    num_cols.append(column)

            # Get mappings for ordinal columns:
            ordinal_columns_mapping = self.get_ordinal_columns_mapping(ordinal_cols)
            return (one_hot_cols, ordinal_cols, num_cols, ordinal_columns_mapping)

        except Exception as e:
            logger.error("Error in feature_classifier.ordinal_onehot_numerical_divide: {}".format(e))
            # Same call shape as the extraction cell: the exception plus sys.
            raise CustomException(e, sys) from e


In [9]:
# Response column used by the classifier and by the train/test split below.
target_column = "Time_taken (min)"
feature_classifier_obj = FeatureClassifier(df=df, target_column=target_column)

In [10]:
# Run the classification; note that this rebinds num_cols from the dtype split.
(
    one_hot_cols,
    ordinal_cols,
    num_cols,
    ordinal_columns_mapping,
) = feature_classifier_obj.ordinal_onehot_numerical_divide()

In [11]:
# Display the three column groups; the "\n" strings are shown as tuple items.
(one_hot_cols, "\n", ordinal_cols, "\n", num_cols)

(['Weather_conditions', 'Type_of_order', 'Type_of_vehicle'],
 '\n',
 ['Road_traffic_density', 'Festival', 'City'],
 '\n',
 ['Delivery_person_Age',
  'Delivery_person_Ratings',
  'Vehicle_condition',
  'multiple_deliveries',
  'Time_taken (min)',
  'distance'])

In [12]:
# Display the '<Column>_Map' -> ordered categories mapping returned by the classifier.
ordinal_columns_mapping

{'Road_traffic_density_Map': ['Low', 'Medium', 'High', 'Jam'],
 'Festival_Map': ['No', 'Yes'],
 'City_Map': ['Urban', 'Metropolitian', 'Semi-Urban']}

In [13]:
#outlier Removal:
# outlier_removal is imported from src.utils (not shown here); presumably it returns
# a copy of df without the rows that are outliers in the given columns - TODO confirm.
# num_cols still contains the target column "Time_taken (min)" at this point.
df_filter = outlier_removal(df, num_cols)

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the filtered rows for testing; the seed keeps the split repeatable.
train_set, test_set = train_test_split(df_filter, test_size=0.2, random_state=42)

# Targets: the response column of each split.
target_training_df = train_set[target_column]
target_testing_df = test_set[target_column]

# Features: every remaining column of each split.
input_feature_train_df = train_set.drop(columns=target_column)
input_feature_test_df = test_set.drop(columns=target_column)

In [15]:
# Ordered category lists, in the mapping's insertion order (one list per ordinal column).
categories = list(ordinal_columns_mapping.values())
categories


[['Low', 'Medium', 'High', 'Jam'],
 ['No', 'Yes'],
 ['Urban', 'Metropolitian', 'Semi-Urban']]

In [16]:
from sklearn.pipeline import Pipeline
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


# Numerical feature columns: num_cols without the target, which must not be
# passed through the feature preprocessor.
num_cols_pipe = [col for col in num_cols if col != target_column]

# Define pipelines for categorical and numeric data
categorical_onehot_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # handle_unknown='ignore' encodes a category that was not seen during fit
    # as all zeros instead of raising when the test split is transformed.
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
    ('scaler', StandardScaler())
])

categorical_ordinal_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # NOTE(review): this step is named 'ohe' although it is an OrdinalEncoder;
    # the name is kept because it is part of the pipeline's parameter keys.
    # `categories` must list one ordering per column, in ordinal_cols order.
    ('ohe', OrdinalEncoder(categories=categories)),
    ('scaler', StandardScaler())
])

numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Combine pipelines in a ColumnTransformer
preprocessor = ColumnTransformer(transformers=[
    ('cat_one_hot', categorical_onehot_pipeline, one_hot_cols),
    ('cat_ordinal', categorical_ordinal_pipeline, ordinal_cols),
    ('num', numerical_pipeline, num_cols_pipe)
])



In [17]:
# Fit the preprocessor on the training features only, then apply the already
# fitted transformer to the test features so no test statistics leak into the fit.
input_training_arr = preprocessor.fit_transform(input_feature_train_df)
input_testing_arr = preprocessor.transform(input_feature_test_df)

In [18]:
# Append the target as the last column of each transformed feature array.
train_arr = np.column_stack((input_training_arr, np.array(target_training_df)))
test_arr = np.column_stack((input_testing_arr, np.array(target_testing_df)))

In [19]:
# Every column but the last is a feature; the last column is the target.
X_train = train_arr[:, :-1]
y_train = train_arr[:, -1]
X_test = test_arr[:, :-1]
y_test = test_arr[:, -1]

In [20]:
from sklearn.linear_model import LinearRegression, Ridge,Lasso,ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, GradientBoostingRegressor

# Candidate regressors, keyed by name.
# The estimators that draw random numbers get a fixed random_state (the same
# seed as the train/test split) so repeated runs give the same scores.
models = {
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet(),
    'SVR': SVR(),
    'DecisionTree':DecisionTreeRegressor(random_state=42),
    'RandomForest':RandomForestRegressor(random_state=42),
    'GradientBoostingRegressor':GradientBoostingRegressor(random_state=42),
    'BaggingRegressor' : BaggingRegressor(random_state=42)
}

In [21]:
import yaml
# Location of the parameter file, relative to the notebook's working directory.
config_path = "../params.yaml"

# Parse the YAML file into `config` (safe_load only builds plain Python objects).
with open(config_path, mode='r') as params_file:
    config = yaml.safe_load(params_file)

In [None]:
from sklearn.metrics import r2_score

# Fit every candidate model and report its R^2 on the held-out test set.
for model_name, model_instance in models.items():
    if model_name == "LinearRegression":
        # LinearRegression is fitted directly, without a parameter search.
        model = model_instance
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        r2_score_value = r2_score(y_test, y_pred)
        print("#"*80, "\n")
        print(f"{model_name}:\n")
        print(f"Testing Score: {r2_score_value}\n\n\n\n")
        continue

    # All other models are tuned with the search space stored under their name
    # in the YAML config. random_search_cv comes from src.utils (not shown);
    # presumably best_score_ is the search's cross-validation score - TODO confirm.
    model_params = config[model_name]
    random_cv, best_params_, best_score_ = random_search_cv(model_instance, X_train, y_train, model_params)
    print("#"*80, "\n")
    print(f"{model_name}:\n")
    print(f"Training Score: {best_score_}\n")
    print(f"Best Params: {best_params_}\n")
    y_pred = random_cv.predict(X_test)
    r2_score_value = r2_score(y_test, y_pred)
    print(f"Testing Score: {r2_score_value}\n\n\n\n")