In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,MinMaxScaler,PowerTransformer
from sklearn.pipeline import Pipeline

In [2]:
!pip install dagshub mlflow

Collecting dagshub
  Downloading dagshub-0.5.9-py3-none-any.whl.metadata (12 kB)
Collecting mlflow
  Downloading mlflow-2.21.2-py3-none-any.whl.metadata (30 kB)
Collecting appdirs>=1.4.4 (from dagshub)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting dacite~=1.6.0 (from dagshub)
  Downloading dacite-1.6.0-py3-none-any.whl.metadata (14 kB)
Collecting gql[requests] (from dagshub)
  Downloading gql-3.5.2-py2.py3-none-any.whl.metadata (9.4 kB)
Collecting dataclasses-json (from dagshub)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting treelib>=1.6.4 (from dagshub)
  Downloading treelib-1.7.1-py3-none-any.whl.metadata (1.4 kB)
Collecting pathvalidate>=3.0.0 (from dagshub)
  Downloading pathvalidate-3.2.3-py3-none-any.whl.metadata (12 kB)
Collecting boto3 (from dagshub)
  Downloading boto3-1.37.23-py3-none-any.whl.metadata (6.7 kB)
Collecting semver (from dagshub)
  Downloading semver-3.0.4-py3-none-any.whl.metadata (6.8 kB)
Collec

In [3]:
import dagshub

# Repository that receives the tracking data for this notebook.
REPO_OWNER = 'amitkumar981'
REPO_NAME = 'swiggy-delivery-time-prediction'

# Initialise the DagsHub client for that repository with MLflow support on.
# This call may prompt for browser-based authorisation.
dagshub.init(repo_owner=REPO_OWNER, repo_name=REPO_NAME, mlflow=True)

Output()



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=f623989d-e528-4e46-b06f-1fdcdd6ee3fe&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=4e56b87d0e8883e7af5b3af2a209235edbd45e16d24813802e1b67f8f5368d2a




In [5]:
import mlflow

# Point every subsequent MLflow call at the DagsHub-hosted tracking server.
TRACKING_URI = 'https://dagshub.com/amitkumar981/swiggy-delivery-time-prediction.mlflow'
mlflow.set_tracking_uri(TRACKING_URI)

In [6]:
# Select the experiment that all runs in this notebook are recorded under.
# NOTE(review): the name mentions LightGBM, but the objective in this notebook
# tunes a RandomForestRegressor -- confirm the experiment name is intended.
EXPERIMENT_NAME = 'exp_6-lightGBM_HP'
mlflow.set_experiment(EXPERIMENT_NAME)

2025/03/31 05:47:49 INFO mlflow.tracking.fluent: Experiment with name 'exp_6-lightGBM_HP' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/b8770e8844f84138ae1eb15fb40e7ce9', creation_time=1743400069853, experiment_id='5', last_update_time=1743400069853, lifecycle_stage='active', name='exp_6-lightGBM_HP', tags={}>

In [7]:
# Read the prepared dataset from disk and preview its first rows.
DATA_PATH = '/content/final_df (4).csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,person_age,person_ratings,latitude,longitude,delivery_location_latitude,delivery_location_longitude,weather_condition,traffic_density,vehicle_condition,order_type,...,city_name,day,month,day_of_week,is_weekend,pickup_time_in_minutes,order_time_hr,time_slot,distance,distance_type
0,37.0,4.9,22.745049,75.892471,22.765049,75.912471,sunny,High,2,Snack,...,INDO,19,3,Saturday,1,15.0,11.0,Morning,3.025149,short
1,34.0,4.5,12.913041,77.683237,13.043041,77.813237,stormy,Jam,2,Snack,...,BANG,25,3,Friday,0,5.0,19.0,Evening,20.18353,very long
2,23.0,4.4,12.914264,77.6784,12.924264,77.6884,sandstorms,Low,0,Drinks,...,BANG,19,3,Saturday,1,15.0,8.0,Morning,1.552758,short
3,38.0,4.7,11.003669,76.976494,11.053669,77.026494,sunny,Medium,0,Buffet,...,COIMB,5,4,Tuesday,0,10.0,18.0,Evening,7.790401,medium
4,32.0,4.6,12.972793,80.249982,13.012793,80.289982,cloudy,High,1,Snack,...,CHEN,26,3,Saturday,1,15.0,13.0,Afternoon,6.210138,medium


In [8]:
# Remove the coordinate, city-name and date/time columns that are not used as
# model inputs. The drop happens in place, so this cell can only run once per
# freshly loaded frame.
columns_to_drop = [
    'latitude',
    'longitude',
    'delivery_location_latitude',
    'delivery_location_longitude',
    'city_name',
    'day',
    'month',
    'day_of_week',
    'order_time_hr',
]
df.drop(columns=columns_to_drop, inplace=True)

In [9]:
# Discard every row that has at least one missing value (in place).
df.dropna(axis=0, how='any', inplace=True)

In [10]:
# Sanity check: total number of missing cells left in the frame.
df.isna().sum().sum()

np.int64(0)

In [11]:
# Separate the dependent variable (time_taken) from the independent variables.
TARGET = 'time_taken'
y = df[TARGET]
x = df.drop(columns=TARGET)

In [12]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split stable
# across re-runs.
TEST_FRACTION = 0.20
SPLIT_SEED = 42
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)
x_train.shape, x_test.shape

((30451, 15), (7613, 15))

In [13]:
# Numerical columns: these are min-max scaled by the preprocessor.
num_cols = [
    'person_age',
    'person_ratings',
    'pickup_time_in_minutes',
    'distance',
]

# Categorical columns that are one-hot encoded by the preprocessor.
num_cat_cols = [
    'weather_condition',
    'order_type',
    'vehicle_type',
    'multiple_deliveries',
    'festival',
    'city',
    'time_slot',
]

# Categorical columns with a natural order; these are ordinal-encoded.
ordinal_cat_cols = [
    'traffic_density',
    'distance_type',
]

In [14]:
# Category orderings (lowest to highest) handed to the OrdinalEncoder.
# NOTE(review): the traffic labels carry a trailing space; presumably this
# matches the raw values stored in the CSV -- verify before "cleaning" them.
traffic_order = [
    'Low ',
    'Medium ',
    'High ',
    'Jam ',
]
distance_order = [
    'short',
    'medium',
    'long',
    'very long',
]

In [15]:
# One transformer per column group.
scaler = MinMaxScaler()
one_hot = OneHotEncoder(
    drop='first',
    handle_unknown='ignore',
    sparse_output=False,
    feature_name_combiner='concat',
)
ordinal = OrdinalEncoder(categories=[traffic_order, distance_order])

# Columns not listed in any group are passed through unchanged, and output
# feature names are left without the transformer-name prefix.
preprocessor = ColumnTransformer(
    transformers=[
        ('scaling', scaler, num_cols),
        ('OHE', one_hot, num_cat_cols),
        ('ordinal_encoding', ordinal, ordinal_cat_cols),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
    force_int_remainder_cols=False,
)

# Ask the transformer to return pandas DataFrames from transform().
preprocessor.set_output(transform='pandas')

In [16]:
# Wrap the column transformer in a single-step pipeline.
pipeline_steps = [('preprocessor', preprocessor)]
preprocessor_pipeline = Pipeline(pipeline_steps)

In [17]:
# Fit the preprocessing on the training split only, then apply the already
# fitted pipeline to the test split so the test data does not influence the
# learned scaling/encoding.
x_train_trans=preprocessor_pipeline.fit_transform(x_train)
x_test_trans=preprocessor_pipeline.transform(x_test)
# Show the transformed training frame as the cell output.
x_train_trans

Unnamed: 0,person_age,person_ratings,pickup_time_in_minutes,distance,weather_condition_fog,weather_condition_sandstorms,weather_condition_stormy,weather_condition_sunny,weather_condition_windy,order_type_Drinks,...,festival_yes,city_semi-urban,city_urban,time_slot_Evening,time_slot_Morning,time_slot_Night,traffic_density,distance_type,vehicle_condition,is_weekend
24939,0.052632,0.80,0.5,0.163629,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,1.0,0.0,0.0,3.0,0.0,1,0
9439,0.526316,0.96,1.0,0.157036,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,2,0
18409,0.631579,1.00,0.0,0.160160,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,3.0,0.0,2,1
15267,0.789474,0.96,0.5,0.003145,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2,0
43924,0.157895,0.88,0.5,0.084651,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20042,0.368421,0.88,1.0,0.621266,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,1.0,2.0,2,1
7521,0.842105,0.88,0.0,0.788907,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,3.0,3.0,0,1
13483,0.736842,0.92,0.5,0.154044,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2,0
1038,0.368421,0.92,0.5,0.992901,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,2,0


In [18]:
# Power-transform the target. The transformer is fitted on the training target
# only and then reused for the test target. Each target is reshaped to a
# single column because the transformer is given 2-D input.
pt = PowerTransformer()
y_train_col = y_train.values.reshape(-1, 1)
y_test_col = y_test.values.reshape(-1, 1)
y_train_pt = pt.fit_transform(y_train_col)
y_test_pt = pt.transform(y_test_col)

In [19]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.2.1-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.6/383.6 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.9.0 optuna-4.2.1


In [20]:
from sklearn.ensemble import RandomForestRegressor
import optuna
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.compose import TransformedTargetRegressor

In [23]:
def objective(trial):
    """Optuna objective: cross-validated MAE of a random forest.

    Samples one set of ``RandomForestRegressor`` hyperparameters from
    ``trial``, wraps the forest in a ``TransformedTargetRegressor`` that uses
    ``pt`` as the target transformer, and scores it with 5-fold
    cross-validation on ``x_train_trans`` against the untransformed
    ``y_train``. The score is logged to a nested MLflow run under the key
    ``"cv score"``.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The mean absolute error averaged over the 5 folds (lower is better).
    """
    with mlflow.start_run(nested=True):
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 50, 500, step=50),
            "max_depth": trial.suggest_int("max_depth", 3, 30),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
            "bootstrap": trial.suggest_categorical("bootstrap", [True, False]),
        }
        # max_samples only applies when bootstrapping, so it is sampled only
        # in that case; the already-sampled "bootstrap" value is reused
        # instead of asking the trial for it a second time.
        params["max_samples"] = (
            trial.suggest_float("max_samples", 0.5, 1.0)
            if params["bootstrap"]
            else None
        )
        params["min_impurity_decrease"] = trial.suggest_float(
            "min_impurity_decrease", 0.0, 0.1
        )

        # A fixed seed keeps trial scores comparable: differences between
        # trials then come from the hyperparameters, not from forest randomness.
        reg_model = TransformedTargetRegressor(
            regressor=RandomForestRegressor(random_state=42, **params),
            transformer=pt,
        )

        # No standalone fit is done here. cross_val_score clones and fits the
        # estimator on every fold, so a separate fit would only add cost, and
        # fitting this wrapper on the already-transformed y_train_pt would
        # apply the target transform twice.
        cv_score = cross_val_score(
            reg_model,
            x_train_trans,
            y_train,
            cv=5,
            scoring="neg_mean_absolute_error",
            n_jobs=-1,
        )
        # The scorer returns negated MAE; flip the sign back.
        cv_mean_score = -(cv_score.mean())

        # log cv score
        mlflow.log_metric("cv score", cv_mean_score)

        return cv_mean_score

In [25]:
# Create the study; the objective returns a mean absolute error, so lower is better.
study=optuna.create_study(direction='minimize')

with mlflow.start_run(run_name='best_model'):
    # Run the hyperparameter search; each trial logs its own CV score from
    # inside `objective`.
    study.optimize(objective,n_trials=50,n_jobs=-1,show_progress_bar=True)

    # log best params
    mlflow.log_params(study.best_params)

    # log best value
    # NOTE(review): this is a numeric score stored as a param; consider
    # logging it with mlflow.log_metric instead.
    mlflow.log_param('best_score',study.best_value)

    # Train a forest with the best params on the power-transformed target.
    # ravel() hands y over as a 1-D array; passing the (n, 1) column vector
    # makes scikit-learn emit a DataConversionWarning. The fixed seed makes the
    # logged model and its metrics reproducible.
    best_rf=RandomForestRegressor(**study.best_params,random_state=42)
    best_rf.fit(x_train_trans,y_train_pt.ravel())

    # get the predictions (still on the transformed target scale)
    y_pred_train = best_rf.predict(x_train_trans)
    y_pred_test = best_rf.predict(x_test_trans)

    # map the predictions back to the original target scale
    y_pred_train_org = pt.inverse_transform(y_pred_train.reshape(-1,1))
    y_pred_test_org = pt.inverse_transform(y_pred_test.reshape(-1,1))

    # perform cross validation against the untransformed target; the wrapper
    # applies `pt` to the target internally and cross_val_score clones the
    # estimator for every fold, so the fitted best_rf above is not modified.
    model = TransformedTargetRegressor(regressor=best_rf,
                                       transformer=pt)

    scores = cross_val_score(model,
                             x_train_trans,
                             y_train,
                             scoring="neg_mean_absolute_error",
                             cv=5,n_jobs=-1)

    # log metrics (all computed on the original target scale)
    mlflow.log_metric("training_error",mean_absolute_error(y_train,y_pred_train_org))
    mlflow.log_metric("test_error",mean_absolute_error(y_test,y_pred_test_org))
    mlflow.log_metric("training_r2",r2_score(y_train,y_pred_train_org))
    mlflow.log_metric("test_r2",r2_score(y_test,y_pred_test_org))
    mlflow.log_metric("cross_val",- scores.mean())

    # log model: pass the fitted estimator object itself. The previous call
    # passed the string 'best_rf', so no usable model was stored.
    # The logged forest predicts on the transformed target scale; `pt` is
    # needed to map its predictions back to the original scale.
    mlflow.sklearn.log_model(best_rf,'model')

[I 2025-03-31 06:17:04,511] A new study created in memory with name: no-name-5194850d-ab67-448a-82a9-aef2c76f5aea


  0%|          | 0/50 [00:00<?, ?it/s]

🏃 View run exultant-sheep-376 at: https://dagshub.com/amitkumar981/swiggy-delivery-time-prediction.mlflow/#/experiments/5/runs/beaf7fc459de4a09aadea9ad4486e19e
🧪 View experiment at: https://dagshub.com/amitkumar981/swiggy-delivery-time-prediction.mlflow/#/experiments/5
[I 2025-03-31 06:17:52,895] Trial 1 finished with value: 5.985251919766326 and parameters: {'n_estimators': 100, 'max_depth': 26, 'min_samples_split': 9, 'min_samples_leaf': 1, 'bootstrap': False, 'min_impurity_decrease': 0.0883187949487964}. Best is trial 1 with value: 5.985251919766326.
🏃 View run calm-cod-2 at: https://dagshub.com/amitkumar981/swiggy-delivery-time-prediction.mlflow/#/experiments/5/runs/7144f0f3be984b8bbbf1c0bdb8321c0c
🧪 View experiment at: https://dagshub.com/amitkumar981/swiggy-delivery-time-prediction.mlflow/#/experiments/5
[I 2025-03-31 06:18:29,210] Trial 0 finished with value: 4.807091403135984 and parameters: {'n_estimators': 150, 'max_depth': 6, 'min_samples_split': 18, 'min_samples_leaf': 10, 



🏃 View run luminous-mule-405 at: https://dagshub.com/amitkumar981/swiggy-delivery-time-prediction.mlflow/#/experiments/5/runs/7c9aa6df6ef64206874511ebf1a67b5b
🧪 View experiment at: https://dagshub.com/amitkumar981/swiggy-delivery-time-prediction.mlflow/#/experiments/5
[I 2025-03-31 06:53:49,339] Trial 32 finished with value: 3.6179146911615376 and parameters: {'n_estimators': 400, 'max_depth': 19, 'min_samples_split': 14, 'min_samples_leaf': 6, 'bootstrap': True, 'max_samples': 0.8014166799089214, 'min_impurity_decrease': 0.0011475605026649229}. Best is trial 12 with value: 3.126047779273502.
🏃 View run hilarious-stoat-249 at: https://dagshub.com/amitkumar981/swiggy-delivery-time-prediction.mlflow/#/experiments/5/runs/bbba15828f88421f8e130e290826b1c7
🧪 View experiment at: https://dagshub.com/amitkumar981/swiggy-delivery-time-prediction.mlflow/#/experiments/5
[I 2025-03-31 06:53:49,801] Trial 33 finished with value: 3.6941007107784443 and parameters: {'n_estimators': 400, 'max_depth': 2

  return fit_method(estimator, *args, **kwargs)


🏃 View run best_model at: https://dagshub.com/amitkumar981/swiggy-delivery-time-prediction.mlflow/#/experiments/5/runs/b447eeac52764c1dbab84a9d889af6c8
🧪 View experiment at: https://dagshub.com/amitkumar981/swiggy-delivery-time-prediction.mlflow/#/experiments/5


In [26]:
# Hyperparameters of the best trial found by the study.
study.best_params

{'n_estimators': 400,
 'max_depth': 21,
 'min_samples_split': 18,
 'min_samples_leaf': 10,
 'bootstrap': True,
 'max_samples': 0.7634340439115328,
 'min_impurity_decrease': 3.465395379320018e-05}

In [27]:
# Objective value of the best trial (the value returned by `objective`, i.e. mean CV MAE).
study.best_value

3.077974606151664

In [29]:
from optuna.visualization import plot_param_importances
# Build the hyperparameter-importance figure from the completed study.
fig = plot_param_importances(study)

# Render the figure in the notebook output.
fig.show()