In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

### 1. Data Loading, Splitting, and Scalling

In [6]:
data = pd.read_csv('D:\\GUVI\\visual_studio\\Amazon_Delivery\\Delivery_Time.csv')


In [8]:
data.drop('Unnamed: 0', inplace = True, axis = 1)
data.head()

Unnamed: 0,Agent_Age,Agent_Rating,Order_Date,Order_Time,Weather,Traffic,Vehicle,Area,Delivery_Time,Category,weekdays,distance
0,37,4.9,738233,113000,Sunny,High,motorcycle,Urban,120.0,Clothing,5,3.025149
1,34,4.5,738239,194500,Stormy,Jam,scooter,Metropolitian,165.0,Electronics,4,20.18353
2,23,4.4,738233,83000,Sandstorms,Low,motorcycle,Urban,130.0,Sports,5,1.552758
3,38,4.7,738250,180000,Sunny,Medium,motorcycle,Metropolitian,105.0,Cosmetics,1,7.790401
4,32,4.6,738240,133000,Cloudy,High,scooter,Metropolitian,150.0,Toys,5,6.210138


In [30]:
x = data.drop(['Order_Date','Delivery_Time'], axis = 1)
y = data.Delivery_Time

In [31]:
x

Unnamed: 0,Agent_Age,Agent_Rating,Order_Time,Weather,Traffic,Vehicle,Area,Category,weekdays,distance
0,37,4.9,113000,Sunny,High,motorcycle,Urban,Clothing,5,3.025149
1,34,4.5,194500,Stormy,Jam,scooter,Metropolitian,Electronics,4,20.183530
2,23,4.4,83000,Sandstorms,Low,motorcycle,Urban,Sports,5,1.552758
3,38,4.7,180000,Sunny,Medium,motorcycle,Metropolitian,Cosmetics,1,7.790401
4,32,4.6,133000,Cloudy,High,scooter,Metropolitian,Toys,5,6.210138
...,...,...,...,...,...,...,...,...,...,...
43589,30,4.8,113500,Windy,High,motorcycle,Metropolitian,Home,3,1.489846
43590,21,4.6,195500,Windy,Jam,motorcycle,Metropolitian,Jewelry,2,11.007735
43591,30,4.9,235000,Cloudy,Low,scooter,Metropolitian,Home,4,4.657195
43592,20,4.7,133500,Cloudy,High,motorcycle,Metropolitian,Kitchen,0,6.232393


In [33]:
y

0        120.0
1        165.0
2        130.0
3        105.0
4        150.0
         ...  
43589    160.0
43590    180.0
43591     80.0
43592    130.0
43593    180.0
Name: Delivery_Time, Length: 43594, dtype: float64

In [32]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state=123)

In [35]:
X_train = pd.get_dummies(X_train, dtype = int)
X_test = pd.get_dummies(X_test, dtype = int)

In [36]:
from scipy.stats import zscore
x_train = X_train.apply(zscore)
x_test = X_test.apply(zscore)

### 2. Creating Models

In [25]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

#### 1.1 Linear Regression Model

In [37]:
regression_model = LinearRegression()
regression_model.fit(x_train, y_train)

In [38]:
y_pred = regression_model.predict(x_test)

In [39]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
r_2 = r2_score(y_test,y_pred)
r_2

0.6062570138009423

In [40]:
mse_rg = mean_squared_error(y_test, y_pred)
mae_rg = mean_absolute_error(y_test, y_pred)
rmse_rg = np.sqrt(mse_rg)

print(f'mse = {mse_rg}, mae = {mae_rg}, rmse = {rmse_rg}')


mse = 1035.5953989733764, mae = 25.329168968762087, rmse = 32.18066809395629


In [64]:
regression_model.get_params()

{'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'positive': False}

#### 1.2 Decision Tree

In [41]:
model_dt = DecisionTreeRegressor()
model_dt.fit(x_train, y_train)

In [42]:
y_pred_train = model_dt.predict(x_train)
y_pred_test = model_dt.predict(x_test)

r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

mse = mean_squared_error(y_test, y_pred_test)
mae = mean_absolute_error(y_test, y_pred_test)
rmse = np.sqrt(mse)

print(f'Train Score {r2_train} Test Score {r2_test}')
print(f'mse = {mse}, mae = {mae}, rmse = {rmse}')


Train Score 1.0 Test Score 0.6336937113574539
mse = 963.4333065718546, mae = 22.97453836449134, rmse = 31.03922206776218


Decision Tree Reguralrization

In [61]:
dt_cv1_params = {
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [62]:
dt_cv1 = GridSearchCV(estimator=DecisionTreeRegressor(), param_grid=dt_cv1_params, cv=5)
dt_cv1.fit(x_train, y_train)

In [63]:
y_pred_train = dt_cv1.predict(x_train)
y_pred_test = dt_cv1.predict(x_test)

r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

mse = mean_squared_error(y_test, y_pred_test)
mae = mean_absolute_error(y_test, y_pred_test)
rmse = np.sqrt(mse)

print(f'Train Score {r2_train} Test Score {r2_test}')
print(f'mse = {mse}, mae = {mae}, rmse = {rmse}')

Train Score 0.8272832532927545 Test Score 0.8090976149548402
mse = 502.0981669140511, mae = 17.36947176421819, rmse = 22.407547097218185


In [67]:
dt_cv1.best_params_

{'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}

#### 1.3 Gradient Boosting Regressor

In [43]:
#GRADIENT BOOSTING
from sklearn.ensemble import GradientBoostingRegressor
gbrg = GradientBoostingRegressor(loss = 'squared_error', n_estimators = 50 ,random_state=1)
gbrg.fit(x_train, y_train)

In [44]:
y_train_pred = gbrg.predict(x_train)
y_test_pred = gbrg.predict(x_test)
mae = mean_absolute_error(y_test, y_test_pred)
mse = mean_squared_error(y_test, y_test_pred)
r_scored = r2_score(y_test, y_test_pred)
rmse = np.sqrt(mse)
print('mae = ',mae)
print('mse = ', mse)
print('r2_score = ',r_scored)
print('rmse = ', rmse)

mae =  20.281804183971257
mse =  684.7942247862738
r2_score =  0.7396348774178476
rmse =  26.168573227944123


In [53]:
gbrg.get_params()

{'alpha': 0.9,
 'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'squared_error',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 50,
 'n_iter_no_change': None,
 'random_state': 1,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

##### 1.3.1 Gradient Boosting CV

In [55]:
gbrg_params1 = {
 'learning_rate': [0.1, 0.2, 0.3],
 'max_depth': [3, 6, 8],
 'min_samples_leaf': [1, 3, 6],
 'min_samples_split': [2, 6, 10],
}

In [56]:
gbrg_cv1 = GridSearchCV(estimator=GradientBoostingRegressor(), param_grid=gbrg_params1, cv = 5)
gbrg_cv1.fit(x_train, y_train)

In [57]:
y_train_pred = gbrg_cv1.predict(x_train)
y_test_pred = gbrg_cv1.predict(x_test)
mae = mean_absolute_error(y_test, y_test_pred)
mse = mean_squared_error(y_test, y_test_pred)
r_scored = r2_score(y_test, y_test_pred)
rmse = np.sqrt(mse)
print('mae = ',mae)
print('mse = ', mse)
print('r2_score = ',r_scored)
print('rmse = ', rmse)

mae =  17.138456344384597
mse =  487.70703567569166
r2_score =  0.8145692566734027
rmse =  22.08409010296081


In [60]:
gbrg_cv1.best_params_

{'learning_rate': 0.1,
 'max_depth': 8,
 'min_samples_leaf': 6,
 'min_samples_split': 2}

#### 1.4 Random Forest

In [45]:
#Random forest
#2nd best
from sklearn.ensemble import RandomForestRegressor
rfrg = RandomForestRegressor()
rfrg.fit(x_train, y_train)


In [46]:
y_train_pred = rfrg.predict(x_train)
y_test_pred = rfrg.predict(x_test)
mae = mean_absolute_error(y_test, y_test_pred)
mse = mean_squared_error(y_test, y_test_pred)
r_scored = r2_score(y_test, y_test_pred)
rmse = np.sqrt(mse)

print('mae = ',mae)
print('mse = ', mse)
print('r2_score = ',r_scored)
print('rmse = ', rmse)

mae =  17.579616928546855
mse =  522.3678603280193
r2_score =  0.8013908893146268
rmse =  22.855368304361654


In [47]:
importances = rfrg.feature_importances_
feature_imp_df = pd.DataFrame({'Feature': x_test.columns, 'Gini Importance': importances}).sort_values('Gini Importance', ascending=False) 
print(feature_imp_df)

                  Feature  Gini Importance
27       Category_Grocery         0.238948
1            Agent_Rating         0.171014
4                distance         0.111692
13           Traffic_Low          0.093833
0               Agent_Age         0.091151
9           Weather_Sunny         0.054668
5          Weather_Cloudy         0.043985
6             Weather_Fog         0.041913
2              Order_Time         0.039998
15    Vehicle_motorcycle          0.023825
3                weekdays         0.015640
14        Traffic_Medium          0.011185
18    Area_Metropolitian          0.004999
12           Traffic_Jam          0.002989
37          Category_Toys         0.002946
35        Category_Snacks         0.002768
34      Category_Skincare         0.002762
25     Category_Cosmetics         0.002738
8          Weather_Stormy         0.002738
26   Category_Electronics         0.002684
22       Category_Apparel         0.002648
33         Category_Shoes         0.002648
31      Cat

In [68]:
rfrg.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

##### 1.4.1 RandomForest Cross Validation

In [48]:
prams1 = {
         'n_estimators':[110, 125, 135],
         'min_samples_leaf': [10, 15, 20],
         'min_samples_split':[30, 45, 50]
         }

In [49]:
rfrg_cv1 = GridSearchCV(estimator=RandomForestRegressor(), param_grid = prams1, cv = 10, return_train_score=True)
rfrg_cv1.fit(x_train, y_train)

In [50]:
y_train_pred = rfrg_cv1.predict(x_train)
y_test_pred = rfrg_cv1.predict(x_test)
mae = mean_absolute_error(y_test, y_test_pred)
mse = mean_squared_error(y_test, y_test_pred)
r_scored = r2_score(y_test, y_test_pred)
rmse = np.sqrt(mse)

print('mae = ',mae)
print('mse = ', mse)
print('r2_score = ',r_scored)
print('rmse = ', rmse)

mae =  17.217946975051255
mse =  490.2442907895819
r2_score =  0.813604568720673
rmse =  22.14146090007572


In [51]:
rfrg_cv1.best_params_

{'min_samples_leaf': 20, 'min_samples_split': 30, 'n_estimators': 110}

In [69]:
rfrg_cv1.get_params()

{'cv': 10,
 'error_score': nan,
 'estimator__bootstrap': True,
 'estimator__ccp_alpha': 0.0,
 'estimator__criterion': 'squared_error',
 'estimator__max_depth': None,
 'estimator__max_features': 1.0,
 'estimator__max_leaf_nodes': None,
 'estimator__max_samples': None,
 'estimator__min_impurity_decrease': 0.0,
 'estimator__min_samples_leaf': 1,
 'estimator__min_samples_split': 2,
 'estimator__min_weight_fraction_leaf': 0.0,
 'estimator__monotonic_cst': None,
 'estimator__n_estimators': 100,
 'estimator__n_jobs': None,
 'estimator__oob_score': False,
 'estimator__random_state': None,
 'estimator__verbose': 0,
 'estimator__warm_start': False,
 'estimator': RandomForestRegressor(),
 'n_jobs': None,
 'param_grid': {'n_estimators': [110, 125, 135],
  'min_samples_leaf': [10, 15, 20],
  'min_samples_split': [30, 45, 50]},
 'pre_dispatch': '2*n_jobs',
 'refit': True,
 'return_train_score': True,
 'scoring': None,
 'verbose': 0}

MULTIPLE MODEL RUNNING

In [70]:
models = [
    (
        'LinearRegression',
        {'n_jobs': None},
        LinearRegression(),
        (x_train, y_train),
        (x_test, y_test)
    ),
    
    (
        'Decision Tree CV',
        {
        'max_depth': 10,
        'min_samples_leaf': 4,
        'min_samples_split': 10},
        DecisionTreeRegressor(),
        (x_train, y_train),
        (x_test, y_test)
    ),
    
    (
        'Gradient Boosting',
        {'learning_rate': 0.1,
        'max_depth': 3,
        'min_samples_leaf': 1,
        'min_samples_split': 2,
        },
        GradientBoostingRegressor(),
        (x_train, y_train),
        (x_test, y_test)
    ),
    
    (
        'Gradient Boosting CV',
        {'learning_rate': 0.1,
        'max_depth': 8,
        'min_samples_leaf': 6,
        'min_samples_split': 2,
        },
        GradientBoostingRegressor(),
        (x_train, y_train),
        (x_test, y_test)
    ),
    
    (
        'Random Forest',
        {'max_depth': None,
        'min_samples_leaf':1,
        'min_samples_split':2,
        'n_estimators': 100,
        },
        RandomForestRegressor(),
        (x_train, y_train),
        (x_test, y_test)
    ),
    
    (
        'Random Forest CV',
        {'max_depth': None,
        'min_samples_leaf':20,
        'min_samples_split':30,
        'n_estimators': 110,
        },
        RandomForestRegressor(),
        (x_train, y_train),
        (x_test, y_test)
    )
]

In [71]:
report = []
for model_name, params, model, train_set, test_set in models:
    x_train = train_set[0]
    y_train = train_set[1]
    x_test = test_set[0]
    y_test = test_set[1]
    model.set_params(**params)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r_score = r2_score(y_test, y_pred)
    
    report.append((model_name, rmse, mae,r_score ))
    

In [72]:
report

[('LinearRegression',
  np.float64(32.18066809395629),
  25.329168968762087,
  0.6062570138009423),
 ('Decision Tree CV',
  np.float64(22.353125473199324),
  17.343164141702474,
  0.8100237851898493),
 ('Gradient Boosting',
  np.float64(24.829002671355724),
  19.32710045944438,
  0.7656088181220612),
 ('Gradient Boosting CV',
  np.float64(22.084890163217494),
  17.1394094345883,
  0.8145558208963597),
 ('Random Forest',
  np.float64(22.810882889216977),
  17.56901479527469,
  0.802163277687509),
 ('Random Forest CV',
  np.float64(22.14504886846582),
  17.225479233596065,
  0.8135441539986594)]

In [None]:

import mlflow

In [76]:
mlflow.set_experiment('Amazon_Delivery_Experiment1')
mlflow.set_tracking_uri('http://127.0.0.1:5000')
for i, element in enumerate(models):
    model_name = element[0]
    params = element[1]
    model = element[2]
    score = report[i]
    
    with mlflow.start_run(run_name = model_name):
        mlflow.log_params(params)
        mlflow.log_metrics({'RMSE':score[1],
                            'MAE':score[2],
                            'R2': score[3]})
        mlflow.sklearn.log_model(model, 'model')

2025/02/09 23:13:35 INFO mlflow.tracking.fluent: Experiment with name 'Amazon_Delivery_Experiment1' does not exist. Creating a new experiment.


🏃 View run LinearRegression at: http://127.0.0.1:5000/#/experiments/364454856457264698/runs/2e6876dd3e6e467eaa4c9d85cc89c51d
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/364454856457264698




🏃 View run Decision Tree CV at: http://127.0.0.1:5000/#/experiments/364454856457264698/runs/a606bba7dc4b4901ba4c003c81053eb2
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/364454856457264698




🏃 View run Gradient Boosting at: http://127.0.0.1:5000/#/experiments/364454856457264698/runs/41696fde58044854800a41757f9327fb
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/364454856457264698




🏃 View run Gradient Boosting CV at: http://127.0.0.1:5000/#/experiments/364454856457264698/runs/1d7c065b5d894afa853d3f4233daca64
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/364454856457264698




🏃 View run Random Forest at: http://127.0.0.1:5000/#/experiments/364454856457264698/runs/f2f58fb210fc4f21ba2534022964be95
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/364454856457264698




🏃 View run Random Forest CV at: http://127.0.0.1:5000/#/experiments/364454856457264698/runs/9f500ecc48c1474bb3bb403e077621f5
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/364454856457264698
