In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [2]:
data_df = pd.read_csv('./data/data_cleaned.csv')

In [3]:
data_df.head()

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken (min),Delivery_distance,Time_to_pick,Day,Month,Time_of_Day_Ordered
0,36.0,4.2,Fog,Jam,2,Snack,motorcycle,3.0,No,DEH,46,10.28,15.0,2,12,Night
1,21.0,4.7,Stormy,High,1,Meal,motorcycle,1.0,No,KOC,23,6.24,9.6,13,2,Evening
2,23.0,4.7,Sandstorms,Medium,1,Drinks,scooter,1.0,No,PUNE,21,13.79,10.2,3,4,Evening
3,34.0,4.3,Sandstorms,Low,0,Buffet,motorcycle,0.0,No,LUDH,20,2.93,10.2,13,2,Morning
4,24.0,4.7,Fog,Jam,1,Snack,scooter,1.0,No,KNP,41,19.4,15.0,14,2,Night


In [4]:
data_df.drop(columns = 'Day', axis = 1, inplace = True)

One Hot Encode the categorical features as they do not have any inherent ordinality.

In [5]:
data_df.Festival.unique()

array(['No', 'Yes'], dtype=object)

In [6]:
festival = {'No': 0, 'Yes': 1}
months = {1: 'Jan',
          2: 'Feb',
          3: 'Mar',
          4: 'Apr',
          5: 'May',
          6: 'Jun',
          7: 'Jul',
          8: 'Aug',
          9: 'Sep',
          10: 'Oct',
          11: 'Nov',
          12: 'Dec'}

data_df.replace({"Festival": festival}, inplace=True)
data_df.replace({"Month": months}, inplace=True)

In [7]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33378 entries, 0 to 33377
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Delivery_person_Age      33378 non-null  float64
 1   Delivery_person_Ratings  33378 non-null  float64
 2   Weather_conditions       33378 non-null  object 
 3   Road_traffic_density     33378 non-null  object 
 4   Vehicle_condition        33378 non-null  int64  
 5   Type_of_order            33378 non-null  object 
 6   Type_of_vehicle          33378 non-null  object 
 7   multiple_deliveries      33378 non-null  float64
 8   Festival                 33378 non-null  int64  
 9   City                     33378 non-null  object 
 10  Time_taken (min)         33378 non-null  int64  
 11  Delivery_distance        33378 non-null  float64
 12  Time_to_pick             33378 non-null  float64
 13  Month                    33378 non-null  object 
 14  Time_of_Day_Ordered   

In [8]:
num_columns = ['Delivery_person_Age', 'Delivery_person_Ratings', 'Vehicle_condition', 'Festival',
               'multiple_deliveries', 'Delivery_distance', 'Time_to_pick']

cat_columns = ['Weather_conditions', 'Road_traffic_density', 'Type_of_order', 
              'Type_of_vehicle', 'City', 'Time_of_Day_Ordered', 'Month']

In [9]:
X = data_df.drop(columns = 'Time_taken (min)')
y = data_df[['Time_taken (min)']]

In [15]:
num_pipeline = Pipeline(
    steps = [
        ("Imputer", SimpleImputer(strategy = 'median')),
        ("Scaler", StandardScaler(with_mean=False))
    ]
)

cat_pipeline = Pipeline(
    steps = [
        ("Imputer", SimpleImputer(strategy='most_frequent')),
        ("Encoder", OneHotEncoder(drop = 'first', handle_unknown='ignore')),
        ("Scaler", StandardScaler(with_mean=False))
    ]
)

In [16]:
preprocessor = ColumnTransformer(
    [
        ("Numerical_Pipeline", num_pipeline, num_columns),
        ("Categorical_Pipeline", cat_pipeline, cat_columns)
    ]
)

In [31]:
# Imports for Model Training and Preparation

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

In [44]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [41]:
X_train_ = pd.DataFrame(preprocessor.fit_transform(X_train_), columns = preprocessor.)

In [45]:
X_train_ = X_train.head()

In [47]:
X_train_ = preprocessor.fit_transform(X_train_)

In [51]:
X_train.shape

(26702, 14)

In [57]:
preprocessor.named_transformers_.Categorical_Pipeline.get_feature_names_out()

AttributeError: Estimator Imputer does not provide get_feature_names_out. Did you mean to call pipeline[:-1].get_feature_names_out()?

In [50]:
X_train_

array([[ 7.24711793, 14.94035762,  1.33630621,  0.        ,  2.04124145,
         4.53687634,  1.23678483,  0.        ,  2.04124145,  0.        ,
         2.5       ,  0.        ,  0.        ,  2.04124145,  0.        ,
         0.        ,  0.        ,  0.        ,  2.5       ,  2.04124145,
         2.04124145,  0.        ],
       [ 6.68964732, 14.94035762,  2.67261242,  0.        ,  2.04124145,
         2.29455506,  2.47356966,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  2.5       ,  0.        ,  0.        ,  0.        ,
         2.04124145,  0.        ],
       [ 6.13217671, 13.74512901,  1.33630621,  0.        ,  0.        ,
         2.34678883,  1.39138293,  2.04124145,  0.        ,  2.04124145,
         0.        ,  0.        ,  2.04124145,  2.04124145,  0.        ,
         2.5       ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  2.5       ],
       [ 6.13217671