In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [12]:
# Load the dataset (already cleaned, per the file name). The path is relative,
# so it resolves against the notebook's current working directory.
data_df = pd.read_csv('./data/data_cleaned.csv')

In [13]:
# Preview the first rows to sanity-check the column names and value formats.
data_df.head()

Unnamed: 0,Delivery_person_Age,Delivery_person_Ratings,Weather_conditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken (min),Delivery_distance,Time_to_pick,Day,Month,Time_of_Day_Ordered
0,36.0,4.2,Fog,Jam,2,Snack,motorcycle,3.0,No,DEH,46,10.28,15.0,2,12,Night
1,21.0,4.7,Stormy,High,1,Meal,motorcycle,1.0,No,KOC,23,6.24,9.6,13,2,Evening
2,23.0,4.7,Sandstorms,Medium,1,Drinks,scooter,1.0,No,PUNE,21,13.79,10.2,3,4,Evening
3,34.0,4.3,Sandstorms,Low,0,Buffet,motorcycle,0.0,No,LUDH,20,2.93,10.2,13,2,Morning
4,24.0,4.7,Fog,Jam,1,Snack,scooter,1.0,No,KNP,41,19.4,15.0,14,2,Night


In [14]:
# Remove the 'Day' column; it is not used as a feature downstream.
# Rebinding (instead of inplace=True) together with errors='ignore' makes this
# cell safe to re-run: the original in-place drop raised KeyError on a second
# execution because the column was already gone. The redundant axis=1 is
# omitted because columns= already determines the axis.
data_df = data_df.drop(columns='Day', errors='ignore')

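One-hot encode the categorical features, since they have no inherent ordinal relationship. Before encoding, the binary `Festival` flag is mapped to 0/1 and the numeric `Month` is relabeled with month names so that it is treated as a category.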

In [15]:
# List the distinct Festival labels before recoding them to integers.
data_df['Festival'].unique()

array(['No', 'Yes'], dtype=object)

In [16]:
# Festival is a binary flag: recode its labels to 0/1.
festival = {'No': 0, 'Yes': 1}

# Month numbers (1-12) -> three-letter names, so Month is handled as a
# categorical label rather than as a numeric quantity.
months = dict(enumerate(
    ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
     'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
    start=1,
))

# Each replace is scoped to a single column by the outer dict key.
data_df = (
    data_df
    .replace({"Festival": festival})
    .replace({"Month": months})
)

In [17]:
# Check dtypes and non-null counts after dropping 'Day' and recoding Festival/Month.
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33378 entries, 0 to 33377
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Delivery_person_Age      33378 non-null  float64
 1   Delivery_person_Ratings  33378 non-null  float64
 2   Weather_conditions       33378 non-null  object 
 3   Road_traffic_density     33378 non-null  object 
 4   Vehicle_condition        33378 non-null  int64  
 5   Type_of_order            33378 non-null  object 
 6   Type_of_vehicle          33378 non-null  object 
 7   multiple_deliveries      33378 non-null  float64
 8   Festival                 33378 non-null  int64  
 9   City                     33378 non-null  object 
 10  Time_taken (min)         33378 non-null  int64  
 11  Delivery_distance        33378 non-null  float64
 12  Time_to_pick             33378 non-null  float64
 13  Month                    33378 non-null  object 
 14  Time_of_Day_Ordered   

In [7]:
# Columns that already hold numeric values (Festival is numeric after the 0/1 recoding).
num_columns = [
    'Delivery_person_Age',
    'Delivery_person_Ratings',
    'Vehicle_condition',
    'Festival',
    'multiple_deliveries',
    'Delivery_distance',
    'Time_to_pick',
]

# Columns holding labels; these are the ones passed to one-hot encoding.
cat_columns = [
    'Weather_conditions',
    'Road_traffic_density',
    'Type_of_order',
    'Type_of_vehicle',
    'City',
    'Time_of_Day_Ordered',
    'Month',
]

In [20]:
# One-hot encode every column listed in cat_columns; the generated indicator
# columns are created as float. Columns not listed are carried over unchanged.
df = pd.get_dummies(data_df, columns=cat_columns, dtype=float)
# Inspect the widened schema produced by the encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33378 entries, 0 to 33377
Data columns (total 62 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Delivery_person_Age               33378 non-null  float64
 1   Delivery_person_Ratings           33378 non-null  float64
 2   Vehicle_condition                 33378 non-null  int64  
 3   multiple_deliveries               33378 non-null  float64
 4   Festival                          33378 non-null  int64  
 5   Time_taken (min)                  33378 non-null  int64  
 6   Delivery_distance                 33378 non-null  float64
 7   Time_to_pick                      33378 non-null  float64
 8   Weather_conditions_Cloudy         33378 non-null  float64
 9   Weather_conditions_Fog            33378 non-null  float64
 10  Weather_conditions_Sandstorms     33378 non-null  float64
 11  Weather_conditions_Stormy         33378 non-null  float64
 12  Weat

In [21]:
# Predictors: everything except the delivery-time target.
X = df.drop(columns=['Time_taken (min)'])
# Target, kept as a one-column DataFrame rather than a Series.
y = df.loc[:, ['Time_taken (min)']]

In [24]:
# After one-hot encoding every predictor is numeric, so num_columns is rebound
# to all of X's columns and the whole frame goes through the numerical pipeline.
num_columns = X.columns.tolist()

In [25]:
# Numerical preprocessing: fill missing values with the column median, then
# scale. with_mean=False disables centering, so the scaler only divides each
# column by its standard deviation.
numeric_steps = [
    ("Imputer", SimpleImputer(strategy='median')),
    ("Scaler", StandardScaler(with_mean=False)),
]
num_pipeline = Pipeline(steps=numeric_steps)

# No separate categorical pipeline is defined: the categorical columns were
# already one-hot encoded with pd.get_dummies before this point.

In [26]:
# Route every column named in num_columns through the numerical pipeline.
# This is the only transformer registered.
preprocessor = ColumnTransformer(
    transformers=[("Numerical_Pipeline", num_pipeline, num_columns)]
)

In [27]:
# Imports for Model Training and Preparation

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

In [28]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [29]:
# Fit the imputer and scaler on the training split only, then transform it.
# NOTE(review): this overwrites X_train, so re-running the cell on its own
# would refit the preprocessor on already-transformed data.
X_train = preprocessor.fit_transform(X_train)

In [30]:
# Wrap the transformed training data in a DataFrame labelled with X's column
# names (num_columns was built from X.columns, so the order matches).
# NOTE(review): assuming the transformer returned a plain array, this frame gets
# a fresh default index and no longer shares row labels with y_train.
X_train = pd.DataFrame(data=X_train, columns = X.columns)

In [31]:
# Transform the test split with the already-fitted preprocessor (transform only,
# no refit) and label the result with X's column names.
X_test = pd.DataFrame(preprocessor.transform(X_test), columns= X.columns)

In [32]:
# Candidate linear regressors, each constructed with its default hyperparameters.
models = {
    'Linear Regression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'Elastic Net': ElasticNet(),
}

# Parallel lists: model_name[i] labels the test R2 stored in model_score[i].
model_name, model_score = [], []

for name, model in models.items():
    model.fit(X_train, y_train)

    # Training-set R2, reported next to the test R2 to expose over/under-fitting.
    train_score = r2_score(y_train, model.predict(X_train))

    # Held-out metrics.
    y_pred = model.predict(X_test)
    test_score = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)

    model_name.append(name)
    model_score.append(test_score)

    print(name)
    print(f"\tRoot Mean Squared Error: {rmse}\n\tMean Absolute Error: {mae}\n\tTest R2 Score: {round(test_score*100, 2)}%\n\tTrain R2 Score: {round(train_score*100, 2)}%")
    print("=" * 50)

Linear Regression
	Root Mean Squared Error: 5.9783560522502786
	Mean Absolute Error: 4.789572723187537
	Test R2 Score: 59.24%
	Train R2 Score: 58.86%
Lasso
	Root Mean Squared Error: 6.653543257115567
	Mean Absolute Error: 5.289722847902577
	Test R2 Score: 49.51%
	Train R2 Score: 49.49%
Ridge
	Root Mean Squared Error: 5.970299033889638
	Mean Absolute Error: 4.787124744593799
	Test R2 Score: 59.35%
	Train R2 Score: 59.01%
Elastic Net
	Root Mean Squared Error: 6.638652651544121
	Mean Absolute Error: 5.319650759413437
	Test R2 Score: 49.74%
	Train R2 Score: 49.65%
