In [1]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib

In [2]:
# cleaning of the dataset

# Load the raw delivery records, discard every row containing a missing value,
# and give the target column a name without parentheses.
df = (
    pd.read_csv('../data/delivery.csv')
    .dropna()
    .rename(columns={'Time_taken(min)': 'Time_taken_min'})
)

# Convert clock-time columns to minutes since midnight so they are numeric.
# The conversion lives in a helper function so it can be reused for each time column.
def time_to_minutes(time_str):
    """Convert an 'HH:MM' or 'HH:MM:SS' string to minutes since midnight.

    Parameters
    ----------
    time_str : str
        Clock time made of two or three colon-separated integer fields.
        A seconds field, when present, is ignored.

    Returns
    -------
    int or float
        ``hours * 60 + minutes`` for a well-formed value. ``np.nan`` for
        anything that cannot be parsed: non-string input, non-integer fields,
        or a field count other than two or three.
    """
    try:
        parts = [int(field) for field in time_str.split(':')]
    except (ValueError, AttributeError):
        # AttributeError: input has no .split (e.g. a float NaN).
        # ValueError: at least one field is not an integer.
        return np.nan

    if len(parts) not in (2, 3):
        # Return the same missing-value sentinel as the other failure path
        # instead of falling off the end and implicitly returning None.
        return np.nan

    hours, minutes = parts[0], parts[1]  # seconds (parts[2]) intentionally ignored
    return hours * 60 + minutes

In [3]:
# Derive minutes-since-midnight versions of both clock-time columns, discard the
# original string columns, then keep only rows where both conversions succeeded.
df = (
    df.assign(
        Time_Orderd_mins=df['Time_Orderd'].apply(time_to_minutes),
        Time_Order_picked_mins=df['Time_Order_picked'].apply(time_to_minutes),
    )
    .drop(columns=['Time_Orderd', 'Time_Order_picked'])
    .dropna(subset=['Time_Orderd_mins', 'Time_Order_picked_mins'])
)

In [4]:
# Predictor columns fed to the model; the target is the 'Time_taken_min' column.
feature_columns = [
    'Delivery_person_Age',
    'Delivery_person_Ratings',
    'Weatherconditions',
    'Road_traffic_density',
    'Vehicle_condition',
    'Type_of_vehicle',
    'multiple_deliveries',
    'Festival',
    'City',
]
X = df[feature_columns]
y = df['Time_taken_min']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:

# Preprocessing step: turns text columns into numeric indicators and puts the
# numeric columns on a common scale.

numerical_features = [
    'Delivery_person_Age',
    'Delivery_person_Ratings',
    'Vehicle_condition',
    'multiple_deliveries',
]
categorical_features = [
    'Weatherconditions',
    'Road_traffic_density',
    'Type_of_vehicle',
    'Festival',
    'City',
]

# 'num': StandardScaler standardises each numeric column.
# 'cat': OneHotEncoder expands each category into 0/1 indicator columns;
#        handle_unknown='ignore' avoids an error on categories not seen during fit.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
])

In [6]:

# Full model pipeline: the preprocessing step runs first, and its output is
# passed to a Linear Regression model.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
]
model_pipeline = Pipeline(pipeline_steps)

# Fit on the training split. As the last expression in the cell, the fitted
# pipeline is also what the notebook displays.
model_pipeline.fit(X_train, y_train)


0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [7]:

# Measure how well the fitted pipeline generalises: R-squared on the held-out test split.
score = model_pipeline.score(X=X_test, y=y_test)
print(f"Model R-squared score on test data: {score:.2f}")

Model R-squared score on test data: 0.23


In [8]:
# Persist the fitted pipeline (preprocessing + regressor) one directory above the notebook.
model_path = '../model.pkl'
joblib.dump(model_pipeline, model_path)
print("Model pipeline saved to model.pkl")

Model pipeline saved to model.pkl
