In [93]:
#Importing libraries for cleaning data
import pandas as pd

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

In [94]:
# Read the raw delivery-times CSV into a dataframe and preview its first 5 rows
raw_csv_path = '../data/raw/Food_Delivery_Times.csv'
df = pd.read_csv(raw_csv_path)
print(df.head())

   Order_ID  Distance_km Weather Traffic_Level Time_of_Day Vehicle_Type  \
0       522         7.93   Windy           Low   Afternoon      Scooter   
1       738        16.42   Clear        Medium     Evening         Bike   
2       741         9.52   Foggy           Low       Night      Scooter   
3       661         7.44   Rainy        Medium   Afternoon      Scooter   
4       412        19.03   Clear           Low     Morning         Bike   

   Preparation_Time_min  Courier_Experience_yrs  Delivery_Time_min  
0                    12                     1.0                 43  
1                    20                     2.0                 84  
2                    28                     1.0                 59  
3                     5                     1.0                 37  
4                    16                     5.0                 68  


In [95]:
# For every column, print "<name>: <missing count><TAB><dtype>"
for column_name, column_values in df.items():
    missing_count = column_values.isnull().sum()
    print(f'{column_name}: {missing_count}\t{column_values.dtype}')

Order_ID: 0	int64
Distance_km: 0	float64
Weather: 30	object
Traffic_Level: 30	object
Time_of_Day: 30	object
Vehicle_Type: 0	object
Preparation_Time_min: 0	int64
Courier_Experience_yrs: 30	float64
Delivery_Time_min: 0	int64


### **Vehicle Type Column: Imputation and Encoding Rationale**  

The `Vehicle_Type` column contains categorical data (Scooter, Bike, Car). The null check above reports no missing values for this column in the current dataset; the imputation and encoding approach is as follows:  

#### **1. Mode Imputation for Missing Values**  
        Missing values are filled using the **mode** (most frequent vehicle type).

#### **2. Ordinal Encoding for Delivery Speed Hierarchy**  
        Car < Scooter < Bike

#### **Practical Outcome**  
By imputing missing values with the mode and applying ordinal encoding, we maintain data integrity while enabling models to leverage the inherent speed differences between vehicle types—critical for accurate delivery time predictions.

In [96]:
# Vehicle categories in the order of the documented hierarchy (Car < Scooter < Bike);
# a category's position in this list becomes its encoded value (0, 1, 2).
vehicle_vals = ['Car', 'Scooter', 'Bike']

# Mode imputation, as described in the markdown for this section and as done for the
# other categorical columns. The null check earlier in the notebook reports 0 missing
# values for Vehicle_Type, so this is a no-op on the current data; it is kept so the
# encoder is never handed NaN if the raw file is refreshed with gaps in this column.
df['Vehicle_Type'] = df['Vehicle_Type'].fillna(df['Vehicle_Type'].mode()[0])

# Ordinal encoding with the explicit category order defined above
ordinalEncoding = OrdinalEncoder(categories=[vehicle_vals])
df['Vehicle_Type'] = ordinalEncoding.fit_transform(df[['Vehicle_Type']])


### **Time of the Day Column: Imputation and Encoding Strategy**

The `Time_of_Day` column contains categorical data (Morning, Afternoon, Evening, Night) with missing values. We address these gaps and prepare the data for modeling as follows:

#### **1. Mode Imputation for Missing Values**
#### **2. One-Hot Encoding for Feature Representation**  

#### **3. Practical Outcome**  
By imputing missing values with the mode and applying one-hot encoding, we retain the flexibility to capture both direct and interactive effects of time of day on delivery performance—critical for building robust predictive models.

In [97]:
# Fill missing Time_of_Day entries with the most frequent category (mode imputation)
df['Time_of_Day'] = df['Time_of_Day'].fillna(df['Time_of_Day'].mode()[0])

# One-hot encode Time_of_Day: one indicator column per category,
# named Time_of_Day_<category> by get_feature_names_out.
ohe = OneHotEncoder()
encoded_day = ohe.fit_transform(df[['Time_of_Day']]).toarray()
encoded_day_df = pd.DataFrame(
    encoded_day,
    columns=ohe.get_feature_names_out(['Time_of_Day']),
    # Reuse df's row labels: concat(axis=1) aligns on the index, so a fresh 0..n-1
    # index would misalign rows (and introduce NaNs) whenever df's index is not the
    # default RangeIndex, e.g. after rows have been filtered out.
    index=df.index,
)

# Replace the original column with its indicator columns (appended at the end)
df = pd.concat([df.drop(columns='Time_of_Day'), encoded_day_df], axis=1)

### **Traffic Level Column Data Cleaning: Mode Imputation and Ordinal Encoding**
The Traffic_Level column contains categorical data with a natural ordinal hierarchy (Low → Medium → High) that directly influences delivery time. To address missing values and encode the variable appropriately:

### **1. Mode Imputation**
### **2. Ordinal Encoding**


In [98]:
# Traffic levels in ascending order; a level's position in this list is its encoded value
traffic_vals = ['Low', 'Medium', 'High']

# Mode imputation: replace NaN with the most frequent traffic level
most_frequent_traffic = df['Traffic_Level'].mode()[0]
df['Traffic_Level'] = df['Traffic_Level'].fillna(most_frequent_traffic)

# Ordinal encoding using the explicit order defined above
ordinalEncoder = OrdinalEncoder(categories=[traffic_vals])
df['Traffic_Level'] = ordinalEncoder.fit_transform(df[['Traffic_Level']])

### **Weather Column Data Cleaning: Mode Imputation and Ordinal Encoding**

The `Weather` column contains categorical data that we treat as ordered by increasing severity (Clear → Windy → Foggy → Rainy → Snowy), on the assumption that harsher weather lengthens delivery time. To address missing values and encode the variable appropriately:

### 1. **Mode Imputation**
### 2. **Ordinal Encoding**

In [99]:
# Weather conditions in the order used for encoding; a condition's position in this
# list is its encoded value
weather_vals = ['Clear', 'Windy', 'Foggy', 'Rainy', 'Snowy']

# Mode imputation: replace missing Weather entries with the most frequent condition
most_frequent_weather = df['Weather'].mode()[0]
df['Weather'] = df['Weather'].fillna(most_frequent_weather)

# Ordinal encoding using the explicit order defined above
ordinal_encoder = OrdinalEncoder(categories=[weather_vals])
df['Weather'] = ordinal_encoder.fit_transform(df[['Weather']])


In [100]:
# Validation pass after the cleaning steps above: print, for every column,
# "<name>: <missing count><TAB><dtype>"
null_counts = df.isnull().sum()
for column_name, column_dtype in df.dtypes.items():
    print(f'{column_name}: {null_counts[column_name]}\t{column_dtype}')

Order_ID: 0	int64
Distance_km: 0	float64
Weather: 0	float64
Traffic_Level: 0	float64
Vehicle_Type: 0	float64
Preparation_Time_min: 0	int64
Courier_Experience_yrs: 30	float64
Delivery_Time_min: 0	int64
Time_of_Day_Afternoon: 0	float64
Time_of_Day_Evening: 0	float64
Time_of_Day_Morning: 0	float64
Time_of_Day_Night: 0	float64


In [101]:
# Dropping Order_ID column as it is not useful for model training
df = df.drop(columns='Order_ID')

# Courier_Experience_yrs is the only column still reporting missing values in the
# validation output above (30 rows), and no earlier cell handles it, so without this
# step the exported "cleaned" file would still contain NaNs. It is a numeric column,
# so the gaps are filled with the column median.
# NOTE(review): median is a judgement call — swap for another strategy if the
# modelling stage expects something different.
df['Courier_Experience_yrs'] = df['Courier_Experience_yrs'].fillna(
    df['Courier_Experience_yrs'].median()
)

In [None]:
# Write the processed dataframe to CSV, leaving out the row index
cleaned_csv_path = '../data/processed/cleaned_food_delivery_data.csv'
df.to_csv(cleaned_csv_path, index=False)